blob: 67e7efe12ff7e727c215ed0c67b1b063537c8eca [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Generic socket support routines. Memory allocators, socket lock/release
7 * handler for protocols to use and generic option handler.
8 *
9 *
Jesper Juhl02c30a82005-05-05 16:16:16 -070010 * Authors: Ross Biro
Linus Torvalds1da177e2005-04-16 15:20:36 -070011 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +090035 * code. The ACK stuff can wait and needs major
Linus Torvalds1da177e2005-04-16 15:20:36 -070036 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 *
85 *
86 * This program is free software; you can redistribute it and/or
87 * modify it under the terms of the GNU General Public License
88 * as published by the Free Software Foundation; either version
89 * 2 of the License, or (at your option) any later version.
90 */
91
Joe Perchese005d192012-05-16 19:58:40 +000092#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93
Randy Dunlap4fc268d2006-01-11 12:17:47 -080094#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070095#include <linux/errno.h>
Richard Cochrancb820f82013-07-19 19:40:09 +020096#include <linux/errqueue.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070097#include <linux/types.h>
98#include <linux/socket.h>
99#include <linux/in.h>
100#include <linux/kernel.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700101#include <linux/module.h>
102#include <linux/proc_fs.h>
103#include <linux/seq_file.h>
104#include <linux/sched.h>
105#include <linux/timer.h>
106#include <linux/string.h>
107#include <linux/sockios.h>
108#include <linux/net.h>
109#include <linux/mm.h>
110#include <linux/slab.h>
111#include <linux/interrupt.h>
112#include <linux/poll.h>
113#include <linux/tcp.h>
114#include <linux/init.h>
Al Viroa1f8e7f72006-10-19 16:08:53 -0400115#include <linux/highmem.h>
Eric W. Biederman3f551f92010-06-13 03:28:59 +0000116#include <linux/user_namespace.h>
Ingo Molnarc5905af2012-02-24 08:31:31 +0100117#include <linux/static_key.h>
David S. Miller3969eb32012-01-09 13:44:23 -0800118#include <linux/memcontrol.h>
David S. Miller8c1ae102012-05-03 02:25:55 -0400119#include <linux/prefetch.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700120
121#include <asm/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700122
123#include <linux/netdevice.h>
124#include <net/protocol.h>
125#include <linux/skbuff.h>
Eric W. Biederman457c4cb2007-09-12 12:01:34 +0200126#include <net/net_namespace.h>
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -0700127#include <net/request_sock.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700128#include <net/sock.h>
Patrick Ohly20d49472009-02-12 05:03:38 +0000129#include <linux/net_tstamp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700130#include <net/xfrm.h>
131#include <linux/ipsec.h>
Herbert Xuf8451722010-05-24 00:12:34 -0700132#include <net/cls_cgroup.h>
Neil Horman5bc14212011-11-22 05:10:51 +0000133#include <net/netprio_cgroup.h>
Craig Gallekeb4cb002015-06-15 11:26:18 -0400134#include <linux/sock_diag.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700135
136#include <linux/filter.h>
Craig Gallek538950a2016-01-04 17:41:47 -0500137#include <net/sock_reuseport.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -0700138
Satoru Moriya3847ce32011-06-17 12:00:03 +0000139#include <trace/events/sock.h>
140
Linus Torvalds1da177e2005-04-16 15:20:36 -0700141#ifdef CONFIG_INET
142#include <net/tcp.h>
143#endif
144
Eliezer Tamir076bb0c2013-07-10 17:13:17 +0300145#include <net/busy_poll.h>
Eliezer Tamir06021292013-06-10 11:39:50 +0300146
Glauber Costa36b77a52011-12-16 00:51:59 +0000147static DEFINE_MUTEX(proto_list_mutex);
Glauber Costad1a4c0b2011-12-11 21:47:04 +0000148static LIST_HEAD(proto_list);
149
Eric W. Biedermana3b299d2014-04-23 14:26:56 -0700150/**
151 * sk_ns_capable - General socket capability test
152 * @sk: Socket to use a capability on or through
153 * @user_ns: The user namespace of the capability to use
154 * @cap: The capability to use
155 *
156 * Test to see if the opener of the socket had when the socket was
157 * created and the current process has the capability @cap in the user
158 * namespace @user_ns.
159 */
160bool sk_ns_capable(const struct sock *sk,
161 struct user_namespace *user_ns, int cap)
162{
163 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 ns_capable(user_ns, cap);
165}
166EXPORT_SYMBOL(sk_ns_capable);
167
168/**
169 * sk_capable - Socket global capability test
170 * @sk: Socket to use a capability on or through
Masanari Iidae793c0f2014-09-04 23:44:36 +0900171 * @cap: The global capability to use
Eric W. Biedermana3b299d2014-04-23 14:26:56 -0700172 *
173 * Test to see if the opener of the socket had when the socket was
174 * created and the current process has the capability @cap in all user
175 * namespaces.
176 */
177bool sk_capable(const struct sock *sk, int cap)
178{
179 return sk_ns_capable(sk, &init_user_ns, cap);
180}
181EXPORT_SYMBOL(sk_capable);
182
183/**
184 * sk_net_capable - Network namespace socket capability test
185 * @sk: Socket to use a capability on or through
186 * @cap: The capability to use
187 *
Masanari Iidae793c0f2014-09-04 23:44:36 +0900188 * Test to see if the opener of the socket had when the socket was created
Eric W. Biedermana3b299d2014-04-23 14:26:56 -0700189 * and the current process has the capability @cap over the network namespace
190 * the socket is a member of.
191 */
192bool sk_net_capable(const struct sock *sk, int cap)
193{
194 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195}
196EXPORT_SYMBOL(sk_net_capable);
197
Ingo Molnarda21f242006-07-03 00:25:12 -0700198/*
199 * Each address family might have different locking rules, so we have
200 * one slock key per address family:
201 */
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700202static struct lock_class_key af_family_keys[AF_MAX];
203static struct lock_class_key af_family_slock_keys[AF_MAX];
204
/*
 * Human-readable lock class names, so that lock validator output is easier
 * to read. The strings are built at compile time, which keeps runtime
 * initialization of socket locks fast.
 *
 * One "sk_lock-" name per address family, listed in AF_* number order
 * (slots 27 and 28 only carry their number), with a trailing entry for
 * AF_MAX. Every named entry uses the "sk_lock-AF_<family>" form, matching
 * the "slock-AF_<family>" and "clock-AF_<family>" tables.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	"sk_lock-AF_UNSPEC"    , "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
	"sk_lock-AF_AX25"      , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
	"sk_lock-AF_NETROM"    , "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
	"sk_lock-AF_X25"       , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
	"sk_lock-AF_DECnet"    , "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
	"sk_lock-AF_KEY"       , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
	"sk_lock-AF_ASH"       , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
	"sk_lock-AF_RDS"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
	"sk_lock-AF_PPPOX"     , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
	"sk_lock-27"           , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
	"sk_lock-AF_TIPC"      , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
	"sk_lock-AF_RXRPC"     , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
	"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF"     , "sk_lock-AF_ALG"      ,
	"sk_lock-AF_NFC"       , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
/*
 * "slock-" lock class names, one per address family in AF_* number order
 * (slots 27 and 28 only carry their number), plus a final AF_MAX entry.
 */
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	/*  0 */ "slock-AF_UNSPEC",
	/*  1 */ "slock-AF_UNIX",
	/*  2 */ "slock-AF_INET",
	/*  3 */ "slock-AF_AX25",
	/*  4 */ "slock-AF_IPX",
	/*  5 */ "slock-AF_APPLETALK",
	/*  6 */ "slock-AF_NETROM",
	/*  7 */ "slock-AF_BRIDGE",
	/*  8 */ "slock-AF_ATMPVC",
	/*  9 */ "slock-AF_X25",
	/* 10 */ "slock-AF_INET6",
	/* 11 */ "slock-AF_ROSE",
	/* 12 */ "slock-AF_DECnet",
	/* 13 */ "slock-AF_NETBEUI",
	/* 14 */ "slock-AF_SECURITY",
	/* 15 */ "slock-AF_KEY",
	/* 16 */ "slock-AF_NETLINK",
	/* 17 */ "slock-AF_PACKET",
	/* 18 */ "slock-AF_ASH",
	/* 19 */ "slock-AF_ECONET",
	/* 20 */ "slock-AF_ATMSVC",
	/* 21 */ "slock-AF_RDS",
	/* 22 */ "slock-AF_SNA",
	/* 23 */ "slock-AF_IRDA",
	/* 24 */ "slock-AF_PPPOX",
	/* 25 */ "slock-AF_WANPIPE",
	/* 26 */ "slock-AF_LLC",
	/* 27 */ "slock-27",
	/* 28 */ "slock-28",
	/* 29 */ "slock-AF_CAN",
	/* 30 */ "slock-AF_TIPC",
	/* 31 */ "slock-AF_BLUETOOTH",
	/* 32 */ "slock-AF_IUCV",
	/* 33 */ "slock-AF_RXRPC",
	/* 34 */ "slock-AF_ISDN",
	/* 35 */ "slock-AF_PHONET",
	/* 36 */ "slock-AF_IEEE802154",
	/* 37 */ "slock-AF_CAIF",
	/* 38 */ "slock-AF_ALG",
	/* 39 */ "slock-AF_NFC",
	/* 40 */ "slock-AF_VSOCK",
	/* 41 */ "slock-AF_MAX"
};
/*
 * "clock-" lock class names, one per address family in AF_* number order
 * (slots 27 and 28 only carry their number), plus a final AF_MAX entry.
 */
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	/*  0 */ "clock-AF_UNSPEC",
	/*  1 */ "clock-AF_UNIX",
	/*  2 */ "clock-AF_INET",
	/*  3 */ "clock-AF_AX25",
	/*  4 */ "clock-AF_IPX",
	/*  5 */ "clock-AF_APPLETALK",
	/*  6 */ "clock-AF_NETROM",
	/*  7 */ "clock-AF_BRIDGE",
	/*  8 */ "clock-AF_ATMPVC",
	/*  9 */ "clock-AF_X25",
	/* 10 */ "clock-AF_INET6",
	/* 11 */ "clock-AF_ROSE",
	/* 12 */ "clock-AF_DECnet",
	/* 13 */ "clock-AF_NETBEUI",
	/* 14 */ "clock-AF_SECURITY",
	/* 15 */ "clock-AF_KEY",
	/* 16 */ "clock-AF_NETLINK",
	/* 17 */ "clock-AF_PACKET",
	/* 18 */ "clock-AF_ASH",
	/* 19 */ "clock-AF_ECONET",
	/* 20 */ "clock-AF_ATMSVC",
	/* 21 */ "clock-AF_RDS",
	/* 22 */ "clock-AF_SNA",
	/* 23 */ "clock-AF_IRDA",
	/* 24 */ "clock-AF_PPPOX",
	/* 25 */ "clock-AF_WANPIPE",
	/* 26 */ "clock-AF_LLC",
	/* 27 */ "clock-27",
	/* 28 */ "clock-28",
	/* 29 */ "clock-AF_CAN",
	/* 30 */ "clock-AF_TIPC",
	/* 31 */ "clock-AF_BLUETOOTH",
	/* 32 */ "clock-AF_IUCV",
	/* 33 */ "clock-AF_RXRPC",
	/* 34 */ "clock-AF_ISDN",
	/* 35 */ "clock-AF_PHONET",
	/* 36 */ "clock-AF_IEEE802154",
	/* 37 */ "clock-AF_CAIF",
	/* 38 */ "clock-AF_ALG",
	/* 39 */ "clock-AF_NFC",
	/* 40 */ "clock-AF_VSOCK",
	/* 41 */ "clock-AF_MAX"
};
Ingo Molnarda21f242006-07-03 00:25:12 -0700258
259/*
260 * sk_callback_lock locking rules are per-address-family,
261 * so split the lock classes by using a per-AF key:
262 */
263static struct lock_class_key af_callback_keys[AF_MAX];
264
/*
 * The struct sk_buff overhead is not constant across platforms, so it is
 * folded into these values: each limit is a packet count multiplied by
 * SKB_TRUESIZE(256). That keeps socket queueing behavior and performance
 * from depending on such platform differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
274
275/* Run time adjustable parameters. */
Brian Haleyab32ea52006-09-22 14:15:41 -0700276__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
Hans Schillstrom6d8ebc82012-04-30 08:13:50 +0200277EXPORT_SYMBOL(sysctl_wmem_max);
Brian Haleyab32ea52006-09-22 14:15:41 -0700278__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
Hans Schillstrom6d8ebc82012-04-30 08:13:50 +0200279EXPORT_SYMBOL(sysctl_rmem_max);
Brian Haleyab32ea52006-09-22 14:15:41 -0700280__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
281__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700282
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300283/* Maximal space eaten by iovec or ancillary data plus some space */
Brian Haleyab32ea52006-09-22 14:15:41 -0700284int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
Eric Dumazet2a915252009-05-27 11:30:05 +0000285EXPORT_SYMBOL(sysctl_optmem_max);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700286
Willem de Bruijnb245be12015-01-30 13:29:32 -0500287int sysctl_tstamp_allow_data __read_mostly = 1;
288
Mel Gormanc93bdd02012-07-31 16:44:19 -0700289struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
290EXPORT_SYMBOL_GPL(memalloc_socks);
291
Mel Gorman7cb02402012-07-31 16:44:16 -0700292/**
293 * sk_set_memalloc - sets %SOCK_MEMALLOC
294 * @sk: socket to set it on
295 *
296 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297 * It's the responsibility of the admin to adjust min_free_kbytes
298 * to meet the requirements
299 */
300void sk_set_memalloc(struct sock *sk)
301{
302 sock_set_flag(sk, SOCK_MEMALLOC);
303 sk->sk_allocation |= __GFP_MEMALLOC;
Mel Gormanc93bdd02012-07-31 16:44:19 -0700304 static_key_slow_inc(&memalloc_socks);
Mel Gorman7cb02402012-07-31 16:44:16 -0700305}
306EXPORT_SYMBOL_GPL(sk_set_memalloc);
307
308void sk_clear_memalloc(struct sock *sk)
309{
310 sock_reset_flag(sk, SOCK_MEMALLOC);
311 sk->sk_allocation &= ~__GFP_MEMALLOC;
Mel Gormanc93bdd02012-07-31 16:44:19 -0700312 static_key_slow_dec(&memalloc_socks);
Mel Gormanc76562b2012-07-31 16:44:41 -0700313
314 /*
315 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
Mel Gorman5d753612015-06-10 21:02:04 -0400316 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 * it has rmem allocations due to the last swapfile being deactivated
318 * but there is a risk that the socket is unusable due to exceeding
319 * the rmem limits. Reclaim the reserves and obey rmem limits again.
Mel Gormanc76562b2012-07-31 16:44:41 -0700320 */
Mel Gorman5d753612015-06-10 21:02:04 -0400321 sk_mem_reclaim(sk);
Mel Gorman7cb02402012-07-31 16:44:16 -0700322}
323EXPORT_SYMBOL_GPL(sk_clear_memalloc);
324
Mel Gormanb4b9e352012-07-31 16:44:26 -0700325int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326{
327 int ret;
328 unsigned long pflags = current->flags;
329
330 /* these should have been dropped before queueing */
331 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332
333 current->flags |= PF_MEMALLOC;
334 ret = sk->sk_backlog_rcv(sk, skb);
335 tsk_restore_flags(current, pflags, PF_MEMALLOC);
336
337 return ret;
338}
339EXPORT_SYMBOL(__sk_backlog_rcv);
340
Linus Torvalds1da177e2005-04-16 15:20:36 -0700341static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
342{
343 struct timeval tv;
344
345 if (optlen < sizeof(tv))
346 return -EINVAL;
347 if (copy_from_user(&tv, optval, sizeof(tv)))
348 return -EFAULT;
Vasily Averinba780732007-05-24 16:58:54 -0700349 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
350 return -EDOM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351
Vasily Averinba780732007-05-24 16:58:54 -0700352 if (tv.tv_sec < 0) {
Andrew Morton6f11df82007-07-09 13:16:00 -0700353 static int warned __read_mostly;
354
Vasily Averinba780732007-05-24 16:58:54 -0700355 *timeo_p = 0;
Ilpo Järvinen50aab542008-05-02 16:20:10 -0700356 if (warned < 10 && net_ratelimit()) {
Vasily Averinba780732007-05-24 16:58:54 -0700357 warned++;
Joe Perchese005d192012-05-16 19:58:40 +0000358 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
359 __func__, current->comm, task_pid_nr(current));
Ilpo Järvinen50aab542008-05-02 16:20:10 -0700360 }
Vasily Averinba780732007-05-24 16:58:54 -0700361 return 0;
362 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700363 *timeo_p = MAX_SCHEDULE_TIMEOUT;
364 if (tv.tv_sec == 0 && tv.tv_usec == 0)
365 return 0;
366 if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
367 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
368 return 0;
369}
370
371static void sock_warn_obsolete_bsdism(const char *name)
372{
373 static int warned;
374 static char warncomm[TASK_COMM_LEN];
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900375 if (strcmp(warncomm, current->comm) && warned < 5) {
376 strcpy(warncomm, current->comm);
Joe Perchese005d192012-05-16 19:58:40 +0000377 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
378 warncomm, name);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700379 warned++;
380 }
381}
382
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +0100383static bool sock_needs_netstamp(const struct sock *sk)
384{
385 switch (sk->sk_family) {
386 case AF_UNSPEC:
387 case AF_UNIX:
388 return false;
389 default:
390 return true;
391 }
392}
393
Eric Dumazet08e29af2011-11-28 12:04:18 +0000394static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900395{
Eric Dumazet08e29af2011-11-28 12:04:18 +0000396 if (sk->sk_flags & flags) {
397 sk->sk_flags &= ~flags;
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +0100398 if (sock_needs_netstamp(sk) &&
399 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +0000400 net_disable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700401 }
402}
403
404
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800405int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
406{
Eric Dumazet766e90372009-10-14 20:40:11 -0700407 int err;
Neil Horman3b885782009-10-12 13:26:31 -0700408 unsigned long flags;
409 struct sk_buff_head *list = &sk->sk_receive_queue;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800410
Eric Dumazet0fd7bac2011-12-21 07:11:44 +0000411 if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
Eric Dumazet766e90372009-10-14 20:40:11 -0700412 atomic_inc(&sk->sk_drops);
Satoru Moriya3847ce32011-06-17 12:00:03 +0000413 trace_sock_rcvqueue_full(sk, skb);
Eric Dumazet766e90372009-10-14 20:40:11 -0700414 return -ENOMEM;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800415 }
416
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700417 err = sk_filter(sk, skb);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800418 if (err)
Eric Dumazet766e90372009-10-14 20:40:11 -0700419 return err;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800420
Mel Gormanc76562b2012-07-31 16:44:41 -0700421 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
Eric Dumazet766e90372009-10-14 20:40:11 -0700422 atomic_inc(&sk->sk_drops);
423 return -ENOBUFS;
Hideo Aoki3ab224b2007-12-31 00:11:19 -0800424 }
425
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800426 skb->dev = NULL;
427 skb_set_owner_r(skb, sk);
David S. Miller49ad9592008-12-17 22:11:38 -0800428
Eric Dumazet7fee2262010-05-11 23:19:48 +0000429 /* we escape from rcu protected region, make sure we dont leak
430 * a norefcounted dst
431 */
432 skb_dst_force(skb);
433
Neil Horman3b885782009-10-12 13:26:31 -0700434 spin_lock_irqsave(&list->lock, flags);
Eyal Birger3bc3b962015-03-01 14:58:30 +0200435 sock_skb_set_dropcount(sk, skb);
Neil Horman3b885782009-10-12 13:26:31 -0700436 __skb_queue_tail(list, skb);
437 spin_unlock_irqrestore(&list->lock, flags);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800438
439 if (!sock_flag(sk, SOCK_DEAD))
David S. Miller676d2362014-04-11 16:15:36 -0400440 sk->sk_data_ready(sk);
Eric Dumazet766e90372009-10-14 20:40:11 -0700441 return 0;
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800442}
443EXPORT_SYMBOL(sock_queue_rcv_skb);
444
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200445int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800446{
447 int rc = NET_RX_SUCCESS;
448
Dmitry Mishinfda9ef52006-08-31 15:28:39 -0700449 if (sk_filter(sk, skb))
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800450 goto discard_and_relse;
451
452 skb->dev = NULL;
453
Sorin Dumitru274f4822014-07-22 21:16:51 +0300454 if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
Eric Dumazetc3774112010-04-27 15:13:20 -0700455 atomic_inc(&sk->sk_drops);
456 goto discard_and_relse;
457 }
Arnaldo Carvalho de Melo58a5a7b2006-11-16 14:06:06 -0200458 if (nested)
459 bh_lock_sock_nested(sk);
460 else
461 bh_lock_sock(sk);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700462 if (!sock_owned_by_user(sk)) {
463 /*
464 * trylock + unlock semantics:
465 */
466 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
467
Peter Zijlstrac57943a2008-10-07 14:18:42 -0700468 rc = sk_backlog_rcv(sk, skb);
Ingo Molnara5b5bb92006-07-03 00:25:35 -0700469
470 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
Eric Dumazetf545a382012-04-22 23:34:26 +0000471 } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
Zhu Yi8eae9392010-03-04 18:01:40 +0000472 bh_unlock_sock(sk);
473 atomic_inc(&sk->sk_drops);
474 goto discard_and_relse;
475 }
476
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800477 bh_unlock_sock(sk);
478out:
479 sock_put(sk);
480 return rc;
481discard_and_relse:
482 kfree_skb(skb);
483 goto out;
484}
485EXPORT_SYMBOL(sk_receive_skb);
486
487struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
488{
Eric Dumazetb6c67122010-04-08 23:03:29 +0000489 struct dst_entry *dst = __sk_dst_get(sk);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800490
491 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
Krishna Kumare022f0b2009-10-19 23:46:20 +0000492 sk_tx_queue_clear(sk);
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +0000493 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
Denis Vlasenkof0088a52006-03-28 01:08:21 -0800494 dst_release(dst);
495 return NULL;
496 }
497
498 return dst;
499}
500EXPORT_SYMBOL(__sk_dst_check);
501
502struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
503{
504 struct dst_entry *dst = sk_dst_get(sk);
505
506 if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
507 sk_dst_reset(sk);
508 dst_release(dst);
509 return NULL;
510 }
511
512 return dst;
513}
514EXPORT_SYMBOL(sk_dst_check);
515
Brian Haleyc91f6df2012-11-26 05:21:08 +0000516static int sock_setbindtodevice(struct sock *sk, char __user *optval,
517 int optlen)
David S. Miller48788092007-09-14 16:41:03 -0700518{
519 int ret = -ENOPROTOOPT;
520#ifdef CONFIG_NETDEVICES
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +0900521 struct net *net = sock_net(sk);
David S. Miller48788092007-09-14 16:41:03 -0700522 char devname[IFNAMSIZ];
523 int index;
524
525 /* Sorry... */
526 ret = -EPERM;
Eric W. Biederman5e1fccc2012-11-16 03:03:04 +0000527 if (!ns_capable(net->user_ns, CAP_NET_RAW))
David S. Miller48788092007-09-14 16:41:03 -0700528 goto out;
529
530 ret = -EINVAL;
531 if (optlen < 0)
532 goto out;
533
534 /* Bind this socket to a particular device like "eth0",
535 * as specified in the passed interface name. If the
536 * name is "" or the option length is zero the socket
537 * is not bound.
538 */
539 if (optlen > IFNAMSIZ - 1)
540 optlen = IFNAMSIZ - 1;
541 memset(devname, 0, sizeof(devname));
542
543 ret = -EFAULT;
544 if (copy_from_user(devname, optval, optlen))
545 goto out;
546
David S. Miller000ba2e2009-11-05 22:37:11 -0800547 index = 0;
548 if (devname[0] != '\0') {
Eric Dumazetbf8e56b2009-11-05 21:03:39 -0800549 struct net_device *dev;
David S. Miller48788092007-09-14 16:41:03 -0700550
Eric Dumazetbf8e56b2009-11-05 21:03:39 -0800551 rcu_read_lock();
552 dev = dev_get_by_name_rcu(net, devname);
553 if (dev)
554 index = dev->ifindex;
555 rcu_read_unlock();
David S. Miller48788092007-09-14 16:41:03 -0700556 ret = -ENODEV;
557 if (!dev)
558 goto out;
David S. Miller48788092007-09-14 16:41:03 -0700559 }
560
561 lock_sock(sk);
562 sk->sk_bound_dev_if = index;
563 sk_dst_reset(sk);
564 release_sock(sk);
565
566 ret = 0;
567
568out:
569#endif
570
571 return ret;
572}
573
Brian Haleyc91f6df2012-11-26 05:21:08 +0000574static int sock_getbindtodevice(struct sock *sk, char __user *optval,
575 int __user *optlen, int len)
576{
577 int ret = -ENOPROTOOPT;
578#ifdef CONFIG_NETDEVICES
579 struct net *net = sock_net(sk);
Brian Haleyc91f6df2012-11-26 05:21:08 +0000580 char devname[IFNAMSIZ];
Brian Haleyc91f6df2012-11-26 05:21:08 +0000581
582 if (sk->sk_bound_dev_if == 0) {
583 len = 0;
584 goto zero;
585 }
586
587 ret = -EINVAL;
588 if (len < IFNAMSIZ)
589 goto out;
590
Nicolas Schichan5dbe7c12013-06-26 17:23:42 +0200591 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
592 if (ret)
Brian Haleyc91f6df2012-11-26 05:21:08 +0000593 goto out;
Brian Haleyc91f6df2012-11-26 05:21:08 +0000594
595 len = strlen(devname) + 1;
596
597 ret = -EFAULT;
598 if (copy_to_user(optval, devname, len))
599 goto out;
600
601zero:
602 ret = -EFAULT;
603 if (put_user(len, optlen))
604 goto out;
605
606 ret = 0;
607
608out:
609#endif
610
611 return ret;
612}
613
/* Set socket flag @bit when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
621
hannes@stressinduktion.orgf60e5992015-04-01 17:07:44 +0200622bool sk_mc_loop(struct sock *sk)
623{
624 if (dev_recursion_level())
625 return false;
626 if (!sk)
627 return true;
628 switch (sk->sk_family) {
629 case AF_INET:
630 return inet_sk(sk)->mc_loop;
631#if IS_ENABLED(CONFIG_IPV6)
632 case AF_INET6:
633 return inet6_sk(sk)->mc_loop;
634#endif
635 }
636 WARN_ON(1);
637 return true;
638}
639EXPORT_SYMBOL(sk_mc_loop);
640
Linus Torvalds1da177e2005-04-16 15:20:36 -0700641/*
642 * This is meant for all protocols to use and covers goings on
643 * at the socket level. Everything here is generic.
644 */
645
646int sock_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -0700647 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700648{
Eric Dumazet2a915252009-05-27 11:30:05 +0000649 struct sock *sk = sock->sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700650 int val;
651 int valbool;
652 struct linger ling;
653 int ret = 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900654
Linus Torvalds1da177e2005-04-16 15:20:36 -0700655 /*
656 * Options without arguments
657 */
658
David S. Miller48788092007-09-14 16:41:03 -0700659 if (optname == SO_BINDTODEVICE)
Brian Haleyc91f6df2012-11-26 05:21:08 +0000660 return sock_setbindtodevice(sk, optval, optlen);
David S. Miller48788092007-09-14 16:41:03 -0700661
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700662 if (optlen < sizeof(int))
663 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900664
Linus Torvalds1da177e2005-04-16 15:20:36 -0700665 if (get_user(val, (int __user *)optval))
666 return -EFAULT;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900667
Eric Dumazet2a915252009-05-27 11:30:05 +0000668 valbool = val ? 1 : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700669
670 lock_sock(sk);
671
Eric Dumazet2a915252009-05-27 11:30:05 +0000672 switch (optname) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700673 case SO_DEBUG:
Eric Dumazet2a915252009-05-27 11:30:05 +0000674 if (val && !capable(CAP_NET_ADMIN))
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700675 ret = -EACCES;
Eric Dumazet2a915252009-05-27 11:30:05 +0000676 else
Pavel Emelyanovc0ef8772007-11-15 03:03:19 -0800677 sock_valbool_flag(sk, SOCK_DBG, valbool);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700678 break;
679 case SO_REUSEADDR:
Pavel Emelyanov4a17fd52012-04-19 03:39:36 +0000680 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700681 break;
Tom Herbert055dc212013-01-22 09:49:50 +0000682 case SO_REUSEPORT:
683 sk->sk_reuseport = valbool;
684 break;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700685 case SO_TYPE:
Jan Engelhardt49c794e2009-08-04 07:28:28 +0000686 case SO_PROTOCOL:
Jan Engelhardt0d6038e2009-08-04 07:28:29 +0000687 case SO_DOMAIN:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700688 case SO_ERROR:
689 ret = -ENOPROTOOPT;
690 break;
691 case SO_DONTROUTE:
Pavel Emelyanovc0ef8772007-11-15 03:03:19 -0800692 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700693 break;
694 case SO_BROADCAST:
695 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
696 break;
697 case SO_SNDBUF:
698 /* Don't error on this BSD doesn't and if you think
Eric Dumazet82981932012-04-26 20:07:59 +0000699 * about it this is right. Otherwise apps have to
700 * play 'guess the biggest size' games. RCVBUF/SNDBUF
701 * are treated in BSD as hints
702 */
703 val = min_t(u32, val, sysctl_wmem_max);
Patrick McHardyb0573de2005-08-09 19:30:51 -0700704set_sndbuf:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700705 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
Eric Dumazet82981932012-04-26 20:07:59 +0000706 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
707 /* Wake up sending tasks if we upped the value. */
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700708 sk->sk_write_space(sk);
709 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700710
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700711 case SO_SNDBUFFORCE:
712 if (!capable(CAP_NET_ADMIN)) {
713 ret = -EPERM;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700714 break;
715 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700716 goto set_sndbuf;
717
718 case SO_RCVBUF:
719 /* Don't error on this BSD doesn't and if you think
Eric Dumazet82981932012-04-26 20:07:59 +0000720 * about it this is right. Otherwise apps have to
721 * play 'guess the biggest size' games. RCVBUF/SNDBUF
722 * are treated in BSD as hints
723 */
724 val = min_t(u32, val, sysctl_rmem_max);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700725set_rcvbuf:
726 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
727 /*
728 * We double it on the way in to account for
729 * "struct sk_buff" etc. overhead. Applications
730 * assume that the SO_RCVBUF setting they make will
731 * allow that much actual data to be received on that
732 * socket.
733 *
734 * Applications are unaware that "struct sk_buff" and
735 * other overheads allocate from the receive buffer
736 * during socket buffer allocation.
737 *
738 * And after considering the possible alternatives,
739 * returning the value we actually used in getsockopt
740 * is the most desirable behavior.
741 */
Eric Dumazet82981932012-04-26 20:07:59 +0000742 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700743 break;
744
745 case SO_RCVBUFFORCE:
746 if (!capable(CAP_NET_ADMIN)) {
747 ret = -EPERM;
748 break;
749 }
750 goto set_rcvbuf;
751
752 case SO_KEEPALIVE:
753#ifdef CONFIG_INET
Eric Dumazet3e109862012-09-24 07:00:11 +0000754 if (sk->sk_protocol == IPPROTO_TCP &&
755 sk->sk_type == SOCK_STREAM)
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700756 tcp_set_keepalive(sk, valbool);
757#endif
758 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
759 break;
760
761 case SO_OOBINLINE:
762 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
763 break;
764
765 case SO_NO_CHECK:
Tom Herbert28448b82014-05-23 08:47:19 -0700766 sk->sk_no_check_tx = valbool;
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700767 break;
768
769 case SO_PRIORITY:
Eric W. Biederman5e1fccc2012-11-16 03:03:04 +0000770 if ((val >= 0 && val <= 6) ||
771 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700772 sk->sk_priority = val;
773 else
774 ret = -EPERM;
775 break;
776
777 case SO_LINGER:
778 if (optlen < sizeof(ling)) {
779 ret = -EINVAL; /* 1003.1g */
780 break;
781 }
Eric Dumazet2a915252009-05-27 11:30:05 +0000782 if (copy_from_user(&ling, optval, sizeof(ling))) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700783 ret = -EFAULT;
784 break;
785 }
786 if (!ling.l_onoff)
787 sock_reset_flag(sk, SOCK_LINGER);
788 else {
789#if (BITS_PER_LONG == 32)
790 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
791 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
792 else
793#endif
794 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
795 sock_set_flag(sk, SOCK_LINGER);
796 }
797 break;
798
799 case SO_BSDCOMPAT:
800 sock_warn_obsolete_bsdism("setsockopt");
801 break;
802
803 case SO_PASSCRED:
804 if (valbool)
805 set_bit(SOCK_PASSCRED, &sock->flags);
806 else
807 clear_bit(SOCK_PASSCRED, &sock->flags);
808 break;
809
810 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700811 case SO_TIMESTAMPNS:
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700812 if (valbool) {
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700813 if (optname == SO_TIMESTAMP)
814 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
815 else
816 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700817 sock_set_flag(sk, SOCK_RCVTSTAMP);
Patrick Ohly20d49472009-02-12 05:03:38 +0000818 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700819 } else {
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700820 sock_reset_flag(sk, SOCK_RCVTSTAMP);
Eric Dumazet92f37fd2007-03-25 22:14:49 -0700821 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
822 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700823 break;
824
Patrick Ohly20d49472009-02-12 05:03:38 +0000825 case SO_TIMESTAMPING:
826 if (val & ~SOF_TIMESTAMPING_MASK) {
Rémi Denis-Courmontf249fb72009-07-20 00:47:04 +0000827 ret = -EINVAL;
Patrick Ohly20d49472009-02-12 05:03:38 +0000828 break;
829 }
Willem de Bruijnb245be12015-01-30 13:29:32 -0500830
Willem de Bruijn09c2d252014-08-04 22:11:47 -0400831 if (val & SOF_TIMESTAMPING_OPT_ID &&
Willem de Bruijn4ed2d762014-08-04 22:11:49 -0400832 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
WANG Congac5cc972015-12-16 23:39:04 -0800833 if (sk->sk_protocol == IPPROTO_TCP &&
834 sk->sk_type == SOCK_STREAM) {
Willem de Bruijn4ed2d762014-08-04 22:11:49 -0400835 if (sk->sk_state != TCP_ESTABLISHED) {
836 ret = -EINVAL;
837 break;
838 }
839 sk->sk_tskey = tcp_sk(sk)->snd_una;
840 } else {
841 sk->sk_tskey = 0;
842 }
843 }
Willem de Bruijnb9f40e22014-08-04 22:11:46 -0400844 sk->sk_tsflags = val;
Patrick Ohly20d49472009-02-12 05:03:38 +0000845 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
846 sock_enable_timestamp(sk,
847 SOCK_TIMESTAMPING_RX_SOFTWARE);
848 else
849 sock_disable_timestamp(sk,
Eric Dumazet08e29af2011-11-28 12:04:18 +0000850 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
Patrick Ohly20d49472009-02-12 05:03:38 +0000851 break;
852
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700853 case SO_RCVLOWAT:
854 if (val < 0)
855 val = INT_MAX;
856 sk->sk_rcvlowat = val ? : 1;
857 break;
858
859 case SO_RCVTIMEO:
860 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
861 break;
862
863 case SO_SNDTIMEO:
864 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
865 break;
866
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700867 case SO_ATTACH_FILTER:
868 ret = -EINVAL;
869 if (optlen == sizeof(struct sock_fprog)) {
870 struct sock_fprog fprog;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700871
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700872 ret = -EFAULT;
873 if (copy_from_user(&fprog, optval, sizeof(fprog)))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700874 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700875
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700876 ret = sk_attach_filter(&fprog, sk);
877 }
878 break;
879
Alexei Starovoitov89aa0752014-12-01 15:06:35 -0800880 case SO_ATTACH_BPF:
881 ret = -EINVAL;
882 if (optlen == sizeof(u32)) {
883 u32 ufd;
884
885 ret = -EFAULT;
886 if (copy_from_user(&ufd, optval, sizeof(ufd)))
887 break;
888
889 ret = sk_attach_bpf(ufd, sk);
890 }
891 break;
892
Craig Gallek538950a2016-01-04 17:41:47 -0500893 case SO_ATTACH_REUSEPORT_CBPF:
894 ret = -EINVAL;
895 if (optlen == sizeof(struct sock_fprog)) {
896 struct sock_fprog fprog;
897
898 ret = -EFAULT;
899 if (copy_from_user(&fprog, optval, sizeof(fprog)))
900 break;
901
902 ret = sk_reuseport_attach_filter(&fprog, sk);
903 }
904 break;
905
906 case SO_ATTACH_REUSEPORT_EBPF:
907 ret = -EINVAL;
908 if (optlen == sizeof(u32)) {
909 u32 ufd;
910
911 ret = -EFAULT;
912 if (copy_from_user(&ufd, optval, sizeof(ufd)))
913 break;
914
915 ret = sk_reuseport_attach_bpf(ufd, sk);
916 }
917 break;
918
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700919 case SO_DETACH_FILTER:
Pavel Emelyanov55b33322007-10-17 21:21:26 -0700920 ret = sk_detach_filter(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700921 break;
922
Vincent Bernatd59577b2013-01-16 22:55:49 +0100923 case SO_LOCK_FILTER:
924 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
925 ret = -EPERM;
926 else
927 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
928 break;
929
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700930 case SO_PASSSEC:
931 if (valbool)
932 set_bit(SOCK_PASSSEC, &sock->flags);
933 else
934 clear_bit(SOCK_PASSSEC, &sock->flags);
935 break;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800936 case SO_MARK:
Eric W. Biederman5e1fccc2012-11-16 03:03:04 +0000937 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800938 ret = -EPERM;
Eric Dumazet2a915252009-05-27 11:30:05 +0000939 else
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800940 sk->sk_mark = val;
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -0800941 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -0700942
Neil Horman3b885782009-10-12 13:26:31 -0700943 case SO_RXQ_OVFL:
Johannes Berg8083f0f2011-10-07 03:30:20 +0000944 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
Neil Horman3b885782009-10-12 13:26:31 -0700945 break;
Johannes Berg6e3e9392011-11-09 10:15:42 +0100946
947 case SO_WIFI_STATUS:
948 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
949 break;
950
Pavel Emelyanovef64a542012-02-21 07:31:34 +0000951 case SO_PEEK_OFF:
952 if (sock->ops->set_peek_off)
Sasha Levin12663bf2013-12-07 17:26:27 -0500953 ret = sock->ops->set_peek_off(sk, val);
Pavel Emelyanovef64a542012-02-21 07:31:34 +0000954 else
955 ret = -EOPNOTSUPP;
956 break;
Ben Greear3bdc0eb2012-02-11 15:39:30 +0000957
958 case SO_NOFCS:
959 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
960 break;
961
Keller, Jacob E7d4c04f2013-03-28 11:19:25 +0000962 case SO_SELECT_ERR_QUEUE:
963 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
964 break;
965
Cong Wange0d10952013-08-01 11:10:25 +0800966#ifdef CONFIG_NET_RX_BUSY_POLL
Eliezer Tamir64b0dc52013-07-10 17:13:36 +0300967 case SO_BUSY_POLL:
Eliezer Tamirdafcc432013-06-14 16:33:57 +0300968 /* allow unprivileged users to decrease the value */
969 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
970 ret = -EPERM;
971 else {
972 if (val < 0)
973 ret = -EINVAL;
974 else
975 sk->sk_ll_usec = val;
976 }
977 break;
978#endif
Eric Dumazet62748f32013-09-24 08:20:52 -0700979
980 case SO_MAX_PACING_RATE:
981 sk->sk_max_pacing_rate = val;
982 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
983 sk->sk_max_pacing_rate);
984 break;
985
Eric Dumazet70da2682015-10-08 19:33:21 -0700986 case SO_INCOMING_CPU:
987 sk->sk_incoming_cpu = val;
988 break;
989
Stephen Hemmingere71a4782007-04-10 20:10:33 -0700990 default:
991 ret = -ENOPROTOOPT;
992 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +0900993 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700994 release_sock(sk);
995 return ret;
996}
Eric Dumazet2a915252009-05-27 11:30:05 +0000997EXPORT_SYMBOL(sock_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700998
999
stephen hemminger8f098982014-01-03 09:17:14 -08001000static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1001 struct ucred *ucred)
Eric W. Biederman3f551f92010-06-13 03:28:59 +00001002{
1003 ucred->pid = pid_vnr(pid);
1004 ucred->uid = ucred->gid = -1;
1005 if (cred) {
1006 struct user_namespace *current_ns = current_user_ns();
1007
Eric W. Biedermanb2e4f542012-05-23 16:39:45 -06001008 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1009 ucred->gid = from_kgid_munged(current_ns, cred->egid);
Eric W. Biederman3f551f92010-06-13 03:28:59 +00001010 }
1011}
1012
Linus Torvalds1da177e2005-04-16 15:20:36 -07001013int sock_getsockopt(struct socket *sock, int level, int optname,
1014 char __user *optval, int __user *optlen)
1015{
1016 struct sock *sk = sock->sk;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001017
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001018 union {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001019 int val;
1020 struct linger ling;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001021 struct timeval tm;
1022 } v;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001023
H Hartley Sweeten4d0392b2010-01-15 01:08:58 -08001024 int lv = sizeof(int);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001025 int len;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001026
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001027 if (get_user(len, optlen))
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001028 return -EFAULT;
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001029 if (len < 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001030 return -EINVAL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001031
Eugene Teo50fee1d2009-02-23 15:38:41 -08001032 memset(&v, 0, sizeof(v));
Clément Lecignedf0bca02009-02-12 16:59:09 -08001033
Eric Dumazet2a915252009-05-27 11:30:05 +00001034 switch (optname) {
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001035 case SO_DEBUG:
1036 v.val = sock_flag(sk, SOCK_DBG);
1037 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001038
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001039 case SO_DONTROUTE:
1040 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1041 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001042
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001043 case SO_BROADCAST:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001044 v.val = sock_flag(sk, SOCK_BROADCAST);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001045 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001046
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001047 case SO_SNDBUF:
1048 v.val = sk->sk_sndbuf;
1049 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001050
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001051 case SO_RCVBUF:
1052 v.val = sk->sk_rcvbuf;
1053 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001054
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001055 case SO_REUSEADDR:
1056 v.val = sk->sk_reuse;
1057 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001058
Tom Herbert055dc212013-01-22 09:49:50 +00001059 case SO_REUSEPORT:
1060 v.val = sk->sk_reuseport;
1061 break;
1062
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001063 case SO_KEEPALIVE:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001064 v.val = sock_flag(sk, SOCK_KEEPOPEN);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001065 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001066
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001067 case SO_TYPE:
1068 v.val = sk->sk_type;
1069 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001070
Jan Engelhardt49c794e2009-08-04 07:28:28 +00001071 case SO_PROTOCOL:
1072 v.val = sk->sk_protocol;
1073 break;
1074
Jan Engelhardt0d6038e2009-08-04 07:28:29 +00001075 case SO_DOMAIN:
1076 v.val = sk->sk_family;
1077 break;
1078
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001079 case SO_ERROR:
1080 v.val = -sock_error(sk);
Eric Dumazet2a915252009-05-27 11:30:05 +00001081 if (v.val == 0)
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001082 v.val = xchg(&sk->sk_err_soft, 0);
1083 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001084
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001085 case SO_OOBINLINE:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001086 v.val = sock_flag(sk, SOCK_URGINLINE);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001087 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001088
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001089 case SO_NO_CHECK:
Tom Herbert28448b82014-05-23 08:47:19 -07001090 v.val = sk->sk_no_check_tx;
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001091 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001092
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001093 case SO_PRIORITY:
1094 v.val = sk->sk_priority;
1095 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001096
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001097 case SO_LINGER:
1098 lv = sizeof(v.ling);
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001099 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001100 v.ling.l_linger = sk->sk_lingertime / HZ;
1101 break;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001102
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001103 case SO_BSDCOMPAT:
1104 sock_warn_obsolete_bsdism("getsockopt");
1105 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001106
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001107 case SO_TIMESTAMP:
Eric Dumazet92f37fd2007-03-25 22:14:49 -07001108 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1109 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1110 break;
1111
1112 case SO_TIMESTAMPNS:
1113 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001114 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001115
Patrick Ohly20d49472009-02-12 05:03:38 +00001116 case SO_TIMESTAMPING:
Willem de Bruijnb9f40e22014-08-04 22:11:46 -04001117 v.val = sk->sk_tsflags;
Patrick Ohly20d49472009-02-12 05:03:38 +00001118 break;
1119
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001120 case SO_RCVTIMEO:
Eric Dumazet2a915252009-05-27 11:30:05 +00001121 lv = sizeof(struct timeval);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001122 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1123 v.tm.tv_sec = 0;
1124 v.tm.tv_usec = 0;
1125 } else {
1126 v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1127 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001128 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001129 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001130
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001131 case SO_SNDTIMEO:
Eric Dumazet2a915252009-05-27 11:30:05 +00001132 lv = sizeof(struct timeval);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001133 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1134 v.tm.tv_sec = 0;
1135 v.tm.tv_usec = 0;
1136 } else {
1137 v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1138 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1139 }
1140 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001141
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001142 case SO_RCVLOWAT:
1143 v.val = sk->sk_rcvlowat;
1144 break;
Catherine Zhang877ce7c2006-06-29 12:27:47 -07001145
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001146 case SO_SNDLOWAT:
Eric Dumazet2a915252009-05-27 11:30:05 +00001147 v.val = 1;
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001148 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001149
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001150 case SO_PASSCRED:
Eric Dumazet82981932012-04-26 20:07:59 +00001151 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001152 break;
1153
1154 case SO_PEERCRED:
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001155 {
1156 struct ucred peercred;
1157 if (len > sizeof(peercred))
1158 len = sizeof(peercred);
1159 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1160 if (copy_to_user(optval, &peercred, len))
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001161 return -EFAULT;
1162 goto lenout;
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001163 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001164
1165 case SO_PEERNAME:
1166 {
1167 char address[128];
1168
1169 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1170 return -ENOTCONN;
1171 if (lv < len)
1172 return -EINVAL;
1173 if (copy_to_user(optval, address, len))
1174 return -EFAULT;
1175 goto lenout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001176 }
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001177
1178 /* Dubious BSD thing... Probably nobody even uses it, but
1179 * the UNIX standard wants it for whatever reason... -DaveM
1180 */
1181 case SO_ACCEPTCONN:
1182 v.val = sk->sk_state == TCP_LISTEN;
1183 break;
1184
1185 case SO_PASSSEC:
Eric Dumazet82981932012-04-26 20:07:59 +00001186 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001187 break;
1188
1189 case SO_PEERSEC:
1190 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1191
Laszlo Attila Toth4a19ec52008-01-30 19:08:16 -08001192 case SO_MARK:
1193 v.val = sk->sk_mark;
1194 break;
1195
Neil Horman3b885782009-10-12 13:26:31 -07001196 case SO_RXQ_OVFL:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001197 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
Neil Horman3b885782009-10-12 13:26:31 -07001198 break;
1199
Johannes Berg6e3e9392011-11-09 10:15:42 +01001200 case SO_WIFI_STATUS:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001201 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
Johannes Berg6e3e9392011-11-09 10:15:42 +01001202 break;
1203
Pavel Emelyanovef64a542012-02-21 07:31:34 +00001204 case SO_PEEK_OFF:
1205 if (!sock->ops->set_peek_off)
1206 return -EOPNOTSUPP;
1207
1208 v.val = sk->sk_peek_off;
1209 break;
David S. Millerbc2f7992012-02-24 14:48:34 -05001210 case SO_NOFCS:
Eric Dumazet1b23a5d2012-05-16 05:57:07 +00001211 v.val = sock_flag(sk, SOCK_NOFCS);
David S. Millerbc2f7992012-02-24 14:48:34 -05001212 break;
Brian Haleyc91f6df2012-11-26 05:21:08 +00001213
Pavel Emelyanovf7b86bf2012-10-18 23:55:56 +00001214 case SO_BINDTODEVICE:
Brian Haleyc91f6df2012-11-26 05:21:08 +00001215 return sock_getbindtodevice(sk, optval, optlen, len);
1216
Pavel Emelyanova8fc9272012-11-01 02:01:48 +00001217 case SO_GET_FILTER:
1218 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1219 if (len < 0)
1220 return len;
1221
1222 goto lenout;
Brian Haleyc91f6df2012-11-26 05:21:08 +00001223
Vincent Bernatd59577b2013-01-16 22:55:49 +01001224 case SO_LOCK_FILTER:
1225 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1226 break;
1227
Michal Sekletarea02f942014-01-17 17:09:45 +01001228 case SO_BPF_EXTENSIONS:
1229 v.val = bpf_tell_extensions();
1230 break;
1231
Keller, Jacob E7d4c04f2013-03-28 11:19:25 +00001232 case SO_SELECT_ERR_QUEUE:
1233 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1234 break;
1235
Cong Wange0d10952013-08-01 11:10:25 +08001236#ifdef CONFIG_NET_RX_BUSY_POLL
Eliezer Tamir64b0dc52013-07-10 17:13:36 +03001237 case SO_BUSY_POLL:
Eliezer Tamirdafcc432013-06-14 16:33:57 +03001238 v.val = sk->sk_ll_usec;
1239 break;
1240#endif
1241
Eric Dumazet62748f32013-09-24 08:20:52 -07001242 case SO_MAX_PACING_RATE:
1243 v.val = sk->sk_max_pacing_rate;
1244 break;
1245
Eric Dumazet2c8c56e2014-11-11 05:54:28 -08001246 case SO_INCOMING_CPU:
1247 v.val = sk->sk_incoming_cpu;
1248 break;
1249
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001250 default:
YOSHIFUJI Hideaki/吉藤英明443b5992015-03-23 18:04:13 +09001251 /* We implement the SO_SNDLOWAT etc to not be settable
1252 * (1003.1g 7).
1253 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001254 return -ENOPROTOOPT;
1255 }
1256
Linus Torvalds1da177e2005-04-16 15:20:36 -07001257 if (len > lv)
1258 len = lv;
1259 if (copy_to_user(optval, &v, len))
1260 return -EFAULT;
1261lenout:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001262 if (put_user(len, optlen))
1263 return -EFAULT;
1264 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001265}
1266
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001267/*
1268 * Initialize an sk_lock.
1269 *
1270 * (We also register the sk_lock with the lock validator.)
1271 */
Dave Jonesb6f99a22007-03-22 12:27:49 -07001272static inline void sock_lock_init(struct sock *sk)
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001273{
Peter Zijlstraed075362006-12-06 20:35:24 -08001274 sock_lock_init_class_and_name(sk,
1275 af_family_slock_key_strings[sk->sk_family],
1276 af_family_slock_keys + sk->sk_family,
1277 af_family_key_strings[sk->sk_family],
1278 af_family_keys + sk->sk_family);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07001279}
1280
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001281/*
1282 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1283 * even temporarly, because of RCU lookups. sk_node should also be left as is.
Eric Dumazet68835ab2010-11-30 19:04:07 +00001284 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001285 */
Pavel Emelyanovf1a6c4d2007-11-01 00:29:45 -07001286static void sock_copy(struct sock *nsk, const struct sock *osk)
1287{
1288#ifdef CONFIG_SECURITY_NETWORK
1289 void *sptr = nsk->sk_security;
1290#endif
Eric Dumazet68835ab2010-11-30 19:04:07 +00001291 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1292
1293 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1294 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1295
Pavel Emelyanovf1a6c4d2007-11-01 00:29:45 -07001296#ifdef CONFIG_SECURITY_NETWORK
1297 nsk->sk_security = sptr;
1298 security_sk_clone(osk, nsk);
1299#endif
1300}
1301
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001302void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1303{
1304 unsigned long nulls1, nulls2;
1305
1306 nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1307 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1308 if (nulls1 > nulls2)
1309 swap(nulls1, nulls2);
1310
1311 if (nulls1 != 0)
1312 memset((char *)sk, 0, nulls1);
1313 memset((char *)sk + nulls1 + sizeof(void *), 0,
1314 nulls2 - nulls1 - sizeof(void *));
1315 memset((char *)sk + nulls2 + sizeof(void *), 0,
1316 size - nulls2 - sizeof(void *));
1317}
1318EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1319
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001320static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1321 int family)
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001322{
1323 struct sock *sk;
1324 struct kmem_cache *slab;
1325
1326 slab = prot->slab;
Eric Dumazete912b112009-07-08 19:36:05 +00001327 if (slab != NULL) {
1328 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1329 if (!sk)
1330 return sk;
1331 if (priority & __GFP_ZERO) {
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001332 if (prot->clear_sk)
1333 prot->clear_sk(sk, prot->obj_size);
1334 else
1335 sk_prot_clear_nulls(sk, prot->obj_size);
Eric Dumazete912b112009-07-08 19:36:05 +00001336 }
Octavian Purdilafcbdf092010-12-16 14:26:56 -08001337 } else
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001338 sk = kmalloc(prot->obj_size, priority);
1339
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001340 if (sk != NULL) {
Vegard Nossuma98b65a2009-02-26 14:46:57 +01001341 kmemcheck_annotate_bitfield(sk, flags);
1342
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001343 if (security_sk_alloc(sk, family, priority))
1344 goto out_free;
1345
1346 if (!try_module_get(prot->owner))
1347 goto out_free_sec;
Krishna Kumare022f0b2009-10-19 23:46:20 +00001348 sk_tx_queue_clear(sk);
Tejun Heobd1060a2015-12-07 17:38:53 -05001349 cgroup_sk_alloc(&sk->sk_cgrp_data);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001350 }
1351
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001352 return sk;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001353
1354out_free_sec:
1355 security_sk_free(sk);
1356out_free:
1357 if (slab != NULL)
1358 kmem_cache_free(slab, sk);
1359 else
1360 kfree(sk);
1361 return NULL;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001362}
1363
1364static void sk_prot_free(struct proto *prot, struct sock *sk)
1365{
1366 struct kmem_cache *slab;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001367 struct module *owner;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001368
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001369 owner = prot->owner;
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001370 slab = prot->slab;
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001371
Tejun Heobd1060a2015-12-07 17:38:53 -05001372 cgroup_sk_free(&sk->sk_cgrp_data);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001373 security_sk_free(sk);
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001374 if (slab != NULL)
1375 kmem_cache_free(slab, sk);
1376 else
1377 kfree(sk);
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001378 module_put(owner);
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001379}
1380
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381/**
1382 * sk_alloc - All socket objects are allocated here
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001383 * @net: the applicable net namespace
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001384 * @family: protocol family
1385 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1386 * @prot: struct proto associated with this new sock instance
Eric W. Biederman11aa9c22015-05-08 21:09:13 -05001387 * @kern: is this to be a kernel socket?
Linus Torvalds1da177e2005-04-16 15:20:36 -07001388 */
Eric W. Biederman1b8d7ae2007-10-08 23:24:22 -07001389struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
Eric W. Biederman11aa9c22015-05-08 21:09:13 -05001390 struct proto *prot, int kern)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001391{
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001392 struct sock *sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001393
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001394 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 if (sk) {
Pavel Emelyanov154adbc2007-11-01 00:38:43 -07001396 sk->sk_family = family;
1397 /*
1398 * See comment in struct sock definition to understand
1399 * why we need sk_prot_creator -acme
1400 */
1401 sk->sk_prot = sk->sk_prot_creator = prot;
1402 sock_lock_init(sk);
Eric W. Biederman26abe142015-05-08 21:10:31 -05001403 sk->sk_net_refcnt = kern ? 0 : 1;
1404 if (likely(sk->sk_net_refcnt))
1405 get_net(net);
1406 sock_net_set(sk, net);
Jarek Poplawskid66ee052009-08-30 23:15:36 +00001407 atomic_set(&sk->sk_wmem_alloc, 1);
Herbert Xuf8451722010-05-24 00:12:34 -07001408
Tejun Heo2a56a1f2015-12-07 17:38:52 -05001409 sock_update_classid(&sk->sk_cgrp_data);
1410 sock_update_netprioidx(&sk->sk_cgrp_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001411 }
Frank Filza79af592005-09-27 15:23:38 -07001412
Pavel Emelyanov2e4afe72007-11-01 00:36:26 -07001413 return sk;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001414}
Eric Dumazet2a915252009-05-27 11:30:05 +00001415EXPORT_SYMBOL(sk_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001416
Craig Gallekeb4cb002015-06-15 11:26:18 -04001417void sk_destruct(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418{
1419 struct sk_filter *filter;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001420
1421 if (sk->sk_destruct)
1422 sk->sk_destruct(sk);
1423
Paul E. McKenneya898def2010-02-22 17:04:49 -08001424 filter = rcu_dereference_check(sk->sk_filter,
1425 atomic_read(&sk->sk_wmem_alloc) == 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001426 if (filter) {
Pavel Emelyanov309dd5f2007-10-17 21:21:51 -07001427 sk_filter_uncharge(sk, filter);
Stephen Hemmingera9b3cd72011-08-01 16:19:00 +00001428 RCU_INIT_POINTER(sk->sk_filter, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001429 }
Craig Gallek538950a2016-01-04 17:41:47 -05001430 if (rcu_access_pointer(sk->sk_reuseport_cb))
1431 reuseport_detach_sock(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001432
Eric Dumazet08e29af2011-11-28 12:04:18 +00001433 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001434
1435 if (atomic_read(&sk->sk_omem_alloc))
Joe Perchese005d192012-05-16 19:58:40 +00001436 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1437 __func__, atomic_read(&sk->sk_omem_alloc));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001438
Eric W. Biederman109f6e32010-06-13 03:30:14 +00001439 if (sk->sk_peer_cred)
1440 put_cred(sk->sk_peer_cred);
1441 put_pid(sk->sk_peer_pid);
Eric W. Biederman26abe142015-05-08 21:10:31 -05001442 if (likely(sk->sk_net_refcnt))
1443 put_net(sock_net(sk));
Pavel Emelyanovc308c1b22007-11-01 00:33:50 -07001444 sk_prot_free(sk->sk_prot_creator, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001445}
Eric Dumazet2b85a342009-06-11 02:55:43 -07001446
Craig Gallekeb4cb002015-06-15 11:26:18 -04001447static void __sk_free(struct sock *sk)
1448{
Craig Gallekb9226222015-06-30 12:49:32 -04001449 if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
Craig Gallekeb4cb002015-06-15 11:26:18 -04001450 sock_diag_broadcast_destroy(sk);
1451 else
1452 sk_destruct(sk);
1453}
1454
Eric Dumazet2b85a342009-06-11 02:55:43 -07001455void sk_free(struct sock *sk)
1456{
1457 /*
Lucas De Marchi25985ed2011-03-30 22:57:33 -03001458 * We subtract one from sk_wmem_alloc and can know if
Eric Dumazet2b85a342009-06-11 02:55:43 -07001459 * some packets are still in some tx queue.
1460 * If not null, sock_wfree() will call __sk_free(sk) later
1461 */
1462 if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1463 __sk_free(sk);
1464}
Eric Dumazet2a915252009-05-27 11:30:05 +00001465EXPORT_SYMBOL(sk_free);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001466
Eric Dumazete56c57d2011-11-08 17:07:07 -05001467/**
1468 * sk_clone_lock - clone a socket, and lock its clone
1469 * @sk: the socket to clone
1470 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1471 *
1472 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1473 */
1474struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001475{
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001476 struct sock *newsk;
Alexei Starovoitov278571b2014-07-30 20:34:12 -07001477 bool is_charged = true;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001478
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001479 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001480 if (newsk != NULL) {
1481 struct sk_filter *filter;
1482
Venkat Yekkirala892c1412006-08-04 23:08:56 -07001483 sock_copy(newsk, sk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001484
1485 /* SANITY */
Sowmini Varadhan8a681732015-07-30 15:50:36 +02001486 if (likely(newsk->sk_net_refcnt))
1487 get_net(sock_net(newsk));
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001488 sk_node_init(&newsk->sk_node);
1489 sock_lock_init(newsk);
1490 bh_lock_sock(newsk);
Eric Dumazetfa438cc2007-03-04 16:05:44 -08001491 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
Zhu Yi8eae9392010-03-04 18:01:40 +00001492 newsk->sk_backlog.len = 0;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001493
1494 atomic_set(&newsk->sk_rmem_alloc, 0);
Eric Dumazet2b85a342009-06-11 02:55:43 -07001495 /*
1496 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1497 */
1498 atomic_set(&newsk->sk_wmem_alloc, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001499 atomic_set(&newsk->sk_omem_alloc, 0);
1500 skb_queue_head_init(&newsk->sk_receive_queue);
1501 skb_queue_head_init(&newsk->sk_write_queue);
1502
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001503 rwlock_init(&newsk->sk_callback_lock);
Peter Zijlstra443aef0e2007-07-19 01:49:00 -07001504 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1505 af_callback_keys + newsk->sk_family,
1506 af_family_clock_key_strings[newsk->sk_family]);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001507
1508 newsk->sk_dst_cache = NULL;
1509 newsk->sk_wmem_queued = 0;
1510 newsk->sk_forward_alloc = 0;
1511 newsk->sk_send_head = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001512 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1513
1514 sock_reset_flag(newsk, SOCK_DONE);
1515 skb_queue_head_init(&newsk->sk_error_queue);
1516
Eric Dumazet0d7da9d2010-10-25 03:47:05 +00001517 filter = rcu_dereference_protected(newsk->sk_filter, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001518 if (filter != NULL)
Alexei Starovoitov278571b2014-07-30 20:34:12 -07001519 /* though it's an empty new sock, the charging may fail
1520 * if sysctl_optmem_max was changed between creation of
1521 * original socket and cloning
1522 */
1523 is_charged = sk_filter_charge(newsk, filter);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001524
Eric Dumazetd188ba82015-12-08 07:22:02 -08001525 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001526 /* It is still raw copy of parent, so invalidate
1527 * destructor and make plain sk_free() */
1528 newsk->sk_destruct = NULL;
Thomas Gleixnerb0691c82011-10-25 02:30:50 +00001529 bh_unlock_sock(newsk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001530 sk_free(newsk);
1531 newsk = NULL;
1532 goto out;
1533 }
1534
1535 newsk->sk_err = 0;
1536 newsk->sk_priority = 0;
Eric Dumazet2c8c56e2014-11-11 05:54:28 -08001537 newsk->sk_incoming_cpu = raw_smp_processor_id();
Eric Dumazet33cf7c92015-03-11 18:53:14 -07001538 atomic64_set(&newsk->sk_cookie, 0);
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001539 /*
1540 * Before updating sk_refcnt, we must commit prior changes to memory
1541 * (Documentation/RCU/rculist_nulls.txt for details)
1542 */
1543 smp_wmb();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001544 atomic_set(&newsk->sk_refcnt, 2);
1545
1546 /*
1547 * Increment the counter in the same struct proto as the master
1548 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1549 * is the same as sk->sk_prot->socks, as this field was copied
1550 * with memcpy).
1551 *
1552 * This _changes_ the previous behaviour, where
1553 * tcp_create_openreq_child always was incrementing the
1554 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1555 * to be taken into account in all callers. -acme
1556 */
1557 sk_refcnt_debug_inc(newsk);
David S. Miller972692e2008-06-17 22:41:38 -07001558 sk_set_socket(newsk, NULL);
Eric Dumazet43815482010-04-29 11:01:49 +00001559 newsk->sk_wq = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001560
Johannes Weinerbaac50b2016-01-14 15:21:17 -08001561 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
Johannes Weiner3d596f72016-01-14 15:21:05 -08001562 sock_update_memcg(newsk);
Glauber Costaf3f511e2012-01-05 20:16:39 +00001563
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001564 if (newsk->sk_prot->sockets_allocated)
Glauber Costa180d8cd2011-12-11 21:47:02 +00001565 sk_sockets_allocated_inc(newsk);
Octavian Purdila704da5602010-01-08 00:00:09 -08001566
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +01001567 if (sock_needs_netstamp(sk) &&
1568 newsk->sk_flags & SK_FLAGS_TIMESTAMP)
Octavian Purdila704da5602010-01-08 00:00:09 -08001569 net_enable_timestamp();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001570 }
1571out:
1572 return newsk;
1573}
Eric Dumazete56c57d2011-11-08 17:07:07 -05001574EXPORT_SYMBOL_GPL(sk_clone_lock);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001575
Andi Kleen99580892007-04-20 17:12:43 -07001576void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1577{
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001578 u32 max_segs = 1;
1579
Eric Dumazet6bd4f352015-12-02 21:53:57 -08001580 sk_dst_set(sk, dst);
Andi Kleen99580892007-04-20 17:12:43 -07001581 sk->sk_route_caps = dst->dev->features;
1582 if (sk->sk_route_caps & NETIF_F_GSO)
Herbert Xu4fcd6b92007-05-31 22:15:50 -07001583 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
Eric Dumazeta4654192010-05-16 00:36:33 -07001584 sk->sk_route_caps &= ~sk->sk_route_nocaps;
Andi Kleen99580892007-04-20 17:12:43 -07001585 if (sk_can_gso(sk)) {
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001586 if (dst->header_len) {
Andi Kleen99580892007-04-20 17:12:43 -07001587 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001588 } else {
Andi Kleen99580892007-04-20 17:12:43 -07001589 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001590 sk->sk_gso_max_size = dst->dev->gso_max_size;
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001591 max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001592 }
Andi Kleen99580892007-04-20 17:12:43 -07001593 }
Eric Dumazetd6a4e262015-05-26 08:55:28 -07001594 sk->sk_gso_max_segs = max_segs;
Andi Kleen99580892007-04-20 17:12:43 -07001595}
1596EXPORT_SYMBOL_GPL(sk_setup_caps);
1597
/*
 *	Simple resource managers for sockets.
 */
1601
1602
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001603/*
1604 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001605 */
1606void sock_wfree(struct sk_buff *skb)
1607{
1608 struct sock *sk = skb->sk;
Eric Dumazetd99927f2009-09-24 10:49:24 +00001609 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001610
Eric Dumazetd99927f2009-09-24 10:49:24 +00001611 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1612 /*
1613 * Keep a reference on sk_wmem_alloc, this will be released
1614 * after sk_write_space() call
1615 */
1616 atomic_sub(len - 1, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001617 sk->sk_write_space(sk);
Eric Dumazetd99927f2009-09-24 10:49:24 +00001618 len = 1;
1619 }
Eric Dumazet2b85a342009-06-11 02:55:43 -07001620 /*
Eric Dumazetd99927f2009-09-24 10:49:24 +00001621 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1622 * could not do because of in-flight packets
Eric Dumazet2b85a342009-06-11 02:55:43 -07001623 */
Eric Dumazetd99927f2009-09-24 10:49:24 +00001624 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
Eric Dumazet2b85a342009-06-11 02:55:43 -07001625 __sk_free(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001626}
Eric Dumazet2a915252009-05-27 11:30:05 +00001627EXPORT_SYMBOL(sock_wfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001628
Eric Dumazet9e17f8a2015-11-01 15:36:55 -08001629void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1630{
1631 skb_orphan(skb);
1632 skb->sk = sk;
1633#ifdef CONFIG_INET
1634 if (unlikely(!sk_fullsock(sk))) {
1635 skb->destructor = sock_edemux;
1636 sock_hold(sk);
1637 return;
1638 }
1639#endif
1640 skb->destructor = sock_wfree;
1641 skb_set_hash_from_sk(skb, sk);
1642 /*
1643 * We used to take a refcount on sk, but following operation
1644 * is enough to guarantee sk_free() wont free this sock until
1645 * all in-flight packets are completed
1646 */
1647 atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1648}
1649EXPORT_SYMBOL(skb_set_owner_w);
1650
Eric Dumazetf2f872f2013-07-30 17:55:08 -07001651void skb_orphan_partial(struct sk_buff *skb)
1652{
1653 /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1654 * so we do not completely orphan skb, but transfert all
1655 * accounted bytes but one, to avoid unexpected reorders.
1656 */
1657 if (skb->destructor == sock_wfree
1658#ifdef CONFIG_INET
1659 || skb->destructor == tcp_wfree
1660#endif
1661 ) {
1662 atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1663 skb->truesize = 1;
1664 } else {
1665 skb_orphan(skb);
1666 }
1667}
1668EXPORT_SYMBOL(skb_orphan_partial);
1669
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001670/*
1671 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001672 */
1673void sock_rfree(struct sk_buff *skb)
1674{
1675 struct sock *sk = skb->sk;
Eric Dumazetd361fd52010-07-10 22:45:17 +00001676 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001677
Eric Dumazetd361fd52010-07-10 22:45:17 +00001678 atomic_sub(len, &sk->sk_rmem_alloc);
1679 sk_mem_uncharge(sk, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001680}
Eric Dumazet2a915252009-05-27 11:30:05 +00001681EXPORT_SYMBOL(sock_rfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001682
Oliver Hartkopp7768eed2015-03-10 19:03:46 +01001683/*
1684 * Buffer destructor for skbs that are not used directly in read or write
1685 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1686 */
Alexander Duyck62bccb82014-09-04 13:31:35 -04001687void sock_efree(struct sk_buff *skb)
1688{
1689 sock_put(skb->sk);
1690}
1691EXPORT_SYMBOL(sock_efree);
1692
Eric W. Biederman976d02012012-05-23 17:16:53 -06001693kuid_t sock_i_uid(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001694{
Eric W. Biederman976d02012012-05-23 17:16:53 -06001695 kuid_t uid;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001696
Eric Dumazetf064af12010-09-22 12:43:39 +00001697 read_lock_bh(&sk->sk_callback_lock);
Eric W. Biederman976d02012012-05-23 17:16:53 -06001698 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
Eric Dumazetf064af12010-09-22 12:43:39 +00001699 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001700 return uid;
1701}
Eric Dumazet2a915252009-05-27 11:30:05 +00001702EXPORT_SYMBOL(sock_i_uid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001703
1704unsigned long sock_i_ino(struct sock *sk)
1705{
1706 unsigned long ino;
1707
Eric Dumazetf064af12010-09-22 12:43:39 +00001708 read_lock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001709 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
Eric Dumazetf064af12010-09-22 12:43:39 +00001710 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001711 return ino;
1712}
Eric Dumazet2a915252009-05-27 11:30:05 +00001713EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001714
1715/*
1716 * Allocate a skb from the socket's send buffer.
1717 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001718struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001719 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001720{
1721 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Eric Dumazet2a915252009-05-27 11:30:05 +00001722 struct sk_buff *skb = alloc_skb(size, priority);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001723 if (skb) {
1724 skb_set_owner_w(skb, sk);
1725 return skb;
1726 }
1727 }
1728 return NULL;
1729}
Eric Dumazet2a915252009-05-27 11:30:05 +00001730EXPORT_SYMBOL(sock_wmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731
1732/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001733 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001734 */
Al Virodd0fc662005-10-07 07:46:04 +01001735void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001736{
Eric Dumazet95c96172012-04-15 05:58:06 +00001737 if ((unsigned int)size <= sysctl_optmem_max &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001738 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1739 void *mem;
1740 /* First do the add, to avoid the race if kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001741 * might sleep.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001742 */
1743 atomic_add(size, &sk->sk_omem_alloc);
1744 mem = kmalloc(size, priority);
1745 if (mem)
1746 return mem;
1747 atomic_sub(size, &sk->sk_omem_alloc);
1748 }
1749 return NULL;
1750}
Eric Dumazet2a915252009-05-27 11:30:05 +00001751EXPORT_SYMBOL(sock_kmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001752
Daniel Borkmann79e88652014-11-19 17:13:11 +01001753/* Free an option memory block. Note, we actually want the inline
1754 * here as this allows gcc to detect the nullify and fold away the
1755 * condition entirely.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756 */
Daniel Borkmann79e88652014-11-19 17:13:11 +01001757static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1758 const bool nullify)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001759{
David S. Millere53da5f2014-10-14 17:02:37 -04001760 if (WARN_ON_ONCE(!mem))
1761 return;
Daniel Borkmann79e88652014-11-19 17:13:11 +01001762 if (nullify)
1763 kzfree(mem);
1764 else
1765 kfree(mem);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001766 atomic_sub(size, &sk->sk_omem_alloc);
1767}
Daniel Borkmann79e88652014-11-19 17:13:11 +01001768
1769void sock_kfree_s(struct sock *sk, void *mem, int size)
1770{
1771 __sock_kfree_s(sk, mem, size, false);
1772}
Eric Dumazet2a915252009-05-27 11:30:05 +00001773EXPORT_SYMBOL(sock_kfree_s);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001774
Daniel Borkmann79e88652014-11-19 17:13:11 +01001775void sock_kzfree_s(struct sock *sk, void *mem, int size)
1776{
1777 __sock_kfree_s(sk, mem, size, true);
1778}
1779EXPORT_SYMBOL(sock_kzfree_s);
1780
Linus Torvalds1da177e2005-04-16 15:20:36 -07001781/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1782 I think, these locks should be removed for datagram sockets.
1783 */
Eric Dumazet2a915252009-05-27 11:30:05 +00001784static long sock_wait_for_wmem(struct sock *sk, long timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001785{
1786 DEFINE_WAIT(wait);
1787
Eric Dumazet9cd3e072015-11-29 20:03:10 -08001788 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001789 for (;;) {
1790 if (!timeo)
1791 break;
1792 if (signal_pending(current))
1793 break;
1794 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00001795 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001796 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1797 break;
1798 if (sk->sk_shutdown & SEND_SHUTDOWN)
1799 break;
1800 if (sk->sk_err)
1801 break;
1802 timeo = schedule_timeout(timeo);
1803 }
Eric Dumazetaa395142010-04-20 13:03:51 +00001804 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001805 return timeo;
1806}
1807
1808
/*
 *	Generic send/receive buffer handlers
 */
1812
Herbert Xu4cc7f682009-02-04 16:55:54 -08001813struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1814 unsigned long data_len, int noblock,
Eric Dumazet28d64272013-08-08 14:38:47 -07001815 int *errcode, int max_page_order)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816{
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001817 struct sk_buff *skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818 long timeo;
1819 int err;
1820
Linus Torvalds1da177e2005-04-16 15:20:36 -07001821 timeo = sock_sndtimeo(sk, noblock);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001822 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001823 err = sock_error(sk);
1824 if (err != 0)
1825 goto failure;
1826
1827 err = -EPIPE;
1828 if (sk->sk_shutdown & SEND_SHUTDOWN)
1829 goto failure;
1830
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001831 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1832 break;
Eric Dumazet28d64272013-08-08 14:38:47 -07001833
Eric Dumazet9cd3e072015-11-29 20:03:10 -08001834 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001835 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1836 err = -EAGAIN;
1837 if (!timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001838 goto failure;
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001839 if (signal_pending(current))
1840 goto interrupted;
1841 timeo = sock_wait_for_wmem(sk, timeo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001842 }
Eric Dumazet2e4e4412014-09-17 04:49:49 -07001843 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1844 errcode, sk->sk_allocation);
1845 if (skb)
1846 skb_set_owner_w(skb, sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001847 return skb;
1848
1849interrupted:
1850 err = sock_intr_errno(timeo);
1851failure:
1852 *errcode = err;
1853 return NULL;
1854}
Herbert Xu4cc7f682009-02-04 16:55:54 -08001855EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001856
/*
 * Linear-only variant of sock_alloc_send_pskb(): @size bytes of header
 * space, no paged data and a maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001863
Edward Jeef28ea362015-10-08 14:56:48 -07001864int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1865 struct sockcm_cookie *sockc)
1866{
1867 struct cmsghdr *cmsg;
1868
1869 for_each_cmsghdr(cmsg, msg) {
1870 if (!CMSG_OK(msg, cmsg))
1871 return -EINVAL;
1872 if (cmsg->cmsg_level != SOL_SOCKET)
1873 continue;
1874 switch (cmsg->cmsg_type) {
1875 case SO_MARK:
1876 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1877 return -EPERM;
1878 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1879 return -EINVAL;
1880 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1881 break;
1882 default:
1883 return -EINVAL;
1884 }
1885 }
1886 return 0;
1887}
1888EXPORT_SYMBOL(sock_cmsg_send);
1889
Eric Dumazet5640f762012-09-23 23:04:42 +00001890/* On 32bit arches, an skb frag is limited to 2^15 */
1891#define SKB_FRAG_PAGE_ORDER get_order(32768)
1892
Eric Dumazet400dfd32013-10-17 16:27:07 -07001893/**
1894 * skb_page_frag_refill - check that a page_frag contains enough room
1895 * @sz: minimum size of the fragment we want to get
1896 * @pfrag: pointer to page_frag
Eric Dumazet82d5e2b2014-09-08 04:00:00 -07001897 * @gfp: priority for memory allocation
Eric Dumazet400dfd32013-10-17 16:27:07 -07001898 *
1899 * Note: While this allocator tries to use high order pages, there is
1900 * no guarantee that allocations succeed. Therefore, @sz MUST be
1901 * less or equal than PAGE_SIZE.
1902 */
Eric Dumazetd9b29382014-08-27 20:49:34 -07001903bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
Eric Dumazet5640f762012-09-23 23:04:42 +00001904{
Eric Dumazet5640f762012-09-23 23:04:42 +00001905 if (pfrag->page) {
Joonsoo Kimfe896d12016-03-17 14:19:26 -07001906 if (page_ref_count(pfrag->page) == 1) {
Eric Dumazet5640f762012-09-23 23:04:42 +00001907 pfrag->offset = 0;
1908 return true;
1909 }
Eric Dumazet400dfd32013-10-17 16:27:07 -07001910 if (pfrag->offset + sz <= pfrag->size)
Eric Dumazet5640f762012-09-23 23:04:42 +00001911 return true;
1912 put_page(pfrag->page);
1913 }
1914
Eric Dumazetd9b29382014-08-27 20:49:34 -07001915 pfrag->offset = 0;
1916 if (SKB_FRAG_PAGE_ORDER) {
Mel Gormand0164ad2015-11-06 16:28:21 -08001917 /* Avoid direct reclaim but allow kswapd to wake */
1918 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1919 __GFP_COMP | __GFP_NOWARN |
1920 __GFP_NORETRY,
Eric Dumazetd9b29382014-08-27 20:49:34 -07001921 SKB_FRAG_PAGE_ORDER);
Eric Dumazet5640f762012-09-23 23:04:42 +00001922 if (likely(pfrag->page)) {
Eric Dumazetd9b29382014-08-27 20:49:34 -07001923 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
Eric Dumazet5640f762012-09-23 23:04:42 +00001924 return true;
1925 }
Eric Dumazetd9b29382014-08-27 20:49:34 -07001926 }
1927 pfrag->page = alloc_page(gfp);
1928 if (likely(pfrag->page)) {
1929 pfrag->size = PAGE_SIZE;
1930 return true;
1931 }
Eric Dumazet400dfd32013-10-17 16:27:07 -07001932 return false;
1933}
1934EXPORT_SYMBOL(skb_page_frag_refill);
1935
1936bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1937{
1938 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1939 return true;
1940
Eric Dumazet5640f762012-09-23 23:04:42 +00001941 sk_enter_memory_pressure(sk);
1942 sk_stream_moderate_sndbuf(sk);
1943 return false;
1944}
1945EXPORT_SYMBOL(sk_page_frag_refill);
1946
Linus Torvalds1da177e2005-04-16 15:20:36 -07001947static void __lock_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001948 __releases(&sk->sk_lock.slock)
1949 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001950{
1951 DEFINE_WAIT(wait);
1952
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001953 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001954 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1955 TASK_UNINTERRUPTIBLE);
1956 spin_unlock_bh(&sk->sk_lock.slock);
1957 schedule();
1958 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001959 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001960 break;
1961 }
1962 finish_wait(&sk->sk_lock.wq, &wait);
1963}
1964
1965static void __release_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001966 __releases(&sk->sk_lock.slock)
1967 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001968{
1969 struct sk_buff *skb = sk->sk_backlog.head;
1970
1971 do {
1972 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1973 bh_unlock_sock(sk);
1974
1975 do {
1976 struct sk_buff *next = skb->next;
1977
Eric Dumazete4cbb022012-04-30 16:07:09 +00001978 prefetch(next);
Eric Dumazet7fee2262010-05-11 23:19:48 +00001979 WARN_ON_ONCE(skb_dst_is_noref(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001980 skb->next = NULL;
Peter Zijlstrac57943a2008-10-07 14:18:42 -07001981 sk_backlog_rcv(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001982
1983 /*
1984 * We are in process context here with softirqs
1985 * disabled, use cond_resched_softirq() to preempt.
1986 * This is safe to do because we've taken the backlog
1987 * queue private:
1988 */
1989 cond_resched_softirq();
1990
1991 skb = next;
1992 } while (skb != NULL);
1993
1994 bh_lock_sock(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001995 } while ((skb = sk->sk_backlog.head) != NULL);
Zhu Yi8eae9392010-03-04 18:01:40 +00001996
1997 /*
1998 * Doing the zeroing here guarantee we can not loop forever
1999 * while a wild producer attempts to flood us.
2000 */
2001 sk->sk_backlog.len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002002}
2003
2004/**
2005 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07002006 * @sk: sock to wait on
2007 * @timeo: for how long
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002008 * @skb: last skb seen on sk_receive_queue
Linus Torvalds1da177e2005-04-16 15:20:36 -07002009 *
2010 * Now socket state including sk->sk_err is changed only under lock,
2011 * hence we may omit checks after joining wait queue.
2012 * We check receive queue before schedule() only as optimization;
2013 * it is very likely that release_sock() added new data.
2014 */
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002015int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002016{
2017 int rc;
2018 DEFINE_WAIT(wait);
2019
Eric Dumazetaa395142010-04-20 13:03:51 +00002020 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002021 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
Sabrina Dubrocadfbafc92015-07-24 18:19:25 +02002022 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
Eric Dumazet9cd3e072015-11-29 20:03:10 -08002023 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
Eric Dumazetaa395142010-04-20 13:03:51 +00002024 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002025 return rc;
2026}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002027EXPORT_SYMBOL(sk_wait_data);
2028
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002029/**
2030 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2031 * @sk: socket
2032 * @size: memory size to allocate
2033 * @kind: allocation type
2034 *
2035 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2036 * rmem allocation. This function assumes that protocols which have
2037 * memory_pressure use sk_wmem_queued as write buffer accounting.
2038 */
2039int __sk_mem_schedule(struct sock *sk, int size, int kind)
2040{
2041 struct proto *prot = sk->sk_prot;
2042 int amt = sk_mem_pages(size);
Eric Dumazet8d987e52010-11-09 23:24:26 +00002043 long allocated;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002044
2045 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002046
Johannes Weinere8056052016-01-14 15:21:14 -08002047 allocated = sk_memory_allocated_add(sk, amt);
2048
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002049 if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2050 !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
Johannes Weinere8056052016-01-14 15:21:14 -08002051 goto suppress_allocation;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002052
2053 /* Under limit. */
Johannes Weinere8056052016-01-14 15:21:14 -08002054 if (allocated <= sk_prot_mem_limits(sk, 0)) {
Glauber Costa180d8cd2011-12-11 21:47:02 +00002055 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002056 return 1;
2057 }
2058
Johannes Weinere8056052016-01-14 15:21:14 -08002059 /* Under pressure. */
2060 if (allocated > sk_prot_mem_limits(sk, 1))
Glauber Costa180d8cd2011-12-11 21:47:02 +00002061 sk_enter_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002062
Johannes Weinere8056052016-01-14 15:21:14 -08002063 /* Over hard limit. */
2064 if (allocated > sk_prot_mem_limits(sk, 2))
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002065 goto suppress_allocation;
2066
2067 /* guarantee minimum buffer size under pressure */
2068 if (kind == SK_MEM_RECV) {
2069 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2070 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002071
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002072 } else { /* SK_MEM_SEND */
2073 if (sk->sk_type == SOCK_STREAM) {
2074 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2075 return 1;
2076 } else if (atomic_read(&sk->sk_wmem_alloc) <
2077 prot->sysctl_wmem[0])
2078 return 1;
2079 }
2080
Glauber Costa180d8cd2011-12-11 21:47:02 +00002081 if (sk_has_memory_pressure(sk)) {
Eric Dumazet17483762008-11-25 21:16:35 -08002082 int alloc;
2083
Glauber Costa180d8cd2011-12-11 21:47:02 +00002084 if (!sk_under_memory_pressure(sk))
Eric Dumazet17483762008-11-25 21:16:35 -08002085 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002086 alloc = sk_sockets_allocated_read_positive(sk);
2087 if (sk_prot_mem_limits(sk, 2) > alloc *
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002088 sk_mem_pages(sk->sk_wmem_queued +
2089 atomic_read(&sk->sk_rmem_alloc) +
2090 sk->sk_forward_alloc))
2091 return 1;
2092 }
2093
2094suppress_allocation:
2095
2096 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2097 sk_stream_moderate_sndbuf(sk);
2098
2099 /* Fail only if socket is _under_ its sndbuf.
2100 * In this case we cannot block, so that we have to fail.
2101 */
2102 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2103 return 1;
2104 }
2105
Satoru Moriya3847ce32011-06-17 12:00:03 +00002106 trace_sock_exceed_buf_limit(sk, prot, allocated);
2107
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002108 /* Alas. Undo changes. */
2109 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002110
Glauber Costa0e90b312012-01-20 04:57:16 +00002111 sk_memory_allocated_sub(sk, amt);
Glauber Costa180d8cd2011-12-11 21:47:02 +00002112
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002113 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2114 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
Johannes Weinere8056052016-01-14 15:21:14 -08002115
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002116 return 0;
2117}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002118EXPORT_SYMBOL(__sk_mem_schedule);
2119
2120/**
Jean Sacren69dba9b2015-08-27 18:05:49 -06002121 * __sk_mem_reclaim - reclaim memory_allocated
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002122 * @sk: socket
Eric Dumazet1a24e042015-05-15 12:39:25 -07002123 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002124 */
Eric Dumazet1a24e042015-05-15 12:39:25 -07002125void __sk_mem_reclaim(struct sock *sk, int amount)
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002126{
Eric Dumazet1a24e042015-05-15 12:39:25 -07002127 amount >>= SK_MEM_QUANTUM_SHIFT;
2128 sk_memory_allocated_sub(sk, amount);
2129 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002130
Johannes Weinerbaac50b2016-01-14 15:21:17 -08002131 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2132 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
Johannes Weinere8056052016-01-14 15:21:14 -08002133
Glauber Costa180d8cd2011-12-11 21:47:02 +00002134 if (sk_under_memory_pressure(sk) &&
2135 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2136 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002137}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08002138EXPORT_SYMBOL(__sk_mem_reclaim);
2139
2140
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */
2147
2148int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2149{
2150 return -EOPNOTSUPP;
2151}
Eric Dumazet2a915252009-05-27 11:30:05 +00002152EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002153
/* Default connect() for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	const int err = -EOPNOTSUPP;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002159EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160
/* Default socketpair() for protocols without support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	const int err = -EOPNOTSUPP;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002165EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002166
/* Default accept() for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	const int err = -EOPNOTSUPP;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002171EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002172
/* Default getname() for protocols without support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	const int err = -EOPNOTSUPP;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002178EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002179
Eric Dumazet2a915252009-05-27 11:30:05 +00002180unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002181{
2182 return 0;
2183}
Eric Dumazet2a915252009-05-27 11:30:05 +00002184EXPORT_SYMBOL(sock_no_poll);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002185
/* Default ioctl() for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	const int err = -EOPNOTSUPP;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002190EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002191
/* Default listen() for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	const int err = -EOPNOTSUPP;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002196EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002197
/* Default shutdown() for protocols without support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	const int err = -EOPNOTSUPP;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002202EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002203
2204int sock_no_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002205 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002206{
2207 return -EOPNOTSUPP;
2208}
Eric Dumazet2a915252009-05-27 11:30:05 +00002209EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210
2211int sock_no_getsockopt(struct socket *sock, int level, int optname,
2212 char __user *optval, int __user *optlen)
2213{
2214 return -EOPNOTSUPP;
2215}
Eric Dumazet2a915252009-05-27 11:30:05 +00002216EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002217
/* Default sendmsg() for protocols without support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	const int err = -EOPNOTSUPP;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002222EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002223
/* Default recvmsg() for protocols without support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	const int err = -EOPNOTSUPP;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002229EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002230
/*
 * Default mmap() for protocols without mmap support.  Returns -ENODEV,
 * the error code used when an mmap method is missing.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	const int err = -ENODEV;

	return err;
}
Eric Dumazet2a915252009-05-27 11:30:05 +00002236EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002237
2238ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2239{
2240 ssize_t res;
2241 struct msghdr msg = {.msg_flags = flags};
2242 struct kvec iov;
2243 char *kaddr = kmap(page);
2244 iov.iov_base = kaddr + offset;
2245 iov.iov_len = size;
2246 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2247 kunmap(page);
2248 return res;
2249}
Eric Dumazet2a915252009-05-27 11:30:05 +00002250EXPORT_SYMBOL(sock_no_sendpage);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002251
2252/*
2253 * Default Socket Callbacks
2254 */
2255
2256static void sock_def_wakeup(struct sock *sk)
2257{
Eric Dumazet43815482010-04-29 11:01:49 +00002258 struct socket_wq *wq;
2259
2260 rcu_read_lock();
2261 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002262 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002263 wake_up_interruptible_all(&wq->wait);
2264 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002265}
2266
2267static void sock_def_error_report(struct sock *sk)
2268{
Eric Dumazet43815482010-04-29 11:01:49 +00002269 struct socket_wq *wq;
2270
2271 rcu_read_lock();
2272 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002273 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002274 wake_up_interruptible_poll(&wq->wait, POLLERR);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002275 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Eric Dumazet43815482010-04-29 11:01:49 +00002276 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002277}
2278
David S. Miller676d2362014-04-11 16:15:36 -04002279static void sock_def_readable(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002280{
Eric Dumazet43815482010-04-29 11:01:49 +00002281 struct socket_wq *wq;
2282
2283 rcu_read_lock();
2284 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002285 if (skwq_has_sleeper(wq))
Eric Dumazet2c6607c2011-01-06 10:54:29 -08002286 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
Davide Libenzi37e55402009-03-31 15:24:21 -07002287 POLLRDNORM | POLLRDBAND);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002288 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Eric Dumazet43815482010-04-29 11:01:49 +00002289 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002290}
2291
2292static void sock_def_write_space(struct sock *sk)
2293{
Eric Dumazet43815482010-04-29 11:01:49 +00002294 struct socket_wq *wq;
2295
2296 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002297
2298 /* Do not wake up a writer until he can make "significant"
2299 * progress. --DaveM
2300 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002301 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Eric Dumazet43815482010-04-29 11:01:49 +00002302 wq = rcu_dereference(sk->sk_wq);
Herbert Xu1ce0bf52015-11-26 13:55:39 +08002303 if (skwq_has_sleeper(wq))
Eric Dumazet43815482010-04-29 11:01:49 +00002304 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
Davide Libenzi37e55402009-03-31 15:24:21 -07002305 POLLWRNORM | POLLWRBAND);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002306
2307 /* Should agree with poll, otherwise some programs break */
2308 if (sock_writeable(sk))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002309 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002310 }
2311
Eric Dumazet43815482010-04-29 11:01:49 +00002312 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002313}
2314
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
	return;
}
2318
2319void sk_send_sigurg(struct sock *sk)
2320{
2321 if (sk->sk_socket && sk->sk_socket->file)
2322 if (send_sigurg(&sk->sk_socket->file->f_owner))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002323 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002324}
Eric Dumazet2a915252009-05-27 11:30:05 +00002325EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002326
/*
 * (Re)arm @timer to fire at @expires.  A reference on @sk is taken only
 * when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002333EXPORT_SYMBOL(sk_reset_timer);
2334
/*
 * Cancel @timer.  The reference on @sk is dropped with __sock_put() only
 * when del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340EXPORT_SYMBOL(sk_stop_timer);
2341
2342void sock_init_data(struct socket *sock, struct sock *sk)
2343{
2344 skb_queue_head_init(&sk->sk_receive_queue);
2345 skb_queue_head_init(&sk->sk_write_queue);
2346 skb_queue_head_init(&sk->sk_error_queue);
2347
2348 sk->sk_send_head = NULL;
2349
2350 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002351
Linus Torvalds1da177e2005-04-16 15:20:36 -07002352 sk->sk_allocation = GFP_KERNEL;
2353 sk->sk_rcvbuf = sysctl_rmem_default;
2354 sk->sk_sndbuf = sysctl_wmem_default;
2355 sk->sk_state = TCP_CLOSE;
David S. Miller972692e2008-06-17 22:41:38 -07002356 sk_set_socket(sk, sock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002357
2358 sock_set_flag(sk, SOCK_ZAPPED);
2359
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002360 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002361 sk->sk_type = sock->type;
Eric Dumazet43815482010-04-29 11:01:49 +00002362 sk->sk_wq = sock->wq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002363 sock->sk = sk;
2364 } else
Eric Dumazet43815482010-04-29 11:01:49 +00002365 sk->sk_wq = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002366
Linus Torvalds1da177e2005-04-16 15:20:36 -07002367 rwlock_init(&sk->sk_callback_lock);
Peter Zijlstra443aef0e2007-07-19 01:49:00 -07002368 lockdep_set_class_and_name(&sk->sk_callback_lock,
2369 af_callback_keys + sk->sk_family,
2370 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002371
2372 sk->sk_state_change = sock_def_wakeup;
2373 sk->sk_data_ready = sock_def_readable;
2374 sk->sk_write_space = sock_def_write_space;
2375 sk->sk_error_report = sock_def_error_report;
2376 sk->sk_destruct = sock_def_destruct;
2377
Eric Dumazet5640f762012-09-23 23:04:42 +00002378 sk->sk_frag.page = NULL;
2379 sk->sk_frag.offset = 0;
Pavel Emelyanovef64a542012-02-21 07:31:34 +00002380 sk->sk_peek_off = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002381
Eric W. Biederman109f6e32010-06-13 03:30:14 +00002382 sk->sk_peer_pid = NULL;
2383 sk->sk_peer_cred = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002384 sk->sk_write_pending = 0;
2385 sk->sk_rcvlowat = 1;
2386 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2387 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2388
Eric Dumazetf37f0af2008-04-13 21:39:26 -07002389 sk->sk_stamp = ktime_set(-1L, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390
Cong Wange0d10952013-08-01 11:10:25 +08002391#ifdef CONFIG_NET_RX_BUSY_POLL
Eliezer Tamir06021292013-06-10 11:39:50 +03002392 sk->sk_napi_id = 0;
Eliezer Tamir64b0dc52013-07-10 17:13:36 +03002393 sk->sk_ll_usec = sysctl_net_busy_read;
Eliezer Tamir06021292013-06-10 11:39:50 +03002394#endif
2395
Eric Dumazet62748f32013-09-24 08:20:52 -07002396 sk->sk_max_pacing_rate = ~0U;
Eric Dumazet7eec4172013-10-08 15:16:00 -07002397 sk->sk_pacing_rate = ~0U;
Eric Dumazet70da2682015-10-08 19:33:21 -07002398 sk->sk_incoming_cpu = -1;
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00002399 /*
2400 * Before updating sk_refcnt, we must commit prior changes to memory
2401 * (Documentation/RCU/rculist_nulls.txt for details)
2402 */
2403 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002404 atomic_set(&sk->sk_refcnt, 1);
Wang Chen33c732c2007-11-13 20:30:01 -08002405 atomic_set(&sk->sk_drops, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002406}
Eric Dumazet2a915252009-05-27 11:30:05 +00002407EXPORT_SYMBOL(sock_init_data);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002408
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002409void lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002410{
2411 might_sleep();
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002412 spin_lock_bh(&sk->sk_lock.slock);
John Heffnerd2e91172007-09-12 10:44:19 +02002413 if (sk->sk_lock.owned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002414 __lock_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02002415 sk->sk_lock.owned = 1;
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002416 spin_unlock(&sk->sk_lock.slock);
2417 /*
2418 * The sk_lock has mutex_lock() semantics here:
2419 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002420 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002421 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002422}
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002423EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002424
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002425void release_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002426{
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002427 /*
2428 * The sk_lock has mutex_unlock() semantics:
2429 */
2430 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2431
2432 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002433 if (sk->sk_backlog.tail)
2434 __release_sock(sk);
Eric Dumazet46d3cea2012-07-11 05:50:31 +00002435
Eric Dumazetc3f9b012014-03-10 09:50:11 -07002436 /* Warning : release_cb() might need to release sk ownership,
2437 * ie call sock_release_ownership(sk) before us.
2438 */
Eric Dumazet46d3cea2012-07-11 05:50:31 +00002439 if (sk->sk_prot->release_cb)
2440 sk->sk_prot->release_cb(sk);
2441
Eric Dumazetc3f9b012014-03-10 09:50:11 -07002442 sock_release_ownership(sk);
Ingo Molnara5b5bb92006-07-03 00:25:35 -07002443 if (waitqueue_active(&sk->sk_lock.wq))
2444 wake_up(&sk->sk_lock.wq);
2445 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002446}
2447EXPORT_SYMBOL(release_sock);
2448
Eric Dumazet8a74ad62010-05-26 19:20:18 +00002449/**
2450 * lock_sock_fast - fast version of lock_sock
2451 * @sk: socket
2452 *
2453 * This version should be used for very small section, where process wont block
2454 * return false if fast path is taken
2455 * sk_lock.slock locked, owned = 0, BH disabled
2456 * return true if slow path is taken
2457 * sk_lock.slock unlocked, owned = 1, BH enabled
2458 */
2459bool lock_sock_fast(struct sock *sk)
2460{
2461 might_sleep();
2462 spin_lock_bh(&sk->sk_lock.slock);
2463
2464 if (!sk->sk_lock.owned)
2465 /*
2466 * Note : We must disable BH
2467 */
2468 return false;
2469
2470 __lock_sock(sk);
2471 sk->sk_lock.owned = 1;
2472 spin_unlock(&sk->sk_lock.slock);
2473 /*
2474 * The sk_lock has mutex_lock() semantics here:
2475 */
2476 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2477 local_bh_enable();
2478 return true;
2479}
2480EXPORT_SYMBOL(lock_sock_fast);
2481
Linus Torvalds1da177e2005-04-16 15:20:36 -07002482int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002483{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002484 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002485 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002486 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002487 tv = ktime_to_timeval(sk->sk_stamp);
2488 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002489 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002490 if (tv.tv_sec == 0) {
2491 sk->sk_stamp = ktime_get_real();
2492 tv = ktime_to_timeval(sk->sk_stamp);
2493 }
2494 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002495}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002496EXPORT_SYMBOL(sock_get_timestamp);
2497
Eric Dumazetae40eb12007-03-18 17:33:16 -07002498int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2499{
2500 struct timespec ts;
2501 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002502 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetae40eb12007-03-18 17:33:16 -07002503 ts = ktime_to_timespec(sk->sk_stamp);
2504 if (ts.tv_sec == -1)
2505 return -ENOENT;
2506 if (ts.tv_sec == 0) {
2507 sk->sk_stamp = ktime_get_real();
2508 ts = ktime_to_timespec(sk->sk_stamp);
2509 }
2510 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2511}
2512EXPORT_SYMBOL(sock_get_timestampns);
2513
Patrick Ohly20d49472009-02-12 05:03:38 +00002514void sock_enable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002515{
Patrick Ohly20d49472009-02-12 05:03:38 +00002516 if (!sock_flag(sk, flag)) {
Eric Dumazet08e29af2011-11-28 12:04:18 +00002517 unsigned long previous_flags = sk->sk_flags;
2518
Patrick Ohly20d49472009-02-12 05:03:38 +00002519 sock_set_flag(sk, flag);
2520 /*
2521 * we just set one of the two flags which require net
2522 * time stamping, but time stamping might have been on
2523 * already because of the other one
2524 */
Hannes Frederic Sowa080a2702015-10-26 13:51:37 +01002525 if (sock_needs_netstamp(sk) &&
2526 !(previous_flags & SK_FLAGS_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002527 net_enable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002528 }
2529}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002530
Richard Cochrancb820f82013-07-19 19:40:09 +02002531int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2532 int level, int type)
2533{
2534 struct sock_exterr_skb *serr;
Willem de Bruijn364a9e92014-08-31 21:30:27 -04002535 struct sk_buff *skb;
Richard Cochrancb820f82013-07-19 19:40:09 +02002536 int copied, err;
2537
2538 err = -EAGAIN;
Willem de Bruijn364a9e92014-08-31 21:30:27 -04002539 skb = sock_dequeue_err_skb(sk);
Richard Cochrancb820f82013-07-19 19:40:09 +02002540 if (skb == NULL)
2541 goto out;
2542
2543 copied = skb->len;
2544 if (copied > len) {
2545 msg->msg_flags |= MSG_TRUNC;
2546 copied = len;
2547 }
David S. Miller51f3d022014-11-05 16:46:40 -05002548 err = skb_copy_datagram_msg(skb, 0, msg, copied);
Richard Cochrancb820f82013-07-19 19:40:09 +02002549 if (err)
2550 goto out_free_skb;
2551
2552 sock_recv_timestamp(msg, sk, skb);
2553
2554 serr = SKB_EXT_ERR(skb);
2555 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2556
2557 msg->msg_flags |= MSG_ERRQUEUE;
2558 err = copied;
2559
Richard Cochrancb820f82013-07-19 19:40:09 +02002560out_free_skb:
2561 kfree_skb(skb);
2562out:
2563 return err;
2564}
2565EXPORT_SYMBOL(sock_recv_errqueue);
2566
Linus Torvalds1da177e2005-04-16 15:20:36 -07002567/*
2568 * Get a socket option on an socket.
2569 *
2570 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2571 * asynchronous errors should be reported by getsockopt. We assume
2572 * this means if you specify SO_ERROR (otherwise whats the point of it).
2573 */
2574int sock_common_getsockopt(struct socket *sock, int level, int optname,
2575 char __user *optval, int __user *optlen)
2576{
2577 struct sock *sk = sock->sk;
2578
2579 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2580}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002581EXPORT_SYMBOL(sock_common_getsockopt);
2582
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002583#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002584int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2585 char __user *optval, int __user *optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002586{
2587 struct sock *sk = sock->sk;
2588
Johannes Berg1e51f952007-03-06 13:44:06 -08002589 if (sk->sk_prot->compat_getsockopt != NULL)
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002590 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2591 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002592 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2593}
2594EXPORT_SYMBOL(compat_sock_common_getsockopt);
2595#endif
2596
Ying Xue1b784142015-03-02 15:37:48 +08002597int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2598 int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002599{
2600 struct sock *sk = sock->sk;
2601 int addr_len = 0;
2602 int err;
2603
Ying Xue1b784142015-03-02 15:37:48 +08002604 err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002605 flags & ~MSG_DONTWAIT, &addr_len);
2606 if (err >= 0)
2607 msg->msg_namelen = addr_len;
2608 return err;
2609}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002610EXPORT_SYMBOL(sock_common_recvmsg);
2611
2612/*
2613 * Set socket options on an inet socket.
2614 */
2615int sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002616 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002617{
2618 struct sock *sk = sock->sk;
2619
2620 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2621}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002622EXPORT_SYMBOL(sock_common_setsockopt);
2623
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002624#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002625int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002626 char __user *optval, unsigned int optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002627{
2628 struct sock *sk = sock->sk;
2629
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002630 if (sk->sk_prot->compat_setsockopt != NULL)
2631 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2632 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002633 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2634}
2635EXPORT_SYMBOL(compat_sock_common_setsockopt);
2636#endif
2637
Linus Torvalds1da177e2005-04-16 15:20:36 -07002638void sk_common_release(struct sock *sk)
2639{
2640 if (sk->sk_prot->destroy)
2641 sk->sk_prot->destroy(sk);
2642
2643 /*
2644 * Observation: when sock_common_release is called, processes have
2645 * no access to socket. But net still has.
2646 * Step one, detach it from networking:
2647 *
2648 * A. Remove from hash tables.
2649 */
2650
2651 sk->sk_prot->unhash(sk);
2652
2653 /*
2654 * In this point socket cannot receive new packets, but it is possible
2655 * that some packets are in flight because some CPU runs receiver and
2656 * did hash table lookup before we unhashed socket. They will achieve
2657 * receive queue and will be purged by socket destructor.
2658 *
2659 * Also we still have packets pending on receive queue and probably,
2660 * our own packets waiting in device queues. sock_destroy will drain
2661 * receive queue, but transmitted packets will delay socket destruction
2662 * until the last reference will be released.
2663 */
2664
2665 sock_orphan(sk);
2666
2667 xfrm_sk_free_policy(sk);
2668
Arnaldo Carvalho de Meloe6848972005-08-09 19:45:38 -07002669 sk_refcnt_debug_release(sk);
Eric Dumazet5640f762012-09-23 23:04:42 +00002670
2671 if (sk->sk_frag.page) {
2672 put_page(sk->sk_frag.page);
2673 sk->sk_frag.page = NULL;
2674 }
2675
Linus Torvalds1da177e2005-04-16 15:20:36 -07002676 sock_put(sk);
2677}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002678EXPORT_SYMBOL(sk_common_release);
2679
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002680#ifdef CONFIG_PROC_FS
2681#define PROTO_INUSE_NR 64 /* should be enough for the first time */
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002682struct prot_inuse {
2683 int val[PROTO_INUSE_NR];
2684};
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002685
2686static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002687
2688#ifdef CONFIG_NET_NS
2689void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2690{
Eric Dumazetd6d9ca02010-07-19 10:48:49 +00002691 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002692}
2693EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2694
2695int sock_prot_inuse_get(struct net *net, struct proto *prot)
2696{
2697 int cpu, idx = prot->inuse_idx;
2698 int res = 0;
2699
2700 for_each_possible_cpu(cpu)
2701 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2702
2703 return res >= 0 ? res : 0;
2704}
2705EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2706
Alexey Dobriyan2c8c1e72010-01-17 03:35:32 +00002707static int __net_init sock_inuse_init_net(struct net *net)
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002708{
2709 net->core.inuse = alloc_percpu(struct prot_inuse);
2710 return net->core.inuse ? 0 : -ENOMEM;
2711}
2712
Alexey Dobriyan2c8c1e72010-01-17 03:35:32 +00002713static void __net_exit sock_inuse_exit_net(struct net *net)
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002714{
2715 free_percpu(net->core.inuse);
2716}
2717
2718static struct pernet_operations net_inuse_ops = {
2719 .init = sock_inuse_init_net,
2720 .exit = sock_inuse_exit_net,
2721};
2722
2723static __init int net_inuse_init(void)
2724{
2725 if (register_pernet_subsys(&net_inuse_ops))
2726 panic("Cannot initialize net inuse counters");
2727
2728 return 0;
2729}
2730
2731core_initcall(net_inuse_init);
2732#else
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002733static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2734
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002735void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002736{
Eric Dumazetd6d9ca02010-07-19 10:48:49 +00002737 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002738}
2739EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2740
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002741int sock_prot_inuse_get(struct net *net, struct proto *prot)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002742{
2743 int cpu, idx = prot->inuse_idx;
2744 int res = 0;
2745
2746 for_each_possible_cpu(cpu)
2747 res += per_cpu(prot_inuse, cpu).val[idx];
2748
2749 return res >= 0 ? res : 0;
2750}
2751EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002752#endif
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002753
2754static void assign_proto_idx(struct proto *prot)
2755{
2756 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2757
2758 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
Joe Perchese005d192012-05-16 19:58:40 +00002759 pr_err("PROTO_INUSE_NR exhausted\n");
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002760 return;
2761 }
2762
2763 set_bit(prot->inuse_idx, proto_inuse_idx);
2764}
2765
2766static void release_proto_idx(struct proto *prot)
2767{
2768 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2769 clear_bit(prot->inuse_idx, proto_inuse_idx);
2770}
2771#else
/* No in-use accounting without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
	return;
}
2775
/* No in-use accounting without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
	return;
}
2779#endif
2780
Eric Dumazet0159dfd2015-03-12 16:44:07 -07002781static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2782{
2783 if (!rsk_prot)
2784 return;
2785 kfree(rsk_prot->slab_name);
2786 rsk_prot->slab_name = NULL;
Julia Lawalladf78ed2015-09-13 14:15:18 +02002787 kmem_cache_destroy(rsk_prot->slab);
2788 rsk_prot->slab = NULL;
Eric Dumazet0159dfd2015-03-12 16:44:07 -07002789}
2790
2791static int req_prot_init(const struct proto *prot)
2792{
2793 struct request_sock_ops *rsk_prot = prot->rsk_prot;
2794
2795 if (!rsk_prot)
2796 return 0;
2797
2798 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2799 prot->name);
2800 if (!rsk_prot->slab_name)
2801 return -ENOMEM;
2802
2803 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2804 rsk_prot->obj_size, 0,
Eric Dumazete96f78a2015-10-03 06:27:28 -07002805 prot->slab_flags, NULL);
Eric Dumazet0159dfd2015-03-12 16:44:07 -07002806
2807 if (!rsk_prot->slab) {
2808 pr_crit("%s: Can't create request sock SLAB cache!\n",
2809 prot->name);
2810 return -ENOMEM;
2811 }
2812 return 0;
2813}
2814
Linus Torvalds1da177e2005-04-16 15:20:36 -07002815int proto_register(struct proto *prot, int alloc_slab)
2816{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002817 if (alloc_slab) {
2818 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
Eric Dumazet271b72c2008-10-29 02:11:14 -07002819 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2820 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002821
2822 if (prot->slab == NULL) {
Joe Perchese005d192012-05-16 19:58:40 +00002823 pr_crit("%s: Can't create sock SLAB cache!\n",
2824 prot->name);
Pavel Emelyanov60e76632008-03-28 16:39:10 -07002825 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002826 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002827
Eric Dumazet0159dfd2015-03-12 16:44:07 -07002828 if (req_prot_init(prot))
2829 goto out_free_request_sock_slab;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002830
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002831 if (prot->twsk_prot != NULL) {
Alexey Dobriyanfaf23422010-02-17 09:34:12 +00002832 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002833
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002834 if (prot->twsk_prot->twsk_slab_name == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002835 goto out_free_request_sock_slab;
2836
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002837 prot->twsk_prot->twsk_slab =
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002838 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002839 prot->twsk_prot->twsk_obj_size,
Eric Dumazet3ab5aee2008-11-16 19:40:17 -08002840 0,
Eric Dumazet52db70d2015-04-10 06:07:18 -07002841 prot->slab_flags,
Paul Mundt20c2df82007-07-20 10:11:58 +09002842 NULL);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002843 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002844 goto out_free_timewait_sock_slab_name;
2845 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002846 }
2847
Glauber Costa36b77a52011-12-16 00:51:59 +00002848 mutex_lock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002849 list_add(&prot->node, &proto_list);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002850 assign_proto_idx(prot);
Glauber Costa36b77a52011-12-16 00:51:59 +00002851 mutex_unlock(&proto_list_mutex);
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002852 return 0;
2853
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002854out_free_timewait_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002855 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002856out_free_request_sock_slab:
Eric Dumazet0159dfd2015-03-12 16:44:07 -07002857 req_prot_cleanup(prot->rsk_prot);
2858
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002859 kmem_cache_destroy(prot->slab);
2860 prot->slab = NULL;
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002861out:
2862 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002863}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002864EXPORT_SYMBOL(proto_register);
2865
2866void proto_unregister(struct proto *prot)
2867{
Glauber Costa36b77a52011-12-16 00:51:59 +00002868 mutex_lock(&proto_list_mutex);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002869 release_proto_idx(prot);
Patrick McHardy0a3f4352005-09-06 19:47:50 -07002870 list_del(&prot->node);
Glauber Costa36b77a52011-12-16 00:51:59 +00002871 mutex_unlock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002872
Julia Lawalladf78ed2015-09-13 14:15:18 +02002873 kmem_cache_destroy(prot->slab);
2874 prot->slab = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002875
Eric Dumazet0159dfd2015-03-12 16:44:07 -07002876 req_prot_cleanup(prot->rsk_prot);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002877
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002878 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002879 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002880 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002881 prot->twsk_prot->twsk_slab = NULL;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002882 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002883}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002884EXPORT_SYMBOL(proto_unregister);
2885
2886#ifdef CONFIG_PROC_FS
Linus Torvalds1da177e2005-04-16 15:20:36 -07002887static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
Glauber Costa36b77a52011-12-16 00:51:59 +00002888 __acquires(proto_list_mutex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002889{
Glauber Costa36b77a52011-12-16 00:51:59 +00002890 mutex_lock(&proto_list_mutex);
Pavel Emelianov60f04382007-07-09 13:15:14 -07002891 return seq_list_start_head(&proto_list, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002892}
2893
2894static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2895{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002896 return seq_list_next(v, &proto_list, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002897}
2898
2899static void proto_seq_stop(struct seq_file *seq, void *v)
Glauber Costa36b77a52011-12-16 00:51:59 +00002900 __releases(proto_list_mutex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002901{
Glauber Costa36b77a52011-12-16 00:51:59 +00002902 mutex_unlock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002903}
2904
/*
 * Map a method pointer to a one-character flag for the listing:
 * 'y' when @method is non-NULL, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
Glauber Costa180d8cd2011-12-11 21:47:02 +00002909static long sock_prot_memory_allocated(struct proto *proto)
2910{
Jeffrin Josecb75a362012-04-25 19:17:29 +05302911 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002912}
2913
2914static char *sock_prot_memory_pressure(struct proto *proto)
2915{
2916 return proto->memory_pressure != NULL ?
2917 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2918}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002919
2920static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2921{
Glauber Costa180d8cd2011-12-11 21:47:02 +00002922
Eric Dumazet8d987e52010-11-09 23:24:26 +00002923 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
Linus Torvalds1da177e2005-04-16 15:20:36 -07002924 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2925 proto->name,
2926 proto->obj_size,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002927 sock_prot_inuse_get(seq_file_net(seq), proto),
Glauber Costa180d8cd2011-12-11 21:47:02 +00002928 sock_prot_memory_allocated(proto),
2929 sock_prot_memory_pressure(proto),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002930 proto->max_header,
2931 proto->slab == NULL ? "no" : "yes",
2932 module_name(proto->owner),
2933 proto_method_implemented(proto->close),
2934 proto_method_implemented(proto->connect),
2935 proto_method_implemented(proto->disconnect),
2936 proto_method_implemented(proto->accept),
2937 proto_method_implemented(proto->ioctl),
2938 proto_method_implemented(proto->init),
2939 proto_method_implemented(proto->destroy),
2940 proto_method_implemented(proto->shutdown),
2941 proto_method_implemented(proto->setsockopt),
2942 proto_method_implemented(proto->getsockopt),
2943 proto_method_implemented(proto->sendmsg),
2944 proto_method_implemented(proto->recvmsg),
2945 proto_method_implemented(proto->sendpage),
2946 proto_method_implemented(proto->bind),
2947 proto_method_implemented(proto->backlog_rcv),
2948 proto_method_implemented(proto->hash),
2949 proto_method_implemented(proto->unhash),
2950 proto_method_implemented(proto->get_port),
2951 proto_method_implemented(proto->enter_memory_pressure));
2952}
2953
2954static int proto_seq_show(struct seq_file *seq, void *v)
2955{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002956 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002957 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2958 "protocol",
2959 "size",
2960 "sockets",
2961 "memory",
2962 "press",
2963 "maxhdr",
2964 "slab",
2965 "module",
2966 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2967 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07002968 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002969 return 0;
2970}
2971
Stephen Hemmingerf6908082007-03-12 14:34:29 -07002972static const struct seq_operations proto_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002973 .start = proto_seq_start,
2974 .next = proto_seq_next,
2975 .stop = proto_seq_stop,
2976 .show = proto_seq_show,
2977};
2978
2979static int proto_seq_open(struct inode *inode, struct file *file)
2980{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002981 return seq_open_net(inode, file, &proto_seq_ops,
2982 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002983}
2984
Arjan van de Ven9a321442007-02-12 00:55:35 -08002985static const struct file_operations proto_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002986 .owner = THIS_MODULE,
2987 .open = proto_seq_open,
2988 .read = seq_read,
2989 .llseek = seq_lseek,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002990 .release = seq_release_net,
2991};
2992
2993static __net_init int proto_init_net(struct net *net)
2994{
Gao fengd4beaa62013-02-18 01:34:54 +00002995 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
Eric Dumazet14e943d2008-11-19 15:14:01 -08002996 return -ENOMEM;
2997
2998 return 0;
2999}
3000
3001static __net_exit void proto_exit_net(struct net *net)
3002{
Gao fengece31ff2013-02-18 01:34:56 +00003003 remove_proc_entry("protocols", net->proc_net);
Eric Dumazet14e943d2008-11-19 15:14:01 -08003004}
3005
3006
3007static __net_initdata struct pernet_operations proto_net_ops = {
3008 .init = proto_init_net,
3009 .exit = proto_exit_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003010};
3011
3012static int __init proto_init(void)
3013{
Eric Dumazet14e943d2008-11-19 15:14:01 -08003014 return register_pernet_subsys(&proto_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003015}
3016
3017subsys_initcall(proto_init);
3018
3019#endif /* PROC_FS */