/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink :	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(cgrp, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(cgrp);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct cgroup *cgrp)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(cgrp);
	mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);

/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

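/*
 * These defaults surface in userspace as /proc/sys/net/core/{wmem_max,
 * rmem_max, wmem_default, rmem_default, optmem_max}.  A minimal,
 * illustrative userspace sketch (not part of the kernel build) that
 * reads the current receive-buffer ceiling:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/sys/net/core/rmem_max", "r");
 *		long max;
 *
 *		if (!f || fscanf(f, "%ld", &max) != 1)
 *			return 1;
 *		printf("rmem_max = %ld bytes\n", max);
 *		fclose(f);
 *		return 0;
 *	}
 */
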
#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
			       current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}

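/*
 * Illustrative userspace counterpart of the timeout path above (a
 * sketch, not part of the kernel build): SO_RCVTIMEO/SO_SNDTIMEO take a
 * struct timeval; {0, 0} means "block forever", and a negative tv_sec
 * is logged by the kernel and treated as a zero timeout.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	static int set_recv_timeout(int fd, long sec, long usec)
 *	{
 *		struct timeval tv;
 *
 *		memset(&tv, 0, sizeof(tv));
 *		tv.tv_sec = sec;
 *		tv.tv_usec = usec;
 *		return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,
 *				  &tv, sizeof(tv));
 *	}
 */
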
static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}

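/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * receive timestamping these flags control: once SO_TIMESTAMP is
 * enabled, each received datagram carries an SCM_TIMESTAMP control
 * message holding a struct timeval.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	static int recv_with_timestamp(int fd, void *buf, size_t len,
 *				       struct timeval *stamp)
 *	{
 *		char cbuf[CMSG_SPACE(sizeof(struct timeval))];
 *		struct iovec iov = { .iov_base = buf, .iov_len = len };
 *		struct msghdr msg = {
 *			.msg_iov = &iov, .msg_iovlen = 1,
 *			.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *		};
 *		struct cmsghdr *cmsg;
 *		int n = recvmsg(fd, &msg, 0);
 *
 *		for (cmsg = CMSG_FIRSTHDR(&msg); n >= 0 && cmsg;
 *		     cmsg = CMSG_NXTHDR(&msg, cmsg))
 *			if (cmsg->cmsg_level == SOL_SOCKET &&
 *			    cmsg->cmsg_type == SCM_TIMESTAMP)
 *				memcpy(stamp, CMSG_DATA(cmsg), sizeof(*stamp));
 *		return n;
 *	}
 *
 * Enable beforehand with:
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on));
 */
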
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue. Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* We escape from the RCU-protected region here; make sure we
	 * don't leak a norefcounted dst.
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

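/*
 * A minimal sketch (illustrative only, with hypothetical names) of how a
 * protocol's receive path typically hands a packet to the socket layer
 * with sock_queue_rcv_skb(); on error the caller still owns the skb and
 * must free it:
 *
 *	static int myproto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		int rc = sock_queue_rcv_skb(sk, skb);
 *
 *		if (rc < 0) {
 *			kfree_skb(skb);
 *			return rc;
 *		}
 *		return 0;
 *	}
 */
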
int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}

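/*
 * Illustrative userspace counterpart (a sketch, not part of the kernel
 * build); the caller needs CAP_NET_RAW, and passing an empty name
 * unbinds the socket:
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	static int bind_to_device(int fd, const char *ifname)
 *	{
 *		return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *				  ifname, strlen(ifname) + 1);
 *	}
 *
 * e.g. bind_to_device(fd, "eth0") before connect()/sendto().
 */
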
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't return an error on this; BSD doesn't, and if you
		 * think about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't return an error on this; BSD doesn't, and if you
		 * think about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead. Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool) {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		 * not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);


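/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * SO_RCVBUF doubling described above: the value passed in is doubled to
 * account for bookkeeping overhead, and getsockopt() reports the value
 * actually in use.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static void show_rcvbuf_doubling(int fd)
 *	{
 *		int req = 65536, eff = 0;
 *		socklen_t len = sizeof(eff);
 *
 *		setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *		getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *		printf("requested %d, effective %d\n", req, eff);
 *	}
 *
 * With the default rmem_max this typically prints "requested 65536,
 * effective 131072" (the request is capped at sysctl_rmem_max first).
 */
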
void cred_to_ucred(struct pid *pid, const struct cred *cred,
		   struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
	}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

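/*
 * Illustrative userspace sketch (not part of the kernel build) of the
 * SO_PEERCRED path above, for a connected AF_UNIX socket:
 *
 *	#define _GNU_SOURCE
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *
 *	static void print_peer_creds(int fd)
 *	{
 *		struct ucred uc;
 *		socklen_t len = sizeof(uc);
 *
 *		if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &uc, &len) == 0)
 *			printf("peer pid=%d uid=%d gid=%d\n",
 *			       uc.pid, uc.uid, uc.gid);
 *	}
 */
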
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as-is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

/*
 * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
 * nodes unmodified. Special care is taken when initializing the object to
 * zero.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
	if (offsetof(struct sock, sk_node.next) != 0)
		memset(sk, 0, offsetof(struct sock, sk_node.next));
	memset(&sk->sk_node.pprev, 0,
	       size - offsetof(struct sock, sk_node.pprev));
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

#ifdef CONFIG_CGROUPS
void sock_update_classid(struct sock *sk)
{
	u32 classid;

	rcu_read_lock();  /* doing current task, which cannot vanish. */
	classid = task_cls_classid(current);
	rcu_read_unlock();
	if (classid && classid != sk->sk_classid)
		sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);

void sock_update_netprioidx(struct sock *sk)
{
	if (in_interrupt())
		return;

	sk->sk_cgrp_prioidx = task_netprioidx(current);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk);
		sock_update_netprioidx(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);

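/*
 * A minimal sketch (illustrative only, with hypothetical names) of how a
 * protocol family's create() hook typically uses sk_alloc(), assuming a
 * "myproto_prot" struct proto registered elsewhere:
 *
 *	static int myproto_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_INET, GFP_KERNEL, &myproto_prot);
 *		if (!sk)
 *			return -ENOBUFS;
 *		sock_init_data(sock, sk);
 *		sk->sk_protocol = protocol;
 *		return 0;
 *	}
 */
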
static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc so we can tell whether
	 * some packets are still in some tx queue.
	 * If the count does not drop to zero, sock_wfree() will call
	 * __sk_free(sk) later.
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * The last sock_put should drop the reference to sk->sk_net. It has
 * already been dropped in sk_change_net. Taking a reference to the
 * stopping namespace is not an option.
 * Take a reference to the socket to remove it from the hash _alive_
 * and after that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

Stephen Rothwell475f1b52012-01-09 16:33:16 +11001283static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1284{
1285 if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1286 sock_update_memcg(newsk);
1287}
1288
Eric Dumazete56c57d2011-11-08 17:07:07 -05001289/**
1290 * sk_clone_lock - clone a socket, and lock its clone
1291 * @sk: the socket to clone
1292 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1293 *
1294 * Caller must unlock the socket even in the error path (bh_unlock_sock(newsk))
1295 */
1296struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001297{
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001298 struct sock *newsk;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001299
Pavel Emelyanov8fd1d172007-11-01 00:37:32 -07001300 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001301 if (newsk != NULL) {
1302 struct sk_filter *filter;
1303
Venkat Yekkirala892c1412006-08-04 23:08:56 -07001304 sock_copy(newsk, sk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001305
1306 /* SANITY */
YOSHIFUJI Hideaki3b1e0a62008-03-26 02:26:21 +09001307 get_net(sock_net(newsk));
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001308 sk_node_init(&newsk->sk_node);
1309 sock_lock_init(newsk);
1310 bh_lock_sock(newsk);
Eric Dumazetfa438cc2007-03-04 16:05:44 -08001311 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
Zhu Yi8eae9392010-03-04 18:01:40 +00001312 newsk->sk_backlog.len = 0;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001313
1314 atomic_set(&newsk->sk_rmem_alloc, 0);
Eric Dumazet2b85a342009-06-11 02:55:43 -07001315 /*
1316 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1317 */
1318 atomic_set(&newsk->sk_wmem_alloc, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001319 atomic_set(&newsk->sk_omem_alloc, 0);
1320 skb_queue_head_init(&newsk->sk_receive_queue);
1321 skb_queue_head_init(&newsk->sk_write_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07001322#ifdef CONFIG_NET_DMA
1323 skb_queue_head_init(&newsk->sk_async_wait_queue);
1324#endif
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001325
Eric Dumazetb6c67122010-04-08 23:03:29 +00001326 spin_lock_init(&newsk->sk_dst_lock);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001327 rwlock_init(&newsk->sk_callback_lock);
Peter Zijlstra443aef0e2007-07-19 01:49:00 -07001328 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1329 af_callback_keys + newsk->sk_family,
1330 af_family_clock_key_strings[newsk->sk_family]);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001331
1332 newsk->sk_dst_cache = NULL;
1333 newsk->sk_wmem_queued = 0;
1334 newsk->sk_forward_alloc = 0;
1335 newsk->sk_send_head = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001336 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1337
1338 sock_reset_flag(newsk, SOCK_DONE);
1339 skb_queue_head_init(&newsk->sk_error_queue);
1340
Eric Dumazet0d7da9d2010-10-25 03:47:05 +00001341 filter = rcu_dereference_protected(newsk->sk_filter, 1);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001342 if (filter != NULL)
1343 sk_filter_charge(newsk, filter);
1344
1345 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1346 /* It is still a raw copy of the parent, so invalidate
1347 * the destructor and do a plain sk_free() */
1348 newsk->sk_destruct = NULL;
Thomas Gleixnerb0691c82011-10-25 02:30:50 +00001349 bh_unlock_sock(newsk);
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001350 sk_free(newsk);
1351 newsk = NULL;
1352 goto out;
1353 }
1354
1355 newsk->sk_err = 0;
1356 newsk->sk_priority = 0;
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00001357 /*
1358 * Before updating sk_refcnt, we must commit prior changes to memory
1359 * (Documentation/RCU/rculist_nulls.txt for details)
1360 */
1361 smp_wmb();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001362 atomic_set(&newsk->sk_refcnt, 2);
1363
1364 /*
1365 * Increment the counter in the same struct proto as the master
1366 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1367 * is the same as sk->sk_prot->socks, as this field was copied
1368 * with memcpy).
1369 *
1370 * This _changes_ the previous behaviour, where
1371 * tcp_create_openreq_child always incremented the
1372 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1373 * to be taken into account in all callers. -acme
1374 */
1375 sk_refcnt_debug_inc(newsk);
David S. Miller972692e2008-06-17 22:41:38 -07001376 sk_set_socket(newsk, NULL);
Eric Dumazet43815482010-04-29 11:01:49 +00001377 newsk->sk_wq = NULL;
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001378
Glauber Costaf3f511e2012-01-05 20:16:39 +00001379 sk_update_clone(sk, newsk);
1380
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001381 if (newsk->sk_prot->sockets_allocated)
Glauber Costa180d8cd2011-12-11 21:47:02 +00001382 sk_sockets_allocated_inc(newsk);
Octavian Purdila704da5602010-01-08 00:00:09 -08001383
Eric Dumazet08e29af2011-11-28 12:04:18 +00001384 if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
Octavian Purdila704da5602010-01-08 00:00:09 -08001385 net_enable_timestamp();
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001386 }
1387out:
1388 return newsk;
1389}
Eric Dumazete56c57d2011-11-08 17:07:07 -05001390EXPORT_SYMBOL_GPL(sk_clone_lock);
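
/*
 * Illustrative sketch, not part of sock.c: as the kerneldoc above says,
 * the caller of sk_clone_lock() owns the bh lock on the clone and must
 * release it itself, even on an error path.
 */
static struct sock *my_clone(const struct sock *parent)
{
	struct sock *newsk = sk_clone_lock(parent, GFP_ATOMIC);

	if (newsk) {
		/* ...protocol-private initialisation of newsk here... */
		bh_unlock_sock(newsk);
	}
	return newsk;
}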
Arnaldo Carvalho de Melo87d11ce2005-08-09 20:10:12 -07001391
Andi Kleen99580892007-04-20 17:12:43 -07001392void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1393{
1394 __sk_dst_set(sk, dst);
1395 sk->sk_route_caps = dst->dev->features;
1396 if (sk->sk_route_caps & NETIF_F_GSO)
Herbert Xu4fcd6b92007-05-31 22:15:50 -07001397 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
Eric Dumazeta4654192010-05-16 00:36:33 -07001398 sk->sk_route_caps &= ~sk->sk_route_nocaps;
Andi Kleen99580892007-04-20 17:12:43 -07001399 if (sk_can_gso(sk)) {
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001400 if (dst->header_len) {
Andi Kleen99580892007-04-20 17:12:43 -07001401 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001402 } else {
Andi Kleen99580892007-04-20 17:12:43 -07001403 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07001404 sk->sk_gso_max_size = dst->dev->gso_max_size;
1405 }
Andi Kleen99580892007-04-20 17:12:43 -07001406 }
1407}
1408EXPORT_SYMBOL_GPL(sk_setup_caps);
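
/*
 * Illustrative sketch, not part of sock.c: sk_setup_caps() is typically
 * called once a route has been bound to the socket, e.g. at connect
 * time. The struct rtable (from <net/route.h>) stands for a route the
 * caller has already looked up; my_connect_finish is a made-up name.
 */
static void my_connect_finish(struct sock *sk, struct rtable *rt)
{
	/* Inherit GSO/checksum capabilities from the output device. */
	sk_setup_caps(sk, &rt->dst);
}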
1409
Linus Torvalds1da177e2005-04-16 15:20:36 -07001410void __init sk_init(void)
1411{
Jan Beulich44813742009-09-21 17:03:05 -07001412 if (totalram_pages <= 4096) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001413 sysctl_wmem_max = 32767;
1414 sysctl_rmem_max = 32767;
1415 sysctl_wmem_default = 32767;
1416 sysctl_rmem_default = 32767;
Jan Beulich44813742009-09-21 17:03:05 -07001417 } else if (totalram_pages >= 131072) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418 sysctl_wmem_max = 131071;
1419 sysctl_rmem_max = 131071;
1420 }
1421}
1422
1423/*
1424 * Simple resource managers for sockets.
1425 */
1426
1427
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001428/*
1429 * Write buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001430 */
1431void sock_wfree(struct sk_buff *skb)
1432{
1433 struct sock *sk = skb->sk;
Eric Dumazetd99927f2009-09-24 10:49:24 +00001434 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001435
Eric Dumazetd99927f2009-09-24 10:49:24 +00001436 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1437 /*
1438 * Keep a reference on sk_wmem_alloc; it will be released
1439 * after the sk_write_space() call
1440 */
1441 atomic_sub(len - 1, &sk->sk_wmem_alloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001442 sk->sk_write_space(sk);
Eric Dumazetd99927f2009-09-24 10:49:24 +00001443 len = 1;
1444 }
Eric Dumazet2b85a342009-06-11 02:55:43 -07001445 /*
Eric Dumazetd99927f2009-09-24 10:49:24 +00001446 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1447 * could not do because of in-flight packets
Eric Dumazet2b85a342009-06-11 02:55:43 -07001448 */
Eric Dumazetd99927f2009-09-24 10:49:24 +00001449 if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
Eric Dumazet2b85a342009-06-11 02:55:43 -07001450 __sk_free(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001451}
Eric Dumazet2a915252009-05-27 11:30:05 +00001452EXPORT_SYMBOL(sock_wfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001453
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001454/*
1455 * Read buffer destructor automatically called from kfree_skb.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001456 */
1457void sock_rfree(struct sk_buff *skb)
1458{
1459 struct sock *sk = skb->sk;
Eric Dumazetd361fd52010-07-10 22:45:17 +00001460 unsigned int len = skb->truesize;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001461
Eric Dumazetd361fd52010-07-10 22:45:17 +00001462 atomic_sub(len, &sk->sk_rmem_alloc);
1463 sk_mem_uncharge(sk, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001464}
Eric Dumazet2a915252009-05-27 11:30:05 +00001465EXPORT_SYMBOL(sock_rfree);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001466
1467
1468int sock_i_uid(struct sock *sk)
1469{
1470 int uid;
1471
Eric Dumazetf064af12010-09-22 12:43:39 +00001472 read_lock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001473 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
Eric Dumazetf064af12010-09-22 12:43:39 +00001474 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001475 return uid;
1476}
Eric Dumazet2a915252009-05-27 11:30:05 +00001477EXPORT_SYMBOL(sock_i_uid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001478
1479unsigned long sock_i_ino(struct sock *sk)
1480{
1481 unsigned long ino;
1482
Eric Dumazetf064af12010-09-22 12:43:39 +00001483 read_lock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001484 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
Eric Dumazetf064af12010-09-22 12:43:39 +00001485 read_unlock_bh(&sk->sk_callback_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001486 return ino;
1487}
Eric Dumazet2a915252009-05-27 11:30:05 +00001488EXPORT_SYMBOL(sock_i_ino);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001489
1490/*
1491 * Allocate a skb from the socket's send buffer.
1492 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001493struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001494 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001495{
1496 if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Eric Dumazet2a915252009-05-27 11:30:05 +00001497 struct sk_buff *skb = alloc_skb(size, priority);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001498 if (skb) {
1499 skb_set_owner_w(skb, sk);
1500 return skb;
1501 }
1502 }
1503 return NULL;
1504}
Eric Dumazet2a915252009-05-27 11:30:05 +00001505EXPORT_SYMBOL(sock_wmalloc);
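
/*
 * Illustrative sketch, not part of sock.c: sock_wmalloc() charges the
 * skb against sk_wmem_alloc and arms sock_wfree() (above) as its
 * destructor via skb_set_owner_w(), so simply freeing the skb
 * uncharges the socket. my_send_probe is a made-up name.
 */
static int my_send_probe(struct sock *sk)
{
	struct sk_buff *skb = sock_wmalloc(sk, 128, 0, GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;	/* send buffer full and force == 0 */
	/* ...build the packet; kfree_skb() on any error path runs
	 * sock_wfree() and releases the accounted space... */
	kfree_skb(skb);
	return 0;
}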
Linus Torvalds1da177e2005-04-16 15:20:36 -07001506
1507/*
1508 * Allocate a skb from the socket's receive buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001509 */
Victor Fusco86a76ca2005-07-08 14:57:47 -07001510struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
Al Virodd0fc662005-10-07 07:46:04 +01001511 gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001512{
1513 if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1514 struct sk_buff *skb = alloc_skb(size, priority);
1515 if (skb) {
1516 skb_set_owner_r(skb, sk);
1517 return skb;
1518 }
1519 }
1520 return NULL;
1521}
1522
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001523/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001524 * Allocate a memory block from the socket's option memory buffer.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001525 */
Al Virodd0fc662005-10-07 07:46:04 +01001526void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001527{
Eric Dumazet95c96172012-04-15 05:58:06 +00001528 if ((unsigned int)size <= sysctl_optmem_max &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07001529 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1530 void *mem;
1531 /* Do the add first, to avoid the race in case kmalloc
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001532 * sleeps.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001533 */
1534 atomic_add(size, &sk->sk_omem_alloc);
1535 mem = kmalloc(size, priority);
1536 if (mem)
1537 return mem;
1538 atomic_sub(size, &sk->sk_omem_alloc);
1539 }
1540 return NULL;
1541}
Eric Dumazet2a915252009-05-27 11:30:05 +00001542EXPORT_SYMBOL(sock_kmalloc);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543
1544/*
1545 * Free an option memory block.
1546 */
1547void sock_kfree_s(struct sock *sk, void *mem, int size)
1548{
1549 kfree(mem);
1550 atomic_sub(size, &sk->sk_omem_alloc);
1551}
Eric Dumazet2a915252009-05-27 11:30:05 +00001552EXPORT_SYMBOL(sock_kfree_s);
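
/*
 * Illustrative sketch, not part of sock.c: option memory must be freed
 * with sock_kfree_s() using the same size that was charged, because
 * sk_omem_alloc is adjusted by the caller-supplied size. The function
 * name is a made-up example.
 */
static int my_setsockopt_buf(struct sock *sk, int len)
{
	void *buf = sock_kmalloc(sk, len, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;	/* would exceed sysctl_optmem_max */
	/* ...copy in and apply the option... */
	sock_kfree_s(sk, buf, len);
	return 0;
}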
Linus Torvalds1da177e2005-04-16 15:20:36 -07001553
1554/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1555 I think these locks should be removed for datagram sockets.
1556 */
Eric Dumazet2a915252009-05-27 11:30:05 +00001557static long sock_wait_for_wmem(struct sock *sk, long timeo)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001558{
1559 DEFINE_WAIT(wait);
1560
1561 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1562 for (;;) {
1563 if (!timeo)
1564 break;
1565 if (signal_pending(current))
1566 break;
1567 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00001568 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001569 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1570 break;
1571 if (sk->sk_shutdown & SEND_SHUTDOWN)
1572 break;
1573 if (sk->sk_err)
1574 break;
1575 timeo = schedule_timeout(timeo);
1576 }
Eric Dumazetaa395142010-04-20 13:03:51 +00001577 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001578 return timeo;
1579}
1580
1581
1582/*
1583 * Generic send/receive buffer handlers
1584 */
1585
Herbert Xu4cc7f682009-02-04 16:55:54 -08001586struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1587 unsigned long data_len, int noblock,
1588 int *errcode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001589{
1590 struct sk_buff *skb;
Al Viro7d877f32005-10-21 03:20:43 -04001591 gfp_t gfp_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001592 long timeo;
1593 int err;
1594
1595 gfp_mask = sk->sk_allocation;
1596 if (gfp_mask & __GFP_WAIT)
1597 gfp_mask |= __GFP_REPEAT;
1598
1599 timeo = sock_sndtimeo(sk, noblock);
1600 while (1) {
1601 err = sock_error(sk);
1602 if (err != 0)
1603 goto failure;
1604
1605 err = -EPIPE;
1606 if (sk->sk_shutdown & SEND_SHUTDOWN)
1607 goto failure;
1608
1609 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
Larry Woodmandb38c1792006-11-03 16:05:45 -08001610 skb = alloc_skb(header_len, gfp_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001611 if (skb) {
1612 int npages;
1613 int i;
1614
1615 /* No pages, we're done... */
1616 if (!data_len)
1617 break;
1618
1619 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1620 skb->truesize += data_len;
1621 skb_shinfo(skb)->nr_frags = npages;
1622 for (i = 0; i < npages; i++) {
1623 struct page *page;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001624
1625 page = alloc_pages(sk->sk_allocation, 0);
1626 if (!page) {
1627 err = -ENOBUFS;
1628 skb_shinfo(skb)->nr_frags = i;
1629 kfree_skb(skb);
1630 goto failure;
1631 }
1632
Ian Campbellea2ab692011-08-22 23:44:58 +00001633 __skb_fill_page_desc(skb, i,
1634 page, 0,
1635 (data_len >= PAGE_SIZE ?
1636 PAGE_SIZE :
1637 data_len));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001638 data_len -= PAGE_SIZE;
1639 }
1640
1641 /* Full success... */
1642 break;
1643 }
1644 err = -ENOBUFS;
1645 goto failure;
1646 }
1647 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1648 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1649 err = -EAGAIN;
1650 if (!timeo)
1651 goto failure;
1652 if (signal_pending(current))
1653 goto interrupted;
1654 timeo = sock_wait_for_wmem(sk, timeo);
1655 }
1656
1657 skb_set_owner_w(skb, sk);
1658 return skb;
1659
1660interrupted:
1661 err = sock_intr_errno(timeo);
1662failure:
1663 *errcode = err;
1664 return NULL;
1665}
Herbert Xu4cc7f682009-02-04 16:55:54 -08001666EXPORT_SYMBOL(sock_alloc_send_pskb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001667
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001668struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001669 int noblock, int *errcode)
1670{
1671 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1672}
Eric Dumazet2a915252009-05-27 11:30:05 +00001673EXPORT_SYMBOL(sock_alloc_send_skb);
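
/*
 * Illustrative sketch, not part of sock.c: a typical datagram sendmsg
 * path blocks in sock_alloc_send_skb() (subject to the socket's send
 * timeout) until write space is available, then propagates the error
 * reported through *errcode. Names here are made up for the example.
 */
static int my_sendmsg_alloc(struct sock *sk, size_t len, int noblock)
{
	int err;
	struct sk_buff *skb = sock_alloc_send_skb(sk, len, noblock, &err);

	if (!skb)
		return err;	/* -EAGAIN, -EPIPE, -ERESTARTSYS, ... */
	/* ...fill the skb and hand it to the device layer... */
	kfree_skb(skb);		/* placeholder for the real transmit */
	return 0;
}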
Linus Torvalds1da177e2005-04-16 15:20:36 -07001674
1675static void __lock_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001676 __releases(&sk->sk_lock.slock)
1677 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001678{
1679 DEFINE_WAIT(wait);
1680
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001681 for (;;) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001682 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1683 TASK_UNINTERRUPTIBLE);
1684 spin_unlock_bh(&sk->sk_lock.slock);
1685 schedule();
1686 spin_lock_bh(&sk->sk_lock.slock);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001687 if (!sock_owned_by_user(sk))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001688 break;
1689 }
1690 finish_wait(&sk->sk_lock.wq, &wait);
1691}
1692
1693static void __release_sock(struct sock *sk)
Namhyung Kimf39234d2010-09-08 03:48:48 +00001694 __releases(&sk->sk_lock.slock)
1695 __acquires(&sk->sk_lock.slock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001696{
1697 struct sk_buff *skb = sk->sk_backlog.head;
1698
1699 do {
1700 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1701 bh_unlock_sock(sk);
1702
1703 do {
1704 struct sk_buff *next = skb->next;
1705
Eric Dumazete4cbb022012-04-30 16:07:09 +00001706 prefetch(next);
Eric Dumazet7fee2262010-05-11 23:19:48 +00001707 WARN_ON_ONCE(skb_dst_is_noref(skb));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708 skb->next = NULL;
Peter Zijlstrac57943a2008-10-07 14:18:42 -07001709 sk_backlog_rcv(sk, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001710
1711 /*
1712 * We are in process context here with softirqs
1713 * disabled, use cond_resched_softirq() to preempt.
1714 * This is safe to do because we've taken the backlog
1715 * queue private:
1716 */
1717 cond_resched_softirq();
1718
1719 skb = next;
1720 } while (skb != NULL);
1721
1722 bh_lock_sock(sk);
Stephen Hemmingere71a4782007-04-10 20:10:33 -07001723 } while ((skb = sk->sk_backlog.head) != NULL);
Zhu Yi8eae9392010-03-04 18:01:40 +00001724
1725 /*
1726 * Doing the zeroing here guarantees we cannot loop forever
1727 * while a wild producer attempts to flood us.
1728 */
1729 sk->sk_backlog.len = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001730}
1731
1732/**
1733 * sk_wait_data - wait for data to arrive at sk_receive_queue
Pavel Pisa4dc3b162005-05-01 08:59:25 -07001734 * @sk: sock to wait on
1735 * @timeo: for how long
Linus Torvalds1da177e2005-04-16 15:20:36 -07001736 *
1737 * Now the socket state, including sk->sk_err, is changed only under
1738 * the lock, hence we may omit checks after joining the wait queue.
1739 * We check the receive queue before schedule() only as an optimization;
1740 * it is very likely that release_sock() added new data.
1741 */
1742int sk_wait_data(struct sock *sk, long *timeo)
1743{
1744 int rc;
1745 DEFINE_WAIT(wait);
1746
Eric Dumazetaa395142010-04-20 13:03:51 +00001747 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001748 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1749 rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1750 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
Eric Dumazetaa395142010-04-20 13:03:51 +00001751 finish_wait(sk_sleep(sk), &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001752 return rc;
1753}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001754EXPORT_SYMBOL(sk_wait_data);
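
/*
 * Illustrative sketch, not part of sock.c: a simplified blocking
 * receive loop built on sk_wait_data(). The socket lock is assumed to
 * be held, as the kerneldoc above requires, and timeo is consumed
 * across successive waits; sk_wait_data() drops and retakes the lock
 * around the sleep.
 */
static struct sk_buff *my_wait_for_data(struct sock *sk, int noblock)
{
	long timeo = sock_rcvtimeo(sk, noblock);
	struct sk_buff *skb;

	while ((skb = skb_dequeue(&sk->sk_receive_queue)) == NULL) {
		if (!timeo || signal_pending(current))
			break;			/* timeout or signal */
		sk_wait_data(sk, &timeo);
	}
	return skb;
}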
1755
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001756/**
1757 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1758 * @sk: socket
1759 * @size: memory size to allocate
1760 * @kind: allocation type
1761 *
1762 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1763 * rmem allocation. This function assumes that protocols which have
1764 * memory_pressure use sk_wmem_queued as write buffer accounting.
1765 */
1766int __sk_mem_schedule(struct sock *sk, int size, int kind)
1767{
1768 struct proto *prot = sk->sk_prot;
1769 int amt = sk_mem_pages(size);
Eric Dumazet8d987e52010-11-09 23:24:26 +00001770 long allocated;
Glauber Costae1aab162011-12-11 21:47:03 +00001771 int parent_status = UNDER_LIMIT;
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001772
1773 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00001774
Glauber Costae1aab162011-12-11 21:47:03 +00001775 allocated = sk_memory_allocated_add(sk, amt, &parent_status);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001776
1777 /* Under limit. */
Glauber Costae1aab162011-12-11 21:47:03 +00001778 if (parent_status == UNDER_LIMIT &&
1779 allocated <= sk_prot_mem_limits(sk, 0)) {
Glauber Costa180d8cd2011-12-11 21:47:02 +00001780 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001781 return 1;
1782 }
1783
Glauber Costae1aab162011-12-11 21:47:03 +00001784 /* Under pressure. (we or our parents) */
1785 if ((parent_status > SOFT_LIMIT) ||
1786 allocated > sk_prot_mem_limits(sk, 1))
Glauber Costa180d8cd2011-12-11 21:47:02 +00001787 sk_enter_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001788
Glauber Costae1aab162011-12-11 21:47:03 +00001789 /* Over hard limit (we or our parents) */
1790 if ((parent_status == OVER_LIMIT) ||
1791 (allocated > sk_prot_mem_limits(sk, 2)))
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001792 goto suppress_allocation;
1793
1794 /* guarantee minimum buffer size under pressure */
1795 if (kind == SK_MEM_RECV) {
1796 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1797 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00001798
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001799 } else { /* SK_MEM_SEND */
1800 if (sk->sk_type == SOCK_STREAM) {
1801 if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1802 return 1;
1803 } else if (atomic_read(&sk->sk_wmem_alloc) <
1804 prot->sysctl_wmem[0])
1805 return 1;
1806 }
1807
Glauber Costa180d8cd2011-12-11 21:47:02 +00001808 if (sk_has_memory_pressure(sk)) {
Eric Dumazet17483762008-11-25 21:16:35 -08001809 int alloc;
1810
Glauber Costa180d8cd2011-12-11 21:47:02 +00001811 if (!sk_under_memory_pressure(sk))
Eric Dumazet17483762008-11-25 21:16:35 -08001812 return 1;
Glauber Costa180d8cd2011-12-11 21:47:02 +00001813 alloc = sk_sockets_allocated_read_positive(sk);
1814 if (sk_prot_mem_limits(sk, 2) > alloc *
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001815 sk_mem_pages(sk->sk_wmem_queued +
1816 atomic_read(&sk->sk_rmem_alloc) +
1817 sk->sk_forward_alloc))
1818 return 1;
1819 }
1820
1821suppress_allocation:
1822
1823 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1824 sk_stream_moderate_sndbuf(sk);
1825
1826 /* Fail only if socket is _under_ its sndbuf.
1827 * In this case we cannot block, so we have to fail.
1828 */
1829 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1830 return 1;
1831 }
1832
Satoru Moriya3847ce32011-06-17 12:00:03 +00001833 trace_sock_exceed_buf_limit(sk, prot, allocated);
1834
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001835 /* Alas. Undo changes. */
1836 sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
Glauber Costa180d8cd2011-12-11 21:47:02 +00001837
Glauber Costa0e90b312012-01-20 04:57:16 +00001838 sk_memory_allocated_sub(sk, amt);
Glauber Costa180d8cd2011-12-11 21:47:02 +00001839
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001840 return 0;
1841}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001842EXPORT_SYMBOL(__sk_mem_schedule);
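
/*
 * Illustrative sketch, not part of sock.c: protocols normally reach
 * __sk_mem_schedule() through the sk_rmem_schedule()/sk_wmem_schedule()
 * inlines in <net/sock.h>, which spend sk_forward_alloc first and only
 * fall back here when the per-socket quantum is exhausted. The helper
 * name below is made up.
 */
static bool my_charge_receive(struct sock *sk, struct sk_buff *skb)
{
	if (!sk_rmem_schedule(sk, skb->truesize))
		return false;		/* over limit: caller should drop */
	skb_set_owner_r(skb, sk);	/* charges sk_rmem_alloc */
	return true;
}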
1843
1844/**
1845 * __sk_mem_reclaim - reclaim memory_allocated
1846 * @sk: socket
1847 */
1848void __sk_mem_reclaim(struct sock *sk)
1849{
Glauber Costa180d8cd2011-12-11 21:47:02 +00001850 sk_memory_allocated_sub(sk,
Glauber Costa0e90b312012-01-20 04:57:16 +00001851 sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001852 sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1853
Glauber Costa180d8cd2011-12-11 21:47:02 +00001854 if (sk_under_memory_pressure(sk) &&
1855 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1856 sk_leave_memory_pressure(sk);
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001857}
Hideo Aoki3ab224b2007-12-31 00:11:19 -08001858EXPORT_SYMBOL(__sk_mem_reclaim);
1859
1860
Linus Torvalds1da177e2005-04-16 15:20:36 -07001861/*
1862 * Set of default routines for initialising struct proto_ops when
1863 * the protocol does not support a particular function. In certain
1864 * cases where it makes no sense for a protocol to have a "do nothing"
1865 * function, some default processing is provided.
1866 */
1867
1868int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1869{
1870 return -EOPNOTSUPP;
1871}
Eric Dumazet2a915252009-05-27 11:30:05 +00001872EXPORT_SYMBOL(sock_no_bind);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001873
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001874int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001875 int len, int flags)
1876{
1877 return -EOPNOTSUPP;
1878}
Eric Dumazet2a915252009-05-27 11:30:05 +00001879EXPORT_SYMBOL(sock_no_connect);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001880
1881int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1882{
1883 return -EOPNOTSUPP;
1884}
Eric Dumazet2a915252009-05-27 11:30:05 +00001885EXPORT_SYMBOL(sock_no_socketpair);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001886
1887int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1888{
1889 return -EOPNOTSUPP;
1890}
Eric Dumazet2a915252009-05-27 11:30:05 +00001891EXPORT_SYMBOL(sock_no_accept);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001892
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001893int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001894 int *len, int peer)
1895{
1896 return -EOPNOTSUPP;
1897}
Eric Dumazet2a915252009-05-27 11:30:05 +00001898EXPORT_SYMBOL(sock_no_getname);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001899
Eric Dumazet2a915252009-05-27 11:30:05 +00001900unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001901{
1902 return 0;
1903}
Eric Dumazet2a915252009-05-27 11:30:05 +00001904EXPORT_SYMBOL(sock_no_poll);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001905
1906int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1907{
1908 return -EOPNOTSUPP;
1909}
Eric Dumazet2a915252009-05-27 11:30:05 +00001910EXPORT_SYMBOL(sock_no_ioctl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001911
1912int sock_no_listen(struct socket *sock, int backlog)
1913{
1914 return -EOPNOTSUPP;
1915}
Eric Dumazet2a915252009-05-27 11:30:05 +00001916EXPORT_SYMBOL(sock_no_listen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001917
1918int sock_no_shutdown(struct socket *sock, int how)
1919{
1920 return -EOPNOTSUPP;
1921}
Eric Dumazet2a915252009-05-27 11:30:05 +00001922EXPORT_SYMBOL(sock_no_shutdown);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001923
1924int sock_no_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07001925 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001926{
1927 return -EOPNOTSUPP;
1928}
Eric Dumazet2a915252009-05-27 11:30:05 +00001929EXPORT_SYMBOL(sock_no_setsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001930
1931int sock_no_getsockopt(struct socket *sock, int level, int optname,
1932 char __user *optval, int __user *optlen)
1933{
1934 return -EOPNOTSUPP;
1935}
Eric Dumazet2a915252009-05-27 11:30:05 +00001936EXPORT_SYMBOL(sock_no_getsockopt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001937
1938int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1939 size_t len)
1940{
1941 return -EOPNOTSUPP;
1942}
Eric Dumazet2a915252009-05-27 11:30:05 +00001943EXPORT_SYMBOL(sock_no_sendmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001944
1945int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1946 size_t len, int flags)
1947{
1948 return -EOPNOTSUPP;
1949}
Eric Dumazet2a915252009-05-27 11:30:05 +00001950EXPORT_SYMBOL(sock_no_recvmsg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001951
1952int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1953{
1954 /* Mirror missing mmap method error code */
1955 return -ENODEV;
1956}
Eric Dumazet2a915252009-05-27 11:30:05 +00001957EXPORT_SYMBOL(sock_no_mmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001958
1959ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1960{
1961 ssize_t res;
1962 struct msghdr msg = {.msg_flags = flags};
1963 struct kvec iov;
1964 char *kaddr = kmap(page);
1965 iov.iov_base = kaddr + offset;
1966 iov.iov_len = size;
1967 res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1968 kunmap(page);
1969 return res;
1970}
Eric Dumazet2a915252009-05-27 11:30:05 +00001971EXPORT_SYMBOL(sock_no_sendpage);
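
/*
 * Illustrative sketch, not part of sock.c: a protocol plugs the
 * sock_no_*() stubs into the slots it does not support, so every entry
 * is populated and userspace gets -EOPNOTSUPP instead of an oops. The
 * elided my_* handlers stand for real implementations.
 */
static const struct proto_ops my_dgram_ops = {
	.family		= PF_INET,
	.owner		= THIS_MODULE,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
	/* .release, .bind, .sendmsg, .recvmsg, ... = my_* handlers */
};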
Linus Torvalds1da177e2005-04-16 15:20:36 -07001972
1973/*
1974 * Default Socket Callbacks
1975 */
1976
1977static void sock_def_wakeup(struct sock *sk)
1978{
Eric Dumazet43815482010-04-29 11:01:49 +00001979 struct socket_wq *wq;
1980
1981 rcu_read_lock();
1982 wq = rcu_dereference(sk->sk_wq);
1983 if (wq_has_sleeper(wq))
1984 wake_up_interruptible_all(&wq->wait);
1985 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001986}
1987
1988static void sock_def_error_report(struct sock *sk)
1989{
Eric Dumazet43815482010-04-29 11:01:49 +00001990 struct socket_wq *wq;
1991
1992 rcu_read_lock();
1993 wq = rcu_dereference(sk->sk_wq);
1994 if (wq_has_sleeper(wq))
1995 wake_up_interruptible_poll(&wq->wait, POLLERR);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08001996 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
Eric Dumazet43815482010-04-29 11:01:49 +00001997 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001998}
1999
2000static void sock_def_readable(struct sock *sk, int len)
2001{
Eric Dumazet43815482010-04-29 11:01:49 +00002002 struct socket_wq *wq;
2003
2004 rcu_read_lock();
2005 wq = rcu_dereference(sk->sk_wq);
2006 if (wq_has_sleeper(wq))
Eric Dumazet2c6607c2011-01-06 10:54:29 -08002007 wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
Davide Libenzi37e55402009-03-31 15:24:21 -07002008 POLLRDNORM | POLLRDBAND);
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002009 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
Eric Dumazet43815482010-04-29 11:01:49 +00002010 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002011}
2012
2013static void sock_def_write_space(struct sock *sk)
2014{
Eric Dumazet43815482010-04-29 11:01:49 +00002015 struct socket_wq *wq;
2016
2017 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002018
2019 /* Do not wake up a writer until he can make "significant"
2020 * progress. --DaveM
2021 */
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002022 if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
Eric Dumazet43815482010-04-29 11:01:49 +00002023 wq = rcu_dereference(sk->sk_wq);
2024 if (wq_has_sleeper(wq))
2025 wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
Davide Libenzi37e55402009-03-31 15:24:21 -07002026 POLLWRNORM | POLLWRBAND);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002027
2028 /* Should agree with poll, otherwise some programs break */
2029 if (sock_writeable(sk))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002030 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002031 }
2032
Eric Dumazet43815482010-04-29 11:01:49 +00002033 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002034}
2035
2036static void sock_def_destruct(struct sock *sk)
2037{
Jesper Juhla51482b2005-11-08 09:41:34 -08002038 kfree(sk->sk_protinfo);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002039}
2040
2041void sk_send_sigurg(struct sock *sk)
2042{
2043 if (sk->sk_socket && sk->sk_socket->file)
2044 if (send_sigurg(&sk->sk_socket->file->f_owner))
Pavel Emelyanov8d8ad9d2007-11-26 20:10:50 +08002045 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002046}
Eric Dumazet2a915252009-05-27 11:30:05 +00002047EXPORT_SYMBOL(sk_send_sigurg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002048
2049void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2050 unsigned long expires)
2051{
2052 if (!mod_timer(timer, expires))
2053 sock_hold(sk);
2054}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002055EXPORT_SYMBOL(sk_reset_timer);
2056
2057void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2058{
2059 if (timer_pending(timer) && del_timer(timer))
2060 __sock_put(sk);
2061}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002062EXPORT_SYMBOL(sk_stop_timer);
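
/*
 * Illustrative sketch, not part of sock.c: sk_reset_timer() takes a
 * socket reference when it arms a previously idle timer, so the timer
 * handler is expected to drop that reference with sock_put() when it
 * fires. The handler name is made up.
 */
static void my_timer_fire(unsigned long data)
{
	struct sock *sk = (struct sock *)data;

	/* ...timeout processing, typically under bh_lock_sock()... */
	sock_put(sk);	/* balance the hold taken by sk_reset_timer() */
}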
2063
2064void sock_init_data(struct socket *sock, struct sock *sk)
2065{
2066 skb_queue_head_init(&sk->sk_receive_queue);
2067 skb_queue_head_init(&sk->sk_write_queue);
2068 skb_queue_head_init(&sk->sk_error_queue);
Chris Leech97fc2f02006-05-23 17:55:33 -07002069#ifdef CONFIG_NET_DMA
2070 skb_queue_head_init(&sk->sk_async_wait_queue);
2071#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002072
2073 sk->sk_send_head = NULL;
2074
2075 init_timer(&sk->sk_timer);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002076
Linus Torvalds1da177e2005-04-16 15:20:36 -07002077 sk->sk_allocation = GFP_KERNEL;
2078 sk->sk_rcvbuf = sysctl_rmem_default;
2079 sk->sk_sndbuf = sysctl_wmem_default;
2080 sk->sk_state = TCP_CLOSE;
David S. Miller972692e2008-06-17 22:41:38 -07002081 sk_set_socket(sk, sock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002082
2083 sock_set_flag(sk, SOCK_ZAPPED);
2084
Stephen Hemmingere71a4782007-04-10 20:10:33 -07002085 if (sock) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002086 sk->sk_type = sock->type;
Eric Dumazet43815482010-04-29 11:01:49 +00002087 sk->sk_wq = sock->wq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002088 sock->sk = sk;
2089 } else
Eric Dumazet43815482010-04-29 11:01:49 +00002090 sk->sk_wq = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002091
Eric Dumazetb6c67122010-04-08 23:03:29 +00002092 spin_lock_init(&sk->sk_dst_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002093 rwlock_init(&sk->sk_callback_lock);
Peter Zijlstra443aef0e2007-07-19 01:49:00 -07002094 lockdep_set_class_and_name(&sk->sk_callback_lock,
2095 af_callback_keys + sk->sk_family,
2096 af_family_clock_key_strings[sk->sk_family]);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002097
2098 sk->sk_state_change = sock_def_wakeup;
2099 sk->sk_data_ready = sock_def_readable;
2100 sk->sk_write_space = sock_def_write_space;
2101 sk->sk_error_report = sock_def_error_report;
2102 sk->sk_destruct = sock_def_destruct;
2103
2104 sk->sk_sndmsg_page = NULL;
2105 sk->sk_sndmsg_off = 0;
Pavel Emelyanovef64a542012-02-21 07:31:34 +00002106 sk->sk_peek_off = -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002107
Eric W. Biederman109f6e32010-06-13 03:30:14 +00002108 sk->sk_peer_pid = NULL;
2109 sk->sk_peer_cred = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002110 sk->sk_write_pending = 0;
2111 sk->sk_rcvlowat = 1;
2112 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2113 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2114
Eric Dumazetf37f0af2008-04-13 21:39:26 -07002115 sk->sk_stamp = ktime_set(-1L, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002116
Eric Dumazet4dc6dc72009-07-15 23:13:10 +00002117 /*
2118 * Before updating sk_refcnt, we must commit prior changes to memory
2119 * (Documentation/RCU/rculist_nulls.txt for details)
2120 */
2121 smp_wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002122 atomic_set(&sk->sk_refcnt, 1);
Wang Chen33c732c2007-11-13 20:30:01 -08002123 atomic_set(&sk->sk_drops, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002124}
Eric Dumazet2a915252009-05-27 11:30:05 +00002125EXPORT_SYMBOL(sock_init_data);
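
/*
 * Illustrative sketch, not part of sock.c: after sock_init_data() has
 * installed the sock_def_*() callbacks above, a protocol may override
 * individual hooks; my_data_ready and my_sock_init are made-up names.
 */
static void my_data_ready(struct sock *sk, int len)
{
	/* custom wakeup policy would go here */
}

static void my_sock_init(struct socket *sock, struct sock *sk)
{
	sock_init_data(sock, sk);		/* defaults first */
	sk->sk_data_ready = my_data_ready;	/* then override one hook */
}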
Linus Torvalds1da177e2005-04-16 15:20:36 -07002126
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002127void lock_sock_nested(struct sock *sk, int subclass)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002128{
2129 might_sleep();
Ingo Molnara5b5bb9a2006-07-03 00:25:35 -07002130 spin_lock_bh(&sk->sk_lock.slock);
John Heffnerd2e91172007-09-12 10:44:19 +02002131 if (sk->sk_lock.owned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002132 __lock_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02002133 sk->sk_lock.owned = 1;
Ingo Molnara5b5bb9a2006-07-03 00:25:35 -07002134 spin_unlock(&sk->sk_lock.slock);
2135 /*
2136 * The sk_lock has mutex_lock() semantics here:
2137 */
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002138 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
Ingo Molnara5b5bb9a2006-07-03 00:25:35 -07002139 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002140}
Peter Zijlstrafcc70d52006-11-08 22:44:35 -08002141EXPORT_SYMBOL(lock_sock_nested);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002142
Harvey Harrisonb5606c22008-02-13 15:03:16 -08002143void release_sock(struct sock *sk)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002144{
Ingo Molnara5b5bb9a2006-07-03 00:25:35 -07002145 /*
2146 * The sk_lock has mutex_unlock() semantics:
2147 */
2148 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2149
2150 spin_lock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002151 if (sk->sk_backlog.tail)
2152 __release_sock(sk);
John Heffnerd2e91172007-09-12 10:44:19 +02002153 sk->sk_lock.owned = 0;
Ingo Molnara5b5bb9a2006-07-03 00:25:35 -07002154 if (waitqueue_active(&sk->sk_lock.wq))
2155 wake_up(&sk->sk_lock.wq);
2156 spin_unlock_bh(&sk->sk_lock.slock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002157}
2158EXPORT_SYMBOL(release_sock);
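
/*
 * Illustrative sketch, not part of sock.c: the canonical process
 * context critical section. lock_sock() may sleep, and release_sock()
 * processes any packets that were backlogged while the lock was owned.
 */
static void my_set_sndbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	sk->sk_sndbuf = val;	/* socket state only changes under lock */
	release_sock(sk);
}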
2159
Eric Dumazet8a74ad62010-05-26 19:20:18 +00002160/**
2161 * lock_sock_fast - fast version of lock_sock
2162 * @sk: socket
2163 *
2164 * This version should be used for very small sections, where the process won't block.
2165 * Returns false if the fast path was taken:
2166 * sk_lock.slock locked, owned = 0, BH disabled.
2167 * Returns true if the slow path was taken:
2168 * sk_lock.slock unlocked, owned = 1, BH enabled.
2169 */
2170bool lock_sock_fast(struct sock *sk)
2171{
2172 might_sleep();
2173 spin_lock_bh(&sk->sk_lock.slock);
2174
2175 if (!sk->sk_lock.owned)
2176 /*
2177 * Note : We must disable BH
2178 */
2179 return false;
2180
2181 __lock_sock(sk);
2182 sk->sk_lock.owned = 1;
2183 spin_unlock(&sk->sk_lock.slock);
2184 /*
2185 * The sk_lock has mutex_lock() semantics here:
2186 */
2187 mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2188 local_bh_enable();
2189 return true;
2190}
2191EXPORT_SYMBOL(lock_sock_fast);
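
/*
 * Illustrative sketch, not part of sock.c: lock_sock_fast() pairs with
 * unlock_sock_fast() from <net/sock.h>, which must be told which path
 * was taken so it can either unlock the spinlock or do a full
 * release_sock().
 */
static void my_quick_peek(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	/* very short, non-blocking work only */
	unlock_sock_fast(sk, slow);
}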
2192
Linus Torvalds1da177e2005-04-16 15:20:36 -07002193int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002194{
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002195 struct timeval tv;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002197 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002198 tv = ktime_to_timeval(sk->sk_stamp);
2199 if (tv.tv_sec == -1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002200 return -ENOENT;
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07002201 if (tv.tv_sec == 0) {
2202 sk->sk_stamp = ktime_get_real();
2203 tv = ktime_to_timeval(sk->sk_stamp);
2204 }
2205 return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002206}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207EXPORT_SYMBOL(sock_get_timestamp);
2208
Eric Dumazetae40eb12007-03-18 17:33:16 -07002209int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2210{
2211 struct timespec ts;
2212 if (!sock_flag(sk, SOCK_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002213 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
Eric Dumazetae40eb12007-03-18 17:33:16 -07002214 ts = ktime_to_timespec(sk->sk_stamp);
2215 if (ts.tv_sec == -1)
2216 return -ENOENT;
2217 if (ts.tv_sec == 0) {
2218 sk->sk_stamp = ktime_get_real();
2219 ts = ktime_to_timespec(sk->sk_stamp);
2220 }
2221 return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2222}
2223EXPORT_SYMBOL(sock_get_timestampns);
2224
Patrick Ohly20d49472009-02-12 05:03:38 +00002225void sock_enable_timestamp(struct sock *sk, int flag)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002226{
Patrick Ohly20d49472009-02-12 05:03:38 +00002227 if (!sock_flag(sk, flag)) {
Eric Dumazet08e29af2011-11-28 12:04:18 +00002228 unsigned long previous_flags = sk->sk_flags;
2229
Patrick Ohly20d49472009-02-12 05:03:38 +00002230 sock_set_flag(sk, flag);
2231 /*
2232 * we just set one of the two flags which require net
2233 * time stamping, but time stamping might have been on
2234 * already because of the other one
2235 */
Eric Dumazet08e29af2011-11-28 12:04:18 +00002236 if (!(previous_flags & SK_FLAGS_TIMESTAMP))
Patrick Ohly20d49472009-02-12 05:03:38 +00002237 net_enable_timestamp();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002238 }
2239}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002240
2241/*
2242 * Get a socket option on a socket.
2243 *
2244 * FIX: POSIX 1003.1g is very ambiguous here. It states that
2245 * asynchronous errors should be reported by getsockopt. We assume
2246 * this means if you specify SO_ERROR (otherwise what's the point of it).
2247 */
2248int sock_common_getsockopt(struct socket *sock, int level, int optname,
2249 char __user *optval, int __user *optlen)
2250{
2251 struct sock *sk = sock->sk;
2252
2253 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2254}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002255EXPORT_SYMBOL(sock_common_getsockopt);
2256
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002257#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002258int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2259 char __user *optval, int __user *optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002260{
2261 struct sock *sk = sock->sk;
2262
Johannes Berg1e51f952007-03-06 13:44:06 -08002263 if (sk->sk_prot->compat_getsockopt != NULL)
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002264 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2265 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002266 return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2267}
2268EXPORT_SYMBOL(compat_sock_common_getsockopt);
2269#endif
2270
Linus Torvalds1da177e2005-04-16 15:20:36 -07002271int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2272 struct msghdr *msg, size_t size, int flags)
2273{
2274 struct sock *sk = sock->sk;
2275 int addr_len = 0;
2276 int err;
2277
2278 err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2279 flags & ~MSG_DONTWAIT, &addr_len);
2280 if (err >= 0)
2281 msg->msg_namelen = addr_len;
2282 return err;
2283}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002284EXPORT_SYMBOL(sock_common_recvmsg);
2285
2286/*
2287 * Set socket options on an inet socket.
2288 */
2289int sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002290 char __user *optval, unsigned int optlen)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291{
2292 struct sock *sk = sock->sk;
2293
2294 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2295}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002296EXPORT_SYMBOL(sock_common_setsockopt);
2297
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002298#ifdef CONFIG_COMPAT
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002299int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
David S. Millerb7058842009-09-30 16:12:20 -07002300 char __user *optval, unsigned int optlen)
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002301{
2302 struct sock *sk = sock->sk;
2303
Arnaldo Carvalho de Melo543d9cf2006-03-20 22:48:35 -08002304 if (sk->sk_prot->compat_setsockopt != NULL)
2305 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2306 optval, optlen);
Dmitry Mishin3fdadf72006-03-20 22:45:21 -08002307 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2308}
2309EXPORT_SYMBOL(compat_sock_common_setsockopt);
2310#endif
2311
Linus Torvalds1da177e2005-04-16 15:20:36 -07002312void sk_common_release(struct sock *sk)
2313{
2314 if (sk->sk_prot->destroy)
2315 sk->sk_prot->destroy(sk);
2316
2317 /*
2318 * Observation: when sock_common_release is called, processes have
2319 * no access to the socket, but the network stack still does.
2320 * Step one, detach it from networking:
2321 *
2322 * A. Remove from hash tables.
2323 */
2324
2325 sk->sk_prot->unhash(sk);
2326
2327 /*
2328 * At this point the socket cannot receive new packets, but it is
2329 * possible that some packets are in flight because some CPU ran the
2330 * receiver and did a hash table lookup before we unhashed the socket.
2331 * They will reach the receive queue and be purged by the socket destructor.
2332 *
2333 * Also we may still have packets pending on the receive queue and,
2334 * probably, our own packets waiting in device queues. sock_destroy will
2335 * drain the receive queue, but transmitted packets will delay socket
2336 * destruction until the last reference is released.
2337 */
2338
2339 sock_orphan(sk);
2340
2341 xfrm_sk_free_policy(sk);
2342
Arnaldo Carvalho de Meloe6848972005-08-09 19:45:38 -07002343 sk_refcnt_debug_release(sk);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002344 sock_put(sk);
2345}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002346EXPORT_SYMBOL(sk_common_release);
2347
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002348#ifdef CONFIG_PROC_FS
2349#define PROTO_INUSE_NR 64 /* should be enough for the first time */
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002350struct prot_inuse {
2351 int val[PROTO_INUSE_NR];
2352};
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002353
2354static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002355
2356#ifdef CONFIG_NET_NS
2357void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2358{
Eric Dumazetd6d9ca02010-07-19 10:48:49 +00002359 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002360}
2361EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2362
2363int sock_prot_inuse_get(struct net *net, struct proto *prot)
2364{
2365 int cpu, idx = prot->inuse_idx;
2366 int res = 0;
2367
2368 for_each_possible_cpu(cpu)
2369 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2370
2371 return res >= 0 ? res : 0;
2372}
2373EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2374
Alexey Dobriyan2c8c1e72010-01-17 03:35:32 +00002375static int __net_init sock_inuse_init_net(struct net *net)
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002376{
2377 net->core.inuse = alloc_percpu(struct prot_inuse);
2378 return net->core.inuse ? 0 : -ENOMEM;
2379}
2380
Alexey Dobriyan2c8c1e72010-01-17 03:35:32 +00002381static void __net_exit sock_inuse_exit_net(struct net *net)
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002382{
2383 free_percpu(net->core.inuse);
2384}
2385
2386static struct pernet_operations net_inuse_ops = {
2387 .init = sock_inuse_init_net,
2388 .exit = sock_inuse_exit_net,
2389};
2390
2391static __init int net_inuse_init(void)
2392{
2393 if (register_pernet_subsys(&net_inuse_ops))
2394 panic("Cannot initialize net inuse counters");
2395
2396 return 0;
2397}
2398
2399core_initcall(net_inuse_init);
2400#else
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002401static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2402
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002403void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002404{
Eric Dumazetd6d9ca02010-07-19 10:48:49 +00002405 __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002406}
2407EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2408
Pavel Emelyanovc29a0bc2008-03-31 19:41:46 -07002409int sock_prot_inuse_get(struct net *net, struct proto *prot)
Pavel Emelyanov1338d462008-03-28 16:38:43 -07002410{
2411 int cpu, idx = prot->inuse_idx;
2412 int res = 0;
2413
2414 for_each_possible_cpu(cpu)
2415 res += per_cpu(prot_inuse, cpu).val[idx];
2416
2417 return res >= 0 ? res : 0;
2418}
2419EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
Pavel Emelyanov70ee1152008-03-31 19:42:16 -07002420#endif
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002421
2422static void assign_proto_idx(struct proto *prot)
2423{
2424 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2425
2426 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2427 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2428 return;
2429 }
2430
2431 set_bit(prot->inuse_idx, proto_inuse_idx);
2432}
2433
2434static void release_proto_idx(struct proto *prot)
2435{
2436 if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2437 clear_bit(prot->inuse_idx, proto_inuse_idx);
2438}
2439#else
2440static inline void assign_proto_idx(struct proto *prot)
2441{
2442}
2443
2444static inline void release_proto_idx(struct proto *prot)
2445{
2446}
2447#endif
2448
Linus Torvalds1da177e2005-04-16 15:20:36 -07002449int proto_register(struct proto *prot, int alloc_slab)
2450{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002451 if (alloc_slab) {
2452 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
Eric Dumazet271b72c2008-10-29 02:11:14 -07002453 SLAB_HWCACHE_ALIGN | prot->slab_flags,
2454 NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002455
2456 if (prot->slab == NULL) {
2457 printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2458 prot->name);
Pavel Emelyanov60e76632008-03-28 16:39:10 -07002459 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002460 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002461
2462 if (prot->rsk_prot != NULL) {
Alexey Dobriyanfaf23422010-02-17 09:34:12 +00002463 prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002464 if (prot->rsk_prot->slab_name == NULL)
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002465 goto out_free_sock_slab;
2466
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002467 prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002468 prot->rsk_prot->obj_size, 0,
Paul Mundt20c2df82007-07-20 10:11:58 +09002469 SLAB_HWCACHE_ALIGN, NULL);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002470
2471 if (prot->rsk_prot->slab == NULL) {
2472 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2473 prot->name);
2474 goto out_free_request_sock_slab_name;
2475 }
2476 }
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002477
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002478 if (prot->twsk_prot != NULL) {
Alexey Dobriyanfaf23422010-02-17 09:34:12 +00002479 prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002480
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002481 if (prot->twsk_prot->twsk_slab_name == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002482 goto out_free_request_sock_slab;
2483
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002484 prot->twsk_prot->twsk_slab =
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002485 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002486 prot->twsk_prot->twsk_obj_size,
Eric Dumazet3ab5aee2008-11-16 19:40:17 -08002487 0,
2488 SLAB_HWCACHE_ALIGN |
2489 prot->slab_flags,
Paul Mundt20c2df82007-07-20 10:11:58 +09002490 NULL);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002491 if (prot->twsk_prot->twsk_slab == NULL)
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002492 goto out_free_timewait_sock_slab_name;
2493 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002494 }
2495
Glauber Costa36b77a52011-12-16 00:51:59 +00002496 mutex_lock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002497 list_add(&prot->node, &proto_list);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002498 assign_proto_idx(prot);
Glauber Costa36b77a52011-12-16 00:51:59 +00002499 mutex_unlock(&proto_list_mutex);
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002500 return 0;
2501
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002502out_free_timewait_sock_slab_name:
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002503 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002504out_free_request_sock_slab:
2505 if (prot->rsk_prot && prot->rsk_prot->slab) {
2506 kmem_cache_destroy(prot->rsk_prot->slab);
2507 prot->rsk_prot->slab = NULL;
2508 }
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002509out_free_request_sock_slab_name:
Dan Carpenter72150e92010-03-06 01:04:45 +00002510 if (prot->rsk_prot)
2511 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002512out_free_sock_slab:
2513 kmem_cache_destroy(prot->slab);
2514 prot->slab = NULL;
Pavel Emelyanovb733c002007-11-07 02:23:38 -08002515out:
2516 return -ENOBUFS;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002517}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002518EXPORT_SYMBOL(proto_register);
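
/*
 * Illustrative sketch, not part of sock.c: a minimal registration. The
 * struct my_sock, which embeds struct sock as its first member, and the
 * proto object are made-up assumptions for this example.
 */
struct my_sock {
	struct sock	sk;	/* must come first */
	int		my_state;
};

static struct proto my_proto_reg = {
	.name		= "MYPROTO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct my_sock),
};

static int __init my_proto_init(void)
{
	return proto_register(&my_proto_reg, 1);	/* 1 => private slab */
}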
2519
2520void proto_unregister(struct proto *prot)
2521{
Glauber Costa36b77a52011-12-16 00:51:59 +00002522 mutex_lock(&proto_list_mutex);
Pavel Emelyanov13ff3d62008-03-28 16:38:17 -07002523 release_proto_idx(prot);
Patrick McHardy0a3f4352005-09-06 19:47:50 -07002524 list_del(&prot->node);
Glauber Costa36b77a52011-12-16 00:51:59 +00002525 mutex_unlock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002526
2527 if (prot->slab != NULL) {
2528 kmem_cache_destroy(prot->slab);
2529 prot->slab = NULL;
2530 }
2531
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002532 if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002533 kmem_cache_destroy(prot->rsk_prot->slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002534 kfree(prot->rsk_prot->slab_name);
Arnaldo Carvalho de Melo2e6599c2005-06-18 22:46:52 -07002535 prot->rsk_prot->slab = NULL;
2536 }
2537
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002538 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002539 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
Catalin Marinas7e56b5d2008-11-21 16:45:22 -08002540 kfree(prot->twsk_prot->twsk_slab_name);
Arnaldo Carvalho de Melo6d6ee432005-12-13 23:25:19 -08002541 prot->twsk_prot->twsk_slab = NULL;
Arnaldo Carvalho de Melo8feaf0c02005-08-09 20:09:30 -07002542 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002543}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002544EXPORT_SYMBOL(proto_unregister);
2545
2546#ifdef CONFIG_PROC_FS
Linus Torvalds1da177e2005-04-16 15:20:36 -07002547static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
Glauber Costa36b77a52011-12-16 00:51:59 +00002548 __acquires(proto_list_mutex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002549{
Glauber Costa36b77a52011-12-16 00:51:59 +00002550 mutex_lock(&proto_list_mutex);
Pavel Emelianov60f04382007-07-09 13:15:14 -07002551 return seq_list_start_head(&proto_list, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002552}
2553
2554static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2555{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002556 return seq_list_next(v, &proto_list, pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002557}
2558
2559static void proto_seq_stop(struct seq_file *seq, void *v)
Glauber Costa36b77a52011-12-16 00:51:59 +00002560 __releases(proto_list_mutex)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002561{
Glauber Costa36b77a52011-12-16 00:51:59 +00002562 mutex_unlock(&proto_list_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002563}
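/*
 * proto_seq_start/next/stop implement the usual seq_file iterator
 * contract over proto_list: ->start takes proto_list_mutex and hands
 * back either the list head itself (*pos == 0, which proto_seq_show
 * renders as the header row) or the *pos'th entry, ->next walks one
 * node forward, and ->stop drops the mutex.  The __acquires and
 * __releases annotations tell sparse the lock is intentionally held
 * between the calls.
 */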
2564
2565static char proto_method_implemented(const void *method)
2566{
2567 return method == NULL ? 'n' : 'y';
2568}
Glauber Costa180d8cd2011-12-11 21:47:02 +00002569static long sock_prot_memory_allocated(struct proto *proto)
2570{
Jeffrin Josecb75a362012-04-25 19:17:29 +05302571 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
Glauber Costa180d8cd2011-12-11 21:47:02 +00002572}
2573
2574static char *sock_prot_memory_pressure(struct proto *proto)
2575{
2576 return proto->memory_pressure != NULL ?
2577 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2578}
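/*
 * Helpers for the "memory" and "press" columns of
 * /proc/net/protocols: a protocol without a memory_allocated counter
 * reports -1, and one without a memory_pressure flag reports "NI"
 * (not implemented) instead of "yes"/"no", so a reader can tell "no
 * pressure" apart from "no accounting at all".
 */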
Linus Torvalds1da177e2005-04-16 15:20:36 -07002579
2580static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2581{
Eric Dumazet8d987e52010-11-09 23:24:26 +00002583 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
Linus Torvalds1da177e2005-04-16 15:20:36 -07002584 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2585 proto->name,
2586 proto->obj_size,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002587 sock_prot_inuse_get(seq_file_net(seq), proto),
Glauber Costa180d8cd2011-12-11 21:47:02 +00002588 sock_prot_memory_allocated(proto),
2589 sock_prot_memory_pressure(proto),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002590 proto->max_header,
2591 proto->slab == NULL ? "no" : "yes",
2592 module_name(proto->owner),
2593 proto_method_implemented(proto->close),
2594 proto_method_implemented(proto->connect),
2595 proto_method_implemented(proto->disconnect),
2596 proto_method_implemented(proto->accept),
2597 proto_method_implemented(proto->ioctl),
2598 proto_method_implemented(proto->init),
2599 proto_method_implemented(proto->destroy),
2600 proto_method_implemented(proto->shutdown),
2601 proto_method_implemented(proto->setsockopt),
2602 proto_method_implemented(proto->getsockopt),
2603 proto_method_implemented(proto->sendmsg),
2604 proto_method_implemented(proto->recvmsg),
2605 proto_method_implemented(proto->sendpage),
2606 proto_method_implemented(proto->bind),
2607 proto_method_implemented(proto->backlog_rcv),
2608 proto_method_implemented(proto->hash),
2609 proto_method_implemented(proto->unhash),
2610 proto_method_implemented(proto->get_port),
2611 proto_method_implemented(proto->enter_memory_pressure));
2612}
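/*
 * A representative row as emitted above (values are illustrative
 * only and vary by system):
 *
 *	TCP    1680   4   1   no   320   yes   kernel   y y y y ...
 *
 * i.e. name, obj_size, per-namespace sockets in use, the
 * memory_allocated counter, the pressure state, max_header, whether
 * a slab cache is in use, the owning module, then one y/n flag per
 * method in the order listed in the header row.
 */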
2613
2614static int proto_seq_show(struct seq_file *seq, void *v)
2615{
Pavel Emelianov60f04382007-07-09 13:15:14 -07002616 if (v == &proto_list)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002617 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2618 "protocol",
2619 "size",
2620 "sockets",
2621 "memory",
2622 "press",
2623 "maxhdr",
2624 "slab",
2625 "module",
2626 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2627 else
Pavel Emelianov60f04382007-07-09 13:15:14 -07002628 proto_seq_printf(seq, list_entry(v, struct proto, node));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002629 return 0;
2630}
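/*
 * The abbreviated column headers map one-to-one onto the
 * proto_method_implemented() calls in proto_seq_printf(): cl=close,
 * co=connect, di=disconnect, ac=accept, io=ioctl, in=init,
 * de=destroy, sh=shutdown, ss=setsockopt, gs=getsockopt, se=sendmsg,
 * re=recvmsg, sp=sendpage, bi=bind, br=backlog_rcv, ha=hash,
 * uh=unhash, gp=get_port, em=enter_memory_pressure.
 */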
2631
Stephen Hemmingerf6908082007-03-12 14:34:29 -07002632static const struct seq_operations proto_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633 .start = proto_seq_start,
2634 .next = proto_seq_next,
2635 .stop = proto_seq_stop,
2636 .show = proto_seq_show,
2637};
2638
2639static int proto_seq_open(struct inode *inode, struct file *file)
2640{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002641 return seq_open_net(inode, file, &proto_seq_ops,
2642 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002643}
2644
Arjan van de Ven9a321442007-02-12 00:55:35 -08002645static const struct file_operations proto_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002646 .owner = THIS_MODULE,
2647 .open = proto_seq_open,
2648 .read = seq_read,
2649 .llseek = seq_lseek,
Eric Dumazet14e943d2008-11-19 15:14:01 -08002650 .release = seq_release_net,
2651};
2652
2653static __net_init int proto_init_net(struct net *net)
2654{
2655 if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2656 return -ENOMEM;
2657
2658 return 0;
2659}
2660
2661static __net_exit void proto_exit_net(struct net *net)
2662{
2663 proc_net_remove(net, "protocols");
2664}
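/*
 * The pernet hooks give each network namespace its own
 * /proc/net/protocols entry backed by proto_seq_fops; the
 * seq_net_private allocated in proto_seq_open() is what lets
 * sock_prot_inuse_get() report the socket counts of the namespace
 * the file was opened in.
 */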
2665
2666
2667static __net_initdata struct pernet_operations proto_net_ops = {
2668 .init = proto_init_net,
2669 .exit = proto_exit_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002670};
2671
2672static int __init proto_init(void)
2673{
Eric Dumazet14e943d2008-11-19 15:14:01 -08002674 return register_pernet_subsys(&proto_net_ops);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002675}
2676
2677subsys_initcall(proto_init);
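/*
 * Registered as a subsys_initcall so the proc entry exists before
 * protocol families, which typically register from later initcalls
 * (e.g. fs_initcall for inet_init) or from modules, start showing up
 * in the list.
 */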
2678
2679#endif /* CONFIG_PROC_FS */