1 /*        $NetBSD: uipc_socket.c,v 1.313 2024/12/06 18:44:00 riastradh Exp $    */
2 
3 /*
4  * Copyright (c) 2002, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2004 The FreeBSD Foundation
34  * Copyright (c) 2004 Robert Watson
35  * Copyright (c) 1982, 1986, 1988, 1990, 1993
36  *        The Regents of the University of California.  All rights reserved.
37  *
38  * Redistribution and use in source and binary forms, with or without
39  * modification, are permitted provided that the following conditions
40  * are met:
41  * 1. Redistributions of source code must retain the above copyright
42  *    notice, this list of conditions and the following disclaimer.
43  * 2. Redistributions in binary form must reproduce the above copyright
44  *    notice, this list of conditions and the following disclaimer in the
45  *    documentation and/or other materials provided with the distribution.
46  * 3. Neither the name of the University nor the names of its contributors
47  *    may be used to endorse or promote products derived from this software
48  *    without specific prior written permission.
49  *
50  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60  * SUCH DAMAGE.
61  *
62  *        @(#)uipc_socket.c   8.6 (Berkeley) 5/2/95
63  */
64 
65 /*
66  * Socket operation routines.
67  *
68  * These routines are called by the routines in sys_socket.c or from a
69  * system process, and implement the semantics of socket operations by
70  * switching out to the protocol specific routines.
71  */
72 
73 #include <sys/cdefs.h>
74 __KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.313 2024/12/06 18:44:00 riastradh Exp $");
75 
76 #ifdef _KERNEL_OPT
77 #include "opt_compat_netbsd.h"
78 #include "opt_mbuftrace.h"
79 #include "opt_multiprocessor.h"         /* XXX */
80 #include "opt_pipe.h"
81 #include "opt_sctp.h"
82 #include "opt_sock_counters.h"
83 #include "opt_somaxkva.h"
84 #include "opt_sosend_loan.h"
85 #endif
86 
87 #include <sys/param.h>
88 #include <sys/types.h>
89 
90 #include <sys/compat_stub.h>
91 #include <sys/condvar.h>
92 #include <sys/domain.h>
93 #include <sys/event.h>
94 #include <sys/file.h>
95 #include <sys/filedesc.h>
96 #include <sys/kauth.h>
97 #include <sys/kernel.h>
98 #include <sys/kmem.h>
99 #include <sys/kthread.h>
100 #include <sys/mbuf.h>
101 #include <sys/mutex.h>
102 #include <sys/poll.h>
103 #include <sys/proc.h>
104 #include <sys/protosw.h>
105 #include <sys/resourcevar.h>
106 #include <sys/sdt.h>
107 #include <sys/signalvar.h>
108 #include <sys/socket.h>
109 #include <sys/socketvar.h>
110 #include <sys/systm.h>
111 #include <sys/uidinfo.h>
112 
113 #include <compat/sys/socket.h>
114 #include <compat/sys/time.h>
115 
116 #include <uvm/uvm_extern.h>
117 #include <uvm/uvm_loan.h>
118 #include <uvm/uvm_page.h>
119 
120 #ifdef SCTP
121 #include <netinet/sctp_route.h>
122 #endif
123 
/* Malloc type named "soname", described as "socket name". */
124 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
125 
/* File operations vector installed in f_ops by fsocreate(). */
126 extern const struct fileops socketops;
127 
/* Initial so_options value given to every socket by socreate(). */
128 static int          sooptions;
129 extern int          somaxconn;                              /* patchable (XXX sysctl) */
/* Upper bound applied to the listen backlog in solisten(). */
130 int                 somaxconn = SOMAXCONN;
/* Mutex object allocated in soinit(). */
131 kmutex_t  *softnet_lock;
132 
/*
 * Optional event counters.  With SOSEND_COUNTERS defined,
 * SOSEND_COUNTER_INCR() bumps the ev_count of the given counter;
 * otherwise it expands to nothing.  sosend_kvalimit is incremented by
 * sokvareserve() each time it has to wait for KVA.
 */
133 #ifdef SOSEND_COUNTERS
134 #include <sys/device.h>
135 
136 static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
137     NULL, "sosend", "loan big");
138 static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
139     NULL, "sosend", "copy big");
140 static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
141     NULL, "sosend", "copy small");
142 static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
143     NULL, "sosend", "kva limit");
144 
145 #define   SOSEND_COUNTER_INCR(ev)                 (ev)->ev_count++
146 
147 EVCNT_ATTACH_STATIC(sosend_loan_big);
148 EVCNT_ATTACH_STATIC(sosend_copy_big);
149 EVCNT_ATTACH_STATIC(sosend_copy_small);
150 EVCNT_ATTACH_STATIC(sosend_kvalimit);
151 #else
152 
153 #define   SOSEND_COUNTER_INCR(ev)                 /* nothing */
154 
155 #endif /* SOSEND_COUNTERS */
156 
/*
 * Defaults to -1 when SOSEND_NO_LOAN or MULTIPROCESSOR is defined and
 * to 4096 otherwise.
 * NOTE(review): presumably the minimum send size for which pages are
 * loaned, with -1 disabling loaning -- the consumer is not in this
 * part of the file; confirm against sosend().
 */
157 #if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
158 int sock_loan_thresh = -1;
159 #else
160 int sock_loan_thresh = 4096;
161 #endif
162 
/*
 * so_pendfree is the list (linked through m_next) of mbufs queued by
 * soloanfree() and drained by sopendfree_thread().  so_pendfree_lock
 * is held around accesses to that list and to socurkva.
 */
163 static kmutex_t so_pendfree_lock;
164 static struct mbuf *so_pendfree = NULL;
165 
/*
 * Loan KVA accounting: sokvareserve() keeps socurkva (bytes currently
 * reserved) from exceeding somaxkva, sleeping on socurkva_cv when the
 * limit would be passed; sokvaunreserve() broadcasts on socurkva_cv.
 */
166 #ifndef SOMAXKVA
167 #define   SOMAXKVA (16 * 1024 * 1024)
168 #endif
169 int somaxkva = SOMAXKVA;
170 static int socurkva;
171 static kcondvar_t socurkva_cv;
172 
/* NOTE(review): sofixedbuf is not referenced in this part of the file. */
173 #ifndef SOFIXEDBUF
174 #define SOFIXEDBUF true
175 #endif
176 bool sofixedbuf = SOFIXEDBUF;
177 
/* Listener handle returned by kauth_listen_scope() in soinit(). */
178 static kauth_listener_t socket_listener;
179 
/* Largest number of bytes sosend_loan() will loan in a single call. */
180 #define   SOCK_LOAN_CHUNK               65536
181 
/*
 * sopendfree_thread() sleeps on pendfree_thread_cv and is woken by
 * soloanfree(); sopendfree_lwp is filled in by kthread_create() in
 * soinit1().
 */
182 static void sopendfree_thread(void *);
183 static kcondvar_t pendfree_thread_cv;
184 static lwp_t *sopendfree_lwp;
185 
/* sysctl_kern_socket_setup() is called from soinit(). */
186 static void sysctl_kern_socket_setup(void);
187 static struct sysctllog *socket_sysctllog;
188 
189 static vsize_t
sokvareserve(struct socket * so,vsize_t len)190 sokvareserve(struct socket *so, vsize_t len)
191 {
192           int error;
193 
194           mutex_enter(&so_pendfree_lock);
195           while (socurkva + len > somaxkva) {
196                     SOSEND_COUNTER_INCR(&sosend_kvalimit);
197                     error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
198                     if (error) {
199                               len = 0;
200                               break;
201                     }
202           }
203           socurkva += len;
204           mutex_exit(&so_pendfree_lock);
205           return len;
206 }
207 
208 static void
sokvaunreserve(vsize_t len)209 sokvaunreserve(vsize_t len)
210 {
211 
212           mutex_enter(&so_pendfree_lock);
213           socurkva -= len;
214           cv_broadcast(&socurkva_cv);
215           mutex_exit(&so_pendfree_lock);
216 }
217 
218 /*
219  * sokvaalloc: allocate kva for loan.
220  */
221 vaddr_t
sokvaalloc(vaddr_t sva,vsize_t len,struct socket * so)222 sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so)
223 {
224           vaddr_t lva;
225 
226           if (sokvareserve(so, len) == 0)
227                     return 0;
228 
229           lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask,
230               UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
231           if (lva == 0) {
232                     sokvaunreserve(len);
233                     return 0;
234           }
235 
236           return lva;
237 }
238 
239 /*
240  * sokvafree: free kva for loan.
241  */
242 void
sokvafree(vaddr_t sva,vsize_t len)243 sokvafree(vaddr_t sva, vsize_t len)
244 {
245 
246           uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);
247           sokvaunreserve(len);
248 }
249 
250 static void
sodoloanfree(struct vm_page ** pgs,void * buf,size_t size)251 sodoloanfree(struct vm_page **pgs, void *buf, size_t size)
252 {
253           vaddr_t sva, eva;
254           vsize_t len;
255           int npgs;
256 
257           KASSERT(pgs != NULL);
258 
259           eva = round_page((vaddr_t) buf + size);
260           sva = trunc_page((vaddr_t) buf);
261           len = eva - sva;
262           npgs = len >> PAGE_SHIFT;
263 
264           pmap_kremove(sva, len);
265           pmap_update(pmap_kernel());
266           uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
267           sokvafree(sva, len);
268 }
269 
270 /*
271  * sopendfree_thread: free mbufs on "pendfree" list. Unlock and relock
272  * so_pendfree_lock when freeing mbufs.
273  */
274 static void
sopendfree_thread(void * v)275 sopendfree_thread(void *v)
276 {
277           struct mbuf *m, *next;
278           size_t rv;
279 
280           mutex_enter(&so_pendfree_lock);
281 
282           for (;;) {
283                     rv = 0;
284                     while (so_pendfree != NULL) {
285                               m = so_pendfree;
286                               so_pendfree = NULL;
287                               mutex_exit(&so_pendfree_lock);
288 
289                               for (; m != NULL; m = next) {
290                                         next = m->m_next;
291                                         KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) ==
292                                             0);
293                                         KASSERT(m->m_ext.ext_refcnt == 0);
294 
295                                         rv += m->m_ext.ext_size;
296                                         sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
297                                             m->m_ext.ext_size);
298                                         pool_cache_put(mb_cache, m);
299                               }
300 
301                               mutex_enter(&so_pendfree_lock);
302                     }
303                     if (rv)
304                               cv_broadcast(&socurkva_cv);
305                     cv_wait(&pendfree_thread_cv, &so_pendfree_lock);
306           }
307           panic("sopendfree_thread");
308           /* NOTREACHED */
309 }
310 
311 void
soloanfree(struct mbuf * m,void * buf,size_t size,void * arg)312 soloanfree(struct mbuf *m, void *buf, size_t size, void *arg)
313 {
314 
315           KASSERT(m != NULL);
316 
317           /*
318            * postpone freeing mbuf.
319            *
320            * we can't do it in interrupt context
321            * because we need to put kva back to kernel_map.
322            */
323 
324           mutex_enter(&so_pendfree_lock);
325           m->m_next = so_pendfree;
326           so_pendfree = m;
327           cv_signal(&pendfree_thread_cv);
328           mutex_exit(&so_pendfree_lock);
329 }
330 
331 static long
sosend_loan(struct socket * so,struct uio * uio,struct mbuf * m,long space)332 sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
333 {
334           struct iovec *iov = uio->uio_iov;
335           vaddr_t sva, eva;
336           vsize_t len;
337           vaddr_t lva;
338           int npgs, error;
339           vaddr_t va;
340           int i;
341 
342           if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
343                     return 0;
344 
345           if (iov->iov_len < (size_t) space)
346                     space = iov->iov_len;
347           if (space > SOCK_LOAN_CHUNK)
348                     space = SOCK_LOAN_CHUNK;
349 
350           eva = round_page((vaddr_t) iov->iov_base + space);
351           sva = trunc_page((vaddr_t) iov->iov_base);
352           len = eva - sva;
353           npgs = len >> PAGE_SHIFT;
354 
355           KASSERT(npgs <= M_EXT_MAXPAGES);
356 
357           lva = sokvaalloc(sva, len, so);
358           if (lva == 0)
359                     return 0;
360 
361           error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
362               m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
363           if (error) {
364                     sokvafree(lva, len);
365                     return 0;
366           }
367 
368           for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
369                     pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
370                         VM_PROT_READ, 0);
371           pmap_update(pmap_kernel());
372 
373           lva += (vaddr_t) iov->iov_base & PAGE_MASK;
374 
375           MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so);
376           m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
377 
378           uio->uio_resid -= space;
379           /* uio_offset not updated, not set/used for write(2) */
380           uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space;
381           uio->uio_iov->iov_len -= space;
382           if (uio->uio_iov->iov_len == 0) {
383                     uio->uio_iov++;
384                     uio->uio_iovcnt--;
385           }
386 
387           return space;
388 }
389 
390 static int
socket_listener_cb(kauth_cred_t cred,kauth_action_t action,void * cookie,void * arg0,void * arg1,void * arg2,void * arg3)391 socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
392     void *arg0, void *arg1, void *arg2, void *arg3)
393 {
394           int result;
395           enum kauth_network_req req;
396 
397           result = KAUTH_RESULT_DEFER;
398           req = (enum kauth_network_req)(uintptr_t)arg0;
399 
400           if ((action != KAUTH_NETWORK_SOCKET) &&
401               (action != KAUTH_NETWORK_BIND))
402                     return result;
403 
404           switch (req) {
405           case KAUTH_REQ_NETWORK_BIND_PORT:
406                     result = KAUTH_RESULT_ALLOW;
407                     break;
408 
409           case KAUTH_REQ_NETWORK_SOCKET_DROP: {
410                     /* Normal users can only drop their own connections. */
411                     struct socket *so = (struct socket *)arg1;
412 
413                     if (so->so_cred && proc_uidmatch(cred, so->so_cred) == 0)
414                               result = KAUTH_RESULT_ALLOW;
415 
416                     break;
417                     }
418 
419           case KAUTH_REQ_NETWORK_SOCKET_OPEN:
420                     /* We allow "raw" routing/bluetooth sockets to anyone. */
421                     switch ((u_long)arg1) {
422                     case PF_ROUTE:
423                     case PF_OROUTE:
424                     case PF_BLUETOOTH:
425                     case PF_CAN:
426                               result = KAUTH_RESULT_ALLOW;
427                               break;
428                     default:
429                               /* Privileged, let secmodel handle this. */
430                               if ((u_long)arg2 == SOCK_RAW)
431                                         break;
432                               result = KAUTH_RESULT_ALLOW;
433                               break;
434                     }
435                     break;
436 
437           case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
438                     result = KAUTH_RESULT_ALLOW;
439 
440                     break;
441 
442           default:
443                     break;
444           }
445 
446           return result;
447 }
448 
449 void
soinit(void)450 soinit(void)
451 {
452 
453           sysctl_kern_socket_setup();
454 
455 #ifdef SCTP
456           /* Update the SCTP function hooks if necessary*/
457 
458         vec_sctp_add_ip_address = sctp_add_ip_address;
459         vec_sctp_delete_ip_address = sctp_delete_ip_address;
460 #endif
461 
462           mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
463           softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
464           cv_init(&socurkva_cv, "sokva");
465           cv_init(&pendfree_thread_cv, "sopendfr");
466           soinit2();
467 
468           /* Set the initial adjusted socket buffer size. */
469           if (sb_max_set(sb_max))
470                     panic("bad initial sb_max value: %lu", sb_max);
471 
472           socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
473               socket_listener_cb, NULL);
474 }
475 
476 void
soinit1(void)477 soinit1(void)
478 {
479           int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
480               sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree");
481           if (error)
482                     panic("soinit1 %d", error);
483 }
484 
485 /*
486  * socreate: create a new socket of the specified type and the protocol.
487  *
488  * => Caller may specify another socket for lock sharing (must not be held).
489  * => Returns the new socket without lock held.
490  */
491 int
socreate(int dom,struct socket ** aso,int type,int proto,struct lwp * l,struct socket * lockso)492 socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l,
493     struct socket *lockso)
494 {
495           const struct protosw *prp;
496           struct socket *so;
497           uid_t uid;
498           int error;
499           kmutex_t *lock;
500 
501           error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
502               KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type),
503               KAUTH_ARG(proto));
504           if (error != 0)
505                     return error;
506 
507           if (proto)
508                     prp = pffindproto(dom, proto, type);
509           else
510                     prp = pffindtype(dom, type);
511           if (prp == NULL) {
512                     /* no support for domain */
513                     if (pffinddomain(dom) == 0)
514                               return SET_ERROR(EAFNOSUPPORT);
515                     /* no support for socket type */
516                     if (proto == 0 && type != 0)
517                               return SET_ERROR(EPROTOTYPE);
518                     return SET_ERROR(EPROTONOSUPPORT);
519           }
520           if (prp->pr_usrreqs == NULL)
521                     return SET_ERROR(EPROTONOSUPPORT);
522           if (prp->pr_type != type)
523                     return SET_ERROR(EPROTOTYPE);
524 
525           so = soget(true);
526           so->so_type = type;
527           so->so_proto = prp;
528           so->so_send = sosend;
529           so->so_receive = soreceive;
530           so->so_options = sooptions;
531 #ifdef MBUFTRACE
532           so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
533           so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
534           so->so_mowner = &prp->pr_domain->dom_mowner;
535 #endif
536           uid = kauth_cred_geteuid(l->l_cred);
537           so->so_uidinfo = uid_find(uid);
538           so->so_egid = kauth_cred_getegid(l->l_cred);
539           so->so_cpid = l->l_proc->p_pid;
540 
541           /*
542            * Lock assigned and taken during PCB attach, unless we share
543            * the lock with another socket, e.g. socketpair(2) case.
544            */
545           if (lockso) {
546                     /*
547                      * lockso->so_lock should be stable at this point, so
548                      * no need for atomic_load_*.
549                      */
550                     lock = lockso->so_lock;
551                     so->so_lock = lock;
552                     mutex_obj_hold(lock);
553                     mutex_enter(lock);
554           }
555 
556           /* Attach the PCB (returns with the socket lock held). */
557           error = (*prp->pr_usrreqs->pr_attach)(so, proto);
558           KASSERT(solocked(so));
559 
560           if (error) {
561                     KASSERT(so->so_pcb == NULL);
562                     so->so_state |= SS_NOFDREF;
563                     sofree(so);
564                     return error;
565           }
566           so->so_cred = kauth_cred_hold(l->l_cred);
567           sounlock(so);
568 
569           *aso = so;
570           return 0;
571 }
572 
573 /*
574  * fsocreate: create a socket and a file descriptor associated with it.
575  * Returns the allocated file structure in *fpp, but the descriptor
576  * is not visible yet for the process.
577  * Caller is responsible for calling fd_affix() for the returned *fpp once
578  * it's socket initialization is finished successfully, or fd_abort() if it's
579  * initialization fails.
580  *
581  *
582  * => On success, write file descriptor to *fdout and *fpp and return zero.
583  * => On failure, return non-zero; *fdout and *fpp will be undefined.
584  */
585 int
fsocreate(int domain,struct socket ** sop,int type,int proto,int * fdout,file_t ** fpp,struct socket * lockso)586 fsocreate(int domain, struct socket **sop, int type, int proto, int *fdout,
587     file_t **fpp, struct socket *lockso)
588 {
589           lwp_t *l = curlwp;
590           int error, fd, flags;
591           struct socket *so;
592           file_t *fp;
593 
594           flags = type & SOCK_FLAGS_MASK;
595           type &= ~SOCK_FLAGS_MASK;
596           error = socreate(domain, &so, type, proto, l, lockso);
597           if (error) {
598                     return error;
599           }
600 
601           if ((error = fd_allocfile(&fp, &fd)) != 0) {
602                     soclose(so);
603                     return error;
604           }
605           fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0);
606           fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)|
607               ((flags & SOCK_NOSIGPIPE) ? FNOSIGPIPE : 0);
608           fp->f_type = DTYPE_SOCKET;
609           fp->f_ops = &socketops;
610           if (flags & SOCK_NONBLOCK) {
611                     so->so_state |= SS_NBIO;
612           }
613           fp->f_socket = so;
614 
615           if (sop != NULL) {
616                     *sop = so;
617           }
618           *fdout = fd;
619           *fpp = fp;
620           return error;
621 }
622 
623 int
sofamily(const struct socket * so)624 sofamily(const struct socket *so)
625 {
626           const struct protosw *pr;
627           const struct domain *dom;
628 
629           if ((pr = so->so_proto) == NULL)
630                     return AF_UNSPEC;
631           if ((dom = pr->pr_domain) == NULL)
632                     return AF_UNSPEC;
633           return dom->dom_family;
634 }
635 
636 int
sobind(struct socket * so,struct sockaddr * nam,struct lwp * l)637 sobind(struct socket *so, struct sockaddr *nam, struct lwp *l)
638 {
639           int error;
640 
641           solock(so);
642           if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
643                     sounlock(so);
644                     return SET_ERROR(EAFNOSUPPORT);
645           }
646           error = (*so->so_proto->pr_usrreqs->pr_bind)(so, nam, l);
647           sounlock(so);
648           return error;
649 }
650 
651 int
solisten(struct socket * so,int backlog,struct lwp * l)652 solisten(struct socket *so, int backlog, struct lwp *l)
653 {
654           int error;
655           short oldopt, oldqlimit;
656 
657           solock(so);
658           if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
659               SS_ISDISCONNECTING)) != 0) {
660                     sounlock(so);
661                     return SET_ERROR(EINVAL);
662           }
663           oldopt = so->so_options;
664           oldqlimit = so->so_qlimit;
665           if (TAILQ_EMPTY(&so->so_q))
666                     so->so_options |= SO_ACCEPTCONN;
667           if (backlog < 0)
668                     backlog = 0;
669           so->so_qlimit = uimin(backlog, somaxconn);
670 
671           error = (*so->so_proto->pr_usrreqs->pr_listen)(so, l);
672           if (error != 0) {
673                     so->so_options = oldopt;
674                     so->so_qlimit = oldqlimit;
675                     sounlock(so);
676                     return error;
677           }
678           sounlock(so);
679           return 0;
680 }
681 
682 void
sofree(struct socket * so)683 sofree(struct socket *so)
684 {
685           u_int refs;
686 
687           KASSERT(solocked(so));
688 
689           if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
690                     sounlock(so);
691                     return;
692           }
693           if (so->so_head) {
694                     /*
695                      * We must not decommission a socket that's on the accept(2)
696                      * queue.  If we do, then accept(2) may hang after select(2)
697                      * indicated that the listening socket was ready.
698                      */
699                     if (!soqremque(so, 0)) {
700                               sounlock(so);
701                               return;
702                     }
703           }
704           if (so->so_rcv.sb_hiwat)
705                     (void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
706                         RLIM_INFINITY);
707           if (so->so_snd.sb_hiwat)
708                     (void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
709                         RLIM_INFINITY);
710           sbrelease(&so->so_snd, so);
711           KASSERT(!cv_has_waiters(&so->so_cv));
712           KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
713           KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
714           sorflush(so);
715           refs = so->so_aborting;       /* XXX */
716           /* Remove accept filter if one is present. */
717           if (so->so_accf != NULL)
718                     (void)accept_filt_clear(so);
719           sounlock(so);
720           if (refs == 0)                /* XXX */
721                     soput(so);
722 }
723 
724 /*
725  * soclose: close a socket on last file table reference removal.
726  * Initiate disconnect if connected.  Free socket when disconnect complete.
727  */
728 int
soclose(struct socket * so)729 soclose(struct socket *so)
730 {
731           struct socket *so2;
732           int error = 0;
733 
734           solock(so);
735           if (so->so_options & SO_ACCEPTCONN) {
736                     for (;;) {
737                               if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
738                                         KASSERT(solocked2(so, so2));
739                                         (void) soqremque(so2, 0);
740                                         /* soabort drops the lock. */
741                                         (void) soabort(so2);
742                                         solock(so);
743                                         continue;
744                               }
745                               if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
746                                         KASSERT(solocked2(so, so2));
747                                         (void) soqremque(so2, 1);
748                                         /* soabort drops the lock. */
749                                         (void) soabort(so2);
750                                         solock(so);
751                                         continue;
752                               }
753                               break;
754                     }
755           }
756           if (so->so_pcb == NULL)
757                     goto discard;
758           if (so->so_state & SS_ISCONNECTED) {
759                     if ((so->so_state & SS_ISDISCONNECTING) == 0) {
760                               error = sodisconnect(so);
761                               if (error)
762                                         goto drop;
763                     }
764                     if (so->so_options & SO_LINGER) {
765                               if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) ==
766                                   (SS_ISDISCONNECTING|SS_NBIO))
767                                         goto drop;
768                               while (so->so_state & SS_ISCONNECTED) {
769                                         error = sowait(so, true, so->so_linger * hz);
770                                         if (error)
771                                                   break;
772                               }
773                     }
774           }
775  drop:
776           if (so->so_pcb) {
777                     KASSERT(solocked(so));
778                     (*so->so_proto->pr_usrreqs->pr_detach)(so);
779           }
780  discard:
781           KASSERT((so->so_state & SS_NOFDREF) == 0);
782           kauth_cred_free(so->so_cred);
783           so->so_cred = NULL;
784           so->so_state |= SS_NOFDREF;
785           sofree(so);
786           return error;
787 }
788 
789 /*
790  * Must be called with the socket locked..  Will return with it unlocked.
791  */
792 int
soabort(struct socket * so)793 soabort(struct socket *so)
794 {
795           u_int refs;
796           int error;
797 
798           KASSERT(solocked(so));
799           KASSERT(so->so_head == NULL);
800 
801           so->so_aborting++;            /* XXX */
802           error = (*so->so_proto->pr_usrreqs->pr_abort)(so);
803           refs = --so->so_aborting;     /* XXX */
804           if (error || (refs == 0)) {
805                     sofree(so);
806           } else {
807                     sounlock(so);
808           }
809           return error;
810 }
811 
812 int
soaccept(struct socket * so,struct sockaddr * nam)813 soaccept(struct socket *so, struct sockaddr *nam)
814 {
815           int error;
816 
817           KASSERT(solocked(so));
818           KASSERT((so->so_state & SS_NOFDREF) != 0);
819 
820           so->so_state &= ~SS_NOFDREF;
821           if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
822               (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
823                     error = (*so->so_proto->pr_usrreqs->pr_accept)(so, nam);
824           else
825                     error = SET_ERROR(ECONNABORTED);
826 
827           return error;
828 }
829 
830 int
soconnect(struct socket * so,struct sockaddr * nam,struct lwp * l)831 soconnect(struct socket *so, struct sockaddr *nam, struct lwp *l)
832 {
833           int error;
834 
835           KASSERT(solocked(so));
836 
837           if (so->so_options & SO_ACCEPTCONN)
838                     return SET_ERROR(EOPNOTSUPP);
839           /*
840            * If protocol is connection-based, can only connect once.
841            * Otherwise, if connected, try to disconnect first.
842            * This allows user to disconnect by connecting to, e.g.,
843            * a null address.
844            */
845           if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
846               ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
847               (error = sodisconnect(so)))) {
848                     error = SET_ERROR(EISCONN);
849           } else {
850                     if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
851                               return SET_ERROR(EAFNOSUPPORT);
852                     }
853                     error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l);
854           }
855 
856           return error;
857 }
858 
859 int
soconnect2(struct socket * so1,struct socket * so2)860 soconnect2(struct socket *so1, struct socket *so2)
861 {
862           KASSERT(solocked2(so1, so2));
863 
864           return (*so1->so_proto->pr_usrreqs->pr_connect2)(so1, so2);
865 }
866 
867 int
sodisconnect(struct socket * so)868 sodisconnect(struct socket *so)
869 {
870           int error;
871 
872           KASSERT(solocked(so));
873 
874           if ((so->so_state & SS_ISCONNECTED) == 0) {
875                     error = SET_ERROR(ENOTCONN);
876           } else if (so->so_state & SS_ISDISCONNECTING) {
877                     error = SET_ERROR(EALREADY);
878           } else {
879                     error = (*so->so_proto->pr_usrreqs->pr_disconnect)(so);
880           }
881           return error;
882 }
883 
/*
 * Translate the MSG_DONTWAIT bit of a send/receive flags word into the
 * wait argument passed to sblock(): M_NOWAIT if set, else M_WAITOK.
 */
#define	SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) != 0) ? M_NOWAIT : M_WAITOK)
885 /*
886  * Send on a socket.
887  * If send must go all at once and message is larger than
888  * send buffering, then hard error.
889  * Lock against other senders.
890  * If must go all at once and not enough room now, then
891  * inform user that this would block and do nothing.
892  * Otherwise, if nonblocking, send as much as possible.
893  * The data to be sent is described by "uio" if nonzero,
894  * otherwise by the mbuf chain "top" (which must be null
895  * if uio is not).  Data provided in mbuf chain must be small
896  * enough to send all at once.
897  *
898  * Returns nonzero on error, timeout or signal; callers
899  * must check for short counts if EINTR/ERESTART are returned.
900  * Data and control buffers are freed on return.
901  */
902 int
sosend(struct socket * so,struct sockaddr * addr,struct uio * uio,struct mbuf * top,struct mbuf * control,int flags,struct lwp * l)903 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
904           struct mbuf *top, struct mbuf *control, int flags, struct lwp *l)
905 {
906           struct mbuf **mp, *m;
907           long space, len, resid, clen, mlen;
908           int error, s, dontroute, atomic;
909           short wakeup_state = 0;
910 
911           clen = 0;
912 
913           /*
914            * solock() provides atomicity of access.  splsoftnet() prevents
915            * protocol processing soft interrupts from interrupting us and
916            * blocking (expensive).
917            */
918           s = splsoftnet();
919           solock(so);
920           atomic = sosendallatonce(so) || top;
921           if (uio)
922                     resid = uio->uio_resid;
923           else
924                     resid = top->m_pkthdr.len;
925           /*
926            * In theory resid should be unsigned.
927            * However, space must be signed, as it might be less than 0
928            * if we over-committed, and we must use a signed comparison
929            * of space and resid.  On the other hand, a negative resid
930            * causes us to loop sending 0-length segments to the protocol.
931            */
932           if (resid < 0) {
933                     error = SET_ERROR(EINVAL);
934                     goto out;
935           }
936           dontroute =
937               (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
938               (so->so_proto->pr_flags & PR_ATOMIC);
939           l->l_ru.ru_msgsnd++;
940           if (control)
941                     clen = control->m_len;
942  restart:
943           if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
944                     goto out;
945           do {
946                     if (so->so_state & SS_CANTSENDMORE) {
947                               error = SET_ERROR(EPIPE);
948                               goto release;
949                     }
950                     if (so->so_error) {
951                               error = SET_ERROR(so->so_error);
952                               if ((flags & MSG_PEEK) == 0)
953                                         so->so_error = 0;
954                               goto release;
955                     }
956                     if ((so->so_state & SS_ISCONNECTED) == 0) {
957                               if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
958                                         if (resid || clen == 0) {
959                                                   error = SET_ERROR(ENOTCONN);
960                                                   goto release;
961                                         }
962                               } else if (addr == NULL) {
963                                         error = SET_ERROR(EDESTADDRREQ);
964                                         goto release;
965                               }
966                     }
967                     space = sbspace(&so->so_snd);
968                     if (flags & MSG_OOB)
969                               space += 1024;
970                     if ((atomic && resid > so->so_snd.sb_hiwat) ||
971                         clen > so->so_snd.sb_hiwat) {
972                               error = SET_ERROR(EMSGSIZE);
973                               goto release;
974                     }
975                     if (space < resid + clen &&
976                         (atomic || space < so->so_snd.sb_lowat || space < clen)) {
977                               if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
978                                         error = SET_ERROR(EWOULDBLOCK);
979                                         goto release;
980                               }
981                               sbunlock(&so->so_snd);
982                               if (wakeup_state & SS_RESTARTSYS) {
983                                         error = SET_ERROR(ERESTART);
984                                         goto out;
985                               }
986                               error = sbwait(&so->so_snd);
987                               if (error)
988                                         goto out;
989                               wakeup_state = so->so_state;
990                               goto restart;
991                     }
992                     wakeup_state = 0;
993                     mp = &top;
994                     space -= clen;
995                     do {
996                               if (uio == NULL) {
997                                         /*
998                                          * Data is prepackaged in "top".
999                                          */
1000                                         resid = 0;
1001                                         if (flags & MSG_EOR)
1002                                                   top->m_flags |= M_EOR;
1003                               } else do {
1004                                         sounlock(so);
1005                                         splx(s);
1006                                         if (top == NULL) {
1007                                                   m = m_gethdr(M_WAIT, MT_DATA);
1008                                                   mlen = MHLEN;
1009                                                   m->m_pkthdr.len = 0;
1010                                                   m_reset_rcvif(m);
1011                                         } else {
1012                                                   m = m_get(M_WAIT, MT_DATA);
1013                                                   mlen = MLEN;
1014                                         }
1015                                         MCLAIM(m, so->so_snd.sb_mowner);
1016                                         if (sock_loan_thresh >= 0 &&
1017                                             uio->uio_iov->iov_len >= sock_loan_thresh &&
1018                                             space >= sock_loan_thresh &&
1019                                             (len = sosend_loan(so, uio, m,
1020                                                                    space)) != 0) {
1021                                                   SOSEND_COUNTER_INCR(&sosend_loan_big);
1022                                                   space -= len;
1023                                                   goto have_data;
1024                                         }
1025                                         if (resid >= MINCLSIZE && space >= MCLBYTES) {
1026                                                   SOSEND_COUNTER_INCR(&sosend_copy_big);
1027                                                   m_clget(m, M_DONTWAIT);
1028                                                   if ((m->m_flags & M_EXT) == 0)
1029                                                             goto nopages;
1030                                                   mlen = MCLBYTES;
1031                                                   if (atomic && top == 0) {
1032                                                             len = lmin(MCLBYTES - max_hdr,
1033                                                                 resid);
1034                                                             m->m_data += max_hdr;
1035                                                   } else
1036                                                             len = lmin(MCLBYTES, resid);
1037                                                   space -= len;
1038                                         } else {
1039  nopages:
1040                                                   SOSEND_COUNTER_INCR(&sosend_copy_small);
1041                                                   len = lmin(lmin(mlen, resid), space);
1042                                                   space -= len;
1043                                                   /*
1044                                                    * For datagram protocols, leave room
1045                                                    * for protocol headers in first mbuf.
1046                                                    */
1047                                                   if (atomic && top == 0 && len < mlen)
1048                                                             m_align(m, len);
1049                                         }
1050                                         error = uiomove(mtod(m, void *), (int)len, uio);
1051  have_data:
1052                                         resid = uio->uio_resid;
1053                                         m->m_len = len;
1054                                         *mp = m;
1055                                         top->m_pkthdr.len += len;
1056                                         s = splsoftnet();
1057                                         solock(so);
1058                                         if (error != 0)
1059                                                   goto release;
1060                                         mp = &m->m_next;
1061                                         if (resid <= 0) {
1062                                                   if (flags & MSG_EOR)
1063                                                             top->m_flags |= M_EOR;
1064                                                   break;
1065                                         }
1066                               } while (space > 0 && atomic);
1067 
1068                               if (so->so_state & SS_CANTSENDMORE) {
1069                                         error = SET_ERROR(EPIPE);
1070                                         goto release;
1071                               }
1072                               if (dontroute)
1073                                         so->so_options |= SO_DONTROUTE;
1074                               if (resid > 0)
1075                                         so->so_state |= SS_MORETOCOME;
1076                               if (flags & MSG_OOB) {
1077                                         error = (*so->so_proto->pr_usrreqs->pr_sendoob)(
1078                                             so, top, control);
1079                               } else {
1080                                         error = (*so->so_proto->pr_usrreqs->pr_send)(so,
1081                                             top, addr, control, l);
1082                               }
1083                               if (dontroute)
1084                                         so->so_options &= ~SO_DONTROUTE;
1085                               if (resid > 0)
1086                                         so->so_state &= ~SS_MORETOCOME;
1087                               clen = 0;
1088                               control = NULL;
1089                               top = NULL;
1090                               mp = &top;
1091                               if (error != 0)
1092                                         goto release;
1093                     } while (resid && space > 0);
1094           } while (resid);
1095 
1096  release:
1097           sbunlock(&so->so_snd);
1098  out:
1099           sounlock(so);
1100           splx(s);
1101           m_freem(top);
1102           m_freem(control);
1103           return error;
1104 }
1105 
1106 /*
1107  * Following replacement or removal of the first mbuf on the first
1108  * mbuf chain of a socket buffer, push necessary state changes back
1109  * into the socket buffer so that other consumers see the values
1110  * consistently.  'nextrecord' is the caller's locally stored value of
1111  * the original value of sb->sb_mb->m_nextpkt which must be restored
1112  * when the lead mbuf changes.  NOTE: 'nextrecord' may be NULL.
1113  */
1114 static void
sbsync(struct sockbuf * sb,struct mbuf * nextrecord)1115 sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
1116 {
1117 
1118           KASSERT(solocked(sb->sb_so));
1119 
1120           /*
1121            * First, update for the new value of nextrecord.  If necessary,
1122            * make it the first record.
1123            */
1124           if (sb->sb_mb != NULL)
1125                     sb->sb_mb->m_nextpkt = nextrecord;
1126           else
1127                     sb->sb_mb = nextrecord;
1128 
1129         /*
1130          * Now update any dependent socket buffer fields to reflect
1131          * the new state.  This is an inline of SB_EMPTY_FIXUP, with
1132          * the addition of a second clause that takes care of the
1133          * case where sb_mb has been updated, but remains the last
1134          * record.
1135          */
1136         if (sb->sb_mb == NULL) {
1137                 sb->sb_mbtail = NULL;
1138                 sb->sb_lastrecord = NULL;
1139         } else if (sb->sb_mb->m_nextpkt == NULL)
1140                 sb->sb_lastrecord = sb->sb_mb;
1141 }
1142 
1143 /*
1144  * Implement receive operations on a socket.
1145  *
1146  * We depend on the way that records are added to the sockbuf by sbappend*. In
1147  * particular, each record (mbufs linked through m_next) must begin with an
1148  * address if the protocol so specifies, followed by an optional mbuf or mbufs
1149  * containing ancillary data, and then zero or more mbufs of data.
1150  *
1151  * In order to avoid blocking network interrupts for the entire time here, we
1152  * splx() while doing the actual copy to user space. Although the sockbuf is
1153  * locked, new data may still be appended, and thus we must maintain
1154  * consistency of the sockbuf during that time.
1155  *
1156  * The caller may receive the data as a single mbuf chain by supplying an mbuf
1157  * **mp0 for use in returning the chain. The uio is then used only for the
1158  * count in uio_resid.
1159  */
1160 int
soreceive(struct socket * so,struct mbuf ** paddr,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp)1161 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
1162     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1163 {
1164           struct lwp *l = curlwp;
1165           struct mbuf *m, **mp, *mt;
1166           size_t len, offset, moff, orig_resid;
1167           int atomic, flags, error, s, type;
1168           const struct protosw *pr;
1169           struct mbuf *nextrecord;
1170           int mbuf_removed = 0;
1171           const struct domain *dom;
1172           short wakeup_state = 0;
1173 
1174           pr = so->so_proto;
1175           atomic = pr->pr_flags & PR_ATOMIC;
1176           dom = pr->pr_domain;
1177           mp = mp0;
1178           type = 0;
1179           orig_resid = uio->uio_resid;
1180 
1181           if (paddr != NULL)
1182                     *paddr = NULL;
1183           if (controlp != NULL)
1184                     *controlp = NULL;
1185           if (flagsp != NULL)
1186                     flags = *flagsp &~ MSG_EOR;
1187           else
1188                     flags = 0;
1189 
1190           if (flags & MSG_OOB) {
1191                     m = m_get(M_WAIT, MT_DATA);
1192                     solock(so);
1193                     error = (*pr->pr_usrreqs->pr_recvoob)(so, m, flags & MSG_PEEK);
1194                     sounlock(so);
1195                     if (error)
1196                               goto bad;
1197                     do {
1198                               error = uiomove(mtod(m, void *),
1199                                   MIN(uio->uio_resid, m->m_len), uio);
1200                               m = m_free(m);
1201                     } while (uio->uio_resid > 0 && error == 0 && m);
1202 bad:
1203                     m_freem(m);
1204                     return error;
1205           }
1206           if (mp != NULL)
1207                     *mp = NULL;
1208 
1209           /*
1210            * solock() provides atomicity of access.  splsoftnet() prevents
1211            * protocol processing soft interrupts from interrupting us and
1212            * blocking (expensive).
1213            */
1214           s = splsoftnet();
1215           solock(so);
1216 restart:
1217           if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) {
1218                     sounlock(so);
1219                     splx(s);
1220                     return error;
1221           }
1222           m = so->so_rcv.sb_mb;
1223 
1224           /*
1225            * If we have less data than requested, block awaiting more
1226            * (subject to any timeout) if:
1227            *   1. the current count is less than the low water mark,
1228            *   2. MSG_WAITALL is set, and it is possible to do the entire
1229            *        receive operation at once if we block (resid <= hiwat), or
1230            *   3. MSG_DONTWAIT is not set.
1231            * If MSG_WAITALL is set but resid is larger than the receive buffer,
1232            * we have to do the receive in sections, and thus risk returning
1233            * a short count if a timeout or signal occurs after we start.
1234            */
1235           if (m == NULL ||
1236               ((flags & MSG_DONTWAIT) == 0 &&
1237                so->so_rcv.sb_cc < uio->uio_resid &&
1238                (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1239                 ((flags & MSG_WAITALL) &&
1240                  uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1241                m->m_nextpkt == NULL && !atomic)) {
1242 #ifdef DIAGNOSTIC
1243                     if (m == NULL && so->so_rcv.sb_cc)
1244                               panic("receive 1");
1245 #endif
1246                     if (so->so_error || so->so_rerror) {
1247                               u_short *e;
1248                               if (m != NULL)
1249                                         goto dontblock;
1250                               e = so->so_error ? &so->so_error : &so->so_rerror;
1251                               error = SET_ERROR(*e);
1252                               if ((flags & MSG_PEEK) == 0)
1253                                         *e = 0;
1254                               goto release;
1255                     }
1256                     if (so->so_state & SS_CANTRCVMORE) {
1257                               if (m != NULL)
1258                                         goto dontblock;
1259                               else
1260                                         goto release;
1261                     }
1262                     for (; m != NULL; m = m->m_next)
1263                               if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
1264                                         m = so->so_rcv.sb_mb;
1265                                         goto dontblock;
1266                               }
1267                     if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1268                         (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1269                               error = SET_ERROR(ENOTCONN);
1270                               goto release;
1271                     }
1272                     if (uio->uio_resid == 0)
1273                               goto release;
1274                     if ((so->so_state & SS_NBIO) ||
1275                         (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1276                               error = SET_ERROR(EWOULDBLOCK);
1277                               goto release;
1278                     }
1279                     SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
1280                     SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
1281                     sbunlock(&so->so_rcv);
1282                     if (wakeup_state & SS_RESTARTSYS)
1283                               error = SET_ERROR(ERESTART);
1284                     else
1285                               error = sbwait(&so->so_rcv);
1286                     if (error != 0) {
1287                               sounlock(so);
1288                               splx(s);
1289                               return error;
1290                     }
1291                     wakeup_state = so->so_state;
1292                     goto restart;
1293           }
1294 
1295 dontblock:
1296           /*
1297            * On entry here, m points to the first record of the socket buffer.
1298            * From this point onward, we maintain 'nextrecord' as a cache of the
1299            * pointer to the next record in the socket buffer.  We must keep the
1300            * various socket buffer pointers and local stack versions of the
1301            * pointers in sync, pushing out modifications before dropping the
1302            * socket lock, and re-reading them when picking it up.
1303            *
1304            * Otherwise, we will race with the network stack appending new data
1305            * or records onto the socket buffer by using inconsistent/stale
1306            * versions of the field, possibly resulting in socket buffer
1307            * corruption.
1308            *
1309            * By holding the high-level sblock(), we prevent simultaneous
1310            * readers from pulling off the front of the socket buffer.
1311            */
1312           if (l != NULL)
1313                     l->l_ru.ru_msgrcv++;
1314           KASSERT(m == so->so_rcv.sb_mb);
1315           SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
1316           SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
1317           nextrecord = m->m_nextpkt;
1318 
1319           if (pr->pr_flags & PR_ADDR) {
1320                     KASSERT(m->m_type == MT_SONAME);
1321                     orig_resid = 0;
1322                     if (flags & MSG_PEEK) {
1323                               if (paddr)
1324                                         *paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
1325                               m = m->m_next;
1326                     } else {
1327                               sbfree(&so->so_rcv, m);
1328                               mbuf_removed = 1;
1329                               if (paddr != NULL) {
1330                                         *paddr = m;
1331                                         so->so_rcv.sb_mb = m->m_next;
1332                                         m->m_next = NULL;
1333                                         m = so->so_rcv.sb_mb;
1334                               } else {
1335                                         m = so->so_rcv.sb_mb = m_free(m);
1336                               }
1337                               sbsync(&so->so_rcv, nextrecord);
1338                     }
1339           }
1340 
1341           if (pr->pr_flags & PR_ADDR_OPT) {
1342                     /*
1343                      * For SCTP we may be getting a whole message OR a partial
1344                      * delivery.
1345                      */
1346                     if (m->m_type == MT_SONAME) {
1347                               orig_resid = 0;
1348                               if (flags & MSG_PEEK) {
1349                                         if (paddr)
1350                                                   *paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
1351                                         m = m->m_next;
1352                               } else {
1353                                         sbfree(&so->so_rcv, m);
1354                                         mbuf_removed = 1;
1355                                         if (paddr) {
1356                                                   *paddr = m;
1357                                                   so->so_rcv.sb_mb = m->m_next;
1358                                                   m->m_next = 0;
1359                                                   m = so->so_rcv.sb_mb;
1360                                         } else {
1361                                                   m = so->so_rcv.sb_mb = m_free(m);
1362                                         }
1363                                         sbsync(&so->so_rcv, nextrecord);
1364                               }
1365                     }
1366           }
1367 
1368           /*
1369            * Process one or more MT_CONTROL mbufs present before any data mbufs
1370            * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
1371            * just copy the data; if !MSG_PEEK, we call into the protocol to
1372            * perform externalization (or freeing if controlp == NULL).
1373            */
1374           if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) {
1375                     struct mbuf *cm = NULL, *cmn;
1376                     struct mbuf **cme = &cm;
1377 
1378                     do {
1379                               if (flags & MSG_PEEK) {
1380                                         if (controlp != NULL) {
1381                                                   *controlp = m_copym(m, 0, m->m_len, M_DONTWAIT);
1382                                                   controlp = (*controlp == NULL ? NULL :
1383                                                       &(*controlp)->m_next);
1384                                         }
1385                                         m = m->m_next;
1386                               } else {
1387                                         sbfree(&so->so_rcv, m);
1388                                         so->so_rcv.sb_mb = m->m_next;
1389                                         m->m_next = NULL;
1390                                         *cme = m;
1391                                         cme = &(*cme)->m_next;
1392                                         m = so->so_rcv.sb_mb;
1393                               }
1394                     } while (m != NULL && m->m_type == MT_CONTROL);
1395                     if ((flags & MSG_PEEK) == 0)
1396                               sbsync(&so->so_rcv, nextrecord);
1397 
1398                     for (; cm != NULL; cm = cmn) {
1399                               cmn = cm->m_next;
1400                               cm->m_next = NULL;
1401                               type = mtod(cm, struct cmsghdr *)->cmsg_type;
1402                               if (controlp != NULL) {
1403                                         if (dom->dom_externalize != NULL &&
1404                                             type == SCM_RIGHTS) {
1405                                                   sounlock(so);
1406                                                   splx(s);
1407                                                   error = (*dom->dom_externalize)(cm, l,
1408                                                       (flags & MSG_CMSG_CLOEXEC) ?
1409                                                       O_CLOEXEC : 0);
1410                                                   s = splsoftnet();
1411                                                   solock(so);
1412                                         }
1413                                         *controlp = cm;
1414                                         while (*controlp != NULL)
1415                                                   controlp = &(*controlp)->m_next;
1416                               } else {
1417                                         /*
1418                                          * Dispose of any SCM_RIGHTS message that went
1419                                          * through the read path rather than recv.
1420                                          */
1421                                         if (dom->dom_dispose != NULL &&
1422                                             type == SCM_RIGHTS) {
1423                                                   sounlock(so);
1424                                                   (*dom->dom_dispose)(cm);
1425                                                   solock(so);
1426                                         }
1427                                         m_freem(cm);
1428                               }
1429                     }
1430                     if (m != NULL)
1431                               nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1432                     else
1433                               nextrecord = so->so_rcv.sb_mb;
1434                     orig_resid = 0;
1435           }
1436 
1437           /* If m is non-NULL, we have some data to read. */
1438           if (__predict_true(m != NULL)) {
1439                     type = m->m_type;
1440                     if (type == MT_OOBDATA)
1441                               flags |= MSG_OOB;
1442           }
1443           SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
1444           SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
1445 
1446           moff = 0;
1447           offset = 0;
1448           while (m != NULL && uio->uio_resid > 0 && error == 0) {
1449                     /*
1450                      * If the type of mbuf has changed, end the receive
1451                      * operation and do a short read.
1452                      */
1453                     if (m->m_type == MT_OOBDATA) {
1454                               if (type != MT_OOBDATA)
1455                                         break;
1456                     } else if (type == MT_OOBDATA) {
1457                               break;
1458                     } else if (m->m_type == MT_CONTROL) {
1459                               break;
1460                     }
1461 #ifdef DIAGNOSTIC
1462                     else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
1463                               panic("%s: m_type=%d", __func__, m->m_type);
1464                     }
1465 #endif
1466 
1467                     so->so_state &= ~SS_RCVATMARK;
1468                     wakeup_state = 0;
1469                     len = uio->uio_resid;
1470                     if (so->so_oobmark && len > so->so_oobmark - offset)
1471                               len = so->so_oobmark - offset;
1472                     if (len > m->m_len - moff)
1473                               len = m->m_len - moff;
1474 
1475                     /*
1476                      * If mp is set, just pass back the mbufs.
1477                      * Otherwise copy them out via the uio, then free.
1478                      * Sockbuf must be consistent here (points to current mbuf,
1479                      * it points to next record) when we drop priority;
1480                      * we must note any additions to the sockbuf when we
1481                      * block interrupts again.
1482                      */
1483                     if (mp == NULL) {
1484                               SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
1485                               SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
1486                               sounlock(so);
1487                               splx(s);
1488                               error = uiomove(mtod(m, char *) + moff, len, uio);
1489                               s = splsoftnet();
1490                               solock(so);
1491                               if (error != 0) {
1492                                         /*
1493                                          * If any part of the record has been removed
1494                                          * (such as the MT_SONAME mbuf, which will
1495                                          * happen when PR_ADDR, and thus also
1496                                          * PR_ATOMIC, is set), then drop the entire
1497                                          * record to maintain the atomicity of the
1498                                          * receive operation.
1499                                          *
1500                                          * This avoids a later panic("receive 1a")
1501                                          * when compiled with DIAGNOSTIC.
1502                                          */
1503                                         if (m && mbuf_removed && atomic)
1504                                                   (void) sbdroprecord(&so->so_rcv);
1505 
1506                                         goto release;
1507                               }
1508                     } else {
1509                               uio->uio_resid -= len;
1510                     }
1511 
1512                     if (len == m->m_len - moff) {
1513                               if (m->m_flags & M_EOR)
1514                                         flags |= MSG_EOR;
1515 #ifdef SCTP
1516                               if (m->m_flags & M_NOTIFICATION)
1517                                         flags |= MSG_NOTIFICATION;
1518 #endif
1519                               if (flags & MSG_PEEK) {
1520                                         m = m->m_next;
1521                                         moff = 0;
1522                               } else {
1523                                         nextrecord = m->m_nextpkt;
1524                                         sbfree(&so->so_rcv, m);
1525                                         if (mp) {
1526                                                   *mp = m;
1527                                                   mp = &m->m_next;
1528                                                   so->so_rcv.sb_mb = m = m->m_next;
1529                                                   *mp = NULL;
1530                                         } else {
1531                                                   m = so->so_rcv.sb_mb = m_free(m);
1532                                         }
1533                                         /*
1534                                          * If m != NULL, we also know that
1535                                          * so->so_rcv.sb_mb != NULL.
1536                                          */
1537                                         KASSERT(so->so_rcv.sb_mb == m);
1538                                         if (m) {
1539                                                   m->m_nextpkt = nextrecord;
1540                                                   if (nextrecord == NULL)
1541                                                             so->so_rcv.sb_lastrecord = m;
1542                                         } else {
1543                                                   so->so_rcv.sb_mb = nextrecord;
1544                                                   SB_EMPTY_FIXUP(&so->so_rcv);
1545                                         }
1546                                         SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
1547                                         SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
1548                               }
1549                     } else if (flags & MSG_PEEK) {
1550                               moff += len;
1551                     } else {
1552                               if (mp != NULL) {
1553                                         mt = m_copym(m, 0, len, M_NOWAIT);
1554                                         if (__predict_false(mt == NULL)) {
1555                                                   sounlock(so);
1556                                                   mt = m_copym(m, 0, len, M_WAIT);
1557                                                   solock(so);
1558                                         }
1559                                         *mp = mt;
1560                               }
1561                               m->m_data += len;
1562                               m->m_len -= len;
1563                               so->so_rcv.sb_cc -= len;
1564                     }
1565 
1566                     if (so->so_oobmark) {
1567                               if ((flags & MSG_PEEK) == 0) {
1568                                         so->so_oobmark -= len;
1569                                         if (so->so_oobmark == 0) {
1570                                                   so->so_state |= SS_RCVATMARK;
1571                                                   break;
1572                                         }
1573                               } else {
1574                                         offset += len;
1575                                         if (offset == so->so_oobmark)
1576                                                   break;
1577                               }
1578                     } else {
1579                               so->so_state &= ~SS_POLLRDBAND;
1580                     }
1581                     if (flags & MSG_EOR)
1582                               break;
1583 
1584                     /*
1585                      * If the MSG_WAITALL flag is set (for non-atomic socket),
1586                      * we must not quit until "uio->uio_resid == 0" or an error
1587                      * termination.  If a signal/timeout occurs, return
1588                      * with a short count but without error.
1589                      * Keep sockbuf locked against other readers.
1590                      */
1591                     while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1592                         !sosendallatonce(so) && !nextrecord) {
1593                               if (so->so_error || so->so_rerror ||
1594                                   so->so_state & SS_CANTRCVMORE)
1595                                         break;
1596                               /*
1597                                * If we are peeking and the socket receive buffer is
1598                                * full, stop since we can't get more data to peek at.
1599                                */
1600                               if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
1601                                         break;
1602                               /*
1603                                * If we've drained the socket buffer, tell the
1604                                * protocol in case it needs to do something to
1605                                * get it filled again.
1606                                */
1607                               if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
1608                                         (*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
1609                               SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
1610                               SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
1611                               if (wakeup_state & SS_RESTARTSYS)
1612                                         error = SET_ERROR(ERESTART);
1613                               else
1614                                         error = sbwait(&so->so_rcv);
1615                               if (error != 0) {
1616                                         sbunlock(&so->so_rcv);
1617                                         sounlock(so);
1618                                         splx(s);
1619                                         return 0;
1620                               }
1621                               if ((m = so->so_rcv.sb_mb) != NULL)
1622                                         nextrecord = m->m_nextpkt;
1623                               wakeup_state = so->so_state;
1624                     }
1625           }
1626 
1627           if (m && atomic) {
1628                     flags |= MSG_TRUNC;
1629                     if ((flags & MSG_PEEK) == 0)
1630                               (void) sbdroprecord(&so->so_rcv);
1631           }
1632           if ((flags & MSG_PEEK) == 0) {
1633                     if (m == NULL) {
1634                               /*
1635                                * First part is an inline SB_EMPTY_FIXUP().  Second
1636                                * part makes sure sb_lastrecord is up-to-date if
1637                                * there is still data in the socket buffer.
1638                                */
1639                               so->so_rcv.sb_mb = nextrecord;
1640                               if (so->so_rcv.sb_mb == NULL) {
1641                                         so->so_rcv.sb_mbtail = NULL;
1642                                         so->so_rcv.sb_lastrecord = NULL;
1643                               } else if (nextrecord->m_nextpkt == NULL)
1644                                         so->so_rcv.sb_lastrecord = nextrecord;
1645                     }
1646                     SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
1647                     SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
1648                     if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
1649                               (*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
1650           }
1651           if (orig_resid == uio->uio_resid && orig_resid &&
1652               (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
1653                     sbunlock(&so->so_rcv);
1654                     goto restart;
1655           }
1656 
1657           if (flagsp != NULL)
1658                     *flagsp |= flags;
1659 release:
1660           sbunlock(&so->so_rcv);
1661           sounlock(so);
1662           splx(s);
1663           return error;
1664 }
1665 
1666 int
soshutdown(struct socket * so,int how)1667 soshutdown(struct socket *so, int how)
1668 {
1669           const struct protosw *pr;
1670           int error;
1671 
1672           KASSERT(solocked(so));
1673 
1674           pr = so->so_proto;
1675           if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
1676                     return SET_ERROR(EINVAL);
1677 
1678           if (how == SHUT_RD || how == SHUT_RDWR) {
1679                     sorflush(so);
1680                     error = 0;
1681           }
1682           if (how == SHUT_WR || how == SHUT_RDWR)
1683                     error = (*pr->pr_usrreqs->pr_shutdown)(so);
1684 
1685           return error;
1686 }
1687 
1688 void
sorestart(struct socket * so)1689 sorestart(struct socket *so)
1690 {
1691           /*
1692            * An application has called close() on an fd on which another
1693            * of its threads has called a socket system call.
1694            * Mark this and wake everyone up, and code that would block again
1695            * instead returns ERESTART.
1696            * On system call re-entry the fd is validated and EBADF returned.
1697            * Any other fd will block again on the 2nd syscall.
1698            */
1699           solock(so);
1700           so->so_state |= SS_RESTARTSYS;
1701           cv_broadcast(&so->so_cv);
1702           cv_broadcast(&so->so_snd.sb_cv);
1703           cv_broadcast(&so->so_rcv.sb_cv);
1704           sounlock(so);
1705 }
1706 
1707 void
sorflush(struct socket * so)1708 sorflush(struct socket *so)
1709 {
1710           struct sockbuf *sb, asb;
1711           const struct protosw *pr;
1712 
1713           KASSERT(solocked(so));
1714 
1715           sb = &so->so_rcv;
1716           pr = so->so_proto;
1717           socantrcvmore(so);
1718           sb->sb_flags |= SB_NOINTR;
1719           (void )sblock(sb, M_WAITOK);
1720           sbunlock(sb);
1721           asb = *sb;
1722           /*
1723            * Clear most of the sockbuf structure, but leave some of the
1724            * fields valid.
1725            */
1726           memset(&sb->sb_startzero, 0,
1727               sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
1728           if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) {
1729                     sounlock(so);
1730                     (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1731                     solock(so);
1732           }
1733           sbrelease(&asb, so);
1734 }
1735 
1736 /*
1737  * internal set SOL_SOCKET options
1738  */
1739 static int
sosetopt1(struct socket * so,const struct sockopt * sopt)1740 sosetopt1(struct socket *so, const struct sockopt *sopt)
1741 {
1742           int error, opt;
1743           int optval = 0; /* XXX: gcc */
1744           struct linger l;
1745           struct timeval tv;
1746 
1747           opt = sopt->sopt_name;
1748 
1749           switch (opt) {
1750 
1751           case SO_ACCEPTFILTER:
1752                     error = accept_filt_setopt(so, sopt);
1753                     KASSERT(solocked(so));
1754                     break;
1755 
1756           case SO_LINGER:
1757                     error = sockopt_get(sopt, &l, sizeof(l));
1758                     solock(so);
1759                     if (error)
1760                               break;
1761                     if (l.l_linger < 0 || l.l_linger > USHRT_MAX ||
1762                         l.l_linger > (INT_MAX / hz)) {
1763                               error = SET_ERROR(EDOM);
1764                               break;
1765                     }
1766                     so->so_linger = l.l_linger;
1767                     if (l.l_onoff)
1768                               so->so_options |= SO_LINGER;
1769                     else
1770                               so->so_options &= ~SO_LINGER;
1771                     break;
1772 
1773           case SO_DEBUG:
1774           case SO_KEEPALIVE:
1775           case SO_DONTROUTE:
1776           case SO_USELOOPBACK:
1777           case SO_BROADCAST:
1778           case SO_REUSEADDR:
1779           case SO_REUSEPORT:
1780           case SO_OOBINLINE:
1781           case SO_TIMESTAMP:
1782           case SO_NOSIGPIPE:
1783           case SO_RERROR:
1784                     error = sockopt_getint(sopt, &optval);
1785                     solock(so);
1786                     if (error)
1787                               break;
1788                     if (optval)
1789                               so->so_options |= opt;
1790                     else
1791                               so->so_options &= ~opt;
1792                     break;
1793 
1794           case SO_SNDBUF:
1795           case SO_RCVBUF:
1796           case SO_SNDLOWAT:
1797           case SO_RCVLOWAT:
1798                     error = sockopt_getint(sopt, &optval);
1799                     solock(so);
1800                     if (error)
1801                               break;
1802 
1803                     /*
1804                      * Values < 1 make no sense for any of these
1805                      * options, so disallow them.
1806                      */
1807                     if (optval < 1) {
1808                               error = SET_ERROR(EINVAL);
1809                               break;
1810                     }
1811 
1812                     switch (opt) {
1813                     case SO_SNDBUF:
1814                               if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) {
1815                                         error = SET_ERROR(ENOBUFS);
1816                                         break;
1817                               }
1818                               if (sofixedbuf)
1819                                         so->so_snd.sb_flags &= ~SB_AUTOSIZE;
1820                               break;
1821 
1822                     case SO_RCVBUF:
1823                               if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) {
1824                                         error = SET_ERROR(ENOBUFS);
1825                                         break;
1826                               }
1827                               if (sofixedbuf)
1828                                         so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
1829                               break;
1830 
1831                     /*
1832                      * Make sure the low-water is never greater than
1833                      * the high-water.
1834                      */
1835                     case SO_SNDLOWAT:
1836                               if (optval > so->so_snd.sb_hiwat)
1837                                         optval = so->so_snd.sb_hiwat;
1838 
1839                               so->so_snd.sb_lowat = optval;
1840                               break;
1841 
1842                     case SO_RCVLOWAT:
1843                               if (optval > so->so_rcv.sb_hiwat)
1844                                         optval = so->so_rcv.sb_hiwat;
1845 
1846                               so->so_rcv.sb_lowat = optval;
1847                               break;
1848                     }
1849                     break;
1850 
1851           case SO_SNDTIMEO:
1852           case SO_RCVTIMEO:
1853                     solock(so);
1854                     error = sockopt_get(sopt, &tv, sizeof(tv));
1855                     if (error)
1856                               break;
1857 
1858                     if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
1859                               error = SET_ERROR(EDOM);
1860                               break;
1861                     }
1862                     if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
1863                               error = SET_ERROR(EDOM);
1864                               break;
1865                     }
1866 
1867                     optval = tv.tv_sec * hz + tv.tv_usec / tick;
1868                     if (optval == 0 && tv.tv_usec != 0)
1869                               optval = 1;
1870 
1871                     switch (opt) {
1872                     case SO_SNDTIMEO:
1873                               so->so_snd.sb_timeo = optval;
1874                               break;
1875                     case SO_RCVTIMEO:
1876                               so->so_rcv.sb_timeo = optval;
1877                               break;
1878                     }
1879                     break;
1880 
1881           default:
1882                     MODULE_HOOK_CALL(uipc_socket_50_setopt1_hook,
1883                         (opt, so, sopt), enosys(), error);
1884                     if (error == ENOSYS || error == EPASSTHROUGH) {
1885                               solock(so);
1886                               error = SET_ERROR(ENOPROTOOPT);
1887                     }
1888                     break;
1889           }
1890           KASSERT(solocked(so));
1891           return error;
1892 }
1893 
1894 int
sosetopt(struct socket * so,struct sockopt * sopt)1895 sosetopt(struct socket *so, struct sockopt *sopt)
1896 {
1897           int error, prerr;
1898 
1899           if (sopt->sopt_level == SOL_SOCKET) {
1900                     error = sosetopt1(so, sopt);
1901                     KASSERT(solocked(so));
1902           } else {
1903                     error = SET_ERROR(ENOPROTOOPT);
1904                     solock(so);
1905           }
1906 
1907           if ((error == 0 || error == ENOPROTOOPT) &&
1908               so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) {
1909                     /* give the protocol stack a shot */
1910                     prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt);
1911                     if (prerr == 0)
1912                               error = 0;
1913                     else if (prerr != ENOPROTOOPT)
1914                               error = prerr;
1915           }
1916           sounlock(so);
1917           return error;
1918 }
1919 
1920 /*
1921  * so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
1922  */
1923 int
so_setsockopt(struct lwp * l,struct socket * so,int level,int name,const void * val,size_t valsize)1924 so_setsockopt(struct lwp *l, struct socket *so, int level, int name,
1925     const void *val, size_t valsize)
1926 {
1927           struct sockopt sopt;
1928           int error;
1929 
1930           KASSERT(valsize == 0 || val != NULL);
1931 
1932           sockopt_init(&sopt, level, name, valsize);
1933           sockopt_set(&sopt, val, valsize);
1934 
1935           error = sosetopt(so, &sopt);
1936 
1937           sockopt_destroy(&sopt);
1938 
1939           return error;
1940 }
1941 
1942 /*
1943  * internal get SOL_SOCKET options
1944  */
1945 static int
sogetopt1(struct socket * so,struct sockopt * sopt)1946 sogetopt1(struct socket *so, struct sockopt *sopt)
1947 {
1948           int error, optval, opt;
1949           struct linger l;
1950           struct timeval tv;
1951 
1952           switch ((opt = sopt->sopt_name)) {
1953 
1954           case SO_ACCEPTFILTER:
1955                     error = accept_filt_getopt(so, sopt);
1956                     break;
1957 
1958           case SO_LINGER:
1959                     l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0;
1960                     l.l_linger = so->so_linger;
1961 
1962                     error = sockopt_set(sopt, &l, sizeof(l));
1963                     break;
1964 
1965           case SO_USELOOPBACK:
1966           case SO_DONTROUTE:
1967           case SO_DEBUG:
1968           case SO_KEEPALIVE:
1969           case SO_REUSEADDR:
1970           case SO_REUSEPORT:
1971           case SO_BROADCAST:
1972           case SO_OOBINLINE:
1973           case SO_TIMESTAMP:
1974           case SO_NOSIGPIPE:
1975           case SO_RERROR:
1976           case SO_ACCEPTCONN:
1977                     error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0);
1978                     break;
1979 
1980           case SO_TYPE:
1981                     error = sockopt_setint(sopt, so->so_type);
1982                     break;
1983 
1984           case SO_ERROR:
1985                     if (so->so_error == 0) {
1986                               so->so_error = so->so_rerror;
1987                               so->so_rerror = 0;
1988                     }
1989                     error = sockopt_setint(sopt, so->so_error);
1990                     so->so_error = 0;
1991                     break;
1992 
1993           case SO_SNDBUF:
1994                     error = sockopt_setint(sopt, so->so_snd.sb_hiwat);
1995                     break;
1996 
1997           case SO_RCVBUF:
1998                     error = sockopt_setint(sopt, so->so_rcv.sb_hiwat);
1999                     break;
2000 
2001           case SO_SNDLOWAT:
2002                     error = sockopt_setint(sopt, so->so_snd.sb_lowat);
2003                     break;
2004 
2005           case SO_RCVLOWAT:
2006                     error = sockopt_setint(sopt, so->so_rcv.sb_lowat);
2007                     break;
2008 
2009           case SO_SNDTIMEO:
2010           case SO_RCVTIMEO:
2011                     optval = (opt == SO_SNDTIMEO ?
2012                          so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
2013 
2014                     memset(&tv, 0, sizeof(tv));
2015                     tv.tv_sec = optval / hz;
2016                     tv.tv_usec = (optval % hz) * tick;
2017 
2018                     error = sockopt_set(sopt, &tv, sizeof(tv));
2019                     break;
2020 
2021           case SO_OVERFLOWED:
2022                     error = sockopt_setint(sopt, so->so_rcv.sb_overflowed);
2023                     break;
2024 
2025           default:
2026                     MODULE_HOOK_CALL(uipc_socket_50_getopt1_hook,
2027                         (opt, so, sopt), enosys(), error);
2028                     if (error)
2029                               error = SET_ERROR(ENOPROTOOPT);
2030                     break;
2031           }
2032 
2033           return error;
2034 }
2035 
2036 int
sogetopt(struct socket * so,struct sockopt * sopt)2037 sogetopt(struct socket *so, struct sockopt *sopt)
2038 {
2039           int error;
2040 
2041           solock(so);
2042           if (sopt->sopt_level != SOL_SOCKET) {
2043                     if (so->so_proto && so->so_proto->pr_ctloutput) {
2044                               error = ((*so->so_proto->pr_ctloutput)
2045                                   (PRCO_GETOPT, so, sopt));
2046                     } else
2047                               error = SET_ERROR(ENOPROTOOPT);
2048           } else {
2049                     error = sogetopt1(so, sopt);
2050           }
2051           sounlock(so);
2052           return error;
2053 }
2054 
2055 /*
2056  * alloc sockopt data buffer buffer
2057  *        - will be released at destroy
2058  */
2059 static int
sockopt_alloc(struct sockopt * sopt,size_t len,km_flag_t kmflag)2060 sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag)
2061 {
2062           void *data;
2063 
2064           KASSERT(sopt->sopt_size == 0);
2065 
2066           if (len > sizeof(sopt->sopt_buf)) {
2067                     data = kmem_zalloc(len, kmflag);
2068                     if (data == NULL)
2069                               return SET_ERROR(ENOMEM);
2070                     sopt->sopt_data = data;
2071           } else
2072                     sopt->sopt_data = sopt->sopt_buf;
2073 
2074           sopt->sopt_size = len;
2075           return 0;
2076 }
2077 
2078 /*
2079  * initialise sockopt storage
2080  *        - MAY sleep during allocation
2081  */
2082 void
sockopt_init(struct sockopt * sopt,int level,int name,size_t size)2083 sockopt_init(struct sockopt *sopt, int level, int name, size_t size)
2084 {
2085 
2086           memset(sopt, 0, sizeof(*sopt));
2087 
2088           sopt->sopt_level = level;
2089           sopt->sopt_name = name;
2090           (void)sockopt_alloc(sopt, size, KM_SLEEP);
2091 }
2092 
2093 /*
2094  * destroy sockopt storage
2095  *        - will release any held memory references
2096  */
2097 void
sockopt_destroy(struct sockopt * sopt)2098 sockopt_destroy(struct sockopt *sopt)
2099 {
2100 
2101           if (sopt->sopt_data != sopt->sopt_buf)
2102                     kmem_free(sopt->sopt_data, sopt->sopt_size);
2103 
2104           memset(sopt, 0, sizeof(*sopt));
2105 }
2106 
2107 /*
2108  * set sockopt value
2109  *        - value is copied into sockopt
2110  *        - memory is allocated when necessary, will not sleep
2111  */
2112 int
sockopt_set(struct sockopt * sopt,const void * buf,size_t len)2113 sockopt_set(struct sockopt *sopt, const void *buf, size_t len)
2114 {
2115           int error;
2116 
2117           if (sopt->sopt_size == 0) {
2118                     error = sockopt_alloc(sopt, len, KM_NOSLEEP);
2119                     if (error)
2120                               return error;
2121           }
2122 
2123           sopt->sopt_retsize = MIN(sopt->sopt_size, len);
2124           if (sopt->sopt_retsize > 0) {
2125                     memcpy(sopt->sopt_data, buf, sopt->sopt_retsize);
2126           }
2127 
2128           return 0;
2129 }
2130 
/*
 * common case of set sockopt integer value
 *	- shorthand for sockopt_set() with an int-sized payload
 */
int
sockopt_setint(struct sockopt *sopt, int val)
{

	return sockopt_set(sopt, &val, sizeof(val));
}
2140 
2141 /*
2142  * get sockopt value
2143  *        - correct size must be given
2144  */
2145 int
sockopt_get(const struct sockopt * sopt,void * buf,size_t len)2146 sockopt_get(const struct sockopt *sopt, void *buf, size_t len)
2147 {
2148 
2149           if (sopt->sopt_size != len)
2150                     return SET_ERROR(EINVAL);
2151 
2152           memcpy(buf, sopt->sopt_data, len);
2153           return 0;
2154 }
2155 
/*
 * common case of get sockopt integer value
 *	- shorthand for sockopt_get() with an int-sized payload, so
 *	  the stored data must be exactly sizeof(int) bytes
 */
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{

	return sockopt_get(sopt, valp, sizeof(*valp));
}
2165 
2166 /*
2167  * set sockopt value from mbuf
2168  *        - ONLY for legacy code
2169  *        - mbuf is released by sockopt
2170  *        - will not sleep
2171  */
2172 int
sockopt_setmbuf(struct sockopt * sopt,struct mbuf * m)2173 sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m)
2174 {
2175           size_t len;
2176           int error;
2177 
2178           len = m_length(m);
2179 
2180           if (sopt->sopt_size == 0) {
2181                     error = sockopt_alloc(sopt, len, KM_NOSLEEP);
2182                     if (error)
2183                               return error;
2184           }
2185 
2186           sopt->sopt_retsize = MIN(sopt->sopt_size, len);
2187           m_copydata(m, 0, sopt->sopt_retsize, sopt->sopt_data);
2188           m_freem(m);
2189 
2190           return 0;
2191 }
2192 
2193 /*
2194  * get sockopt value into mbuf
2195  *        - ONLY for legacy code
2196  *        - mbuf to be released by the caller
2197  *        - will not sleep
2198  */
2199 struct mbuf *
sockopt_getmbuf(const struct sockopt * sopt)2200 sockopt_getmbuf(const struct sockopt *sopt)
2201 {
2202           struct mbuf *m;
2203 
2204           if (sopt->sopt_size > MCLBYTES)
2205                     return NULL;
2206 
2207           m = m_get(M_DONTWAIT, MT_SOOPTS);
2208           if (m == NULL)
2209                     return NULL;
2210 
2211           if (sopt->sopt_size > MLEN) {
2212                     MCLGET(m, M_DONTWAIT);
2213                     if ((m->m_flags & M_EXT) == 0) {
2214                               m_free(m);
2215                               return NULL;
2216                     }
2217           }
2218 
2219           memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size);
2220           m->m_len = sopt->sopt_size;
2221 
2222           return m;
2223 }
2224 
2225 void
sohasoutofband(struct socket * so)2226 sohasoutofband(struct socket *so)
2227 {
2228 
2229           so->so_state |= SS_POLLRDBAND;
2230           fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
2231           selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT);
2232 }
2233 
2234 static void
filt_sordetach(struct knote * kn)2235 filt_sordetach(struct knote *kn)
2236 {
2237           struct socket *so;
2238 
2239           so = ((file_t *)kn->kn_obj)->f_socket;
2240           solock(so);
2241           if (selremove_knote(&so->so_rcv.sb_sel, kn))
2242                     so->so_rcv.sb_flags &= ~SB_KNOTE;
2243           sounlock(so);
2244 }
2245 
2246 /*ARGSUSED*/
2247 static int
filt_soread(struct knote * kn,long hint)2248 filt_soread(struct knote *kn, long hint)
2249 {
2250           struct socket *so;
2251           int rv;
2252 
2253           so = ((file_t *)kn->kn_obj)->f_socket;
2254           if (hint != NOTE_SUBMIT)
2255                     solock(so);
2256           kn->kn_data = so->so_rcv.sb_cc;
2257           if (so->so_state & SS_CANTRCVMORE) {
2258                     knote_set_eof(kn, 0);
2259                     kn->kn_fflags = so->so_error;
2260                     rv = 1;
2261           } else if (so->so_error || so->so_rerror)
2262                     rv = 1;
2263           else if (kn->kn_sfflags & NOTE_LOWAT)
2264                     rv = (kn->kn_data >= kn->kn_sdata);
2265           else
2266                     rv = (kn->kn_data >= so->so_rcv.sb_lowat);
2267           if (hint != NOTE_SUBMIT)
2268                     sounlock(so);
2269           return rv;
2270 }
2271 
2272 static void
filt_sowdetach(struct knote * kn)2273 filt_sowdetach(struct knote *kn)
2274 {
2275           struct socket *so;
2276 
2277           so = ((file_t *)kn->kn_obj)->f_socket;
2278           solock(so);
2279           if (selremove_knote(&so->so_snd.sb_sel, kn))
2280                     so->so_snd.sb_flags &= ~SB_KNOTE;
2281           sounlock(so);
2282 }
2283 
2284 /*ARGSUSED*/
2285 static int
filt_sowrite(struct knote * kn,long hint)2286 filt_sowrite(struct knote *kn, long hint)
2287 {
2288           struct socket *so;
2289           int rv;
2290 
2291           so = ((file_t *)kn->kn_obj)->f_socket;
2292           if (hint != NOTE_SUBMIT)
2293                     solock(so);
2294           kn->kn_data = sbspace(&so->so_snd);
2295           if (so->so_state & SS_CANTSENDMORE) {
2296                     knote_set_eof(kn, 0);
2297                     kn->kn_fflags = so->so_error;
2298                     rv = 1;
2299           } else if (so->so_error)
2300                     rv = 1;
2301           else if (((so->so_state & SS_ISCONNECTED) == 0) &&
2302               (so->so_proto->pr_flags & PR_CONNREQUIRED))
2303                     rv = 0;
2304           else if (kn->kn_sfflags & NOTE_LOWAT)
2305                     rv = (kn->kn_data >= kn->kn_sdata);
2306           else
2307                     rv = (kn->kn_data >= so->so_snd.sb_lowat);
2308           if (hint != NOTE_SUBMIT)
2309                     sounlock(so);
2310           return rv;
2311 }
2312 
2313 static int
filt_soempty(struct knote * kn,long hint)2314 filt_soempty(struct knote *kn, long hint)
2315 {
2316           struct socket *so;
2317           int rv;
2318 
2319           so = ((file_t *)kn->kn_obj)->f_socket;
2320           if (hint != NOTE_SUBMIT)
2321                     solock(so);
2322           rv = (kn->kn_data = sbused(&so->so_snd)) == 0 ||
2323                (so->so_options & SO_ACCEPTCONN) != 0;
2324           if (hint != NOTE_SUBMIT)
2325                     sounlock(so);
2326           return rv;
2327 }
2328 
2329 /*ARGSUSED*/
2330 static int
filt_solisten(struct knote * kn,long hint)2331 filt_solisten(struct knote *kn, long hint)
2332 {
2333           struct socket *so;
2334           int rv;
2335 
2336           so = ((file_t *)kn->kn_obj)->f_socket;
2337 
2338           /*
2339            * Set kn_data to number of incoming connections, not
2340            * counting partial (incomplete) connections.
2341            */
2342           if (hint != NOTE_SUBMIT)
2343                     solock(so);
2344           kn->kn_data = so->so_qlen;
2345           rv = (kn->kn_data > 0);
2346           if (hint != NOTE_SUBMIT)
2347                     sounlock(so);
2348           return rv;
2349 }
2350 
2351 static const struct filterops solisten_filtops = {
2352           .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
2353           .f_attach = NULL,
2354           .f_detach = filt_sordetach,
2355           .f_event = filt_solisten,
2356 };
2357 
2358 static const struct filterops soread_filtops = {
2359           .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
2360           .f_attach = NULL,
2361           .f_detach = filt_sordetach,
2362           .f_event = filt_soread,
2363 };
2364 
2365 static const struct filterops sowrite_filtops = {
2366           .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
2367           .f_attach = NULL,
2368           .f_detach = filt_sowdetach,
2369           .f_event = filt_sowrite,
2370 };
2371 
2372 static const struct filterops soempty_filtops = {
2373           .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
2374           .f_attach = NULL,
2375           .f_detach = filt_sowdetach,
2376           .f_event = filt_soempty,
2377 };
2378 
2379 int
soo_kqfilter(struct file * fp,struct knote * kn)2380 soo_kqfilter(struct file *fp, struct knote *kn)
2381 {
2382           struct socket *so;
2383           struct sockbuf *sb;
2384 
2385           so = ((file_t *)kn->kn_obj)->f_socket;
2386           solock(so);
2387           switch (kn->kn_filter) {
2388           case EVFILT_READ:
2389                     if (so->so_options & SO_ACCEPTCONN)
2390                               kn->kn_fop = &solisten_filtops;
2391                     else
2392                               kn->kn_fop = &soread_filtops;
2393                     sb = &so->so_rcv;
2394                     break;
2395           case EVFILT_WRITE:
2396                     kn->kn_fop = &sowrite_filtops;
2397                     sb = &so->so_snd;
2398 
2399 #ifdef PIPE_SOCKETPAIR
2400                     if (so->so_state & SS_ISAPIPE) {
2401                               /* Other end of pipe has been closed. */
2402                               if (so->so_state & SS_ISDISCONNECTED) {
2403                                         sounlock(so);
2404                                         return SET_ERROR(EBADF);
2405                               }
2406                     }
2407 #endif
2408                     break;
2409           case EVFILT_EMPTY:
2410                     kn->kn_fop = &soempty_filtops;
2411                     sb = &so->so_snd;
2412                     break;
2413           default:
2414                     sounlock(so);
2415                     return SET_ERROR(EINVAL);
2416           }
2417           selrecord_knote(&sb->sb_sel, kn);
2418           sb->sb_flags |= SB_KNOTE;
2419           sounlock(so);
2420           return 0;
2421 }
2422 
2423 static int
sodopoll(struct socket * so,int events)2424 sodopoll(struct socket *so, int events)
2425 {
2426           int revents;
2427 
2428           revents = 0;
2429 
2430           if (events & (POLLIN | POLLRDNORM))
2431                     if (soreadable(so))
2432                               revents |= events & (POLLIN | POLLRDNORM);
2433 
2434           if (events & (POLLOUT | POLLWRNORM))
2435                     if (sowritable(so))
2436                               revents |= events & (POLLOUT | POLLWRNORM);
2437 
2438           if (events & (POLLPRI | POLLRDBAND))
2439                     if (so->so_state & SS_POLLRDBAND)
2440                               revents |= events & (POLLPRI | POLLRDBAND);
2441 
2442           return revents;
2443 }
2444 
2445 int
sopoll(struct socket * so,int events)2446 sopoll(struct socket *so, int events)
2447 {
2448           int revents = 0;
2449 
2450 #ifndef DIAGNOSTIC
2451           /*
2452            * Do a quick, unlocked check in expectation that the socket
2453            * will be ready for I/O.  Don't do this check if DIAGNOSTIC,
2454            * as the solocked() assertions will fail.
2455            */
2456           if ((revents = sodopoll(so, events)) != 0)
2457                     return revents;
2458 #endif
2459 
2460           solock(so);
2461           if ((revents = sodopoll(so, events)) == 0) {
2462                     if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2463                               selrecord(curlwp, &so->so_rcv.sb_sel);
2464                               so->so_rcv.sb_flags |= SB_NOTIFY;
2465                     }
2466 
2467                     if (events & (POLLOUT | POLLWRNORM)) {
2468                               selrecord(curlwp, &so->so_snd.sb_sel);
2469                               so->so_snd.sb_flags |= SB_NOTIFY;
2470                     }
2471           }
2472           sounlock(so);
2473 
2474           return revents;
2475 }
2476 
2477 struct mbuf **
sbsavetimestamp(int opt,struct mbuf ** mp)2478 sbsavetimestamp(int opt, struct mbuf **mp)
2479 {
2480           struct timeval tv;
2481           int error;
2482 
2483           memset(&tv, 0, sizeof(tv));
2484           microtime(&tv);
2485 
2486           MODULE_HOOK_CALL(uipc_socket_50_sbts_hook, (opt, &mp), enosys(), error);
2487           if (error == 0)
2488                     return mp;
2489 
2490           if (opt & SO_TIMESTAMP) {
2491                     *mp = sbcreatecontrol(&tv, sizeof(tv),
2492                         SCM_TIMESTAMP, SOL_SOCKET);
2493                     if (*mp)
2494                               mp = &(*mp)->m_next;
2495           }
2496           return mp;
2497 }
2498 
2499 
2500 #include <sys/sysctl.h>
2501 
2502 static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);
2503 static int sysctl_kern_sbmax(SYSCTLFN_PROTO);
2504 
2505 /*
2506  * sysctl helper routine for kern.somaxkva.  ensures that the given
2507  * value is not too small.
2508  * (XXX should we maybe make sure it's not too large as well?)
2509  */
2510 static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)2511 sysctl_kern_somaxkva(SYSCTLFN_ARGS)
2512 {
2513           int error, new_somaxkva;
2514           struct sysctlnode node;
2515 
2516           new_somaxkva = somaxkva;
2517           node = *rnode;
2518           node.sysctl_data = &new_somaxkva;
2519           error = sysctl_lookup(SYSCTLFN_CALL(&node));
2520           if (error || newp == NULL)
2521                     return error;
2522 
2523           if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
2524                     return SET_ERROR(EINVAL);
2525 
2526           mutex_enter(&so_pendfree_lock);
2527           somaxkva = new_somaxkva;
2528           cv_broadcast(&socurkva_cv);
2529           mutex_exit(&so_pendfree_lock);
2530 
2531           return error;
2532 }
2533 
2534 /*
2535  * sysctl helper routine for kern.sbmax. Basically just ensures that
2536  * any new value is not too small.
2537  */
2538 static int
sysctl_kern_sbmax(SYSCTLFN_ARGS)2539 sysctl_kern_sbmax(SYSCTLFN_ARGS)
2540 {
2541           int error, new_sbmax;
2542           struct sysctlnode node;
2543 
2544           new_sbmax = sb_max;
2545           node = *rnode;
2546           node.sysctl_data = &new_sbmax;
2547           error = sysctl_lookup(SYSCTLFN_CALL(&node));
2548           if (error || newp == NULL)
2549                     return error;
2550 
2551           KERNEL_LOCK(1, NULL);
2552           error = sb_max_set(new_sbmax);
2553           KERNEL_UNLOCK_ONE(NULL);
2554 
2555           return error;
2556 }
2557 
2558 /*
2559  * sysctl helper routine for kern.sooptions. Ensures that only allowed
2560  * options can be set.
2561  */
2562 static int
sysctl_kern_sooptions(SYSCTLFN_ARGS)2563 sysctl_kern_sooptions(SYSCTLFN_ARGS)
2564 {
2565           int error, new_options;
2566           struct sysctlnode node;
2567 
2568           new_options = sooptions;
2569           node = *rnode;
2570           node.sysctl_data = &new_options;
2571           error = sysctl_lookup(SYSCTLFN_CALL(&node));
2572           if (error || newp == NULL)
2573                     return error;
2574 
2575           if (new_options & ~SO_DEFOPTS)
2576                     return SET_ERROR(EINVAL);
2577 
2578           sooptions = new_options;
2579 
2580           return 0;
2581 }
2582 
2583 static void
sysctl_kern_socket_setup(void)2584 sysctl_kern_socket_setup(void)
2585 {
2586 
2587           KASSERT(socket_sysctllog == NULL);
2588 
2589           sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
2590                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2591                            CTLTYPE_INT, "somaxkva",
2592                            SYSCTL_DESCR("Maximum amount of kernel memory to be "
2593                                         "used for socket buffers"),
2594                            sysctl_kern_somaxkva, 0, NULL, 0,
2595                            CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
2596 
2597           sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
2598                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2599                            CTLTYPE_BOOL, "sofixedbuf",
2600                            SYSCTL_DESCR("Prevent scaling of fixed socket buffers"),
2601                            NULL, 0, &sofixedbuf, 0,
2602                            CTL_KERN, KERN_SOFIXEDBUF, CTL_EOL);
2603 
2604           sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
2605                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2606                            CTLTYPE_INT, "sbmax",
2607                            SYSCTL_DESCR("Maximum socket buffer size"),
2608                            sysctl_kern_sbmax, 0, NULL, 0,
2609                            CTL_KERN, KERN_SBMAX, CTL_EOL);
2610 
2611           sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
2612                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2613                            CTLTYPE_INT, "sooptions",
2614                            SYSCTL_DESCR("Default socket options"),
2615                            sysctl_kern_sooptions, 0, NULL, 0,
2616                            CTL_KERN, CTL_CREATE, CTL_EOL);
2617 }
2618