1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993
5 * The Regents of the University of California.
6 * Copyright (c) 2004 The FreeBSD Foundation
7 * Copyright (c) 2004-2008 Robert N. M. Watson
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
35 */
36
37 /*
38 * Comments on the socket life cycle:
39 *
 * soalloc() sets up socket layer state for a socket, called only by
 * socreate(), sonewconn() and (when SCTP is configured) sopeeloff().
 * Socket layer private.
42 *
 * sodealloc() tears down socket layer state for a socket, called only by
 * sofree() and, when setting up a new socket fails, by socreate(),
 * sonewconn() and sopeeloff().  Socket layer private.
45 *
46 * pru_attach() associates protocol layer state with an allocated socket;
47 * called only once, may fail, aborting socket allocation. This is called
48 * from socreate() and sonewconn(). Socket layer private.
49 *
50 * pru_detach() disassociates protocol layer state from an attached socket,
51 * and will be called exactly once for sockets in which pru_attach() has
52 * been successfully called. If pru_attach() returned an error,
53 * pru_detach() will not be called. Socket layer private.
54 *
55 * pru_abort() and pru_close() notify the protocol layer that the last
56 * consumer of a socket is starting to tear down the socket, and that the
57 * protocol should terminate the connection. Historically, pru_abort() also
58 * detached protocol state from the socket state, but this is no longer the
59 * case.
60 *
61 * socreate() creates a socket and attaches protocol state. This is a public
62 * interface that may be used by socket layer consumers to create new
63 * sockets.
64 *
65 * sonewconn() creates a socket and attaches protocol state. This is a
66 * public interface that may be used by protocols to create new sockets when
67 * a new connection is received and will be available for accept() on a
68 * listen socket.
69 *
70 * soclose() destroys a socket after possibly waiting for it to disconnect.
71 * This is a public interface that socket consumers should use to close and
72 * release a socket when done with it.
73 *
74 * soabort() destroys a socket without waiting for it to disconnect (used
75 * only for incoming connections that are already partially or fully
76 * connected). This is used internally by the socket layer when clearing
77 * listen socket queues (due to overflow or close on the listen socket), but
78 * is also a public interface protocols may use to abort connections in
79 * their incomplete listen queues should they no longer be required. Sockets
80 * placed in completed connection listen queues should not be aborted for
81 * reasons described in the comment above the soclose() implementation. This
82 * is not a general purpose close routine, and except in the specific
83 * circumstances described here, should not be used.
84 *
85 * sofree() will free a socket and its protocol state if all references on
86 * the socket have been released, and is the public interface to attempt to
87 * free a socket when a reference is removed. This is a socket layer private
88 * interface.
89 *
90 * NOTE: In addition to socreate() and soclose(), which provide a single
91 * socket reference to the consumer to be managed as required, there are two
92 * calls to explicitly manage socket references, soref(), and sorele().
93 * Currently, these are generally required only when transitioning a socket
94 * from a listen queue to a file descriptor, in order to prevent garbage
95 * collection of the socket at an untimely moment. For a number of reasons,
96 * these interfaces are not preferred, and should be avoided.
97 *
98 * NOTE: With regard to VNETs the general rule is that callers do not set
99 * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
100 * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
101 * and sorflush(), which are usually called from a pre-set VNET context.
102 * sopoll() currently does not need a VNET context to be set.
103 */
104
105 #include <sys/cdefs.h>
106 #include "opt_inet.h"
107 #include "opt_inet6.h"
108 #include "opt_kern_tls.h"
109 #include "opt_sctp.h"
110
111 #include <sys/param.h>
112 #include <sys/systm.h>
113 #include <sys/fcntl.h>
114 #include <sys/limits.h>
115 #include <sys/lock.h>
116 #include <sys/mac.h>
117 #include <sys/malloc.h>
118 #include <sys/mbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/domain.h>
121 #include <sys/file.h> /* for struct knote */
122 #include <sys/hhook.h>
123 #include <sys/kernel.h>
124 #include <sys/khelp.h>
125 #include <sys/ktls.h>
126 #include <sys/event.h>
127 #include <sys/eventhandler.h>
128 #include <sys/poll.h>
129 #include <sys/priv.h>
130 #include <sys/proc.h>
131 #include <sys/protosw.h>
132 #include <sys/sbuf.h>
133 #include <sys/socket.h>
134 #include <sys/socketvar.h>
135 #include <sys/resourcevar.h>
136 #include <net/route.h>
137 #include <sys/signalvar.h>
138 #include <sys/stat.h>
139 #include <sys/sx.h>
140 #include <sys/sysctl.h>
141 #include <sys/taskqueue.h>
142 #include <sys/uio.h>
143 #include <sys/un.h>
144 #include <sys/unpcb.h>
145 #include <sys/jail.h>
146 #include <sys/syslog.h>
147 #include <netinet/in.h>
148 #include <netinet/in_pcb.h>
149 #include <netinet/tcp.h>
150
151 #include <net/vnet.h>
152
153 #include <security/mac/mac_framework.h>
154 #include <security/mac/mac_internal.h>
155
156 #include <vm/uma.h>
157
158 #ifdef COMPAT_FREEBSD32
159 #include <sys/mount.h>
160 #include <sys/sysent.h>
161 #include <compat/freebsd32/freebsd32.h>
162 #endif
163
/*
 * Forward declarations of file-local helpers.  Their bodies are not part of
 * the declarations below; they live elsewhere in this file.
 */
static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

/*
 * Lock, unlock and lock-assertion callbacks handed to knlist_init() for a
 * socket's read-side (so_rdsel) and write-side (so_wrsel) knote lists.
 */
static void	so_rdknl_lock(void *);
static void	so_rdknl_unlock(void *);
static void	so_rdknl_assert_lock(void *, int);
static void	so_wrknl_lock(void *);
static void	so_wrknl_unlock(void *);
static void	so_wrknl_assert_lock(void *, int);

/* kqueue filter callbacks referenced by the socket filterops tables. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_soempty(struct knote *kn, long hint);

/*
 * Runs the socket helper hooks identified by h_id; soalloc() calls it with
 * HHOOK_SOCKET_CREATE and sodealloc() with HHOOK_SOCKET_CLOSE.
 */
static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);

/* NOTE(review): presumably the socket fileops kqfilter entry point -- confirm. */
fo_kqfilter_t	soo_kqfilter;
180
/*
 * kqueue filter operation tables for sockets.  Each table is marked as
 * operating on a file descriptor (f_isfd) and supplies a detach and an
 * event callback.
 */

/* Read-side filter: detach via filt_sordetach(), events via filt_soread(). */
static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};

/* Write-side filter: detach via filt_sowdetach(), events via filt_sowrite(). */
static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};

/*
 * "Empty" filter: events via filt_soempty().  It shares the write-side
 * detach routine, filt_sowdetach(), with sowrite_filtops.
 */
static struct filterops soempty_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_soempty,
};
196
/*
 * Global socket generation counter.  soalloc() and sodealloc() increment it
 * under so_global_mtx and copy the new value into the socket's own
 * so_gencnt field.
 */
so_gen_t	so_gencnt;	/* generation count for sockets */

/* malloc(9) types for socket names and protocol control blocks. */
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/*
 * Assert that a vnet context (curvnet) is set; "so" only appears in the
 * assertion message.  Note that the expansion already ends in a semicolon.
 */
#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));

/*
 * Per-vnet array of socket helper hook heads, one slot per hook subtype in
 * the range 0..HHOOK_SOCKET_LAST (filled in by socket_hhook_register()).
 */
VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
#define	V_socket_hhh		VNET(socket_hhh)

/*
 * Limit on the number of connections in the listen queue waiting
 * for accept(2).
 * NB: The original sysctl somaxconn is still available but hidden
 * to prevent confusion about the actual purpose of this number.
 */
VNET_DEFINE_STATIC(u_int, somaxconn) = SOMAXCONN;
#define	V_somaxconn	VNET(somaxconn)
217
218 static int
sysctl_somaxconn(SYSCTL_HANDLER_ARGS)219 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
220 {
221 int error;
222 u_int val;
223
224 val = V_somaxconn;
225 error = sysctl_handle_int(oidp, &val, 0, req);
226 if (error || !req->newptr )
227 return (error);
228
229 /*
230 * The purpose of the UINT_MAX / 3 limit, is so that the formula
231 * 3 * sol_qlimit / 2
232 * below, will not overflow.
233 */
234
235 if (val < 1 || val > UINT_MAX / 3)
236 return (EINVAL);
237
238 V_somaxconn = val;
239 return (0);
240 }
241 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
242 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, sizeof(u_int),
243 sysctl_somaxconn, "IU",
244 "Maximum listen socket pending connection accept queue size");
245 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
246 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0,
247 sizeof(u_int), sysctl_somaxconn, "IU",
248 "Maximum listen socket pending connection accept queue size (compat)");
249
250 static u_int numopensockets;
251 static int
sysctl_numopensockets(SYSCTL_HANDLER_ARGS)252 sysctl_numopensockets(SYSCTL_HANDLER_ARGS)
253 {
254 u_int val;
255
256 #ifdef VIMAGE
257 if(!IS_DEFAULT_VNET(curvnet))
258 val = curvnet->vnet_sockcnt;
259 else
260 #endif
261 val = numopensockets;
262 return (sysctl_handle_int(oidp, &val, 0, req));
263 }
264 SYSCTL_PROC(_kern_ipc, OID_AUTO, numopensockets,
265 CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_VNET, 0, sizeof(u_int),
266 sysctl_numopensockets, "IU", "Number of open sockets");
267
/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.  soalloc() and sodealloc() also adjust the per-vnet
 * vnet_sockcnt while holding it when VIMAGE is configured.
 *
 * NOTE(review): the lock name string "so_glabel" looks like a typo for
 * "so_global"; it is left untouched because the name is visible at runtime.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * General IPC sysctl name space, used by sockets and a variety of other IPC
 * types.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IPC");

/*
 * Initialize the socket subsystem and set up the socket
 * memory allocator.
 */
static uma_zone_t socket_zone;	/* UMA zone backing struct socket */
int	maxsockets;		/* limit applied to socket_zone */
295
296 static void
socket_zone_change(void * tag)297 socket_zone_change(void *tag)
298 {
299
300 maxsockets = uma_zone_set_max(socket_zone, maxsockets);
301 }
302
303 static void
socket_hhook_register(int subtype)304 socket_hhook_register(int subtype)
305 {
306
307 if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
308 &V_socket_hhh[subtype],
309 HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
310 printf("%s: WARNING: unable to register hook\n", __func__);
311 }
312
313 static void
socket_hhook_deregister(int subtype)314 socket_hhook_deregister(int subtype)
315 {
316
317 if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
318 printf("%s: WARNING: unable to deregister hook\n", __func__);
319 }
320
/*
 * Boot-time initialisation of the socket allocator: create the "socket" UMA
 * zone, apply the maxsockets limit to it, install the warning printed when
 * the limit is hit, and arrange for socket_zone_change() to run whenever
 * maxsockets_change is invoked.  The tag argument is unused.
 */
static void
socket_init(void *tag)
{

	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	/* uma_zone_set_max() returns the limit actually in effect. */
	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
	    EVENTHANDLER_PRI_FIRST);
}
SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
333
334 static void
socket_vnet_init(const void * unused __unused)335 socket_vnet_init(const void *unused __unused)
336 {
337 int i;
338
339 /* We expect a contiguous range */
340 for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
341 socket_hhook_register(i);
342 }
343 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
344 socket_vnet_init, NULL);
345
346 static void
socket_vnet_uninit(const void * unused __unused)347 socket_vnet_uninit(const void *unused __unused)
348 {
349 int i;
350
351 for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
352 socket_hhook_deregister(i);
353 }
354 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
355 socket_vnet_uninit, NULL);
356
357 /*
358 * Initialise maxsockets. This SYSINIT must be run after
359 * tunable_mbinit().
360 */
361 static void
init_maxsockets(void * ignored)362 init_maxsockets(void *ignored)
363 {
364
365 TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
366 maxsockets = imax(maxsockets, maxfiles);
367 }
368 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
369
370 /*
371 * Sysctl to get and set the maximum global sockets limit. Notify protocols
372 * of the change so that they can update their dependent limits as required.
373 */
374 static int
sysctl_maxsockets(SYSCTL_HANDLER_ARGS)375 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
376 {
377 int error, newmaxsockets;
378
379 newmaxsockets = maxsockets;
380 error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
381 if (error == 0 && req->newptr && newmaxsockets != maxsockets) {
382 if (newmaxsockets > maxsockets &&
383 newmaxsockets <= maxfiles) {
384 maxsockets = newmaxsockets;
385 EVENTHANDLER_INVOKE(maxsockets_change);
386 } else
387 error = EINVAL;
388 }
389 return (error);
390 }
391 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
392 CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE,
393 &maxsockets, 0, sysctl_maxsockets, "IU",
394 "Maximum number of sockets available");
395
396 /*
397 * Socket operation routines. These routines are called by the routines in
398 * sys_socket.c or from a system process, and implement the semantics of
399 * socket operations by switching out to the protocol specific routines.
400 */
401
402 /*
403 * Get a socket structure from our zone, and initialize it. Note that it
404 * would probably be better to allocate socket and PCB at the same time, but
405 * I'm not convinced that all the protocols can be easily modified to do
406 * this.
407 *
408 * soalloc() returns a socket with a ref count of 0.
409 */
410 static struct socket *
soalloc(struct vnet * vnet)411 soalloc(struct vnet *vnet)
412 {
413 struct socket *so;
414
415 so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
416 if (so == NULL)
417 return (NULL);
418 #ifdef MAC
419 if (mac_socket_init(so, M_NOWAIT) != 0) {
420 uma_zfree(socket_zone, so);
421 return (NULL);
422 }
423 #endif
424 if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
425 uma_zfree(socket_zone, so);
426 return (NULL);
427 }
428
429 /*
430 * The socket locking protocol allows to lock 2 sockets at a time,
431 * however, the first one must be a listening socket. WITNESS lacks
432 * a feature to change class of an existing lock, so we use DUPOK.
433 */
434 mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
435 SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
436 SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
437 so->so_rcv.sb_sel = &so->so_rdsel;
438 so->so_snd.sb_sel = &so->so_wrsel;
439 sx_init(&so->so_snd.sb_sx, "so_snd_sx");
440 sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
441 TAILQ_INIT(&so->so_snd.sb_aiojobq);
442 TAILQ_INIT(&so->so_rcv.sb_aiojobq);
443 TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
444 TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
445 #ifdef VIMAGE
446 VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
447 __func__, __LINE__, so));
448 so->so_vnet = vnet;
449 #endif
450 /* We shouldn't need the so_global_mtx */
451 if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
452 /* Do we need more comprehensive error returns? */
453 uma_zfree(socket_zone, so);
454 return (NULL);
455 }
456 mtx_lock(&so_global_mtx);
457 so->so_gencnt = ++so_gencnt;
458 ++numopensockets;
459 #ifdef VIMAGE
460 vnet->vnet_sockcnt++;
461 #endif
462 mtx_unlock(&so_global_mtx);
463
464 return (so);
465 }
466
/*
 * Free the storage associated with a socket at the socket layer, tear down
 * locks, labels, etc.  All protocol state is assumed already to have been
 * torn down (and possibly never set up) by the caller.
 *
 * The socket must have a reference count of zero and no attached PCB; both
 * conditions are asserted.  The socket's credential reference is dropped
 * here, so so_cred must be valid on entry.
 */
static void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));

	/* Bump the generation count and drop the global/per-vnet counters. */
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	--numopensockets;	/* Could be below, but faster here. */
#ifdef VIMAGE
	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
	    __func__, __LINE__, so));
	so->so_vnet->vnet_sockcnt--;
#endif
	mtx_unlock(&so_global_mtx);
#ifdef MAC
	mac_socket_destroy(so);
#endif
	/* Let the helper hooks observe the close before their OSD goes away. */
	hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);

	khelp_destroy_osd(&so->osd);
	if (SOLISTENING(so)) {
		/* Listening sockets only carry an optional accept filter. */
		if (so->sol_accept_filter != NULL)
			accept_filt_setopt(so, NULL);
	} else {
		/*
		 * Non-listening sockets: return any charged buffer space to
		 * the owning uid and destroy the socket buffer locks.
		 */
		if (so->so_rcv.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
		if (so->so_snd.sb_hiwat)
			(void)chgsbsize(so->so_cred->cr_uidinfo,
			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
		sx_destroy(&so->so_snd.sb_sx);
		sx_destroy(&so->so_rcv.sb_sx);
		SOCKBUF_LOCK_DESTROY(&so->so_snd);
		SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	}
	crfree(so->so_cred);
	mtx_destroy(&so->so_lock);
	uma_zfree(socket_zone, so);
}
513
514 /*
515 * socreate returns a socket with a ref count of 1. The socket should be
516 * closed with soclose().
517 */
518 int
socreate(int dom,struct socket ** aso,int type,int proto,struct ucred * cred,struct thread * td)519 socreate(int dom, struct socket **aso, int type, int proto,
520 struct ucred *cred, struct thread *td)
521 {
522 struct protosw *prp;
523 struct socket *so;
524 int error;
525
526 if (proto)
527 prp = pffindproto(dom, proto, type);
528 else
529 prp = pffindtype(dom, type);
530
531 if (prp == NULL) {
532 /* No support for domain. */
533 if (pffinddomain(dom) == NULL)
534 return (EAFNOSUPPORT);
535 /* No support for socket type. */
536 if (proto == 0 && type != 0)
537 return (EPROTOTYPE);
538 return (EPROTONOSUPPORT);
539 }
540 if (prp->pr_usrreqs->pru_attach == NULL ||
541 prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
542 return (EPROTONOSUPPORT);
543
544 if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
545 return (EPROTONOSUPPORT);
546
547 if (prp->pr_type != type)
548 return (EPROTOTYPE);
549 so = soalloc(CRED_TO_VNET(cred));
550 if (so == NULL)
551 return (ENOBUFS);
552
553 so->so_type = type;
554 so->so_cred = crhold(cred);
555 if ((prp->pr_domain->dom_family == PF_INET) ||
556 (prp->pr_domain->dom_family == PF_INET6) ||
557 (prp->pr_domain->dom_family == PF_ROUTE))
558 so->so_fibnum = td->td_proc->p_fibnum;
559 else
560 so->so_fibnum = 0;
561 so->so_proto = prp;
562 #ifdef MAC
563 mac_socket_create(cred, so);
564 #endif
565 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
566 so_rdknl_assert_lock);
567 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
568 so_wrknl_assert_lock);
569 /*
570 * Auto-sizing of socket buffers is managed by the protocols and
571 * the appropriate flags must be set in the pru_attach function.
572 */
573 CURVNET_SET(so->so_vnet);
574 error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
575 CURVNET_RESTORE();
576 if (error) {
577 sodealloc(so);
578 return (error);
579 }
580 soref(so);
581 *aso = so;
582 return (0);
583 }
584
#ifdef REGRESSION
/*
 * Regression-test knob: when non-zero (the default), sonewconn() performs
 * its early listen queue limit test; clearing it disables that test.
 */
static int regression_sonewconn_earlytest = 1;
SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
#endif
590
/*
 * Priority at which sonewconn() logs listen queue overflows.  A negative
 * value suppresses the message entirely.
 */
static int sooverprio = LOG_DEBUG;
SYSCTL_INT(_kern_ipc, OID_AUTO, sooverprio, CTLFLAG_RW,
    &sooverprio, 0, "Log priority for listen socket overflows: 0..7 or -1 to disable");

/*
 * Minimum interval between two overflow log messages for the same listen
 * socket; sonewconn() passes it to ratecheck().  Defaults to 60 seconds.
 */
static struct timeval overinterval = { 60, 0 };
SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
    &overinterval,
    "Delay in seconds between warnings for listen socket overflows");
599
600 /*
601 * When an attempt at a new connection is noted on a socket which accepts
602 * connections, sonewconn is called. If the connection is possible (subject
603 * to space constraints, etc.) then we allocate a new structure, properly
604 * linked into the data structure of the original socket, and return this.
605 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
606 *
607 * Note: the ref count on the socket is 0 on return.
608 */
609 struct socket *
sonewconn(struct socket * head,int connstatus)610 sonewconn(struct socket *head, int connstatus)
611 {
612 struct sbuf descrsb;
613 struct socket *so;
614 int len, overcount;
615 u_int qlen;
616 const char localprefix[] = "local:";
617 char descrbuf[SUNPATHLEN + sizeof(localprefix)];
618 #if defined(INET6)
619 char addrbuf[INET6_ADDRSTRLEN];
620 #elif defined(INET)
621 char addrbuf[INET_ADDRSTRLEN];
622 #endif
623 bool dolog, over;
624
625 SOLISTEN_LOCK(head);
626 over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
627 #ifdef REGRESSION
628 if (regression_sonewconn_earlytest && over) {
629 #else
630 if (over) {
631 #endif
632 head->sol_overcount++;
633 dolog = (sooverprio >= 0) &&
634 !!ratecheck(&head->sol_lastover, &overinterval);
635
636 /*
637 * If we're going to log, copy the overflow count and queue
638 * length from the listen socket before dropping the lock.
639 * Also, reset the overflow count.
640 */
641 if (dolog) {
642 overcount = head->sol_overcount;
643 head->sol_overcount = 0;
644 qlen = head->sol_qlen;
645 }
646 SOLISTEN_UNLOCK(head);
647
648 if (dolog) {
649 /*
650 * Try to print something descriptive about the
651 * socket for the error message.
652 */
653 sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
654 SBUF_FIXEDLEN);
655 switch (head->so_proto->pr_domain->dom_family) {
656 #if defined(INET) || defined(INET6)
657 #ifdef INET
658 case AF_INET:
659 #endif
660 #ifdef INET6
661 case AF_INET6:
662 if (head->so_proto->pr_domain->dom_family ==
663 AF_INET6 ||
664 (sotoinpcb(head)->inp_inc.inc_flags &
665 INC_ISIPV6)) {
666 ip6_sprintf(addrbuf,
667 &sotoinpcb(head)->inp_inc.inc6_laddr);
668 sbuf_printf(&descrsb, "[%s]", addrbuf);
669 } else
670 #endif
671 {
672 #ifdef INET
673 inet_ntoa_r(
674 sotoinpcb(head)->inp_inc.inc_laddr,
675 addrbuf);
676 sbuf_cat(&descrsb, addrbuf);
677 #endif
678 }
679 sbuf_printf(&descrsb, ":%hu (proto %u)",
680 ntohs(sotoinpcb(head)->inp_inc.inc_lport),
681 head->so_proto->pr_protocol);
682 break;
683 #endif /* INET || INET6 */
684 case AF_UNIX:
685 sbuf_cat(&descrsb, localprefix);
686 if (sotounpcb(head)->unp_addr != NULL)
687 len =
688 sotounpcb(head)->unp_addr->sun_len -
689 offsetof(struct sockaddr_un,
690 sun_path);
691 else
692 len = 0;
693 if (len > 0)
694 sbuf_bcat(&descrsb,
695 sotounpcb(head)->unp_addr->sun_path,
696 len);
697 else
698 sbuf_cat(&descrsb, "(unknown)");
699 break;
700 }
701
702 /*
703 * If we can't print something more specific, at least
704 * print the domain name.
705 */
706 if (sbuf_finish(&descrsb) != 0 ||
707 sbuf_len(&descrsb) <= 0) {
708 sbuf_clear(&descrsb);
709 sbuf_cat(&descrsb,
710 head->so_proto->pr_domain->dom_name ?:
711 "unknown");
712 sbuf_finish(&descrsb);
713 }
714 KASSERT(sbuf_len(&descrsb) > 0,
715 ("%s: sbuf creation failed", __func__));
716 log(LOG_PRI(sooverprio),
717 "%s: pcb %p (%s): Listen queue overflow: "
718 "%i already in queue awaiting acceptance "
719 "(%d occurrences)\n",
720 __func__, head->so_pcb, sbuf_data(&descrsb),
721 qlen, overcount);
722 sbuf_delete(&descrsb);
723
724 overcount = 0;
725 }
726
727 return (NULL);
728 }
729 SOLISTEN_UNLOCK(head);
730 VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
731 __func__, head));
732 so = soalloc(head->so_vnet);
733 if (so == NULL) {
734 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
735 "limit reached or out of memory\n",
736 __func__, head->so_pcb);
737 return (NULL);
738 }
739 so->so_listen = head;
740 so->so_type = head->so_type;
741 so->so_options = head->so_options & ~SO_ACCEPTCONN;
742 so->so_linger = head->so_linger;
743 so->so_state = head->so_state | SS_NOFDREF;
744 so->so_fibnum = head->so_fibnum;
745 so->so_proto = head->so_proto;
746 so->so_cred = crhold(head->so_cred);
747 #ifdef MAC
748 mac_socket_newconn(head, so);
749 #endif
750 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
751 so_rdknl_assert_lock);
752 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
753 so_wrknl_assert_lock);
754 VNET_SO_ASSERT(head);
755 if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
756 sodealloc(so);
757 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
758 __func__, head->so_pcb);
759 return (NULL);
760 }
761 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
762 sodealloc(so);
763 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
764 __func__, head->so_pcb);
765 return (NULL);
766 }
767 so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
768 so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
769 so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
770 so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
771 so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
772 so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
773
774 SOLISTEN_LOCK(head);
775 if (head->sol_accept_filter != NULL)
776 connstatus = 0;
777 so->so_state |= connstatus;
778 soref(head); /* A socket on (in)complete queue refs head. */
779 if (connstatus) {
780 TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
781 so->so_qstate = SQ_COMP;
782 head->sol_qlen++;
783 solisten_wakeup(head); /* unlocks */
784 } else {
785 /*
786 * Keep removing sockets from the head until there's room for
787 * us to insert on the tail. In pre-locking revisions, this
788 * was a simple if(), but as we could be racing with other
789 * threads and soabort() requires dropping locks, we must
790 * loop waiting for the condition to be true.
791 */
792 while (head->sol_incqlen > head->sol_qlimit) {
793 struct socket *sp;
794
795 sp = TAILQ_FIRST(&head->sol_incomp);
796 TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
797 head->sol_incqlen--;
798 SOCK_LOCK(sp);
799 sp->so_qstate = SQ_NONE;
800 sp->so_listen = NULL;
801 SOCK_UNLOCK(sp);
802 sorele(head); /* does SOLISTEN_UNLOCK, head stays */
803 soabort(sp);
804 SOLISTEN_LOCK(head);
805 }
806 TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
807 so->so_qstate = SQ_INCOMP;
808 head->sol_incqlen++;
809 SOLISTEN_UNLOCK(head);
810 }
811 return (so);
812 }
813
814 #if defined(SCTP) || defined(SCTP_SUPPORT)
815 /*
816 * Socket part of sctp_peeloff(). Detach a new socket from an
817 * association. The new socket is returned with a reference.
818 */
819 struct socket *
820 sopeeloff(struct socket *head)
821 {
822 struct socket *so;
823
824 VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
825 __func__, __LINE__, head));
826 so = soalloc(head->so_vnet);
827 if (so == NULL) {
828 log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
829 "limit reached or out of memory\n",
830 __func__, head->so_pcb);
831 return (NULL);
832 }
833 so->so_type = head->so_type;
834 so->so_options = head->so_options;
835 so->so_linger = head->so_linger;
836 so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
837 so->so_fibnum = head->so_fibnum;
838 so->so_proto = head->so_proto;
839 so->so_cred = crhold(head->so_cred);
840 #ifdef MAC
841 mac_socket_newconn(head, so);
842 #endif
843 knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
844 so_rdknl_assert_lock);
845 knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
846 so_wrknl_assert_lock);
847 VNET_SO_ASSERT(head);
848 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
849 sodealloc(so);
850 log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
851 __func__, head->so_pcb);
852 return (NULL);
853 }
854 if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
855 sodealloc(so);
856 log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
857 __func__, head->so_pcb);
858 return (NULL);
859 }
860 so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
861 so->so_snd.sb_lowat = head->so_snd.sb_lowat;
862 so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
863 so->so_snd.sb_timeo = head->so_snd.sb_timeo;
864 so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
865 so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
866
867 soref(so);
868
869 return (so);
870 }
871 #endif /* SCTP */
872
873 int
874 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
875 {
876 int error;
877
878 CURVNET_SET(so->so_vnet);
879 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
880 CURVNET_RESTORE();
881 return (error);
882 }
883
884 int
885 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
886 {
887 int error;
888
889 CURVNET_SET(so->so_vnet);
890 error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
891 CURVNET_RESTORE();
892 return (error);
893 }
894
895 /*
896 * solisten() transitions a socket from a non-listening state to a listening
897 * state, but can also be used to update the listen queue depth on an
898 * existing listen socket. The protocol will call back into the sockets
899 * layer using solisten_proto_check() and solisten_proto() to check and set
900 * socket-layer listen state. Call backs are used so that the protocol can
901 * acquire both protocol and socket layer locks in whatever order is required
902 * by the protocol.
903 *
904 * Protocol implementors are advised to hold the socket lock across the
905 * socket-layer test and set to avoid races at the socket layer.
906 */
907 int
908 solisten(struct socket *so, int backlog, struct thread *td)
909 {
910 int error;
911
912 CURVNET_SET(so->so_vnet);
913 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
914 CURVNET_RESTORE();
915 return (error);
916 }
917
918 int
919 solisten_proto_check(struct socket *so)
920 {
921
922 SOCK_LOCK_ASSERT(so);
923
924 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
925 SS_ISDISCONNECTING))
926 return (EINVAL);
927 return (0);
928 }
929
/*
 * Socket-layer half of listen(2), called by protocols with the socket lock
 * held.  If the socket is not yet listening it is converted: the socket
 * buffers are destroyed and their tunables (low/high water marks, flags and
 * timeouts) are carried over into the sol_* listen fields, the accept
 * queues are initialised and SO_ACCEPTCONN is set.  In all cases the queue
 * limit is then set from backlog, clamped to the somaxconn limit.
 */
void
solisten_proto(struct socket *so, int backlog)
{
	int sbrcv_lowat, sbsnd_lowat;
	u_int sbrcv_hiwat, sbsnd_hiwat;
	short sbrcv_flags, sbsnd_flags;
	sbintime_t sbrcv_timeo, sbsnd_timeo;

	SOCK_LOCK_ASSERT(so);

	/* Already listening: only the queue limit needs updating. */
	if (SOLISTENING(so))
		goto listening;

	/*
	 * Change this socket to listening state.
	 *
	 * The buffer parameters are saved in locals first because the socket
	 * buffers are destroyed (and, under INVARIANTS, zeroed) before the
	 * sol_* fields are written.
	 */
	sbrcv_lowat = so->so_rcv.sb_lowat;
	sbsnd_lowat = so->so_snd.sb_lowat;
	sbrcv_hiwat = so->so_rcv.sb_hiwat;
	sbsnd_hiwat = so->so_snd.sb_hiwat;
	sbrcv_flags = so->so_rcv.sb_flags;
	sbsnd_flags = so->so_snd.sb_flags;
	sbrcv_timeo = so->so_rcv.sb_timeo;
	sbsnd_timeo = so->so_snd.sb_timeo;

#ifdef MAC
	mac_socketpeer_label_free(so->so_peerlabel);
#endif

	sbdestroy(&so->so_snd, so);
	sbdestroy(&so->so_rcv, so);
	sx_destroy(&so->so_snd.sb_sx);
	sx_destroy(&so->so_rcv.sb_sx);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);

#ifdef INVARIANTS
	/* Zero everything from so_rcv to the end of the structure. */
	bzero(&so->so_rcv,
	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
#endif

	so->sol_sbrcv_lowat = sbrcv_lowat;
	so->sol_sbsnd_lowat = sbsnd_lowat;
	so->sol_sbrcv_hiwat = sbrcv_hiwat;
	so->sol_sbsnd_hiwat = sbsnd_hiwat;
	so->sol_sbrcv_flags = sbrcv_flags;
	so->sol_sbsnd_flags = sbsnd_flags;
	so->sol_sbrcv_timeo = sbrcv_timeo;
	so->sol_sbsnd_timeo = sbsnd_timeo;

	/* Start with empty incomplete and completed connection queues. */
	so->sol_qlen = so->sol_incqlen = 0;
	TAILQ_INIT(&so->sol_incomp);
	TAILQ_INIT(&so->sol_comp);

	so->sol_accept_filter = NULL;
	so->sol_accept_filter_arg = NULL;
	so->sol_accept_filter_str = NULL;

	so->sol_upcall = NULL;
	so->sol_upcallarg = NULL;

	so->so_options |= SO_ACCEPTCONN;

listening:
	/* A negative or too large backlog selects the somaxconn limit. */
	if (backlog < 0 || backlog > V_somaxconn)
		backlog = V_somaxconn;
	so->sol_qlimit = backlog;
}
998
999 /*
1000 * Wakeup listeners/subsystems once we have a complete connection.
1001 * Enters with lock, returns unlocked.
1002 */
1003 void
1004 solisten_wakeup(struct socket *sol)
1005 {
1006
1007 if (sol->sol_upcall != NULL)
1008 (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
1009 else {
1010 selwakeuppri(&sol->so_rdsel, PSOCK);
1011 KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
1012 }
1013 SOLISTEN_UNLOCK(sol);
1014 wakeup_one(&sol->sol_comp);
1015 if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
1016 pgsigio(&sol->so_sigio, SIGIO, 0);
1017 }
1018
1019 /*
1020 * Return single connection off a listening socket queue. Main consumer of
1021 * the function is kern_accept4(). Some modules, that do their own accept
1022 * management also use the function.
1023 *
1024 * Listening socket must be locked on entry and is returned unlocked on
1025 * return.
1026 * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
1027 */
1028 int
1029 solisten_dequeue(struct socket *head, struct socket **ret, int flags)
1030 {
1031 struct socket *so;
1032 int error;
1033
1034 SOLISTEN_LOCK_ASSERT(head);
1035
1036 while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
1037 head->so_error == 0) {
1038 error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
1039 "accept", 0);
1040 if (error != 0) {
1041 SOLISTEN_UNLOCK(head);
1042 return (error);
1043 }
1044 }
1045 if (head->so_error) {
1046 error = head->so_error;
1047 head->so_error = 0;
1048 } else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
1049 error = EWOULDBLOCK;
1050 else
1051 error = 0;
1052 if (error) {
1053 SOLISTEN_UNLOCK(head);
1054 return (error);
1055 }
1056 so = TAILQ_FIRST(&head->sol_comp);
1057 SOCK_LOCK(so);
1058 KASSERT(so->so_qstate == SQ_COMP,
1059 ("%s: so %p not SQ_COMP", __func__, so));
1060 soref(so);
1061 head->sol_qlen--;
1062 so->so_qstate = SQ_NONE;
1063 so->so_listen = NULL;
1064 TAILQ_REMOVE(&head->sol_comp, so, so_list);
1065 if (flags & ACCEPT4_INHERIT)
1066 so->so_state |= (head->so_state & SS_NBIO);
1067 else
1068 so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
1069 SOCK_UNLOCK(so);
1070 sorele(head);
1071
1072 *ret = so;
1073 return (0);
1074 }
1075
/*
 * Evaluate the reference count and named references on a socket; if no
 * references remain, free it.  This should be called whenever a reference is
 * released, such as in sorele(), but also when named reference flags are
 * cleared in socket or protocol code.
 *
 * sofree() will free the socket if:
 *
 * - There are no outstanding file descriptor references or related consumers
 *   (so_count == 0).
 *
 * - The socket has been closed by user space, if ever open (SS_NOFDREF).
 *
 * - The protocol does not have an outstanding strong reference on the socket
 *   (SS_PROTOREF).
 *
 * - The socket is not in a completed connection queue, so a process has been
 *   notified that it is present.  If it is removed, the user process may
 *   block in accept() despite select() saying the socket was ready.
 *
 * The socket lock must be held on entry (asserted); it is released on every
 * path, whether or not the socket ends up being freed.
 */
void
sofree(struct socket *so)
{
	struct protosw *pr = so->so_proto;
	bool last __diagused;

	SOCK_LOCK_ASSERT(so);

	/*
	 * Not freeable yet unless SS_NOFDREF is set with SS_PROTOREF clear,
	 * the reference count is zero, and the socket is not sitting on a
	 * completed connection queue.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_PROTOREF)) != SS_NOFDREF ||
	    refcount_load(&so->so_count) != 0 || so->so_qstate == SQ_COMP) {
		SOCK_UNLOCK(so);
		return;
	}

	if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
		struct socket *sol;

		sol = so->so_listen;
		KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));

		/*
		 * To solve race between close of a listening socket and
		 * a socket on its incomplete queue, we need to lock both.
		 * The order is first listening socket, then regular.
		 * Since we don't have SS_NOFDREF neither SS_PROTOREF, this
		 * function and the listening socket are the only pointers
		 * to so.  To preserve so and sol, we reference both and then
		 * relock.
		 * After relock the socket may not move to so_comp since it
		 * doesn't have PCB already, but it may be removed from
		 * so_incomp.  If that happens, we share responsibility on
		 * freeing the socket, but soclose() has already removed
		 * it from queue.
		 */
		soref(sol);
		soref(so);
		SOCK_UNLOCK(so);
		SOLISTEN_LOCK(sol);
		SOCK_LOCK(so);
		if (so->so_qstate == SQ_INCOMP) {
			/* Still queued: unlink from the listener ourselves. */
			KASSERT(so->so_listen == sol,
			    ("%s: so %p migrated out of sol %p",
			    __func__, so, sol));
			TAILQ_REMOVE(&sol->sol_incomp, so, so_list);
			sol->sol_incqlen--;
			/*
			 * Release a reference on the listener; it cannot be
			 * the last one because of the soref(sol) above.
			 */
			last = refcount_release(&sol->so_count);
			KASSERT(!last, ("%s: released last reference for %p",
			    __func__, sol));
			so->so_qstate = SQ_NONE;
			so->so_listen = NULL;
		} else
			KASSERT(so->so_listen == NULL,
			    ("%s: so %p not on (in)comp with so_listen",
			    __func__, so));
		/* Drop the temporary listener reference taken above. */
		sorele(sol);
		KASSERT(refcount_load(&so->so_count) == 1,
		    ("%s: so %p count %u", __func__, so, so->so_count));
		/*
		 * Undo the temporary soref(so) by resetting the count
		 * directly; the assertion above expects it to be exactly 1.
		 */
		so->so_count = 0;
	}
	if (SOLISTENING(so))
		so->so_error = ECONNABORTED;
	SOCK_UNLOCK(so);

	/* Run the optional socket destructor. */
	if (so->so_dtor != NULL)
		so->so_dtor(so);

	VNET_SO_ASSERT(so);
	/* Let the domain dispose of in-flight rights for PR_RIGHTS protocols. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(so);
	if (pr->pr_usrreqs->pru_detach != NULL)
		(*pr->pr_usrreqs->pru_detach)(so);

	/*
	 * From this point on, we assume that no other references to this
	 * socket exist anywhere else in the stack.  Therefore, no locks need
	 * to be acquired or held.
	 *
	 * We used to do a lot of socket buffer and socket locking here, as
	 * well as invoke sorflush() and perform wakeups.  The direct call to
	 * dom_dispose() and sbdestroy() are an inlining of what was
	 * necessary from sorflush().
	 *
	 * Notice that the socket buffer and kqueue state are torn down
	 * before calling pru_detach.  This means that protocols should not
	 * assume they can perform socket wakeups, etc, in their detach code.
	 *
	 * NOTE(review): in the code as written here, pru_detach is invoked
	 * above, before the socket buffers and kqueue state are destroyed
	 * below, so the ordering sentence in the previous paragraph looks
	 * stale -- confirm and reword.
	 */
	if (!SOLISTENING(so)) {
		sbdestroy(&so->so_snd, so);
		sbdestroy(&so->so_rcv, so);
	}
	seldrain(&so->so_rdsel);
	seldrain(&so->so_wrsel);
	knlist_destroy(&so->so_rdsel.si_note);
	knlist_destroy(&so->so_wrsel.si_note);
	sodealloc(so);
}
1192
/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be called
 * prior to the ref count reaching zero.  The actual socket structure will
 * not be freed until the ref count reaches zero.
 *
 * Returns 0, or the error from sodisconnect() (ENOTCONN is mapped to 0) or
 * from the interrupted/expired linger sleep.
 */
int
soclose(struct socket *so)
{
	struct accept_queue lqueue;
	struct socket *sp, *tsp;
	int error = 0;
	bool last __diagused;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	CURVNET_SET(so->so_vnet);
	funsetown(&so->so_sigio);
	if (so->so_state & SS_ISCONNECTED) {
		/* Start a disconnect unless one is already in progress. */
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error) {
				/* Not being connected is not a close error. */
				if (error == ENOTCONN)
					error = 0;
				goto drop;
			}
		}

		if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
			/* A non-blocking socket never waits for the linger. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Sleep on so_timeo, so_linger * hz ticks at a time,
			 * until the socket is no longer connected.  Any
			 * nonzero tsleep() result ends the wait and becomes
			 * the return value.
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}

drop:
	if (so->so_proto->pr_usrreqs->pru_close != NULL)
		(*so->so_proto->pr_usrreqs->pru_close)(so);

	TAILQ_INIT(&lqueue);
	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		/*
		 * Move both the incomplete and the complete queue onto the
		 * local list so the queued sockets can be aborted after the
		 * listening socket itself has been released.
		 */
		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);

		so->sol_qlen = so->sol_incqlen = 0;

		/*
		 * Detach every queued socket from the listener and release
		 * one reference on the listener per queued socket; none of
		 * them may be the last reference.
		 */
		TAILQ_FOREACH(sp, &lqueue, so_list) {
			SOCK_LOCK(sp);
			sp->so_qstate = SQ_NONE;
			sp->so_listen = NULL;
			SOCK_UNLOCK(sp);
			last = refcount_release(&so->so_count);
			KASSERT(!last, ("%s: released last reference for %p",
			    __func__, so));
		}
	}
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	/* Record that the file descriptor reference is gone, then release. */
	so->so_state |= SS_NOFDREF;
	sorele(so);
	/*
	 * 'so' is not touched past this point.  Abort the formerly queued
	 * sockets that nobody else holds a reference on.
	 */
	TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp) {
		SOCK_LOCK(sp);
		if (refcount_load(&sp->so_count) == 0) {
			SOCK_UNLOCK(sp);
			soabort(sp);
		} else {
			/* See the handling of queued sockets in sofree(). */
			SOCK_UNLOCK(sp);
		}
	}
	CURVNET_RESTORE();
	return (error);
}
1275
1276 /*
1277 * soabort() is used to abruptly tear down a connection, such as when a
1278 * resource limit is reached (listen queue depth exceeded), or if a listen
1279 * socket is closed while there are sockets waiting to be accepted.
1280 *
1281 * This interface is tricky, because it is called on an unreferenced socket,
1282 * and must be called only by a thread that has actually removed the socket
1283 * from the listen queue it was on, or races with other threads are risked.
1284 *
1285 * This interface will call into the protocol code, so must not be called
1286 * with any socket locks held. Protocols do call it while holding their own
1287 * recursible protocol mutexes, but this is something that should be subject
1288 * to review in the future.
1289 */
1290 void
1291 soabort(struct socket *so)
1292 {
1293
1294 /*
1295 * In as much as is possible, assert that no references to this
1296 * socket are held. This is not quite the same as asserting that the
1297 * current thread is responsible for arranging for no references, but
1298 * is as close as we can get for now.
1299 */
1300 KASSERT(so->so_count == 0, ("soabort: so_count"));
1301 KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
1302 KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
1303 VNET_SO_ASSERT(so);
1304
1305 if (so->so_proto->pr_usrreqs->pru_abort != NULL)
1306 (*so->so_proto->pr_usrreqs->pru_abort)(so);
1307 SOCK_LOCK(so);
1308 sofree(so);
1309 }
1310
1311 int
1312 soaccept(struct socket *so, struct sockaddr **nam)
1313 {
1314 int error;
1315
1316 SOCK_LOCK(so);
1317 KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
1318 so->so_state &= ~SS_NOFDREF;
1319 SOCK_UNLOCK(so);
1320
1321 CURVNET_SET(so->so_vnet);
1322 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1323 CURVNET_RESTORE();
1324 return (error);
1325 }
1326
1327 int
1328 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
1329 {
1330
1331 return (soconnectat(AT_FDCWD, so, nam, td));
1332 }
1333
1334 int
1335 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
1336 {
1337 int error;
1338
1339 /* XXXMJ racy */
1340 if (SOLISTENING(so))
1341 return (EOPNOTSUPP);
1342
1343 CURVNET_SET(so->so_vnet);
1344 /*
1345 * If protocol is connection-based, can only connect once.
1346 * Otherwise, if connected, try to disconnect first. This allows
1347 * user to disconnect by connecting to, e.g., a null address.
1348 */
1349 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1350 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1351 (error = sodisconnect(so)))) {
1352 error = EISCONN;
1353 } else {
1354 /*
1355 * Prevent accumulated error from previous connection from
1356 * biting us.
1357 */
1358 so->so_error = 0;
1359 if (fd == AT_FDCWD) {
1360 error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
1361 nam, td);
1362 } else {
1363 error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
1364 so, nam, td);
1365 }
1366 }
1367 CURVNET_RESTORE();
1368
1369 return (error);
1370 }
1371
1372 int
1373 soconnect2(struct socket *so1, struct socket *so2)
1374 {
1375 int error;
1376
1377 CURVNET_SET(so1->so_vnet);
1378 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1379 CURVNET_RESTORE();
1380 return (error);
1381 }
1382
1383 int
1384 sodisconnect(struct socket *so)
1385 {
1386 int error;
1387
1388 if ((so->so_state & SS_ISCONNECTED) == 0)
1389 return (ENOTCONN);
1390 if (so->so_state & SS_ISDISCONNECTING)
1391 return (EALREADY);
1392 VNET_SO_ASSERT(so);
1393 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1394 return (error);
1395 }
1396
/*
 * Send one datagram on a SOCK_DGRAM socket whose protocol is PR_ATOMIC
 * (both asserted).
 *
 * The payload is described by 'uio' when it is non-NULL, otherwise by the
 * mbuf chain 'top'.  'addr' is the optional destination, 'control' optional
 * ancillary data, 'flags' the MSG_* send flags, and 'td' the calling thread
 * (may be NULL), used for resource accounting and passed to the protocol.
 *
 * This routine never waits for buffer space: a datagram that does not fit
 * in the available send buffer space fails with EMSGSIZE.
 *
 * Returns 0 or an errno value: EINVAL (negative length), EPIPE (sending
 * side shut down), a pending so_error (which is cleared), ENOTCONN or
 * EDESTADDRREQ (no usable destination), EMSGSIZE, EFAULT (copy from
 * userland failed), or the result of the protocol's pru_send method.
 * 'top' and 'control' are freed on every error path taken before pru_send
 * is called; once pru_send has been called they are no longer touched here.
 */
int
sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;

	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
	    ("sosend_dgram: !PR_ATOMIC"));

	/* Number of payload bytes to send. */
	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 */
	if (resid < 0) {
		error = EINVAL;
		goto out;
	}

	/*
	 * MSG_DONTROUTE only needs special handling when SO_DONTROUTE is
	 * not already set on the socket.
	 */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		SOCKBUF_UNLOCK(&so->so_snd);
		error = EPIPE;
		goto out;
	}
	/* Report, and consume, a pending asynchronous error. */
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		SOCKBUF_UNLOCK(&so->so_snd);
		goto out;
	}
	if ((so->so_state & SS_ISCONNECTED) == 0) {
		/*
		 * `sendto' and `sendmsg' is allowed on a connection-based
		 * socket if it supports implied connect.  Return ENOTCONN if
		 * not connected and no address is supplied.
		 */
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
			/* A control-only send (no data) is still permitted. */
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0)) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = ENOTCONN;
				goto out;
			}
		} else if (addr == NULL) {
			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
				error = ENOTCONN;
			else
				error = EDESTADDRREQ;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto out;
		}
	}

	/*
	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
	 * problem and need fixing.
	 */
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	space -= clen;
	SOCKBUF_UNLOCK(&so->so_snd);
	/* The whole datagram must fit; there is no waiting for space. */
	if (resid > space) {
		error = EMSGSIZE;
		goto out;
	}
	if (uio == NULL) {
		/* Caller supplied the mbuf chain; it is sent as a whole. */
		resid = 0;
		if (flags & MSG_EOR)
			top->m_flags |= M_EOR;
	} else {
		/*
		 * Copy the data from userland into a mbuf chain.
		 * If no data is to be copied in, a single empty mbuf
		 * is returned.
		 */
		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
		if (top == NULL) {
			error = EFAULT;	/* only possible error */
			goto out;
		}
		space -= resid - uio->uio_resid;
		resid = uio->uio_resid;
	}
	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
	/*
	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
	 * than with.
	 */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options |= SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
	 * of date.  We could have received a reset packet in an interrupt or
	 * maybe we slept while doing page faults in uiomove() etc.  We could
	 * probably recheck again inside the locking protection here, but
	 * there are probably other places that this also happens.  We must
	 * rethink this.
	 */
	VNET_SO_ASSERT(so);
	/*
	 * The flag argument below is a chain of conditionals: PRUS_OOB for
	 * MSG_OOB; else PRUS_EOF; else PRUS_MORETOCOME or 0.  Note that ||
	 * binds tighter than ?:, so the last test is
	 * ((flags & MSG_MORETOCOME) || (resid > 0 && space > 0)).
	 */
	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
	    (flags & MSG_OOB) ? PRUS_OOB :
	/*
	 * If the user set MSG_EOF, the protocol understands this flag and
	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
	 */
	    ((flags & MSG_EOF) &&
	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
	     (resid <= 0)) ?
		PRUS_EOF :
		/* If there is more to send set PRUS_MORETOCOME */
		(flags & MSG_MORETOCOME) ||
		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
		top, addr, control, td);
	/* Undo the temporary SO_DONTROUTE set above. */
	if (dontroute) {
		SOCK_LOCK(so);
		so->so_options &= ~SO_DONTROUTE;
		SOCK_UNLOCK(so);
	}
	/*
	 * Forget the mbufs so the cleanup below does not free them.
	 * NOTE(review): presumably pru_send takes ownership of 'top' and
	 * 'control' even when it fails -- confirm against the protocol
	 * contract.
	 */
	clen = 0;
	control = NULL;
	top = NULL;
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
1548
/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * 'addr' is the optional destination address, 'control' optional ancillary
 * data, 'flags' the MSG_* send flags and 'td' the calling thread (may be
 * NULL), used for resource accounting and passed on to the protocol.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	long space;
	ssize_t resid;
	int clen = 0, error, dontroute;
	/* An mbuf chain supplied by the caller is always sent atomically. */
	int atomic = sosendallatonce(so) || top;
	int pru_flag;
#ifdef KERN_TLS
	struct ktls_session *tls;
	int tls_enq_cnt, tls_pruflag;
	uint8_t tls_rtype;

	tls = NULL;
	tls_rtype = TLS_RLTYPE_APP;
#endif
	/* Number of payload bytes to send. */
	if (uio != NULL)
		resid = uio->uio_resid;
	else if ((top->m_flags & M_PKTHDR) != 0)
		resid = top->m_pkthdr.len;
	else
		resid = m_length(top, NULL);
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	/*
	 * MSG_DONTROUTE is honoured only for PR_ATOMIC protocols, and only
	 * needs special handling when SO_DONTROUTE is not already set.
	 */
	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;

	/* Serialize against other senders on this socket. */
	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		goto out;

#ifdef KERN_TLS
	tls_pruflag = 0;
	tls = ktls_hold(so->so_snd.sb_tls_info);
	if (tls != NULL) {
		/* Software TLS: data is queued to the protocol as not ready. */
		if (tls->mode == TCP_TLS_MODE_SW)
			tls_pruflag = PRUS_NOTREADY;

		if (control != NULL) {
			struct cmsghdr *cm = mtod(control, struct cmsghdr *);

			/*
			 * A TLS_SET_RECORD_TYPE control message selects the
			 * record type for the next frame.  It is consumed
			 * here rather than passed to the protocol, and the
			 * send becomes atomic.
			 */
			if (clen >= sizeof(*cm) &&
			    cm->cmsg_type == TLS_SET_RECORD_TYPE) {
				tls_rtype = *((uint8_t *)CMSG_DATA(cm));
				clen = 0;
				m_freem(control);
				control = NULL;
				atomic = 1;
			}
		}

		/* Empty sends are rejected unless the session allows them. */
		if (resid == 0 && !ktls_permit_empty_frames(tls)) {
			error = EINVAL;
			goto release;
		}
	}
#endif

restart:
	/*
	 * Outer loop: one pass per wait for buffer space.  Each pass
	 * revalidates the socket state under the send buffer lock.
	 */
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		/* Report, and consume, a pending asynchronous error. */
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				/* A control-only send is still permitted. */
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0)) {
					SOCKBUF_UNLOCK(&so->so_snd);
					error = ENOTCONN;
					goto release;
				}
			} else if (addr == NULL) {
				SOCKBUF_UNLOCK(&so->so_snd);
				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
					error = ENOTCONN;
				else
					error = EDESTADDRREQ;
				goto release;
			}
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		/*
		 * Hard error: an atomic message, or the control data, can
		 * never fit because it exceeds the buffer's high water mark.
		 */
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		/*
		 * Not everything fits right now.  Wait (or fail with
		 * EWOULDBLOCK when non-blocking) if the send is atomic,
		 * space is below the low water mark, or the control data
		 * alone does not fit; otherwise go on with a partial send.
		 */
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		space -= clen;
		/*
		 * Inner loop: build and hand over one mbuf chain per
		 * iteration for as long as data and space remain.
		 */
		do {
			if (uio == NULL) {
				/* Caller supplied the chain; send it whole. */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
#ifdef KERN_TLS
				if (tls != NULL) {
					ktls_frame(top, tls, &tls_enq_cnt,
					    tls_rtype);
					/* Record type applies to one frame. */
					tls_rtype = TLS_RLTYPE_APP;
				}
#endif
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If resid is 0, which can happen
				 * only if we have control to send, then
				 * a single empty mbuf is returned.  This
				 * is a workaround to prevent protocol send
				 * methods to panic.
				 */
#ifdef KERN_TLS
				if (tls != NULL) {
					top = m_uiotombuf(uio, M_WAITOK, space,
					    tls->params.max_frame_len,
					    M_EXTPG |
					    ((flags & MSG_EOR) ? M_EOR : 0));
					if (top != NULL) {
						ktls_frame(top, tls,
						    &tls_enq_cnt, tls_rtype);
					}
					tls_rtype = TLS_RLTYPE_APP;
				} else
#endif
					/*
					 * Non-TLS copy; this statement is
					 * the else-branch above when
					 * KERN_TLS is compiled in.
					 */
					top = m_uiotombuf(uio, M_WAITOK, space,
					    (atomic ? max_hdr : 0),
					    (atomic ? M_PKTHDR : 0) |
					    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					error = EFAULT; /* only possible error */
					goto release;
				}
				/* Account for the bytes just copied in. */
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			VNET_SO_ASSERT(so);

			/*
			 * Chain of conditionals: PRUS_OOB for MSG_OOB; else
			 * PRUS_EOF; else PRUS_MORETOCOME or 0.  || binds
			 * tighter than ?:, so the last test is
			 * ((flags & MSG_MORETOCOME) ||
			 * (resid > 0 && space > 0)).
			 */
			pru_flag = (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag and nothing left to send then use
			 * PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME. */
			    (flags & MSG_MORETOCOME) ||
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

#ifdef KERN_TLS
			pru_flag |= tls_pruflag;
#endif

			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    pru_flag, top, addr, control, td);

			/* Undo the temporary SO_DONTROUTE set above. */
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}

#ifdef KERN_TLS
			/*
			 * Software TLS: on failure the chain is freed here;
			 * on success a socket reference is taken and the
			 * chain is handed to ktls_enqueue().
			 * NOTE(review): presumably the enqueue performs the
			 * encryption and later drops the reference --
			 * confirm against the ktls implementation.
			 */
			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
				if (error != 0) {
					m_freem(top);
					top = NULL;
				} else {
					soref(so);
					ktls_enqueue(top, so, tls_enq_cnt);
				}
			}
#endif
			/*
			 * Forget the mbufs so the cleanup at 'out' does not
			 * free them; control data is sent only once.
			 */
			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	SOCK_IO_SEND_UNLOCK(so);
out:
#ifdef KERN_TLS
	if (tls != NULL)
		ktls_free(tls);
#endif
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
1823
1824 int
1825 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1826 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
1827 {
1828 int error;
1829
1830 CURVNET_SET(so->so_vnet);
1831 if (!SOLISTENING(so))
1832 error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
1833 top, control, flags, td);
1834 else {
1835 m_freem(top);
1836 m_freem(control);
1837 error = ENOTCONN;
1838 }
1839 CURVNET_RESTORE();
1840 return (error);
1841 }
1842
1843 /*
1844 * The part of soreceive() that implements reading non-inline out-of-band
1845 * data from a socket. For more complete comments, see soreceive(), from
1846 * which this code originated.
1847 *
1848 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
1849 * unable to return an mbuf chain to the caller.
1850 */
1851 static int
1852 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1853 {
1854 struct protosw *pr = so->so_proto;
1855 struct mbuf *m;
1856 int error;
1857
1858 KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
1859 VNET_SO_ASSERT(so);
1860
1861 m = m_get(M_WAITOK, MT_DATA);
1862 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1863 if (error)
1864 goto bad;
1865 do {
1866 error = uiomove(mtod(m, void *),
1867 (int) min(uio->uio_resid, m->m_len), uio);
1868 m = m_free(m);
1869 } while (uio->uio_resid && error == 0 && m);
1870 bad:
1871 if (m != NULL)
1872 m_freem(m);
1873 return (error);
1874 }
1875
1876 /*
1877 * Following replacement or removal of the first mbuf on the first mbuf chain
1878 * of a socket buffer, push necessary state changes back into the socket
1879 * buffer so that other consumers see the values consistently. 'nextrecord'
1880 * is the callers locally stored value of the original value of
1881 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
1882 * NOTE: 'nextrecord' may be NULL.
1883 */
1884 static __inline void
1885 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
1886 {
1887
1888 SOCKBUF_LOCK_ASSERT(sb);
1889 /*
1890 * First, update for the new value of nextrecord. If necessary, make
1891 * it the first record.
1892 */
1893 if (sb->sb_mb != NULL)
1894 sb->sb_mb->m_nextpkt = nextrecord;
1895 else
1896 sb->sb_mb = nextrecord;
1897
1898 /*
1899 * Now update any dependent socket buffer fields to reflect the new
1900 * state. This is an expanded inline of SB_EMPTY_FIXUP(), with the
1901 * addition of a second clause that takes care of the case where
1902 * sb_mb has been updated, but remains the last record.
1903 */
1904 if (sb->sb_mb == NULL) {
1905 sb->sb_mbtail = NULL;
1906 sb->sb_lastrecord = NULL;
1907 } else if (sb->sb_mb->m_nextpkt == NULL)
1908 sb->sb_lastrecord = sb->sb_mb;
1909 }
1910
1911 /*
1912 * Implement receive operations on a socket. We depend on the way that
1913 * records are added to the sockbuf by sbappend. In particular, each record
1914 * (mbufs linked through m_next) must begin with an address if the protocol
1915 * so specifies, followed by an optional mbuf or mbufs containing ancillary
1916 * data, and then zero or more mbufs of data. In order to allow parallelism
1917 * between network receive and copying to user space, as well as avoid
1918 * sleeping with a mutex held, we release the socket buffer mutex during the
1919 * user space copy. Although the sockbuf is locked, new data may still be
1920 * appended, and thus we must maintain consistency of the sockbuf during that
1921 * time.
1922 *
1923 * The caller may receive the data as a single mbuf chain by supplying an
1924 * mbuf **mp0 for use in returning the chain. The uio is then used only for
1925 * the count in uio_resid.
1926 */
1927 int
1928 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
1929 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1930 {
1931 struct mbuf *m, **mp;
1932 int flags, error, offset;
1933 ssize_t len;
1934 struct protosw *pr = so->so_proto;
1935 struct mbuf *nextrecord;
1936 int moff, type = 0;
1937 ssize_t orig_resid = uio->uio_resid;
1938 bool report_real_len = false;
1939
1940 mp = mp0;
1941 if (psa != NULL)
1942 *psa = NULL;
1943 if (controlp != NULL)
1944 *controlp = NULL;
1945 if (flagsp != NULL) {
1946 report_real_len = *flagsp & MSG_TRUNC;
1947 *flagsp &= ~MSG_TRUNC;
1948 flags = *flagsp &~ MSG_EOR;
1949 } else
1950 flags = 0;
1951 if (flags & MSG_OOB)
1952 return (soreceive_rcvoob(so, uio, flags));
1953 if (mp != NULL)
1954 *mp = NULL;
1955 if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
1956 && uio->uio_resid) {
1957 VNET_SO_ASSERT(so);
1958 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1959 }
1960
1961 error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
1962 if (error)
1963 return (error);
1964
1965 restart:
1966 SOCKBUF_LOCK(&so->so_rcv);
1967 m = so->so_rcv.sb_mb;
1968 /*
1969 * If we have less data than requested, block awaiting more (subject
1970 * to any timeout) if:
1971 * 1. the current count is less than the low water mark, or
1972 * 2. MSG_DONTWAIT is not set
1973 */
1974 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
1975 sbavail(&so->so_rcv) < uio->uio_resid) &&
1976 sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
1977 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
1978 KASSERT(m != NULL || !sbavail(&so->so_rcv),
1979 ("receive: m == %p sbavail == %u",
1980 m, sbavail(&so->so_rcv)));
1981 if (so->so_error || so->so_rerror) {
1982 if (m != NULL)
1983 goto dontblock;
1984 if (so->so_error)
1985 error = so->so_error;
1986 else
1987 error = so->so_rerror;
1988 if ((flags & MSG_PEEK) == 0) {
1989 if (so->so_error)
1990 so->so_error = 0;
1991 else
1992 so->so_rerror = 0;
1993 }
1994 SOCKBUF_UNLOCK(&so->so_rcv);
1995 goto release;
1996 }
1997 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1998 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1999 if (m != NULL)
2000 goto dontblock;
2001 #ifdef KERN_TLS
2002 else if (so->so_rcv.sb_tlsdcc == 0 &&
2003 so->so_rcv.sb_tlscc == 0) {
2004 #else
2005 else {
2006 #endif
2007 SOCKBUF_UNLOCK(&so->so_rcv);
2008 goto release;
2009 }
2010 }
2011 for (; m != NULL; m = m->m_next)
2012 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2013 m = so->so_rcv.sb_mb;
2014 goto dontblock;
2015 }
2016 if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
2017 SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
2018 (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2019 SOCKBUF_UNLOCK(&so->so_rcv);
2020 error = ENOTCONN;
2021 goto release;
2022 }
2023 if (uio->uio_resid == 0 && !report_real_len) {
2024 SOCKBUF_UNLOCK(&so->so_rcv);
2025 goto release;
2026 }
2027 if ((so->so_state & SS_NBIO) ||
2028 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2029 SOCKBUF_UNLOCK(&so->so_rcv);
2030 error = EWOULDBLOCK;
2031 goto release;
2032 }
2033 SBLASTRECORDCHK(&so->so_rcv);
2034 SBLASTMBUFCHK(&so->so_rcv);
2035 error = sbwait(&so->so_rcv);
2036 SOCKBUF_UNLOCK(&so->so_rcv);
2037 if (error)
2038 goto release;
2039 goto restart;
2040 }
2041 dontblock:
2042 /*
2043 * From this point onward, we maintain 'nextrecord' as a cache of the
2044 * pointer to the next record in the socket buffer. We must keep the
2045 * various socket buffer pointers and local stack versions of the
2046 * pointers in sync, pushing out modifications before dropping the
2047 * socket buffer mutex, and re-reading them when picking it up.
2048 *
2049 * Otherwise, we will race with the network stack appending new data
2050 * or records onto the socket buffer by using inconsistent/stale
2051 * versions of the field, possibly resulting in socket buffer
2052 * corruption.
2053 *
2054 * By holding the high-level sblock(), we prevent simultaneous
2055 * readers from pulling off the front of the socket buffer.
2056 */
2057 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2058 if (uio->uio_td)
2059 uio->uio_td->td_ru.ru_msgrcv++;
2060 KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
2061 SBLASTRECORDCHK(&so->so_rcv);
2062 SBLASTMBUFCHK(&so->so_rcv);
2063 nextrecord = m->m_nextpkt;
2064 if (pr->pr_flags & PR_ADDR) {
2065 KASSERT(m->m_type == MT_SONAME,
2066 ("m->m_type == %d", m->m_type));
2067 orig_resid = 0;
2068 if (psa != NULL)
2069 *psa = sodupsockaddr(mtod(m, struct sockaddr *),
2070 M_NOWAIT);
2071 if (flags & MSG_PEEK) {
2072 m = m->m_next;
2073 } else {
2074 sbfree(&so->so_rcv, m);
2075 so->so_rcv.sb_mb = m_free(m);
2076 m = so->so_rcv.sb_mb;
2077 sockbuf_pushsync(&so->so_rcv, nextrecord);
2078 }
2079 }
2080
2081 /*
2082 * Process one or more MT_CONTROL mbufs present before any data mbufs
2083 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2084 * just copy the data; if !MSG_PEEK, we call into the protocol to
2085 * perform externalization (or freeing if controlp == NULL).
2086 */
2087 if (m != NULL && m->m_type == MT_CONTROL) {
2088 struct mbuf *cm = NULL, *cmn;
2089 struct mbuf **cme = &cm;
2090 #ifdef KERN_TLS
2091 struct cmsghdr *cmsg;
2092 struct tls_get_record tgr;
2093
2094 /*
2095 * For MSG_TLSAPPDATA, check for an alert record.
2096 * If found, return ENXIO without removing
2097 * it from the receive queue. This allows a subsequent
2098 * call without MSG_TLSAPPDATA to receive it.
2099 * Note that, for TLS, there should only be a single
2100 * control mbuf with the TLS_GET_RECORD message in it.
2101 */
2102 if (flags & MSG_TLSAPPDATA) {
2103 cmsg = mtod(m, struct cmsghdr *);
2104 if (cmsg->cmsg_type == TLS_GET_RECORD &&
2105 cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
2106 memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
2107 if (__predict_false(tgr.tls_type ==
2108 TLS_RLTYPE_ALERT)) {
2109 SOCKBUF_UNLOCK(&so->so_rcv);
2110 error = ENXIO;
2111 goto release;
2112 }
2113 }
2114 }
2115 #endif
2116
2117 do {
2118 if (flags & MSG_PEEK) {
2119 if (controlp != NULL) {
2120 *controlp = m_copym(m, 0, m->m_len,
2121 M_NOWAIT);
2122 controlp = &(*controlp)->m_next;
2123 }
2124 m = m->m_next;
2125 } else {
2126 sbfree(&so->so_rcv, m);
2127 so->so_rcv.sb_mb = m->m_next;
2128 m->m_next = NULL;
2129 *cme = m;
2130 cme = &(*cme)->m_next;
2131 m = so->so_rcv.sb_mb;
2132 }
2133 } while (m != NULL && m->m_type == MT_CONTROL);
2134 if ((flags & MSG_PEEK) == 0)
2135 sockbuf_pushsync(&so->so_rcv, nextrecord);
2136 while (cm != NULL) {
2137 cmn = cm->m_next;
2138 cm->m_next = NULL;
2139 if (pr->pr_domain->dom_externalize != NULL) {
2140 SOCKBUF_UNLOCK(&so->so_rcv);
2141 VNET_SO_ASSERT(so);
2142 error = (*pr->pr_domain->dom_externalize)
2143 (cm, controlp, flags);
2144 SOCKBUF_LOCK(&so->so_rcv);
2145 } else if (controlp != NULL)
2146 *controlp = cm;
2147 else
2148 m_freem(cm);
2149 if (controlp != NULL) {
2150 while (*controlp != NULL)
2151 controlp = &(*controlp)->m_next;
2152 }
2153 cm = cmn;
2154 }
2155 if (m != NULL)
2156 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
2157 else
2158 nextrecord = so->so_rcv.sb_mb;
2159 orig_resid = 0;
2160 }
2161 if (m != NULL) {
2162 if ((flags & MSG_PEEK) == 0) {
2163 KASSERT(m->m_nextpkt == nextrecord,
2164 ("soreceive: post-control, nextrecord !sync"));
2165 if (nextrecord == NULL) {
2166 KASSERT(so->so_rcv.sb_mb == m,
2167 ("soreceive: post-control, sb_mb!=m"));
2168 KASSERT(so->so_rcv.sb_lastrecord == m,
2169 ("soreceive: post-control, lastrecord!=m"));
2170 }
2171 }
2172 type = m->m_type;
2173 if (type == MT_OOBDATA)
2174 flags |= MSG_OOB;
2175 } else {
2176 if ((flags & MSG_PEEK) == 0) {
2177 KASSERT(so->so_rcv.sb_mb == nextrecord,
2178 ("soreceive: sb_mb != nextrecord"));
2179 if (so->so_rcv.sb_mb == NULL) {
2180 KASSERT(so->so_rcv.sb_lastrecord == NULL,
2181 ("soreceive: sb_lastercord != NULL"));
2182 }
2183 }
2184 }
2185 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2186 SBLASTRECORDCHK(&so->so_rcv);
2187 SBLASTMBUFCHK(&so->so_rcv);
2188
2189 /*
2190 * Now continue to read any data mbufs off of the head of the socket
2191 * buffer until the read request is satisfied. Note that 'type' is
2192 * used to store the type of any mbuf reads that have happened so far
2193 * such that soreceive() can stop reading if the type changes, which
2194 * causes soreceive() to return only one of regular data and inline
2195 * out-of-band data in a single socket receive operation.
2196 */
2197 moff = 0;
2198 offset = 0;
2199 while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
2200 && error == 0) {
2201 /*
2202 * If the type of mbuf has changed since the last mbuf
2203 * examined ('type'), end the receive operation.
2204 */
2205 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2206 if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
2207 if (type != m->m_type)
2208 break;
2209 } else if (type == MT_OOBDATA)
2210 break;
2211 else
2212 KASSERT(m->m_type == MT_DATA,
2213 ("m->m_type == %d", m->m_type));
2214 so->so_rcv.sb_state &= ~SBS_RCVATMARK;
2215 len = uio->uio_resid;
2216 if (so->so_oobmark && len > so->so_oobmark - offset)
2217 len = so->so_oobmark - offset;
2218 if (len > m->m_len - moff)
2219 len = m->m_len - moff;
2220 /*
2221 * If mp is set, just pass back the mbufs. Otherwise copy
2222 * them out via the uio, then free. Sockbuf must be
2223 * consistent here (points to current mbuf, it points to next
2224 * record) when we drop priority; we must note any additions
2225 * to the sockbuf when we block interrupts again.
2226 */
2227 if (mp == NULL) {
2228 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2229 SBLASTRECORDCHK(&so->so_rcv);
2230 SBLASTMBUFCHK(&so->so_rcv);
2231 SOCKBUF_UNLOCK(&so->so_rcv);
2232 if ((m->m_flags & M_EXTPG) != 0)
2233 error = m_unmapped_uiomove(m, moff, uio,
2234 (int)len);
2235 else
2236 error = uiomove(mtod(m, char *) + moff,
2237 (int)len, uio);
2238 SOCKBUF_LOCK(&so->so_rcv);
2239 if (error) {
2240 /*
2241 * The MT_SONAME mbuf has already been removed
2242 * from the record, so it is necessary to
2243 * remove the data mbufs, if any, to preserve
2244 * the invariant in the case of PR_ADDR that
2245 * requires MT_SONAME mbufs at the head of
2246 * each record.
2247 */
2248 if (pr->pr_flags & PR_ATOMIC &&
2249 ((flags & MSG_PEEK) == 0))
2250 (void)sbdroprecord_locked(&so->so_rcv);
2251 SOCKBUF_UNLOCK(&so->so_rcv);
2252 goto release;
2253 }
2254 } else
2255 uio->uio_resid -= len;
2256 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2257 if (len == m->m_len - moff) {
2258 if (m->m_flags & M_EOR)
2259 flags |= MSG_EOR;
2260 if (flags & MSG_PEEK) {
2261 m = m->m_next;
2262 moff = 0;
2263 } else {
2264 nextrecord = m->m_nextpkt;
2265 sbfree(&so->so_rcv, m);
2266 if (mp != NULL) {
2267 m->m_nextpkt = NULL;
2268 *mp = m;
2269 mp = &m->m_next;
2270 so->so_rcv.sb_mb = m = m->m_next;
2271 *mp = NULL;
2272 } else {
2273 so->so_rcv.sb_mb = m_free(m);
2274 m = so->so_rcv.sb_mb;
2275 }
2276 sockbuf_pushsync(&so->so_rcv, nextrecord);
2277 SBLASTRECORDCHK(&so->so_rcv);
2278 SBLASTMBUFCHK(&so->so_rcv);
2279 }
2280 } else {
2281 if (flags & MSG_PEEK)
2282 moff += len;
2283 else {
2284 if (mp != NULL) {
2285 if (flags & MSG_DONTWAIT) {
2286 *mp = m_copym(m, 0, len,
2287 M_NOWAIT);
2288 if (*mp == NULL) {
2289 /*
2290 * m_copym() couldn't
2291 * allocate an mbuf.
2292 * Adjust uio_resid back
2293 * (it was adjusted
2294 * down by len bytes,
2295 * which we didn't end
2296 * up "copying" over).
2297 */
2298 uio->uio_resid += len;
2299 break;
2300 }
2301 } else {
2302 SOCKBUF_UNLOCK(&so->so_rcv);
2303 *mp = m_copym(m, 0, len,
2304 M_WAITOK);
2305 SOCKBUF_LOCK(&so->so_rcv);
2306 }
2307 }
2308 sbcut_locked(&so->so_rcv, len);
2309 }
2310 }
2311 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2312 if (so->so_oobmark) {
2313 if ((flags & MSG_PEEK) == 0) {
2314 so->so_oobmark -= len;
2315 if (so->so_oobmark == 0) {
2316 so->so_rcv.sb_state |= SBS_RCVATMARK;
2317 break;
2318 }
2319 } else {
2320 offset += len;
2321 if (offset == so->so_oobmark)
2322 break;
2323 }
2324 }
2325 if (flags & MSG_EOR)
2326 break;
2327 /*
2328 * If the MSG_WAITALL flag is set (for non-atomic socket), we
2329 * must not quit until "uio->uio_resid == 0" or an error
2330 * termination. If a signal/timeout occurs, return with a
2331 * short count but without error. Keep sockbuf locked
2332 * against other readers.
2333 */
2334 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
2335 !sosendallatonce(so) && nextrecord == NULL) {
2336 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2337 if (so->so_error || so->so_rerror ||
2338 so->so_rcv.sb_state & SBS_CANTRCVMORE)
2339 break;
2340 /*
2341 * Notify the protocol that some data has been
2342 * drained before blocking.
2343 */
2344 if (pr->pr_flags & PR_WANTRCVD) {
2345 SOCKBUF_UNLOCK(&so->so_rcv);
2346 VNET_SO_ASSERT(so);
2347 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2348 SOCKBUF_LOCK(&so->so_rcv);
2349 if (__predict_false(so->so_rcv.sb_mb == NULL &&
2350 (so->so_error || so->so_rerror ||
2351 so->so_rcv.sb_state & SBS_CANTRCVMORE)))
2352 break;
2353 }
2354 SBLASTRECORDCHK(&so->so_rcv);
2355 SBLASTMBUFCHK(&so->so_rcv);
2356 /*
2357 * We could receive some data while was notifying
2358 * the protocol. Skip blocking in this case.
2359 */
2360 if (so->so_rcv.sb_mb == NULL) {
2361 error = sbwait(&so->so_rcv);
2362 if (error) {
2363 SOCKBUF_UNLOCK(&so->so_rcv);
2364 goto release;
2365 }
2366 }
2367 m = so->so_rcv.sb_mb;
2368 if (m != NULL)
2369 nextrecord = m->m_nextpkt;
2370 }
2371 }
2372
2373 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2374 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2375 if (report_real_len)
2376 uio->uio_resid -= m_length(m, NULL) - moff;
2377 flags |= MSG_TRUNC;
2378 if ((flags & MSG_PEEK) == 0)
2379 (void) sbdroprecord_locked(&so->so_rcv);
2380 }
2381 if ((flags & MSG_PEEK) == 0) {
2382 if (m == NULL) {
2383 /*
2384 * First part is an inline SB_EMPTY_FIXUP(). Second
2385 * part makes sure sb_lastrecord is up-to-date if
2386 * there is still data in the socket buffer.
2387 */
2388 so->so_rcv.sb_mb = nextrecord;
2389 if (so->so_rcv.sb_mb == NULL) {
2390 so->so_rcv.sb_mbtail = NULL;
2391 so->so_rcv.sb_lastrecord = NULL;
2392 } else if (nextrecord->m_nextpkt == NULL)
2393 so->so_rcv.sb_lastrecord = nextrecord;
2394 }
2395 SBLASTRECORDCHK(&so->so_rcv);
2396 SBLASTMBUFCHK(&so->so_rcv);
2397 /*
2398 * If soreceive() is being done from the socket callback,
2399 * then don't need to generate ACK to peer to update window,
2400 * since ACK will be generated on return to TCP.
2401 */
2402 if (!(flags & MSG_SOCALLBCK) &&
2403 (pr->pr_flags & PR_WANTRCVD)) {
2404 SOCKBUF_UNLOCK(&so->so_rcv);
2405 VNET_SO_ASSERT(so);
2406 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2407 SOCKBUF_LOCK(&so->so_rcv);
2408 }
2409 }
2410 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
2411 if (orig_resid == uio->uio_resid && orig_resid &&
2412 (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
2413 SOCKBUF_UNLOCK(&so->so_rcv);
2414 goto restart;
2415 }
2416 SOCKBUF_UNLOCK(&so->so_rcv);
2417
2418 if (flagsp != NULL)
2419 *flagsp |= flags;
2420 release:
2421 SOCK_IO_RECV_UNLOCK(so);
2422 return (error);
2423 }
2424
/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 *
 * so       - socket to read from; anything other than SOCK_STREAM is
 *            rejected with EINVAL.
 * psa      - if non-NULL, *psa is set to NULL (no address is returned).
 * uio      - describes the destination and the number of bytes wanted;
 *            uio_resid is decremented by the amount delivered.
 * mp0      - if non-NULL, data is handed back as an mbuf chain in *mp0
 *            instead of being copied out through uio.
 * controlp - if non-NULL, *controlp is set to NULL (no control data is
 *            returned by this path).
 * flagsp   - if non-NULL, supplies the MSG_* request flags (MSG_EOR is
 *            masked off).  This function only reads *flagsp; it never
 *            writes result flags back.
 *
 * MSG_OOB requests are forwarded to soreceive_rcvoob() and, when KERN_TLS
 * is compiled in, sockets with sb_tls_info set are forwarded to
 * soreceive_generic().
 *
 * Returns 0 on success or an errno value.
 */
int
soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (controlp != NULL)
		*controlp = NULL;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;

#ifdef KERN_TLS
	/*
	 * KTLS stores TLS records as records with a control message to
	 * describe the framing.
	 *
	 * We check once here before acquiring locks to optimize the
	 * common case.
	 */
	if (sb->sb_tls_info != NULL)
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
#endif

	/* Prevent other readers from entering the socket. */
	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
	if (error)
		return (error);
	SOCKBUF_LOCK(sb);

#ifdef KERN_TLS
	/*
	 * Re-check now that both locks are held; the unlocked test above
	 * may have raced with sb_tls_info being set.  Both locks are
	 * dropped before handing off to soreceive_generic().
	 */
	if (sb->sb_tls_info != NULL) {
		SOCKBUF_UNLOCK(sb);
		SOCK_IO_RECV_UNLOCK(so);
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));
	}
#endif

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	/* Remember the request size so partial delivery can be detected. */
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

	/*
	 * Re-entered after every sbwait() and after each MSG_WAITALL
	 * delivery round; the sockbuf lock is held on every entry.
	 */
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		/* Queued data is delivered before the error is reported. */
		if (sbavail(sb) > 0)
			goto deliver;
		/*
		 * Something was already delivered by this call: return
		 * now and leave so_error set for a later call.
		 */
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		/* A peek reports the error without consuming it. */
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb) > 0)
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	    (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	    sbavail(sb) >= sb->sb_lowat ||
	    sbavail(sb) >= uio->uio_resid ||
	    sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			/* Link the head of the sockbuf chain onto *mp0. */
			if (*mp0 == NULL)
				*mp0 = sb->sb_mb;
			else
				m_cat(*mp0, sb->sb_mb);
			/*
			 * Account for every mbuf that fits entirely in the
			 * request.  The enclosing test guarantees the first
			 * mbuf qualifies, so 'n' is non-NULL afterwards.
			 */
			for (m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				KASSERT(!(m->m_flags & M_NOTAVAIL),
				    ("%s: m %p not available", __func__, m));
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			/*
			 * Cut the handed-over prefix (ending at 'n') from
			 * what stays in the sockbuf (starting at 'm').
			 */
			n->m_next = NULL;
			sb->sb_mb = m;
			sb->sb_lastrecord = sb->sb_mb;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= len;
			if (*mp0 != NULL)
				m_cat(*mp0, m);
			else
				*mp0 = m;
			/*
			 * ENOBUFS is reported only when nothing at all could
			 * be handed back; a failed copy after whole mbufs
			 * were dequeued above still returns those mbufs.
			 */
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.  In the mp0 case 'len' now covers only the
	 * copied remainder; the whole mbufs were already unlinked above.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/*
		 * Notify protocol that we drained some data.  The sockbuf
		 * lock is dropped around the pru_rcvd() call.
		 */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		    !(flags & MSG_SOCALLBCK))) {
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
	/* Common exit: reached with both the sockbuf and I/O locks held. */
out:
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	SOCK_IO_RECV_UNLOCK(so);
	return (error);
}
2645
/*
 * Optimized version of soreceive() for simple datagram cases from userspace.
 * Unlike in the stream case, we're able to drop a datagram if copyout()
 * fails, and because we handle datagrams atomically, we don't need to use a
 * sleep lock to prevent I/O interlacing.
 *
 * so       - socket to read from.
 * psa      - if non-NULL and the protocol sets PR_ADDR, receives a copy of
 *            the sender address (made with M_NOWAIT); otherwise set to NULL.
 * uio      - destination for the datagram payload.
 * mp0      - must be NULL for the fast path; a non-NULL value diverts the
 *            call to soreceive_generic().
 * controlp - if non-NULL, receives the control mbufs of the datagram.
 * flagsp   - if non-NULL, supplies MSG_* request flags (MSG_EOR masked
 *            off) and, on the final success path only, has the result
 *            flags OR-ed in (MSG_TRUNC when the datagram did not fit).
 *
 * Requests with MSG_PEEK, MSG_OOB or MSG_TRUNC are also diverted to
 * soreceive_generic().
 *
 * Returns 0 on success or an errno value.
 */
int
soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, *m2;
	int flags, error;
	ssize_t len;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;

	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * For any complicated cases, fall back to the full
	 * soreceive_generic().
	 */
	if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC)))
		return (soreceive_generic(so, psa, uio, mp0, controlp,
		    flagsp));

	/*
	 * Enforce restrictions on use.
	 */
	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
	    ("soreceive_dgram: wantrcvd"));
	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
	    ("soreceive_dgram: SBS_RCVATMARK"));
	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
	    ("soreceive_dgram: P_CONNREQUIRED"));

	/*
	 * Loop blocking while waiting for a datagram.  Every exit from
	 * inside the loop releases the sockbuf lock before returning.
	 */
	SOCKBUF_LOCK(&so->so_rcv);
	while ((m = so->so_rcv.sb_mb) == NULL) {
		KASSERT(sbavail(&so->so_rcv) == 0,
		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
		    sbavail(&so->so_rcv)));
		/* A pending socket error is reported and cleared. */
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
		/* Nothing more can arrive, or nothing was asked for. */
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
		    uio->uio_resid == 0) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (0);
		}
		/* Non-blocking request on an empty buffer. */
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (EWOULDBLOCK);
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			return (error);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (nextrecord == NULL) {
		KASSERT(so->so_rcv.sb_lastrecord == m,
		    ("soreceive_dgram: lastrecord != m"));
	}

	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
	    ("soreceive_dgram: m_nextpkt != nextrecord"));

	/*
	 * Pull 'm' and its chain off the front of the packet queue.
	 */
	so->so_rcv.sb_mb = NULL;
	sockbuf_pushsync(&so->so_rcv, nextrecord);

	/*
	 * Walk 'm's chain and free that many bytes from the socket buffer.
	 */
	for (m2 = m; m2 != NULL; m2 = m2->m_next)
		sbfree(&so->so_rcv, m2);

	/*
	 * Do a few last checks before we let go of the lock.
	 */
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);

	/*
	 * From here on the record is private to this thread and is
	 * processed without the sockbuf lock.  With PR_ADDR the first mbuf
	 * carries the sender address; copy it out if wanted, then free it.
	 */
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		m = m_free(m);
	}
	if (m == NULL) {
		/* XXXRW: Can this happen? */
		return (0);
	}

	/*
	 * Packet to copyout() is now in 'm' and it is disconnected from the
	 * queue.
	 *
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  We call into the
	 * protocol to perform externalization (or freeing if controlp ==
	 * NULL).  In some cases there can be only MT_CONTROL mbufs without
	 * MT_DATA mbufs.
	 */
	if (m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		/* Move the leading control mbufs onto the private list 'cm'. */
		do {
			m2 = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = m2;
		} while (m != NULL && m->m_type == MT_CONTROL);
		/* Hand each control mbuf to the domain, caller, or free it. */
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				/*
				 * NOTE(review): the value stored in 'error'
				 * here is never examined afterwards in this
				 * function; it is either overwritten by
				 * uiomove() below or dropped by the final
				 * return (0).
				 */
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp, flags);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			/* Advance controlp to the end of what was appended. */
			if (controlp != NULL) {
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
	}
	KASSERT(m == NULL || m->m_type == MT_DATA,
	    ("soreceive_dgram: !data"));
	/* Copy out the payload, freeing each mbuf once fully consumed. */
	while (m != NULL && uio->uio_resid > 0) {
		len = uio->uio_resid;
		if (len > m->m_len)
			len = m->m_len;
		error = uiomove(mtod(m, char *), (int)len, uio);
		if (error) {
			/* The datagram is dropped when the copy fails. */
			m_freem(m);
			return (error);
		}
		if (len == m->m_len)
			m = m_free(m);
		else {
			m->m_data += len;
			m->m_len -= len;
		}
	}
	/* Data left over did not fit: flag truncation and discard it. */
	if (m != NULL) {
		flags |= MSG_TRUNC;
		m_freem(m);
	}
	if (flagsp != NULL)
		*flagsp |= flags;
	return (0);
}
2833
2834 int
2835 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2836 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2837 {
2838 int error;
2839
2840 CURVNET_SET(so->so_vnet);
2841 if (!SOLISTENING(so))
2842 error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
2843 mp0, controlp, flagsp));
2844 else
2845 error = ENOTCONN;
2846 CURVNET_RESTORE();
2847 return (error);
2848 }
2849
2850 int
2851 soshutdown(struct socket *so, int how)
2852 {
2853 struct protosw *pr = so->so_proto;
2854 int error, soerror_enotconn;
2855
2856 if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
2857 return (EINVAL);
2858
2859 soerror_enotconn = 0;
2860 if ((so->so_state &
2861 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
2862 /*
2863 * POSIX mandates us to return ENOTCONN when shutdown(2) is
2864 * invoked on a datagram sockets, however historically we would
2865 * actually tear socket down. This is known to be leveraged by
2866 * some applications to unblock process waiting in recvXXX(2)
2867 * by other process that it shares that socket with. Try to meet
2868 * both backward-compatibility and POSIX requirements by forcing
2869 * ENOTCONN but still asking protocol to perform pru_shutdown().
2870 */
2871 if (so->so_type != SOCK_DGRAM && !SOLISTENING(so))
2872 return (ENOTCONN);
2873 soerror_enotconn = 1;
2874 }
2875
2876 if (SOLISTENING(so)) {
2877 if (how != SHUT_WR) {
2878 SOLISTEN_LOCK(so);
2879 so->so_error = ECONNABORTED;
2880 solisten_wakeup(so); /* unlocks so */
2881 }
2882 goto done;
2883 }
2884
2885 CURVNET_SET(so->so_vnet);
2886 if (pr->pr_usrreqs->pru_flush != NULL)
2887 (*pr->pr_usrreqs->pru_flush)(so, how);
2888 if (how != SHUT_WR)
2889 sorflush(so);
2890 if (how != SHUT_RD) {
2891 error = (*pr->pr_usrreqs->pru_shutdown)(so);
2892 wakeup(&so->so_timeo);
2893 CURVNET_RESTORE();
2894 return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
2895 }
2896 wakeup(&so->so_timeo);
2897 CURVNET_RESTORE();
2898
2899 done:
2900 return (soerror_enotconn ? ENOTCONN : 0);
2901 }
2902
2903 void
2904 sorflush(struct socket *so)
2905 {
2906 struct sockbuf *sb = &so->so_rcv;
2907 struct protosw *pr = so->so_proto;
2908 struct socket aso;
2909 int error;
2910
2911 VNET_SO_ASSERT(so);
2912
2913 /*
2914 * In order to avoid calling dom_dispose with the socket buffer mutex
2915 * held, and in order to generally avoid holding the lock for a long
2916 * time, we make a copy of the socket buffer and clear the original
2917 * (except locks, state). The new socket buffer copy won't have
2918 * initialized locks so we can only call routines that won't use or
2919 * assert those locks.
2920 *
2921 * Dislodge threads currently blocked in receive and wait to acquire
2922 * a lock against other simultaneous readers before clearing the
2923 * socket buffer. Don't let our acquire be interrupted by a signal
2924 * despite any existing socket disposition on interruptable waiting.
2925 */
2926 socantrcvmore(so);
2927 error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
2928 KASSERT(error == 0, ("%s: cannot lock sock %p recv buffer",
2929 __func__, so));
2930
2931 /*
2932 * Invalidate/clear most of the sockbuf structure, but leave selinfo
2933 * and mutex data unchanged.
2934 */
2935 SOCKBUF_LOCK(sb);
2936 bzero(&aso, sizeof(aso));
2937 aso.so_pcb = so->so_pcb;
2938 bcopy(&sb->sb_startzero, &aso.so_rcv.sb_startzero,
2939 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2940 bzero(&sb->sb_startzero,
2941 sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
2942 SOCKBUF_UNLOCK(sb);
2943 SOCK_IO_RECV_UNLOCK(so);
2944
2945 /*
2946 * Dispose of special rights and flush the copied socket. Don't call
2947 * any unsafe routines (that rely on locks being initialized) on aso.
2948 */
2949 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
2950 (*pr->pr_domain->dom_dispose)(&aso);
2951 sbrelease_internal(&aso.so_rcv, so);
2952 }
2953
2954 /*
2955 * Wrapper for Socket established helper hook.
2956 * Parameters: socket, context of the hook point, hook id.
2957 */
2958 static int inline
2959 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
2960 {
2961 struct socket_hhook_data hhook_data = {
2962 .so = so,
2963 .hctx = hctx,
2964 .m = NULL,
2965 .status = 0
2966 };
2967
2968 CURVNET_SET(so->so_vnet);
2969 HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
2970 CURVNET_RESTORE();
2971
2972 /* Ugly but needed, since hhooks return void for now */
2973 return (hhook_data.status);
2974 }
2975
2976 /*
2977 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
2978 * additional variant to handle the case where the option value needs to be
2979 * some kind of integer, but not a specific size. In addition to their use
2980 * here, these functions are also called by the protocol-level pr_ctloutput()
2981 * routines.
2982 */
2983 int
2984 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
2985 {
2986 size_t valsize;
2987
2988 /*
2989 * If the user gives us more than we wanted, we ignore it, but if we
2990 * don't get the minimum length the caller wants, we return EINVAL.
2991 * On success, sopt->sopt_valsize is set to however much we actually
2992 * retrieved.
2993 */
2994 if ((valsize = sopt->sopt_valsize) < minlen)
2995 return EINVAL;
2996 if (valsize > len)
2997 sopt->sopt_valsize = valsize = len;
2998
2999 if (sopt->sopt_td != NULL)
3000 return (copyin(sopt->sopt_val, buf, valsize));
3001
3002 bcopy(sopt->sopt_val, buf, valsize);
3003 return (0);
3004 }
3005
3006 u_long nl_maxsockbuf = 512 * 1024 * 1024; /* 512M, XXX: init based on physmem */
3007
3008 u_long
3009 sogetmaxbuf(struct socket *so)
3010 {
3011 if (so->so_proto->pr_domain->dom_family != PF_NETLINK)
3012 return (sb_max);
3013 return ((priv_check(curthread, PRIV_NET_ROUTE) == 0) ? nl_maxsockbuf : sb_max);
3014 }
3015
3016 /*
3017 * Kernel version of setsockopt(2).
3018 *
3019 * XXX: optlen is size_t, not socklen_t
3020 */
3021 int
3022 so_setsockopt(struct socket *so, int level, int optname, void *optval,
3023 size_t optlen)
3024 {
3025 struct sockopt sopt;
3026
3027 sopt.sopt_level = level;
3028 sopt.sopt_name = optname;
3029 sopt.sopt_dir = SOPT_SET;
3030 sopt.sopt_val = optval;
3031 sopt.sopt_valsize = optlen;
3032 sopt.sopt_td = NULL;
3033 return (sosetopt(so, &sopt));
3034 }
3035
3036 int
3037 sosetopt(struct socket *so, struct sockopt *sopt)
3038 {
3039 int error, optval;
3040 struct linger l;
3041 struct timeval tv;
3042 sbintime_t val, *valp;
3043 uint32_t val32;
3044 #ifdef MAC
3045 struct mac extmac;
3046 #endif
3047
3048 CURVNET_SET(so->so_vnet);
3049 error = 0;
3050 if (sopt->sopt_level != SOL_SOCKET) {
3051 if (so->so_proto->pr_ctloutput != NULL)
3052 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3053 else
3054 error = ENOPROTOOPT;
3055 } else {
3056 switch (sopt->sopt_name) {
3057 case SO_ACCEPTFILTER:
3058 error = accept_filt_setopt(so, sopt);
3059 if (error)
3060 goto bad;
3061 break;
3062
3063 case SO_LINGER:
3064 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
3065 if (error)
3066 goto bad;
3067 if (l.l_linger < 0 ||
3068 l.l_linger > USHRT_MAX ||
3069 l.l_linger > (INT_MAX / hz)) {
3070 error = EDOM;
3071 goto bad;
3072 }
3073 SOCK_LOCK(so);
3074 so->so_linger = l.l_linger;
3075 if (l.l_onoff)
3076 so->so_options |= SO_LINGER;
3077 else
3078 so->so_options &= ~SO_LINGER;
3079 SOCK_UNLOCK(so);
3080 break;
3081
3082 case SO_DEBUG:
3083 case SO_KEEPALIVE:
3084 case SO_DONTROUTE:
3085 case SO_USELOOPBACK:
3086 case SO_BROADCAST:
3087 case SO_REUSEADDR:
3088 case SO_REUSEPORT:
3089 case SO_REUSEPORT_LB:
3090 case SO_OOBINLINE:
3091 case SO_TIMESTAMP:
3092 case SO_BINTIME:
3093 case SO_NOSIGPIPE:
3094 case SO_NO_DDP:
3095 case SO_NO_OFFLOAD:
3096 case SO_RERROR:
3097 error = sooptcopyin(sopt, &optval, sizeof optval,
3098 sizeof optval);
3099 if (error)
3100 goto bad;
3101 SOCK_LOCK(so);
3102 if (optval)
3103 so->so_options |= sopt->sopt_name;
3104 else
3105 so->so_options &= ~sopt->sopt_name;
3106 SOCK_UNLOCK(so);
3107 break;
3108
3109 case SO_SETFIB:
3110 error = sooptcopyin(sopt, &optval, sizeof optval,
3111 sizeof optval);
3112 if (error)
3113 goto bad;
3114
3115 if (optval < 0 || optval >= rt_numfibs) {
3116 error = EINVAL;
3117 goto bad;
3118 }
3119 if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
3120 (so->so_proto->pr_domain->dom_family == PF_INET6) ||
3121 (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
3122 so->so_fibnum = optval;
3123 else
3124 so->so_fibnum = 0;
3125 break;
3126
3127 case SO_USER_COOKIE:
3128 error = sooptcopyin(sopt, &val32, sizeof val32,
3129 sizeof val32);
3130 if (error)
3131 goto bad;
3132 so->so_user_cookie = val32;
3133 break;
3134
3135 case SO_SNDBUF:
3136 case SO_RCVBUF:
3137 case SO_SNDLOWAT:
3138 case SO_RCVLOWAT:
3139 error = sooptcopyin(sopt, &optval, sizeof optval,
3140 sizeof optval);
3141 if (error)
3142 goto bad;
3143
3144 /*
3145 * Values < 1 make no sense for any of these options,
3146 * so disallow them.
3147 */
3148 if (optval < 1) {
3149 error = EINVAL;
3150 goto bad;
3151 }
3152
3153 error = sbsetopt(so, sopt->sopt_name, optval);
3154 break;
3155
3156 case SO_SNDTIMEO:
3157 case SO_RCVTIMEO:
3158 #ifdef COMPAT_FREEBSD32
3159 if (SV_CURPROC_FLAG(SV_ILP32)) {
3160 struct timeval32 tv32;
3161
3162 error = sooptcopyin(sopt, &tv32, sizeof tv32,
3163 sizeof tv32);
3164 CP(tv32, tv, tv_sec);
3165 CP(tv32, tv, tv_usec);
3166 } else
3167 #endif
3168 error = sooptcopyin(sopt, &tv, sizeof tv,
3169 sizeof tv);
3170 if (error)
3171 goto bad;
3172 if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
3173 tv.tv_usec >= 1000000) {
3174 error = EDOM;
3175 goto bad;
3176 }
3177 if (tv.tv_sec > INT32_MAX)
3178 val = SBT_MAX;
3179 else
3180 val = tvtosbt(tv);
3181 SOCK_LOCK(so);
3182 valp = sopt->sopt_name == SO_SNDTIMEO ?
3183 (SOLISTENING(so) ? &so->sol_sbsnd_timeo :
3184 &so->so_snd.sb_timeo) :
3185 (SOLISTENING(so) ? &so->sol_sbrcv_timeo :
3186 &so->so_rcv.sb_timeo);
3187 *valp = val;
3188 SOCK_UNLOCK(so);
3189 break;
3190
3191 case SO_LABEL:
3192 #ifdef MAC
3193 error = sooptcopyin(sopt, &extmac, sizeof extmac,
3194 sizeof extmac);
3195 if (error)
3196 goto bad;
3197 error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
3198 so, &extmac);
3199 #else
3200 error = EOPNOTSUPP;
3201 #endif
3202 break;
3203
3204 case SO_TS_CLOCK:
3205 error = sooptcopyin(sopt, &optval, sizeof optval,
3206 sizeof optval);
3207 if (error)
3208 goto bad;
3209 if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
3210 error = EINVAL;
3211 goto bad;
3212 }
3213 so->so_ts_clock = optval;
3214 break;
3215
3216 case SO_MAX_PACING_RATE:
3217 error = sooptcopyin(sopt, &val32, sizeof(val32),
3218 sizeof(val32));
3219 if (error)
3220 goto bad;
3221 so->so_max_pacing_rate = val32;
3222 break;
3223
3224 default:
3225 if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
3226 error = hhook_run_socket(so, sopt,
3227 HHOOK_SOCKET_OPT);
3228 else
3229 error = ENOPROTOOPT;
3230 break;
3231 }
3232 if (error == 0 && so->so_proto->pr_ctloutput != NULL)
3233 (void)(*so->so_proto->pr_ctloutput)(so, sopt);
3234 }
3235 bad:
3236 CURVNET_RESTORE();
3237 return (error);
3238 }
3239
3240 /*
3241 * Helper routine for getsockopt.
3242 */
3243 int
3244 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
3245 {
3246 int error;
3247 size_t valsize;
3248
3249 error = 0;
3250
3251 /*
3252 * Documented get behavior is that we always return a value, possibly
3253 * truncated to fit in the user's buffer. Traditional behavior is
3254 * that we always tell the user precisely how much we copied, rather
3255 * than something useful like the total amount we had available for
3256 * her. Note that this interface is not idempotent; the entire
3257 * answer must be generated ahead of time.
3258 */
3259 valsize = min(len, sopt->sopt_valsize);
3260 sopt->sopt_valsize = valsize;
3261 if (sopt->sopt_val != NULL) {
3262 if (sopt->sopt_td != NULL)
3263 error = copyout(buf, sopt->sopt_val, valsize);
3264 else
3265 bcopy(buf, sopt->sopt_val, valsize);
3266 }
3267 return (error);
3268 }
3269
/*
 * Socket-layer implementation of option retrieval.
 *
 * Options whose level is not SOL_SOCKET are passed straight to the
 * protocol's pr_ctloutput handler (ENOPROTOOPT if the protocol has none).
 * SOL_SOCKET options are answered here from fields of the socket and
 * returned through sooptcopyout().  Option names that are not recognised
 * are offered to the HHOOK_SOCKET_OPT hooks if any are registered, and
 * otherwise fail with ENOPROTOOPT.
 *
 * The whole function runs with the socket's vnet set as the current one
 * (CURVNET_SET/CURVNET_RESTORE).  Returns 0 or an errno value.
 */
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef MAC
	struct mac extmac;
#endif

	CURVNET_SET(so->so_vnet);
	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		/* Not ours: let the protocol answer, if it can. */
		if (so->so_proto->pr_ctloutput != NULL)
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
		else
			error = ENOPROTOOPT;
		CURVNET_RESTORE();
		return (error);
	} else {
		switch (sopt->sopt_name) {
		case SO_ACCEPTFILTER:
			error = accept_filt_getopt(so, sopt);
			break;

		case SO_LINGER:
			SOCK_LOCK(so);
			/* l_onoff is the raw SO_LINGER bit, not normalised. */
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		/*
		 * Boolean options: the option name doubles as the bit mask
		 * applied to so_options, so the result is either 0 or the
		 * option's own bit value.
		 */
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_REUSEPORT_LB:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
		case SO_NO_DDP:
		case SO_NO_OFFLOAD:
		case SO_RERROR:
			optval = so->so_options & sopt->sopt_name;
integer:
			/* Shared exit for every option returned as an int. */
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_DOMAIN:
			optval = so->so_proto->pr_domain->dom_family;
			goto integer;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_PROTOCOL:
			optval = so->so_proto->pr_protocol;
			goto integer;

		case SO_ERROR:
			/*
			 * Reading the error consumes it: so_error is reported
			 * and cleared if set, otherwise so_rerror is reported
			 * and cleared.
			 */
			SOCK_LOCK(so);
			if (so->so_error) {
				optval = so->so_error;
				so->so_error = 0;
			} else {
				optval = so->so_rerror;
				so->so_rerror = 0;
			}
			SOCK_UNLOCK(so);
			goto integer;

		/*
		 * Buffer sizes and low-water marks live in different fields
		 * depending on whether the socket is listening, so each read
		 * checks SOLISTENING() under the socket lock.
		 */
		case SO_SNDBUF:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
			    so->so_snd.sb_hiwat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_RCVBUF:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
			    so->so_rcv.sb_hiwat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDLOWAT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
			    so->so_snd.sb_lowat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_RCVLOWAT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
			    so->so_rcv.sb_lowat;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			/* Stored as sbintime; converted to a timeval here. */
			SOCK_LOCK(so);
			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
			    (SOLISTENING(so) ? so->sol_sbsnd_timeo :
			    so->so_snd.sb_timeo) :
			    (SOLISTENING(so) ? so->sol_sbrcv_timeo :
			    so->so_rcv.sb_timeo));
			SOCK_UNLOCK(so);
#ifdef COMPAT_FREEBSD32
			/*
			 * When SV_ILP32 is set for the current process the
			 * value is returned as a struct timeval32 instead
			 * (presumably a 32-bit process -- confirm).
			 */
			if (SV_CURPROC_FLAG(SV_ILP32)) {
				struct timeval32 tv32;

				CP(tv, tv32, tv_sec);
				CP(tv, tv32, tv_usec);
				error = sooptcopyout(sopt, &tv32, sizeof tv32);
			} else
#endif
				error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		case SO_LABEL:
#ifdef MAC
			/*
			 * The caller's struct mac is copied in first, handed
			 * to mac_getsockopt_label(), then copied back out.
			 */
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				goto bad;
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		case SO_PEERLABEL:
#ifdef MAC
			/* Same copy-in / query / copy-out sequence as SO_LABEL. */
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				goto bad;
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				goto bad;
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;

		/* Listen-queue statistics read as 0 on non-listening sockets. */
		case SO_LISTENQLIMIT:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_LISTENQLEN:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_qlen : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_LISTENINCQLEN:
			SOCK_LOCK(so);
			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
			SOCK_UNLOCK(so);
			goto integer;

		case SO_TS_CLOCK:
			optval = so->so_ts_clock;
			goto integer;

		case SO_MAX_PACING_RATE:
			optval = so->so_max_pacing_rate;
			goto integer;

		default:
			/* Unknown name: give registered hooks a chance. */
			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
				error = hhook_run_socket(so, sopt,
				    HHOOK_SOCKET_OPT);
			else
				error = ENOPROTOOPT;
			break;
		}
	}
	/* Only the MAC cases jump here, hence the conditional label. */
#ifdef MAC
bad:
#endif
	CURVNET_RESTORE();
	return (error);
}
3470
3471 int
3472 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
3473 {
3474 struct mbuf *m, *m_prev;
3475 int sopt_size = sopt->sopt_valsize;
3476
3477 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3478 if (m == NULL)
3479 return ENOBUFS;
3480 if (sopt_size > MLEN) {
3481 MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
3482 if ((m->m_flags & M_EXT) == 0) {
3483 m_free(m);
3484 return ENOBUFS;
3485 }
3486 m->m_len = min(MCLBYTES, sopt_size);
3487 } else {
3488 m->m_len = min(MLEN, sopt_size);
3489 }
3490 sopt_size -= m->m_len;
3491 *mp = m;
3492 m_prev = m;
3493
3494 while (sopt_size) {
3495 MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
3496 if (m == NULL) {
3497 m_freem(*mp);
3498 return ENOBUFS;
3499 }
3500 if (sopt_size > MLEN) {
3501 MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
3502 M_NOWAIT);
3503 if ((m->m_flags & M_EXT) == 0) {
3504 m_freem(m);
3505 m_freem(*mp);
3506 return ENOBUFS;
3507 }
3508 m->m_len = min(MCLBYTES, sopt_size);
3509 } else {
3510 m->m_len = min(MLEN, sopt_size);
3511 }
3512 sopt_size -= m->m_len;
3513 m_prev->m_next = m;
3514 m_prev = m;
3515 }
3516 return (0);
3517 }
3518
3519 int
3520 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
3521 {
3522 struct mbuf *m0 = m;
3523
3524 if (sopt->sopt_val == NULL)
3525 return (0);
3526 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3527 if (sopt->sopt_td != NULL) {
3528 int error;
3529
3530 error = copyin(sopt->sopt_val, mtod(m, char *),
3531 m->m_len);
3532 if (error != 0) {
3533 m_freem(m0);
3534 return(error);
3535 }
3536 } else
3537 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
3538 sopt->sopt_valsize -= m->m_len;
3539 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3540 m = m->m_next;
3541 }
3542 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
3543 panic("ip6_sooptmcopyin");
3544 return (0);
3545 }
3546
3547 int
3548 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
3549 {
3550 struct mbuf *m0 = m;
3551 size_t valsize = 0;
3552
3553 if (sopt->sopt_val == NULL)
3554 return (0);
3555 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
3556 if (sopt->sopt_td != NULL) {
3557 int error;
3558
3559 error = copyout(mtod(m, char *), sopt->sopt_val,
3560 m->m_len);
3561 if (error != 0) {
3562 m_freem(m0);
3563 return(error);
3564 }
3565 } else
3566 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
3567 sopt->sopt_valsize -= m->m_len;
3568 sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
3569 valsize += m->m_len;
3570 m = m->m_next;
3571 }
3572 if (m != NULL) {
3573 /* enough soopt buffer should be given from user-land */
3574 m_freem(m0);
3575 return(EINVAL);
3576 }
3577 sopt->sopt_valsize = valsize;
3578 return (0);
3579 }
3580
3581 /*
3582 * sohasoutofband(): protocol notifies socket layer of the arrival of new
3583 * out-of-band data, which will then notify socket consumers.
3584 */
3585 void
3586 sohasoutofband(struct socket *so)
3587 {
3588
3589 if (so->so_sigio != NULL)
3590 pgsigio(&so->so_sigio, SIGURG, 0);
3591 selwakeuppri(&so->so_rdsel, PSOCK);
3592 }
3593
3594 int
3595 sopoll(struct socket *so, int events, struct ucred *active_cred,
3596 struct thread *td)
3597 {
3598
3599 /*
3600 * We do not need to set or assert curvnet as long as everyone uses
3601 * sopoll_generic().
3602 */
3603 return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
3604 td));
3605 }
3606
3607 int
3608 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
3609 struct thread *td)
3610 {
3611 int revents;
3612
3613 SOCK_LOCK(so);
3614 if (SOLISTENING(so)) {
3615 if (!(events & (POLLIN | POLLRDNORM)))
3616 revents = 0;
3617 else if (!TAILQ_EMPTY(&so->sol_comp))
3618 revents = events & (POLLIN | POLLRDNORM);
3619 else if ((events & POLLINIGNEOF) == 0 && so->so_error)
3620 revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
3621 else {
3622 selrecord(td, &so->so_rdsel);
3623 revents = 0;
3624 }
3625 } else {
3626 revents = 0;
3627 SOCKBUF_LOCK(&so->so_snd);
3628 SOCKBUF_LOCK(&so->so_rcv);
3629 if (events & (POLLIN | POLLRDNORM))
3630 if (soreadabledata(so))
3631 revents |= events & (POLLIN | POLLRDNORM);
3632 if (events & (POLLOUT | POLLWRNORM))
3633 if (sowriteable(so))
3634 revents |= events & (POLLOUT | POLLWRNORM);
3635 if (events & (POLLPRI | POLLRDBAND))
3636 if (so->so_oobmark ||
3637 (so->so_rcv.sb_state & SBS_RCVATMARK))
3638 revents |= events & (POLLPRI | POLLRDBAND);
3639 if ((events & POLLINIGNEOF) == 0) {
3640 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3641 revents |= events & (POLLIN | POLLRDNORM);
3642 if (so->so_snd.sb_state & SBS_CANTSENDMORE)
3643 revents |= POLLHUP;
3644 }
3645 }
3646 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
3647 revents |= events & POLLRDHUP;
3648 if (revents == 0) {
3649 if (events &
3650 (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) {
3651 selrecord(td, &so->so_rdsel);
3652 so->so_rcv.sb_flags |= SB_SEL;
3653 }
3654 if (events & (POLLOUT | POLLWRNORM)) {
3655 selrecord(td, &so->so_wrsel);
3656 so->so_snd.sb_flags |= SB_SEL;
3657 }
3658 }
3659 SOCKBUF_UNLOCK(&so->so_rcv);
3660 SOCKBUF_UNLOCK(&so->so_snd);
3661 }
3662 SOCK_UNLOCK(so);
3663 return (revents);
3664 }
3665
3666 int
3667 soo_kqfilter(struct file *fp, struct knote *kn)
3668 {
3669 struct socket *so = kn->kn_fp->f_data;
3670 struct sockbuf *sb;
3671 struct knlist *knl;
3672
3673 switch (kn->kn_filter) {
3674 case EVFILT_READ:
3675 kn->kn_fop = &soread_filtops;
3676 knl = &so->so_rdsel.si_note;
3677 sb = &so->so_rcv;
3678 break;
3679 case EVFILT_WRITE:
3680 kn->kn_fop = &sowrite_filtops;
3681 knl = &so->so_wrsel.si_note;
3682 sb = &so->so_snd;
3683 break;
3684 case EVFILT_EMPTY:
3685 kn->kn_fop = &soempty_filtops;
3686 knl = &so->so_wrsel.si_note;
3687 sb = &so->so_snd;
3688 break;
3689 default:
3690 return (EINVAL);
3691 }
3692
3693 SOCK_LOCK(so);
3694 if (SOLISTENING(so)) {
3695 knlist_add(knl, kn, 1);
3696 } else {
3697 SOCKBUF_LOCK(sb);
3698 knlist_add(knl, kn, 1);
3699 sb->sb_flags |= SB_KNOTE;
3700 SOCKBUF_UNLOCK(sb);
3701 }
3702 SOCK_UNLOCK(so);
3703 return (0);
3704 }
3705
3706 /*
3707 * Some routines that return EOPNOTSUPP for entry points that are not
3708 * supported by a protocol. Fill in as needed.
3709 */
3710 int
3711 pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
3712 {
3713
3714 return EOPNOTSUPP;
3715 }
3716
3717 int
3718 pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job)
3719 {
3720
3721 return EOPNOTSUPP;
3722 }
3723
3724 int
3725 pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
3726 {
3727
3728 return EOPNOTSUPP;
3729 }
3730
3731 int
3732 pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3733 {
3734
3735 return EOPNOTSUPP;
3736 }
3737
3738 int
3739 pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3740 struct thread *td)
3741 {
3742
3743 return EOPNOTSUPP;
3744 }
3745
3746 int
3747 pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
3748 {
3749
3750 return EOPNOTSUPP;
3751 }
3752
3753 int
3754 pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
3755 struct thread *td)
3756 {
3757
3758 return EOPNOTSUPP;
3759 }
3760
3761 int
3762 pru_connect2_notsupp(struct socket *so1, struct socket *so2)
3763 {
3764
3765 return EOPNOTSUPP;
3766 }
3767
3768 int
3769 pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
3770 struct ifnet *ifp, struct thread *td)
3771 {
3772
3773 return EOPNOTSUPP;
3774 }
3775
3776 int
3777 pru_disconnect_notsupp(struct socket *so)
3778 {
3779
3780 return EOPNOTSUPP;
3781 }
3782
3783 int
3784 pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
3785 {
3786
3787 return EOPNOTSUPP;
3788 }
3789
3790 int
3791 pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
3792 {
3793
3794 return EOPNOTSUPP;
3795 }
3796
3797 int
3798 pru_rcvd_notsupp(struct socket *so, int flags)
3799 {
3800
3801 return EOPNOTSUPP;
3802 }
3803
3804 int
3805 pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
3806 {
3807
3808 return EOPNOTSUPP;
3809 }
3810
3811 int
3812 pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
3813 struct sockaddr *addr, struct mbuf *control, struct thread *td)
3814 {
3815
3816 if (control != NULL)
3817 m_freem(control);
3818 if ((flags & PRUS_NOTREADY) == 0)
3819 m_freem(m);
3820 return (EOPNOTSUPP);
3821 }
3822
3823 int
3824 pru_ready_notsupp(struct socket *so, struct mbuf *m, int count)
3825 {
3826
3827 return (EOPNOTSUPP);
3828 }
3829
3830 /*
3831 * This isn't really a ``null'' operation, but it's the default one and
3832 * doesn't do anything destructive.
3833 */
3834 int
3835 pru_sense_null(struct socket *so, struct stat *sb)
3836 {
3837
3838 sb->st_blksize = so->so_snd.sb_hiwat;
3839 return 0;
3840 }
3841
3842 int
3843 pru_shutdown_notsupp(struct socket *so)
3844 {
3845
3846 return EOPNOTSUPP;
3847 }
3848
3849 int
3850 pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
3851 {
3852
3853 return EOPNOTSUPP;
3854 }
3855
3856 int
3857 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
3858 struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
3859 {
3860
3861 return EOPNOTSUPP;
3862 }
3863
3864 int
3865 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
3866 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3867 {
3868
3869 return EOPNOTSUPP;
3870 }
3871
3872 int
3873 pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
3874 struct thread *td)
3875 {
3876
3877 return EOPNOTSUPP;
3878 }
3879
3880 static void
3881 filt_sordetach(struct knote *kn)
3882 {
3883 struct socket *so = kn->kn_fp->f_data;
3884
3885 so_rdknl_lock(so);
3886 knlist_remove(&so->so_rdsel.si_note, kn, 1);
3887 if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
3888 so->so_rcv.sb_flags &= ~SB_KNOTE;
3889 so_rdknl_unlock(so);
3890 }
3891
3892 /*ARGSUSED*/
3893 static int
3894 filt_soread(struct knote *kn, long hint)
3895 {
3896 struct socket *so;
3897
3898 so = kn->kn_fp->f_data;
3899
3900 if (SOLISTENING(so)) {
3901 SOCK_LOCK_ASSERT(so);
3902 kn->kn_data = so->sol_qlen;
3903 if (so->so_error) {
3904 kn->kn_flags |= EV_EOF;
3905 kn->kn_fflags = so->so_error;
3906 return (1);
3907 }
3908 return (!TAILQ_EMPTY(&so->sol_comp));
3909 }
3910
3911 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
3912
3913 kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
3914 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3915 kn->kn_flags |= EV_EOF;
3916 kn->kn_fflags = so->so_error;
3917 return (1);
3918 } else if (so->so_error || so->so_rerror)
3919 return (1);
3920
3921 if (kn->kn_sfflags & NOTE_LOWAT) {
3922 if (kn->kn_data >= kn->kn_sdata)
3923 return (1);
3924 } else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
3925 return (1);
3926
3927 /* This hook returning non-zero indicates an event, not error */
3928 return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
3929 }
3930
3931 static void
3932 filt_sowdetach(struct knote *kn)
3933 {
3934 struct socket *so = kn->kn_fp->f_data;
3935
3936 so_wrknl_lock(so);
3937 knlist_remove(&so->so_wrsel.si_note, kn, 1);
3938 if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
3939 so->so_snd.sb_flags &= ~SB_KNOTE;
3940 so_wrknl_unlock(so);
3941 }
3942
3943 /*ARGSUSED*/
3944 static int
3945 filt_sowrite(struct knote *kn, long hint)
3946 {
3947 struct socket *so;
3948
3949 so = kn->kn_fp->f_data;
3950
3951 if (SOLISTENING(so))
3952 return (0);
3953
3954 SOCKBUF_LOCK_ASSERT(&so->so_snd);
3955 kn->kn_data = sbspace(&so->so_snd);
3956
3957 hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
3958
3959 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
3960 kn->kn_flags |= EV_EOF;
3961 kn->kn_fflags = so->so_error;
3962 return (1);
3963 } else if (so->so_error) /* temporary udp error */
3964 return (1);
3965 else if (((so->so_state & SS_ISCONNECTED) == 0) &&
3966 (so->so_proto->pr_flags & PR_CONNREQUIRED))
3967 return (0);
3968 else if (kn->kn_sfflags & NOTE_LOWAT)
3969 return (kn->kn_data >= kn->kn_sdata);
3970 else
3971 return (kn->kn_data >= so->so_snd.sb_lowat);
3972 }
3973
3974 static int
3975 filt_soempty(struct knote *kn, long hint)
3976 {
3977 struct socket *so;
3978
3979 so = kn->kn_fp->f_data;
3980
3981 if (SOLISTENING(so))
3982 return (1);
3983
3984 SOCKBUF_LOCK_ASSERT(&so->so_snd);
3985 kn->kn_data = sbused(&so->so_snd);
3986
3987 if (kn->kn_data == 0)
3988 return (1);
3989 else
3990 return (0);
3991 }
3992
3993 int
3994 socheckuid(struct socket *so, uid_t uid)
3995 {
3996
3997 if (so == NULL)
3998 return (EPERM);
3999 if (so->so_cred->cr_uid != uid)
4000 return (EPERM);
4001 return (0);
4002 }
4003
4004 /*
4005 * These functions are used by protocols to notify the socket layer (and its
4006 * consumers) of state changes in the sockets driven by protocol-side events.
4007 */
4008
4009 /*
4010 * Procedures to manipulate state flags of socket and do appropriate wakeups.
4011 *
4012 * Normal sequence from the active (originating) side is that
4013 * soisconnecting() is called during processing of connect() call, resulting
4014 * in an eventual call to soisconnected() if/when the connection is
4015 * established. When the connection is torn down soisdisconnecting() is
4016 * called during processing of disconnect() call, and soisdisconnected() is
4017 * called when the connection to the peer is totally severed. The semantics
4018 * of these routines are such that connectionless protocols can call
4019 * soisconnected() and soisdisconnected() only, bypassing the in-progress
4020 * calls when setting up a ``connection'' takes no time.
4021 *
 * From the passive side, a listening socket has two queues of sockets:
 * sol_incomp for connections in progress and sol_comp for connections
 * already made and awaiting user acceptance.  As a protocol is preparing
 * incoming connections, it creates a socket structure queued on sol_incomp
 * by calling sonewconn().  When the connection is established,
 * soisconnected() is called, and transfers the socket structure to
 * sol_comp, making it available to accept().
4029 *
 * If a socket is closed with sockets on either sol_incomp or sol_comp,
 * these sockets are dropped.
4032 *
4033 * If higher-level protocols are implemented in the kernel, the wakeups done
4034 * here will sometimes cause software-interrupt process scheduling.
4035 */
4036 void
4037 soisconnecting(struct socket *so)
4038 {
4039
4040 SOCK_LOCK(so);
4041 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
4042 so->so_state |= SS_ISCONNECTING;
4043 SOCK_UNLOCK(so);
4044 }
4045
/*
 * Mark a socket as connected.
 *
 * Clears SS_ISCONNECTING, SS_ISDISCONNECTING and SS_ISCONFIRMING and sets
 * SS_ISCONNECTED under the socket lock.
 *
 * If the socket sits on a listener's incomplete queue (so_qstate ==
 * SQ_INCOMP) one of two things happens with both the listener and the
 * socket locked:
 *  - without SO_ACCEPTFILTER on so, the socket is moved from sol_incomp to
 *    sol_comp, the queue counters are adjusted and the listener is woken
 *    through solisten_wakeup();
 *  - with SO_ACCEPTFILTER, the listener's accept filter callback is
 *    installed as the receive upcall, the option bit is cleared, and the
 *    callback is invoked once.  If it answers SU_ISCONNECTED the upcall is
 *    removed again and the socket is promoted as in the first case.
 *
 * For any other socket, sleepers on so_timeo are woken and both
 * sorwakeup() and sowwakeup() are called.
 */
void
soisconnected(struct socket *so)
{
	/* Only consumed by the KASSERT below. */
	bool last __diagused;

	SOCK_LOCK(so);
	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	if (so->so_qstate == SQ_INCOMP) {
		struct socket *head = so->so_listen;
		int ret;

		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
		/*
		 * Promoting a socket from incomplete queue to complete, we
		 * need to go through reverse order of locking.  We first do
		 * trylock, and if that doesn't succeed, we go the hard way
		 * leaving a reference and rechecking consistency after proper
		 * locking.
		 */
		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
			/* Pin head so it survives while so is unlocked. */
			soref(head);
			SOCK_UNLOCK(so);
			SOLISTEN_LOCK(head);
			SOCK_LOCK(so);
			if (__predict_false(head != so->so_listen)) {
				/*
				 * The socket went off the listen queue,
				 * should be lost race to close(2) of sol.
				 * The socket is about to soabort().
				 */
				SOCK_UNLOCK(so);
				sorele(head);
				return;
			}
			/* Drop the temporary reference taken above. */
			last = refcount_release(&head->so_count);
			KASSERT(!last, ("%s: released last reference for %p",
			    __func__, head));
		}
again:
		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
			/* Move so from the incomplete to the complete queue. */
			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
			head->sol_incqlen--;
			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
			head->sol_qlen++;
			so->so_qstate = SQ_COMP;
			SOCK_UNLOCK(so);
			solisten_wakeup(head);	/* unlocks */
		} else {
			/*
			 * Install the accept filter as receive upcall and run
			 * it once right away; SO_ACCEPTFILTER is cleared so
			 * that a jump back to "again" takes the promotion
			 * branch.
			 */
			SOCKBUF_LOCK(&so->so_rcv);
			soupcall_set(so, SO_RCV,
			    head->sol_accept_filter->accf_callback,
			    head->sol_accept_filter_arg);
			so->so_options &= ~SO_ACCEPTFILTER;
			ret = head->sol_accept_filter->accf_callback(so,
			    head->sol_accept_filter_arg, M_NOWAIT);
			if (ret == SU_ISCONNECTED) {
				soupcall_clear(so, SO_RCV);
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto again;
			}
			SOCKBUF_UNLOCK(&so->so_rcv);
			SOCK_UNLOCK(so);
			SOLISTEN_UNLOCK(head);
		}
		return;
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
}
4119
/*
 * Mark a socket as being in the process of disconnecting.
 *
 * Under the socket lock SS_ISCONNECTING is cleared and SS_ISDISCONNECTING
 * is set.  For a non-listening socket both directions are then shut:
 * socantrcvmore_locked() and socantsendmore_locked() are called with the
 * respective socket buffer locked.  Finally sleepers on so_timeo are woken.
 */
void
soisdisconnecting(struct socket *so)
{

	SOCK_LOCK(so);
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= SS_ISDISCONNECTING;

	if (!SOLISTENING(so)) {
		/*
		 * NOTE(review): no SOCKBUF_UNLOCK appears here; presumably
		 * the *_locked helpers release the buffer lock -- confirm.
		 */
		SOCKBUF_LOCK(&so->so_rcv);
		socantrcvmore_locked(so);
		SOCKBUF_LOCK(&so->so_snd);
		socantsendmore_locked(so);
	}
	SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}
4137
/*
 * Mark a socket as fully disconnected.
 *
 * Sets SS_ISDISCONNECTED and then clears SS_ISCONNECTING, SS_ISCONNECTED
 * and SS_ISDISCONNECTING.  For a non-listening socket the socket lock is
 * dropped first, then both directions are shut and everything still queued
 * in the send buffer is discarded.  Finally sleepers on so_timeo are woken.
 */
void
soisdisconnected(struct socket *so)
{

	SOCK_LOCK(so);

	/*
	 * There is at least one reader of so_state that does not
	 * acquire socket lock, namely soreceive_generic().  Ensure
	 * that it never sees all flags that track connection status
	 * cleared, by ordering the update with a barrier semantic of
	 * our release thread fence.
	 */
	so->so_state |= SS_ISDISCONNECTED;
	atomic_thread_fence_rel();
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);

	if (!SOLISTENING(so)) {
		SOCK_UNLOCK(so);
		/*
		 * NOTE(review): no SOCKBUF_UNLOCK appears here; presumably
		 * the *_locked helpers release the buffer lock -- confirm.
		 */
		SOCKBUF_LOCK(&so->so_rcv);
		socantrcvmore_locked(so);
		SOCKBUF_LOCK(&so->so_snd);
		/* Throw away all unsent data before shutting the send side. */
		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
		socantsendmore_locked(so);
	} else
		SOCK_UNLOCK(so);
	wakeup(&so->so_timeo);
}
4166
4167 int
4168 soiolock(struct socket *so, struct sx *sx, int flags)
4169 {
4170 int error;
4171
4172 KASSERT((flags & SBL_VALID) == flags,
4173 ("soiolock: invalid flags %#x", flags));
4174
4175 if ((flags & SBL_WAIT) != 0) {
4176 if ((flags & SBL_NOINTR) != 0) {
4177 sx_xlock(sx);
4178 } else {
4179 error = sx_xlock_sig(sx);
4180 if (error != 0)
4181 return (error);
4182 }
4183 } else if (!sx_try_xlock(sx)) {
4184 return (EWOULDBLOCK);
4185 }
4186
4187 if (__predict_false(SOLISTENING(so))) {
4188 sx_xunlock(sx);
4189 return (ENOTCONN);
4190 }
4191 return (0);
4192 }
4193
/* Release an exclusively held socket I/O lock (counterpart of soiolock()). */
void
soiounlock(struct sx *sx)
{

	sx_xunlock(sx);
}
4199
4200 /*
4201 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
4202 */
4203 struct sockaddr *
4204 sodupsockaddr(const struct sockaddr *sa, int mflags)
4205 {
4206 struct sockaddr *sa2;
4207
4208 sa2 = malloc(sa->sa_len, M_SONAME, mflags);
4209 if (sa2)
4210 bcopy(sa, sa2, sa->sa_len);
4211 return sa2;
4212 }
4213
4214 /*
4215 * Register per-socket destructor.
4216 */
4217 void
4218 sodtor_set(struct socket *so, so_dtor_t *func)
4219 {
4220
4221 SOCK_LOCK_ASSERT(so);
4222 so->so_dtor = func;
4223 }
4224
4225 /*
4226 * Register per-socket buffer upcalls.
4227 */
4228 void
4229 soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg)
4230 {
4231 struct sockbuf *sb;
4232
4233 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
4234
4235 switch (which) {
4236 case SO_RCV:
4237 sb = &so->so_rcv;
4238 break;
4239 case SO_SND:
4240 sb = &so->so_snd;
4241 break;
4242 default:
4243 panic("soupcall_set: bad which");
4244 }
4245 SOCKBUF_LOCK_ASSERT(sb);
4246 sb->sb_upcall = func;
4247 sb->sb_upcallarg = arg;
4248 sb->sb_flags |= SB_UPCALL;
4249 }
4250
4251 void
4252 soupcall_clear(struct socket *so, int which)
4253 {
4254 struct sockbuf *sb;
4255
4256 KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
4257
4258 switch (which) {
4259 case SO_RCV:
4260 sb = &so->so_rcv;
4261 break;
4262 case SO_SND:
4263 sb = &so->so_snd;
4264 break;
4265 default:
4266 panic("soupcall_clear: bad which");
4267 }
4268 SOCKBUF_LOCK_ASSERT(sb);
4269 KASSERT(sb->sb_upcall != NULL,
4270 ("%s: so %p no upcall to clear", __func__, so));
4271 sb->sb_upcall = NULL;
4272 sb->sb_upcallarg = NULL;
4273 sb->sb_flags &= ~SB_UPCALL;
4274 }
4275
4276 void
4277 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
4278 {
4279
4280 SOLISTEN_LOCK_ASSERT(so);
4281 so->sol_upcall = func;
4282 so->sol_upcallarg = arg;
4283 }
4284
4285 static void
4286 so_rdknl_lock(void *arg)
4287 {
4288 struct socket *so = arg;
4289
4290 if (SOLISTENING(so))
4291 SOCK_LOCK(so);
4292 else
4293 SOCKBUF_LOCK(&so->so_rcv);
4294 }
4295
4296 static void
4297 so_rdknl_unlock(void *arg)
4298 {
4299 struct socket *so = arg;
4300
4301 if (SOLISTENING(so))
4302 SOCK_UNLOCK(so);
4303 else
4304 SOCKBUF_UNLOCK(&so->so_rcv);
4305 }
4306
4307 static void
4308 so_rdknl_assert_lock(void *arg, int what)
4309 {
4310 struct socket *so = arg;
4311
4312 if (what == LA_LOCKED) {
4313 if (SOLISTENING(so))
4314 SOCK_LOCK_ASSERT(so);
4315 else
4316 SOCKBUF_LOCK_ASSERT(&so->so_rcv);
4317 } else {
4318 if (SOLISTENING(so))
4319 SOCK_UNLOCK_ASSERT(so);
4320 else
4321 SOCKBUF_UNLOCK_ASSERT(&so->so_rcv);
4322 }
4323 }
4324
4325 static void
4326 so_wrknl_lock(void *arg)
4327 {
4328 struct socket *so = arg;
4329
4330 if (SOLISTENING(so))
4331 SOCK_LOCK(so);
4332 else
4333 SOCKBUF_LOCK(&so->so_snd);
4334 }
4335
4336 static void
4337 so_wrknl_unlock(void *arg)
4338 {
4339 struct socket *so = arg;
4340
4341 if (SOLISTENING(so))
4342 SOCK_UNLOCK(so);
4343 else
4344 SOCKBUF_UNLOCK(&so->so_snd);
4345 }
4346
4347 static void
4348 so_wrknl_assert_lock(void *arg, int what)
4349 {
4350 struct socket *so = arg;
4351
4352 if (what == LA_LOCKED) {
4353 if (SOLISTENING(so))
4354 SOCK_LOCK_ASSERT(so);
4355 else
4356 SOCKBUF_LOCK_ASSERT(&so->so_snd);
4357 } else {
4358 if (SOLISTENING(so))
4359 SOCK_UNLOCK_ASSERT(so);
4360 else
4361 SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
4362 }
4363 }
4364
4365 /*
4366 * Create an external-format (``xsocket'') structure using the information in
4367 * the kernel-format socket structure pointed to by so. This is done to
4368 * reduce the spew of irrelevant information over this interface, to isolate
4369 * user code from changes in the kernel structure, and potentially to provide
4370 * information-hiding if we decide that some of this information should be
4371 * hidden from users.
4372 */
4373 void
4374 sotoxsocket(struct socket *so, struct xsocket *xso)
4375 {
4376
4377 bzero(xso, sizeof(*xso));
4378 xso->xso_len = sizeof *xso;
4379 xso->xso_so = (uintptr_t)so;
4380 xso->so_type = so->so_type;
4381 xso->so_options = so->so_options;
4382 xso->so_linger = so->so_linger;
4383 xso->so_state = so->so_state;
4384 xso->so_pcb = (uintptr_t)so->so_pcb;
4385 xso->xso_protocol = so->so_proto->pr_protocol;
4386 xso->xso_family = so->so_proto->pr_domain->dom_family;
4387 xso->so_timeo = so->so_timeo;
4388 xso->so_error = so->so_error;
4389 xso->so_uid = so->so_cred->cr_uid;
4390 xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
4391 if (SOLISTENING(so)) {
4392 xso->so_qlen = so->sol_qlen;
4393 xso->so_incqlen = so->sol_incqlen;
4394 xso->so_qlimit = so->sol_qlimit;
4395 xso->so_oobmark = 0;
4396 } else {
4397 xso->so_state |= so->so_qstate;
4398 xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
4399 xso->so_oobmark = so->so_oobmark;
4400 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
4401 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
4402 }
4403 }
4404
4405 struct sockbuf *
4406 so_sockbuf_rcv(struct socket *so)
4407 {
4408
4409 return (&so->so_rcv);
4410 }
4411
4412 struct sockbuf *
4413 so_sockbuf_snd(struct socket *so)
4414 {
4415
4416 return (&so->so_snd);
4417 }
4418
4419 int
4420 so_state_get(const struct socket *so)
4421 {
4422
4423 return (so->so_state);
4424 }
4425
4426 void
4427 so_state_set(struct socket *so, int val)
4428 {
4429
4430 so->so_state = val;
4431 }
4432
4433 int
4434 so_options_get(const struct socket *so)
4435 {
4436
4437 return (so->so_options);
4438 }
4439
4440 void
4441 so_options_set(struct socket *so, int val)
4442 {
4443
4444 so->so_options = val;
4445 }
4446
4447 int
4448 so_error_get(const struct socket *so)
4449 {
4450
4451 return (so->so_error);
4452 }
4453
4454 void
4455 so_error_set(struct socket *so, int val)
4456 {
4457
4458 so->so_error = val;
4459 }
4460
4461 int
4462 so_linger_get(const struct socket *so)
4463 {
4464
4465 return (so->so_linger);
4466 }
4467
4468 void
4469 so_linger_set(struct socket *so, int val)
4470 {
4471
4472 KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
4473 ("%s: val %d out of range", __func__, val));
4474
4475 so->so_linger = val;
4476 }
4477
4478 struct protosw *
4479 so_protosw_get(const struct socket *so)
4480 {
4481
4482 return (so->so_proto);
4483 }
4484
4485 void
4486 so_protosw_set(struct socket *so, struct protosw *val)
4487 {
4488
4489 so->so_proto = val;
4490 }
4491
/* Function wrapper around sorwakeup(). */
void
so_sorwakeup(struct socket *so)
{
	sorwakeup(so);
}
4498
/* Function wrapper around sowwakeup(). */
void
so_sowwakeup(struct socket *so)
{
	sowwakeup(so);
}
4505
/* Function wrapper around sorwakeup_locked(). */
void
so_sorwakeup_locked(struct socket *so)
{
	sorwakeup_locked(so);
}
4512
/* Function wrapper around sowwakeup_locked(). */
void
so_sowwakeup_locked(struct socket *so)
{
	sowwakeup_locked(so);
}
4519
/* Function wrapper that acquires the socket lock via SOCK_LOCK(). */
void
so_lock(struct socket *so)
{
	SOCK_LOCK(so);
}
4526
/* Function wrapper that releases the socket lock via SOCK_UNLOCK(). */
void
so_unlock(struct socket *so)
{
	SOCK_UNLOCK(so);
}
4533