1 /* $OpenBSD: uipc_socket.c,v 1.372 2025/02/06 13:40:57 mvs Exp $ */
2 /* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1988, 1990, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
33 */
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/proc.h>
38 #include <sys/file.h>
39 #include <sys/filedesc.h>
40 #include <sys/malloc.h>
41 #include <sys/mbuf.h>
42 #include <sys/domain.h>
43 #include <sys/event.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/unpcb.h>
47 #include <sys/socketvar.h>
48 #include <sys/signalvar.h>
49 #include <sys/pool.h>
50 #include <sys/atomic.h>
51 #include <sys/rwlock.h>
52 #include <sys/time.h>
53 #include <sys/refcnt.h>
54
55 #ifdef DDB
56 #include <machine/db_machdep.h>
57 #endif
58
59 void sbsync(struct sockbuf *, struct mbuf *);
60
61 int sosplice(struct socket *, int, off_t, struct timeval *);
62 void sounsplice(struct socket *, struct socket *, int);
63 void soidle(void *);
64 void sotask(void *);
65 int somove(struct socket *, int);
66 void sorflush(struct socket *);
67
/* Handlers referenced by soread_filtops. */
int	filt_soread(struct knote *kn, long hint);
int	filt_sormodify(struct kevent *kev, struct knote *kn);
int	filt_sorprocess(struct knote *kn, struct kevent *kev);
void	filt_sordetach(struct knote *kn);

/* Handlers referenced by sowrite_filtops. */
int	filt_sowrite(struct knote *kn, long hint);
int	filt_sowmodify(struct kevent *kev, struct knote *kn);
int	filt_sowprocess(struct knote *kn, struct kevent *kev);
void	filt_sowdetach(struct knote *kn);

/* Handlers referenced by soexcept_filtops (it reuses filt_sordetach). */
int	filt_soexcept(struct knote *kn, long hint);
int	filt_soemodify(struct kevent *kev, struct knote *kn);
int	filt_soeprocess(struct knote *kn, struct kevent *kev);
82
83 const struct filterops soread_filtops = {
84 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
85 .f_attach = NULL,
86 .f_detach = filt_sordetach,
87 .f_event = filt_soread,
88 .f_modify = filt_sormodify,
89 .f_process = filt_sorprocess,
90 };
91
92 const struct filterops sowrite_filtops = {
93 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
94 .f_attach = NULL,
95 .f_detach = filt_sowdetach,
96 .f_event = filt_sowrite,
97 .f_modify = filt_sowmodify,
98 .f_process = filt_sowprocess,
99 };
100
101 const struct filterops soexcept_filtops = {
102 .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
103 .f_attach = NULL,
104 .f_detach = filt_sordetach,
105 .f_event = filt_soexcept,
106 .f_modify = filt_soemodify,
107 .f_process = filt_soeprocess,
108 };
109
110 #ifndef SOMINCONN
111 #define SOMINCONN 80
112 #endif /* SOMINCONN */
113
114 int somaxconn = SOMAXCONN;
115 int sominconn = SOMINCONN;
116
117 struct pool socket_pool;
118 #ifdef SOCKET_SPLICE
119 struct pool sosplice_pool;
120 struct taskq *sosplice_taskq;
121 struct rwlock sosplice_lock = RWLOCK_INITIALIZER("sosplicelk");
122 #endif
123
124 void
soinit(void)125 soinit(void)
126 {
127 pool_init(&socket_pool, sizeof(struct socket), 0, IPL_SOFTNET, 0,
128 "sockpl", NULL);
129 #ifdef SOCKET_SPLICE
130 pool_init(&sosplice_pool, sizeof(struct sosplice), 0, IPL_SOFTNET, 0,
131 "sosppl", NULL);
132 #endif
133 }
134
135 struct socket *
soalloc(const struct protosw * prp,int wait)136 soalloc(const struct protosw *prp, int wait)
137 {
138 const struct domain *dp = prp->pr_domain;
139 const char *dom_name = dp->dom_name;
140 struct socket *so;
141
142 so = pool_get(&socket_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
143 PR_ZERO);
144 if (so == NULL)
145 return (NULL);
146
147 #ifdef WITNESS
148 /*
149 * XXX: Make WITNESS happy. AF_INET and AF_INET6 sockets could be
150 * spliced together.
151 */
152 switch (dp->dom_family) {
153 case AF_INET:
154 case AF_INET6:
155 dom_name = "inet46";
156 break;
157 }
158 #endif
159
160 refcnt_init_trace(&so->so_refcnt, DT_REFCNT_IDX_SOCKET);
161 rw_init_flags(&so->so_lock, dom_name, RWL_DUPOK);
162 rw_init(&so->so_rcv.sb_lock, "sbufrcv");
163 rw_init(&so->so_snd.sb_lock, "sbufsnd");
164 mtx_init_flags(&so->so_rcv.sb_mtx, IPL_MPFLOOR, "sbrcv", 0);
165 mtx_init_flags(&so->so_snd.sb_mtx, IPL_MPFLOOR, "sbsnd", 0);
166 klist_init_mutex(&so->so_rcv.sb_klist, &so->so_rcv.sb_mtx);
167 klist_init_mutex(&so->so_snd.sb_klist, &so->so_snd.sb_mtx);
168 sigio_init(&so->so_sigio);
169 TAILQ_INIT(&so->so_q0);
170 TAILQ_INIT(&so->so_q);
171
172 return (so);
173 }
174
175 /*
176 * Socket operation routines.
177 * These routines are called by the routines in
178 * sys_socket.c or from a system process, and
179 * implement the semantics of socket operations by
180 * switching out to the protocol specific routines.
181 */
182 int
socreate(int dom,struct socket ** aso,int type,int proto)183 socreate(int dom, struct socket **aso, int type, int proto)
184 {
185 struct proc *p = curproc; /* XXX */
186 const struct protosw *prp;
187 struct socket *so;
188 int error;
189
190 if (proto)
191 prp = pffindproto(dom, proto, type);
192 else
193 prp = pffindtype(dom, type);
194 if (prp == NULL || prp->pr_usrreqs == NULL)
195 return (EPROTONOSUPPORT);
196 if (prp->pr_type != type)
197 return (EPROTOTYPE);
198 so = soalloc(prp, M_WAIT);
199 so->so_type = type;
200 if (suser(p) == 0)
201 so->so_state = SS_PRIV;
202 so->so_ruid = p->p_ucred->cr_ruid;
203 so->so_euid = p->p_ucred->cr_uid;
204 so->so_rgid = p->p_ucred->cr_rgid;
205 so->so_egid = p->p_ucred->cr_gid;
206 so->so_cpid = p->p_p->ps_pid;
207 so->so_proto = prp;
208 so->so_snd.sb_timeo_nsecs = INFSLP;
209 so->so_rcv.sb_timeo_nsecs = INFSLP;
210
211 solock_shared(so);
212 error = pru_attach(so, proto, M_WAIT);
213 if (error) {
214 so->so_state |= SS_NOFDREF;
215 /* sofree() calls sounlock(). */
216 soref(so);
217 sofree(so, 1);
218 sounlock_shared(so);
219 sorele(so);
220 return (error);
221 }
222 sounlock_shared(so);
223 *aso = so;
224 return (0);
225 }
226
/*
 * Bind `so' to the address in `nam' on behalf of process `p' by
 * handing the request to pru_bind().  The socket must be locked.
 * Returns the pru_bind() result.
 */
int
sobind(struct socket *so, struct mbuf *nam, struct proc *p)
{
	int error;

	soassertlocked(so);

	error = pru_bind(so, nam, p);
	return error;
}
233
234 int
solisten(struct socket * so,int backlog)235 solisten(struct socket *so, int backlog)
236 {
237 int somaxconn_local = atomic_load_int(&somaxconn);
238 int sominconn_local = atomic_load_int(&sominconn);
239 int error;
240
241 switch (so->so_type) {
242 case SOCK_STREAM:
243 case SOCK_SEQPACKET:
244 break;
245 default:
246 return (EOPNOTSUPP);
247 }
248
249 soassertlocked(so);
250
251 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
252 return (EINVAL);
253 #ifdef SOCKET_SPLICE
254 if (isspliced(so) || issplicedback(so))
255 return (EOPNOTSUPP);
256 #endif /* SOCKET_SPLICE */
257 error = pru_listen(so);
258 if (error)
259 return (error);
260 if (TAILQ_FIRST(&so->so_q) == NULL)
261 so->so_options |= SO_ACCEPTCONN;
262 if (backlog < 0 || backlog > somaxconn_local)
263 backlog = somaxconn_local;
264 if (backlog < sominconn_local)
265 backlog = sominconn_local;
266 so->so_qlimit = backlog;
267 return (0);
268 }
269
270 void
sorele(struct socket * so)271 sorele(struct socket *so)
272 {
273 if (refcnt_rele(&so->so_refcnt) == 0)
274 return;
275
276 sigio_free(&so->so_sigio);
277 klist_free(&so->so_rcv.sb_klist);
278 klist_free(&so->so_snd.sb_klist);
279
280 mtx_enter(&so->so_snd.sb_mtx);
281 sbrelease(so, &so->so_snd);
282 mtx_leave(&so->so_snd.sb_mtx);
283
284 if (so->so_proto->pr_flags & PR_RIGHTS &&
285 so->so_proto->pr_domain->dom_dispose)
286 (*so->so_proto->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
287 m_purge(so->so_rcv.sb_mb);
288
289 #ifdef SOCKET_SPLICE
290 if (so->so_sp)
291 pool_put(&sosplice_pool, so->so_sp);
292 #endif
293 pool_put(&socket_pool, so);
294 }
295
296 #define SOSP_FREEING_READ 1
297 #define SOSP_FREEING_WRITE 2
298 void
sofree(struct socket * so,int keep_lock)299 sofree(struct socket *so, int keep_lock)
300 {
301 int persocket = solock_persocket(so);
302
303 soassertlocked(so);
304
305 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
306 if (!keep_lock)
307 sounlock(so);
308 return;
309 }
310 if (so->so_head) {
311 struct socket *head = so->so_head;
312
313 /*
314 * We must not decommission a socket that's on the accept(2)
315 * queue. If we do, then accept(2) may hang after select(2)
316 * indicated that the listening socket was ready.
317 */
318 if (so->so_onq == &head->so_q) {
319 if (!keep_lock)
320 sounlock(so);
321 return;
322 }
323
324 if (persocket) {
325 soref(head);
326 sounlock(so);
327 solock(head);
328 solock(so);
329
330 if (so->so_onq != &head->so_q0) {
331 sounlock(so);
332 sounlock(head);
333 sorele(head);
334 return;
335 }
336 }
337
338 soqremque(so, 0);
339
340 if (persocket) {
341 sounlock(head);
342 sorele(head);
343 }
344 }
345
346 if (!keep_lock)
347 sounlock(so);
348 sorele(so);
349 }
350
351 static inline uint64_t
solinger_nsec(struct socket * so)352 solinger_nsec(struct socket *so)
353 {
354 if (so->so_linger == 0)
355 return INFSLP;
356
357 return SEC_TO_NSEC(so->so_linger);
358 }
359
360 /*
361 * Close a socket on last file table reference removal.
362 * Initiate disconnect if connected.
363 * Free socket when disconnect complete.
364 */
365 int
soclose(struct socket * so,int flags)366 soclose(struct socket *so, int flags)
367 {
368 struct socket *so2;
369 int error = 0;
370
371 solock(so);
372 /* Revoke async IO early. There is a final revocation in sofree(). */
373 sigio_free(&so->so_sigio);
374 if (so->so_state & SS_ISCONNECTED) {
375 if (so->so_pcb == NULL)
376 goto discard;
377 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
378 error = sodisconnect(so);
379 if (error)
380 goto drop;
381 }
382 if (so->so_options & SO_LINGER) {
383 if ((so->so_state & SS_ISDISCONNECTING) &&
384 (flags & MSG_DONTWAIT))
385 goto drop;
386 while (so->so_state & SS_ISCONNECTED) {
387 error = sosleep_nsec(so, &so->so_timeo,
388 PSOCK | PCATCH, "netcls",
389 solinger_nsec(so));
390 if (error)
391 break;
392 }
393 }
394 }
395 drop:
396 if (so->so_pcb) {
397 int error2;
398 error2 = pru_detach(so);
399 if (error == 0)
400 error = error2;
401 }
402 if (so->so_options & SO_ACCEPTCONN) {
403 int persocket = solock_persocket(so);
404
405 while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
406 soref(so2);
407 solock(so2);
408 (void) soqremque(so2, 0);
409 sounlock(so);
410 soabort(so2);
411 sounlock(so2);
412 sorele(so2);
413 solock(so);
414 }
415 while ((so2 = TAILQ_FIRST(&so->so_q)) != NULL) {
416 soref(so2);
417 solock_nonet(so2);
418 (void) soqremque(so2, 1);
419 if (persocket)
420 sounlock(so);
421 soabort(so2);
422 sounlock_nonet(so2);
423 sorele(so2);
424 if (persocket)
425 solock(so);
426 }
427 }
428 discard:
429 #ifdef SOCKET_SPLICE
430 if (so->so_sp) {
431 struct socket *soback;
432
433 sounlock(so);
434 mtx_enter(&so->so_snd.sb_mtx);
435 /*
436 * Concurrent sounsplice() locks `sb_mtx' mutexes on
437 * both `so_snd' and `so_rcv' before unsplice sockets.
438 */
439 if ((soback = so->so_sp->ssp_soback) == NULL) {
440 mtx_leave(&so->so_snd.sb_mtx);
441 goto notsplicedback;
442 }
443 soref(soback);
444 mtx_leave(&so->so_snd.sb_mtx);
445
446 /*
447 * `so' can be only unspliced, and never spliced again.
448 * Thus if issplicedback(so) check is positive, socket is
449 * still spliced and `ssp_soback' points to the same
450 * socket that `soback'.
451 */
452 sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
453 if (issplicedback(so)) {
454 int freeing = SOSP_FREEING_WRITE;
455
456 if (so->so_sp->ssp_soback == so)
457 freeing |= SOSP_FREEING_READ;
458 sounsplice(so->so_sp->ssp_soback, so, freeing);
459 }
460 sbunlock(&soback->so_rcv);
461 sorele(soback);
462
463 notsplicedback:
464 sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
465 if (isspliced(so)) {
466 struct socket *sosp;
467 int freeing = SOSP_FREEING_READ;
468
469 if (so == so->so_sp->ssp_socket)
470 freeing |= SOSP_FREEING_WRITE;
471 sosp = soref(so->so_sp->ssp_socket);
472 sounsplice(so, so->so_sp->ssp_socket, freeing);
473 sorele(sosp);
474 }
475 sbunlock(&so->so_rcv);
476
477 timeout_del_barrier(&so->so_sp->ssp_idleto);
478 task_del(sosplice_taskq, &so->so_sp->ssp_task);
479 taskq_barrier(sosplice_taskq);
480
481 solock(so);
482 }
483 #endif /* SOCKET_SPLICE */
484
485 if (so->so_state & SS_NOFDREF)
486 panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
487 so->so_state |= SS_NOFDREF;
488
489 /* sofree() calls sounlock(). */
490 sofree(so, 0);
491 return (error);
492 }
493
/*
 * Abort `so' by handing it to pru_abort().  The caller must hold the
 * socket lock.
 */
void
soabort(struct socket *so)
{
	soassertlocked(so);

	pru_abort(so);
}
500
501 int
soaccept(struct socket * so,struct mbuf * nam)502 soaccept(struct socket *so, struct mbuf *nam)
503 {
504 int error = 0;
505
506 soassertlocked(so);
507
508 if ((so->so_state & SS_NOFDREF) == 0)
509 panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
510 so->so_state &= ~SS_NOFDREF;
511 if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
512 (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
513 error = pru_accept(so, nam);
514 else
515 error = ECONNABORTED;
516 return (error);
517 }
518
519 int
soconnect(struct socket * so,struct mbuf * nam)520 soconnect(struct socket *so, struct mbuf *nam)
521 {
522 int error;
523
524 soassertlocked(so);
525
526 if (so->so_options & SO_ACCEPTCONN)
527 return (EOPNOTSUPP);
528 /*
529 * If protocol is connection-based, can only connect once.
530 * Otherwise, if connected, try to disconnect first.
531 * This allows user to disconnect by connecting to, e.g.,
532 * a null address.
533 */
534 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
535 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
536 (error = sodisconnect(so))))
537 error = EISCONN;
538 else
539 error = pru_connect(so, nam);
540 return (error);
541 }
542
/*
 * Connect two sockets to each other through pru_connect2().  Both
 * sockets are locked as a pair around the call.  Returns the
 * pru_connect2() result.
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int rv;

	solock_pair(so1, so2);
	rv = pru_connect2(so1, so2);
	sounlock_pair(so1, so2);

	return (rv);
}
554
555 int
sodisconnect(struct socket * so)556 sodisconnect(struct socket *so)
557 {
558 int error;
559
560 soassertlocked(so);
561
562 if ((so->so_state & SS_ISCONNECTED) == 0)
563 return (ENOTCONN);
564 if (so->so_state & SS_ISDISCONNECTING)
565 return (EALREADY);
566 error = pru_disconnect(so);
567 return (error);
568 }
569
/* Builds an mbuf chain from a uio; defined later in this file. */
int	m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio);

/* sblock() flags for a send/receive: SBL_WAIT unless MSG_DONTWAIT is set. */
#define	SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) != 0) ? 0 : SBL_WAIT)
573 /*
574 * Send on a socket.
575 * If send must go all at once and message is larger than
576 * send buffering, then hard error.
577 * Lock against other senders.
578 * If must go all at once and not enough room now, then
579 * inform user that this would block and do nothing.
580 * Otherwise, if nonblocking, send as much as possible.
581 * The data to be sent is described by "uio" if nonzero,
582 * otherwise by the mbuf chain "top" (which must be null
583 * if uio is not). Data provided in mbuf chain must be small
584 * enough to send all at once.
585 *
586 * Returns nonzero on error, timeout or signal; callers
587 * must check for short counts if EINTR/ERESTART are returned.
588 * Data and control buffers are freed on return.
589 */
590 int
sosend(struct socket * so,struct mbuf * addr,struct uio * uio,struct mbuf * top,struct mbuf * control,int flags)591 sosend(struct socket *so, struct mbuf *addr, struct uio *uio, struct mbuf *top,
592 struct mbuf *control, int flags)
593 {
594 long space, clen = 0;
595 size_t resid;
596 int error;
597 int atomic = sosendallatonce(so) || top;
598
599 if (uio)
600 resid = uio->uio_resid;
601 else
602 resid = top->m_pkthdr.len;
603 /* MSG_EOR on a SOCK_STREAM socket is invalid. */
604 if (so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
605 m_freem(top);
606 m_freem(control);
607 return (EINVAL);
608 }
609 if (uio && uio->uio_procp)
610 uio->uio_procp->p_ru.ru_msgsnd++;
611 if (control) {
612 /*
613 * In theory clen should be unsigned (since control->m_len is).
614 * However, space must be signed, as it might be less than 0
615 * if we over-committed, and we must use a signed comparison
616 * of space and clen.
617 */
618 clen = control->m_len;
619 /* reserve extra space for AF_UNIX's internalize */
620 if (so->so_proto->pr_domain->dom_family == AF_UNIX &&
621 clen >= CMSG_ALIGN(sizeof(struct cmsghdr)) &&
622 mtod(control, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
623 clen = CMSG_SPACE(
624 (clen - CMSG_ALIGN(sizeof(struct cmsghdr))) *
625 (sizeof(struct fdpass) / sizeof(int)));
626 }
627
628 #define snderr(errno) { error = errno; goto release; }
629
630 restart:
631 if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
632 goto out;
633 mtx_enter(&so->so_snd.sb_mtx);
634 so->so_snd.sb_state |= SS_ISSENDING;
635 do {
636 if (so->so_snd.sb_state & SS_CANTSENDMORE)
637 snderr(EPIPE);
638 if ((error = READ_ONCE(so->so_error))) {
639 so->so_error = 0;
640 snderr(error);
641 }
642 if ((so->so_state & SS_ISCONNECTED) == 0) {
643 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
644 if (!(resid == 0 && clen != 0))
645 snderr(ENOTCONN);
646 } else if (addr == NULL)
647 snderr(EDESTADDRREQ);
648 }
649 space = sbspace_locked(&so->so_snd);
650 if (flags & MSG_OOB)
651 space += 1024;
652 if (so->so_proto->pr_domain->dom_family == AF_UNIX) {
653 if (atomic && resid > so->so_snd.sb_hiwat)
654 snderr(EMSGSIZE);
655 } else {
656 if (clen > so->so_snd.sb_hiwat ||
657 (atomic && resid > so->so_snd.sb_hiwat - clen))
658 snderr(EMSGSIZE);
659 }
660 if (space < clen ||
661 (space - clen < resid &&
662 (atomic || space < so->so_snd.sb_lowat))) {
663 if (flags & MSG_DONTWAIT)
664 snderr(EWOULDBLOCK);
665 sbunlock(&so->so_snd);
666 error = sbwait(&so->so_snd);
667 so->so_snd.sb_state &= ~SS_ISSENDING;
668 mtx_leave(&so->so_snd.sb_mtx);
669 if (error)
670 goto out;
671 goto restart;
672 }
673 space -= clen;
674 do {
675 if (uio == NULL) {
676 /*
677 * Data is prepackaged in "top".
678 */
679 resid = 0;
680 if (flags & MSG_EOR)
681 top->m_flags |= M_EOR;
682 } else {
683 mtx_leave(&so->so_snd.sb_mtx);
684 error = m_getuio(&top, atomic, space, uio);
685 mtx_enter(&so->so_snd.sb_mtx);
686 if (error)
687 goto release;
688 space -= top->m_pkthdr.len;
689 resid = uio->uio_resid;
690 if (flags & MSG_EOR)
691 top->m_flags |= M_EOR;
692 }
693 if (resid == 0)
694 so->so_snd.sb_state &= ~SS_ISSENDING;
695 if (top && so->so_options & SO_ZEROIZE)
696 top->m_flags |= M_ZEROIZE;
697 mtx_leave(&so->so_snd.sb_mtx);
698 solock_shared(so);
699 if (flags & MSG_OOB)
700 error = pru_sendoob(so, top, addr, control);
701 else
702 error = pru_send(so, top, addr, control);
703 sounlock_shared(so);
704 mtx_enter(&so->so_snd.sb_mtx);
705 clen = 0;
706 control = NULL;
707 top = NULL;
708 if (error)
709 goto release;
710 } while (resid && space > 0);
711 } while (resid);
712
713 release:
714 so->so_snd.sb_state &= ~SS_ISSENDING;
715 mtx_leave(&so->so_snd.sb_mtx);
716 sbunlock(&so->so_snd);
717 out:
718 m_freem(top);
719 m_freem(control);
720 return (error);
721 }
722
723 int
m_getuio(struct mbuf ** mp,int atomic,long space,struct uio * uio)724 m_getuio(struct mbuf **mp, int atomic, long space, struct uio *uio)
725 {
726 struct mbuf *m, *top = NULL;
727 struct mbuf **nextp = ⊤
728 u_long len, mlen;
729 size_t resid = uio->uio_resid;
730 int error;
731
732 do {
733 if (top == NULL) {
734 MGETHDR(m, M_WAIT, MT_DATA);
735 mlen = MHLEN;
736 } else {
737 MGET(m, M_WAIT, MT_DATA);
738 mlen = MLEN;
739 }
740 /* chain mbuf together */
741 *nextp = m;
742 nextp = &m->m_next;
743
744 resid = ulmin(resid, space);
745 if (resid >= MINCLSIZE) {
746 MCLGETL(m, M_NOWAIT, ulmin(resid, MAXMCLBYTES));
747 if ((m->m_flags & M_EXT) == 0)
748 MCLGETL(m, M_NOWAIT, MCLBYTES);
749 if ((m->m_flags & M_EXT) == 0)
750 goto nopages;
751 mlen = m->m_ext.ext_size;
752 len = ulmin(mlen, resid);
753 /*
754 * For datagram protocols, leave room
755 * for protocol headers in first mbuf.
756 */
757 if (atomic && m == top && len < mlen - max_hdr)
758 m->m_data += max_hdr;
759 } else {
760 nopages:
761 len = ulmin(mlen, resid);
762 /*
763 * For datagram protocols, leave room
764 * for protocol headers in first mbuf.
765 */
766 if (atomic && m == top && len < mlen - max_hdr)
767 m_align(m, len);
768 }
769
770 error = uiomove(mtod(m, caddr_t), len, uio);
771 if (error) {
772 m_freem(top);
773 return (error);
774 }
775
776 /* adjust counters */
777 resid = uio->uio_resid;
778 space -= len;
779 m->m_len = len;
780 top->m_pkthdr.len += len;
781
782 /* Is there more space and more data? */
783 } while (space > 0 && resid > 0);
784
785 *mp = top;
786 return 0;
787 }
788
789 /*
790 * Following replacement or removal of the first mbuf on the first
791 * mbuf chain of a socket buffer, push necessary state changes back
792 * into the socket buffer so that other consumers see the values
793 * consistently. 'nextrecord' is the callers locally stored value of
794 * the original value of sb->sb_mb->m_nextpkt which must be restored
795 * when the lead mbuf changes. NOTE: 'nextrecord' may be NULL.
796 */
797 void
sbsync(struct sockbuf * sb,struct mbuf * nextrecord)798 sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
799 {
800
801 /*
802 * First, update for the new value of nextrecord. If necessary,
803 * make it the first record.
804 */
805 if (sb->sb_mb != NULL)
806 sb->sb_mb->m_nextpkt = nextrecord;
807 else
808 sb->sb_mb = nextrecord;
809
810 /*
811 * Now update any dependent socket buffer fields to reflect
812 * the new state. This is an inline of SB_EMPTY_FIXUP, with
813 * the addition of a second clause that takes care of the
814 * case where sb_mb has been updated, but remains the last
815 * record.
816 */
817 if (sb->sb_mb == NULL) {
818 sb->sb_mbtail = NULL;
819 sb->sb_lastrecord = NULL;
820 } else if (sb->sb_mb->m_nextpkt == NULL)
821 sb->sb_lastrecord = sb->sb_mb;
822 }
823
824 /*
825 * Implement receive operations on a socket.
826 * We depend on the way that records are added to the sockbuf
827 * by sbappend*. In particular, each record (mbufs linked through m_next)
828 * must begin with an address if the protocol so specifies,
829 * followed by an optional mbuf or mbufs containing ancillary data,
830 * and then zero or more mbufs of data.
831 * In order to avoid blocking network for the entire time here, we release
832 * the solock() while doing the actual copy to user space.
833 * Although the sockbuf is locked, new data may still be appended,
834 * and thus we must maintain consistency of the sockbuf during that time.
835 *
836 * The caller may receive the data as a single mbuf chain by supplying
837 * an mbuf **mp0 for use in returning the chain. The uio is then used
838 * only for the count in uio_resid.
839 */
840 int
soreceive(struct socket * so,struct mbuf ** paddr,struct uio * uio,struct mbuf ** mp0,struct mbuf ** controlp,int * flagsp,socklen_t controllen)841 soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
842 struct mbuf **mp0, struct mbuf **controlp, int *flagsp,
843 socklen_t controllen)
844 {
845 struct mbuf *m, **mp;
846 struct mbuf *cm;
847 u_long len, offset, moff;
848 int flags, error, error2, type, uio_error = 0;
849 const struct protosw *pr = so->so_proto;
850 struct mbuf *nextrecord;
851 size_t resid, orig_resid = uio->uio_resid;
852
853 mp = mp0;
854 if (paddr)
855 *paddr = NULL;
856 if (controlp)
857 *controlp = NULL;
858 if (flagsp)
859 flags = *flagsp &~ MSG_EOR;
860 else
861 flags = 0;
862 if (flags & MSG_OOB) {
863 m = m_get(M_WAIT, MT_DATA);
864 solock_shared(so);
865 error = pru_rcvoob(so, m, flags & MSG_PEEK);
866 sounlock_shared(so);
867 if (error)
868 goto bad;
869 do {
870 error = uiomove(mtod(m, caddr_t),
871 ulmin(uio->uio_resid, m->m_len), uio);
872 m = m_free(m);
873 } while (uio->uio_resid && error == 0 && m);
874 bad:
875 m_freem(m);
876 return (error);
877 }
878 if (mp)
879 *mp = NULL;
880
881 restart:
882 if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
883 return (error);
884 mtx_enter(&so->so_rcv.sb_mtx);
885
886 m = so->so_rcv.sb_mb;
887 #ifdef SOCKET_SPLICE
888 if (isspliced(so))
889 m = NULL;
890 #endif /* SOCKET_SPLICE */
891 /*
892 * If we have less data than requested, block awaiting more
893 * (subject to any timeout) if:
894 * 1. the current count is less than the low water mark,
895 * 2. MSG_WAITALL is set, and it is possible to do the entire
896 * receive operation at once if we block (resid <= hiwat), or
897 * 3. MSG_DONTWAIT is not set.
898 * If MSG_WAITALL is set but resid is larger than the receive buffer,
899 * we have to do the receive in sections, and thus risk returning
900 * a short count if a timeout or signal occurs after we start.
901 */
902 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
903 so->so_rcv.sb_cc < uio->uio_resid) &&
904 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
905 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
906 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
907 #ifdef DIAGNOSTIC
908 if (m == NULL && so->so_rcv.sb_cc)
909 #ifdef SOCKET_SPLICE
910 if (!isspliced(so))
911 #endif /* SOCKET_SPLICE */
912 panic("receive 1: so %p, so_type %d, sb_cc %lu",
913 so, so->so_type, so->so_rcv.sb_cc);
914 #endif
915 if ((error2 = READ_ONCE(so->so_error))) {
916 if (m)
917 goto dontblock;
918 error = error2;
919 if ((flags & MSG_PEEK) == 0)
920 so->so_error = 0;
921 goto release;
922 }
923 if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
924 if (m)
925 goto dontblock;
926 else if (so->so_rcv.sb_cc == 0)
927 goto release;
928 }
929 for (; m; m = m->m_next)
930 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
931 m = so->so_rcv.sb_mb;
932 goto dontblock;
933 }
934 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
935 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
936 error = ENOTCONN;
937 goto release;
938 }
939 if (uio->uio_resid == 0 && controlp == NULL)
940 goto release;
941 if (flags & MSG_DONTWAIT) {
942 error = EWOULDBLOCK;
943 goto release;
944 }
945 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
946 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
947
948 sbunlock(&so->so_rcv);
949 error = sbwait(&so->so_rcv);
950 mtx_leave(&so->so_rcv.sb_mtx);
951 if (error)
952 return (error);
953 goto restart;
954 }
955 dontblock:
956 /*
957 * On entry here, m points to the first record of the socket buffer.
958 * From this point onward, we maintain 'nextrecord' as a cache of the
959 * pointer to the next record in the socket buffer. We must keep the
960 * various socket buffer pointers and local stack versions of the
961 * pointers in sync, pushing out modifications before operations that
962 * may sleep, and re-reading them afterwards.
963 *
964 * Otherwise, we will race with the network stack appending new data
965 * or records onto the socket buffer by using inconsistent/stale
966 * versions of the field, possibly resulting in socket buffer
967 * corruption.
968 */
969 if (uio->uio_procp)
970 uio->uio_procp->p_ru.ru_msgrcv++;
971 KASSERT(m == so->so_rcv.sb_mb);
972 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
973 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
974 nextrecord = m->m_nextpkt;
975 if (pr->pr_flags & PR_ADDR) {
976 #ifdef DIAGNOSTIC
977 if (m->m_type != MT_SONAME)
978 panic("receive 1a: so %p, so_type %d, m %p, m_type %d",
979 so, so->so_type, m, m->m_type);
980 #endif
981 orig_resid = 0;
982 if (flags & MSG_PEEK) {
983 if (paddr)
984 *paddr = m_copym(m, 0, m->m_len, M_NOWAIT);
985 m = m->m_next;
986 } else {
987 sbfree(&so->so_rcv, m);
988 if (paddr) {
989 *paddr = m;
990 so->so_rcv.sb_mb = m->m_next;
991 m->m_next = NULL;
992 m = so->so_rcv.sb_mb;
993 } else {
994 so->so_rcv.sb_mb = m_free(m);
995 m = so->so_rcv.sb_mb;
996 }
997 sbsync(&so->so_rcv, nextrecord);
998 }
999 }
1000 while (m && m->m_type == MT_CONTROL && error == 0) {
1001 int skip = 0;
1002 if (flags & MSG_PEEK) {
1003 if (mtod(m, struct cmsghdr *)->cmsg_type ==
1004 SCM_RIGHTS) {
1005 /* don't leak internalized SCM_RIGHTS msgs */
1006 skip = 1;
1007 } else if (controlp)
1008 *controlp = m_copym(m, 0, m->m_len, M_NOWAIT);
1009 m = m->m_next;
1010 } else {
1011 sbfree(&so->so_rcv, m);
1012 so->so_rcv.sb_mb = m->m_next;
1013 m->m_nextpkt = m->m_next = NULL;
1014 cm = m;
1015 m = so->so_rcv.sb_mb;
1016 sbsync(&so->so_rcv, nextrecord);
1017 if (controlp) {
1018 if (pr->pr_domain->dom_externalize) {
1019 mtx_leave(&so->so_rcv.sb_mtx);
1020 error =
1021 (*pr->pr_domain->dom_externalize)
1022 (cm, controllen, flags);
1023 mtx_enter(&so->so_rcv.sb_mtx);
1024 }
1025 *controlp = cm;
1026 } else {
1027 /*
1028 * Dispose of any SCM_RIGHTS message that went
1029 * through the read path rather than recv.
1030 */
1031 if (pr->pr_domain->dom_dispose) {
1032 mtx_leave(&so->so_rcv.sb_mtx);
1033 pr->pr_domain->dom_dispose(cm);
1034 mtx_enter(&so->so_rcv.sb_mtx);
1035 }
1036 m_free(cm);
1037 }
1038 }
1039 if (m != NULL)
1040 nextrecord = so->so_rcv.sb_mb->m_nextpkt;
1041 else
1042 nextrecord = so->so_rcv.sb_mb;
1043 if (controlp && !skip)
1044 controlp = &(*controlp)->m_next;
1045 orig_resid = 0;
1046 }
1047
1048 /* If m is non-NULL, we have some data to read. */
1049 if (m) {
1050 type = m->m_type;
1051 if (type == MT_OOBDATA)
1052 flags |= MSG_OOB;
1053 if (m->m_flags & M_BCAST)
1054 flags |= MSG_BCAST;
1055 if (m->m_flags & M_MCAST)
1056 flags |= MSG_MCAST;
1057 }
1058 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
1059 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
1060
1061 moff = 0;
1062 offset = 0;
1063 while (m && uio->uio_resid > 0 && error == 0) {
1064 if (m->m_type == MT_OOBDATA) {
1065 if (type != MT_OOBDATA)
1066 break;
1067 } else if (type == MT_OOBDATA) {
1068 break;
1069 } else if (m->m_type == MT_CONTROL) {
1070 /*
1071 * If there is more than one control message in the
1072 * stream, we do a short read. Next can be received
1073 * or disposed by another system call.
1074 */
1075 break;
1076 #ifdef DIAGNOSTIC
1077 } else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
1078 panic("receive 3: so %p, so_type %d, m %p, m_type %d",
1079 so, so->so_type, m, m->m_type);
1080 #endif
1081 }
1082 so->so_rcv.sb_state &= ~SS_RCVATMARK;
1083 len = uio->uio_resid;
1084 if (so->so_oobmark && len > so->so_oobmark - offset)
1085 len = so->so_oobmark - offset;
1086 if (len > m->m_len - moff)
1087 len = m->m_len - moff;
1088 /*
1089 * If mp is set, just pass back the mbufs.
1090 * Otherwise copy them out via the uio, then free.
1091 * Sockbuf must be consistent here (points to current mbuf,
1092 * it points to next record) when we drop priority;
1093 * we must note any additions to the sockbuf when we
1094 * block interrupts again.
1095 */
1096 if (mp == NULL && uio_error == 0) {
1097 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
1098 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
1099 resid = uio->uio_resid;
1100 mtx_leave(&so->so_rcv.sb_mtx);
1101 uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
1102 mtx_enter(&so->so_rcv.sb_mtx);
1103 if (uio_error)
1104 uio->uio_resid = resid - len;
1105 } else
1106 uio->uio_resid -= len;
1107 if (len == m->m_len - moff) {
1108 if (m->m_flags & M_EOR)
1109 flags |= MSG_EOR;
1110 if (flags & MSG_PEEK) {
1111 m = m->m_next;
1112 moff = 0;
1113 orig_resid = 0;
1114 } else {
1115 nextrecord = m->m_nextpkt;
1116 sbfree(&so->so_rcv, m);
1117 if (mp) {
1118 *mp = m;
1119 mp = &m->m_next;
1120 so->so_rcv.sb_mb = m = m->m_next;
1121 *mp = NULL;
1122 } else {
1123 so->so_rcv.sb_mb = m_free(m);
1124 m = so->so_rcv.sb_mb;
1125 }
1126 /*
1127 * If m != NULL, we also know that
1128 * so->so_rcv.sb_mb != NULL.
1129 */
1130 KASSERT(so->so_rcv.sb_mb == m);
1131 if (m) {
1132 m->m_nextpkt = nextrecord;
1133 if (nextrecord == NULL)
1134 so->so_rcv.sb_lastrecord = m;
1135 } else {
1136 so->so_rcv.sb_mb = nextrecord;
1137 SB_EMPTY_FIXUP(&so->so_rcv);
1138 }
1139 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
1140 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
1141 }
1142 } else {
1143 if (flags & MSG_PEEK) {
1144 moff += len;
1145 orig_resid = 0;
1146 } else {
1147 if (mp)
1148 *mp = m_copym(m, 0, len, M_WAIT);
1149 m->m_data += len;
1150 m->m_len -= len;
1151 so->so_rcv.sb_cc -= len;
1152 so->so_rcv.sb_datacc -= len;
1153 }
1154 }
1155 if (so->so_oobmark) {
1156 if ((flags & MSG_PEEK) == 0) {
1157 so->so_oobmark -= len;
1158 if (so->so_oobmark == 0) {
1159 so->so_rcv.sb_state |= SS_RCVATMARK;
1160 break;
1161 }
1162 } else {
1163 offset += len;
1164 if (offset == so->so_oobmark)
1165 break;
1166 }
1167 }
1168 if (flags & MSG_EOR)
1169 break;
1170 /*
1171 * If the MSG_WAITALL flag is set (for non-atomic socket),
1172 * we must not quit until "uio->uio_resid == 0" or an error
1173 * termination. If a signal/timeout occurs, return
1174 * with a short count but without error.
1175 * Keep sockbuf locked against other readers.
1176 */
1177 while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
1178 !sosendallatonce(so) && !nextrecord) {
1179 if (so->so_rcv.sb_state & SS_CANTRCVMORE ||
1180 so->so_error)
1181 break;
1182 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
1183 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
1184 if (sbwait(&so->so_rcv)) {
1185 mtx_leave(&so->so_rcv.sb_mtx);
1186 sbunlock(&so->so_rcv);
1187 return (0);
1188 }
1189 if ((m = so->so_rcv.sb_mb) != NULL)
1190 nextrecord = m->m_nextpkt;
1191 }
1192 }
1193
1194 if (m && pr->pr_flags & PR_ATOMIC) {
1195 flags |= MSG_TRUNC;
1196 if ((flags & MSG_PEEK) == 0)
1197 sbdroprecord(&so->so_rcv);
1198 }
1199 if ((flags & MSG_PEEK) == 0) {
1200 if (m == NULL) {
1201 /*
1202 * First part is an inline SB_EMPTY_FIXUP(). Second
1203 * part makes sure sb_lastrecord is up-to-date if
1204 * there is still data in the socket buffer.
1205 */
1206 so->so_rcv.sb_mb = nextrecord;
1207 if (so->so_rcv.sb_mb == NULL) {
1208 so->so_rcv.sb_mbtail = NULL;
1209 so->so_rcv.sb_lastrecord = NULL;
1210 } else if (nextrecord->m_nextpkt == NULL)
1211 so->so_rcv.sb_lastrecord = nextrecord;
1212 }
1213 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
1214 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
1215 if (pr->pr_flags & PR_WANTRCVD) {
1216 mtx_leave(&so->so_rcv.sb_mtx);
1217 solock_shared(so);
1218 pru_rcvd(so);
1219 sounlock_shared(so);
1220 mtx_enter(&so->so_rcv.sb_mtx);
1221 }
1222 }
1223 if (orig_resid == uio->uio_resid && orig_resid &&
1224 (flags & MSG_EOR) == 0 &&
1225 (so->so_rcv.sb_state & SS_CANTRCVMORE) == 0) {
1226 mtx_leave(&so->so_rcv.sb_mtx);
1227 sbunlock(&so->so_rcv);
1228 goto restart;
1229 }
1230
1231 if (uio_error)
1232 error = uio_error;
1233
1234 if (flagsp)
1235 *flagsp |= flags;
1236 release:
1237 mtx_leave(&so->so_rcv.sb_mtx);
1238 sbunlock(&so->so_rcv);
1239 return (error);
1240 }
1241
1242 int
soshutdown(struct socket * so,int how)1243 soshutdown(struct socket *so, int how)
1244 {
1245 int error = 0;
1246
1247 switch (how) {
1248 case SHUT_RD:
1249 sorflush(so);
1250 break;
1251 case SHUT_RDWR:
1252 sorflush(so);
1253 /* FALLTHROUGH */
1254 case SHUT_WR:
1255 solock(so);
1256 error = pru_shutdown(so);
1257 sounlock(so);
1258 break;
1259 default:
1260 error = EINVAL;
1261 break;
1262 }
1263
1264 return (error);
1265 }
1266
1267 void
sorflush(struct socket * so)1268 sorflush(struct socket *so)
1269 {
1270 struct sockbuf *sb = &so->so_rcv;
1271 struct mbuf *m;
1272 const struct protosw *pr = so->so_proto;
1273 int error;
1274
1275 error = sblock(sb, SBL_WAIT | SBL_NOINTR);
1276 /* with SBL_WAIT and SLB_NOINTR sblock() must not fail */
1277 KASSERT(error == 0);
1278
1279 solock_shared(so);
1280 socantrcvmore(so);
1281 sounlock_shared(so);
1282 mtx_enter(&sb->sb_mtx);
1283 m = sb->sb_mb;
1284 memset(&sb->sb_startzero, 0,
1285 (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
1286 sb->sb_timeo_nsecs = INFSLP;
1287 mtx_leave(&sb->sb_mtx);
1288 sbunlock(sb);
1289
1290 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
1291 (*pr->pr_domain->dom_dispose)(m);
1292 m_purge(m);
1293 }
1294
1295 #ifdef SOCKET_SPLICE
1296
/*
 * Shorthands for the splicing state: each name expands to a member
 * access through the so_sp pointer, so "so->so_splicelen" reads
 * "so->so_sp->ssp_len".  Because of the dereference, so_sp must be
 * non-NULL wherever one of them is used.
 */
#define so_splicelen	so_sp->ssp_len
#define so_splicemax	so_sp->ssp_max
#define so_idletv	so_sp->ssp_idletv
#define so_idleto	so_sp->ssp_idleto
#define so_splicetask	so_sp->ssp_task
1302
/*
 * Splice the source socket so to the drain socket referenced by file
 * descriptor fd, or dissolve an existing splice if fd is negative.
 *
 * so	source socket; its protocol must have PR_SPLICE set.
 * fd	descriptor of the drain socket, negative to unsplice.
 * max	stored in so_splicemax; a negative value is rejected.
 * tv	optional idle timeout copied into so_idletv; with NULL the
 *	timeout is cleared.
 *
 * Returns 0 on success, otherwise an errno value: EPROTONOSUPPORT,
 * EINVAL, EPROTO (unsplice requested but so is not spliced), ENOMEM,
 * EOPNOTSUPP, ENOTCONN, EBUSY, or the error of sblock() or getsock().
 */
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
	struct file *fp;
	struct socket *sosp;
	struct taskq *tq;
	int error = 0;

	if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
		return (EPROTONOSUPPORT);
	/* NOTE(review): the "max &&" part is redundant, "max < 0" suffices. */
	if (max && max < 0)
		return (EINVAL);
	if (tv && (tv->tv_sec < 0 || !timerisvalid(tv)))
		return (EINVAL);

	/* If no fd is given, unsplice by removing existing link. */
	if (fd < 0) {
		if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
			return (error);
		if (so->so_sp && so->so_sp->ssp_socket) {
			/* Hold a reference on the drain across sounsplice(). */
			sosp = soref(so->so_sp->ssp_socket);
			sounsplice(so, so->so_sp->ssp_socket, 0);
			sorele(sosp);
		} else
			error = EPROTO;
		sbunlock(&so->so_rcv);
		return (error);
	}

	/*
	 * Create the shared splice task queue on first use.  The pointer is
	 * tested again under sosplice_lock before creating the queue, and
	 * the memory barriers pair the publishing store with later readers.
	 */
	if (sosplice_taskq == NULL) {
		rw_enter_write(&sosplice_lock);
		if (sosplice_taskq == NULL) {
			tq = taskq_create("sosplice", 1, IPL_SOFTNET,
			    TASKQ_MPSAFE);
			if (tq == NULL) {
				rw_exit_write(&sosplice_lock);
				return (ENOMEM);
			}
			/* Ensure the taskq is fully visible to other CPUs. */
			membar_producer();
			sosplice_taskq = tq;
		}
		rw_exit_write(&sosplice_lock);
	} else {
		/* Ensure the taskq is fully visible on this CPU. */
		membar_consumer();
	}

	/* Find sosp, the drain socket where data will be spliced into. */
	if ((error = getsock(curproc, fd, &fp)) != 0)
		return (error);
	sosp = fp->f_data;

	/* Both sockets must use the same pru_send implementation. */
	if (sosp->so_proto->pr_usrreqs->pru_send !=
	    so->so_proto->pr_usrreqs->pru_send) {
		error = EPROTONOSUPPORT;
		goto frele;
	}

	/*
	 * Lock order: sblock on the source receive buffer, sblock on the
	 * drain send buffer, then both socket locks.  The error paths at
	 * the end of the function unwind in the reverse order.
	 */
	if ((error = sblock(&so->so_rcv, SBL_WAIT)) != 0)
		goto frele;
	if ((error = sblock(&sosp->so_snd, SBL_WAIT)) != 0) {
		sbunlock(&so->so_rcv);
		goto frele;
	}
	solock_pair(so, sosp);

	/* Listening sockets cannot take part in a splice. */
	if ((so->so_options & SO_ACCEPTCONN) ||
	    (sosp->so_options & SO_ACCEPTCONN)) {
		error = EOPNOTSUPP;
		goto release;
	}
	/* The source needs a connection only if its protocol requires one. */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		error = ENOTCONN;
		goto release;
	}
	/* The drain must be connected or connecting in any case. */
	if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
		error = ENOTCONN;
		goto release;
	}
	/* Allocate the splice state of either socket on first use. */
	if (so->so_sp == NULL) {
		struct sosplice *so_sp;

		so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		timeout_set_flags(&so_sp->ssp_idleto, soidle, so,
		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
		task_set(&so_sp->ssp_task, sotask, so);

		so->so_sp = so_sp;
	}
	if (sosp->so_sp == NULL) {
		struct sosplice *so_sp;

		so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
		timeout_set_flags(&so_sp->ssp_idleto, soidle, sosp,
		    KCLOCK_NONE, TIMEOUT_PROC | TIMEOUT_MPSAFE);
		task_set(&so_sp->ssp_task, sotask, sosp);

		sosp->so_sp = so_sp;
	}
	/* Refuse if so already has a drain or sosp already has a source. */
	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
		error = EBUSY;
		goto release;
	}

	so->so_splicelen = 0;
	so->so_splicemax = max;
	if (tv)
		so->so_idletv = *tv;
	else
		timerclear(&so->so_idletv);

	/*
	 * To prevent sorwakeup() calling somove() before this somove()
	 * has finished, the socket buffers are not marked as spliced yet.
	 */

	/* Splice so and sosp together. */
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);
	so->so_sp->ssp_socket = sosp;
	sosp->so_sp->ssp_soback = so;
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	sounlock_pair(so, sosp);
	sbunlock(&sosp->so_snd);

	/*
	 * Move whatever is already queued.  A return value of 1 means the
	 * splice continues, so only then are the buffers flagged SB_SPLICE.
	 * The result of this first move does not affect the return value.
	 */
	if (somove(so, M_WAIT)) {
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
		so->so_rcv.sb_flags |= SB_SPLICE;
		sosp->so_snd.sb_flags |= SB_SPLICE;
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
	}

	sbunlock(&so->so_rcv);
	FRELE(fp, curproc);
	return (0);

release:
	/* Error after all locks were taken: drop them, then the file. */
	sounlock_pair(so, sosp);
	sbunlock(&sosp->so_snd);
	sbunlock(&so->so_rcv);
frele:
	FRELE(fp, curproc);
	return (error);
}
1453
1454 void
sounsplice(struct socket * so,struct socket * sosp,int freeing)1455 sounsplice(struct socket *so, struct socket *sosp, int freeing)
1456 {
1457 sbassertlocked(&so->so_rcv);
1458
1459 mtx_enter(&so->so_rcv.sb_mtx);
1460 mtx_enter(&sosp->so_snd.sb_mtx);
1461 so->so_rcv.sb_flags &= ~SB_SPLICE;
1462 sosp->so_snd.sb_flags &= ~SB_SPLICE;
1463 so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
1464 mtx_leave(&sosp->so_snd.sb_mtx);
1465 mtx_leave(&so->so_rcv.sb_mtx);
1466
1467 task_del(sosplice_taskq, &so->so_splicetask);
1468 timeout_del(&so->so_idleto);
1469
1470 /* Do not wakeup a socket that is about to be freed. */
1471 if ((freeing & SOSP_FREEING_READ) == 0) {
1472 int readable;
1473
1474 solock_shared(so);
1475 mtx_enter(&so->so_rcv.sb_mtx);
1476 readable = soreadable(so);
1477 mtx_leave(&so->so_rcv.sb_mtx);
1478 if (readable)
1479 sorwakeup(so);
1480 sounlock_shared(so);
1481 }
1482 if ((freeing & SOSP_FREEING_WRITE) == 0) {
1483 solock_shared(sosp);
1484 if (sowriteable(sosp))
1485 sowwakeup(sosp);
1486 sounlock_shared(sosp);
1487 }
1488 }
1489
1490 void
soidle(void * arg)1491 soidle(void *arg)
1492 {
1493 struct socket *so = arg;
1494
1495 sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
1496 if (so->so_rcv.sb_flags & SB_SPLICE) {
1497 struct socket *sosp;
1498
1499 WRITE_ONCE(so->so_error, ETIMEDOUT);
1500 sosp = soref(so->so_sp->ssp_socket);
1501 sounsplice(so, so->so_sp->ssp_socket, 0);
1502 sorele(sosp);
1503 }
1504 sbunlock(&so->so_rcv);
1505 }
1506
1507 void
sotask(void * arg)1508 sotask(void *arg)
1509 {
1510 struct socket *so = arg;
1511 int doyield = 0;
1512
1513 sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
1514 if (so->so_rcv.sb_flags & SB_SPLICE) {
1515 if (so->so_proto->pr_flags & PR_WANTRCVD)
1516 doyield = 1;
1517 somove(so, M_DONTWAIT);
1518 }
1519 sbunlock(&so->so_rcv);
1520
1521 if (doyield) {
1522 /* Avoid user land starvation. */
1523 yield();
1524 }
1525 }
1526
/*
 * Move data from receive buffer of spliced source socket to send
 * buffer of drain socket.  Try to move as much as possible in one
 * big chunk.  It is a TCP only implementation.
 * NOTE(review): the code below also handles PR_ADDR and PR_ATOMIC
 * protocols (name and control mbufs are dropped, whole packets are
 * moved), so the "TCP only" remark looks stale -- confirm.
 *
 * The caller must hold the sblock on so->so_rcv (asserted below).
 * wait is handed to the mbuf routines m_copym(), m_get() and m_split();
 * the callers in this file pass M_WAIT or M_DONTWAIT.
 *
 * Errors are not returned to the caller; they are stored in
 * so->so_error and cause the splice to be dissolved.
 * Return value 0 means splicing has been finished, 1 continue.
 */
int
somove(struct socket *so, int wait)
{
	struct socket *sosp = so->so_sp->ssp_socket;
	struct mbuf *m, **mp, *nextrecord;
	u_long len, off, oobmark;
	long space;
	int error = 0, maxreached = 0, unsplice = 0;
	unsigned int rcvstate;

	sbassertlocked(&so->so_rcv);

	/*
	 * For PR_WANTRCVD protocols pru_rcvd() is called below; the send
	 * buffer of the source stays sblock'ed until the end of this
	 * function, where it is released under the same condition.
	 */
	if (so->so_proto->pr_flags & PR_WANTRCVD)
		sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);

	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

nextpkt:
	/* The error and state checks are repeated for every record. */
	if ((error = READ_ONCE(so->so_error)))
		goto release;
	if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
		error = EPIPE;
		goto release;
	}

	/*
	 * ETIMEDOUT, EFBIG and ELOOP on the drain do not stop the move.
	 * NOTE(review): these are the values soidle() and somove() store
	 * in so_error themselves, presumably left over from a splice in
	 * which sosp is the source -- confirm.
	 */
	error = READ_ONCE(sosp->so_error);
	if (error) {
		if (error != ETIMEDOUT && error != EFBIG && error != ELOOP)
			goto release;
		error = 0;
	}
	if ((sosp->so_state & SS_ISCONNECTED) == 0)
		goto release;

	/* Calculate how many bytes can be copied now. */
	len = so->so_rcv.sb_datacc;
	if (so->so_splicemax) {
		KASSERT(so->so_splicelen < so->so_splicemax);
		if (so->so_splicemax <= so->so_splicelen + len) {
			len = so->so_splicemax - so->so_splicelen;
			maxreached = 1;
		}
	}
	/*
	 * Limit len by the space in the drain's send buffer.  Up to 1024
	 * extra bytes are granted if that lets the data up to the oob mark
	 * fit.  maxreached is cleared whenever less than the maximum
	 * computed above is going to be moved.
	 */
	space = sbspace_locked(&sosp->so_snd);
	if (so->so_oobmark && so->so_oobmark < len &&
	    so->so_oobmark < space + 1024)
		space += 1024;
	if (space <= 0) {
		maxreached = 0;
		goto release;
	}
	if (space < len) {
		maxreached = 0;
		if (space < sosp->so_snd.sb_lowat)
			goto release;
		len = space;
	}
	/* Cleared again before the final pru_send() or at release. */
	sosp->so_snd.sb_state |= SS_ISSENDING;

	SBLASTRECORDCHK(&so->so_rcv, "somove 1");
	SBLASTMBUFCHK(&so->so_rcv, "somove 1");
	m = so->so_rcv.sb_mb;
	if (m == NULL)
		goto release;
	nextrecord = m->m_nextpkt;

	/* Drop address and control information not used with splicing. */
	if (so->so_proto->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
		if (m->m_type != MT_SONAME)
			panic("somove soname: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
#endif
		m = m->m_next;
	}
	while (m && m->m_type == MT_CONTROL)
		m = m->m_next;
	/* The record carries no data at all: drop it and try the next. */
	if (m == NULL) {
		sbdroprecord(&so->so_rcv);
		if (so->so_proto->pr_flags & PR_WANTRCVD) {
			/* Both mutexes are released around pru_rcvd(). */
			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			solock_shared(so);
			pru_rcvd(so);
			sounlock_shared(so);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);
		}
		goto nextpkt;
	}

	/*
	 * By splicing sockets connected to localhost, userland might create a
	 * loop.  Dissolve splicing with error if loop is detected by counter.
	 *
	 * If we deal with looped broadcast/multicast packet we bail out with
	 * no error to suppress splice termination.
	 * NOTE(review): both conditions set ELOOP here, which does not
	 * match the "no error" wording above -- confirm the intent.
	 */
	if ((m->m_flags & M_PKTHDR) &&
	    ((m->m_pkthdr.ph_loopcnt++ >= M_MAXLOOP) ||
	    ((m->m_flags & M_LOOP) && (m->m_flags & (M_BCAST|M_MCAST))))) {
		error = ELOOP;
		goto release;
	}

	/* Atomic protocols: move a whole packet or nothing. */
	if (so->so_proto->pr_flags & PR_ATOMIC) {
		if ((m->m_flags & M_PKTHDR) == 0)
			panic("somove !PKTHDR: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, m, m->m_type);
		/* The packet can never fit into the drain's send buffer. */
		if (sosp->so_snd.sb_hiwat < m->m_pkthdr.len) {
			error = EMSGSIZE;
			goto release;
		}
		/* Not enough room right now, leave the packet queued. */
		if (len < m->m_pkthdr.len)
			goto release;
		if (m->m_pkthdr.len < len) {
			maxreached = 0;
			len = m->m_pkthdr.len;
		}
		/*
		 * Throw away the name mbuf after it has been assured
		 * that the whole first record can be processed.
		 */
		m = so->so_rcv.sb_mb;
		sbfree(&so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		sbsync(&so->so_rcv, nextrecord);
	}
	/*
	 * Throw away the control mbufs after it has been assured
	 * that the whole first record can be processed.
	 */
	m = so->so_rcv.sb_mb;
	while (m && m->m_type == MT_CONTROL) {
		sbfree(&so->so_rcv, m);
		so->so_rcv.sb_mb = m_free(m);
		m = so->so_rcv.sb_mb;
		sbsync(&so->so_rcv, nextrecord);
	}

	SBLASTRECORDCHK(&so->so_rcv, "somove 2");
	SBLASTMBUFCHK(&so->so_rcv, "somove 2");

	/*
	 * Take at most len mbufs out of receive buffer.
	 * mp walks the m_next links of the chain being built in m; the
	 * chain is terminated after the loop.
	 */
	for (off = 0, mp = &m; off <= len && *mp;
	    off += (*mp)->m_len, mp = &(*mp)->m_next) {
		u_long size = len - off;

#ifdef DIAGNOSTIC
		if ((*mp)->m_type != MT_DATA && (*mp)->m_type != MT_HEADER)
			panic("somove type: so %p, so_type %d, m %p, "
			    "m_type %d", so, so->so_type, *mp, (*mp)->m_type);
#endif
		if ((*mp)->m_len > size) {
			/*
			 * Move only a partial mbuf at maximum splice length or
			 * if the drain buffer is too small for this large mbuf.
			 */
			if (!maxreached && sosp->so_snd.sb_datacc > 0) {
				len -= size;
				break;
			}
			/* Copy the first size bytes, trim them off the source. */
			*mp = m_copym(so->so_rcv.sb_mb, 0, size, wait);
			if (*mp == NULL) {
				len -= size;
				break;
			}
			so->so_rcv.sb_mb->m_data += size;
			so->so_rcv.sb_mb->m_len -= size;
			so->so_rcv.sb_cc -= size;
			so->so_rcv.sb_datacc -= size;
		} else {
			/* The whole mbuf fits: unlink it from the buffer. */
			*mp = so->so_rcv.sb_mb;
			sbfree(&so->so_rcv, *mp);
			so->so_rcv.sb_mb = (*mp)->m_next;
			sbsync(&so->so_rcv, nextrecord);
		}
	}
	*mp = NULL;

	SBLASTRECORDCHK(&so->so_rcv, "somove 3");
	SBLASTMBUFCHK(&so->so_rcv, "somove 3");
	SBCHECK(so, &so->so_rcv);
	/* Nothing could be taken out of the receive buffer. */
	if (m == NULL)
		goto release;
	m->m_nextpkt = NULL;
	if (m->m_flags & M_PKTHDR) {
		m_resethdr(m);
		m->m_pkthdr.len = len;
	}

	/*
	 * Receive buffer did shrink by len bytes, adjust oob.
	 * rcvstate keeps the state from before the adjustment.  The local
	 * oobmark stays non-zero only if the mark lies inside the moved
	 * chain; it is then an offset into m.
	 */
	rcvstate = so->so_rcv.sb_state;
	so->so_rcv.sb_state &= ~SS_RCVATMARK;
	oobmark = so->so_oobmark;
	so->so_oobmark = oobmark > len ? oobmark - len : 0;
	if (oobmark) {
		if (oobmark == len)
			so->so_rcv.sb_state |= SS_RCVATMARK;
		if (oobmark >= len)
			oobmark = 0;
	}

	/* Send window update to source peer as receive buffer has changed. */
	if (so->so_proto->pr_flags & PR_WANTRCVD) {
		mtx_leave(&sosp->so_snd.sb_mtx);
		mtx_leave(&so->so_rcv.sb_mtx);
		solock_shared(so);
		pru_rcvd(so);
		sounlock_shared(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		mtx_enter(&sosp->so_snd.sb_mtx);
	}

	/*
	 * Handle oob data.  If any malloc fails, ignore error.
	 * TCP urgent data is not very reliable anyway.
	 */
	while (((rcvstate & SS_RCVATMARK) || oobmark) &&
	    (so->so_options & SO_OOBINLINE)) {
		struct mbuf *o = NULL;

		if (rcvstate & SS_RCVATMARK) {
			/* The chain starts at the mark: m's first byte is oob. */
			o = m_get(wait, MT_DATA);
			rcvstate &= ~SS_RCVATMARK;
		} else if (oobmark) {
			/*
			 * Split at the mark, send the part before it as
			 * normal data and continue with the remainder.
			 */
			o = m_split(m, oobmark, wait);
			if (o) {
				mtx_leave(&sosp->so_snd.sb_mtx);
				mtx_leave(&so->so_rcv.sb_mtx);
				solock_shared(sosp);
				error = pru_send(sosp, m, NULL, NULL);
				sounlock_shared(sosp);
				mtx_enter(&so->so_rcv.sb_mtx);
				mtx_enter(&sosp->so_snd.sb_mtx);

				if (error) {
					if (sosp->so_snd.sb_state &
					    SS_CANTSENDMORE)
						error = EPIPE;
					m_freem(o);
					goto release;
				}
				len -= oobmark;
				so->so_splicelen += oobmark;
				m = o;
				o = m_get(wait, MT_DATA);
			}
			oobmark = 0;
		}
		if (o) {
			/* Copy the first byte of m into o and send o as oob. */
			o->m_len = 1;
			*mtod(o, caddr_t) = *mtod(m, caddr_t);

			mtx_leave(&sosp->so_snd.sb_mtx);
			mtx_leave(&so->so_rcv.sb_mtx);
			solock_shared(sosp);
			error = pru_sendoob(sosp, o, NULL, NULL);
			sounlock_shared(sosp);
			mtx_enter(&so->so_rcv.sb_mtx);
			mtx_enter(&sosp->so_snd.sb_mtx);

			if (error) {
				if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
					error = EPIPE;
				m_freem(m);
				goto release;
			}
			len -= 1;
			so->so_splicelen += 1;
			if (oobmark) {
				oobmark -= 1;
				if (oobmark == 0)
					rcvstate |= SS_RCVATMARK;
			}
			/* Remove the byte just sent as oob from the chain. */
			m_adj(m, 1);
		}
	}

	/* Append all remaining data to drain socket. */
	if (so->so_rcv.sb_cc == 0 || maxreached)
		sosp->so_snd.sb_state &= ~SS_ISSENDING;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	solock_shared(sosp);
	error = pru_send(sosp, m, NULL, NULL);
	sounlock_shared(sosp);
	mtx_enter(&so->so_rcv.sb_mtx);
	mtx_enter(&sosp->so_snd.sb_mtx);

	if (error) {
		if (sosp->so_snd.sb_state & SS_CANTSENDMORE ||
		    sosp->so_pcb == NULL)
			error = EPIPE;
		goto release;
	}
	so->so_splicelen += len;

	/* Move several packets if possible. */
	if (!maxreached && nextrecord)
		goto nextpkt;

release:
	sosp->so_snd.sb_state &= ~SS_ISSENDING;

	/* Reaching the requested maximum is reported as EFBIG. */
	if (!error && maxreached && so->so_splicemax == so->so_splicelen)
		error = EFBIG;
	if (error)
		WRITE_ONCE(so->so_error, error);

	/*
	 * Dissolve the splice if the source is drained and cannot receive
	 * more, the drain cannot send more, the maximum was reached or an
	 * error occurred.
	 */
	if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
	    so->so_rcv.sb_cc == 0) ||
	    (sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
	    maxreached || error)
		unsplice = 1;

	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);

	if (so->so_proto->pr_flags & PR_WANTRCVD)
		sbunlock(&so->so_snd);

	if (unsplice) {
		/* Hold a reference on the drain across sounsplice(). */
		soref(sosp);
		sounsplice(so, sosp, 0);
		sorele(sosp);

		return (0);
	}
	/* Still spliced: (re)arm the idle timeout if one is configured. */
	if (timerisset(&so->so_idletv))
		timeout_add_tv(&so->so_idleto, &so->so_idletv);
	return (1);
}
1868 #endif /* SOCKET_SPLICE */
1869
1870 void
sorwakeup(struct socket * so)1871 sorwakeup(struct socket *so)
1872 {
1873 #ifdef SOCKET_SPLICE
1874 if (so->so_proto->pr_flags & PR_SPLICE) {
1875 mtx_enter(&so->so_rcv.sb_mtx);
1876 if (so->so_rcv.sb_flags & SB_SPLICE)
1877 task_add(sosplice_taskq, &so->so_splicetask);
1878 if (isspliced(so)) {
1879 mtx_leave(&so->so_rcv.sb_mtx);
1880 return;
1881 }
1882 mtx_leave(&so->so_rcv.sb_mtx);
1883 }
1884 #endif
1885 sowakeup(so, &so->so_rcv);
1886 if (so->so_upcall)
1887 (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
1888 }
1889
1890 void
sowwakeup(struct socket * so)1891 sowwakeup(struct socket *so)
1892 {
1893 #ifdef SOCKET_SPLICE
1894 if (so->so_proto->pr_flags & PR_SPLICE) {
1895 mtx_enter(&so->so_snd.sb_mtx);
1896 if (so->so_snd.sb_flags & SB_SPLICE)
1897 task_add(sosplice_taskq,
1898 &so->so_sp->ssp_soback->so_splicetask);
1899 if (issplicedback(so)) {
1900 mtx_leave(&so->so_snd.sb_mtx);
1901 return;
1902 }
1903 mtx_leave(&so->so_snd.sb_mtx);
1904 }
1905 #endif
1906 sowakeup(so, &so->so_snd);
1907 }
1908
/*
 * Set a socket option.
 *
 * so		socket to modify.
 * level	option level; anything but SOL_SOCKET is passed on to the
 *		protocol's pr_ctloutput handler, ENOPROTOOPT if it has none.
 * optname	option to set.
 * m		mbuf carrying the option value; may be NULL, which is an
 *		error for every SOL_SOCKET option except SO_SPLICE.
 *
 * Returns 0 on success or an errno value.
 */
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		}
		error = ENOPROTOOPT;
	} else {
		switch (optname) {

		case SO_LINGER:
			/* Exact size required; l_linger must be 0..SHRT_MAX. */
			if (m == NULL || m->m_len != sizeof (struct linger) ||
			    mtod(m, struct linger *)->l_linger < 0 ||
			    mtod(m, struct linger *)->l_linger > SHRT_MAX)
				return (EINVAL);

			solock(so);
			so->so_linger = mtod(m, struct linger *)->l_linger;
			/*
			 * The first int of the value decides on/off.
			 * NOTE(review): presumably this is l_onoff, the
			 * first member of struct linger -- confirm.
			 */
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_BINDANY:
			/* Requires suser(); then handled like the flags below. */
			if ((error = suser(curproc)) != 0)	/* XXX */
				return (error);
			/* FALLTHROUGH */

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			/* Boolean options: optname itself is the so_options bit. */
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);

			solock(so);
			if (*mtod(m, int *))
				so->so_options |= optname;
			else
				so->so_options &= ~optname;
			sounlock(so);

			break;
		case SO_DONTROUTE:
			/* Can only be "cleared"; enabling it is not supported. */
			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			if (*mtod(m, int *))
				error = EOPNOTSUPP;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
		    {
			struct sockbuf *sb = (optname == SO_SNDBUF ||
			    optname == SO_SNDLOWAT ?
			    &so->so_snd : &so->so_rcv);
			u_long cnt;

			if (m == NULL || m->m_len < sizeof (int))
				return (EINVAL);
			/* Zero and negative values are replaced by 1. */
			cnt = *mtod(m, int *);
			if ((long)cnt <= 0)
				cnt = 1;

			mtx_enter(&sb->sb_mtx);
			switch (optname) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				/* No resizing once the direction is shut down. */
				if (sb->sb_state &
				    (SS_CANTSENDMORE | SS_CANTRCVMORE)) {
					error = EINVAL;
					break;
				}
				if (sbcheckreserve(cnt, sb->sb_wat) ||
				    sbreserve(so, sb, cnt)) {
					error = ENOBUFS;
					break;
				}
				sb->sb_wat = cnt;
				break;
			case SO_SNDLOWAT:
			case SO_RCVLOWAT:
				/* The low water mark is capped at sb_hiwat. */
				sb->sb_lowat = (cnt > sb->sb_hiwat) ?
				    sb->sb_hiwat : cnt;
				break;
			}
			mtx_leave(&sb->sb_mtx);

			break;
		    }

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			if (m == NULL || m->m_len < sizeof (tv))
				return (EINVAL);
			memcpy(&tv, mtod(m, struct timeval *), sizeof tv);
			if (!timerisvalid(&tv))
				return (EINVAL);
			/* A result of UINT64_MAX is rejected with EDOM. */
			nsecs = TIMEVAL_TO_NSEC(&tv);
			if (nsecs == UINT64_MAX)
				return (EDOM);
			/* A zero timeout is stored as INFSLP. */
			if (nsecs == 0)
				nsecs = INFSLP;

			mtx_enter(&sb->sb_mtx);
			sb->sb_timeo_nsecs = nsecs;
			mtx_leave(&sb->sb_mtx);
			break;
		    }

		case SO_RTABLE:
			/*
			 * Forwarded to pr_ctloutput with level replaced by
			 * the pr_protocol of the domain's dom_protosw entry.
			 */
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_SETOPT, so, level, optname, m);
				sounlock(so);
			} else
				error = ENOPROTOOPT;
			break;
#ifdef SOCKET_SPLICE
		case SO_SPLICE:
			/*
			 * No value unsplices.  A value shorter than struct
			 * splice is taken as a bare int file descriptor.
			 * A full struct splice also supplies the maximum
			 * length and the idle timeout.
			 */
			if (m == NULL) {
				error = sosplice(so, -1, 0, NULL);
			} else if (m->m_len < sizeof(int)) {
				error = EINVAL;
			} else if (m->m_len < sizeof(struct splice)) {
				error = sosplice(so, *mtod(m, int *), 0, NULL);
			} else {
				error = sosplice(so,
				    mtod(m, struct splice *)->sp_fd,
				    mtod(m, struct splice *)->sp_max,
				    &mtod(m, struct splice *)->sp_idle);
			}
			break;
#endif /* SOCKET_SPLICE */

		default:
			error = ENOPROTOOPT;
			break;
		}
	}

	return (error);
}
2081
/*
 * Get a socket option.
 *
 * so		socket to query.
 * level	option level; anything but SOL_SOCKET is passed on to the
 *		protocol's pr_ctloutput handler, ENOPROTOOPT if it has none.
 * optname	option to read.
 * m		mbuf receiving the value; it is dereferenced without a
 *		NULL check, and m->m_len is set to the size of the value
 *		stored.
 *
 * Returns 0 on success or an errno value.
 */
int
sogetopt(struct socket *so, int level, int optname, struct mbuf *m)
{
	int error = 0;

	if (level != SOL_SOCKET) {
		if (so->so_proto->pr_ctloutput) {
			m->m_len = 0;

			solock(so);
			error = (*so->so_proto->pr_ctloutput)(PRCO_GETOPT, so,
			    level, optname, m);
			sounlock(so);
			return (error);
		} else
			return (ENOPROTOOPT);
	} else {
		/* Default size; cases returning something else override it. */
		m->m_len = sizeof (int);

		switch (optname) {

		case SO_LINGER:
			m->m_len = sizeof (struct linger);
			solock_shared(so);
			mtod(m, struct linger *)->l_onoff =
				so->so_options & SO_LINGER;
			mtod(m, struct linger *)->l_linger = so->so_linger;
			sounlock_shared(so);
			break;

		case SO_BINDANY:
		case SO_USELOOPBACK:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_ACCEPTCONN:
		case SO_TIMESTAMP:
		case SO_ZEROIZE:
			/* Returns the raw option bit (0 or optname), not 0/1. */
			*mtod(m, int *) = so->so_options & optname;
			break;

		case SO_DONTROUTE:
			/* Always reported as off. */
			*mtod(m, int *) = 0;
			break;

		case SO_TYPE:
			*mtod(m, int *) = so->so_type;
			break;

		case SO_ERROR:
			/* Reading the pending error also clears it. */
			solock(so);
			*mtod(m, int *) = so->so_error;
			so->so_error = 0;
			sounlock(so);

			break;

		case SO_DOMAIN:
			*mtod(m, int *) = so->so_proto->pr_domain->dom_family;
			break;

		case SO_PROTOCOL:
			*mtod(m, int *) = so->so_proto->pr_protocol;
			break;

		case SO_SNDBUF:
			*mtod(m, int *) = so->so_snd.sb_hiwat;
			break;

		case SO_RCVBUF:
			*mtod(m, int *) = so->so_rcv.sb_hiwat;
			break;

		case SO_SNDLOWAT:
			*mtod(m, int *) = so->so_snd.sb_lowat;
			break;

		case SO_RCVLOWAT:
			*mtod(m, int *) = so->so_rcv.sb_lowat;
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
		    {
			struct sockbuf *sb = (optname == SO_SNDTIMEO ?
			    &so->so_snd : &so->so_rcv);
			struct timeval tv;
			uint64_t nsecs;

			mtx_enter(&sb->sb_mtx);
			nsecs = sb->sb_timeo_nsecs;
			mtx_leave(&sb->sb_mtx);

			/* INFSLP (no timeout) is reported as a zero timeval. */
			m->m_len = sizeof(struct timeval);
			memset(&tv, 0, sizeof(tv));
			if (nsecs != INFSLP)
				NSEC_TO_TIMEVAL(nsecs, &tv);
			memcpy(mtod(m, struct timeval *), &tv, sizeof tv);
			break;
		    }

		case SO_RTABLE:
			/*
			 * Forwarded to pr_ctloutput with level replaced by
			 * the pr_protocol of the domain's dom_protosw entry.
			 */
			if (so->so_proto->pr_domain &&
			    so->so_proto->pr_domain->dom_protosw &&
			    so->so_proto->pr_ctloutput) {
				const struct domain *dom =
				    so->so_proto->pr_domain;

				level = dom->dom_protosw->pr_protocol;
				solock(so);
				error = (*so->so_proto->pr_ctloutput)
				    (PRCO_GETOPT, so, level, optname, m);
				sounlock(so);
				if (error)
					return (error);
				break;
			}
			return (ENOPROTOOPT);

#ifdef SOCKET_SPLICE
		case SO_SPLICE:
		    {
			off_t len;

			/* Current ssp_len, or 0 without splice state. */
			m->m_len = sizeof(off_t);
			solock_shared(so);
			len = so->so_sp ? so->so_sp->ssp_len : 0;
			sounlock_shared(so);
			memcpy(mtod(m, off_t *), &len, sizeof(off_t));
			break;
		    }
#endif /* SOCKET_SPLICE */

		case SO_PEERCRED:
			/*
			 * NOTE(review): pr_protocol is compared against an
			 * address family constant here -- confirm this is
			 * how the unix domain protosw is identified.
			 */
			if (so->so_proto->pr_protocol == AF_UNIX) {
				struct unpcb *unp = sotounpcb(so);

				solock(so);
				/* unp_connid is copied out only with UNP_FEIDS set. */
				if (unp->unp_flags & UNP_FEIDS) {
					m->m_len = sizeof(unp->unp_connid);
					memcpy(mtod(m, caddr_t),
					    &(unp->unp_connid), m->m_len);
					sounlock(so);
					break;
				}
				sounlock(so);

				return (ENOTCONN);
			}
			return (EOPNOTSUPP);

		default:
			return (ENOPROTOOPT);
		}
		return (0);
	}
}
2242
2243 void
sohasoutofband(struct socket * so)2244 sohasoutofband(struct socket *so)
2245 {
2246 pgsigio(&so->so_sigio, SIGURG, 0);
2247 knote(&so->so_rcv.sb_klist, 0);
2248 }
2249
2250 int
soo_kqfilter(struct file * fp,struct knote * kn)2251 soo_kqfilter(struct file *fp, struct knote *kn)
2252 {
2253 struct socket *so = kn->kn_fp->f_data;
2254 struct sockbuf *sb;
2255
2256 switch (kn->kn_filter) {
2257 case EVFILT_READ:
2258 kn->kn_fop = &soread_filtops;
2259 sb = &so->so_rcv;
2260 break;
2261 case EVFILT_WRITE:
2262 kn->kn_fop = &sowrite_filtops;
2263 sb = &so->so_snd;
2264 break;
2265 case EVFILT_EXCEPT:
2266 kn->kn_fop = &soexcept_filtops;
2267 sb = &so->so_rcv;
2268 break;
2269 default:
2270 return (EINVAL);
2271 }
2272
2273 klist_insert(&sb->sb_klist, kn);
2274
2275 return (0);
2276 }
2277
2278 void
filt_sordetach(struct knote * kn)2279 filt_sordetach(struct knote *kn)
2280 {
2281 struct socket *so = kn->kn_fp->f_data;
2282
2283 klist_remove(&so->so_rcv.sb_klist, kn);
2284 }
2285
2286 int
filt_soread(struct knote * kn,long hint)2287 filt_soread(struct knote *kn, long hint)
2288 {
2289 struct socket *so = kn->kn_fp->f_data;
2290 u_int state = READ_ONCE(so->so_state);
2291 u_int error = READ_ONCE(so->so_error);
2292 int rv = 0;
2293
2294 MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
2295
2296 if (so->so_options & SO_ACCEPTCONN) {
2297 short qlen = READ_ONCE(so->so_qlen);
2298
2299 soassertlocked_readonly(so);
2300
2301 kn->kn_data = qlen;
2302 rv = (kn->kn_data != 0);
2303
2304 if (kn->kn_flags & (__EV_POLL | __EV_SELECT)) {
2305 if (state & SS_ISDISCONNECTED) {
2306 kn->kn_flags |= __EV_HUP;
2307 rv = 1;
2308 } else {
2309 rv = qlen || soreadable(so);
2310 }
2311 }
2312
2313 return rv;
2314 }
2315
2316 kn->kn_data = so->so_rcv.sb_cc;
2317 #ifdef SOCKET_SPLICE
2318 if (isspliced(so)) {
2319 rv = 0;
2320 } else
2321 #endif /* SOCKET_SPLICE */
2322 if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
2323 kn->kn_flags |= EV_EOF;
2324 if (kn->kn_flags & __EV_POLL) {
2325 if (state & SS_ISDISCONNECTED)
2326 kn->kn_flags |= __EV_HUP;
2327 }
2328 kn->kn_fflags = error;
2329 rv = 1;
2330 } else if (error) {
2331 rv = 1;
2332 } else if (kn->kn_sfflags & NOTE_LOWAT) {
2333 rv = (kn->kn_data >= kn->kn_sdata);
2334 } else {
2335 rv = (kn->kn_data >= so->so_rcv.sb_lowat);
2336 }
2337
2338 return rv;
2339 }
2340
2341 void
filt_sowdetach(struct knote * kn)2342 filt_sowdetach(struct knote *kn)
2343 {
2344 struct socket *so = kn->kn_fp->f_data;
2345
2346 klist_remove(&so->so_snd.sb_klist, kn);
2347 }
2348
2349 int
filt_sowrite(struct knote * kn,long hint)2350 filt_sowrite(struct knote *kn, long hint)
2351 {
2352 struct socket *so = kn->kn_fp->f_data;
2353 u_int state = READ_ONCE(so->so_state);
2354 u_int error = READ_ONCE(so->so_error);
2355 int rv;
2356
2357 MUTEX_ASSERT_LOCKED(&so->so_snd.sb_mtx);
2358
2359 kn->kn_data = sbspace_locked(&so->so_snd);
2360 if (so->so_snd.sb_state & SS_CANTSENDMORE) {
2361 kn->kn_flags |= EV_EOF;
2362 if (kn->kn_flags & __EV_POLL) {
2363 if (state & SS_ISDISCONNECTED)
2364 kn->kn_flags |= __EV_HUP;
2365 }
2366 kn->kn_fflags = error;
2367 rv = 1;
2368 } else if (error) {
2369 rv = 1;
2370 } else if (((state & SS_ISCONNECTED) == 0) &&
2371 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2372 rv = 0;
2373 } else if (kn->kn_sfflags & NOTE_LOWAT) {
2374 rv = (kn->kn_data >= kn->kn_sdata);
2375 } else {
2376 rv = (kn->kn_data >= so->so_snd.sb_lowat);
2377 }
2378
2379 return (rv);
2380 }
2381
2382 int
filt_soexcept(struct knote * kn,long hint)2383 filt_soexcept(struct knote *kn, long hint)
2384 {
2385 struct socket *so = kn->kn_fp->f_data;
2386 int rv = 0;
2387
2388 MUTEX_ASSERT_LOCKED(&so->so_rcv.sb_mtx);
2389
2390 #ifdef SOCKET_SPLICE
2391 if (isspliced(so)) {
2392 rv = 0;
2393 } else
2394 #endif /* SOCKET_SPLICE */
2395 if (kn->kn_sfflags & NOTE_OOB) {
2396 if (so->so_oobmark || (so->so_rcv.sb_state & SS_RCVATMARK)) {
2397 kn->kn_fflags |= NOTE_OOB;
2398 kn->kn_data -= so->so_oobmark;
2399 rv = 1;
2400 }
2401 }
2402
2403 if (kn->kn_flags & __EV_POLL) {
2404 u_int state = READ_ONCE(so->so_state);
2405
2406 if (state & SS_ISDISCONNECTED) {
2407 kn->kn_flags |= __EV_HUP;
2408 rv = 1;
2409 }
2410 }
2411
2412 return rv;
2413 }
2414
2415 int
filt_sowmodify(struct kevent * kev,struct knote * kn)2416 filt_sowmodify(struct kevent *kev, struct knote *kn)
2417 {
2418 struct socket *so = kn->kn_fp->f_data;
2419 int rv;
2420
2421 mtx_enter(&so->so_snd.sb_mtx);
2422 rv = knote_modify(kev, kn);
2423 mtx_leave(&so->so_snd.sb_mtx);
2424
2425 return (rv);
2426 }
2427
2428 int
filt_sowprocess(struct knote * kn,struct kevent * kev)2429 filt_sowprocess(struct knote *kn, struct kevent *kev)
2430 {
2431 struct socket *so = kn->kn_fp->f_data;
2432 int rv;
2433
2434 mtx_enter(&so->so_snd.sb_mtx);
2435 rv = knote_process(kn, kev);
2436 mtx_leave(&so->so_snd.sb_mtx);
2437
2438 return (rv);
2439 }
2440
2441 int
filt_sormodify(struct kevent * kev,struct knote * kn)2442 filt_sormodify(struct kevent *kev, struct knote *kn)
2443 {
2444 struct socket *so = kn->kn_fp->f_data;
2445 int rv;
2446
2447 if (so->so_proto->pr_flags & PR_WANTRCVD)
2448 solock_shared(so);
2449 mtx_enter(&so->so_rcv.sb_mtx);
2450 rv = knote_modify(kev, kn);
2451 mtx_leave(&so->so_rcv.sb_mtx);
2452 if (so->so_proto->pr_flags & PR_WANTRCVD)
2453 sounlock_shared(so);
2454
2455 return (rv);
2456 }
2457
2458 int
filt_sorprocess(struct knote * kn,struct kevent * kev)2459 filt_sorprocess(struct knote *kn, struct kevent *kev)
2460 {
2461 struct socket *so = kn->kn_fp->f_data;
2462 int rv;
2463
2464 if (so->so_proto->pr_flags & PR_WANTRCVD)
2465 solock_shared(so);
2466 mtx_enter(&so->so_rcv.sb_mtx);
2467 rv = knote_process(kn, kev);
2468 mtx_leave(&so->so_rcv.sb_mtx);
2469 if (so->so_proto->pr_flags & PR_WANTRCVD)
2470 sounlock_shared(so);
2471
2472 return (rv);
2473 }
2474
2475 int
filt_soemodify(struct kevent * kev,struct knote * kn)2476 filt_soemodify(struct kevent *kev, struct knote *kn)
2477 {
2478 struct socket *so = kn->kn_fp->f_data;
2479 int rv;
2480
2481 mtx_enter(&so->so_rcv.sb_mtx);
2482 rv = knote_modify(kev, kn);
2483 mtx_leave(&so->so_rcv.sb_mtx);
2484
2485 return (rv);
2486 }
2487
2488 int
filt_soeprocess(struct knote * kn,struct kevent * kev)2489 filt_soeprocess(struct knote *kn, struct kevent *kev)
2490 {
2491 struct socket *so = kn->kn_fp->f_data;
2492 int rv;
2493
2494 mtx_enter(&so->so_rcv.sb_mtx);
2495 rv = knote_process(kn, kev);
2496 mtx_leave(&so->so_rcv.sb_mtx);
2497
2498 return (rv);
2499 }
2500
2501 #ifdef DDB
2502 void
2503 sobuf_print(struct sockbuf *,
2504 int (*)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))));
2505
2506 void
sobuf_print(struct sockbuf * sb,int (* pr)(const char *,...))2507 sobuf_print(struct sockbuf *sb,
2508 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2509 {
2510 (*pr)("\tsb_cc: %lu\n", sb->sb_cc);
2511 (*pr)("\tsb_datacc: %lu\n", sb->sb_datacc);
2512 (*pr)("\tsb_hiwat: %lu\n", sb->sb_hiwat);
2513 (*pr)("\tsb_wat: %lu\n", sb->sb_wat);
2514 (*pr)("\tsb_mbcnt: %lu\n", sb->sb_mbcnt);
2515 (*pr)("\tsb_mbmax: %lu\n", sb->sb_mbmax);
2516 (*pr)("\tsb_lowat: %ld\n", sb->sb_lowat);
2517 (*pr)("\tsb_mb: %p\n", sb->sb_mb);
2518 (*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
2519 (*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
2520 (*pr)("\tsb_flags: %04x\n", sb->sb_flags);
2521 (*pr)("\tsb_state: %04x\n", sb->sb_state);
2522 (*pr)("\tsb_timeo_nsecs: %llu\n", sb->sb_timeo_nsecs);
2523 }
2524
2525 void
so_print(void * v,int (* pr)(const char *,...))2526 so_print(void *v,
2527 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
2528 {
2529 struct socket *so = v;
2530
2531 (*pr)("socket %p\n", so);
2532 (*pr)("so_type: %i\n", so->so_type);
2533 (*pr)("so_options: 0x%04x\n", so->so_options); /* %b */
2534 (*pr)("so_linger: %i\n", so->so_linger);
2535 (*pr)("so_state: 0x%04x\n", so->so_state);
2536 (*pr)("so_pcb: %p\n", so->so_pcb);
2537 (*pr)("so_proto: %p\n", so->so_proto);
2538 (*pr)("so_sigio: %p\n", so->so_sigio.sir_sigio);
2539
2540 (*pr)("so_head: %p\n", so->so_head);
2541 (*pr)("so_onq: %p\n", so->so_onq);
2542 (*pr)("so_q0: @%p first: %p\n", &so->so_q0, TAILQ_FIRST(&so->so_q0));
2543 (*pr)("so_q: @%p first: %p\n", &so->so_q, TAILQ_FIRST(&so->so_q));
2544 (*pr)("so_eq: next: %p\n", TAILQ_NEXT(so, so_qe));
2545 (*pr)("so_q0len: %i\n", so->so_q0len);
2546 (*pr)("so_qlen: %i\n", so->so_qlen);
2547 (*pr)("so_qlimit: %i\n", so->so_qlimit);
2548 (*pr)("so_timeo: %i\n", so->so_timeo);
2549 (*pr)("so_obmark: %lu\n", so->so_oobmark);
2550
2551 (*pr)("so_sp: %p\n", so->so_sp);
2552 if (so->so_sp != NULL) {
2553 (*pr)("\tssp_socket: %p\n", so->so_sp->ssp_socket);
2554 (*pr)("\tssp_soback: %p\n", so->so_sp->ssp_soback);
2555 (*pr)("\tssp_len: %lld\n",
2556 (unsigned long long)so->so_sp->ssp_len);
2557 (*pr)("\tssp_max: %lld\n",
2558 (unsigned long long)so->so_sp->ssp_max);
2559 (*pr)("\tssp_idletv: %lld %ld\n", so->so_sp->ssp_idletv.tv_sec,
2560 so->so_sp->ssp_idletv.tv_usec);
2561 (*pr)("\tssp_idleto: %spending (@%i)\n",
2562 timeout_pending(&so->so_sp->ssp_idleto) ? "" : "not ",
2563 so->so_sp->ssp_idleto.to_time);
2564 }
2565
2566 (*pr)("so_rcv:\n");
2567 sobuf_print(&so->so_rcv, pr);
2568 (*pr)("so_snd:\n");
2569 sobuf_print(&so->so_snd, pr);
2570
2571 (*pr)("so_upcall: %p so_upcallarg: %p\n",
2572 so->so_upcall, so->so_upcallarg);
2573
2574 (*pr)("so_euid: %d so_ruid: %d\n", so->so_euid, so->so_ruid);
2575 (*pr)("so_egid: %d so_rgid: %d\n", so->so_egid, so->so_rgid);
2576 (*pr)("so_cpid: %d\n", so->so_cpid);
2577 }
2578 #endif
2579