1 /*        $NetBSD: uipc_usrreq.c,v 1.208 2025/03/27 11:00:50 riastradh Exp $    */
2 
3 /*-
4  * Copyright (c) 1998, 2000, 2004, 2008, 2009, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9  * NASA Ames Research Center, and by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1982, 1986, 1989, 1991, 1993
35  *        The Regents of the University of California.  All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  * 3. Neither the name of the University nor the names of its contributors
46  *    may be used to endorse or promote products derived from this software
47  *    without specific prior written permission.
48  *
49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59  * SUCH DAMAGE.
60  *
61  *        @(#)uipc_usrreq.c   8.9 (Berkeley) 5/14/95
62  */
63 
64 /*
65  * Copyright (c) 1997 Christopher G. Demetriou.  All rights reserved.
66  *
67  * Redistribution and use in source and binary forms, with or without
68  * modification, are permitted provided that the following conditions
69  * are met:
70  * 1. Redistributions of source code must retain the above copyright
71  *    notice, this list of conditions and the following disclaimer.
72  * 2. Redistributions in binary form must reproduce the above copyright
73  *    notice, this list of conditions and the following disclaimer in the
74  *    documentation and/or other materials provided with the distribution.
75  * 3. All advertising materials mentioning features or use of this software
76  *    must display the following acknowledgement:
77  *        This product includes software developed by the University of
78  *        California, Berkeley and its contributors.
79  * 4. Neither the name of the University nor the names of its contributors
80  *    may be used to endorse or promote products derived from this software
81  *    without specific prior written permission.
82  *
83  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
84  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
85  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
86  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
87  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
88  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
89  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
90  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
91  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
92  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
93  * SUCH DAMAGE.
94  *
95  *        @(#)uipc_usrreq.c   8.9 (Berkeley) 5/14/95
96  */
97 
98 #include <sys/cdefs.h>
99 __KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.208 2025/03/27 11:00:50 riastradh Exp $");
100 
101 #ifdef _KERNEL_OPT
102 #include "opt_compat_netbsd.h"
103 #endif
104 
105 #include <sys/param.h>
106 #include <sys/types.h>
107 
108 #include <sys/atomic.h>
109 #include <sys/compat_stub.h>
110 #include <sys/domain.h>
111 #include <sys/file.h>
112 #include <sys/filedesc.h>
113 #include <sys/kauth.h>
114 #include <sys/kernel.h>
115 #include <sys/kmem.h>
116 #include <sys/kthread.h>
117 #include <sys/mbuf.h>
118 #include <sys/namei.h>
119 #include <sys/proc.h>
120 #include <sys/protosw.h>
121 #include <sys/sdt.h>
122 #include <sys/socket.h>
123 #include <sys/socketvar.h>
124 #include <sys/stat.h>
125 #include <sys/systm.h>
126 #include <sys/uidinfo.h>
127 #include <sys/un.h>
128 #include <sys/unpcb.h>
129 #include <sys/vnode.h>
130 
131 #include <compat/net/route_70.h>
132 #include <compat/sys/socket.h>
133 
134 /*
135  * Unix communications domain.
136  *
137  * TODO:
138  *        RDM
139  *        rethink name space problems
140  *        need a proper out-of-band
141  *
142  * Notes on locking:
143  *
144  * The generic rules noted in uipc_socket2.c apply.  In addition:
145  *
146  * o We have a global lock, uipc_lock.
147  *
148  * o All datagram sockets are locked by uipc_lock.
149  *
150  * o For stream socketpairs, the two endpoints are created sharing the same
151  *   independent lock.  Sockets presented to PRU_CONNECT2 must already have
152  *   matching locks.
153  *
154  * o Stream sockets created via socket() start life with their own
155  *   independent lock.
156  *
157  * o Stream connections to a named endpoint are slightly more complicated.
158  *   Sockets that have called listen() have their lock pointer mutated to
159  *   the global uipc_lock.  When establishing a connection, the connecting
160  *   socket also has its lock mutated to uipc_lock, which matches the head
161  *   (listening socket).  We create a new socket for accept() to return, and
162  *   that also shares the head's lock.  Until the connection is completely
163  *   done on both ends, all three sockets are locked by uipc_lock.  Once the
164  *   connection is complete, the association with the head's lock is broken.
165  *   The connecting socket and the socket returned from accept() have their
166  *   lock pointers mutated away from uipc_lock, and back to the connecting
167  *   socket's original, independent lock.  The head continues to be locked
168  *   by uipc_lock.
169  *
170  * o If uipc_lock is determined to be a significant source of contention,
171  *   it could easily be hashed out.  It is difficult to simply make it an
172  *   independent lock because of visibility / garbage collection issues:
173  *   if a socket has been associated with a lock at any point, that lock
174  *   must remain valid until the socket is no longer visible in the system.
175  *   The lock must not be freed or otherwise destroyed until any sockets
176  *   that had referenced it have also been destroyed.
177  */
178 const struct sockaddr_un sun_noname = {
179           .sun_len = offsetof(struct sockaddr_un, sun_path),
180           .sun_family = AF_LOCAL,
181 };
182 ino_t     unp_ino;                      /* prototype for fake inode numbers */
183 
184 static struct mbuf * unp_addsockcred(struct lwp *, struct mbuf *);
185 static void   unp_discard_later(file_t *);
186 static void   unp_discard_now(file_t *);
187 static void   unp_disconnect1(struct unpcb *);
188 static bool   unp_drop(struct unpcb *, int);
189 static int    unp_internalize(struct mbuf **);
190 static void   unp_mark(file_t *);
191 static void   unp_scan(struct mbuf *, void (*)(file_t *), int);
192 static void   unp_shutdown1(struct unpcb *);
193 static void   unp_thread(void *);
194 static void   unp_thread_kick(void);
195 
196 static kmutex_t *uipc_lock;
197 
198 static kcondvar_t unp_thread_cv;
199 static lwp_t *unp_thread_lwp;
200 static SLIST_HEAD(,file) unp_thread_discard;
201 static int unp_defer;
202 static struct sysctllog *usrreq_sysctllog;
203 static void unp_sysctl_create(void);
204 
205 /* Compat interface */
206 
207 struct mbuf * stub_compat_70_unp_addsockcred(lwp_t *, struct mbuf *);
208 
stub_compat_70_unp_addsockcred(struct lwp * lwp,struct mbuf * control)209 struct mbuf * stub_compat_70_unp_addsockcred(struct lwp *lwp,
210     struct mbuf *control)
211 {
212 
213 /* just copy our initial argument */
214           return control;
215 }
216 
217 bool compat70_ocreds_valid = false;
218 
219 /*
220  * Initialize Unix protocols.
221  */
222 void
uipc_init(void)223 uipc_init(void)
224 {
225           int error;
226 
227           unp_sysctl_create();
228 
229           uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
230           cv_init(&unp_thread_cv, "unpgc");
231 
232           error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread,
233               NULL, &unp_thread_lwp, "unpgc");
234           if (error != 0)
235                     panic("uipc_init %d", error);
236 }
237 
238 static void
unp_connid(struct lwp * l,struct unpcb * unp,int flags)239 unp_connid(struct lwp *l, struct unpcb *unp, int flags)
240 {
241           unp->unp_connid.unp_pid = l->l_proc->p_pid;
242           unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
243           unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
244           unp->unp_flags |= flags;
245 }
246 
247 /*
248  * A connection succeeded: disassociate both endpoints from the head's
249  * lock, and make them share their own lock.  There is a race here: for
250  * a very brief time one endpoint will be locked by a different lock
251  * than the other end.  However, since the current thread holds the old
252  * lock (the listening socket's lock, the head) access can still only be
253  * made to one side of the connection.
254  */
255 static void
unp_setpeerlocks(struct socket * so,struct socket * so2)256 unp_setpeerlocks(struct socket *so, struct socket *so2)
257 {
258           struct unpcb *unp;
259           kmutex_t *lock;
260 
261           KASSERT(solocked2(so, so2));
262 
263           /*
264            * Bail out if either end of the socket is not yet fully
265            * connected or accepted.  We only break the lock association
266            * with the head when the pair of sockets stand completely
267            * on their own.
268            */
269           KASSERT(so->so_head == NULL);
270           if (so2->so_head != NULL)
271                     return;
272 
273           /*
274            * Drop references to old lock.  A third reference (from the
275            * queue head) must be held as we still hold its lock.  Bonus:
276            * we don't need to worry about garbage collecting the lock.
277            */
278           lock = so->so_lock;
279           KASSERT(lock == uipc_lock);
280           mutex_obj_free(lock);
281           mutex_obj_free(lock);
282 
283           /*
284            * Grab stream lock from the initiator and share between the two
285            * endpoints.  Issue memory barrier to ensure all modifications
286            * become globally visible before the lock change.  so2 is
287            * assumed not to have a stream lock, because it was created
288            * purely for the server side to accept this connection and
289            * started out life using the domain-wide lock.
290            */
291           unp = sotounpcb(so);
292           KASSERT(unp->unp_streamlock != NULL);
293           KASSERT(sotounpcb(so2)->unp_streamlock == NULL);
294           lock = unp->unp_streamlock;
295           unp->unp_streamlock = NULL;
296           mutex_obj_hold(lock);
297           /*
298            * Ensure lock is initialized before publishing it with
299            * solockreset.  Pairs with atomic_load_consume in solock and
300            * various loops to reacquire lock after wakeup.
301            */
302           membar_release();
303           /*
304            * possible race if lock is not held - see comment in
305            * uipc_usrreq(PRU_ACCEPT).
306            */
307           KASSERT(mutex_owned(lock));
308           solockreset(so, lock);
309           solockreset(so2, lock);
310 }
311 
312 /*
313  * Reset a socket's lock back to the domain-wide lock.
314  */
315 static void
unp_resetlock(struct socket * so)316 unp_resetlock(struct socket *so)
317 {
318           kmutex_t *olock, *nlock;
319           struct unpcb *unp;
320 
321           KASSERT(solocked(so));
322 
323           olock = so->so_lock;
324           nlock = uipc_lock;
325           if (olock == nlock)
326                     return;
327           unp = sotounpcb(so);
328           KASSERT(unp->unp_streamlock == NULL);
329           unp->unp_streamlock = olock;
330           mutex_obj_hold(nlock);
331           mutex_enter(nlock);
332           solockreset(so, nlock);
333           mutex_exit(olock);
334 }
335 
336 static void
unp_free(struct unpcb * unp)337 unp_free(struct unpcb *unp)
338 {
339           if (unp->unp_addr)
340                     free(unp->unp_addr, M_SONAME);
341           if (unp->unp_streamlock != NULL)
342                     mutex_obj_free(unp->unp_streamlock);
343           kmem_free(unp, sizeof(*unp));
344 }
345 
/*
 * unp_output: append a datagram to the receive buffer of unp's peer.
 *
 * m is the data and control the (possibly NULL) control chain.  Both are
 * consumed on every path: on success they are queued on the peer's
 * receive buffer, on failure they are freed here (after disposing of any
 * rights carried in control), so the caller must not touch them again.
 *
 * The sender's bound address, or sun_noname if it has none, is recorded
 * with the datagram.  If the peer asked for credentials (UNP_WANTCRED /
 * UNP_OWANTCRED) they are added to the control chain first.
 *
 * Returns 0 on success, ECONNREFUSED if unp has no peer, or ENOBUFS if
 * sbappendaddr() could not queue the datagram.
 */
static int
unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp)
{
	struct socket *so2;
	const struct sockaddr_un *sun;

	/* XXX: server side closed the socket */
	if (unp->unp_conn == NULL) {
		/*
		 * Free the chains here just as the ENOBUFS path below
		 * does; the caller does not free them after we return, so
		 * returning without doing so would leak them together with
		 * any file references held in control.
		 */
		unp_dispose(control);
		m_freem(control);
		m_freem(m);
		return SET_ERROR(ECONNREFUSED);
	}
	so2 = unp->unp_conn->unp_socket;

	KASSERT(solocked(so2));

	if (unp->unp_addr)
		sun = unp->unp_addr;
	else
		sun = &sun_noname;
	if (unp->unp_conn->unp_flags & UNP_WANTCRED)
		control = unp_addsockcred(curlwp, control);
	if (unp->unp_conn->unp_flags & UNP_OWANTCRED)
		MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control),
		    stub_compat_70_unp_addsockcred(curlwp, control), control);
	if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
	    control) == 0) {
		unp_dispose(control);
		m_freem(control);
		m_freem(m);
		/* Don't call soroverflow because we're returning this
		 * error directly to the sender. */
		so2->so_rcv.sb_overflowed++;
		return SET_ERROR(ENOBUFS);
	} else {
		sorwakeup(so2);
		return 0;
	}
}
382 
383 static void
unp_setaddr(struct socket * so,struct sockaddr * nam,bool peeraddr)384 unp_setaddr(struct socket *so, struct sockaddr *nam, bool peeraddr)
385 {
386           const struct sockaddr_un *sun = NULL;
387           struct unpcb *unp;
388 
389           KASSERT(solocked(so));
390           unp = sotounpcb(so);
391 
392           if (peeraddr) {
393                     if (unp->unp_conn && unp->unp_conn->unp_addr)
394                               sun = unp->unp_conn->unp_addr;
395           } else {
396                     if (unp->unp_addr)
397                               sun = unp->unp_addr;
398           }
399           if (sun == NULL)
400                     sun = &sun_noname;
401 
402           memcpy(nam, sun, sun->sun_len);
403 }
404 
405 static int
unp_rcvd(struct socket * so,int flags,struct lwp * l)406 unp_rcvd(struct socket *so, int flags, struct lwp *l)
407 {
408           struct unpcb *unp = sotounpcb(so);
409           struct socket *so2;
410           u_int newhiwat;
411 
412           KASSERT(solocked(so));
413           KASSERT(unp != NULL);
414 
415           switch (so->so_type) {
416 
417           case SOCK_DGRAM:
418                     panic("uipc 1");
419                     /*NOTREACHED*/
420 
421           case SOCK_SEQPACKET: /* FALLTHROUGH */
422           case SOCK_STREAM:
423 #define   rcv (&so->so_rcv)
424 #define snd (&so2->so_snd)
425                     if (unp->unp_conn == 0)
426                               break;
427                     so2 = unp->unp_conn->unp_socket;
428                     KASSERT(solocked2(so, so2));
429                     /*
430                      * Adjust backpressure on sender
431                      * and wakeup any waiting to write.
432                      */
433                     snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
434                     unp->unp_mbcnt = rcv->sb_mbcnt;
435                     newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
436                     (void)chgsbsize(so2->so_uidinfo,
437                         &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
438                     unp->unp_cc = rcv->sb_cc;
439                     sowwakeup(so2);
440 #undef snd
441 #undef rcv
442                     break;
443 
444           default:
445                     panic("uipc 2");
446           }
447 
448           return 0;
449 }
450 
451 static int
unp_recvoob(struct socket * so,struct mbuf * m,int flags)452 unp_recvoob(struct socket *so, struct mbuf *m, int flags)
453 {
454           KASSERT(solocked(so));
455 
456           return SET_ERROR(EOPNOTSUPP);
457 }
458 
459 static int
unp_send(struct socket * so,struct mbuf * m,struct sockaddr * nam,struct mbuf * control,struct lwp * l)460 unp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
461     struct mbuf *control, struct lwp *l)
462 {
463           struct unpcb *unp = sotounpcb(so);
464           int error = 0;
465           u_int newhiwat;
466           struct socket *so2;
467 
468           KASSERT(solocked(so));
469           KASSERT(unp != NULL);
470           KASSERT(m != NULL);
471 
472           /*
473            * Note: unp_internalize() rejects any control message
474            * other than SCM_RIGHTS, and only allows one.  This
475            * has the side-effect of preventing a caller from
476            * forging SCM_CREDS.
477            */
478           if (control) {
479                     sounlock(so);
480                     error = unp_internalize(&control);
481                     solock(so);
482                     if (error != 0) {
483                               m_freem(control);
484                               m_freem(m);
485                               return error;
486                     }
487           }
488 
489           switch (so->so_type) {
490 
491           case SOCK_DGRAM: {
492                     KASSERT(so->so_lock == uipc_lock);
493                     if (nam) {
494                               if ((so->so_state & SS_ISCONNECTED) != 0)
495                                         error = SET_ERROR(EISCONN);
496                               else {
497                                         /*
498                                          * Note: once connected, the
499                                          * socket's lock must not be
500                                          * dropped until we have sent
501                                          * the message and disconnected.
502                                          * This is necessary to prevent
503                                          * intervening control ops, like
504                                          * another connection.
505                                          */
506                                         error = unp_connect(so, nam, l);
507                               }
508                     } else {
509                               if ((so->so_state & SS_ISCONNECTED) == 0)
510                                         error = SET_ERROR(ENOTCONN);
511                     }
512                     if (error) {
513                               unp_dispose(control);
514                               m_freem(control);
515                               m_freem(m);
516                               return error;
517                     }
518                     error = unp_output(m, control, unp);
519                     if (nam)
520                               unp_disconnect1(unp);
521                     break;
522           }
523 
524           case SOCK_SEQPACKET: /* FALLTHROUGH */
525           case SOCK_STREAM:
526 #define   rcv (&so2->so_rcv)
527 #define   snd (&so->so_snd)
528                     if (unp->unp_conn == NULL) {
529                               error = SET_ERROR(ENOTCONN);
530                               break;
531                     }
532                     so2 = unp->unp_conn->unp_socket;
533                     KASSERT(solocked2(so, so2));
534                     if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
535                               /*
536                                * Credentials are passed only once on
537                                * SOCK_STREAM and SOCK_SEQPACKET.
538                                */
539                               unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
540                               control = unp_addsockcred(l, control);
541                     }
542                     if (unp->unp_conn->unp_flags & UNP_OWANTCRED) {
543                               /*
544                                * Credentials are passed only once on
545                                * SOCK_STREAM and SOCK_SEQPACKET.
546                                */
547                               unp->unp_conn->unp_flags &= ~UNP_OWANTCRED;
548                               MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control),
549                                   stub_compat_70_unp_addsockcred(curlwp, control),
550                                   control);
551                     }
552                     /*
553                      * Send to paired receive port, and then reduce
554                      * send buffer hiwater marks to maintain backpressure.
555                      * Wake up readers.
556                      */
557                     if (control) {
558                               if (sbappendcontrol(rcv, m, control) != 0)
559                                         control = NULL;
560                     } else {
561                               switch(so->so_type) {
562                               case SOCK_SEQPACKET:
563                                         sbappendrecord(rcv, m);
564                                         break;
565                               case SOCK_STREAM:
566                                         sbappend(rcv, m);
567                                         break;
568                               default:
569                                         panic("uipc_usrreq");
570                                         break;
571                               }
572                     }
573                     snd->sb_mbmax -=
574                         rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
575                     unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
576                     newhiwat = snd->sb_hiwat -
577                         (rcv->sb_cc - unp->unp_conn->unp_cc);
578                     (void)chgsbsize(so->so_uidinfo,
579                         &snd->sb_hiwat, newhiwat, RLIM_INFINITY);
580                     unp->unp_conn->unp_cc = rcv->sb_cc;
581                     sorwakeup(so2);
582 #undef snd
583 #undef rcv
584                     if (control != NULL) {
585                               unp_dispose(control);
586                               m_freem(control);
587                     }
588                     break;
589 
590           default:
591                     panic("uipc 4");
592           }
593 
594           return error;
595 }
596 
597 static int
unp_sendoob(struct socket * so,struct mbuf * m,struct mbuf * control)598 unp_sendoob(struct socket *so, struct mbuf *m, struct mbuf * control)
599 {
600           KASSERT(solocked(so));
601 
602           m_freem(m);
603           m_freem(control);
604 
605           return SET_ERROR(EOPNOTSUPP);
606 }
607 
608 /*
609  * Unix domain socket option processing.
610  */
611 int
uipc_ctloutput(int op,struct socket * so,struct sockopt * sopt)612 uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt)
613 {
614           struct unpcb *unp = sotounpcb(so);
615           int optval = 0, error = 0;
616 
617           KASSERT(solocked(so));
618 
619           if (sopt->sopt_level != SOL_LOCAL) {
620                     error = SET_ERROR(ENOPROTOOPT);
621           } else switch (op) {
622 
623           case PRCO_SETOPT:
624                     switch (sopt->sopt_name) {
625                     case LOCAL_OCREDS:
626                               if (!compat70_ocreds_valid)  {
627                                         error = SET_ERROR(ENOPROTOOPT);
628                                         break;
629                               }
630                               /* FALLTHROUGH */
631                     case LOCAL_CREDS:
632                     case LOCAL_CONNWAIT:
633                               error = sockopt_getint(sopt, &optval);
634                               if (error)
635                                         break;
636                               switch (sopt->sopt_name) {
637 #define   OPTSET(bit) \
638           if (optval) \
639                     unp->unp_flags |= (bit); \
640           else \
641                     unp->unp_flags &= ~(bit);
642 
643                               case LOCAL_CREDS:
644                                         OPTSET(UNP_WANTCRED);
645                                         break;
646                               case LOCAL_CONNWAIT:
647                                         OPTSET(UNP_CONNWAIT);
648                                         break;
649                               case LOCAL_OCREDS:
650                                         OPTSET(UNP_OWANTCRED);
651                                         break;
652                               }
653                               break;
654 #undef OPTSET
655 
656                     default:
657                               error = SET_ERROR(ENOPROTOOPT);
658                               break;
659                     }
660                     break;
661 
662           case PRCO_GETOPT:
663                     sounlock(so);
664                     switch (sopt->sopt_name) {
665                     case LOCAL_PEEREID:
666                               if (unp->unp_flags & UNP_EIDSVALID) {
667                                         error = sockopt_set(sopt, &unp->unp_connid,
668                                             sizeof(unp->unp_connid));
669                               } else {
670                                         error = SET_ERROR(EINVAL);
671                               }
672                               break;
673                     case LOCAL_CREDS:
674 #define   OPTBIT(bit)         (unp->unp_flags & (bit) ? 1 : 0)
675 
676                               optval = OPTBIT(UNP_WANTCRED);
677                               error = sockopt_setint(sopt, optval);
678                               break;
679                     case LOCAL_OCREDS:
680                               if (compat70_ocreds_valid) {
681                                         optval = OPTBIT(UNP_OWANTCRED);
682                                         error = sockopt_setint(sopt, optval);
683                                         break;
684                               }
685 #undef OPTBIT
686                               /* FALLTHROUGH */
687                     default:
688                               error = SET_ERROR(ENOPROTOOPT);
689                               break;
690                     }
691                     solock(so);
692                     break;
693           }
694           return (error);
695 }
696 
697 /*
698  * Both send and receive buffers are allocated PIPSIZ bytes of buffering
699  * for stream sockets, although the total for sender and receiver is
700  * actually only PIPSIZ.
701  * Datagram sockets really use the sendspace as the maximum datagram size,
702  * and don't really want to reserve the sendspace.  Their recvspace should
703  * be large enough for at least one max-size datagram plus address.
704  */
705 #ifndef PIPSIZ
706 #define   PIPSIZ    8192
707 #endif
708 u_long    unpst_sendspace = PIPSIZ;
709 u_long    unpst_recvspace = PIPSIZ;
710 u_long    unpdg_sendspace = 2*1024;     /* really max datagram size */
711 u_long    unpdg_recvspace = 16*1024;
712 
713 u_int     unp_rights;                             /* files in flight */
714 u_int     unp_rights_ratio = 2;                   /* limit, fraction of maxfiles */
715 
716 static int
unp_attach(struct socket * so,int proto)717 unp_attach(struct socket *so, int proto)
718 {
719           struct unpcb *unp = sotounpcb(so);
720           u_long sndspc, rcvspc;
721           int error;
722 
723           KASSERT(unp == NULL);
724 
725           switch (so->so_type) {
726           case SOCK_SEQPACKET:
727                     /* FALLTHROUGH */
728           case SOCK_STREAM:
729                     if (so->so_lock == NULL) {
730                               so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
731                               solock(so);
732                     }
733                     sndspc = unpst_sendspace;
734                     rcvspc = unpst_recvspace;
735                     break;
736 
737           case SOCK_DGRAM:
738                     if (so->so_lock == NULL) {
739                               mutex_obj_hold(uipc_lock);
740                               so->so_lock = uipc_lock;
741                               solock(so);
742                     }
743                     sndspc = unpdg_sendspace;
744                     rcvspc = unpdg_recvspace;
745                     break;
746 
747           default:
748                     panic("unp_attach");
749           }
750 
751           if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
752                     error = soreserve(so, sndspc, rcvspc);
753                     if (error) {
754                               return error;
755                     }
756           }
757 
758           unp = kmem_zalloc(sizeof(*unp), KM_SLEEP);
759           nanotime(&unp->unp_ctime);
760           unp->unp_socket = so;
761           so->so_pcb = unp;
762 
763           KASSERT(solocked(so));
764           return 0;
765 }
766 
767 static void
unp_detach(struct socket * so)768 unp_detach(struct socket *so)
769 {
770           struct unpcb *unp;
771           vnode_t *vp;
772 
773           unp = sotounpcb(so);
774           KASSERT(unp != NULL);
775           KASSERT(solocked(so));
776  retry:
777           if ((vp = unp->unp_vnode) != NULL) {
778                     sounlock(so);
779                     /* Acquire v_interlock to protect against unp_connect(). */
780                     /* XXXAD racy */
781                     mutex_enter(vp->v_interlock);
782                     vp->v_socket = NULL;
783                     mutex_exit(vp->v_interlock);
784                     vrele(vp);
785                     solock(so);
786                     unp->unp_vnode = NULL;
787           }
788           if (unp->unp_conn)
789                     unp_disconnect1(unp);
790           while (unp->unp_refs) {
791                     KASSERT(solocked2(so, unp->unp_refs->unp_socket));
792                     if (unp_drop(unp->unp_refs, SET_ERROR(ECONNRESET))) {
793                               solock(so);
794                               goto retry;
795                     }
796           }
797           soisdisconnected(so);
798           so->so_pcb = NULL;
799           if (unp_rights) {
800                     /*
801                      * Normally the receive buffer is flushed later, in sofree,
802                      * but if our receive buffer holds references to files that
803                      * are now garbage, we will enqueue those file references to
804                      * the garbage collector and kick it into action.
805                      */
806                     sorflush(so);
807                     unp_free(unp);
808                     unp_thread_kick();
809           } else
810                     unp_free(unp);
811 }
812 
813 static int
unp_accept(struct socket * so,struct sockaddr * nam)814 unp_accept(struct socket *so, struct sockaddr *nam)
815 {
816           struct unpcb *unp = sotounpcb(so);
817           struct socket *so2;
818 
819           KASSERT(solocked(so));
820           KASSERT(nam != NULL);
821 
822           /* XXX code review required to determine if unp can ever be NULL */
823           if (unp == NULL)
824                     return SET_ERROR(EINVAL);
825 
826           KASSERT(so->so_lock == uipc_lock);
827           /*
828            * Mark the initiating STREAM socket as connected *ONLY*
829            * after it's been accepted.  This prevents a client from
830            * overrunning a server and receiving ECONNREFUSED.
831            */
832           if (unp->unp_conn == NULL) {
833                     /*
834                      * This will use the empty socket and will not
835                      * allocate.
836                      */
837                     unp_setaddr(so, nam, true);
838                     return 0;
839           }
840           so2 = unp->unp_conn->unp_socket;
841           if (so2->so_state & SS_ISCONNECTING) {
842                     KASSERT(so->so_head == NULL || solocked2(so, so->so_head));
843                     KASSERT(so->so_head == NULL || solocked2(so2, so->so_head));
844                     soisconnected(so2);
845           }
846           /*
847            * If the connection is fully established, break the
848            * association with uipc_lock and give the connected
849            * pair a separate lock to share.
850            * There is a race here: sotounpcb(so2)->unp_streamlock
851            * is not locked, so when changing so2->so_lock
852            * another thread can grab it while so->so_lock is still
853            * pointing to the (locked) uipc_lock.
854            * this should be harmless, except that this makes
855            * solocked2() and solocked() unreliable.
856            * Another problem is that unp_setaddr() expects the
857            * the socket locked. Grabbing sotounpcb(so2)->unp_streamlock
858            * fixes both issues.
859            */
860           mutex_enter(sotounpcb(so2)->unp_streamlock);
861           unp_setpeerlocks(so2, so);
862           /*
863            * Only now return peer's address, as we may need to
864            * block in order to allocate memory.
865            *
866            * XXX Minor race: connection can be broken while
867            * lock is dropped in unp_setaddr().  We will return
868            * error == 0 and sun_noname as the peer address.
869            */
870           unp_setaddr(so, nam, true);
871           /* so_lock now points to unp_streamlock */
872           mutex_exit(so2->so_lock);
873           return 0;
874 }
875 
876 static int
unp_ioctl(struct socket * so,u_long cmd,void * nam,struct ifnet * ifp)877 unp_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp)
878 {
879           return SET_ERROR(EOPNOTSUPP);
880 }
881 
882 static int
unp_stat(struct socket * so,struct stat * ub)883 unp_stat(struct socket *so, struct stat *ub)
884 {
885           struct unpcb *unp;
886           struct socket *so2;
887 
888           KASSERT(solocked(so));
889 
890           unp = sotounpcb(so);
891           if (unp == NULL)
892                     return SET_ERROR(EINVAL);
893 
894           ub->st_blksize = so->so_snd.sb_hiwat;
895           switch (so->so_type) {
896           case SOCK_SEQPACKET: /* FALLTHROUGH */
897           case SOCK_STREAM:
898                     if (unp->unp_conn == 0)
899                               break;
900 
901                     so2 = unp->unp_conn->unp_socket;
902                     KASSERT(solocked2(so, so2));
903                     ub->st_blksize += so2->so_rcv.sb_cc;
904                     break;
905           default:
906                     break;
907           }
908           ub->st_dev = NODEV;
909           if (unp->unp_ino == 0)
910                     unp->unp_ino = unp_ino++;
911           ub->st_atimespec = ub->st_mtimespec = ub->st_ctimespec = unp->unp_ctime;
912           ub->st_ino = unp->unp_ino;
913           ub->st_uid = so->so_uidinfo->ui_uid;
914           ub->st_gid = so->so_egid;
915           return (0);
916 }
917 
918 static int
unp_peeraddr(struct socket * so,struct sockaddr * nam)919 unp_peeraddr(struct socket *so, struct sockaddr *nam)
920 {
921           KASSERT(solocked(so));
922           KASSERT(sotounpcb(so) != NULL);
923           KASSERT(nam != NULL);
924 
925           unp_setaddr(so, nam, true);
926           return 0;
927 }
928 
929 static int
unp_sockaddr(struct socket * so,struct sockaddr * nam)930 unp_sockaddr(struct socket *so, struct sockaddr *nam)
931 {
932           KASSERT(solocked(so));
933           KASSERT(sotounpcb(so) != NULL);
934           KASSERT(nam != NULL);
935 
936           unp_setaddr(so, nam, false);
937           return 0;
938 }
939 
940 /*
941  * we only need to perform this allocation until syscalls other than
942  * bind are adjusted to use sockaddr_big.
943  */
944 static struct sockaddr_un *
makeun_sb(struct sockaddr * nam,size_t * addrlen)945 makeun_sb(struct sockaddr *nam, size_t *addrlen)
946 {
947           struct sockaddr_un *sun;
948 
949           *addrlen = nam->sa_len + 1;
950           sun = malloc(*addrlen, M_SONAME, M_WAITOK);
951           memcpy(sun, nam, nam->sa_len);
952           *(((char *)sun) + nam->sa_len) = '\0';
953           return sun;
954 }
955 
956 static int
unp_bind(struct socket * so,struct sockaddr * nam,struct lwp * l)957 unp_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
958 {
959           struct sockaddr_un *sun;
960           struct unpcb *unp;
961           vnode_t *vp;
962           struct vattr vattr;
963           size_t addrlen;
964           int error;
965           struct pathbuf *pb;
966           struct nameidata nd;
967           proc_t *p;
968 
969           unp = sotounpcb(so);
970 
971           KASSERT(solocked(so));
972           KASSERT(unp != NULL);
973           KASSERT(nam != NULL);
974 
975           if (unp->unp_vnode != NULL)
976                     return SET_ERROR(EINVAL);
977           if ((unp->unp_flags & UNP_BUSY) != 0) {
978                     /*
979                      * EALREADY may not be strictly accurate, but since this
980                      * is a major application error it's hardly a big deal.
981                      */
982                     return SET_ERROR(EALREADY);
983           }
984           unp->unp_flags |= UNP_BUSY;
985           sounlock(so);
986 
987           p = l->l_proc;
988           sun = makeun_sb(nam, &addrlen);
989 
990           pb = pathbuf_create(sun->sun_path);
991           if (pb == NULL) {
992                     error = SET_ERROR(ENOMEM);
993                     goto bad;
994           }
995           NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, pb);
996 
997 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
998           if ((error = namei(&nd)) != 0) {
999                     pathbuf_destroy(pb);
1000                     goto bad;
1001           }
1002           vp = nd.ni_vp;
1003           if (vp != NULL) {
1004                     VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
1005                     if (nd.ni_dvp == vp)
1006                               vrele(nd.ni_dvp);
1007                     else
1008                               vput(nd.ni_dvp);
1009                     vrele(vp);
1010                     pathbuf_destroy(pb);
1011                     error = SET_ERROR(EADDRINUSE);
1012                     goto bad;
1013           }
1014           vattr_null(&vattr);
1015           vattr.va_type = VSOCK;
1016           vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
1017           error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1018           if (error) {
1019                     vput(nd.ni_dvp);
1020                     pathbuf_destroy(pb);
1021                     goto bad;
1022           }
1023           vp = nd.ni_vp;
1024           vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1025           solock(so);
1026           vp->v_socket = unp->unp_socket;
1027           unp->unp_vnode = vp;
1028           unp->unp_addrlen = addrlen;
1029           unp->unp_addr = sun;
1030           VOP_UNLOCK(vp);
1031           vput(nd.ni_dvp);
1032           unp->unp_flags &= ~UNP_BUSY;
1033           pathbuf_destroy(pb);
1034           return (0);
1035 
1036  bad:
1037           free(sun, M_SONAME);
1038           solock(so);
1039           unp->unp_flags &= ~UNP_BUSY;
1040           return (error);
1041 }
1042 
1043 static int
unp_listen(struct socket * so,struct lwp * l)1044 unp_listen(struct socket *so, struct lwp *l)
1045 {
1046           struct unpcb *unp = sotounpcb(so);
1047 
1048           KASSERT(solocked(so));
1049           KASSERT(unp != NULL);
1050 
1051           /*
1052            * If the socket can accept a connection, it must be
1053            * locked by uipc_lock.
1054            */
1055           unp_resetlock(so);
1056           if (unp->unp_vnode == NULL)
1057                     return SET_ERROR(EINVAL);
1058 
1059           unp_connid(l, unp, UNP_EIDSBIND);
1060           return 0;
1061 }
1062 
1063 static int
unp_disconnect(struct socket * so)1064 unp_disconnect(struct socket *so)
1065 {
1066           KASSERT(solocked(so));
1067           KASSERT(sotounpcb(so) != NULL);
1068 
1069           unp_disconnect1(sotounpcb(so));
1070           return 0;
1071 }
1072 
1073 static int
unp_shutdown(struct socket * so)1074 unp_shutdown(struct socket *so)
1075 {
1076           KASSERT(solocked(so));
1077           KASSERT(sotounpcb(so) != NULL);
1078 
1079           socantsendmore(so);
1080           unp_shutdown1(sotounpcb(so));
1081           return 0;
1082 }
1083 
1084 static int
unp_abort(struct socket * so)1085 unp_abort(struct socket *so)
1086 {
1087           KASSERT(solocked(so));
1088           KASSERT(sotounpcb(so) != NULL);
1089 
1090           (void)unp_drop(sotounpcb(so), SET_ERROR(ECONNABORTED));
1091           KASSERT(so->so_head == NULL);
1092           KASSERT(so->so_pcb != NULL);
1093           unp_detach(so);
1094           return 0;
1095 }
1096 
1097 static int
unp_connect1(struct socket * so,struct socket * so2,struct lwp * l)1098 unp_connect1(struct socket *so, struct socket *so2, struct lwp *l)
1099 {
1100           struct unpcb *unp = sotounpcb(so);
1101           struct unpcb *unp2;
1102 
1103           if (so2->so_type != so->so_type)
1104                     return SET_ERROR(EPROTOTYPE);
1105 
1106           /*
1107            * All three sockets involved must be locked by same lock:
1108            *
1109            * local endpoint (so)
1110            * remote endpoint (so2)
1111            * queue head (so2->so_head, only if PR_CONNREQUIRED)
1112            */
1113           KASSERT(solocked2(so, so2));
1114           KASSERT(so->so_head == NULL);
1115           if (so2->so_head != NULL) {
1116                     KASSERT(so2->so_lock == uipc_lock);
1117                     KASSERT(solocked2(so2, so2->so_head));
1118           }
1119 
1120           unp2 = sotounpcb(so2);
1121           unp->unp_conn = unp2;
1122 
1123           switch (so->so_type) {
1124 
1125           case SOCK_DGRAM:
1126                     unp->unp_nextref = unp2->unp_refs;
1127                     unp2->unp_refs = unp;
1128                     soisconnected(so);
1129                     break;
1130 
1131           case SOCK_SEQPACKET: /* FALLTHROUGH */
1132           case SOCK_STREAM:
1133 
1134                     /*
1135                      * SOCK_SEQPACKET and SOCK_STREAM cases are handled by callers
1136                      * which are unp_connect() or unp_connect2().
1137                      */
1138 
1139                     break;
1140 
1141           default:
1142                     panic("unp_connect1");
1143           }
1144 
1145           return 0;
1146 }
1147 
/*
 * unp_connect: connect a local-domain socket to the socket bound at
 * the path given in "nam".
 *
 * The socket lock is dropped around the path lookup (UNP_BUSY is set
 * in the PCB meanwhile) and is held again on return.  For protocols
 * with PR_CONNREQUIRED a new socket is spawned from the listening
 * socket with sonewconn() and the connection is made to that socket.
 *
 * Returns 0 on success, or:
 *	EALREADY	another bind/connect is in progress (UNP_BUSY)
 *	ENOMEM		the path buffer could not be created
 *	ENOTSOCK	the path does not name a socket
 *	ECONNREFUSED	no socket is attached to the vnode, the target
 *			is not accepting connections, or sonewconn()
 *			failed
 *	EPROTOTYPE	the socket types differ
 *	otherwise	the error from namei() or VOP_ACCESS()
 */
int
unp_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *peer;
	struct sockaddr_un *sun;
	struct socket *so2;
	struct pathbuf *pb;
	struct nameidata nd;
	vnode_t *vp;
	size_t addrlen;
	int error;

	if ((unp->unp_flags & UNP_BUSY) != 0) {
		/*
		 * EALREADY may not be strictly accurate, but since this
		 * is a major application error it's hardly a big deal.
		 */
		return SET_ERROR(EALREADY);
	}
	unp->unp_flags |= UNP_BUSY;
	sounlock(so);

	sun = makeun_sb(nam, &addrlen);
	pb = pathbuf_create(sun->sun_path);
	if (pb == NULL) {
		error = SET_ERROR(ENOMEM);
		goto bad2;
	}

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);

	error = namei(&nd);
	if (error != 0) {
		pathbuf_destroy(pb);
		goto bad2;
	}
	vp = nd.ni_vp;
	pathbuf_destroy(pb);

	if (vp->v_type != VSOCK) {
		error = SET_ERROR(ENOTSOCK);
		goto bad;
	}
	error = VOP_ACCESS(vp, VWRITE, l->l_cred);
	if (error != 0)
		goto bad;

	/* Acquire v_interlock to protect against unp_detach(). */
	mutex_enter(vp->v_interlock);
	so2 = vp->v_socket;
	if (so2 == NULL) {
		mutex_exit(vp->v_interlock);
		error = SET_ERROR(ECONNREFUSED);
		goto bad;
	}
	if (so->so_type != so2->so_type) {
		mutex_exit(vp->v_interlock);
		error = SET_ERROR(EPROTOTYPE);
		goto bad;
	}
	solock(so);
	unp_resetlock(so);
	mutex_exit(vp->v_interlock);

	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
		struct socket *newso;
		struct unpcb *listener, *newunp;

		/*
		 * This may seem somewhat fragile but is OK: if we can
		 * see SO_ACCEPTCONN set on the endpoint, then it must
		 * be locked by the domain-wide uipc_lock.
		 */
		KASSERT((so2->so_options & SO_ACCEPTCONN) == 0 ||
		    so2->so_lock == uipc_lock);

		if ((so2->so_options & SO_ACCEPTCONN) == 0)
			newso = NULL;
		else
			newso = sonewconn(so2, false);
		if (newso == NULL) {
			error = SET_ERROR(ECONNREFUSED);
			sounlock(so);
			goto bad;
		}

		listener = sotounpcb(so2);
		newunp = sotounpcb(newso);

		/* The new socket inherits the listener's bound address. */
		if (listener->unp_addr != NULL) {
			newunp->unp_addr = malloc(listener->unp_addrlen,
			    M_SONAME, M_WAITOK);
			memcpy(newunp->unp_addr, listener->unp_addr,
			    listener->unp_addrlen);
			newunp->unp_addrlen = listener->unp_addrlen;
		}
		newunp->unp_flags = listener->unp_flags;
		so2 = newso;

		/*
		 * The connector's (client's) credentials are copied from
		 * its process structure at the time of connect() (which
		 * is now).
		 */
		unp_connid(l, newunp, UNP_EIDSVALID);

		/*
		 * The receiver's (server's) credentials are copied from
		 * the unp_peercred member of socket on which the former
		 * called listen(); unp_listen() cached that process's
		 * credentials at that time so we can use them now.
		 */
		if ((listener->unp_flags & UNP_EIDSBIND) != 0) {
			memcpy(&unp->unp_connid, &listener->unp_connid,
			    sizeof(unp->unp_connid));
			unp->unp_flags |= UNP_EIDSVALID;
		}
	}

	error = unp_connect1(so, so2, l);
	if (error != 0) {
		sounlock(so);
		goto bad;
	}

	/*
	 * SOCK_DGRAM and unexpected socket types were fully handled by
	 * unp_connect1(); only the connection-oriented types need the
	 * extra work below.  Do not handle further types here without
	 * fixing unp_connect1().
	 */
	peer = sotounpcb(so2);
	if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_STREAM) {
		peer->unp_conn = unp;
		if (((unp->unp_flags | peer->unp_flags) & UNP_CONNWAIT) != 0)
			soisconnecting(so);
		else
			soisconnected(so);
		soisconnected(so2);
		/*
		 * If the connection is fully established, break the
		 * association with uipc_lock and give the connected
		 * pair a separate lock to share.
		 */
		KASSERT(so2->so_head != NULL);
		unp_setpeerlocks(so, so2);
	}
	sounlock(so);

	/* Success falls through the cleanup below with error == 0. */
 bad:
	vput(vp);
 bad2:
	free(sun, M_SONAME);
	solock(so);
	unp->unp_flags &= ~UNP_BUSY;
	return error;
}
1291 
1292 int
unp_connect2(struct socket * so,struct socket * so2)1293 unp_connect2(struct socket *so, struct socket *so2)
1294 {
1295           struct unpcb *unp = sotounpcb(so);
1296           struct unpcb *unp2;
1297           int error = 0;
1298 
1299           KASSERT(solocked2(so, so2));
1300 
1301           error = unp_connect1(so, so2, curlwp);
1302           if (error)
1303                     return error;
1304 
1305           unp2 = sotounpcb(so2);
1306           switch (so->so_type) {
1307 
1308           /*
1309            * SOCK_DGRAM and default cases are handled in prior call to
1310            * unp_connect1(), do not add a default case without fixing
1311            * unp_connect1().
1312            */
1313 
1314           case SOCK_SEQPACKET: /* FALLTHROUGH */
1315           case SOCK_STREAM:
1316                     unp2->unp_conn = unp;
1317                     soisconnected(so);
1318                     soisconnected(so2);
1319                     break;
1320 
1321           }
1322           return error;
1323 }
1324 
1325 static void
unp_disconnect1(struct unpcb * unp)1326 unp_disconnect1(struct unpcb *unp)
1327 {
1328           struct unpcb *unp2 = unp->unp_conn;
1329           struct socket *so;
1330 
1331           if (unp2 == 0)
1332                     return;
1333           unp->unp_conn = 0;
1334           so = unp->unp_socket;
1335           switch (so->so_type) {
1336           case SOCK_DGRAM:
1337                     if (unp2->unp_refs == unp)
1338                               unp2->unp_refs = unp->unp_nextref;
1339                     else {
1340                               unp2 = unp2->unp_refs;
1341                               for (;;) {
1342                                         KASSERT(solocked2(so, unp2->unp_socket));
1343                                         if (unp2 == 0)
1344                                                   panic("unp_disconnect1");
1345                                         if (unp2->unp_nextref == unp)
1346                                                   break;
1347                                         unp2 = unp2->unp_nextref;
1348                               }
1349                               unp2->unp_nextref = unp->unp_nextref;
1350                     }
1351                     unp->unp_nextref = 0;
1352                     so->so_state &= ~SS_ISCONNECTED;
1353                     break;
1354 
1355           case SOCK_SEQPACKET: /* FALLTHROUGH */
1356           case SOCK_STREAM:
1357                     KASSERT(solocked2(so, unp2->unp_socket));
1358                     soisdisconnected(so);
1359                     unp2->unp_conn = 0;
1360                     soisdisconnected(unp2->unp_socket);
1361                     break;
1362           }
1363 }
1364 
1365 static void
unp_shutdown1(struct unpcb * unp)1366 unp_shutdown1(struct unpcb *unp)
1367 {
1368           struct socket *so;
1369 
1370           switch(unp->unp_socket->so_type) {
1371           case SOCK_SEQPACKET: /* FALLTHROUGH */
1372           case SOCK_STREAM:
1373                     if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
1374                               socantrcvmore(so);
1375                     break;
1376           default:
1377                     break;
1378           }
1379 }
1380 
1381 static bool
unp_drop(struct unpcb * unp,int errno)1382 unp_drop(struct unpcb *unp, int errno)
1383 {
1384           struct socket *so = unp->unp_socket;
1385 
1386           KASSERT(solocked(so));
1387 
1388           so->so_error = errno;
1389           unp_disconnect1(unp);
1390           if (so->so_head) {
1391                     so->so_pcb = NULL;
1392                     /* sofree() drops the socket lock */
1393                     sofree(so);
1394                     unp_free(unp);
1395                     return true;
1396           }
1397           return false;
1398 }
1399 
#ifdef notdef
/* Empty placeholder; only compiled when "notdef" is defined. */
unp_drain(void)
{
}
#endif
1406 
1407 int
unp_externalize(struct mbuf * rights,struct lwp * l,int flags)1408 unp_externalize(struct mbuf *rights, struct lwp *l, int flags)
1409 {
1410           struct cmsghdr * const cm = mtod(rights, struct cmsghdr *);
1411           struct proc * const p = l->l_proc;
1412           file_t **rp;
1413           int error = 0;
1414 
1415           const size_t nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
1416               sizeof(file_t *);
1417           if (nfds == 0)
1418                     goto noop;
1419 
1420           int * const fdp = kmem_alloc(nfds * sizeof(int), KM_SLEEP);
1421           rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
1422 
1423           /* Make sure the recipient should be able to see the files.. */
1424           rp = (file_t **)CMSG_DATA(cm);
1425           for (size_t i = 0; i < nfds; i++) {
1426                     file_t * const fp = *rp++;
1427                     if (fp == NULL) {
1428                               error = SET_ERROR(EINVAL);
1429                               goto out;
1430                     }
1431                     /*
1432                      * If we are in a chroot'ed directory, and
1433                      * someone wants to pass us a directory, make
1434                      * sure it's inside the subtree we're allowed
1435                      * to access.
1436                      */
1437                     if (p->p_cwdi->cwdi_rdir != NULL && fp->f_type == DTYPE_VNODE) {
1438                               vnode_t *vp = fp->f_vnode;
1439                               if ((vp->v_type == VDIR) &&
1440                                   !vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
1441                                         error = SET_ERROR(EPERM);
1442                                         goto out;
1443                               }
1444                     }
1445           }
1446 
1447  restart:
1448           /*
1449            * First loop -- allocate file descriptor table slots for the
1450            * new files.
1451            */
1452           for (size_t i = 0; i < nfds; i++) {
1453                     if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
1454                               /*
1455                                * Back out what we've done so far.
1456                                */
1457                               while (i-- > 0) {
1458                                         fd_abort(p, NULL, fdp[i]);
1459                               }
1460                               if (error == ENOSPC) {
1461                                         fd_tryexpand(p);
1462                                         error = 0;
1463                                         goto restart;
1464                               }
1465                               /*
1466                                * This is the error that has historically
1467                                * been returned, and some callers may
1468                                * expect it.
1469                                */
1470                               error = SET_ERROR(EMSGSIZE);
1471                               goto out;
1472                     }
1473           }
1474 
1475           /*
1476            * Now that adding them has succeeded, update all of the
1477            * file passing state and affix the descriptors.
1478            */
1479           rp = (file_t **)CMSG_DATA(cm);
1480           int *ofdp = (int *)CMSG_DATA(cm);
1481           for (size_t i = 0; i < nfds; i++) {
1482                     file_t * const fp = *rp++;
1483                     const int fd = fdp[i];
1484                     atomic_dec_uint(&unp_rights);
1485                     fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
1486                     fd_affix(p, fp, fd);
1487                     /*
1488                      * Done with this file pointer, replace it with a fd;
1489                      */
1490                     *ofdp++ = fd;
1491                     mutex_enter(&fp->f_lock);
1492                     fp->f_msgcount--;
1493                     mutex_exit(&fp->f_lock);
1494                     /*
1495                      * Note that fd_affix() adds a reference to the file.
1496                      * The file may already have been closed by another
1497                      * LWP in the process, so we must drop the reference
1498                      * added by unp_internalize() with closef().
1499                      */
1500                     closef(fp);
1501           }
1502 
1503           /*
1504            * Adjust length, in case of transition from large file_t
1505            * pointers to ints.
1506            */
1507           if (sizeof(file_t *) != sizeof(int)) {
1508                     cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
1509                     rights->m_len = CMSG_SPACE(nfds * sizeof(int));
1510           }
1511  out:
1512           if (__predict_false(error != 0)) {
1513                     file_t **const fpp = (file_t **)CMSG_DATA(cm);
1514                     for (size_t i = 0; i < nfds; i++)
1515                               unp_discard_now(fpp[i]);
1516                     /*
1517                      * Truncate the array so that nobody will try to interpret
1518                      * what is now garbage in it.
1519                      */
1520                     cm->cmsg_len = CMSG_LEN(0);
1521                     rights->m_len = CMSG_SPACE(0);
1522           }
1523           rw_exit(&p->p_cwdi->cwdi_lock);
1524           kmem_free(fdp, nfds * sizeof(int));
1525 
1526  noop:
1527           /*
1528            * Don't disclose kernel memory in the alignment space.
1529            */
1530           KASSERT(cm->cmsg_len <= rights->m_len);
1531           memset(&mtod(rights, char *)[cm->cmsg_len], 0, rights->m_len -
1532               cm->cmsg_len);
1533           return error;
1534 }
1535 
1536 static int
unp_internalize(struct mbuf ** controlp)1537 unp_internalize(struct mbuf **controlp)
1538 {
1539           filedesc_t *fdescp = curlwp->l_fd;
1540           fdtab_t *dt;
1541           struct mbuf *control = *controlp;
1542           struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
1543           file_t **rp, **files;
1544           file_t *fp;
1545           int i, fd, *fdp;
1546           int nfds, error;
1547           u_int maxmsg;
1548 
1549           error = 0;
1550           newcm = NULL;
1551 
1552           /* Sanity check the control message header. */
1553           if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
1554               cm->cmsg_len > control->m_len ||
1555               cm->cmsg_len < CMSG_ALIGN(sizeof(*cm)))
1556                     return SET_ERROR(EINVAL);
1557 
1558           /*
1559            * Verify that the file descriptors are valid, and acquire
1560            * a reference to each.
1561            */
1562           nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
1563           fdp = (int *)CMSG_DATA(cm);
1564           maxmsg = maxfiles / unp_rights_ratio;
1565           for (i = 0; i < nfds; i++) {
1566                     fd = *fdp++;
1567                     if (atomic_inc_uint_nv(&unp_rights) > maxmsg) {
1568                               atomic_dec_uint(&unp_rights);
1569                               nfds = i;
1570                               error = SET_ERROR(EAGAIN);
1571                               goto out;
1572                     }
1573                     if ((fp = fd_getfile(fd)) == NULL
1574                         || fp->f_type == DTYPE_KQUEUE) {
1575                               if (fp)
1576                                         fd_putfile(fd);
1577                               atomic_dec_uint(&unp_rights);
1578                               nfds = i;
1579                               error = SET_ERROR(EBADF);
1580                               goto out;
1581                     }
1582           }
1583 
1584           /* Allocate new space and copy header into it. */
1585           newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
1586           if (newcm == NULL) {
1587                     error = SET_ERROR(E2BIG);
1588                     goto out;
1589           }
1590           memcpy(newcm, cm, sizeof(struct cmsghdr));
1591           memset(newcm + 1, 0, CMSG_LEN(0) - sizeof(struct cmsghdr));
1592           files = (file_t **)CMSG_DATA(newcm);
1593 
1594           /*
1595            * Transform the file descriptors into file_t pointers, in
1596            * reverse order so that if pointers are bigger than ints, the
1597            * int won't get until we're done.  No need to lock, as we have
1598            * already validated the descriptors with fd_getfile().
1599            */
1600           fdp = (int *)CMSG_DATA(cm) + nfds;
1601           rp = files + nfds;
1602           for (i = 0; i < nfds; i++) {
1603                     dt = atomic_load_consume(&fdescp->fd_dt);
1604                     fp = atomic_load_consume(&dt->dt_ff[*--fdp]->ff_file);
1605                     KASSERT(fp != NULL);
1606                     mutex_enter(&fp->f_lock);
1607                     *--rp = fp;
1608                     fp->f_count++;
1609                     fp->f_msgcount++;
1610                     mutex_exit(&fp->f_lock);
1611           }
1612 
1613  out:
1614           /* Release descriptor references. */
1615           fdp = (int *)CMSG_DATA(cm);
1616           for (i = 0; i < nfds; i++) {
1617                     fd_putfile(*fdp++);
1618                     if (error != 0) {
1619                               atomic_dec_uint(&unp_rights);
1620                     }
1621           }
1622 
1623           if (error == 0) {
1624                     if (control->m_flags & M_EXT) {
1625                               m_freem(control);
1626                               *controlp = control = m_get(M_WAIT, MT_CONTROL);
1627                     }
1628                     MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
1629                         M_MBUF, NULL, NULL);
1630                     cm = newcm;
1631                     /*
1632                      * Adjust message & mbuf to note amount of space
1633                      * actually used.
1634                      */
1635                     cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *));
1636                     control->m_len = CMSG_SPACE(nfds * sizeof(file_t *));
1637           }
1638 
1639           return error;
1640 }
1641 
1642 struct mbuf *
unp_addsockcred(struct lwp * l,struct mbuf * control)1643 unp_addsockcred(struct lwp *l, struct mbuf *control)
1644 {
1645           struct sockcred *sc;
1646           struct mbuf *m;
1647           void *p;
1648 
1649           m = sbcreatecontrol1(&p, SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)),
1650                     SCM_CREDS, SOL_SOCKET, M_WAITOK);
1651           if (m == NULL)
1652                     return control;
1653 
1654           sc = p;
1655           sc->sc_pid = l->l_proc->p_pid;
1656           sc->sc_uid = kauth_cred_getuid(l->l_cred);
1657           sc->sc_euid = kauth_cred_geteuid(l->l_cred);
1658           sc->sc_gid = kauth_cred_getgid(l->l_cred);
1659           sc->sc_egid = kauth_cred_getegid(l->l_cred);
1660           sc->sc_ngroups = kauth_cred_ngroups(l->l_cred);
1661 
1662           for (int i = 0; i < sc->sc_ngroups; i++)
1663                     sc->sc_groups[i] = kauth_cred_group(l->l_cred, i);
1664 
1665           return m_add(control, m);
1666 }
1667 
1668 /*
1669  * Do a mark-sweep GC of files in the system, to free up any which are
1670  * caught in flight to an about-to-be-closed socket.  Additionally,
1671  * process deferred file closures.
1672  */
1673 static void
unp_gc(file_t * dp)1674 unp_gc(file_t *dp)
1675 {
1676           extern    struct domain unixdomain;
1677           file_t *fp, *np;
1678           struct socket *so, *so1;
1679           u_int i, oflags, rflags;
1680           bool didwork;
1681 
1682           KASSERT(curlwp == unp_thread_lwp);
1683           KASSERT(mutex_owned(&filelist_lock));
1684 
1685           /*
1686            * First, process deferred file closures.
1687            */
1688           while (!SLIST_EMPTY(&unp_thread_discard)) {
1689                     fp = SLIST_FIRST(&unp_thread_discard);
1690                     KASSERT(fp->f_unpcount > 0);
1691                     KASSERT(fp->f_count > 0);
1692                     KASSERT(fp->f_msgcount > 0);
1693                     KASSERT(fp->f_count >= fp->f_unpcount);
1694                     KASSERT(fp->f_count >= fp->f_msgcount);
1695                     KASSERT(fp->f_msgcount >= fp->f_unpcount);
1696                     SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist);
1697                     i = fp->f_unpcount;
1698                     fp->f_unpcount = 0;
1699                     mutex_exit(&filelist_lock);
1700                     for (; i != 0; i--) {
1701                               unp_discard_now(fp);
1702                     }
1703                     mutex_enter(&filelist_lock);
1704           }
1705 
1706           /*
1707            * Clear mark bits.  Ensure that we don't consider new files
1708            * entering the file table during this loop (they will not have
1709            * FSCAN set).
1710            */
1711           unp_defer = 0;
1712           LIST_FOREACH(fp, &filehead, f_list) {
1713                     for (oflags = fp->f_flag;; oflags = rflags) {
1714                               rflags = atomic_cas_uint(&fp->f_flag, oflags,
1715                                   (oflags | FSCAN) & ~(FMARK|FDEFER));
1716                               if (__predict_true(oflags == rflags)) {
1717                                         break;
1718                               }
1719                     }
1720           }
1721 
1722           /*
1723            * Iterate over the set of sockets, marking ones believed (based on
1724            * refcount) to be referenced from a process, and marking for rescan
1725            * sockets which are queued on a socket.  Recan continues descending
1726            * and searching for sockets referenced by sockets (FDEFER), until
1727            * there are no more socket->socket references to be discovered.
1728            */
1729           do {
1730                     didwork = false;
1731                     for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
1732                               KASSERT(mutex_owned(&filelist_lock));
1733                               np = LIST_NEXT(fp, f_list);
1734                               mutex_enter(&fp->f_lock);
1735                               if ((fp->f_flag & FDEFER) != 0) {
1736                                         atomic_and_uint(&fp->f_flag, ~FDEFER);
1737                                         unp_defer--;
1738                                         if (fp->f_count == 0) {
1739                                                   /*
1740                                                    * XXX: closef() doesn't pay attention
1741                                                    * to FDEFER
1742                                                    */
1743                                                   mutex_exit(&fp->f_lock);
1744                                                   continue;
1745                                         }
1746                               } else {
1747                                         if (fp->f_count == 0 ||
1748                                             (fp->f_flag & FMARK) != 0 ||
1749                                             fp->f_count == fp->f_msgcount ||
1750                                             fp->f_unpcount != 0) {
1751                                                   mutex_exit(&fp->f_lock);
1752                                                   continue;
1753                                         }
1754                               }
1755                               atomic_or_uint(&fp->f_flag, FMARK);
1756 
1757                               if (fp->f_type != DTYPE_SOCKET ||
1758                                   (so = fp->f_socket) == NULL ||
1759                                   so->so_proto->pr_domain != &unixdomain ||
1760                                   (so->so_proto->pr_flags & PR_RIGHTS) == 0) {
1761                                         mutex_exit(&fp->f_lock);
1762                                         continue;
1763                               }
1764 
1765                               /* Gain file ref, mark our position, and unlock. */
1766                               didwork = true;
1767                               LIST_INSERT_AFTER(fp, dp, f_list);
1768                               fp->f_count++;
1769                               mutex_exit(&fp->f_lock);
1770                               mutex_exit(&filelist_lock);
1771 
1772                               /*
1773                                * Mark files referenced from sockets queued on the
1774                                * accept queue as well.
1775                                */
1776                               solock(so);
1777                               unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
1778                               if ((so->so_options & SO_ACCEPTCONN) != 0) {
1779                                         TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
1780                                                   unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1781                                         }
1782                                         TAILQ_FOREACH(so1, &so->so_q, so_qe) {
1783                                                   unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
1784                                         }
1785                               }
1786                               sounlock(so);
1787 
1788                               /* Re-lock and restart from where we left off. */
1789                               closef(fp);
1790                               mutex_enter(&filelist_lock);
1791                               np = LIST_NEXT(dp, f_list);
1792                               LIST_REMOVE(dp, f_list);
1793                     }
1794                     /*
1795                      * Bail early if we did nothing in the loop above.  Could
1796                      * happen because of concurrent activity causing unp_defer
1797                      * to get out of sync.
1798                      */
1799           } while (unp_defer != 0 && didwork);
1800 
1801           /*
1802            * Sweep pass.
1803            *
1804            * We grab an extra reference to each of the files that are
1805            * not otherwise accessible and then free the rights that are
1806            * stored in messages on them.
1807            */
1808           for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
1809                     KASSERT(mutex_owned(&filelist_lock));
1810                     np = LIST_NEXT(fp, f_list);
1811                     mutex_enter(&fp->f_lock);
1812 
1813                     /*
1814                      * Ignore non-sockets.
1815                      * Ignore dead sockets, or sockets with pending close.
1816                      * Ignore sockets obviously referenced elsewhere.
1817                      * Ignore sockets marked as referenced by our scan.
1818                      * Ignore new sockets that did not exist during the scan.
1819                      */
1820                     if (fp->f_type != DTYPE_SOCKET ||
1821                         fp->f_count == 0 || fp->f_unpcount != 0 ||
1822                         fp->f_count != fp->f_msgcount ||
1823                         (fp->f_flag & (FMARK | FSCAN)) != FSCAN) {
1824                               mutex_exit(&fp->f_lock);
1825                               continue;
1826                     }
1827 
1828                     /* Gain file ref, mark our position, and unlock. */
1829                     LIST_INSERT_AFTER(fp, dp, f_list);
1830                     fp->f_count++;
1831                     mutex_exit(&fp->f_lock);
1832                     mutex_exit(&filelist_lock);
1833 
1834                     /*
1835                      * Flush all data from the socket's receive buffer.
1836                      * This will cause files referenced only by the
1837                      * socket to be queued for close.
1838                      */
1839                     so = fp->f_socket;
1840                     solock(so);
1841                     sorflush(so);
1842                     sounlock(so);
1843 
1844                     /* Re-lock and restart from where we left off. */
1845                     closef(fp);
1846                     mutex_enter(&filelist_lock);
1847                     np = LIST_NEXT(dp, f_list);
1848                     LIST_REMOVE(dp, f_list);
1849           }
1850 }
1851 
1852 /*
1853  * Garbage collector thread.  While SCM_RIGHTS messages are in transit,
1854  * wake once per second to garbage collect.  Run continually while we
1855  * have deferred closes to process.
1856  */
1857 static void
unp_thread(void * cookie)1858 unp_thread(void *cookie)
1859 {
1860           file_t *dp;
1861 
1862           /* Allocate a dummy file for our scans. */
1863           if ((dp = fgetdummy()) == NULL) {
1864                     panic("unp_thread");
1865           }
1866 
1867           mutex_enter(&filelist_lock);
1868           for (;;) {
1869                     KASSERT(mutex_owned(&filelist_lock));
1870                     if (SLIST_EMPTY(&unp_thread_discard)) {
1871                               if (unp_rights != 0) {
1872                                         (void)cv_timedwait(&unp_thread_cv,
1873                                             &filelist_lock, hz);
1874                               } else {
1875                                         cv_wait(&unp_thread_cv, &filelist_lock);
1876                               }
1877                     }
1878                     unp_gc(dp);
1879           }
1880           /* NOTREACHED */
1881 }
1882 
1883 /*
1884  * Kick the garbage collector into action if there is something for
1885  * it to process.
1886  */
1887 static void
unp_thread_kick(void)1888 unp_thread_kick(void)
1889 {
1890 
1891           if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) {
1892                     mutex_enter(&filelist_lock);
1893                     cv_signal(&unp_thread_cv);
1894                     mutex_exit(&filelist_lock);
1895           }
1896 }
1897 
1898 void
unp_dispose(struct mbuf * m)1899 unp_dispose(struct mbuf *m)
1900 {
1901 
1902           if (m)
1903                     unp_scan(m, unp_discard_later, 1);
1904 }
1905 
1906 void
unp_scan(struct mbuf * m0,void (* op)(file_t *),int discard)1907 unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
1908 {
1909           struct mbuf *m;
1910           file_t **rp, *fp;
1911           struct cmsghdr *cm;
1912           int i, qfds;
1913 
1914           while (m0) {
1915                     for (m = m0; m; m = m->m_next) {
1916                               if (m->m_type != MT_CONTROL ||
1917                                   m->m_len < sizeof(*cm)) {
1918                                         continue;
1919                               }
1920                               cm = mtod(m, struct cmsghdr *);
1921                               if (cm->cmsg_level != SOL_SOCKET ||
1922                                   cm->cmsg_type != SCM_RIGHTS)
1923                                         continue;
1924                               qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
1925                                   / sizeof(file_t *);
1926                               rp = (file_t **)CMSG_DATA(cm);
1927                               for (i = 0; i < qfds; i++) {
1928                                         fp = *rp;
1929                                         if (discard) {
1930                                                   *rp = 0;
1931                                         }
1932                                         (*op)(fp);
1933                                         rp++;
1934                               }
1935                     }
1936                     m0 = m0->m_nextpkt;
1937           }
1938 }
1939 
1940 void
unp_mark(file_t * fp)1941 unp_mark(file_t *fp)
1942 {
1943 
1944           if (fp == NULL)
1945                     return;
1946 
1947           /* If we're already deferred, don't screw up the defer count */
1948           mutex_enter(&fp->f_lock);
1949           if (fp->f_flag & (FMARK | FDEFER)) {
1950                     mutex_exit(&fp->f_lock);
1951                     return;
1952           }
1953 
1954           /*
1955            * Minimize the number of deferrals...  Sockets are the only type of
1956            * file which can hold references to another file, so just mark
1957            * other files, and defer unmarked sockets for the next pass.
1958            */
1959           if (fp->f_type == DTYPE_SOCKET) {
1960                     unp_defer++;
1961                     KASSERT(fp->f_count != 0);
1962                     atomic_or_uint(&fp->f_flag, FDEFER);
1963           } else {
1964                     atomic_or_uint(&fp->f_flag, FMARK);
1965           }
1966           mutex_exit(&fp->f_lock);
1967 }
1968 
1969 static void
unp_discard_now(file_t * fp)1970 unp_discard_now(file_t *fp)
1971 {
1972 
1973           if (fp == NULL)
1974                     return;
1975 
1976           KASSERT(fp->f_count > 0);
1977           KASSERT(fp->f_msgcount > 0);
1978 
1979           mutex_enter(&fp->f_lock);
1980           fp->f_msgcount--;
1981           mutex_exit(&fp->f_lock);
1982           atomic_dec_uint(&unp_rights);
1983           (void)closef(fp);
1984 }
1985 
1986 static void
unp_discard_later(file_t * fp)1987 unp_discard_later(file_t *fp)
1988 {
1989 
1990           if (fp == NULL)
1991                     return;
1992 
1993           KASSERT(fp->f_count > 0);
1994           KASSERT(fp->f_msgcount > 0);
1995 
1996           mutex_enter(&filelist_lock);
1997           if (fp->f_unpcount++ == 0) {
1998                     SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist);
1999           }
2000           mutex_exit(&filelist_lock);
2001 }
2002 
2003 static void
unp_sysctl_create(void)2004 unp_sysctl_create(void)
2005 {
2006 
2007           KASSERT(usrreq_sysctllog == NULL);
2008           sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
2009                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2010                            CTLTYPE_LONG, "sendspace",
2011                            SYSCTL_DESCR("Default stream send space"),
2012                            NULL, 0, &unpst_sendspace, 0,
2013                            CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL);
2014           sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
2015                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2016                            CTLTYPE_LONG, "recvspace",
2017                            SYSCTL_DESCR("Default stream recv space"),
2018                            NULL, 0, &unpst_recvspace, 0,
2019                            CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL);
2020           sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
2021                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2022                            CTLTYPE_LONG, "sendspace",
2023                            SYSCTL_DESCR("Default datagram send space"),
2024                            NULL, 0, &unpdg_sendspace, 0,
2025                            CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL);
2026           sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
2027                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
2028                            CTLTYPE_LONG, "recvspace",
2029                            SYSCTL_DESCR("Default datagram recv space"),
2030                            NULL, 0, &unpdg_recvspace, 0,
2031                            CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL);
2032           sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
2033                            CTLFLAG_PERMANENT|CTLFLAG_READONLY,
2034                            CTLTYPE_INT, "inflight",
2035                            SYSCTL_DESCR("File descriptors in flight"),
2036                            NULL, 0, &unp_rights, 0,
2037                            CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL);
2038           sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
2039                            CTLFLAG_PERMANENT|CTLFLAG_READONLY,
2040                            CTLTYPE_INT, "deferred",
2041                            SYSCTL_DESCR("File descriptors deferred for close"),
2042                            NULL, 0, &unp_defer, 0,
2043                            CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL);
2044 }
2045 
2046 const struct pr_usrreqs unp_usrreqs = {
2047           .pr_attach          = unp_attach,
2048           .pr_detach          = unp_detach,
2049           .pr_accept          = unp_accept,
2050           .pr_bind  = unp_bind,
2051           .pr_listen          = unp_listen,
2052           .pr_connect         = unp_connect,
2053           .pr_connect2        = unp_connect2,
2054           .pr_disconnect      = unp_disconnect,
2055           .pr_shutdown        = unp_shutdown,
2056           .pr_abort = unp_abort,
2057           .pr_ioctl = unp_ioctl,
2058           .pr_stat  = unp_stat,
2059           .pr_peeraddr        = unp_peeraddr,
2060           .pr_sockaddr        = unp_sockaddr,
2061           .pr_rcvd  = unp_rcvd,
2062           .pr_recvoob         = unp_recvoob,
2063           .pr_send  = unp_send,
2064           .pr_sendoob         = unp_sendoob,
2065 };
2066