xref: /dragonfly/sys/kern/uipc_syscalls.c (revision b272101acc636ac635f83d03265ef6a44a3ba51a)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1990, 1993
3  *        The Regents of the University of California.  All rights reserved.
4  *
5  * sendfile(2) and related extensions:
6  * Copyright (c) 1998, David Greenman. All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  *
32  *        @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
33  * $FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.65.2.17 2003/04/04 17:11:16 tegge Exp $
34  */
35 
36 #include "opt_ktrace.h"
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/sysmsg.h>
42 #include <sys/malloc.h>
43 #include <sys/filedesc.h>
44 #include <sys/event.h>
45 #include <sys/proc.h>
46 #include <sys/fcntl.h>
47 #include <sys/file.h>
48 #include <sys/filio.h>
49 #include <sys/kern_syscall.h>
50 #include <sys/mbuf.h>
51 #include <sys/protosw.h>
52 #include <sys/sfbuf.h>
53 #include <sys/socket.h>
54 #include <sys/socketvar.h>
55 #include <sys/socketops.h>
56 #include <sys/uio.h>
57 #include <sys/vnode.h>
58 #include <sys/lock.h>
59 #include <sys/mount.h>
60 #include <sys/jail.h>
61 #ifdef KTRACE
62 #include <sys/ktrace.h>
63 #endif
64 #include <vm/vm.h>
65 #include <vm/vm_object.h>
66 #include <vm/vm_page.h>
67 #include <vm/vm_pageout.h>
68 #include <vm/vm_kern.h>
69 #include <vm/vm_extern.h>
70 #include <sys/file2.h>
71 #include <sys/signalvar.h>
72 #include <sys/serialize.h>
73 
74 #include <sys/thread2.h>
75 #include <sys/msgport2.h>
76 #include <sys/socketvar2.h>
77 #include <net/netmsg2.h>
78 #include <vm/vm_page2.h>
79 
80 extern int use_soaccept_pred_fast;
81 extern int use_sendfile_async;
82 extern int use_soconnect_async;
83 
84 /*
85  * System call interface to the socket abstraction.
86  */
87 
88 extern    struct fileops socketops;
89 
90 /*
91  * socket_args(int domain, int type, int protocol)
92  */
93 int
kern_socket(int domain,int type,int protocol,int * res)94 kern_socket(int domain, int type, int protocol, int *res)
95 {
96           struct thread *td = curthread;
97           struct filedesc *fdp = td->td_proc->p_fd;
98           struct socket *so;
99           struct file *fp;
100           int fd, error;
101           u_int fflags = 0;
102           int oflags = 0;
103 
104           KKASSERT(td->td_lwp);
105 
106           if (type & SOCK_NONBLOCK) {
107                     type &= ~SOCK_NONBLOCK;
108                     fflags |= FNONBLOCK;
109           }
110           if (type & SOCK_CLOEXEC) {
111                     type &= ~SOCK_CLOEXEC;
112                     oflags |= O_CLOEXEC;
113           }
114 
115           error = falloc(td->td_lwp, &fp, &fd);
116           if (error)
117                     return (error);
118           error = socreate(domain, &so, type, protocol, td);
119           if (error) {
120                     fsetfd(fdp, NULL, fd);
121           } else {
122                     fp->f_type = DTYPE_SOCKET;
123                     fp->f_flag = FREAD | FWRITE | fflags;
124                     fp->f_ops = &socketops;
125                     fp->f_data = so;
126                     if (oflags & O_CLOEXEC)
127                               fdp->fd_files[fd].fileflags |= UF_EXCLOSE;
128                     *res = fd;
129                     fsetfd(fdp, fp, fd);
130           }
131           fdrop(fp);
132           return (error);
133 }
134 
135 /*
136  * MPALMOSTSAFE
137  */
138 int
sys_socket(struct sysmsg * sysmsg,const struct socket_args * uap)139 sys_socket(struct sysmsg *sysmsg, const struct socket_args *uap)
140 {
141           int error;
142 
143           error = kern_socket(uap->domain, uap->type, uap->protocol,
144                                   &sysmsg->sysmsg_iresult);
145 
146           return (error);
147 }
148 
149 int
kern_bind(int s,struct sockaddr * sa)150 kern_bind(int s, struct sockaddr *sa)
151 {
152           struct thread *td = curthread;
153           struct file *fp;
154           int error;
155 
156           error = holdsock(td, s, &fp);
157           if (error)
158                     return (error);
159           error = sobind((struct socket *)fp->f_data, sa, td);
160           dropfp(td, s, fp);
161 
162           return (error);
163 }
164 
165 /*
166  * bind_args(int s, caddr_t name, int namelen)
167  *
168  * MPALMOSTSAFE
169  */
170 int
sys_bind(struct sysmsg * sysmsg,const struct bind_args * uap)171 sys_bind(struct sysmsg *sysmsg, const struct bind_args *uap)
172 {
173           struct sockaddr *sa;
174           int error;
175 
176           error = getsockaddr(&sa, uap->name, uap->namelen);
177           if (error)
178                     return (error);
179           if (!prison_remote_ip(curthread, sa)) {
180                     kfree(sa, M_SONAME);
181                     return EAFNOSUPPORT;
182           }
183           error = kern_bind(uap->s, sa);
184           kfree(sa, M_SONAME);
185 
186           return (error);
187 }
188 
189 int
kern_listen(int s,int backlog)190 kern_listen(int s, int backlog)
191 {
192           struct thread *td = curthread;
193           struct file *fp;
194           int error;
195 
196           error = holdsock(td, s, &fp);
197           if (error)
198                     return (error);
199           error = solisten((struct socket *)fp->f_data, backlog, td);
200           dropfp(td, s, fp);
201 
202           return (error);
203 }
204 
205 /*
206  * listen_args(int s, int backlog)
207  *
208  * MPALMOSTSAFE
209  */
210 int
sys_listen(struct sysmsg * sysmsg,const struct listen_args * uap)211 sys_listen(struct sysmsg *sysmsg, const struct listen_args *uap)
212 {
213           int error;
214 
215           error = kern_listen(uap->s, uap->backlog);
216           return (error);
217 }
218 
219 /*
220  * Returns the accepted socket as well.
221  *
222  * NOTE!  The sockets sitting on so_comp/so_incomp might have 0 refs, the
223  *          pool token is absolutely required to avoid a sofree() race,
224  *          as well as to avoid tailq handling races.
225  */
226 static boolean_t
soaccept_predicate(struct netmsg_so_notify * msg)227 soaccept_predicate(struct netmsg_so_notify *msg)
228 {
229           struct socket *head = msg->base.nm_so;
230           struct socket *so;
231 
232           if (head->so_error != 0) {
233                     msg->base.lmsg.ms_error = head->so_error;
234                     return (TRUE);
235           }
236           lwkt_getpooltoken(head);
237           if (!TAILQ_EMPTY(&head->so_comp)) {
238                     /* Abuse nm_so field as copy in/copy out parameter. XXX JH */
239                     so = TAILQ_FIRST(&head->so_comp);
240                     KKASSERT((so->so_state & (SS_INCOMP | SS_COMP)) == SS_COMP);
241                     TAILQ_REMOVE(&head->so_comp, so, so_list);
242                     head->so_qlen--;
243                     soclrstate(so, SS_COMP);
244 
245                     /*
246                      * Keep a reference before clearing the so_head
247                      * to avoid racing socket close in netisr.
248                      */
249                     soreference(so);
250                     so->so_head = NULL;
251 
252                     lwkt_relpooltoken(head);
253 
254                     msg->base.lmsg.ms_error = 0;
255                     msg->base.nm_so = so;
256                     return (TRUE);
257           }
258           lwkt_relpooltoken(head);
259           if (head->so_state & SS_CANTRCVMORE) {
260                     msg->base.lmsg.ms_error = ECONNABORTED;
261                     return (TRUE);
262           }
263           if (msg->nm_fflags & FNONBLOCK) {
264                     msg->base.lmsg.ms_error = EWOULDBLOCK;
265                     return (TRUE);
266           }
267 
268           return (FALSE);
269 }
270 
271 /*
272  * The second argument to kern_accept() is a handle to a struct sockaddr.
273  * This allows kern_accept() to return a pointer to an allocated struct
274  * sockaddr which must be freed later with FREE().  The caller must
275  * initialize *name to NULL.
276  */
277 int
kern_accept(int s,int fflags,struct sockaddr ** name,int * namelen,int * res,int sockflags)278 kern_accept(int s, int fflags, struct sockaddr **name, int *namelen, int *res,
279     int sockflags)
280 {
281           struct thread *td = curthread;
282           struct filedesc *fdp = td->td_proc->p_fd;
283           struct file *lfp = NULL;
284           struct file *nfp = NULL;
285           struct sockaddr *sa;
286           struct socket *head, *so;
287           struct netmsg_so_notify msg;
288           int fd;
289           u_int fflag;                  /* type must match fp->f_flag */
290           int error, tmp;
291 
292           *res = -1;
293           if (name && namelen && *namelen < 0)
294                     return (EINVAL);
295 
296           error = holdsock(td, s, &lfp);
297           if (error)
298                     return (error);
299 
300           error = falloc(td->td_lwp, &nfp, &fd);
301           if (error) {                  /* Probably ran out of file descriptors. */
302                     fdrop(lfp);
303                     return (error);
304           }
305           head = (struct socket *)lfp->f_data;
306           if ((head->so_options & SO_ACCEPTCONN) == 0) {
307                     error = EINVAL;
308                     goto done;
309           }
310 
311           if (fflags & O_FBLOCKING)
312                     fflags |= lfp->f_flag & ~FNONBLOCK;
313           else if (fflags & O_FNONBLOCKING)
314                     fflags |= lfp->f_flag | FNONBLOCK;
315           else
316                     fflags = lfp->f_flag;
317 
318           if (use_soaccept_pred_fast) {
319                     boolean_t pred;
320 
321                     /* Initialize necessary parts for soaccept_predicate() */
322                     netmsg_init(&msg.base, head, &netisr_apanic_rport, 0, NULL);
323                     msg.nm_fflags = fflags;
324 
325                     lwkt_getpooltoken(head);
326                     pred = soaccept_predicate(&msg);
327                     lwkt_relpooltoken(head);
328 
329                     if (pred) {
330                               error = msg.base.lmsg.ms_error;
331                               if (error)
332                                         goto done;
333                               else
334                                         goto accepted;
335                     }
336           }
337 
338           /* optimize for uniprocessor case later XXX JH */
339           netmsg_init_abortable(&msg.base, head, &curthread->td_msgport,
340                                     0, netmsg_so_notify, netmsg_so_notify_doabort);
341           msg.nm_predicate = soaccept_predicate;
342           msg.nm_fflags = fflags;
343           msg.nm_etype = NM_REVENT;
344           error = lwkt_domsg(head->so_port, &msg.base.lmsg, PCATCH);
345           if (error)
346                     goto done;
347 
348 accepted:
349           /*
350            * At this point we have the connection that's ready to be accepted.
351            *
352            * NOTE! soaccept_predicate() ref'd so for us, and soaccept() expects
353            *         to eat the ref and turn it into a descriptor.
354            */
355           so = msg.base.nm_so;
356 
357           fflag = lfp->f_flag;
358 
359           /* connection has been removed from the listen queue */
360           KNOTE(&head->so_rcv.ssb_kq.ki_note, 0);
361 
362           if (sockflags & SOCK_KERN_NOINHERIT) {
363                     fflag &= ~(FASYNC | FNONBLOCK);
364                     if (sockflags & SOCK_NONBLOCK)
365                               fflag |= FNONBLOCK;
366           } else {
367                     if (head->so_sigio != NULL)
368                               fsetown(fgetown(&head->so_sigio), &so->so_sigio);
369           }
370 
371           nfp->f_type = DTYPE_SOCKET;
372           nfp->f_flag = fflag;
373           nfp->f_ops = &socketops;
374           nfp->f_data = so;
375           /* Sync socket async state with file flags */
376           tmp = fflag & FASYNC;
377           fo_ioctl(nfp, FIOASYNC, (caddr_t)&tmp, td->td_ucred, NULL);
378 
379           sa = NULL;
380           if (so->so_faddr != NULL) {
381                     sa = so->so_faddr;
382                     so->so_faddr = NULL;
383 
384                     soaccept_generic(so);
385                     error = 0;
386           } else {
387                     error = soaccept(so, &sa);
388           }
389 
390           /*
391            * Set the returned name and namelen as applicable.  Set the returned
392            * namelen to 0 for older code which might ignore the return value
393            * from accept.
394            */
395           if (error == 0) {
396                     if (sa && name && namelen) {
397                               if (*namelen > sa->sa_len)
398                                         *namelen = sa->sa_len;
399                               *name = sa;
400                     } else {
401                               if (sa)
402                                         kfree(sa, M_SONAME);
403                     }
404           }
405 
406 done:
407           /*
408            * If an error occured clear the reserved descriptor, else associate
409            * nfp with it.
410            *
411            * Note that *res is normally ignored if an error is returned but
412            * a syscall message will still have access to the result code.
413            */
414           if (error) {
415                     fsetfd(fdp, NULL, fd);
416           } else {
417                     if (sockflags & SOCK_CLOEXEC)
418                               fdp->fd_files[fd].fileflags |= UF_EXCLOSE;
419                     *res = fd;
420                     fsetfd(fdp, nfp, fd);
421           }
422           fdrop(nfp);
423           dropfp(td, s, lfp);
424 
425           return (error);
426 }
427 
428 /*
429  * accept(int s, caddr_t name, int *anamelen)
430  *
431  * MPALMOSTSAFE
432  */
433 int
sys_accept(struct sysmsg * sysmsg,const struct accept_args * uap)434 sys_accept(struct sysmsg *sysmsg, const struct accept_args *uap)
435 {
436           struct sockaddr *sa = NULL;
437           int sa_len;
438           int error;
439 
440           if (uap->name) {
441                     error = copyin(uap->anamelen, &sa_len, sizeof(sa_len));
442                     if (error)
443                               return (error);
444 
445                     error = kern_accept(uap->s, 0, &sa, &sa_len,
446                                             &sysmsg->sysmsg_iresult, 0);
447 
448                     if (error == 0) {
449                               prison_local_ip(curthread, sa);
450                               error = copyout(sa, uap->name, sa_len);
451                     }
452                     if (error == 0) {
453                               error = copyout(&sa_len, uap->anamelen,
454                                                   sizeof(*uap->anamelen));
455                     }
456                     if (sa)
457                               kfree(sa, M_SONAME);
458           } else {
459                     error = kern_accept(uap->s, 0, NULL, 0,
460                                             &sysmsg->sysmsg_iresult, 0);
461           }
462           return (error);
463 }
464 
465 /*
466  * extaccept(int s, int fflags, caddr_t name, int *anamelen)
467  *
468  * MPALMOSTSAFE
469  */
470 int
sys_extaccept(struct sysmsg * sysmsg,const struct extaccept_args * uap)471 sys_extaccept(struct sysmsg *sysmsg, const struct extaccept_args *uap)
472 {
473           struct sockaddr *sa = NULL;
474           int sa_len;
475           int error;
476           int fflags = uap->flags & O_FMASK;
477 
478           if (uap->name) {
479                     error = copyin(uap->anamelen, &sa_len, sizeof(sa_len));
480                     if (error)
481                               return (error);
482 
483                     error = kern_accept(uap->s, fflags, &sa, &sa_len,
484                                             &sysmsg->sysmsg_iresult, 0);
485 
486                     if (error == 0) {
487                               prison_local_ip(curthread, sa);
488                               error = copyout(sa, uap->name, sa_len);
489                     }
490                     if (error == 0) {
491                               error = copyout(&sa_len, uap->anamelen,
492                                   sizeof(*uap->anamelen));
493                     }
494                     if (sa)
495                               kfree(sa, M_SONAME);
496           } else {
497                     error = kern_accept(uap->s, fflags, NULL, 0,
498                                             &sysmsg->sysmsg_iresult, 0);
499           }
500           return (error);
501 }
502 
503 /*
504  * accept4(int s, caddr_t name, int *anamelen, int flags)
505  *
506  * MPALMOSTSAFE
507  */
508 int
sys_accept4(struct sysmsg * sysmsg,const struct accept4_args * uap)509 sys_accept4(struct sysmsg *sysmsg, const struct accept4_args *uap)
510 {
511           struct sockaddr *sa = NULL;
512           int sa_len;
513           int error;
514           int sockflags;
515 
516           if (uap->flags & ~(SOCK_NONBLOCK | SOCK_CLOEXEC))
517                     return (EINVAL);
518           sockflags = uap->flags | SOCK_KERN_NOINHERIT;
519 
520           if (uap->name) {
521                     error = copyin(uap->anamelen, &sa_len, sizeof(sa_len));
522                     if (error)
523                               return (error);
524 
525                     error = kern_accept(uap->s, 0, &sa, &sa_len,
526                                             &sysmsg->sysmsg_iresult, sockflags);
527 
528                     if (error == 0) {
529                               prison_local_ip(curthread, sa);
530                               error = copyout(sa, uap->name, sa_len);
531                     }
532                     if (error == 0) {
533                               error = copyout(&sa_len, uap->anamelen,
534                                                   sizeof(*uap->anamelen));
535                     }
536                     if (sa)
537                               kfree(sa, M_SONAME);
538           } else {
539                     error = kern_accept(uap->s, 0, NULL, 0,
540                                             &sysmsg->sysmsg_iresult, sockflags);
541           }
542           return (error);
543 }
544 
545 /*
546  * Returns TRUE if predicate satisfied.
547  */
548 static boolean_t
soconnected_predicate(struct netmsg_so_notify * msg)549 soconnected_predicate(struct netmsg_so_notify *msg)
550 {
551           struct socket *so = msg->base.nm_so;
552 
553           /* check predicate */
554           if (!(so->so_state & SS_ISCONNECTING) || so->so_error != 0) {
555                     msg->base.lmsg.ms_error = so->so_error;
556                     return (TRUE);
557           }
558 
559           return (FALSE);
560 }
561 
562 int
kern_connect(int s,int fflags,struct sockaddr * sa)563 kern_connect(int s, int fflags, struct sockaddr *sa)
564 {
565           struct thread *td = curthread;
566           struct file *fp;
567           struct socket *so;
568           int error, interrupted = 0;
569 
570           error = holdsock(td, s, &fp);
571           if (error)
572                     return (error);
573           so = (struct socket *)fp->f_data;
574 
575           if (fflags & O_FBLOCKING)
576                     /* fflags &= ~FNONBLOCK; */;
577           else if (fflags & O_FNONBLOCKING)
578                     fflags |= FNONBLOCK;
579           else
580                     fflags = fp->f_flag;
581 
582           if (so->so_state & SS_ISCONNECTING) {
583                     error = EALREADY;
584                     goto done;
585           }
586           error = soconnect(so, sa, td, use_soconnect_async ? FALSE : TRUE);
587           if (error)
588                     goto bad;
589           if ((fflags & FNONBLOCK) && (so->so_state & SS_ISCONNECTING)) {
590                     error = EINPROGRESS;
591                     goto done;
592           }
593           if ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
594                     struct netmsg_so_notify msg;
595 
596                     netmsg_init_abortable(&msg.base, so,
597                                               &curthread->td_msgport,
598                                               0,
599                                               netmsg_so_notify,
600                                               netmsg_so_notify_doabort);
601                     msg.nm_predicate = soconnected_predicate;
602                     msg.nm_etype = NM_REVENT;
603                     error = lwkt_domsg(so->so_port, &msg.base.lmsg, PCATCH);
604                     if (error == EINTR || error == ERESTART)
605                               interrupted = 1;
606           }
607           if (error == 0) {
608                     error = so->so_error;
609                     so->so_error = 0;
610           }
611 bad:
612           if (!interrupted)
613                     soclrstate(so, SS_ISCONNECTING);
614           if (error == ERESTART)
615                     error = EINTR;
616 done:
617           dropfp(td, s, fp);
618 
619           return (error);
620 }
621 
622 /*
623  * connect_args(int s, caddr_t name, int namelen)
624  *
625  * MPALMOSTSAFE
626  */
627 int
sys_connect(struct sysmsg * sysmsg,const struct connect_args * uap)628 sys_connect(struct sysmsg *sysmsg, const struct connect_args *uap)
629 {
630           struct sockaddr *sa;
631           int error;
632 
633           error = getsockaddr(&sa, uap->name, uap->namelen);
634           if (error)
635                     return (error);
636           if (!prison_remote_ip(curthread, sa)) {
637                     kfree(sa, M_SONAME);
638                     return EAFNOSUPPORT;
639           }
640           error = kern_connect(uap->s, 0, sa);
641           kfree(sa, M_SONAME);
642 
643           return (error);
644 }
645 
646 /*
647  * connect_args(int s, int fflags, caddr_t name, int namelen)
648  *
649  * MPALMOSTSAFE
650  */
651 int
sys_extconnect(struct sysmsg * sysmsg,const struct extconnect_args * uap)652 sys_extconnect(struct sysmsg *sysmsg, const struct extconnect_args *uap)
653 {
654           struct sockaddr *sa;
655           int error;
656           int fflags = uap->flags & O_FMASK;
657 
658           error = getsockaddr(&sa, uap->name, uap->namelen);
659           if (error)
660                     return (error);
661           if (!prison_remote_ip(curthread, sa)) {
662                     kfree(sa, M_SONAME);
663                     return EAFNOSUPPORT;
664           }
665           error = kern_connect(uap->s, fflags, sa);
666           kfree(sa, M_SONAME);
667 
668           return (error);
669 }
670 
671 int
kern_socketpair(int domain,int type,int protocol,int * sv)672 kern_socketpair(int domain, int type, int protocol, int *sv)
673 {
674           struct thread *td = curthread;
675           struct filedesc *fdp;
676           struct file *fp1, *fp2;
677           struct socket *so1, *so2;
678           struct ucred *cred = curthread->td_ucred;
679           int fd1, fd2, error;
680           u_int fflags = 0;
681           int oflags = 0;
682 
683           if (type & SOCK_NONBLOCK) {
684                     type &= ~SOCK_NONBLOCK;
685                     fflags |= FNONBLOCK;
686           }
687           if (type & SOCK_CLOEXEC) {
688                     type &= ~SOCK_CLOEXEC;
689                     oflags |= O_CLOEXEC;
690           }
691 
692           fdp = td->td_proc->p_fd;
693           error = socreate(domain, &so1, type, protocol, td);
694           if (error)
695                     return (error);
696           error = socreate(domain, &so2, type, protocol, td);
697           if (error)
698                     goto free1;
699           error = falloc(td->td_lwp, &fp1, &fd1);
700           if (error)
701                     goto free2;
702           sv[0] = fd1;
703           fp1->f_data = so1;
704           error = falloc(td->td_lwp, &fp2, &fd2);
705           if (error)
706                     goto free3;
707           fp2->f_data = so2;
708           sv[1] = fd2;
709           error = soconnect2(so1, so2, cred);
710           if (error)
711                     goto free4;
712           if (type == SOCK_DGRAM) {
713                     /*
714                      * Datagram socket connection is asymmetric.
715                      */
716                      error = soconnect2(so2, so1, cred);
717                      if (error)
718                               goto free4;
719           }
720           fp1->f_type = fp2->f_type = DTYPE_SOCKET;
721           fp1->f_flag = fp2->f_flag = FREAD|FWRITE|fflags;
722           fp1->f_ops = fp2->f_ops = &socketops;
723           if (oflags & O_CLOEXEC) {
724                     fdp->fd_files[fd1].fileflags |= UF_EXCLOSE;
725                     fdp->fd_files[fd2].fileflags |= UF_EXCLOSE;
726           }
727           fsetfd(fdp, fp1, fd1);
728           fsetfd(fdp, fp2, fd2);
729           fdrop(fp1);
730           fdrop(fp2);
731           return (error);
732 free4:
733           fsetfd(fdp, NULL, fd2);
734           fdrop(fp2);
735 free3:
736           fsetfd(fdp, NULL, fd1);
737           fdrop(fp1);
738 free2:
739           (void)soclose(so2, 0);
740 free1:
741           (void)soclose(so1, 0);
742           return (error);
743 }
744 
745 /*
746  * socketpair(int domain, int type, int protocol, int *rsv)
747  */
748 int
sys_socketpair(struct sysmsg * sysmsg,const struct socketpair_args * uap)749 sys_socketpair(struct sysmsg *sysmsg, const struct socketpair_args *uap)
750 {
751           int error, sockv[2];
752 
753           error = kern_socketpair(uap->domain, uap->type, uap->protocol, sockv);
754 
755           if (error == 0) {
756                     error = copyout(sockv, uap->rsv, sizeof(sockv));
757 
758                     if (error != 0) {
759                               kern_close(sockv[0]);
760                               kern_close(sockv[1]);
761                     }
762           }
763 
764           return (error);
765 }
766 
767 int
kern_sendmsg(int s,struct sockaddr * sa,struct uio * auio,struct mbuf * control,int flags,size_t * res)768 kern_sendmsg(int s, struct sockaddr *sa, struct uio *auio,
769                struct mbuf *control, int flags, size_t *res)
770 {
771           struct thread *td = curthread;
772           struct lwp *lp = td->td_lwp;
773           struct proc *p = td->td_proc;
774           struct file *fp;
775           size_t len;
776           int error;
777           struct socket *so;
778 #ifdef KTRACE
779           struct iovec *ktriov = NULL;
780           struct uio ktruio;
781 #endif
782 
783           error = holdsock(td, s, &fp);
784           if (error)
785                     return (error);
786 #ifdef KTRACE
787           if (KTRPOINT(td, KTR_GENIO)) {
788                     int iovlen = auio->uio_iovcnt * sizeof (struct iovec);
789 
790                     ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
791                     bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
792                     ktruio = *auio;
793           }
794 #endif
795           len = auio->uio_resid;
796           so = (struct socket *)fp->f_data;
797           if ((flags & (MSG_FNONBLOCKING|MSG_FBLOCKING)) == 0) {
798                     if (fp->f_flag & FNONBLOCK)
799                               flags |= MSG_FNONBLOCKING;
800           }
801           error = so_pru_sosend(so, sa, auio, NULL, control, flags, td);
802           if (error) {
803                     if (auio->uio_resid != len && (error == ERESTART ||
804                         error == EINTR || error == EWOULDBLOCK))
805                               error = 0;
806                     if (error == EPIPE && !(flags & MSG_NOSIGNAL) &&
807                         !(so->so_options & SO_NOSIGPIPE))
808                               lwpsignal(p, lp, SIGPIPE);
809           }
810 #ifdef KTRACE
811           if (ktriov != NULL) {
812                     if (error == 0) {
813                               ktruio.uio_iov = ktriov;
814                               ktruio.uio_resid = len - auio->uio_resid;
815                               ktrgenio(lp, s, UIO_WRITE, &ktruio, error);
816                     }
817                     kfree(ktriov, M_TEMP);
818           }
819 #endif
820           if (error == 0)
821                     *res  = len - auio->uio_resid;
822           dropfp(td, s, fp);
823 
824           return (error);
825 }
826 
827 /*
828  * sendto_args(int s, caddr_t buf, size_t len, int flags, caddr_t to, int tolen)
829  *
830  * MPALMOSTSAFE
831  */
832 int
sys_sendto(struct sysmsg * sysmsg,const struct sendto_args * uap)833 sys_sendto(struct sysmsg *sysmsg, const struct sendto_args *uap)
834 {
835           struct thread *td = curthread;
836           struct uio auio;
837           struct iovec aiov;
838           struct sockaddr *sa = NULL;
839           int error;
840 
841           if (uap->to) {
842                     error = getsockaddr(&sa, uap->to, uap->tolen);
843                     if (error)
844                               return (error);
845                     if (!prison_remote_ip(curthread, sa)) {
846                               kfree(sa, M_SONAME);
847                               return EAFNOSUPPORT;
848                     }
849           }
850           aiov.iov_base = uap->buf;
851           aiov.iov_len = uap->len;
852           auio.uio_iov = &aiov;
853           auio.uio_iovcnt = 1;
854           auio.uio_offset = 0;
855           auio.uio_resid = uap->len;
856           auio.uio_segflg = UIO_USERSPACE;
857           auio.uio_rw = UIO_WRITE;
858           auio.uio_td = td;
859 
860           error = kern_sendmsg(uap->s, sa, &auio, NULL, uap->flags,
861                                    &sysmsg->sysmsg_szresult);
862 
863           if (sa)
864                     kfree(sa, M_SONAME);
865           return (error);
866 }
867 
868 /*
869  * sendmsg_args(int s, caddr_t msg, int flags)
870  *
871  * MPALMOSTSAFE
872  */
873 int
sys_sendmsg(struct sysmsg * sysmsg,const struct sendmsg_args * uap)874 sys_sendmsg(struct sysmsg *sysmsg, const struct sendmsg_args *uap)
875 {
876           struct thread *td = curthread;
877           struct msghdr msg;
878           struct uio auio;
879           struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
880           struct sockaddr *sa = NULL;
881           struct mbuf *control = NULL;
882           int error;
883 
884           error = copyin(uap->msg, (caddr_t)&msg, sizeof(msg));
885           if (error)
886                     return (error);
887 
888           /*
889            * Conditionally copyin msg.msg_name.
890            */
891           if (msg.msg_name) {
892                     error = getsockaddr(&sa, msg.msg_name, msg.msg_namelen);
893                     if (error)
894                               return (error);
895                     if (!prison_remote_ip(curthread, sa)) {
896                               kfree(sa, M_SONAME);
897                               return EAFNOSUPPORT;
898                     }
899           }
900 
901           /*
902            * Populate auio.
903            */
904           error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen,
905                                    &auio.uio_resid);
906           if (error)
907                     goto cleanup2;
908           auio.uio_iov = iov;
909           auio.uio_iovcnt = msg.msg_iovlen;
910           auio.uio_offset = 0;
911           auio.uio_segflg = UIO_USERSPACE;
912           auio.uio_rw = UIO_WRITE;
913           auio.uio_td = td;
914 
915           /*
916            * Conditionally copyin msg.msg_control.
917            */
918           if (msg.msg_control) {
919                     if (msg.msg_controllen < sizeof(struct cmsghdr) ||
920                         msg.msg_controllen > MLEN) {
921                               error = EINVAL;
922                               goto cleanup;
923                     }
924                     control = m_get(M_WAITOK, MT_CONTROL);
925                     control->m_len = msg.msg_controllen;
926                     error = copyin(msg.msg_control, mtod(control, caddr_t),
927                                      msg.msg_controllen);
928                     if (error) {
929                               m_free(control);
930                               goto cleanup;
931                     }
932           }
933 
934           error = kern_sendmsg(uap->s, sa, &auio, control, uap->flags,
935                                    &sysmsg->sysmsg_szresult);
936 
937 cleanup:
938           iovec_free(&iov, aiov);
939 cleanup2:
940           if (sa)
941                     kfree(sa, M_SONAME);
942           return (error);
943 }
944 
945 /*
946  * kern_recvmsg() takes a handle to sa and control.  If the handle is non-
947  * null, it returns a dynamically allocated struct sockaddr and an mbuf.
948  * Don't forget to FREE() and m_free() these if they are returned.
949  */
950 int
kern_recvmsg(int s,struct sockaddr ** sa,struct uio * auio,struct mbuf ** control,int * flags,size_t * res)951 kern_recvmsg(int s, struct sockaddr **sa, struct uio *auio,
952                struct mbuf **control, int *flags, size_t *res)
953 {
954           struct thread *td = curthread;
955           struct file *fp;
956           size_t len;
957           int error;
958           int lflags;
959           struct socket *so;
960 #ifdef KTRACE
961           struct iovec *ktriov = NULL;
962           struct uio ktruio;
963 #endif
964 
965           error = holdsock(td, s, &fp);
966           if (error)
967                     return (error);
968 #ifdef KTRACE
969           if (KTRPOINT(td, KTR_GENIO)) {
970                     int iovlen = auio->uio_iovcnt * sizeof (struct iovec);
971 
972                     ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
973                     bcopy(auio->uio_iov, ktriov, iovlen);
974                     ktruio = *auio;
975           }
976 #endif
977           len = auio->uio_resid;
978           so = (struct socket *)fp->f_data;
979 
980           if (flags == NULL || (*flags & (MSG_FNONBLOCKING|MSG_FBLOCKING)) == 0) {
981                     if (fp->f_flag & FNONBLOCK) {
982                               if (flags) {
983                                         *flags |= MSG_FNONBLOCKING;
984                               } else {
985                                         lflags = MSG_FNONBLOCKING;
986                                         flags = &lflags;
987                               }
988                     }
989           }
990 
991           error = so_pru_soreceive(so, sa, auio, NULL, control, flags);
992           if (error) {
993                     if (auio->uio_resid != len && (error == ERESTART ||
994                         error == EINTR || error == EWOULDBLOCK))
995                               error = 0;
996           }
997 #ifdef KTRACE
998           if (ktriov != NULL) {
999                     if (error == 0) {
1000                               ktruio.uio_iov = ktriov;
1001                               ktruio.uio_resid = len - auio->uio_resid;
1002                               ktrgenio(td->td_lwp, s, UIO_READ, &ktruio, error);
1003                     }
1004                     kfree(ktriov, M_TEMP);
1005           }
1006 #endif
1007           if (error == 0)
1008                     *res = len - auio->uio_resid;
1009           dropfp(td, s, fp);
1010 
1011           return (error);
1012 }
1013 
1014 /*
1015  * recvfrom_args(int s, caddr_t buf, size_t len, int flags,
1016  *                            caddr_t from, int *fromlenaddr)
1017  *
1018  * MPALMOSTSAFE
1019  */
1020 int
sys_recvfrom(struct sysmsg * sysmsg,const struct recvfrom_args * uap)1021 sys_recvfrom(struct sysmsg *sysmsg, const struct recvfrom_args *uap)
1022 {
1023           struct thread *td = curthread;
1024           struct uio auio;
1025           struct iovec aiov;
1026           struct sockaddr *sa = NULL;
1027           int error, fromlen;
1028           int flags;
1029 
1030           if (uap->from && uap->fromlenaddr) {
1031                     error = copyin(uap->fromlenaddr, &fromlen, sizeof(fromlen));
1032                     if (error)
1033                               return (error);
1034                     if (fromlen < 0)
1035                               return (EINVAL);
1036           } else {
1037                     fromlen = 0;
1038           }
1039           aiov.iov_base = uap->buf;
1040           aiov.iov_len = uap->len;
1041           auio.uio_iov = &aiov;
1042           auio.uio_iovcnt = 1;
1043           auio.uio_offset = 0;
1044           auio.uio_resid = uap->len;
1045           auio.uio_segflg = UIO_USERSPACE;
1046           auio.uio_rw = UIO_READ;
1047           auio.uio_td = td;
1048           flags = uap->flags;
1049 
1050           error = kern_recvmsg(uap->s, uap->from ? &sa : NULL, &auio, NULL,
1051                                    &flags, &sysmsg->sysmsg_szresult);
1052 
1053           if (error == 0 && uap->from) {
1054                     /* note: sa may still be NULL */
1055                     if (sa) {
1056                               fromlen = MIN(fromlen, sa->sa_len);
1057                               prison_local_ip(curthread, sa);
1058                               error = copyout(sa, uap->from, fromlen);
1059                     } else {
1060                               fromlen = 0;
1061                     }
1062                     if (error == 0) {
1063                               error = copyout(&fromlen, uap->fromlenaddr,
1064                                                   sizeof(fromlen));
1065                     }
1066           }
1067           if (sa)
1068                     kfree(sa, M_SONAME);
1069 
1070           return (error);
1071 }
1072 
1073 /*
1074  * recvmsg_args(int s, struct msghdr *msg, int flags)
1075  *
1076  * MPALMOSTSAFE
1077  */
1078 int
sys_recvmsg(struct sysmsg * sysmsg,const struct recvmsg_args * uap)1079 sys_recvmsg(struct sysmsg *sysmsg, const struct recvmsg_args *uap)
1080 {
1081           struct thread *td = curthread;
1082           struct msghdr msg;
1083           struct uio auio;
1084           struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
1085           struct mbuf *m, *control = NULL;
1086           struct sockaddr *sa = NULL;
1087           caddr_t ctlbuf;
1088           socklen_t *ufromlenp, *ucontrollenp;
1089           int error, fromlen, controllen, len, flags, *uflagsp;
1090 
1091           /*
1092            * This copyin handles everything except the iovec.
1093            */
1094           error = copyin(uap->msg, &msg, sizeof(msg));
1095           if (error)
1096                     return (error);
1097 
1098           if (msg.msg_name && msg.msg_namelen < 0)
1099                     return (EINVAL);
1100           if (msg.msg_control && msg.msg_controllen < 0)
1101                     return (EINVAL);
1102 
1103           ufromlenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr,
1104                         msg_namelen));
1105           ucontrollenp = (socklen_t *)((caddr_t)uap->msg + offsetof(struct msghdr,
1106                            msg_controllen));
1107           uflagsp = (int *)((caddr_t)uap->msg + offsetof(struct msghdr,
1108                                                                       msg_flags));
1109 
1110           /*
1111            * Populate auio.
1112            */
1113           error = iovec_copyin(msg.msg_iov, &iov, aiov, msg.msg_iovlen,
1114                                    &auio.uio_resid);
1115           if (error)
1116                     return (error);
1117           auio.uio_iov = iov;
1118           auio.uio_iovcnt = msg.msg_iovlen;
1119           auio.uio_offset = 0;
1120           auio.uio_segflg = UIO_USERSPACE;
1121           auio.uio_rw = UIO_READ;
1122           auio.uio_td = td;
1123 
1124           flags = uap->flags;
1125 
1126           error = kern_recvmsg(uap->s,
1127                                    (msg.msg_name ? &sa : NULL), &auio,
1128                                    (msg.msg_control ? &control : NULL), &flags,
1129                                    &sysmsg->sysmsg_szresult);
1130 
1131           /*
1132            * Conditionally copyout the name and populate the namelen field.
1133            */
1134           if (error == 0 && msg.msg_name) {
1135                     /* note: sa may still be NULL */
1136                     if (sa != NULL) {
1137                               fromlen = MIN(msg.msg_namelen, sa->sa_len);
1138                               prison_local_ip(curthread, sa);
1139                               error = copyout(sa, msg.msg_name, fromlen);
1140                     } else {
1141                               fromlen = 0;
1142                     }
1143                     if (error == 0)
1144                               error = copyout(&fromlen, ufromlenp,
1145                                   sizeof(*ufromlenp));
1146           }
1147 
1148           /*
1149            * Copyout msg.msg_control and msg.msg_controllen.
1150            */
1151           if (error == 0 && msg.msg_control) {
1152                     len = msg.msg_controllen;
1153                     m = control;
1154                     ctlbuf = (caddr_t)msg.msg_control;
1155 
1156                     while(m && len > 0) {
1157                               unsigned int tocopy;
1158 
1159                               if (len >= m->m_len) {
1160                                         tocopy = m->m_len;
1161                               } else {
1162                                         msg.msg_flags |= MSG_CTRUNC;
1163                                         tocopy = len;
1164                               }
1165 
1166                               error = copyout(mtod(m, caddr_t), ctlbuf, tocopy);
1167                               if (error)
1168                                         goto cleanup;
1169 
1170                               ctlbuf += tocopy;
1171                               len -= tocopy;
1172                               m = m->m_next;
1173                     }
1174                     controllen = ctlbuf - (caddr_t)msg.msg_control;
1175                     error = copyout(&controllen, ucontrollenp,
1176                         sizeof(*ucontrollenp));
1177           }
1178 
1179           if (error == 0)
1180                     error = copyout(&flags, uflagsp, sizeof(*uflagsp));
1181 
1182 cleanup:
1183           if (sa)
1184                     kfree(sa, M_SONAME);
1185           iovec_free(&iov, aiov);
1186           if (control)
1187                     m_freem(control);
1188           return (error);
1189 }
1190 
1191 /*
1192  * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an
1193  * in kernel pointer instead of a userland pointer.  This allows us
1194  * to manipulate socket options in the emulation code.
1195  */
1196 int
kern_setsockopt(int s,struct sockopt * sopt)1197 kern_setsockopt(int s, struct sockopt *sopt)
1198 {
1199           struct thread *td = curthread;
1200           struct file *fp;
1201           int error;
1202 
1203           if (sopt->sopt_val == NULL && sopt->sopt_valsize != 0)
1204                     return (EFAULT);
1205           if (sopt->sopt_val != NULL && sopt->sopt_valsize == 0)
1206                     return (EINVAL);
1207           if (sopt->sopt_valsize > SOMAXOPT_SIZE) /* unsigned */
1208                     return (EINVAL);
1209 
1210           error = holdsock(td, s, &fp);
1211           if (error)
1212                     return (error);
1213 
1214           error = sosetopt((struct socket *)fp->f_data, sopt);
1215           dropfp(td, s, fp);
1216 
1217           return (error);
1218 }
1219 
1220 /*
1221  * setsockopt_args(int s, int level, int name, caddr_t val, int valsize)
1222  *
1223  * MPALMOSTSAFE
1224  */
1225 int
sys_setsockopt(struct sysmsg * sysmsg,const struct setsockopt_args * uap)1226 sys_setsockopt(struct sysmsg *sysmsg, const struct setsockopt_args *uap)
1227 {
1228           struct thread *td = curthread;
1229           struct sockopt sopt;
1230           int error;
1231 
1232           sopt.sopt_level = uap->level;
1233           sopt.sopt_name = uap->name;
1234           sopt.sopt_valsize = uap->valsize;
1235           sopt.sopt_td = td;
1236           sopt.sopt_val = NULL;
1237 
1238           if (sopt.sopt_valsize > SOMAXOPT_SIZE) /* unsigned */
1239                     return (EINVAL);
1240           if (uap->val) {
1241                     sopt.sopt_val = kmalloc(sopt.sopt_valsize, M_TEMP, M_WAITOK);
1242                     error = copyin(uap->val, sopt.sopt_val, sopt.sopt_valsize);
1243                     if (error)
1244                               goto out;
1245           }
1246 
1247           error = kern_setsockopt(uap->s, &sopt);
1248 out:
1249           if (uap->val)
1250                     kfree(sopt.sopt_val, M_TEMP);
1251           return(error);
1252 }
1253 
1254 /*
1255  * If sopt->sopt_td == NULL, then sopt->sopt_val is treated as an
1256  * in kernel pointer instead of a userland pointer.  This allows us
1257  * to manipulate socket options in the emulation code.
1258  */
1259 int
kern_getsockopt(int s,struct sockopt * sopt)1260 kern_getsockopt(int s, struct sockopt *sopt)
1261 {
1262           struct thread *td = curthread;
1263           struct file *fp;
1264           int error;
1265 
1266           if (sopt->sopt_val == NULL && sopt->sopt_valsize != 0)
1267                     return (EFAULT);
1268           if (sopt->sopt_val != NULL && sopt->sopt_valsize == 0)
1269                     return (EINVAL);
1270 
1271           error = holdsock(td, s, &fp);
1272           if (error)
1273                     return (error);
1274 
1275           error = sogetopt((struct socket *)fp->f_data, sopt);
1276           dropfp(td, s, fp);
1277 
1278           return (error);
1279 }
1280 
1281 /*
1282  * getsockopt_args(int s, int level, int name, caddr_t val, int *avalsize)
1283  *
1284  * MPALMOSTSAFE
1285  */
1286 int
sys_getsockopt(struct sysmsg * sysmsg,const struct getsockopt_args * uap)1287 sys_getsockopt(struct sysmsg *sysmsg, const struct getsockopt_args *uap)
1288 {
1289           struct thread *td = curthread;
1290           struct sockopt sopt;
1291           int error, valsize, valszmax, mflag = 0;
1292 
1293           if (uap->val) {
1294                     error = copyin(uap->avalsize, &valsize, sizeof(valsize));
1295                     if (error)
1296                               return (error);
1297           } else {
1298                     valsize = 0;
1299           }
1300 
1301           sopt.sopt_level = uap->level;
1302           sopt.sopt_name = uap->name;
1303           sopt.sopt_valsize = valsize;
1304           sopt.sopt_td = td;
1305           sopt.sopt_val = NULL;
1306 
1307           if (td->td_proc->p_ucred->cr_uid == 0) {
1308                     valszmax = SOMAXOPT_SIZE0;
1309                     mflag = M_NULLOK;
1310           } else {
1311                     valszmax = SOMAXOPT_SIZE;
1312           }
1313           if (sopt.sopt_valsize > valszmax) /* unsigned */
1314                     return (EINVAL);
1315           if (uap->val) {
1316                     sopt.sopt_val = kmalloc(sopt.sopt_valsize, M_TEMP,
1317                         M_WAITOK | mflag);
1318                     if (sopt.sopt_val == NULL)
1319                               return (ENOBUFS);
1320                     error = copyin(uap->val, sopt.sopt_val, sopt.sopt_valsize);
1321                     if (error)
1322                               goto out;
1323           }
1324 
1325           error = kern_getsockopt(uap->s, &sopt);
1326           if (error)
1327                     goto out;
1328           valsize = sopt.sopt_valsize;
1329           error = copyout(&valsize, uap->avalsize, sizeof(valsize));
1330           if (error)
1331                     goto out;
1332           if (uap->val)
1333                     error = copyout(sopt.sopt_val, uap->val, sopt.sopt_valsize);
1334 out:
1335           if (uap->val)
1336                     kfree(sopt.sopt_val, M_TEMP);
1337           return (error);
1338 }
1339 
1340 /*
1341  * The second argument to kern_getsockname() is a handle to a struct sockaddr.
1342  * This allows kern_getsockname() to return a pointer to an allocated struct
1343  * sockaddr which must be freed later with FREE().  The caller must
1344  * initialize *name to NULL.
1345  */
1346 int
kern_getsockname(int s,struct sockaddr ** name,int * namelen)1347 kern_getsockname(int s, struct sockaddr **name, int *namelen)
1348 {
1349           struct thread *td = curthread;
1350           struct file *fp;
1351           struct socket *so;
1352           struct sockaddr *sa = NULL;
1353           int error;
1354 
1355           error = holdsock(td, s, &fp);
1356           if (error)
1357                     return (error);
1358           if (*namelen < 0) {
1359                     fdrop(fp);
1360                     return (EINVAL);
1361           }
1362           so = (struct socket *)fp->f_data;
1363           error = so_pru_sockaddr(so, &sa);
1364           if (error == 0) {
1365                     if (sa == NULL) {
1366                               *namelen = 0;
1367                     } else {
1368                               *namelen = MIN(*namelen, sa->sa_len);
1369                               *name = sa;
1370                     }
1371           }
1372           dropfp(td, s, fp);
1373 
1374           return (error);
1375 }
1376 
1377 /*
1378  * getsockname_args(int fdes, caddr_t asa, int *alen)
1379  *
1380  * Get socket name.
1381  *
1382  * MPALMOSTSAFE
1383  */
1384 int
sys_getsockname(struct sysmsg * sysmsg,const struct getsockname_args * uap)1385 sys_getsockname(struct sysmsg *sysmsg, const struct getsockname_args *uap)
1386 {
1387           struct sockaddr *sa = NULL;
1388           struct sockaddr satmp;
1389           int error, sa_len_in, sa_len_out;
1390 
1391           error = copyin(uap->alen, &sa_len_in, sizeof(sa_len_in));
1392           if (error)
1393                     return (error);
1394 
1395           sa_len_out = sa_len_in;
1396           error = kern_getsockname(uap->fdes, &sa, &sa_len_out);
1397 
1398           if (error == 0) {
1399                     if (sa) {
1400                               prison_local_ip(curthread, sa);
1401                               error = copyout(sa, uap->asa, sa_len_out);
1402                     } else {
1403                               /*
1404                                * unnamed uipc sockets don't bother storing
1405                                * sockaddr, simulate an AF_LOCAL sockaddr.
1406                                */
1407                               sa_len_out = sizeof(satmp);
1408                               if (sa_len_out > sa_len_in)
1409                                         sa_len_out = sa_len_in;
1410                               if (sa_len_out < 0)
1411                                         sa_len_out = 0;
1412                               bzero(&satmp, sizeof(satmp));
1413                               satmp.sa_len = sa_len_out;
1414                               satmp.sa_family = AF_LOCAL;
1415                               error = copyout(&satmp, uap->asa, sa_len_out);
1416                     }
1417           }
1418           if (error == 0 && sa_len_out != sa_len_in)
1419                     error = copyout(&sa_len_out, uap->alen, sizeof(*uap->alen));
1420           if (sa)
1421                     kfree(sa, M_SONAME);
1422           return (error);
1423 }
1424 
1425 /*
1426  * The second argument to kern_getpeername() is a handle to a struct sockaddr.
1427  * This allows kern_getpeername() to return a pointer to an allocated struct
1428  * sockaddr which must be freed later with FREE().  The caller must
1429  * initialize *name to NULL.
1430  */
1431 int
kern_getpeername(int s,struct sockaddr ** name,int * namelen)1432 kern_getpeername(int s, struct sockaddr **name, int *namelen)
1433 {
1434           struct thread *td = curthread;
1435           struct file *fp;
1436           struct socket *so;
1437           struct sockaddr *sa = NULL;
1438           int error;
1439 
1440           error = holdsock(td, s, &fp);
1441           if (error)
1442                     return (error);
1443           if (*namelen < 0) {
1444                     fdrop(fp);
1445                     return (EINVAL);
1446           }
1447           so = (struct socket *)fp->f_data;
1448           if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1449                     fdrop(fp);
1450                     return (ENOTCONN);
1451           }
1452           error = so_pru_peeraddr(so, &sa);
1453           if (error == 0) {
1454                     if (sa == NULL) {
1455                               *namelen = 0;
1456                     } else {
1457                               *namelen = MIN(*namelen, sa->sa_len);
1458                               *name = sa;
1459                     }
1460           }
1461           dropfp(td, s, fp);
1462 
1463           return (error);
1464 }
1465 
1466 /*
1467  * getpeername_args(int fdes, caddr_t asa, int *alen)
1468  *
1469  * Get name of peer for connected socket.
1470  *
1471  * MPALMOSTSAFE
1472  */
1473 int
sys_getpeername(struct sysmsg * sysmsg,const struct getpeername_args * uap)1474 sys_getpeername(struct sysmsg *sysmsg, const struct getpeername_args *uap)
1475 {
1476           struct sockaddr *sa = NULL;
1477           int error, sa_len;
1478 
1479           error = copyin(uap->alen, &sa_len, sizeof(sa_len));
1480           if (error)
1481                     return (error);
1482 
1483           error = kern_getpeername(uap->fdes, &sa, &sa_len);
1484 
1485           if (error == 0) {
1486                     prison_local_ip(curthread, sa);
1487                     error = copyout(sa, uap->asa, sa_len);
1488           }
1489           if (error == 0)
1490                     error = copyout(&sa_len, uap->alen, sizeof(*uap->alen));
1491           if (sa)
1492                     kfree(sa, M_SONAME);
1493           return (error);
1494 }
1495 
1496 int
getsockaddr(struct sockaddr ** namp,caddr_t uaddr,size_t len)1497 getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len)
1498 {
1499           struct sockaddr *sa;
1500           int error;
1501 
1502           *namp = NULL;
1503           if (len > SOCK_MAXADDRLEN)
1504                     return ENAMETOOLONG;
1505           if (len < offsetof(struct sockaddr, sa_data[0]))
1506                     return EDOM;
1507           sa = kmalloc(len, M_SONAME, M_WAITOK);
1508           error = copyin(uaddr, sa, len);
1509           if (error) {
1510                     kfree(sa, M_SONAME);
1511           } else {
1512                     sa->sa_len = len;
1513                     *namp = sa;
1514           }
1515           return error;
1516 }
1517 
1518 /*
1519  * Detach a mapped page and release resources back to the system.
1520  * We must release our wiring and if the object is ripped out
1521  * from under the vm_page we become responsible for freeing the
1522  * page.
1523  *
1524  * MPSAFE
1525  */
1526 static void
sf_buf_mfree(void * arg)1527 sf_buf_mfree(void *arg)
1528 {
1529           struct sf_buf *sf = arg;
1530           vm_page_t m;
1531 
1532           m = sf_buf_page(sf);
1533           if (sf_buf_free(sf)) {
1534                     /* sf invalid now */
1535                     vm_page_sbusy_drop(m);
1536 #if 0
1537                     if (m->object == NULL &&
1538                         m->wire_count == 0 &&
1539                         (m->flags & PG_NEED_COMMIT) == 0) {
1540                               vm_page_free(m);
1541                     } else {
1542                               vm_page_wakeup(m);
1543                     }
1544 #endif
1545           }
1546 }
1547 
1548 /*
1549  * sendfile(2).
1550  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1551  *         struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1552  *
1553  * Send a file specified by 'fd' and starting at 'offset' to a socket
1554  * specified by 's'. Send only 'nbytes' of the file or until EOF if
1555  * nbytes == 0. Optionally add a header and/or trailer to the socket
1556  * output. If specified, write the total number of bytes sent into *sbytes.
1557  *
1558  * In FreeBSD kern/uipc_syscalls.c,v 1.103, a bug was fixed that caused
1559  * the headers to count against the remaining bytes to be sent from
1560  * the file descriptor.  We may wish to implement a compatibility syscall
1561  * in the future.
1562  *
1563  * MPALMOSTSAFE
1564  */
1565 int
sys_sendfile(struct sysmsg * sysmsg,const struct sendfile_args * uap)1566 sys_sendfile(struct sysmsg *sysmsg, const struct sendfile_args *uap)
1567 {
1568           struct thread *td = curthread;
1569           struct file *fp;
1570           struct vnode *vp = NULL;
1571           struct sf_hdtr hdtr;
1572           struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
1573           struct uio auio;
1574           struct mbuf *mheader = NULL;
1575           size_t hbytes = 0;
1576           size_t tbytes;
1577           off_t hdtr_size = 0;
1578           off_t sbytes;
1579           int error;
1580 
1581           /*
1582            * Do argument checking. Must be a regular file in, stream
1583            * type and connected socket out, positive offset.
1584            */
1585           fp = holdfp(td, uap->fd, FREAD);
1586           if (fp == NULL) {
1587                     return (EBADF);
1588           }
1589           if (fp->f_type != DTYPE_VNODE) {
1590                     fdrop(fp);
1591                     return (EINVAL);
1592           }
1593           vp = (struct vnode *)fp->f_data;
1594           vref(vp);
1595           dropfp(td, uap->fd, fp);
1596 
1597           /*
1598            * If specified, get the pointer to the sf_hdtr struct for
1599            * any headers/trailers.
1600            */
1601           if (uap->hdtr) {
1602                     error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1603                     if (error)
1604                               goto done;
1605                     /*
1606                      * Send any headers.
1607                      */
1608                     if (hdtr.headers) {
1609                               error = iovec_copyin(hdtr.headers, &iov, aiov,
1610                                                        hdtr.hdr_cnt, &hbytes);
1611                               if (error)
1612                                         goto done;
1613                               auio.uio_iov = iov;
1614                               auio.uio_iovcnt = hdtr.hdr_cnt;
1615                               auio.uio_offset = 0;
1616                               auio.uio_segflg = UIO_USERSPACE;
1617                               auio.uio_rw = UIO_WRITE;
1618                               auio.uio_td = td;
1619                               auio.uio_resid = hbytes;
1620 
1621                               mheader = m_uiomove(&auio);
1622 
1623                               iovec_free(&iov, aiov);
1624                               if (mheader == NULL)
1625                                         goto done;
1626                     }
1627           }
1628 
1629           error = kern_sendfile(vp, uap->s, uap->offset, uap->nbytes, mheader,
1630                                     &sbytes, uap->flags);
1631           if (error)
1632                     goto done;
1633 
1634           /*
1635            * Send trailers. Wimp out and use writev(2).
1636            */
1637           if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1638                     error = iovec_copyin(hdtr.trailers, &iov, aiov,
1639                                              hdtr.trl_cnt, &auio.uio_resid);
1640                     if (error)
1641                               goto done;
1642                     auio.uio_iov = iov;
1643                     auio.uio_iovcnt = hdtr.trl_cnt;
1644                     auio.uio_offset = 0;
1645                     auio.uio_segflg = UIO_USERSPACE;
1646                     auio.uio_rw = UIO_WRITE;
1647                     auio.uio_td = td;
1648 
1649                     tbytes = 0;         /* avoid gcc warnings */
1650                     error = kern_sendmsg(uap->s, NULL, &auio, NULL, 0, &tbytes);
1651 
1652                     iovec_free(&iov, aiov);
1653                     if (error)
1654                               goto done;
1655                     hdtr_size += tbytes;          /* trailer bytes successfully sent */
1656           }
1657 
1658 done:
1659           if (vp)
1660                     vrele(vp);
1661           if (uap->sbytes != NULL) {
1662                     sbytes += hdtr_size;
1663                     copyout(&sbytes, uap->sbytes, sizeof(off_t));
1664           }
1665           return (error);
1666 }
1667 
1668 int
kern_sendfile(struct vnode * vp,int sfd,off_t offset,size_t nbytes,struct mbuf * mheader,off_t * sbytes,int flags)1669 kern_sendfile(struct vnode *vp, int sfd, off_t offset, size_t nbytes,
1670                 struct mbuf *mheader, off_t *sbytes, int flags)
1671 {
1672           struct thread *td = curthread;
1673           struct vm_object *obj;
1674           struct socket *so;
1675           struct file *fp;
1676           struct mbuf *m, *mp;
1677           struct sf_buf *sf;
1678           struct vm_page *pg;
1679           off_t off, xfsize, xbytes;
1680           off_t hbytes = 0;
1681           int error = 0;
1682 
1683           if (vp->v_type != VREG) {
1684                     error = EINVAL;
1685                     goto done0;
1686           }
1687           if ((obj = vp->v_object) == NULL) {
1688                     error = EINVAL;
1689                     goto done0;
1690           }
1691           error = holdsock(td, sfd, &fp);
1692           if (error)
1693                     goto done0;
1694           so = (struct socket *)fp->f_data;
1695           if (so->so_type != SOCK_STREAM) {
1696                     error = EINVAL;
1697                     goto done1;
1698           }
1699           if ((so->so_state & SS_ISCONNECTED) == 0) {
1700                     error = ENOTCONN;
1701                     goto done1;
1702           }
1703           if (offset < 0) {
1704                     error = EINVAL;
1705                     goto done1;
1706           }
1707 
1708           /*
1709            * preallocation is required for asynchronous passing of mbufs,
1710            * otherwise we can wind up building up an infinite number of
1711            * mbufs during the asynchronous latency.
1712            */
1713           if ((so->so_snd.ssb_flags & (SSB_PREALLOC | SSB_STOPSUPP)) == 0) {
1714                     error = EINVAL;
1715                     goto done1;
1716           }
1717 
1718           *sbytes = 0;
1719           xbytes = 0;
1720 
1721           /*
1722            * Protect against multiple writers to the socket.
1723            * We need at least a shared lock on the VM object
1724            */
1725           ssb_lock(&so->so_snd, M_WAITOK);
1726           vm_object_hold_shared(obj);
1727 
1728           /*
1729            * Loop through the pages in the file, starting with the requested
1730            * offset. Get a file page (do I/O if necessary), map the file page
1731            * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1732            * it on the socket.
1733            */
1734           for (off = offset; ;
1735                off += xfsize, *sbytes += xfsize + hbytes, xbytes += xfsize) {
1736                     vm_pindex_t pindex;
1737                     vm_offset_t pgoff;
1738                     long space;
1739                     int loops;
1740 
1741                     pindex = OFF_TO_IDX(off);
1742                     loops = 0;
1743 
1744 retry_lookup:
1745                     /*
1746                      * Calculate the amount to transfer. Not to exceed a page,
1747                      * the EOF, or the passed in nbytes.
1748                      */
1749                     xfsize = vp->v_filesize - off;
1750                     if (xfsize > PAGE_SIZE)
1751                               xfsize = PAGE_SIZE;
1752                     pgoff = (vm_offset_t)(off & PAGE_MASK);
1753                     if (PAGE_SIZE - pgoff < xfsize)
1754                               xfsize = PAGE_SIZE - pgoff;
1755                     if (nbytes && xfsize > (nbytes - xbytes))
1756                               xfsize = nbytes - xbytes;
1757                     if (xfsize <= 0)
1758                               break;
1759                     /*
1760                      * Optimize the non-blocking case by looking at the socket space
1761                      * before going to the extra work of constituting the sf_buf.
1762                      */
1763                     if (so->so_snd.ssb_flags & SSB_PREALLOC)
1764                               space = ssb_space_prealloc(&so->so_snd);
1765                     else
1766                               space = ssb_space(&so->so_snd);
1767 
1768                     if ((fp->f_flag & FNONBLOCK) && space <= 0) {
1769                               if (so->so_state & SS_CANTSENDMORE)
1770                                         error = EPIPE;
1771                               else
1772                                         error = EAGAIN;
1773                               goto done;
1774                     }
1775 
1776                     /*
1777                      * Attempt to look up the page.
1778                      *
1779                      * Try to find the data using a shared vm_object token and
1780                      * vm_page_lookup_sbusy_try() first.
1781                      *
1782                      * If data is missing, use a UIO_NOCOPY VOP_READ to load
1783                      * the missing data and loop back up.  We avoid all sorts
1784                      * of problems by not trying to hold onto the page during
1785                      * the I/O.
1786                      *
1787                      * NOTE: The soft-busy will temporary block filesystem
1788                      *         truncation operations when a file is removed
1789                      *         while the sendfile is running.
1790                      */
1791                     pg = vm_page_lookup_sbusy_try(obj, pindex, pgoff, xfsize);
1792                     if (pg == NULL) {
1793                               struct uio auio;
1794                               struct iovec aiov;
1795                               int bsize;
1796 
1797                               if (++loops > 100000) {
1798                                         kprintf("sendfile: VOP operation failed "
1799                                                   "to retain page\n");
1800                                         error = EIO;
1801                                         goto done;
1802                               }
1803 
1804                               vm_object_drop(obj);
1805                               bsize = vp->v_mount->mnt_stat.f_iosize;
1806                               auio.uio_iov = &aiov;
1807                               auio.uio_iovcnt = 1;
1808                               aiov.iov_base = 0;
1809                               aiov.iov_len = MAXBSIZE;
1810                               auio.uio_resid = MAXBSIZE;
1811                               auio.uio_offset = trunc_page(off);
1812                               auio.uio_segflg = UIO_NOCOPY;
1813                               auio.uio_rw = UIO_READ;
1814                               auio.uio_td = td;
1815 
1816                               vn_lock(vp, LK_SHARED | LK_RETRY);
1817                               error = VOP_READ_FP(vp, &auio,
1818                                                    IO_VMIO | ((MAXBSIZE / bsize) << 16),
1819                                                    td->td_ucred, fp);
1820                               vn_unlock(vp);
1821                               vm_object_hold_shared(obj);
1822 
1823                               if (error)
1824                                         goto done;
1825                               goto retry_lookup;
1826                     }
1827 
1828                     /*
1829                      * Get a sendfile buf. We usually wait as long as necessary,
1830                      * but this wait can be interrupted.
1831                      */
1832                     if ((sf = sf_buf_alloc(pg)) == NULL) {
1833                               vm_page_sbusy_drop(pg);
1834                               /* vm_page_try_to_free(pg); */
1835                               error = EINTR;
1836                               goto done;
1837                     }
1838 
1839                     /*
1840                      * Get an mbuf header and set it up as having external storage.
1841                      */
1842                     MGETHDR(m, M_WAITOK, MT_DATA);
1843                     m->m_ext.ext_free = sf_buf_mfree;
1844                     m->m_ext.ext_ref = sf_buf_ref;
1845                     m->m_ext.ext_arg = sf;
1846                     m->m_ext.ext_buf = (void *)sf_buf_kva(sf);
1847                     m->m_ext.ext_size = PAGE_SIZE;
1848                     m->m_data = (char *)sf_buf_kva(sf) + pgoff;
1849                     m->m_flags |= M_EXT;
1850                     m->m_pkthdr.len = m->m_len = xfsize;
1851                     KKASSERT((m->m_flags & (M_EXT_CLUSTER)) == 0);
1852 
1853                     if (mheader != NULL) {
1854                               hbytes = mheader->m_pkthdr.len;
1855                               mheader->m_pkthdr.len += m->m_pkthdr.len;
1856                               m_cat(mheader, m);
1857                               m = mheader;
1858                               mheader = NULL;
1859                     } else {
1860                               hbytes = 0;
1861                     }
1862 
1863                     /*
1864                      * Add the buffer to the socket buffer chain.
1865                      */
1866                     crit_enter();
1867 retry_space:
1868                     /*
1869                      * Make sure that the socket is still able to take more data.
1870                      * CANTSENDMORE being true usually means that the connection
1871                      * was closed. so_error is true when an error was sensed after
1872                      * a previous send.
1873                      * The state is checked after the page mapping and buffer
1874                      * allocation above since those operations may block and make
1875                      * any socket checks stale. From this point forward, nothing
1876                      * blocks before the pru_send (or more accurately, any blocking
1877                      * results in a loop back to here to re-check).
1878                      */
1879                     if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1880                               if (so->so_state & SS_CANTSENDMORE) {
1881                                         error = EPIPE;
1882                               } else {
1883                                         error = so->so_error;
1884                                         so->so_error = 0;
1885                               }
1886                               m_freem(m);
1887                               crit_exit();
1888                               goto done;
1889                     }
1890                     /*
1891                      * Wait for socket space to become available. We do this just
1892                      * after checking the connection state above in order to avoid
1893                      * a race condition with ssb_wait().
1894                      */
1895                     if (so->so_snd.ssb_flags & SSB_PREALLOC)
1896                               space = ssb_space_prealloc(&so->so_snd);
1897                     else
1898                               space = ssb_space(&so->so_snd);
1899 
1900                     if (space < m->m_pkthdr.len && space < so->so_snd.ssb_lowat) {
1901                               if (fp->f_flag & FNONBLOCK) {
1902                                         m_freem(m);
1903                                         crit_exit();
1904                                         error = EAGAIN;
1905                                         goto done;
1906                               }
1907                               error = ssb_wait(&so->so_snd);
1908                               /*
1909                                * An error from ssb_wait usually indicates that we've
1910                                * been interrupted by a signal. If we've sent anything
1911                                * then return bytes sent, otherwise return the error.
1912                                */
1913                               if (error) {
1914                                         m_freem(m);
1915                                         crit_exit();
1916                                         goto done;
1917                               }
1918                               goto retry_space;
1919                     }
1920 
1921                     if (so->so_snd.ssb_flags & SSB_PREALLOC) {
1922                               for (mp = m; mp != NULL; mp = mp->m_next)
1923                                         ssb_preallocstream(&so->so_snd, mp);
1924                     }
1925                     if (use_sendfile_async)
1926                               error = so_pru_senda(so, 0, m, NULL, NULL, td);
1927                     else
1928                               error = so_pru_send(so, 0, m, NULL, NULL, td);
1929 
1930                     crit_exit();
1931                     if (error)
1932                               goto done;
1933           }
1934           if (mheader != NULL) {
1935                     *sbytes += mheader->m_pkthdr.len;
1936 
1937                     if (so->so_snd.ssb_flags & SSB_PREALLOC) {
1938                               for (mp = mheader; mp != NULL; mp = mp->m_next)
1939                                         ssb_preallocstream(&so->so_snd, mp);
1940                     }
1941                     if (use_sendfile_async)
1942                               error = so_pru_senda(so, 0, mheader, NULL, NULL, td);
1943                     else
1944                               error = so_pru_send(so, 0, mheader, NULL, NULL, td);
1945 
1946                     mheader = NULL;
1947           }
1948 done:
1949           vm_object_drop(obj);
1950           ssb_unlock(&so->so_snd);
1951 done1:
1952           dropfp(td, sfd, fp);
1953 done0:
1954           if (mheader != NULL)
1955                     m_freem(mheader);
1956           return (error);
1957 }
1958