1 /*-
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
33 */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37
38 #include "opt_capsicum.h"
39 #include "opt_inet.h"
40 #include "opt_inet6.h"
41 #include "opt_compat.h"
42 #include "opt_ktrace.h"
43
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/capsicum.h>
47 #include <sys/condvar.h>
48 #include <sys/kernel.h>
49 #include <sys/lock.h>
50 #include <sys/mutex.h>
51 #include <sys/sysproto.h>
52 #include <sys/malloc.h>
53 #include <sys/filedesc.h>
54 #include <sys/event.h>
55 #include <sys/proc.h>
56 #include <sys/fcntl.h>
57 #include <sys/file.h>
58 #include <sys/filio.h>
59 #include <sys/jail.h>
60 #include <sys/mman.h>
61 #include <sys/mount.h>
62 #include <sys/mbuf.h>
63 #include <sys/protosw.h>
64 #include <sys/rwlock.h>
65 #include <sys/sf_buf.h>
66 #include <sys/sysent.h>
67 #include <sys/socket.h>
68 #include <sys/socketvar.h>
69 #include <sys/signalvar.h>
70 #include <sys/syscallsubr.h>
71 #include <sys/sysctl.h>
72 #include <sys/uio.h>
73 #include <sys/vnode.h>
74 #ifdef KTRACE
75 #include <sys/ktrace.h>
76 #endif
77 #ifdef COMPAT_FREEBSD32
78 #include <compat/freebsd32/freebsd32_util.h>
79 #endif
80
81 #include <net/vnet.h>
82
83 #include <security/audit/audit.h>
84 #include <security/mac/mac_framework.h>
85
86 #include <vm/vm.h>
87 #include <vm/vm_param.h>
88 #include <vm/vm_object.h>
89 #include <vm/vm_page.h>
90 #include <vm/vm_pager.h>
91 #include <vm/vm_kern.h>
92 #include <vm/vm_extern.h>
93 #include <vm/uma.h>
94
/*
 * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
 * and SOCK_NONBLOCK.
 *
 * ACCEPT4_INHERIT: the accepted socket takes its non-blocking state and
 * SIGIO owner from the listening socket (see kern_accept4()).
 * ACCEPT4_COMPAT: accept1() rewrites the returned address into the old
 * `struct osockaddr' layout; only honoured under COMPAT_OLDSOCK.
 */
#define	ACCEPT4_INHERIT	0x1
#define	ACCEPT4_COMPAT	0x2

/* Common back ends shared by the send*() and recv*() system calls. */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);

/* Common back ends that take a flag selecting old-style (4.3BSD) handling. */
static int accept1(struct thread *td, int s, struct sockaddr *uname,
		   socklen_t *anamelen, int flags);
static int do_sendfile(struct thread *td, struct sendfile_args *uap,
		   int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
		   int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
		   int compat);

/*
 * sendfile statistics: one counter per uint64_t-sized slot of struct sfstat.
 * Allocated by sfstat_init() and exported by sfstat_sysctl().
 */
counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
115
116 static void
sfstat_init(const void * unused)117 sfstat_init(const void *unused)
118 {
119
120 COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
121 M_WAITOK);
122 }
123 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
124
125 static int
sfstat_sysctl(SYSCTL_HANDLER_ARGS)126 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
127 {
128 struct sfstat s;
129
130 COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
131 if (req->newptr)
132 COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
133 return (SYSCTL_OUT(req, &s, sizeof(s)));
134 }
135 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
136 NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
137
138 /*
139 * Convert a user file descriptor to a kernel file entry and check if required
140 * capability rights are present.
141 * A reference on the file entry is held upon returning.
142 */
143 int
getsock_cap(struct thread * td,int fd,cap_rights_t * rightsp,struct file ** fpp,u_int * fflagp)144 getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
145 struct file **fpp, u_int *fflagp)
146 {
147 struct file *fp;
148 int error;
149
150 error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL);
151 if (error != 0)
152 return (error);
153 if (fp->f_type != DTYPE_SOCKET) {
154 fdrop(fp, td);
155 return (ENOTSOCK);
156 }
157 if (fflagp != NULL)
158 *fflagp = fp->f_flag;
159 *fpp = fp;
160 return (0);
161 }
162
/*
 * System call interface to the socket abstraction.
 *
 * COMPAT_OLDSOCK is switched on by COMPAT_43.  In this file it guards the
 * old-style entry points (oaccept(), osend(), osendmsg(), orecv(),
 * orecvfrom(), orecvmsg()) and the MSG_COMPAT / ACCEPT4_COMPAT handling
 * in the shared back ends.
 */
#if defined(COMPAT_43)
#define COMPAT_OLDSOCK
#endif
169
170 int
sys_socket(td,uap)171 sys_socket(td, uap)
172 struct thread *td;
173 struct socket_args /* {
174 int domain;
175 int type;
176 int protocol;
177 } */ *uap;
178 {
179 struct socket *so;
180 struct file *fp;
181 int fd, error, type, oflag, fflag;
182
183 AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
184
185 type = uap->type;
186 oflag = 0;
187 fflag = 0;
188 if ((type & SOCK_CLOEXEC) != 0) {
189 type &= ~SOCK_CLOEXEC;
190 oflag |= O_CLOEXEC;
191 }
192 if ((type & SOCK_NONBLOCK) != 0) {
193 type &= ~SOCK_NONBLOCK;
194 fflag |= FNONBLOCK;
195 }
196
197 #ifdef MAC
198 error = mac_socket_check_create(td->td_ucred, uap->domain, type,
199 uap->protocol);
200 if (error != 0)
201 return (error);
202 #endif
203 error = falloc(td, &fp, &fd, oflag);
204 if (error != 0)
205 return (error);
206 /* An extra reference on `fp' has been held for us by falloc(). */
207 error = socreate(uap->domain, &so, type, uap->protocol,
208 td->td_ucred, td);
209 if (error != 0) {
210 fdclose(td, fp, fd);
211 } else {
212 finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
213 if ((fflag & FNONBLOCK) != 0)
214 (void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
215 td->td_retval[0] = fd;
216 }
217 fdrop(fp, td);
218 return (error);
219 }
220
221 /* ARGSUSED */
222 int
sys_bind(td,uap)223 sys_bind(td, uap)
224 struct thread *td;
225 struct bind_args /* {
226 int s;
227 caddr_t name;
228 int namelen;
229 } */ *uap;
230 {
231 struct sockaddr *sa;
232 int error;
233
234 error = getsockaddr(&sa, uap->name, uap->namelen);
235 if (error == 0) {
236 error = kern_bindat(td, AT_FDCWD, uap->s, sa);
237 free(sa, M_SONAME);
238 }
239 return (error);
240 }
241
242 int
kern_bindat(struct thread * td,int dirfd,int fd,struct sockaddr * sa)243 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
244 {
245 struct socket *so;
246 struct file *fp;
247 cap_rights_t rights;
248 int error;
249
250 AUDIT_ARG_FD(fd);
251 AUDIT_ARG_SOCKADDR(td, dirfd, sa);
252 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND),
253 &fp, NULL);
254 if (error != 0)
255 return (error);
256 so = fp->f_data;
257 #ifdef KTRACE
258 if (KTRPOINT(td, KTR_STRUCT))
259 ktrsockaddr(sa);
260 #endif
261 #ifdef MAC
262 error = mac_socket_check_bind(td->td_ucred, so, sa);
263 if (error == 0) {
264 #endif
265 if (dirfd == AT_FDCWD)
266 error = sobind(so, sa, td);
267 else
268 error = sobindat(dirfd, so, sa, td);
269 #ifdef MAC
270 }
271 #endif
272 fdrop(fp, td);
273 return (error);
274 }
275
276 /* ARGSUSED */
277 int
sys_bindat(td,uap)278 sys_bindat(td, uap)
279 struct thread *td;
280 struct bindat_args /* {
281 int fd;
282 int s;
283 caddr_t name;
284 int namelen;
285 } */ *uap;
286 {
287 struct sockaddr *sa;
288 int error;
289
290 error = getsockaddr(&sa, uap->name, uap->namelen);
291 if (error == 0) {
292 error = kern_bindat(td, uap->fd, uap->s, sa);
293 free(sa, M_SONAME);
294 }
295 return (error);
296 }
297
298 /* ARGSUSED */
299 int
sys_listen(td,uap)300 sys_listen(td, uap)
301 struct thread *td;
302 struct listen_args /* {
303 int s;
304 int backlog;
305 } */ *uap;
306 {
307 struct socket *so;
308 struct file *fp;
309 cap_rights_t rights;
310 int error;
311
312 AUDIT_ARG_FD(uap->s);
313 error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN),
314 &fp, NULL);
315 if (error == 0) {
316 so = fp->f_data;
317 #ifdef MAC
318 error = mac_socket_check_listen(td->td_ucred, so);
319 if (error == 0)
320 #endif
321 error = solisten(so, uap->backlog, td);
322 fdrop(fp, td);
323 }
324 return(error);
325 }
326
327 /*
328 * accept1()
329 */
330 static int
accept1(td,s,uname,anamelen,flags)331 accept1(td, s, uname, anamelen, flags)
332 struct thread *td;
333 int s;
334 struct sockaddr *uname;
335 socklen_t *anamelen;
336 int flags;
337 {
338 struct sockaddr *name;
339 socklen_t namelen;
340 struct file *fp;
341 int error;
342
343 if (uname == NULL)
344 return (kern_accept4(td, s, NULL, NULL, flags, NULL));
345
346 error = copyin(anamelen, &namelen, sizeof (namelen));
347 if (error != 0)
348 return (error);
349
350 error = kern_accept4(td, s, &name, &namelen, flags, &fp);
351
352 if (error != 0)
353 return (error);
354
355 if (error == 0 && uname != NULL) {
356 #ifdef COMPAT_OLDSOCK
357 if (flags & ACCEPT4_COMPAT)
358 ((struct osockaddr *)name)->sa_family =
359 name->sa_family;
360 #endif
361 error = copyout(name, uname, namelen);
362 }
363 if (error == 0)
364 error = copyout(&namelen, anamelen,
365 sizeof(namelen));
366 if (error != 0)
367 fdclose(td, fp, td->td_retval[0]);
368 fdrop(fp, td);
369 free(name, M_SONAME);
370 return (error);
371 }
372
373 int
kern_accept(struct thread * td,int s,struct sockaddr ** name,socklen_t * namelen,struct file ** fp)374 kern_accept(struct thread *td, int s, struct sockaddr **name,
375 socklen_t *namelen, struct file **fp)
376 {
377 return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
378 }
379
/*
 * Take one completed connection off listening socket `s' and install it in
 * a new descriptor.
 *
 * td		calling thread; on the success path td_retval[0] is set to
 *		the new descriptor
 * s		listening socket descriptor (needs CAP_ACCEPT)
 * name		if not NULL, *name receives the peer address, or NULL when
 *		the protocol supplied none; accept1() releases it with
 *		free(..., M_SONAME)
 * namelen	in: caller's buffer size, out: clipped to the address length,
 *		or 0 when there is no address; only touched when name is set
 * flags	ACCEPT4_INHERIT and/or SOCK_CLOEXEC, SOCK_NONBLOCK
 * fp		if not NULL, *fp receives a referenced file for the new
 *		socket on success and NULL on failure
 *
 * Returns 0 or an errno: EINVAL if `s' is not listening, EWOULDBLOCK if it
 * is non-blocking with nothing queued, the msleep() error if the wait is
 * interrupted, or a pending socket error.
 */
int
kern_accept4(struct thread *td, int s, struct sockaddr **name,
    socklen_t *namelen, int flags, struct file **fp)
{
	struct file *headfp, *nfp = NULL;
	struct sockaddr *sa = NULL;
	struct socket *head, *so;
	cap_rights_t rights;
	u_int fflag;
	pid_t pgid;
	int error, fd, tmp;

	if (name != NULL)
		*name = NULL;

	AUDIT_ARG_FD(s);
	error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT),
	    &headfp, &fflag);
	if (error != 0)
		return (error);
	head = headfp->f_data;
	if ((head->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto done;
	}
#ifdef MAC
	error = mac_socket_check_accept(td->td_ucred, head);
	if (error != 0)
		goto done;
#endif
	/* Reserve the descriptor up front, before waiting for a connection. */
	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
	if (error != 0)
		goto done;
	ACCEPT_LOCK();
	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
		ACCEPT_UNLOCK();
		error = EWOULDBLOCK;
		goto noconnection;
	}
	/* Sleep until a connection completes or an error is posted. */
	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
			head->so_error = ECONNABORTED;
			break;
		}
		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
		    "accept", 0);
		if (error != 0) {
			ACCEPT_UNLOCK();
			goto noconnection;
		}
	}
	/* A pending error is reported once and then cleared. */
	if (head->so_error) {
		error = head->so_error;
		head->so_error = 0;
		ACCEPT_UNLOCK();
		goto noconnection;
	}
	so = TAILQ_FIRST(&head->so_comp);
	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));

	/*
	 * Before changing the flags on the socket, we have to bump the
	 * reference count.  Otherwise, if the protocol calls sofree(),
	 * the socket will be released due to a zero refcount.
	 */
	SOCK_LOCK(so);			/* soref() and so_state update */
	soref(so);			/* file descriptor reference */

	TAILQ_REMOVE(&head->so_comp, so, so_list);
	head->so_qlen--;
	/* Non-blocking state comes from the listener or from the flags. */
	if (flags & ACCEPT4_INHERIT)
		so->so_state |= (head->so_state & SS_NBIO);
	else
		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
	so->so_qstate &= ~SQ_COMP;
	so->so_head = NULL;

	SOCK_UNLOCK(so);
	ACCEPT_UNLOCK();

	/* An extra reference on `nfp' has been held for us by falloc(). */
	td->td_retval[0] = fd;

	/* connection has been removed from the listen queue */
	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);

	if (flags & ACCEPT4_INHERIT) {
		/* Carry the listener's SIGIO owner over to the new socket. */
		pgid = fgetown(&head->so_sigio);
		if (pgid != 0)
			fsetown(pgid, &so->so_sigio);
	} else {
		/* Start from the listener's file flags minus these two. */
		fflag &= ~(FNONBLOCK | FASYNC);
		if (flags & SOCK_NONBLOCK)
			fflag |= FNONBLOCK;
	}

	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
	/* Sync socket nonblocking/async state with file flags */
	tmp = fflag & FNONBLOCK;
	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
	tmp = fflag & FASYNC;
	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
	sa = 0;
	error = soaccept(so, &sa);
	if (error != 0)
		goto noconnection;
	if (sa == NULL) {
		/* No peer address available: report a zero length. */
		if (name)
			*namelen = 0;
		goto done;
	}
	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
	if (name) {
		/* check sa_len before it is destroyed */
		if (*namelen > sa->sa_len)
			*namelen = sa->sa_len;
#ifdef KTRACE
		if (KTRPOINT(td, KTR_STRUCT))
			ktrsockaddr(sa);
#endif
		/* Ownership of the address moves to the caller. */
		*name = sa;
		sa = NULL;
	}
noconnection:
	free(sa, M_SONAME);

	/*
	 * close the new descriptor, assuming someone hasn't ripped it
	 * out from under us.
	 */
	if (error != 0)
		fdclose(td, nfp, fd);

	/*
	 * Release explicitly held references before returning.  We return
	 * a reference on nfp to the caller on success if they request it.
	 */
done:
	if (fp != NULL) {
		if (error == 0) {
			*fp = nfp;
			nfp = NULL;
		} else
			*fp = NULL;
	}
	if (nfp != NULL)
		fdrop(nfp, td);
	fdrop(headfp, td);
	return (error);
}
531
532 int
sys_accept(td,uap)533 sys_accept(td, uap)
534 struct thread *td;
535 struct accept_args *uap;
536 {
537
538 return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
539 }
540
541 int
sys_accept4(td,uap)542 sys_accept4(td, uap)
543 struct thread *td;
544 struct accept4_args *uap;
545 {
546
547 if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
548 return (EINVAL);
549
550 return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
551 }
552
553 #ifdef COMPAT_OLDSOCK
554 int
oaccept(td,uap)555 oaccept(td, uap)
556 struct thread *td;
557 struct accept_args *uap;
558 {
559
560 return (accept1(td, uap->s, uap->name, uap->anamelen,
561 ACCEPT4_INHERIT | ACCEPT4_COMPAT));
562 }
563 #endif /* COMPAT_OLDSOCK */
564
565 /* ARGSUSED */
566 int
sys_connect(td,uap)567 sys_connect(td, uap)
568 struct thread *td;
569 struct connect_args /* {
570 int s;
571 caddr_t name;
572 int namelen;
573 } */ *uap;
574 {
575 struct sockaddr *sa;
576 int error;
577
578 error = getsockaddr(&sa, uap->name, uap->namelen);
579 if (error == 0) {
580 error = kern_connectat(td, AT_FDCWD, uap->s, sa);
581 free(sa, M_SONAME);
582 }
583 return (error);
584 }
585
/*
 * Connect the socket behind descriptor `fd' to address `sa'.
 *
 * `dirfd' is either AT_FDCWD, in which case soconnect() is used, or another
 * value that is handed to soconnectat().  The descriptor must carry the
 * CAP_CONNECT right.  `sa' stays owned by the caller.
 *
 * Returns 0 or an errno.  EALREADY is returned if a connect is already in
 * progress, EINPROGRESS if the socket is non-blocking and the connection
 * has not completed yet.  Otherwise the call sleeps until the connection
 * attempt finishes; a sleep broken by a signal reports EINTR (ERESTART is
 * mapped to EINTR) and leaves SS_ISCONNECTING set.
 */
int
kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
{
	struct socket *so;
	struct file *fp;
	cap_rights_t rights;
	int error, interrupted = 0;

	AUDIT_ARG_FD(fd);
	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
	error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT),
	    &fp, NULL);
	if (error != 0)
		return (error);
	so = fp->f_data;
	if (so->so_state & SS_ISCONNECTING) {
		error = EALREADY;
		goto done1;
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_STRUCT))
		ktrsockaddr(sa);
#endif
#ifdef MAC
	error = mac_socket_check_connect(td->td_ucred, so, sa);
	if (error != 0)
		goto bad;
#endif
	if (dirfd == AT_FDCWD)
		error = soconnect(so, sa, td);
	else
		error = soconnectat(dirfd, so, sa, td);
	if (error != 0)
		goto bad;
	/* Non-blocking socket: report progress without clearing the state. */
	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
		error = EINPROGRESS;
		goto done1;
	}
	SOCK_LOCK(so);
	/* Wait for the attempt to finish or for an error to be posted. */
	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
		    "connec", 0);
		if (error != 0) {
			if (error == EINTR || error == ERESTART)
				interrupted = 1;
			break;
		}
	}
	/* The sleep ended normally: collect and clear the socket error. */
	if (error == 0) {
		error = so->so_error;
		so->so_error = 0;
	}
	SOCK_UNLOCK(so);
bad:
	/* An interrupted connect keeps SS_ISCONNECTING set. */
	if (!interrupted)
		so->so_state &= ~SS_ISCONNECTING;
	if (error == ERESTART)
		error = EINTR;
done1:
	fdrop(fp, td);
	return (error);
}
648
649 /* ARGSUSED */
650 int
sys_connectat(td,uap)651 sys_connectat(td, uap)
652 struct thread *td;
653 struct connectat_args /* {
654 int fd;
655 int s;
656 caddr_t name;
657 int namelen;
658 } */ *uap;
659 {
660 struct sockaddr *sa;
661 int error;
662
663 error = getsockaddr(&sa, uap->name, uap->namelen);
664 if (error == 0) {
665 error = kern_connectat(td, uap->fd, uap->s, sa);
666 free(sa, M_SONAME);
667 }
668 return (error);
669 }
670
/*
 * Create a pair of connected sockets and install them in two new
 * descriptors.
 *
 * td		calling thread
 * domain, type, protocol
 *		as for socket(2); SOCK_CLOEXEC and SOCK_NONBLOCK may be
 *		or'ed into `type' and are stripped before socket creation
 * rsv		kernel array of two ints that receives the descriptors
 *
 * Returns 0 or an errno.  On failure everything created so far is torn
 * down through the free4..free1 labels, in reverse order of creation.
 */
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
    int *rsv)
{
	struct file *fp1, *fp2;
	struct socket *so1, *so2;
	int fd, error, oflag, fflag;

	AUDIT_ARG_SOCKET(domain, type, protocol);

	oflag = 0;
	fflag = 0;
	if ((type & SOCK_CLOEXEC) != 0) {
		type &= ~SOCK_CLOEXEC;
		oflag |= O_CLOEXEC;
	}
	if ((type & SOCK_NONBLOCK) != 0) {
		type &= ~SOCK_NONBLOCK;
		fflag |= FNONBLOCK;
	}
#ifdef MAC
	/* We might want to have a separate check for socket pairs. */
	error = mac_socket_check_create(td->td_ucred, domain, type,
	    protocol);
	if (error != 0)
		return (error);
#endif
	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
	if (error != 0)
		return (error);
	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
	if (error != 0)
		goto free1;
	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
	error = falloc(td, &fp1, &fd, oflag);
	if (error != 0)
		goto free2;
	rsv[0] = fd;
	fp1->f_data = so1;	/* so1 already has ref count */
	error = falloc(td, &fp2, &fd, oflag);
	if (error != 0)
		goto free3;
	fp2->f_data = so2;	/* so2 already has ref count */
	rsv[1] = fd;
	error = soconnect2(so1, so2);
	if (error != 0)
		goto free4;
	if (type == SOCK_DGRAM) {
		/*
		 * Datagram socket connection is asymmetric.
		 */
		 error = soconnect2(so2, so1);
		 if (error != 0)
			goto free4;
	}
	/* Both ends are connected: turn the files into real socket files. */
	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
	    &socketops);
	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
	    &socketops);
	if ((fflag & FNONBLOCK) != 0) {
		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
	}
	fdrop(fp1, td);
	fdrop(fp2, td);
	return (0);
	/* Error unwinding: each label undoes one more step than the next. */
free4:
	fdclose(td, fp2, rsv[1]);
	fdrop(fp2, td);
free3:
	fdclose(td, fp1, rsv[0]);
	fdrop(fp1, td);
free2:
	if (so2 != NULL)
		(void)soclose(so2);
free1:
	if (so1 != NULL)
		(void)soclose(so1);
	return (error);
}
751
752 int
sys_socketpair(struct thread * td,struct socketpair_args * uap)753 sys_socketpair(struct thread *td, struct socketpair_args *uap)
754 {
755 int error, sv[2];
756
757 error = kern_socketpair(td, uap->domain, uap->type,
758 uap->protocol, sv);
759 if (error != 0)
760 return (error);
761 error = copyout(sv, uap->rsv, 2 * sizeof(int));
762 if (error != 0) {
763 (void)kern_close(td, sv[0]);
764 (void)kern_close(td, sv[1]);
765 }
766 return (error);
767 }
768
769 static int
sendit(td,s,mp,flags)770 sendit(td, s, mp, flags)
771 struct thread *td;
772 int s;
773 struct msghdr *mp;
774 int flags;
775 {
776 struct mbuf *control;
777 struct sockaddr *to;
778 int error;
779
780 #ifdef CAPABILITY_MODE
781 if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
782 return (ECAPMODE);
783 #endif
784
785 if (mp->msg_name != NULL) {
786 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
787 if (error != 0) {
788 to = NULL;
789 goto bad;
790 }
791 mp->msg_name = to;
792 } else {
793 to = NULL;
794 }
795
796 if (mp->msg_control) {
797 if (mp->msg_controllen < sizeof(struct cmsghdr)
798 #ifdef COMPAT_OLDSOCK
799 && mp->msg_flags != MSG_COMPAT
800 #endif
801 ) {
802 error = EINVAL;
803 goto bad;
804 }
805 error = sockargs(&control, mp->msg_control,
806 mp->msg_controllen, MT_CONTROL);
807 if (error != 0)
808 goto bad;
809 #ifdef COMPAT_OLDSOCK
810 if (mp->msg_flags == MSG_COMPAT) {
811 struct cmsghdr *cm;
812
813 M_PREPEND(control, sizeof(*cm), M_WAITOK);
814 cm = mtod(control, struct cmsghdr *);
815 cm->cmsg_len = control->m_len;
816 cm->cmsg_level = SOL_SOCKET;
817 cm->cmsg_type = SCM_RIGHTS;
818 }
819 #endif
820 } else {
821 control = NULL;
822 }
823
824 error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
825
826 bad:
827 free(to, M_SONAME);
828 return (error);
829 }
830
831 int
kern_sendit(td,s,mp,flags,control,segflg)832 kern_sendit(td, s, mp, flags, control, segflg)
833 struct thread *td;
834 int s;
835 struct msghdr *mp;
836 int flags;
837 struct mbuf *control;
838 enum uio_seg segflg;
839 {
840 struct file *fp;
841 struct uio auio;
842 struct iovec *iov;
843 struct socket *so;
844 cap_rights_t rights;
845 #ifdef KTRACE
846 struct uio *ktruio = NULL;
847 #endif
848 ssize_t len;
849 int i, error;
850
851 AUDIT_ARG_FD(s);
852 cap_rights_init(&rights, CAP_SEND);
853 if (mp->msg_name != NULL) {
854 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
855 cap_rights_set(&rights, CAP_CONNECT);
856 }
857 error = getsock_cap(td, s, &rights, &fp, NULL);
858 if (error != 0)
859 return (error);
860 so = (struct socket *)fp->f_data;
861
862 #ifdef KTRACE
863 if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
864 ktrsockaddr(mp->msg_name);
865 #endif
866 #ifdef MAC
867 if (mp->msg_name != NULL) {
868 error = mac_socket_check_connect(td->td_ucred, so,
869 mp->msg_name);
870 if (error != 0)
871 goto bad;
872 }
873 error = mac_socket_check_send(td->td_ucred, so);
874 if (error != 0)
875 goto bad;
876 #endif
877
878 auio.uio_iov = mp->msg_iov;
879 auio.uio_iovcnt = mp->msg_iovlen;
880 auio.uio_segflg = segflg;
881 auio.uio_rw = UIO_WRITE;
882 auio.uio_td = td;
883 auio.uio_offset = 0; /* XXX */
884 auio.uio_resid = 0;
885 iov = mp->msg_iov;
886 for (i = 0; i < mp->msg_iovlen; i++, iov++) {
887 if ((auio.uio_resid += iov->iov_len) < 0) {
888 error = EINVAL;
889 goto bad;
890 }
891 }
892 #ifdef KTRACE
893 if (KTRPOINT(td, KTR_GENIO))
894 ktruio = cloneuio(&auio);
895 #endif
896 len = auio.uio_resid;
897 error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
898 if (error != 0) {
899 if (auio.uio_resid != len && (error == ERESTART ||
900 error == EINTR || error == EWOULDBLOCK))
901 error = 0;
902 /* Generation of SIGPIPE can be controlled per socket */
903 if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
904 !(flags & MSG_NOSIGNAL)) {
905 PROC_LOCK(td->td_proc);
906 tdsignal(td, SIGPIPE);
907 PROC_UNLOCK(td->td_proc);
908 }
909 }
910 if (error == 0)
911 td->td_retval[0] = len - auio.uio_resid;
912 #ifdef KTRACE
913 if (ktruio != NULL) {
914 ktruio->uio_resid = td->td_retval[0];
915 ktrgenio(s, UIO_WRITE, ktruio, error);
916 }
917 #endif
918 bad:
919 fdrop(fp, td);
920 return (error);
921 }
922
923 int
sys_sendto(td,uap)924 sys_sendto(td, uap)
925 struct thread *td;
926 struct sendto_args /* {
927 int s;
928 caddr_t buf;
929 size_t len;
930 int flags;
931 caddr_t to;
932 int tolen;
933 } */ *uap;
934 {
935 struct msghdr msg;
936 struct iovec aiov;
937
938 msg.msg_name = uap->to;
939 msg.msg_namelen = uap->tolen;
940 msg.msg_iov = &aiov;
941 msg.msg_iovlen = 1;
942 msg.msg_control = 0;
943 #ifdef COMPAT_OLDSOCK
944 msg.msg_flags = 0;
945 #endif
946 aiov.iov_base = uap->buf;
947 aiov.iov_len = uap->len;
948 return (sendit(td, uap->s, &msg, uap->flags));
949 }
950
951 #ifdef COMPAT_OLDSOCK
952 int
osend(td,uap)953 osend(td, uap)
954 struct thread *td;
955 struct osend_args /* {
956 int s;
957 caddr_t buf;
958 int len;
959 int flags;
960 } */ *uap;
961 {
962 struct msghdr msg;
963 struct iovec aiov;
964
965 msg.msg_name = 0;
966 msg.msg_namelen = 0;
967 msg.msg_iov = &aiov;
968 msg.msg_iovlen = 1;
969 aiov.iov_base = uap->buf;
970 aiov.iov_len = uap->len;
971 msg.msg_control = 0;
972 msg.msg_flags = 0;
973 return (sendit(td, uap->s, &msg, uap->flags));
974 }
975
976 int
osendmsg(td,uap)977 osendmsg(td, uap)
978 struct thread *td;
979 struct osendmsg_args /* {
980 int s;
981 caddr_t msg;
982 int flags;
983 } */ *uap;
984 {
985 struct msghdr msg;
986 struct iovec *iov;
987 int error;
988
989 error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
990 if (error != 0)
991 return (error);
992 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
993 if (error != 0)
994 return (error);
995 msg.msg_iov = iov;
996 msg.msg_flags = MSG_COMPAT;
997 error = sendit(td, uap->s, &msg, uap->flags);
998 free(iov, M_IOV);
999 return (error);
1000 }
1001 #endif
1002
1003 int
sys_sendmsg(td,uap)1004 sys_sendmsg(td, uap)
1005 struct thread *td;
1006 struct sendmsg_args /* {
1007 int s;
1008 caddr_t msg;
1009 int flags;
1010 } */ *uap;
1011 {
1012 struct msghdr msg;
1013 struct iovec *iov;
1014 int error;
1015
1016 error = copyin(uap->msg, &msg, sizeof (msg));
1017 if (error != 0)
1018 return (error);
1019 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1020 if (error != 0)
1021 return (error);
1022 msg.msg_iov = iov;
1023 #ifdef COMPAT_OLDSOCK
1024 msg.msg_flags = 0;
1025 #endif
1026 error = sendit(td, uap->s, &msg, uap->flags);
1027 free(iov, M_IOV);
1028 return (error);
1029 }
1030
1031 int
kern_recvit(td,s,mp,fromseg,controlp)1032 kern_recvit(td, s, mp, fromseg, controlp)
1033 struct thread *td;
1034 int s;
1035 struct msghdr *mp;
1036 enum uio_seg fromseg;
1037 struct mbuf **controlp;
1038 {
1039 struct uio auio;
1040 struct iovec *iov;
1041 struct mbuf *m, *control = NULL;
1042 caddr_t ctlbuf;
1043 struct file *fp;
1044 struct socket *so;
1045 struct sockaddr *fromsa = NULL;
1046 cap_rights_t rights;
1047 #ifdef KTRACE
1048 struct uio *ktruio = NULL;
1049 #endif
1050 ssize_t len;
1051 int error, i;
1052
1053 if (controlp != NULL)
1054 *controlp = NULL;
1055
1056 AUDIT_ARG_FD(s);
1057 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV),
1058 &fp, NULL);
1059 if (error != 0)
1060 return (error);
1061 so = fp->f_data;
1062
1063 #ifdef MAC
1064 error = mac_socket_check_receive(td->td_ucred, so);
1065 if (error != 0) {
1066 fdrop(fp, td);
1067 return (error);
1068 }
1069 #endif
1070
1071 auio.uio_iov = mp->msg_iov;
1072 auio.uio_iovcnt = mp->msg_iovlen;
1073 auio.uio_segflg = UIO_USERSPACE;
1074 auio.uio_rw = UIO_READ;
1075 auio.uio_td = td;
1076 auio.uio_offset = 0; /* XXX */
1077 auio.uio_resid = 0;
1078 iov = mp->msg_iov;
1079 for (i = 0; i < mp->msg_iovlen; i++, iov++) {
1080 if ((auio.uio_resid += iov->iov_len) < 0) {
1081 fdrop(fp, td);
1082 return (EINVAL);
1083 }
1084 }
1085 #ifdef KTRACE
1086 if (KTRPOINT(td, KTR_GENIO))
1087 ktruio = cloneuio(&auio);
1088 #endif
1089 len = auio.uio_resid;
1090 error = soreceive(so, &fromsa, &auio, NULL,
1091 (mp->msg_control || controlp) ? &control : NULL,
1092 &mp->msg_flags);
1093 if (error != 0) {
1094 if (auio.uio_resid != len && (error == ERESTART ||
1095 error == EINTR || error == EWOULDBLOCK))
1096 error = 0;
1097 }
1098 if (fromsa != NULL)
1099 AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
1100 #ifdef KTRACE
1101 if (ktruio != NULL) {
1102 ktruio->uio_resid = len - auio.uio_resid;
1103 ktrgenio(s, UIO_READ, ktruio, error);
1104 }
1105 #endif
1106 if (error != 0)
1107 goto out;
1108 td->td_retval[0] = len - auio.uio_resid;
1109 if (mp->msg_name) {
1110 len = mp->msg_namelen;
1111 if (len <= 0 || fromsa == NULL)
1112 len = 0;
1113 else {
1114 /* save sa_len before it is destroyed by MSG_COMPAT */
1115 len = MIN(len, fromsa->sa_len);
1116 #ifdef COMPAT_OLDSOCK
1117 if (mp->msg_flags & MSG_COMPAT)
1118 ((struct osockaddr *)fromsa)->sa_family =
1119 fromsa->sa_family;
1120 #endif
1121 if (fromseg == UIO_USERSPACE) {
1122 error = copyout(fromsa, mp->msg_name,
1123 (unsigned)len);
1124 if (error != 0)
1125 goto out;
1126 } else
1127 bcopy(fromsa, mp->msg_name, len);
1128 }
1129 mp->msg_namelen = len;
1130 }
1131 if (mp->msg_control && controlp == NULL) {
1132 #ifdef COMPAT_OLDSOCK
1133 /*
1134 * We assume that old recvmsg calls won't receive access
1135 * rights and other control info, esp. as control info
1136 * is always optional and those options didn't exist in 4.3.
1137 * If we receive rights, trim the cmsghdr; anything else
1138 * is tossed.
1139 */
1140 if (control && mp->msg_flags & MSG_COMPAT) {
1141 if (mtod(control, struct cmsghdr *)->cmsg_level !=
1142 SOL_SOCKET ||
1143 mtod(control, struct cmsghdr *)->cmsg_type !=
1144 SCM_RIGHTS) {
1145 mp->msg_controllen = 0;
1146 goto out;
1147 }
1148 control->m_len -= sizeof (struct cmsghdr);
1149 control->m_data += sizeof (struct cmsghdr);
1150 }
1151 #endif
1152 len = mp->msg_controllen;
1153 m = control;
1154 mp->msg_controllen = 0;
1155 ctlbuf = mp->msg_control;
1156
1157 while (m && len > 0) {
1158 unsigned int tocopy;
1159
1160 if (len >= m->m_len)
1161 tocopy = m->m_len;
1162 else {
1163 mp->msg_flags |= MSG_CTRUNC;
1164 tocopy = len;
1165 }
1166
1167 if ((error = copyout(mtod(m, caddr_t),
1168 ctlbuf, tocopy)) != 0)
1169 goto out;
1170
1171 ctlbuf += tocopy;
1172 len -= tocopy;
1173 m = m->m_next;
1174 }
1175 mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
1176 }
1177 out:
1178 fdrop(fp, td);
1179 #ifdef KTRACE
1180 if (fromsa && KTRPOINT(td, KTR_STRUCT))
1181 ktrsockaddr(fromsa);
1182 #endif
1183 free(fromsa, M_SONAME);
1184
1185 if (error == 0 && controlp != NULL)
1186 *controlp = control;
1187 else if (control)
1188 m_freem(control);
1189
1190 return (error);
1191 }
1192
1193 static int
recvit(td,s,mp,namelenp)1194 recvit(td, s, mp, namelenp)
1195 struct thread *td;
1196 int s;
1197 struct msghdr *mp;
1198 void *namelenp;
1199 {
1200 int error;
1201
1202 error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
1203 if (error != 0)
1204 return (error);
1205 if (namelenp != NULL) {
1206 error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
1207 #ifdef COMPAT_OLDSOCK
1208 if (mp->msg_flags & MSG_COMPAT)
1209 error = 0; /* old recvfrom didn't check */
1210 #endif
1211 }
1212 return (error);
1213 }
1214
1215 int
sys_recvfrom(td,uap)1216 sys_recvfrom(td, uap)
1217 struct thread *td;
1218 struct recvfrom_args /* {
1219 int s;
1220 caddr_t buf;
1221 size_t len;
1222 int flags;
1223 struct sockaddr * __restrict from;
1224 socklen_t * __restrict fromlenaddr;
1225 } */ *uap;
1226 {
1227 struct msghdr msg;
1228 struct iovec aiov;
1229 int error;
1230
1231 if (uap->fromlenaddr) {
1232 error = copyin(uap->fromlenaddr,
1233 &msg.msg_namelen, sizeof (msg.msg_namelen));
1234 if (error != 0)
1235 goto done2;
1236 } else {
1237 msg.msg_namelen = 0;
1238 }
1239 msg.msg_name = uap->from;
1240 msg.msg_iov = &aiov;
1241 msg.msg_iovlen = 1;
1242 aiov.iov_base = uap->buf;
1243 aiov.iov_len = uap->len;
1244 msg.msg_control = 0;
1245 msg.msg_flags = uap->flags;
1246 error = recvit(td, uap->s, &msg, uap->fromlenaddr);
1247 done2:
1248 return (error);
1249 }
1250
#ifdef COMPAT_OLDSOCK
/*
 * Old recvfrom entry point: tag the request with MSG_COMPAT and let
 * sys_recvfrom() do the work.
 */
int
orecvfrom(struct thread *td, struct recvfrom_args *uap)
{

	uap->flags = uap->flags | MSG_COMPAT;
	return (sys_recvfrom(td, uap));
}
#endif
1262
1263 #ifdef COMPAT_OLDSOCK
1264 int
orecv(td,uap)1265 orecv(td, uap)
1266 struct thread *td;
1267 struct orecv_args /* {
1268 int s;
1269 caddr_t buf;
1270 int len;
1271 int flags;
1272 } */ *uap;
1273 {
1274 struct msghdr msg;
1275 struct iovec aiov;
1276
1277 msg.msg_name = 0;
1278 msg.msg_namelen = 0;
1279 msg.msg_iov = &aiov;
1280 msg.msg_iovlen = 1;
1281 aiov.iov_base = uap->buf;
1282 aiov.iov_len = uap->len;
1283 msg.msg_control = 0;
1284 msg.msg_flags = uap->flags;
1285 return (recvit(td, uap->s, &msg, NULL));
1286 }
1287
1288 /*
1289 * Old recvmsg. This code takes advantage of the fact that the old msghdr
1290 * overlays the new one, missing only the flags, and with the (old) access
1291 * rights where the control fields are now.
1292 */
1293 int
orecvmsg(td,uap)1294 orecvmsg(td, uap)
1295 struct thread *td;
1296 struct orecvmsg_args /* {
1297 int s;
1298 struct omsghdr *msg;
1299 int flags;
1300 } */ *uap;
1301 {
1302 struct msghdr msg;
1303 struct iovec *iov;
1304 int error;
1305
1306 error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
1307 if (error != 0)
1308 return (error);
1309 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1310 if (error != 0)
1311 return (error);
1312 msg.msg_flags = uap->flags | MSG_COMPAT;
1313 msg.msg_iov = iov;
1314 error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
1315 if (msg.msg_controllen && error == 0)
1316 error = copyout(&msg.msg_controllen,
1317 &uap->msg->msg_accrightslen, sizeof (int));
1318 free(iov, M_IOV);
1319 return (error);
1320 }
1321 #endif
1322
1323 int
sys_recvmsg(td,uap)1324 sys_recvmsg(td, uap)
1325 struct thread *td;
1326 struct recvmsg_args /* {
1327 int s;
1328 struct msghdr *msg;
1329 int flags;
1330 } */ *uap;
1331 {
1332 struct msghdr msg;
1333 struct iovec *uiov, *iov;
1334 int error;
1335
1336 error = copyin(uap->msg, &msg, sizeof (msg));
1337 if (error != 0)
1338 return (error);
1339 error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
1340 if (error != 0)
1341 return (error);
1342 msg.msg_flags = uap->flags;
1343 #ifdef COMPAT_OLDSOCK
1344 msg.msg_flags &= ~MSG_COMPAT;
1345 #endif
1346 uiov = msg.msg_iov;
1347 msg.msg_iov = iov;
1348 error = recvit(td, uap->s, &msg, NULL);
1349 if (error == 0) {
1350 msg.msg_iov = uiov;
1351 error = copyout(&msg, uap->msg, sizeof(msg));
1352 }
1353 free(iov, M_IOV);
1354 return (error);
1355 }
1356
1357 /* ARGSUSED */
1358 int
sys_shutdown(td,uap)1359 sys_shutdown(td, uap)
1360 struct thread *td;
1361 struct shutdown_args /* {
1362 int s;
1363 int how;
1364 } */ *uap;
1365 {
1366 struct socket *so;
1367 struct file *fp;
1368 cap_rights_t rights;
1369 int error;
1370
1371 AUDIT_ARG_FD(uap->s);
1372 error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN),
1373 &fp, NULL);
1374 if (error == 0) {
1375 so = fp->f_data;
1376 error = soshutdown(so, uap->how);
1377 /*
1378 * Previous versions did not return ENOTCONN, but 0 in
1379 * case the socket was not connected. Some important
1380 * programs like syslogd up to r279016, 2015-02-19,
1381 * still depend on this behavior.
1382 */
1383 if (error == ENOTCONN &&
1384 td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN)
1385 error = 0;
1386 fdrop(fp, td);
1387 }
1388 return (error);
1389 }
1390
1391 /* ARGSUSED */
1392 int
sys_setsockopt(td,uap)1393 sys_setsockopt(td, uap)
1394 struct thread *td;
1395 struct setsockopt_args /* {
1396 int s;
1397 int level;
1398 int name;
1399 caddr_t val;
1400 int valsize;
1401 } */ *uap;
1402 {
1403
1404 return (kern_setsockopt(td, uap->s, uap->level, uap->name,
1405 uap->val, UIO_USERSPACE, uap->valsize));
1406 }
1407
1408 int
kern_setsockopt(td,s,level,name,val,valseg,valsize)1409 kern_setsockopt(td, s, level, name, val, valseg, valsize)
1410 struct thread *td;
1411 int s;
1412 int level;
1413 int name;
1414 void *val;
1415 enum uio_seg valseg;
1416 socklen_t valsize;
1417 {
1418 struct socket *so;
1419 struct file *fp;
1420 struct sockopt sopt;
1421 cap_rights_t rights;
1422 int error;
1423
1424 if (val == NULL && valsize != 0)
1425 return (EFAULT);
1426 if ((int)valsize < 0)
1427 return (EINVAL);
1428
1429 sopt.sopt_dir = SOPT_SET;
1430 sopt.sopt_level = level;
1431 sopt.sopt_name = name;
1432 sopt.sopt_val = val;
1433 sopt.sopt_valsize = valsize;
1434 switch (valseg) {
1435 case UIO_USERSPACE:
1436 sopt.sopt_td = td;
1437 break;
1438 case UIO_SYSSPACE:
1439 sopt.sopt_td = NULL;
1440 break;
1441 default:
1442 panic("kern_setsockopt called with bad valseg");
1443 }
1444
1445 AUDIT_ARG_FD(s);
1446 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT),
1447 &fp, NULL);
1448 if (error == 0) {
1449 so = fp->f_data;
1450 error = sosetopt(so, &sopt);
1451 fdrop(fp, td);
1452 }
1453 return(error);
1454 }
1455
1456 /* ARGSUSED */
1457 int
sys_getsockopt(td,uap)1458 sys_getsockopt(td, uap)
1459 struct thread *td;
1460 struct getsockopt_args /* {
1461 int s;
1462 int level;
1463 int name;
1464 void * __restrict val;
1465 socklen_t * __restrict avalsize;
1466 } */ *uap;
1467 {
1468 socklen_t valsize;
1469 int error;
1470
1471 if (uap->val) {
1472 error = copyin(uap->avalsize, &valsize, sizeof (valsize));
1473 if (error != 0)
1474 return (error);
1475 }
1476
1477 error = kern_getsockopt(td, uap->s, uap->level, uap->name,
1478 uap->val, UIO_USERSPACE, &valsize);
1479
1480 if (error == 0)
1481 error = copyout(&valsize, uap->avalsize, sizeof (valsize));
1482 return (error);
1483 }
1484
1485 /*
1486 * Kernel version of getsockopt.
1487 * optval can be a userland or userspace. optlen is always a kernel pointer.
1488 */
1489 int
kern_getsockopt(td,s,level,name,val,valseg,valsize)1490 kern_getsockopt(td, s, level, name, val, valseg, valsize)
1491 struct thread *td;
1492 int s;
1493 int level;
1494 int name;
1495 void *val;
1496 enum uio_seg valseg;
1497 socklen_t *valsize;
1498 {
1499 struct socket *so;
1500 struct file *fp;
1501 struct sockopt sopt;
1502 cap_rights_t rights;
1503 int error;
1504
1505 if (val == NULL)
1506 *valsize = 0;
1507 if ((int)*valsize < 0)
1508 return (EINVAL);
1509
1510 sopt.sopt_dir = SOPT_GET;
1511 sopt.sopt_level = level;
1512 sopt.sopt_name = name;
1513 sopt.sopt_val = val;
1514 sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
1515 switch (valseg) {
1516 case UIO_USERSPACE:
1517 sopt.sopt_td = td;
1518 break;
1519 case UIO_SYSSPACE:
1520 sopt.sopt_td = NULL;
1521 break;
1522 default:
1523 panic("kern_getsockopt called with bad valseg");
1524 }
1525
1526 AUDIT_ARG_FD(s);
1527 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT),
1528 &fp, NULL);
1529 if (error == 0) {
1530 so = fp->f_data;
1531 error = sogetopt(so, &sopt);
1532 *valsize = sopt.sopt_valsize;
1533 fdrop(fp, td);
1534 }
1535 return (error);
1536 }
1537
1538 /*
1539 * getsockname1() - Get socket name.
1540 */
1541 /* ARGSUSED */
1542 static int
getsockname1(td,uap,compat)1543 getsockname1(td, uap, compat)
1544 struct thread *td;
1545 struct getsockname_args /* {
1546 int fdes;
1547 struct sockaddr * __restrict asa;
1548 socklen_t * __restrict alen;
1549 } */ *uap;
1550 int compat;
1551 {
1552 struct sockaddr *sa;
1553 socklen_t len;
1554 int error;
1555
1556 error = copyin(uap->alen, &len, sizeof(len));
1557 if (error != 0)
1558 return (error);
1559
1560 error = kern_getsockname(td, uap->fdes, &sa, &len);
1561 if (error != 0)
1562 return (error);
1563
1564 if (len != 0) {
1565 #ifdef COMPAT_OLDSOCK
1566 if (compat)
1567 ((struct osockaddr *)sa)->sa_family = sa->sa_family;
1568 #endif
1569 error = copyout(sa, uap->asa, (u_int)len);
1570 }
1571 free(sa, M_SONAME);
1572 if (error == 0)
1573 error = copyout(&len, uap->alen, sizeof(len));
1574 return (error);
1575 }
1576
1577 int
kern_getsockname(struct thread * td,int fd,struct sockaddr ** sa,socklen_t * alen)1578 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
1579 socklen_t *alen)
1580 {
1581 struct socket *so;
1582 struct file *fp;
1583 cap_rights_t rights;
1584 socklen_t len;
1585 int error;
1586
1587 AUDIT_ARG_FD(fd);
1588 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME),
1589 &fp, NULL);
1590 if (error != 0)
1591 return (error);
1592 so = fp->f_data;
1593 *sa = NULL;
1594 CURVNET_SET(so->so_vnet);
1595 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
1596 CURVNET_RESTORE();
1597 if (error != 0)
1598 goto bad;
1599 if (*sa == NULL)
1600 len = 0;
1601 else
1602 len = MIN(*alen, (*sa)->sa_len);
1603 *alen = len;
1604 #ifdef KTRACE
1605 if (KTRPOINT(td, KTR_STRUCT))
1606 ktrsockaddr(*sa);
1607 #endif
1608 bad:
1609 fdrop(fp, td);
1610 if (error != 0 && *sa != NULL) {
1611 free(*sa, M_SONAME);
1612 *sa = NULL;
1613 }
1614 return (error);
1615 }
1616
/*
 * getsockname(2): native entry point, no old-sockaddr translation.
 */
int
sys_getsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 0);
	return (error);
}
1625
#ifdef COMPAT_OLDSOCK
/*
 * Old getsockname: same as sys_getsockname() but asks getsockname1() for
 * the old sockaddr layout.
 */
int
ogetsockname(struct thread *td, struct getsockname_args *uap)
{
	int error;

	error = getsockname1(td, uap, 1);
	return (error);
}
#endif /* COMPAT_OLDSOCK */
1636
1637 /*
1638 * getpeername1() - Get name of peer for connected socket.
1639 */
1640 /* ARGSUSED */
1641 static int
getpeername1(td,uap,compat)1642 getpeername1(td, uap, compat)
1643 struct thread *td;
1644 struct getpeername_args /* {
1645 int fdes;
1646 struct sockaddr * __restrict asa;
1647 socklen_t * __restrict alen;
1648 } */ *uap;
1649 int compat;
1650 {
1651 struct sockaddr *sa;
1652 socklen_t len;
1653 int error;
1654
1655 error = copyin(uap->alen, &len, sizeof (len));
1656 if (error != 0)
1657 return (error);
1658
1659 error = kern_getpeername(td, uap->fdes, &sa, &len);
1660 if (error != 0)
1661 return (error);
1662
1663 if (len != 0) {
1664 #ifdef COMPAT_OLDSOCK
1665 if (compat)
1666 ((struct osockaddr *)sa)->sa_family = sa->sa_family;
1667 #endif
1668 error = copyout(sa, uap->asa, (u_int)len);
1669 }
1670 free(sa, M_SONAME);
1671 if (error == 0)
1672 error = copyout(&len, uap->alen, sizeof(len));
1673 return (error);
1674 }
1675
1676 int
kern_getpeername(struct thread * td,int fd,struct sockaddr ** sa,socklen_t * alen)1677 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
1678 socklen_t *alen)
1679 {
1680 struct socket *so;
1681 struct file *fp;
1682 cap_rights_t rights;
1683 socklen_t len;
1684 int error;
1685
1686 AUDIT_ARG_FD(fd);
1687 error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME),
1688 &fp, NULL);
1689 if (error != 0)
1690 return (error);
1691 so = fp->f_data;
1692 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
1693 error = ENOTCONN;
1694 goto done;
1695 }
1696 *sa = NULL;
1697 CURVNET_SET(so->so_vnet);
1698 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
1699 CURVNET_RESTORE();
1700 if (error != 0)
1701 goto bad;
1702 if (*sa == NULL)
1703 len = 0;
1704 else
1705 len = MIN(*alen, (*sa)->sa_len);
1706 *alen = len;
1707 #ifdef KTRACE
1708 if (KTRPOINT(td, KTR_STRUCT))
1709 ktrsockaddr(*sa);
1710 #endif
1711 bad:
1712 if (error != 0 && *sa != NULL) {
1713 free(*sa, M_SONAME);
1714 *sa = NULL;
1715 }
1716 done:
1717 fdrop(fp, td);
1718 return (error);
1719 }
1720
/*
 * getpeername(2): native entry point, no old-sockaddr translation.
 */
int
sys_getpeername(struct thread *td, struct getpeername_args *uap)
{
	int error;

	error = getpeername1(td, uap, 0);
	return (error);
}
1729
#ifdef COMPAT_OLDSOCK
/*
 * Old getpeername: same as sys_getpeername() but asks getpeername1() for
 * the old sockaddr layout.
 */
int
ogetpeername(struct thread *td, struct ogetpeername_args *uap)
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(td, args, 1));
}
#endif /* COMPAT_OLDSOCK */
1741
1742 int
sockargs(mp,buf,buflen,type)1743 sockargs(mp, buf, buflen, type)
1744 struct mbuf **mp;
1745 caddr_t buf;
1746 int buflen, type;
1747 {
1748 struct sockaddr *sa;
1749 struct mbuf *m;
1750 int error;
1751
1752 if (buflen > MLEN) {
1753 #ifdef COMPAT_OLDSOCK
1754 if (type == MT_SONAME && buflen <= 112)
1755 buflen = MLEN; /* unix domain compat. hack */
1756 else
1757 #endif
1758 if (buflen > MCLBYTES)
1759 return (EINVAL);
1760 }
1761 m = m_get2(buflen, M_WAITOK, type, 0);
1762 m->m_len = buflen;
1763 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1764 if (error != 0)
1765 (void) m_free(m);
1766 else {
1767 *mp = m;
1768 if (type == MT_SONAME) {
1769 sa = mtod(m, struct sockaddr *);
1770
1771 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1772 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1773 sa->sa_family = sa->sa_len;
1774 #endif
1775 sa->sa_len = buflen;
1776 }
1777 }
1778 return (error);
1779 }
1780
1781 int
getsockaddr(namp,uaddr,len)1782 getsockaddr(namp, uaddr, len)
1783 struct sockaddr **namp;
1784 caddr_t uaddr;
1785 size_t len;
1786 {
1787 struct sockaddr *sa;
1788 int error;
1789
1790 if (len > SOCK_MAXADDRLEN)
1791 return (ENAMETOOLONG);
1792 if (len < offsetof(struct sockaddr, sa_data[0]))
1793 return (EINVAL);
1794 sa = malloc(len, M_SONAME, M_WAITOK);
1795 error = copyin(uaddr, sa, len);
1796 if (error != 0) {
1797 free(sa, M_SONAME);
1798 } else {
1799 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1800 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1801 sa->sa_family = sa->sa_len;
1802 #endif
1803 sa->sa_len = len;
1804 *namp = sa;
1805 }
1806 return (error);
1807 }
1808
/*
 * Reference count shared by the sendfile external-buffer callbacks
 * (sf_ext_ref(), sf_ext_free(), sf_ext_free_nocache()).
 */
struct sendfile_sync {
	struct mtx	mtx;	/* held while 'count' is read or changed */
	struct cv	cv;	/* signalled when 'count' drops to zero */
	unsigned	count;	/* references currently outstanding */
};
1814
1815 /*
1816 * Add more references to a vm_page + sf_buf + sendfile_sync.
1817 */
1818 void
sf_ext_ref(void * arg1,void * arg2)1819 sf_ext_ref(void *arg1, void *arg2)
1820 {
1821 struct sf_buf *sf = arg1;
1822 struct sendfile_sync *sfs = arg2;
1823 vm_page_t pg = sf_buf_page(sf);
1824
1825 sf_buf_ref(sf);
1826
1827 vm_page_lock(pg);
1828 vm_page_wire(pg);
1829 vm_page_unlock(pg);
1830
1831 if (sfs != NULL) {
1832 mtx_lock(&sfs->mtx);
1833 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
1834 sfs->count++;
1835 mtx_unlock(&sfs->mtx);
1836 }
1837 }
1838
1839 /*
1840 * Detach mapped page and release resources back to the system.
1841 */
1842 void
sf_ext_free(void * arg1,void * arg2)1843 sf_ext_free(void *arg1, void *arg2)
1844 {
1845 struct sf_buf *sf = arg1;
1846 struct sendfile_sync *sfs = arg2;
1847 vm_page_t pg = sf_buf_page(sf);
1848
1849 sf_buf_free(sf);
1850
1851 vm_page_lock(pg);
1852 /*
1853 * Check for the object going away on us. This can
1854 * happen since we don't hold a reference to it.
1855 * If so, we're responsible for freeing the page.
1856 */
1857 if (vm_page_unwire(pg, PQ_INACTIVE) && pg->object == NULL)
1858 vm_page_free(pg);
1859 vm_page_unlock(pg);
1860
1861 if (sfs != NULL) {
1862 mtx_lock(&sfs->mtx);
1863 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
1864 if (--sfs->count == 0)
1865 cv_signal(&sfs->cv);
1866 mtx_unlock(&sfs->mtx);
1867 }
1868 }
1869
1870 /*
1871 * Same as above, but forces the page to be detached from the object
1872 * and go into free pool.
1873 */
1874 void
sf_ext_free_nocache(void * arg1,void * arg2)1875 sf_ext_free_nocache(void *arg1, void *arg2)
1876 {
1877 struct sf_buf *sf = arg1;
1878 struct sendfile_sync *sfs = arg2;
1879 vm_page_t pg = sf_buf_page(sf);
1880
1881 sf_buf_free(sf);
1882
1883 vm_page_lock(pg);
1884 if (vm_page_unwire(pg, PQ_NONE)) {
1885 vm_object_t obj;
1886
1887 /* Try to free the page, but only if it is cheap to. */
1888 if ((obj = pg->object) == NULL)
1889 vm_page_free(pg);
1890 else if (!vm_page_xbusied(pg) && VM_OBJECT_TRYWLOCK(obj)) {
1891 vm_page_free(pg);
1892 VM_OBJECT_WUNLOCK(obj);
1893 } else
1894 vm_page_deactivate(pg);
1895 }
1896 vm_page_unlock(pg);
1897
1898 if (sfs != NULL) {
1899 mtx_lock(&sfs->mtx);
1900 KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
1901 if (--sfs->count == 0)
1902 cv_signal(&sfs->cv);
1903 mtx_unlock(&sfs->mtx);
1904 }
1905 }
1906
/*
 * sendfile(2)
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send the file given by 'fd', beginning at 'offset', to the socket given
 * by 's'.  Exactly 'nbytes' of the file are sent, or everything up to EOF
 * when nbytes == 0.  A header and/or trailer may optionally be added to
 * the socket output.  When 'sbytes' is supplied, the total number of
 * bytes sent is written to *sbytes.
 */
int
sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
	int error;

	error = do_sendfile(td, uap, 0);
	return (error);
}
1924
1925 static int
do_sendfile(struct thread * td,struct sendfile_args * uap,int compat)1926 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
1927 {
1928 struct sf_hdtr hdtr;
1929 struct uio *hdr_uio, *trl_uio;
1930 struct file *fp;
1931 cap_rights_t rights;
1932 off_t sbytes;
1933 int error;
1934
1935 /*
1936 * File offset must be positive. If it goes beyond EOF
1937 * we send only the header/trailer and no payload data.
1938 */
1939 if (uap->offset < 0)
1940 return (EINVAL);
1941
1942 hdr_uio = trl_uio = NULL;
1943
1944 if (uap->hdtr != NULL) {
1945 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1946 if (error != 0)
1947 goto out;
1948 if (hdtr.headers != NULL) {
1949 error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
1950 &hdr_uio);
1951 if (error != 0)
1952 goto out;
1953 }
1954 if (hdtr.trailers != NULL) {
1955 error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
1956 &trl_uio);
1957 if (error != 0)
1958 goto out;
1959 }
1960 }
1961
1962 AUDIT_ARG_FD(uap->fd);
1963
1964 /*
1965 * sendfile(2) can start at any offset within a file so we require
1966 * CAP_READ+CAP_SEEK = CAP_PREAD.
1967 */
1968 if ((error = fget_read(td, uap->fd,
1969 cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
1970 goto out;
1971 }
1972
1973 error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
1974 uap->nbytes, &sbytes, uap->flags, compat ? SFK_COMPAT : 0, td);
1975 fdrop(fp, td);
1976
1977 if (uap->sbytes != NULL)
1978 copyout(&sbytes, uap->sbytes, sizeof(off_t));
1979
1980 out:
1981 free(hdr_uio, M_IOV);
1982 free(trl_uio, M_IOV);
1983 return (error);
1984 }
1985
#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 sendfile: repack the arguments into a native sendfile_args
 * and run do_sendfile() in compat mode.
 */
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
	struct sendfile_args native;

	native.fd = uap->fd;
	native.s = uap->s;
	native.offset = uap->offset;
	native.nbytes = uap->nbytes;
	native.hdtr = uap->hdtr;
	native.sbytes = uap->sbytes;
	native.flags = uap->flags;

	return (do_sendfile(td, &native, 1));
}
#endif /* COMPAT_FREEBSD4 */
2003
2004 /*
2005 * How much data to put into page i of n.
2006 * Only first and last pages are special.
2007 */
2008 static inline off_t
xfsize(int i,int n,off_t off,off_t len)2009 xfsize(int i, int n, off_t off, off_t len)
2010 {
2011
2012 if (i == 0)
2013 return (omin(PAGE_SIZE - (off & PAGE_MASK), len));
2014
2015 if (i == n - 1 && ((off + len) & PAGE_MASK) > 0)
2016 return ((off + len) & PAGE_MASK);
2017
2018 return (PAGE_SIZE);
2019 }
2020
2021 /*
2022 * Offset within object for i page.
2023 */
2024 static inline vm_offset_t
vmoff(int i,off_t off)2025 vmoff(int i, off_t off)
2026 {
2027
2028 if (i == 0)
2029 return ((vm_offset_t)off);
2030
2031 return (trunc_page(off + i * PAGE_SIZE));
2032 }
2033
2034 /*
2035 * Pretend as if we don't have enough space, subtract xfsize() of
2036 * all pages that failed.
2037 */
2038 static inline void
fixspace(int old,int new,off_t off,int * space)2039 fixspace(int old, int new, off_t off, int *space)
2040 {
2041
2042 KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));
2043
2044 /* Subtract last one. */
2045 *space -= xfsize(old - 1, old, off, *space);
2046 old--;
2047
2048 if (new == old)
2049 /* There was only one page. */
2050 return;
2051
2052 /* Subtract first one. */
2053 if (new == 0) {
2054 *space -= xfsize(0, old, off, *space);
2055 new++;
2056 }
2057
2058 /* Rest of pages are full sized. */
2059 *space -= (old - new) * PAGE_SIZE;
2060
2061 KASSERT(*space >= 0, ("%s: space went backwards", __func__));
2062 }
2063
/*
 * Structure describing a single sendfile(2) I/O, which may consist of
 * several underlying pager I/Os.
 *
 * The syscall context allocates the structure and initializes 'nios'
 * to 1.  As sendfile_swapin() runs through pages and starts asynchronous
 * paging operations, it increments 'nios'.
 *
 * Every I/O completion calls sf_iodone(), which decrements 'nios', and
 * the syscall also calls sf_iodone() after allocating all mbufs, linking
 * them and sending them to the socket.  Whoever brings 'nios' to zero is
 * responsible for calling pru_ready on the socket, to notify it of the
 * readiness of the data.
 */
struct sf_io {
	volatile u_int	nios;		/* reference count of pending I/Os */
	u_int		error;		/* non-zero error recorded by sf_iodone() */
	int		npages;		/* number of mbufs starting at 'm' */
	struct file	*sock_fp;	/* socket file; dropped by sf_iodone() */
	struct mbuf	*m;		/* first mbuf belonging to this I/O */
	vm_page_t	pa[];		/* page vector filled by sendfile_swapin() */
};
2085
2086 static void
sf_iodone(void * arg,vm_page_t * pg,int count,int error)2087 sf_iodone(void *arg, vm_page_t *pg, int count, int error)
2088 {
2089 struct sf_io *sfio = arg;
2090 struct socket *so;
2091
2092 for (int i = 0; i < count; i++)
2093 vm_page_xunbusy(pg[i]);
2094
2095 if (error)
2096 sfio->error = error;
2097
2098 if (!refcount_release(&sfio->nios))
2099 return;
2100
2101 so = sfio->sock_fp->f_data;
2102
2103 if (sfio->error) {
2104 struct mbuf *m;
2105
2106 /*
2107 * I/O operation failed. The state of data in the socket
2108 * is now inconsistent, and all what we can do is to tear
2109 * it down. Protocol abort method would tear down protocol
2110 * state, free all ready mbufs and detach not ready ones.
2111 * We will free the mbufs corresponding to this I/O manually.
2112 *
2113 * The socket would be marked with EIO and made available
2114 * for read, so that application receives EIO on next
2115 * syscall and eventually closes the socket.
2116 */
2117 so->so_proto->pr_usrreqs->pru_abort(so);
2118 so->so_error = EIO;
2119
2120 m = sfio->m;
2121 for (int i = 0; i < sfio->npages; i++)
2122 m = m_free(m);
2123 } else {
2124 CURVNET_SET(so->so_vnet);
2125 (void )(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
2126 sfio->npages);
2127 CURVNET_RESTORE();
2128 }
2129
2130 /* XXXGL: curthread */
2131 fdrop(sfio->sock_fp, curthread);
2132 free(sfio, M_TEMP);
2133 }
2134
/*
 * Iterate through pages vector and request paging for non-valid pages.
 *
 * obj     - VM object backing the file; write-locked for the duration.
 * sfio    - per-I/O state; its pa[] vector is filled with the grabbed
 *           pages and its 'nios' count gains one reference per request.
 * off/len - byte range of the transfer within the object.
 * npages  - number of pages to grab; shortened locally if a grab fails.
 * rhpages - readahead hint, offered only to the request that reaches the
 *           last page, and dropped if a grab fails.
 * flags   - sendfile flags; only SF_NODISKIO is looked at, and it turns
 *           the page grabs into VM_ALLOC_NOWAIT.
 *
 * Returns the number of asynchronous pager requests that were started.
 * Each of them completes through sf_iodone().
 */
static int
sendfile_swapin(vm_object_t obj, struct sf_io *sfio, off_t off, off_t len,
    int npages, int rhpages, int flags)
{
	vm_page_t *pa = sfio->pa;
	int nios;

	nios = 0;
	/* From here on 'flags' holds vm_page_grab() allocation flags. */
	flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;

	/*
	 * First grab all the pages and wire them.  Note that we grab
	 * only required pages.  Readahead pages are dealt with later.
	 */
	VM_OBJECT_WLOCK(obj);
	for (int i = 0; i < npages; i++) {
		pa[i] = vm_page_grab(obj, OFF_TO_IDX(vmoff(i, off)),
		    VM_ALLOC_WIRED | VM_ALLOC_NORMAL | flags);
		if (pa[i] == NULL) {
			/* Work only with the pages obtained so far. */
			npages = i;
			rhpages = 0;
			break;
		}
	}

	/* 'i' is advanced inside the body, by one page or by a whole run. */
	for (int i = 0; i < npages;) {
		int j, a, count, rv;

		/* Skip valid pages. */
		if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
		    xfsize(i, npages, off, len))) {
			vm_page_xunbusy(pa[i]);
			SFSTAT_INC(sf_pages_valid);
			i++;
			continue;
		}

		/*
		 * Now 'i' points to first invalid page, iterate further
		 * to make 'j' point at first valid after a bunch of
		 * invalid ones.
		 */
		for (j = i + 1; j < npages; j++)
			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
			    xfsize(j, npages, off, len))) {
				SFSTAT_INC(sf_pages_valid);
				break;
			}

		/*
		 * Now we got region of invalid pages between 'i' and 'j'.
		 * Check that they belong to pager.  They may not be there,
		 * which is a regular situation for shmem pager.  For vnode
		 * pager this happens only in case of sparse file.
		 *
		 * Important feature of vm_pager_has_page() is the hint
		 * stored in 'a', about how many pages we can pagein after
		 * this page in a single I/O.
		 */
		while (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)),
		    NULL, &a) && i < j) {
			/* Not in the pager: hand out a zero-filled page. */
			pmap_zero_page(pa[i]);
			pa[i]->valid = VM_PAGE_BITS_ALL;
			pa[i]->dirty = 0;
			vm_page_xunbusy(pa[i]);
			i++;
		}
		/* The whole run was zero-filled; nothing left to page in. */
		if (i == j)
			continue;

		/*
		 * We want to pagein as many pages as possible, limited only
		 * by the 'a' hint and actual request.
		 *
		 * We should not pagein into already valid page, thus if
		 * 'j' didn't reach last page, trim by that page.
		 *
		 * When the pagein fulfils the request, also specify readahead.
		 */
		if (j < npages)
			a = min(a, j - i - 1);
		count = min(a + 1, npages - i);

		/* One reference per request, taken before it is issued. */
		refcount_acquire(&sfio->nios);
		rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
		    i + count == npages ? &rhpages : NULL,
		    &sf_iodone, sfio);
		KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p",
		    __func__, obj, pa[i]));

		SFSTAT_INC(sf_iocnt);
		SFSTAT_ADD(sf_pages_read, count);
		if (i + count == npages)
			SFSTAT_ADD(sf_rhpages_read, rhpages);

#ifdef INVARIANTS
		/* Check that pa[] still matches what the object holds. */
		for (j = i; j < i + count && j < npages; j++)
			KASSERT(pa[j] == vm_page_lookup(obj,
			    OFF_TO_IDX(vmoff(j, off))),
			    ("pa[j] %p lookup %p\n", pa[j],
			    vm_page_lookup(obj, OFF_TO_IDX(vmoff(j, off)))));
#endif
		i += count;
		nios++;
	}

	VM_OBJECT_WUNLOCK(obj);

	/* Count the calls that needed no pager I/O at all. */
	if (nios == 0 && npages != 0)
		SFSTAT_INC(sf_noiocnt);

	return (nios);
}
2251
2252 static int
sendfile_getobj(struct thread * td,struct file * fp,vm_object_t * obj_res,struct vnode ** vp_res,struct shmfd ** shmfd_res,off_t * obj_size,int * bsize)2253 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
2254 struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
2255 int *bsize)
2256 {
2257 struct vattr va;
2258 vm_object_t obj;
2259 struct vnode *vp;
2260 struct shmfd *shmfd;
2261 int error;
2262
2263 vp = *vp_res = NULL;
2264 obj = NULL;
2265 shmfd = *shmfd_res = NULL;
2266 *bsize = 0;
2267
2268 /*
2269 * The file descriptor must be a regular file and have a
2270 * backing VM object.
2271 */
2272 if (fp->f_type == DTYPE_VNODE) {
2273 vp = fp->f_vnode;
2274 vn_lock(vp, LK_SHARED | LK_RETRY);
2275 if (vp->v_type != VREG) {
2276 error = EINVAL;
2277 goto out;
2278 }
2279 *bsize = vp->v_mount->mnt_stat.f_iosize;
2280 error = VOP_GETATTR(vp, &va, td->td_ucred);
2281 if (error != 0)
2282 goto out;
2283 *obj_size = va.va_size;
2284 obj = vp->v_object;
2285 if (obj == NULL) {
2286 error = EINVAL;
2287 goto out;
2288 }
2289 } else if (fp->f_type == DTYPE_SHM) {
2290 error = 0;
2291 shmfd = fp->f_data;
2292 obj = shmfd->shm_object;
2293 *obj_size = shmfd->shm_size;
2294 } else {
2295 error = EINVAL;
2296 goto out;
2297 }
2298
2299 VM_OBJECT_WLOCK(obj);
2300 if ((obj->flags & OBJ_DEAD) != 0) {
2301 VM_OBJECT_WUNLOCK(obj);
2302 error = EBADF;
2303 goto out;
2304 }
2305
2306 /*
2307 * Temporarily increase the backing VM object's reference
2308 * count so that a forced reclamation of its vnode does not
2309 * immediately destroy it.
2310 */
2311 vm_object_reference_locked(obj);
2312 VM_OBJECT_WUNLOCK(obj);
2313 *obj_res = obj;
2314 *vp_res = vp;
2315 *shmfd_res = shmfd;
2316
2317 out:
2318 if (vp != NULL)
2319 VOP_UNLOCK(vp, 0);
2320 return (error);
2321 }
2322
2323 static int
kern_sendfile_getsock(struct thread * td,int s,struct file ** sock_fp,struct socket ** so)2324 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
2325 struct socket **so)
2326 {
2327 cap_rights_t rights;
2328 int error;
2329
2330 *sock_fp = NULL;
2331 *so = NULL;
2332
2333 /*
2334 * The socket must be a stream socket and connected.
2335 */
2336 error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND),
2337 sock_fp, NULL);
2338 if (error != 0)
2339 return (error);
2340 *so = (*sock_fp)->f_data;
2341 if ((*so)->so_type != SOCK_STREAM)
2342 return (EINVAL);
2343 if (((*so)->so_state & SS_ISCONNECTED) == 0)
2344 return (ENOTCONN);
2345 return (0);
2346 }
2347
2348 int
vn_sendfile(struct file * fp,int sockfd,struct uio * hdr_uio,struct uio * trl_uio,off_t offset,size_t nbytes,off_t * sent,int flags,int kflags,struct thread * td)2349 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
2350 struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
2351 int kflags, struct thread *td)
2352 {
2353 struct file *sock_fp;
2354 struct vnode *vp;
2355 struct vm_object *obj;
2356 struct socket *so;
2357 struct mbuf *m, *mh, *mhtail;
2358 struct sf_buf *sf;
2359 struct shmfd *shmfd;
2360 struct sendfile_sync *sfs;
2361 struct vattr va;
2362 off_t off, sbytes, rem, obj_size;
2363 int error, softerr, bsize, hdrlen;
2364
2365 obj = NULL;
2366 so = NULL;
2367 m = mh = NULL;
2368 sfs = NULL;
2369 sbytes = 0;
2370 softerr = 0;
2371
2372 error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
2373 if (error != 0)
2374 return (error);
2375
2376 error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
2377 if (error != 0)
2378 goto out;
2379
2380 #ifdef MAC
2381 error = mac_socket_check_send(td->td_ucred, so);
2382 if (error != 0)
2383 goto out;
2384 #endif
2385
2386 SFSTAT_INC(sf_syscalls);
2387 SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));
2388
2389 if (flags & SF_SYNC) {
2390 sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
2391 mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
2392 cv_init(&sfs->cv, "sendfile");
2393 }
2394
2395 /* If headers are specified copy them into mbufs. */
2396 if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
2397 hdr_uio->uio_td = td;
2398 hdr_uio->uio_rw = UIO_WRITE;
2399 /*
2400 * In FBSD < 5.0 the nbytes to send also included
2401 * the header. If compat is specified subtract the
2402 * header size from nbytes.
2403 */
2404 if (kflags & SFK_COMPAT) {
2405 if (nbytes > hdr_uio->uio_resid)
2406 nbytes -= hdr_uio->uio_resid;
2407 else
2408 nbytes = 0;
2409 }
2410 mh = m_uiotombuf(hdr_uio, M_WAITOK, 0, 0, 0);
2411 hdrlen = m_length(mh, &mhtail);
2412 } else
2413 hdrlen = 0;
2414
2415 rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
2416
2417 /*
2418 * Protect against multiple writers to the socket.
2419 *
2420 * XXXRW: Historically this has assumed non-interruptibility, so now
2421 * we implement that, but possibly shouldn't.
2422 */
2423 (void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
2424
2425 /*
2426 * Loop through the pages of the file, starting with the requested
2427 * offset. Get a file page (do I/O if necessary), map the file page
2428 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
2429 * it on the socket.
2430 * This is done in two loops. The inner loop turns as many pages
2431 * as it can, up to available socket buffer space, without blocking
2432 * into mbufs to have it bulk delivered into the socket send buffer.
2433 * The outer loop checks the state and available space of the socket
2434 * and takes care of the overall progress.
2435 */
2436 for (off = offset; rem > 0; ) {
2437 struct sf_io *sfio;
2438 vm_page_t *pa;
2439 struct mbuf *mtail;
2440 int nios, space, npages, rhpages;
2441
2442 mtail = NULL;
2443 /*
2444 * Check the socket state for ongoing connection,
2445 * no errors and space in socket buffer.
2446 * If space is low allow for the remainder of the
2447 * file to be processed if it fits the socket buffer.
2448 * Otherwise block waiting for sufficient space
2449 * to proceed, or if the socket is nonblocking, return
2450 * to userland with EAGAIN while reporting how far
2451 * we've come.
2452 * We wait until the socket buffer has significant free
2453 * space to do bulk sends. This makes good use of file
2454 * system read ahead and allows packet segmentation
2455 * offloading hardware to take over lots of work. If
2456 * we were not careful here we would send off only one
2457 * sfbuf at a time.
2458 */
2459 SOCKBUF_LOCK(&so->so_snd);
2460 if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
2461 so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
2462 retry_space:
2463 if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
2464 error = EPIPE;
2465 SOCKBUF_UNLOCK(&so->so_snd);
2466 goto done;
2467 } else if (so->so_error) {
2468 error = so->so_error;
2469 so->so_error = 0;
2470 SOCKBUF_UNLOCK(&so->so_snd);
2471 goto done;
2472 }
2473 space = sbspace(&so->so_snd);
2474 if (space < rem &&
2475 (space <= 0 ||
2476 space < so->so_snd.sb_lowat)) {
2477 if (so->so_state & SS_NBIO) {
2478 SOCKBUF_UNLOCK(&so->so_snd);
2479 error = EAGAIN;
2480 goto done;
2481 }
2482 /*
2483 * sbwait drops the lock while sleeping.
2484 * When we loop back to retry_space the
2485 * state may have changed and we retest
2486 * for it.
2487 */
2488 error = sbwait(&so->so_snd);
2489 /*
2490 * An error from sbwait usually indicates that we've
2491 * been interrupted by a signal. If we've sent anything
2492 * then return bytes sent, otherwise return the error.
2493 */
2494 if (error != 0) {
2495 SOCKBUF_UNLOCK(&so->so_snd);
2496 goto done;
2497 }
2498 goto retry_space;
2499 }
2500 SOCKBUF_UNLOCK(&so->so_snd);
2501
2502 /*
2503 * Reduce space in the socket buffer by the size of
2504 * the header mbuf chain.
2505 * hdrlen is set to 0 after the first successful outer loop pass.
2506 */
2507 space -= hdrlen;
2508
2509 if (vp != NULL) {
2510 error = vn_lock(vp, LK_SHARED);
2511 if (error != 0)
2512 goto done;
2513 error = VOP_GETATTR(vp, &va, td->td_ucred);
2514 if (error != 0 || off >= va.va_size) {
2515 VOP_UNLOCK(vp, 0);
2516 goto done;
2517 }
2518 if (va.va_size != obj_size) {
2519 if (nbytes == 0)
2520 rem += va.va_size - obj_size;
2521 else if (offset + nbytes > va.va_size)
2522 rem -= (offset + nbytes - va.va_size);
2523 obj_size = va.va_size;
2524 }
2525 }
2526
2527 if (space > rem)
2528 space = rem;
2529
2530 npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
2531
2532 /*
2533 * Calculate maximum allowed number of pages for readahead
2534 * at this iteration. First, we allow readahead up to "rem".
2535 * If application wants more, let it be, but there is no
2536 * reason to go above MAXPHYS. Also check against "obj_size",
2537 * since vm_pager_has_page() can hint beyond EOF.
2538 */
2539 rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) - npages;
2540 rhpages += SF_READAHEAD(flags);
2541 rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages);
2542 rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) -
2543 npages, rhpages);
2544
2545 sfio = malloc(sizeof(struct sf_io) +
2546 npages * sizeof(vm_page_t), M_TEMP, M_WAITOK);
2547 refcount_init(&sfio->nios, 1);
2548 sfio->error = 0;
2549
2550 nios = sendfile_swapin(obj, sfio, off, space, npages, rhpages,
2551 flags);
2552
2553 /*
2554 * Loop and construct maximum sized mbuf chain to be bulk
2555 * dumped into socket buffer.
2556 */
2557 pa = sfio->pa;
2558 for (int i = 0; i < npages; i++) {
2559 struct mbuf *m0;
2560
2561 /*
2562 * If a page wasn't grabbed successfully, then
2563 * trim the array. Can happen only with SF_NODISKIO.
2564 */
2565 if (pa[i] == NULL) {
2566 SFSTAT_INC(sf_busy);
2567 fixspace(npages, i, off, &space);
2568 npages = i;
2569 softerr = EBUSY;
2570 break;
2571 }
2572
2573 /*
2574 * Get a sendfile buf. When allocating the
2575 * first buffer for mbuf chain, we usually
2576 * wait as long as necessary, but this wait
2577 * can be interrupted. For subsequent
2578 * buffers, do not sleep, since several
2579 * threads might exhaust the buffers and then
2580 * deadlock.
2581 */
2582 sf = sf_buf_alloc(pa[i],
2583 m != NULL ? SFB_NOWAIT : SFB_CATCH);
2584 if (sf == NULL) {
2585 SFSTAT_INC(sf_allocfail);
2586 for (int j = i; j < npages; j++) {
2587 vm_page_lock(pa[j]);
2588 vm_page_unwire(pa[j], PQ_INACTIVE);
2589 vm_page_unlock(pa[j]);
2590 }
2591 if (m == NULL)
2592 softerr = ENOBUFS;
2593 fixspace(npages, i, off, &space);
2594 npages = i;
2595 break;
2596 }
2597
2598 m0 = m_get(M_WAITOK, MT_DATA);
2599 m0->m_ext.ext_buf = (char *)sf_buf_kva(sf);
2600 m0->m_ext.ext_size = PAGE_SIZE;
2601 m0->m_ext.ext_arg1 = sf;
2602 m0->m_ext.ext_arg2 = sfs;
2603 /*
2604 * SF_NOCACHE sets the page as being freed upon send.
2605 * However, we ignore it for the last page in 'space',
2606 * if the page is truncated, and we got more data to
2607 * send (rem > space), or if we have readahead
2608 * configured (rhpages > 0).
2609 */
2610 if ((flags & SF_NOCACHE) == 0 ||
2611 (i == npages - 1 &&
2612 ((off + space) & PAGE_MASK) &&
2613 (rem > space || rhpages > 0)))
2614 m0->m_ext.ext_type = EXT_SFBUF;
2615 else
2616 m0->m_ext.ext_type = EXT_SFBUF_NOCACHE;
2617 m0->m_ext.ext_flags = 0;
2618 m0->m_flags |= (M_EXT | M_RDONLY);
2619 if (nios)
2620 m0->m_flags |= M_NOTREADY;
2621 m0->m_data = (char *)sf_buf_kva(sf) +
2622 (vmoff(i, off) & PAGE_MASK);
2623 m0->m_len = xfsize(i, npages, off, space);
2624
2625 if (i == 0)
2626 sfio->m = m0;
2627
2628 /* Append to mbuf chain. */
2629 if (mtail != NULL)
2630 mtail->m_next = m0;
2631 else
2632 m = m0;
2633 mtail = m0;
2634
2635 if (sfs != NULL) {
2636 mtx_lock(&sfs->mtx);
2637 sfs->count++;
2638 mtx_unlock(&sfs->mtx);
2639 }
2640 }
2641
2642 if (vp != NULL)
2643 VOP_UNLOCK(vp, 0);
2644
2645 /* Keep track of bytes processed. */
2646 off += space;
2647 rem -= space;
2648
2649 /* Prepend header, if any. */
2650 if (hdrlen) {
2651 mhtail->m_next = m;
2652 m = mh;
2653 mh = NULL;
2654 }
2655
2656 if (m == NULL) {
2657 KASSERT(softerr, ("%s: m NULL, no error", __func__));
2658 error = softerr;
2659 free(sfio, M_TEMP);
2660 goto done;
2661 }
2662
2663 /* Add the buffer chain to the socket buffer. */
2664 KASSERT(m_length(m, NULL) == space + hdrlen,
2665 ("%s: mlen %u space %d hdrlen %d",
2666 __func__, m_length(m, NULL), space, hdrlen));
2667
2668 CURVNET_SET(so->so_vnet);
2669 if (nios == 0) {
2670 /*
2671 * If sendfile_swapin() didn't initiate any I/Os,
2672 * which happens if all data is cached in VM, then
2673 * we can send data right now without the
2674 * PRUS_NOTREADY flag.
2675 */
2676 free(sfio, M_TEMP);
2677 error = (*so->so_proto->pr_usrreqs->pru_send)
2678 (so, 0, m, NULL, NULL, td);
2679 } else {
2680 sfio->sock_fp = sock_fp;
2681 sfio->npages = npages;
2682 fhold(sock_fp);
2683 error = (*so->so_proto->pr_usrreqs->pru_send)
2684 (so, PRUS_NOTREADY, m, NULL, NULL, td);
2685 sf_iodone(sfio, NULL, 0, 0);
2686 }
2687 CURVNET_RESTORE();
2688
2689 m = NULL; /* pru_send always consumes */
2690 if (error)
2691 goto done;
2692 sbytes += space + hdrlen;
2693 if (hdrlen)
2694 hdrlen = 0;
2695 if (softerr) {
2696 error = softerr;
2697 goto done;
2698 }
2699 }
2700
2701 /*
2702 * Send trailers. Wimp out and use writev(2).
2703 */
2704 if (trl_uio != NULL) {
2705 sbunlock(&so->so_snd);
2706 error = kern_writev(td, sockfd, trl_uio);
2707 if (error == 0)
2708 sbytes += td->td_retval[0];
2709 goto out;
2710 }
2711
2712 done:
2713 sbunlock(&so->so_snd);
2714 out:
2715 /*
2716 * If there was no error we have to clear td->td_retval[0]
2717 * because it may have been set by writev.
2718 */
2719 if (error == 0) {
2720 td->td_retval[0] = 0;
2721 }
2722 if (sent != NULL) {
2723 (*sent) = sbytes;
2724 }
2725 if (obj != NULL)
2726 vm_object_deallocate(obj);
2727 if (so)
2728 fdrop(sock_fp, td);
2729 if (m)
2730 m_freem(m);
2731 if (mh)
2732 m_freem(mh);
2733
2734 if (sfs != NULL) {
2735 mtx_lock(&sfs->mtx);
2736 if (sfs->count != 0)
2737 cv_wait(&sfs->cv, &sfs->mtx);
2738 KASSERT(sfs->count == 0, ("sendfile sync still busy"));
2739 cv_destroy(&sfs->cv);
2740 mtx_destroy(&sfs->mtx);
2741 free(sfs, M_TEMP);
2742 }
2743
2744 if (error == ERESTART)
2745 error = EINTR;
2746
2747 return (error);
2748 }
2749