xref: /dragonfly/sys/kern/sys_generic.c (revision bbd7e13329c5bb82b8ee744e2c8780d994efbfbf)
1 /*
2  * Copyright (c) 1982, 1986, 1989, 1993
3  *        The Regents of the University of California.  All rights reserved.
4  * (c) UNIX System Laboratories, Inc.
5  * All or some portions of this file are derived from material licensed
6  * to the University of California by American Telephone and Telegraph
7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8  * the permission of UNIX System Laboratories, Inc.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *        @(#)sys_generic.c   8.5 (Berkeley) 1/21/94
35  * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
36  */
37 
38 #include "opt_ktrace.h"
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/sysmsg.h>
43 #include <sys/event.h>
44 #include <sys/filedesc.h>
45 #include <sys/filio.h>
46 #include <sys/fcntl.h>
47 #include <sys/file.h>
48 #include <sys/proc.h>
49 #include <sys/signalvar.h>
50 #include <sys/socketvar.h>
51 #include <sys/malloc.h>
52 #include <sys/uio.h>
53 #include <sys/kernel.h>
54 #include <sys/kern_syscall.h>
55 #include <sys/mapped_ioctl.h>
56 #include <sys/poll.h>
57 #include <sys/queue.h>
58 #include <sys/resourcevar.h>
59 #include <sys/socketops.h>
60 #include <sys/sysctl.h>
61 #include <sys/sysent.h>
62 #include <sys/buf.h>
63 #ifdef KTRACE
64 #include <sys/ktrace.h>
65 #endif
66 #include <vm/vm.h>
67 #include <vm/vm_page.h>
68 
69 #include <sys/file2.h>
70 #include <sys/spinlock2.h>
71 #include <sys/signal2.h>
72 
73 #include <machine/limits.h>
74 
75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
76 static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
78 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
79 
80 static struct krate krate_poll = { .freq = 1 };
81 
82 typedef struct kfd_set {
83         fd_mask     fds_bits[2];
84 } kfd_set;
85 
86 enum select_copyin_states {
87     COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE };
88 
89 struct select_kevent_copyin_args {
90           kfd_set             *read_set;
91           kfd_set             *write_set;
92           kfd_set             *except_set;
93           int                 active_set;         /* One of select_copyin_states */
94           struct lwp          *lwp;               /* Pointer to our lwp */
95           int                 num_fds;  /* Number of file descriptors (syscall arg) */
96           int                 proc_fds; /* Processed fd's (wraps) */
97           int                 error;              /* Returned to userland */
98 };
99 
100 struct poll_kevent_copyin_args {
101           struct lwp          *lwp;
102           struct pollfd       *fds;
103           int                 nfds;
104           int                 pfds;
105           int                 error;
106 };
107 
108 static struct lwkt_token mioctl_token = LWKT_TOKEN_INITIALIZER(mioctl_token);
109 
110 static int          doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
111                                struct timespec *ts, int *res);
112 static int          dopoll(int nfds, struct pollfd *fds, struct timespec *ts,
113                            int *res, int flags);
114 static int          dofileread(int, struct file *, struct uio *, int, size_t *);
115 static int          dofilewrite(int, struct file *, struct uio *, int, size_t *);
116 
117 /*
118  * Read system call.
119  *
120  * MPSAFE
121  */
122 int
sys_read(struct sysmsg * sysmsg,const struct read_args * uap)123 sys_read(struct sysmsg *sysmsg, const struct read_args *uap)
124 {
125           struct thread *td = curthread;
126           struct uio auio;
127           struct iovec aiov;
128           int error;
129 
130           if ((ssize_t)uap->nbyte < 0)
131                     error = EINVAL;
132 
133           aiov.iov_base = uap->buf;
134           aiov.iov_len = uap->nbyte;
135           auio.uio_iov = &aiov;
136           auio.uio_iovcnt = 1;
137           auio.uio_offset = -1;
138           auio.uio_resid = uap->nbyte;
139           auio.uio_rw = UIO_READ;
140           auio.uio_segflg = UIO_USERSPACE;
141           auio.uio_td = td;
142 
143           error = kern_preadv(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);
144           return(error);
145 }
146 
147 /*
148  * Positioned (Pread) read system call
149  *
150  * MPSAFE
151  */
152 int
sys_extpread(struct sysmsg * sysmsg,const struct extpread_args * uap)153 sys_extpread(struct sysmsg *sysmsg, const struct extpread_args *uap)
154 {
155           struct thread *td = curthread;
156           struct uio auio;
157           struct iovec aiov;
158           int error;
159           int flags;
160 
161           if ((ssize_t)uap->nbyte < 0)
162                     return(EINVAL);
163 
164           aiov.iov_base = uap->buf;
165           aiov.iov_len = uap->nbyte;
166           auio.uio_iov = &aiov;
167           auio.uio_iovcnt = 1;
168           auio.uio_offset = uap->offset;
169           auio.uio_resid = uap->nbyte;
170           auio.uio_rw = UIO_READ;
171           auio.uio_segflg = UIO_USERSPACE;
172           auio.uio_td = td;
173 
174           flags = uap->flags & O_FMASK;
175           if (uap->offset != (off_t)-1)
176                     flags |= O_FOFFSET;
177 
178           error = kern_preadv(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);
179           return(error);
180 }
181 
182 /*
183  * Scatter read system call.
184  *
185  * MPSAFE
186  */
187 int
sys_readv(struct sysmsg * sysmsg,const struct readv_args * uap)188 sys_readv(struct sysmsg *sysmsg, const struct readv_args *uap)
189 {
190           struct thread *td = curthread;
191           struct uio auio;
192           struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
193           int error;
194 
195           error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
196                                    &auio.uio_resid);
197           if (error)
198                     return (error);
199           auio.uio_iov = iov;
200           auio.uio_iovcnt = uap->iovcnt;
201           auio.uio_offset = -1;
202           auio.uio_rw = UIO_READ;
203           auio.uio_segflg = UIO_USERSPACE;
204           auio.uio_td = td;
205 
206           error = kern_preadv(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);
207 
208           iovec_free(&iov, aiov);
209           return (error);
210 }
211 
212 
213 /*
214  * Scatter positioned read system call.
215  *
216  * MPSAFE
217  */
218 int
sys_extpreadv(struct sysmsg * sysmsg,const struct extpreadv_args * uap)219 sys_extpreadv(struct sysmsg *sysmsg, const struct extpreadv_args *uap)
220 {
221           struct thread *td = curthread;
222           struct uio auio;
223           struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
224           int error;
225           int flags;
226 
227           error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
228                                    &auio.uio_resid);
229           if (error)
230                     return (error);
231           auio.uio_iov = iov;
232           auio.uio_iovcnt = uap->iovcnt;
233           auio.uio_offset = uap->offset;
234           auio.uio_rw = UIO_READ;
235           auio.uio_segflg = UIO_USERSPACE;
236           auio.uio_td = td;
237 
238           flags = uap->flags & O_FMASK;
239           if (uap->offset != (off_t)-1)
240                     flags |= O_FOFFSET;
241 
242           error = kern_preadv(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);
243 
244           iovec_free(&iov, aiov);
245           return(error);
246 }
247 
248 /*
249  * MPSAFE
250  */
251 int
kern_preadv(int fd,struct uio * auio,int flags,size_t * res)252 kern_preadv(int fd, struct uio *auio, int flags, size_t *res)
253 {
254           struct thread *td = curthread;
255           struct file *fp;
256           int error;
257 
258           fp = holdfp(td, fd, FREAD);
259           if (fp == NULL)
260                     return (EBADF);
261           if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) {
262                     error = ESPIPE;
263           } else {
264                     error = dofileread(fd, fp, auio, flags, res);
265           }
266           dropfp(td, fd, fp);
267 
268           return(error);
269 }
270 
271 /*
272  * Common code for readv and preadv that reads data in
273  * from a file using the passed in uio, offset, and flags.
274  *
275  * MPALMOSTSAFE - ktrace needs help
276  */
277 static int
dofileread(int fd,struct file * fp,struct uio * auio,int flags,size_t * res)278 dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
279 {
280           int error;
281           size_t len;
282 #ifdef KTRACE
283           struct thread *td = curthread;
284           struct iovec *ktriov = NULL;
285           struct uio ktruio;
286 #endif
287 
288 #ifdef KTRACE
289           /*
290            * if tracing, save a copy of iovec
291            */
292           if (KTRPOINT(td, KTR_GENIO))  {
293                     int iovlen = auio->uio_iovcnt * sizeof(struct iovec);
294 
295                     ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
296                     bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
297                     ktruio = *auio;
298           }
299 #endif
300           len = auio->uio_resid;
301           error = fo_read(fp, auio, fp->f_cred, flags);
302           if (error) {
303                     if (auio->uio_resid != len && (error == ERESTART ||
304                         error == EINTR || error == EWOULDBLOCK))
305                               error = 0;
306           }
307 #ifdef KTRACE
308           if (ktriov != NULL) {
309                     if (error == 0) {
310                               ktruio.uio_iov = ktriov;
311                               ktruio.uio_resid = len - auio->uio_resid;
312                               ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
313                     }
314                     kfree(ktriov, M_TEMP);
315           }
316 #endif
317           if (error == 0)
318                     *res = len - auio->uio_resid;
319 
320           return(error);
321 }
322 
323 /*
324  * Write system call
325  *
326  * MPSAFE
327  */
328 int
sys_write(struct sysmsg * sysmsg,const struct write_args * uap)329 sys_write(struct sysmsg *sysmsg, const struct write_args *uap)
330 {
331           struct thread *td = curthread;
332           struct uio auio;
333           struct iovec aiov;
334           int error;
335 
336           if ((ssize_t)uap->nbyte < 0)
337                     error = EINVAL;
338 
339           aiov.iov_base = (void *)(uintptr_t)uap->buf;
340           aiov.iov_len = uap->nbyte;
341           auio.uio_iov = &aiov;
342           auio.uio_iovcnt = 1;
343           auio.uio_offset = -1;
344           auio.uio_resid = uap->nbyte;
345           auio.uio_rw = UIO_WRITE;
346           auio.uio_segflg = UIO_USERSPACE;
347           auio.uio_td = td;
348 
349           error = kern_pwritev(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);
350 
351           return(error);
352 }
353 
354 /*
355  * Pwrite system call
356  *
357  * MPSAFE
358  */
359 int
sys_extpwrite(struct sysmsg * sysmsg,const struct extpwrite_args * uap)360 sys_extpwrite(struct sysmsg *sysmsg, const struct extpwrite_args *uap)
361 {
362           struct thread *td = curthread;
363           struct uio auio;
364           struct iovec aiov;
365           int error;
366           int flags;
367 
368           if ((ssize_t)uap->nbyte < 0)
369                     error = EINVAL;
370 
371           aiov.iov_base = (void *)(uintptr_t)uap->buf;
372           aiov.iov_len = uap->nbyte;
373           auio.uio_iov = &aiov;
374           auio.uio_iovcnt = 1;
375           auio.uio_offset = uap->offset;
376           auio.uio_resid = uap->nbyte;
377           auio.uio_rw = UIO_WRITE;
378           auio.uio_segflg = UIO_USERSPACE;
379           auio.uio_td = td;
380 
381           flags = uap->flags & O_FMASK;
382           if (uap->offset != (off_t)-1)
383                     flags |= O_FOFFSET;
384           error = kern_pwritev(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);
385           return(error);
386 }
387 
388 /*
389  * MPSAFE
390  */
391 int
sys_writev(struct sysmsg * sysmsg,const struct writev_args * uap)392 sys_writev(struct sysmsg *sysmsg, const struct writev_args *uap)
393 {
394           struct thread *td = curthread;
395           struct uio auio;
396           struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
397           int error;
398 
399           error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
400                                    &auio.uio_resid);
401           if (error)
402                     return (error);
403           auio.uio_iov = iov;
404           auio.uio_iovcnt = uap->iovcnt;
405           auio.uio_offset = -1;
406           auio.uio_rw = UIO_WRITE;
407           auio.uio_segflg = UIO_USERSPACE;
408           auio.uio_td = td;
409 
410           error = kern_pwritev(uap->fd, &auio, 0, &sysmsg->sysmsg_szresult);
411 
412           iovec_free(&iov, aiov);
413           return (error);
414 }
415 
416 
417 /*
418  * Gather positioned write system call
419  *
420  * MPSAFE
421  */
422 int
sys_extpwritev(struct sysmsg * sysmsg,const struct extpwritev_args * uap)423 sys_extpwritev(struct sysmsg *sysmsg, const struct extpwritev_args *uap)
424 {
425           struct thread *td = curthread;
426           struct uio auio;
427           struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
428           int error;
429           int flags;
430 
431           error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
432                                    &auio.uio_resid);
433           if (error)
434                     return (error);
435           auio.uio_iov = iov;
436           auio.uio_iovcnt = uap->iovcnt;
437           auio.uio_offset = uap->offset;
438           auio.uio_rw = UIO_WRITE;
439           auio.uio_segflg = UIO_USERSPACE;
440           auio.uio_td = td;
441 
442           flags = uap->flags & O_FMASK;
443           if (uap->offset != (off_t)-1)
444                     flags |= O_FOFFSET;
445 
446           error = kern_pwritev(uap->fd, &auio, flags, &sysmsg->sysmsg_szresult);
447 
448           iovec_free(&iov, aiov);
449           return(error);
450 }
451 
452 /*
453  * MPSAFE
454  */
455 int
kern_pwritev(int fd,struct uio * auio,int flags,size_t * res)456 kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
457 {
458           struct thread *td = curthread;
459           struct file *fp;
460           int error;
461 
462           fp = holdfp(td, fd, FWRITE);
463           if (fp == NULL)
464                     return (EBADF);
465           else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
466                     error = ESPIPE;
467           } else {
468                     error = dofilewrite(fd, fp, auio, flags, res);
469           }
470           dropfp(td, fd, fp);
471 
472           return(error);
473 }
474 
475 /*
476  * Common code for writev and pwritev that writes data to
477  * a file using the passed in uio, offset, and flags.
478  *
479  * MPALMOSTSAFE - ktrace needs help
480  */
481 static int
dofilewrite(int fd,struct file * fp,struct uio * auio,int flags,size_t * res)482 dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
483 {
484           struct thread *td = curthread;
485           struct lwp *lp = td->td_lwp;
486           int error;
487           size_t len;
488 #ifdef KTRACE
489           struct iovec *ktriov = NULL;
490           struct uio ktruio;
491 #endif
492 
493 #ifdef KTRACE
494           /*
495            * if tracing, save a copy of iovec and uio
496            */
497           if (KTRPOINT(td, KTR_GENIO))  {
498                     int iovlen = auio->uio_iovcnt * sizeof(struct iovec);
499 
500                     ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
501                     bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
502                     ktruio = *auio;
503           }
504 #endif
505           len = auio->uio_resid;
506           error = fo_write(fp, auio, fp->f_cred, flags);
507           if (error) {
508                     if (auio->uio_resid != len && (error == ERESTART ||
509                         error == EINTR || error == EWOULDBLOCK))
510                               error = 0;
511                     /* Socket layer is responsible for issuing SIGPIPE. */
512                     if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
513                               lwpsignal(lp->lwp_proc, lp, SIGPIPE);
514           }
515 #ifdef KTRACE
516           if (ktriov != NULL) {
517                     if (error == 0) {
518                               ktruio.uio_iov = ktriov;
519                               ktruio.uio_resid = len - auio->uio_resid;
520                               ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
521                     }
522                     kfree(ktriov, M_TEMP);
523           }
524 #endif
525           if (error == 0)
526                     *res = len - auio->uio_resid;
527 
528           return(error);
529 }
530 
531 /*
532  * Ioctl system call
533  *
534  * MPSAFE
535  */
536 int
sys_ioctl(struct sysmsg * sysmsg,const struct ioctl_args * uap)537 sys_ioctl(struct sysmsg *sysmsg, const struct ioctl_args *uap)
538 {
539           int error;
540 
541           error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, sysmsg);
542           return (error);
543 }
544 
545 struct ioctl_map_entry {
546           const char *subsys;
547           struct ioctl_map_range *cmd_ranges;
548           LIST_ENTRY(ioctl_map_entry) entries;
549 };
550 
551 /*
552  * The true heart of all ioctl syscall handlers (native, emulation).
553  * If map != NULL, it will be searched for a matching entry for com,
554  * and appropriate conversions/conversion functions will be utilized.
555  *
556  * MPSAFE
557  */
558 int
mapped_ioctl(int fd,u_long com,caddr_t uspc_data,struct ioctl_map * map,struct sysmsg * msg)559 mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
560                struct sysmsg *msg)
561 {
562           struct thread *td = curthread;
563           struct proc *p = td->td_proc;
564           struct ucred *cred;
565           struct file *fp;
566           struct ioctl_map_range *iomc = NULL;
567           int error;
568           u_int size;
569           u_long ocom = com;
570           caddr_t data, memp;
571           int tmp;
572 #define STK_PARAMS  128
573           union {
574               char stkbuf[STK_PARAMS];
575               long align;
576           } ubuf;
577 
578           KKASSERT(p);
579           cred = td->td_ucred;
580           memp = NULL;
581 
582           fp = holdfp(td, fd, FREAD|FWRITE);
583           if (fp == NULL)
584                     return(EBADF);
585 
586           if (map != NULL) {  /* obey translation map */
587                     u_long maskcmd;
588                     struct ioctl_map_entry *e;
589 
590                     maskcmd = com & map->mask;
591 
592                     lwkt_gettoken(&mioctl_token);
593                     LIST_FOREACH(e, &map->mapping, entries) {
594                               for (iomc = e->cmd_ranges; iomc->start != 0 ||
595                                    iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
596                                    iomc->mapfunc != NULL;
597                                    iomc++) {
598                                         if (maskcmd >= iomc->start &&
599                                             maskcmd <= iomc->end)
600                                                   break;
601                               }
602 
603                               /* Did we find a match? */
604                               if (iomc->start != 0 || iomc->maptocmd != 0 ||
605                                   iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
606                                         break;
607                     }
608                     lwkt_reltoken(&mioctl_token);
609 
610                     if (iomc == NULL ||
611                         (iomc->start == 0 && iomc->maptocmd == 0
612                          && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
613                               krateprintf(&krate_poll,
614                                         "%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) "
615                                         "not implemented\n",
616                                      map->sys, fd, maskcmd,
617                                      (int)((maskcmd >> 8) & 0xff),
618                                      (int)(maskcmd & 0xff));
619                               error = EINVAL;
620                               goto done;
621                     }
622 
623                     /*
624                      * If it's a non-range one to one mapping, maptocmd should be
625                      * correct. If it's a ranged one to one mapping, we pass the
626                      * original value of com, and for a range mapped to a different
627                      * range, we always need a mapping function to translate the
628                      * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
629                      */
630                     if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
631                               com = iomc->maptocmd;
632                     } else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
633                               if (iomc->mapfunc != NULL)
634                                         com = iomc->mapfunc(iomc->start, iomc->end,
635                                                                 iomc->start, iomc->end,
636                                                                 com, com);
637                     } else {
638                               if (iomc->mapfunc != NULL) {
639                                         com = iomc->mapfunc(iomc->start, iomc->end,
640                                                                 iomc->maptocmd, iomc->maptoend,
641                                                                 com, ocom);
642                               } else {
643                                         krateprintf(&krate_poll,
644                                                   "%s: Invalid mapping for fd=%d, "
645                                                   "cmd=%#lx ('%c',%d)\n",
646                                                   map->sys, fd, maskcmd,
647                                                   (int)((maskcmd >> 8) & 0xff),
648                                                   (int)(maskcmd & 0xff));
649                                         error = EINVAL;
650                                         goto done;
651                               }
652                     }
653           }
654 
655           switch (com) {
656           case FIONCLEX:
657                     error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
658                     goto done;
659           case FIOCLEX:
660                     error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
661                     goto done;
662           }
663 
664           /*
665            * Interpret high order word to find amount of data to be
666            * copied to/from the user's address space.
667            */
668           size = IOCPARM_LEN(com);
669           if (size > IOCPARM_MAX) {
670                     error = ENOTTY;
671                     goto done;
672           }
673 
674           if ((com & IOC_VOID) == 0 && size > sizeof(ubuf.stkbuf)) {
675                     memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
676                     data = memp;
677           } else {
678                     memp = NULL;
679                     data = ubuf.stkbuf;
680           }
681           if (com & IOC_VOID) {
682                     *(caddr_t *)data = uspc_data;
683           } else if (com & IOC_IN) {
684                     if (size != 0) {
685                               error = copyin(uspc_data, data, (size_t)size);
686                               if (error)
687                                         goto done;
688                     } else {
689                               *(caddr_t *)data = uspc_data;
690                     }
691           } else if ((com & IOC_OUT) != 0 && size) {
692                     /*
693                      * Zero the buffer so the user always
694                      * gets back something deterministic.
695                      */
696                     bzero(data, (size_t)size);
697           }
698 
699           switch (com) {
700           case FIONBIO:
701                     if ((tmp = *(int *)data))
702                               atomic_set_int(&fp->f_flag, FNONBLOCK);
703                     else
704                               atomic_clear_int(&fp->f_flag, FNONBLOCK);
705                     error = 0;
706                     break;
707 
708           case FIOASYNC:
709                     if ((tmp = *(int *)data))
710                               atomic_set_int(&fp->f_flag, FASYNC);
711                     else
712                               atomic_clear_int(&fp->f_flag, FASYNC);
713                     error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
714                     break;
715 
716           default:
717                     /*
718                      *  If there is a override function,
719                      *  call it instead of directly routing the call
720                      */
721                     if (map != NULL && iomc->wrapfunc != NULL)
722                               error = iomc->wrapfunc(fp, com, ocom, data, cred);
723                     else
724                               error = fo_ioctl(fp, com, data, cred, msg);
725                     /*
726                      * Copy any data to user, size was
727                      * already set and checked above.
728                      */
729                     if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
730                               error = copyout(data, uspc_data, (size_t)size);
731                     break;
732           }
733 done:
734           if (memp != NULL)
735                     kfree(memp, M_IOCTLOPS);
736           dropfp(td, fd, fp);
737 
738           return(error);
739 }
740 
741 /*
742  * MPSAFE
743  */
744 int
mapped_ioctl_register_handler(struct ioctl_map_handler * he)745 mapped_ioctl_register_handler(struct ioctl_map_handler *he)
746 {
747           struct ioctl_map_entry *ne;
748 
749           KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
750                      he->subsys != NULL && *he->subsys != '\0');
751 
752           ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP,
753                          M_WAITOK | M_ZERO);
754 
755           ne->subsys = he->subsys;
756           ne->cmd_ranges = he->cmd_ranges;
757 
758           lwkt_gettoken(&mioctl_token);
759           LIST_INSERT_HEAD(&he->map->mapping, ne, entries);
760           lwkt_reltoken(&mioctl_token);
761 
762           return(0);
763 }
764 
765 /*
766  * MPSAFE
767  */
768 int
mapped_ioctl_unregister_handler(struct ioctl_map_handler * he)769 mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
770 {
771           struct ioctl_map_entry *ne;
772           int error = EINVAL;
773 
774           KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);
775 
776           lwkt_gettoken(&mioctl_token);
777           LIST_FOREACH(ne, &he->map->mapping, entries) {
778                     if (ne->cmd_ranges == he->cmd_ranges) {
779                               LIST_REMOVE(ne, entries);
780                               kfree(ne, M_IOCTLMAP);
781                               error = 0;
782                               break;
783                     }
784           }
785           lwkt_reltoken(&mioctl_token);
786           return(error);
787 }
788 
789 static int          nseldebug;
790 SYSCTL_INT(_kern, OID_AUTO, nseldebug, CTLFLAG_RW, &nseldebug, 0, "");
791 
792 /*
793  * Select system call.
794  *
795  * MPSAFE
796  */
797 int
sys_select(struct sysmsg * sysmsg,const struct select_args * uap)798 sys_select(struct sysmsg *sysmsg, const struct select_args *uap)
799 {
800           struct timeval ktv;
801           struct timespec *ktsp, kts;
802           int error;
803 
804           /*
805            * Get timeout if any.
806            */
807           if (uap->tv != NULL) {
808                     error = copyin(uap->tv, &ktv, sizeof (ktv));
809                     if (error)
810                               return (error);
811                     TIMEVAL_TO_TIMESPEC(&ktv, &kts);
812                     ktsp = &kts;
813           } else {
814                     ktsp = NULL;
815           }
816 
817           /*
818            * Do real work.
819            */
820           error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
821                                &sysmsg->sysmsg_result);
822 
823           return (error);
824 }
825 
826 
827 /*
828  * Pselect system call.
829  */
830 int
sys_pselect(struct sysmsg * sysmsg,const struct pselect_args * uap)831 sys_pselect(struct sysmsg *sysmsg, const struct pselect_args *uap)
832 {
833           struct thread *td = curthread;
834           struct lwp *lp = td->td_lwp;
835           struct timespec *ktsp, kts;
836           sigset_t sigmask;
837           int error;
838 
839           /*
840            * Get timeout if any.
841            */
842           if (uap->ts != NULL) {
843                     error = copyin(uap->ts, &kts, sizeof (kts));
844                     if (error)
845                               return (error);
846                     ktsp = &kts;
847           } else {
848                     ktsp = NULL;
849           }
850 
851           /*
852            * Install temporary signal mask if any provided.
853            */
854           if (uap->sigmask != NULL) {
855                     error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
856                     if (error)
857                               return (error);
858                     lwkt_gettoken(&lp->lwp_proc->p_token);
859                     lp->lwp_oldsigmask = lp->lwp_sigmask;
860                     SIG_CANTMASK(sigmask);
861                     lp->lwp_sigmask = sigmask;
862                     lwkt_reltoken(&lp->lwp_proc->p_token);
863                     sigirefs_wait(lp->lwp_proc);
864           }
865 
866           /*
867            * Do real job.
868            */
869           error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
870                                &sysmsg->sysmsg_result);
871 
872           if (uap->sigmask != NULL) {
873                     lwkt_gettoken(&lp->lwp_proc->p_token);
874                     /* doselect() responsible for turning ERESTART into EINTR */
875                     KKASSERT(error != ERESTART);
876                     if (error == EINTR) {
877                               /*
878                                * We can't restore the previous signal mask now
879                                * because it could block the signal that interrupted
880                                * us.  So make a note to restore it after executing
881                                * the handler.
882                                */
883                               lp->lwp_flags |= LWP_OLDMASK;
884                     } else {
885                               /*
886                                * No handler to run. Restore previous mask immediately.
887                                */
888                               lp->lwp_sigmask = lp->lwp_oldsigmask;
889                               sigirefs_wait(lp->lwp_proc);
890                     }
891                     lwkt_reltoken(&lp->lwp_proc->p_token);
892           }
893 
894           return (error);
895 }
896 
897 static int
select_copyin(void * arg,struct kevent * kevp,int maxevents,int * events)898 select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
899 {
900           struct select_kevent_copyin_args *skap = NULL;
901           struct kevent *kev;
902           int fd;
903           kfd_set *fdp = NULL;
904           short filter = 0;
905           u_int fflags = 0;
906 
907           skap = (struct select_kevent_copyin_args *)arg;
908 
909           if (*events == maxevents)
910                     return (0);
911 
912           while (skap->active_set < COPYIN_DONE) {
913                     switch (skap->active_set) {
914                     case COPYIN_READ:
915                               /*
916                                * Register descriptors for the read filter
917                                */
918                               fdp = skap->read_set;
919                               filter = EVFILT_READ;
920                               fflags = NOTE_OLDAPI;
921                               if (fdp)
922                                         break;
923                               ++skap->active_set;
924                               skap->proc_fds = 0;
925                               /* fall through */
926                     case COPYIN_WRITE:
927                               /*
928                                * Register descriptors for the write filter
929                                */
930                               fdp = skap->write_set;
931                               filter = EVFILT_WRITE;
932                               fflags = NOTE_OLDAPI;
933                               if (fdp)
934                                         break;
935                               ++skap->active_set;
936                               skap->proc_fds = 0;
937                               /* fall through */
938                     case COPYIN_EXCEPT:
939                               /*
940                                * Register descriptors for the exception filter
941                                */
942                               fdp = skap->except_set;
943                               filter = EVFILT_EXCEPT;
944                               fflags = NOTE_OLDAPI | NOTE_OOB;
945                               if (fdp)
946                                         break;
947                               ++skap->active_set;
948                               skap->proc_fds = 0;
949                               /* fall through */
950                     case COPYIN_DONE:
951                               /*
952                                * Nothing left to register
953                                */
954                               return(0);
955                               /* NOT REACHED */
956                     }
957 
958                     while (skap->proc_fds < skap->num_fds) {
959                               fd = skap->proc_fds;
960                               if (FD_ISSET(fd, fdp)) {
961                                         kev = &kevp[*events];
962                                         EV_SET(kev, fd, filter,
963                                                EV_ADD|EV_ENABLE,
964                                                fflags, 0,
965                                                (void *)(uintptr_t)
966                                                   skap->lwp->lwp_kqueue_serial);
967                                         FD_CLR(fd, fdp);
968                                         ++*events;
969 
970                                         if (nseldebug) {
971                                                   kprintf(
972                                                       "select fd %d filter %d "
973                                                       "serial %ju\n", fd, filter,
974                                                       (uintmax_t)
975                                                       skap->lwp->lwp_kqueue_serial);
976                                         }
977                               }
978                               ++skap->proc_fds;
979                               if (*events == maxevents)
980                                         return (0);
981                     }
982                     skap->active_set++;
983                     skap->proc_fds = 0;
984           }
985 
986           return (0);
987 }
988 
989 static int
select_copyout(void * arg,struct kevent * kevp,int count,int * res)990 select_copyout(void *arg, struct kevent *kevp, int count, int *res)
991 {
992           struct select_kevent_copyin_args *skap;
993           struct kevent kev;
994           int i;
995           int n;
996 
997           skap = (struct select_kevent_copyin_args *)arg;
998 
999           for (i = 0; i < count; ++i) {
1000                     /*
1001                      * Filter out and delete spurious events
1002                      */
1003                     if ((uint64_t)(uintptr_t)kevp[i].udata !=
1004                         skap->lwp->lwp_kqueue_serial)
1005                     {
1006                               panic("select_copyout: unexpected udata");
1007 deregister:
1008                               kev = kevp[i];
1009                               kev.flags = EV_DISABLE|EV_DELETE;
1010                               n = 1;
1011                               kqueue_register(&skap->lwp->lwp_kqueue, &kev, &n, 0);
1012                               if (nseldebug) {
1013                                         kprintf("select fd %ju mismatched serial %ju\n",
1014                                             (uintmax_t)kevp[i].ident,
1015                                             (uintmax_t)skap->lwp->lwp_kqueue_serial);
1016                               }
1017                               continue;
1018                     }
1019 
1020                     /*
1021                      * Handle errors
1022                      */
1023                     if (kevp[i].flags & EV_ERROR) {
1024                               int error = kevp[i].data;
1025 
1026                               switch (error) {
1027                               case EBADF:
1028                                         /*
1029                                          * A bad file descriptor is considered a
1030                                          * fatal error for select, bail out.
1031                                          */
1032                                         skap->error = error;
1033                                         *res = -1;
1034                                         return error;
1035 
1036                               default:
1037                                         /*
1038                                          * Select silently swallows any unknown errors
1039                                          * for descriptors in the read or write sets.
1040                                          *
1041                                          * ALWAYS filter out EOPNOTSUPP errors from
1042                                          * filters (at least until all filters support
1043                                          * EVFILT_EXCEPT)
1044                                          *
1045                                          * We also filter out ENODEV since dev_dkqfilter
1046                                          * returns ENODEV if EOPNOTSUPP is returned in an
1047                                          * inner call.
1048                                          *
1049                                          * XXX: fix this
1050                                          */
1051                                         if (kevp[i].filter != EVFILT_READ &&
1052                                             kevp[i].filter != EVFILT_WRITE &&
1053                                             error != EOPNOTSUPP &&
1054                                             error != ENODEV) {
1055                                                   skap->error = error;
1056                                                   *res = -1;
1057                                                   return error;
1058                                         }
1059                                         break;
1060                               }
1061 
1062                               /*
1063                                * We must deregister any unsupported select events
1064                                * to avoid a live-lock.
1065                                */
1066                               if (nseldebug) {
1067                                         kprintf("select fd %ju filter %d error %d\n",
1068                                                   (uintmax_t)kevp[i].ident,
1069                                                   kevp[i].filter, error);
1070                               }
1071                               goto deregister;
1072                     }
1073 
1074                     switch (kevp[i].filter) {
1075                     case EVFILT_READ:
1076                               FD_SET(kevp[i].ident, skap->read_set);
1077                               break;
1078                     case EVFILT_WRITE:
1079                               FD_SET(kevp[i].ident, skap->write_set);
1080                               break;
1081                     case EVFILT_EXCEPT:
1082                               FD_SET(kevp[i].ident, skap->except_set);
1083                               break;
1084                     }
1085 
1086                     ++*res;
1087           }
1088 
1089           return (0);
1090 }
1091 
1092 /*
1093  * Copy select bits in from userland.  Allocate kernel memory if the
1094  * set is large.
1095  */
1096 static int
getbits(int bytes,fd_set * in_set,kfd_set ** out_set,kfd_set * tmp_set)1097 getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set)
1098 {
1099           int error;
1100 
1101           if (in_set) {
1102                     if (bytes < sizeof(*tmp_set))
1103                               *out_set = tmp_set;
1104                     else
1105                               *out_set = kmalloc(bytes, M_SELECT, M_WAITOK);
1106                     error = copyin(in_set, *out_set, bytes);
1107           } else {
1108                     *out_set = NULL;
1109                     error = 0;
1110           }
1111           return (error);
1112 }
1113 
1114 /*
1115  * Copy returned select bits back out to userland.
1116  */
1117 static int
putbits(int bytes,kfd_set * in_set,fd_set * out_set)1118 putbits(int bytes, kfd_set *in_set, fd_set *out_set)
1119 {
1120           int error;
1121 
1122           if (in_set) {
1123                     error = copyout(in_set, out_set, bytes);
1124           } else {
1125                     error = 0;
1126           }
1127           return (error);
1128 }
1129 
1130 static int
dotimeout_only(struct timespec * ts)1131 dotimeout_only(struct timespec *ts)
1132 {
1133           return(nanosleep1(ts, NULL));
1134 }
1135 
1136 /*
1137  * Common code for sys_select() and sys_pselect().
1138  *
1139  * in, out and ex are userland pointers.  ts must point to validated
1140  * kernel-side timeout value or NULL for infinite timeout.  res must
1141  * point to syscall return value.
1142  */
1143 static int
doselect(int nd,fd_set * read,fd_set * write,fd_set * except,struct timespec * ts,int * res)1144 doselect(int nd, fd_set *read, fd_set *write, fd_set *except,
1145            struct timespec *ts, int *res)
1146 {
1147           struct proc *p = curproc;
1148           struct select_kevent_copyin_args *kap, ka;
1149           int bytes, error;
1150           kfd_set read_tmp;
1151           kfd_set write_tmp;
1152           kfd_set except_tmp;
1153 
1154           *res = 0;
1155           if (nd < 0)
1156                     return (EINVAL);
1157           if (nd == 0 && ts)
1158                     return (dotimeout_only(ts));
1159 
1160           if (nd > p->p_fd->fd_nfiles)            /* limit kmalloc */
1161                     nd = p->p_fd->fd_nfiles;
1162 
1163           kap = &ka;
1164           kap->lwp = curthread->td_lwp;
1165           kap->num_fds = nd;
1166           kap->proc_fds = 0;
1167           kap->error = 0;
1168           kap->active_set = COPYIN_READ;
1169 
1170           /*
1171            * Calculate bytes based on the number of __fd_mask[] array entries
1172            * multiplied by the size of __fd_mask.
1173            */
1174           bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask);
1175 
1176           /* kap->read_set = NULL; not needed */
1177           kap->write_set = NULL;
1178           kap->except_set = NULL;
1179 
1180           error = getbits(bytes, read, &kap->read_set, &read_tmp);
1181           if (error == 0)
1182                     error = getbits(bytes, write, &kap->write_set, &write_tmp);
1183           if (error == 0)
1184                     error = getbits(bytes, except, &kap->except_set, &except_tmp);
1185           if (error)
1186                     goto done;
1187 
1188           /*
1189            * NOTE: Make sure the max events passed to kern_kevent() is
1190            *         effectively unlimited.  (nd * 3) accomplishes this.
1191            *
1192            *         (*res) continues to increment as returned events are
1193            *         loaded in.
1194            */
1195           error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap,
1196                                   select_copyin, select_copyout, ts,
1197                                   KEVENT_AUTO_STALE);
1198           if (error == 0)
1199                     error = putbits(bytes, kap->read_set, read);
1200           if (error == 0)
1201                     error = putbits(bytes, kap->write_set, write);
1202           if (error == 0)
1203                     error = putbits(bytes, kap->except_set, except);
1204 
1205           /*
1206            * An error from an individual event that should be passed
1207            * back to userland (EBADF)
1208            */
1209           if (kap->error)
1210                     error = kap->error;
1211 
1212           /*
1213            * Clean up.
1214            */
1215 done:
1216           if (kap->read_set && kap->read_set != &read_tmp)
1217                     kfree(kap->read_set, M_SELECT);
1218           if (kap->write_set && kap->write_set != &write_tmp)
1219                     kfree(kap->write_set, M_SELECT);
1220           if (kap->except_set && kap->except_set != &except_tmp)
1221                     kfree(kap->except_set, M_SELECT);
1222 
1223           kap->lwp->lwp_kqueue_serial += kap->num_fds;
1224 
1225           return (error);
1226 }
1227 
1228 /*
1229  * Poll system call.
1230  *
1231  * MPSAFE
1232  */
1233 int
sys_poll(struct sysmsg * sysmsg,const struct poll_args * uap)1234 sys_poll(struct sysmsg *sysmsg, const struct poll_args *uap)
1235 {
1236           struct timespec ts, *tsp;
1237           int error;
1238 
1239           if (uap->timeout != INFTIM) {
1240                     if (uap->timeout < 0)
1241                               return (EINVAL);
1242                     ts.tv_sec = uap->timeout / 1000;
1243                     ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000;
1244                     tsp = &ts;
1245           } else {
1246                     tsp = NULL;
1247           }
1248 
1249           error = dopoll(uap->nfds, uap->fds, tsp, &sysmsg->sysmsg_result, 0);
1250 
1251           return (error);
1252 }
1253 
1254 /*
1255  * Ppoll system call.
1256  *
1257  * MPSAFE
1258  */
1259 int
sys_ppoll(struct sysmsg * sysmsg,const struct ppoll_args * uap)1260 sys_ppoll(struct sysmsg *sysmsg, const struct ppoll_args *uap)
1261 {
1262           struct thread *td = curthread;
1263           struct lwp *lp = td->td_lwp;
1264           struct timespec *ktsp, kts;
1265           sigset_t sigmask;
1266           int error;
1267 
1268           /*
1269            * Get timeout if any.
1270            */
1271           if (uap->ts != NULL) {
1272                     error = copyin(uap->ts, &kts, sizeof (kts));
1273                     if (error)
1274                               return (error);
1275                     ktsp = &kts;
1276           } else {
1277                     ktsp = NULL;
1278           }
1279 
1280           /*
1281            * Install temporary signal mask if any provided.
1282            */
1283           if (uap->sigmask != NULL) {
1284                     error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
1285                     if (error)
1286                               return (error);
1287                     lwkt_gettoken(&lp->lwp_proc->p_token);
1288                     lp->lwp_oldsigmask = lp->lwp_sigmask;
1289                     SIG_CANTMASK(sigmask);
1290                     lp->lwp_sigmask = sigmask;
1291                     lwkt_reltoken(&lp->lwp_proc->p_token);
1292                     sigirefs_wait(lp->lwp_proc);
1293           }
1294 
1295           error = dopoll(uap->nfds, uap->fds, ktsp, &sysmsg->sysmsg_result,
1296               ktsp != NULL ? KEVENT_TIMEOUT_PRECISE : 0);
1297 
1298           if (uap->sigmask != NULL) {
1299                     lwkt_gettoken(&lp->lwp_proc->p_token);
1300                     /* dopoll() responsible for turning ERESTART into EINTR */
1301                     KKASSERT(error != ERESTART);
1302                     if (error == EINTR) {
1303                               /*
1304                                * We can't restore the previous signal mask now
1305                                * because it could block the signal that interrupted
1306                                * us.  So make a note to restore it after executing
1307                                * the handler.
1308                                */
1309                               lp->lwp_flags |= LWP_OLDMASK;
1310                     } else {
1311                               /*
1312                                * No handler to run. Restore previous mask immediately.
1313                                */
1314                               lp->lwp_sigmask = lp->lwp_oldsigmask;
1315                               sigirefs_wait(lp->lwp_proc);
1316                     }
1317                     lwkt_reltoken(&lp->lwp_proc->p_token);
1318           }
1319 
1320           return (error);
1321 }
1322 
1323 static int
poll_copyin(void * arg,struct kevent * kevp,int maxevents,int * events)1324 poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
1325 {
1326           struct poll_kevent_copyin_args *pkap;
1327           struct pollfd *pfd;
1328           struct kevent *kev;
1329           int kev_count;
1330 
1331           pkap = (struct poll_kevent_copyin_args *)arg;
1332 
1333           while (pkap->pfds < pkap->nfds) {
1334                     pfd = &pkap->fds[pkap->pfds];
1335 
1336                     /* Clear return events */
1337                     pfd->revents = 0;
1338 
1339                     /* Do not check if fd is equal to -1 */
1340                     if (pfd->fd == -1) {
1341                               ++pkap->pfds;
1342                               continue;
1343                     }
1344 
1345                     /*
1346                      * NOTE: pfd->events == 0 implies POLLHUP in BSDs.  Used
1347                      *         by at least sshd and X11 udev support.
1348                      */
1349                     kev_count = 0;
1350                     if (pfd->events == 0)
1351                               kev_count++;
1352                     if (pfd->events & (POLLIN | POLLHUP | POLLRDNORM))
1353                               kev_count++;
1354                     if (pfd->events & (POLLOUT | POLLWRNORM))
1355                               kev_count++;
1356                     if (pfd->events & (POLLPRI | POLLRDBAND))
1357                               kev_count++;
1358 
1359                     if (*events + kev_count > maxevents)
1360                               return (0);
1361 
1362                     /*
1363                      * NOTE: A combined serial number and poll array index is
1364                      *         stored in kev->udata.
1365                      *
1366                      * NOTE: Events will be registered with KEVENT_UNIQUE_NOTES
1367                      *         set, using kev->data for the uniqifier.  kev->data
1368                      *         is an implied in the actual registration.
1369                      */
1370                     kev = &kevp[*events];
1371 
1372                     /*
1373                      * Implied POLLHUP
1374                      */
1375                     if (pfd->events == 0) {
1376                               int notes = NOTE_OLDAPI | NOTE_HUPONLY;
1377 
1378                               EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
1379                                      notes, pkap->pfds, (void *)(uintptr_t)
1380                                         (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
1381                     }
1382 
1383                     /*
1384                      * Nominal read events
1385                      */
1386                     if (pfd->events & (POLLIN | POLLHUP | POLLRDNORM)) {
1387                               int notes = NOTE_OLDAPI;
1388                               if ((pfd->events & (POLLIN | POLLRDNORM)) == 0)
1389                                         notes |= NOTE_HUPONLY;
1390 
1391                               EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
1392                                      notes, pkap->pfds, (void *)(uintptr_t)
1393                                         (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
1394                     }
1395 
1396                     /*
1397                      * Nominal write events
1398                      */
1399                     if (pfd->events & (POLLOUT | POLLWRNORM)) {
1400                               EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
1401                                      NOTE_OLDAPI, pkap->pfds, (void *)(uintptr_t)
1402                                         (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
1403                     }
1404 
1405                     /*
1406                      * Nominal exceptional events
1407                      */
1408                     if (pfd->events & (POLLPRI | POLLRDBAND)) {
1409                               EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
1410                                      NOTE_OLDAPI | NOTE_OOB, pkap->pfds,
1411                                      (void *)(uintptr_t)
1412                                         (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
1413                     }
1414 
1415                     if (nseldebug) {
1416                               kprintf("poll index %d/%d fd %d events %08x "
1417                                   "serial %ju\n", pkap->pfds, pkap->nfds-1,
1418                                   pfd->fd, pfd->events,
1419                                   (uintmax_t)pkap->lwp->lwp_kqueue_serial);
1420                     }
1421 
1422                     ++pkap->pfds;
1423                     (*events) += kev_count;
1424           }
1425 
1426           return (0);
1427 }
1428 
1429 static int
poll_copyout(void * arg,struct kevent * kevp,int count,int * res)1430 poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
1431 {
1432           struct poll_kevent_copyin_args *pkap;
1433           struct pollfd *pfd;
1434           struct kevent kev;
1435           int count_res;
1436           int i;
1437           int n;
1438           uint64_t pi;
1439 
1440           pkap = (struct poll_kevent_copyin_args *)arg;
1441 
1442           for (i = 0; i < count; ++i) {
1443                     /*
1444                      * Extract the poll array index and delete spurious events.
1445                      * We can easily tell if the serial number is incorrect
1446                      * by checking whether the extracted index is out of range.
1447                      */
1448                     pi = (uint64_t)(uintptr_t)kevp[i].udata -
1449                          pkap->lwp->lwp_kqueue_serial;
1450                     if (pi >= pkap->nfds) {
1451                               panic("poll_copyout: unexpected udata");
1452 deregister:
1453                               kev = kevp[i];
1454                               kev.flags = EV_DISABLE|EV_DELETE;
1455                               kev.data = pi;      /* uniquifier */
1456                               n = 1;
1457                               kqueue_register(&pkap->lwp->lwp_kqueue, &kev, &n,
1458                                                   KEVENT_UNIQUE_NOTES);
1459                               if (nseldebug) {
1460                                         kprintf("poll index %ju out of range against "
1461                                             "serial %ju\n", (uintmax_t)pi,
1462                                             (uintmax_t)pkap->lwp->lwp_kqueue_serial);
1463                               }
1464                               continue;
1465                     }
1466 
1467                     /*
1468                      * Locate the pollfd and process events
1469                      */
1470                     pfd = &pkap->fds[pi];
1471                     if (kevp[i].ident == pfd->fd) {
1472                               /*
1473                                * A single descriptor may generate an error against
1474                                * more than one filter, make sure to set the
1475                                * appropriate flags but do not increment (*res)
1476                                * more than once.
1477                                */
1478                               count_res = (pfd->revents == 0);
1479                               if (kevp[i].flags & EV_ERROR) {
1480                                         switch(kevp[i].data) {
1481                                         case EBADF:
1482                                         case POLLNVAL:
1483                                                   /* Bad file descriptor */
1484                                                   if (count_res)
1485                                                             ++*res;
1486                                                   pfd->revents |= POLLNVAL;
1487                                                   break;
1488                                         default:
1489                                                   /*
1490                                                    * Poll silently swallows any unknown
1491                                                    * errors except in the case of POLLPRI
1492                                                    * (OOB/urgent data).
1493                                                    *
1494                                                    * ALWAYS filter out EOPNOTSUPP errors
1495                                                    * from filters, common applications
1496                                                    * set POLLPRI|POLLRDBAND and most
1497                                                    * filters do not support EVFILT_EXCEPT.
1498                                                    *
1499                                                    * We also filter out ENODEV since
1500                                                    * dev_dkqfilter returns ENODEV if
1501                                                    * EOPNOTSUPP is returned in an
1502                                                    * inner call.
1503                                                    *
1504                                                    * XXX: fix this
1505                                                    */
1506                                                   if (kevp[i].filter != EVFILT_READ &&
1507                                                       kevp[i].filter != EVFILT_WRITE &&
1508                                                       kevp[i].data != EOPNOTSUPP &&
1509                                                       kevp[i].data != ENODEV) {
1510                                                             if (count_res)
1511                                                                       ++*res;
1512                                                             pfd->revents |= POLLERR;
1513                                                   }
1514                                                   break;
1515                                         }
1516                                         if (pfd->revents == 0 && nseldebug) {
1517                                                   kprintf("poll index EV_ERROR %ju fd %d "
1518                                                             "filter %d error %jd\n",
1519                                                             (uintmax_t)pi, pfd->fd,
1520                                                             kevp[i].filter,
1521                                                             (intmax_t)kevp[i].data);
1522                                         }
1523 
1524                                         /*
1525                                          * Silently deregister any unhandled EV_ERROR
1526                                          * condition (usually EOPNOTSUPP).
1527                                          */
1528                                         if (pfd->revents == 0)
1529                                                   goto deregister;
1530                                         continue;
1531                               }
1532 
1533                               switch (kevp[i].filter) {
1534                               case EVFILT_READ:
1535                                         /*
1536                                          * NODATA on the read side can indicate a
1537                                          * half-closed situation and not necessarily
1538                                          * a disconnect, so depend on the user
1539                                          * issuing a read() and getting 0 bytes back.
1540                                          *
1541                                          * If EV_HUP is set the peer completely
1542                                          * disconnected and we can set POLLHUP.
1543                                          * Linux can return POLLHUP even if read
1544                                          * data has not been drained, so we should
1545                                          * too.
1546                                          */
1547                                         /* if (kevp[i].flags & EV_NODATA) */ {
1548                                                   if (kevp[i].flags & EV_HUP)
1549                                                             pfd->revents |= POLLHUP;
1550                                         }
1551                                         if ((kevp[i].flags & EV_EOF) &&
1552                                             kevp[i].fflags != 0)
1553                                                   pfd->revents |= POLLERR;
1554                                         if (pfd->events & POLLIN)
1555                                                   pfd->revents |= POLLIN;
1556                                         if (pfd->events & POLLRDNORM)
1557                                                   pfd->revents |= POLLRDNORM;
1558                                         break;
1559                               case EVFILT_WRITE:
1560                                         /*
1561                                          * As per the OpenGroup POLLHUP is mutually
1562                                          * exclusive with the writability flags.  I
1563                                          * consider this a bit broken but...
1564                                          *
1565                                          * In this case a disconnect is implied even
1566                                          * for a half-closed (write side) situation.
1567                                          */
1568                                         if (kevp[i].flags & EV_EOF) {
1569                                                   pfd->revents |= POLLHUP;
1570                                                   if (kevp[i].fflags != 0)
1571                                                             pfd->revents |= POLLERR;
1572                                         } else {
1573                                                   if (pfd->events & POLLOUT)
1574                                                             pfd->revents |= POLLOUT;
1575                                                   if (pfd->events & POLLWRNORM)
1576                                                             pfd->revents |= POLLWRNORM;
1577                                         }
1578                                         break;
1579                               case EVFILT_EXCEPT:
1580                                         /*
1581                                          * EV_NODATA should never be tagged for this
1582                                          * filter.
1583                                          */
1584                                         if (pfd->events & POLLPRI)
1585                                                   pfd->revents |= POLLPRI;
1586                                         if (pfd->events & POLLRDBAND)
1587                                                   pfd->revents |= POLLRDBAND;
1588                                         break;
1589                               }
1590 
1591                               if (nseldebug) {
1592                                         kprintf("poll index %ju/%d fd %d "
1593                                             "revents %08x\n", (uintmax_t)pi, pkap->nfds,
1594                                             pfd->fd, pfd->revents);
1595                               }
1596 
1597                               if (count_res && pfd->revents)
1598                                         ++*res;
1599                     }
1600 
1601                     /*
1602                      * We must deregister any kqueue poll event that does not
1603                      * set poll return bits to prevent a live-lock.
1604                      */
1605                     if (pfd->revents == 0) {
1606                               krateprintf(&krate_poll,
1607                                         "poll index %ju no-action %ju/%d "
1608                                         "events=%08x kevpfilt=%d/%08x\n",
1609                                         (uintmax_t)pi, (uintmax_t)kevp[i].ident,
1610                                         pfd->fd, pfd->events,
1611                                         kevp[i].filter, kevp[i].flags);
1612                               goto deregister;
1613                     }
1614           }
1615 
1616           return (0);
1617 }
1618 
1619 static int
dopoll(int nfds,struct pollfd * fds,struct timespec * ts,int * res,int flags)1620 dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res, int flags)
1621 {
1622           struct poll_kevent_copyin_args ka;
1623           struct pollfd sfds[64];
1624           int bytes;
1625           int error;
1626 
1627           flags |= KEVENT_AUTO_STALE | KEVENT_UNIQUE_NOTES;
1628 
1629         *res = 0;
1630         if (nfds < 0)
1631                 return (EINVAL);
1632 
1633           if (nfds == 0 && ts)
1634                     return (dotimeout_only(ts));
1635 
1636           /*
1637            * This is a bit arbitrary but we need to limit internal kmallocs.
1638            */
1639         if (nfds > maxfilesperproc * 2)
1640                 nfds = maxfilesperproc * 2;
1641           bytes = sizeof(struct pollfd) * nfds;
1642 
1643           ka.lwp = curthread->td_lwp;
1644           ka.nfds = nfds;
1645           ka.pfds = 0;
1646           ka.error = 0;
1647 
1648           if (ka.nfds < 64)
1649                     ka.fds = sfds;
1650           else
1651                     ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK);
1652 
1653           error = copyin(fds, ka.fds, bytes);
1654 
1655           if (error == 0)
1656                     error = kern_kevent(&ka.lwp->lwp_kqueue, 0x7FFFFFFF, res, &ka,
1657                                             poll_copyin, poll_copyout, ts, flags);
1658 
1659           if (error == 0)
1660                     error = copyout(ka.fds, fds, bytes);
1661 
1662           if (ka.fds != sfds)
1663                     kfree(ka.fds, M_SELECT);
1664 
1665           ka.lwp->lwp_kqueue_serial += nfds;
1666 
1667           return (error);
1668 }
1669 
1670 static int
socket_wait_copyin(void * arg,struct kevent * kevp,int maxevents,int * events)1671 socket_wait_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
1672 {
1673           return (0);
1674 }
1675 
1676 static int
socket_wait_copyout(void * arg,struct kevent * kevp,int count,int * res)1677 socket_wait_copyout(void *arg, struct kevent *kevp, int count, int *res)
1678 {
1679           ++*res;
1680           return (0);
1681 }
1682 
1683 extern    struct fileops socketops;
1684 
1685 /*
1686  * NOTE: Callers of socket_wait() must already have a reference on the
1687  *         socket.
1688  */
1689 int
socket_wait(struct socket * so,struct timespec * ts,int * res)1690 socket_wait(struct socket *so, struct timespec *ts, int *res)
1691 {
1692           struct thread *td = curthread;
1693           struct file *fp;
1694           struct kqueue kq;
1695           struct kevent kev;
1696           int error, fd;
1697           int n;
1698 
1699           if ((error = falloc(td->td_lwp, &fp, &fd)) != 0)
1700                     return (error);
1701 
1702           fp->f_type = DTYPE_SOCKET;
1703           fp->f_flag = FREAD | FWRITE;
1704           fp->f_ops = &socketops;
1705           fp->f_data = so;
1706           fsetfd(td->td_lwp->lwp_proc->p_fd, fp, fd);
1707           fsetfdflags(td->td_proc->p_fd, fd, UF_EXCLOSE);
1708 
1709           bzero(&kq, sizeof(kq));
1710           kqueue_init(&kq, td->td_lwp->lwp_proc->p_fd);
1711           EV_SET(&kev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL);
1712           n = 1;
1713           if ((error = kqueue_register(&kq, &kev, &n, 0)) != 0) {
1714                     fdrop(fp);
1715                     return (error);
1716           }
1717 
1718           error = kern_kevent(&kq, 1, res, NULL, socket_wait_copyin,
1719                                   socket_wait_copyout, ts, 0);
1720 
1721           EV_SET(&kev, fd, EVFILT_READ, EV_DELETE|EV_DISABLE, 0, 0, NULL);
1722           n = 1;
1723           kqueue_register(&kq, &kev, &n, 0);
1724           fp->f_ops = &badfileops;
1725           fdrop(fp);
1726 
1727           return (error);
1728 }
1729 
1730 /*
1731  * OpenBSD poll system call.
1732  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
1733  *
1734  * MPSAFE
1735  */
1736 int
sys_openbsd_poll(struct sysmsg * sysmsg,const struct openbsd_poll_args * uap)1737 sys_openbsd_poll(struct sysmsg *sysmsg, const struct openbsd_poll_args *uap)
1738 {
1739           return (sys_poll(sysmsg, (const struct poll_args *)uap));
1740 }
1741 
1742 /*ARGSUSED*/
1743 int
seltrue(cdev_t dev,int events)1744 seltrue(cdev_t dev, int events)
1745 {
1746           return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
1747 }
1748