1 /*        $NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $       */
2 
3 /*-
4  * Copyright (c) 2008, 2009, 2021 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
34  * Copyright (c) 2009 Apple, Inc
35  * All rights reserved.
36  *
37  * Redistribution and use in source and binary forms, with or without
38  * modification, are permitted provided that the following conditions
39  * are met:
40  * 1. Redistributions of source code must retain the above copyright
41  *    notice, this list of conditions and the following disclaimer.
42  * 2. Redistributions in binary form must reproduce the above copyright
43  *    notice, this list of conditions and the following disclaimer in the
44  *    documentation and/or other materials provided with the distribution.
45  *
46  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
47  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
50  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
51  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
52  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
53  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
54  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
55  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
56  * SUCH DAMAGE.
57  *
58  * FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
59  */
60 
61 #ifdef _KERNEL_OPT
62 #include "opt_ddb.h"
63 #endif /* _KERNEL_OPT */
64 
65 #include <sys/cdefs.h>
66 __KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $");
67 
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/kernel.h>
71 #include <sys/wait.h>
72 #include <sys/proc.h>
73 #include <sys/file.h>
74 #include <sys/select.h>
75 #include <sys/queue.h>
76 #include <sys/event.h>
77 #include <sys/eventvar.h>
78 #include <sys/poll.h>
79 #include <sys/kmem.h>
80 #include <sys/stat.h>
81 #include <sys/filedesc.h>
82 #include <sys/syscallargs.h>
83 #include <sys/kauth.h>
84 #include <sys/conf.h>
85 #include <sys/atomic.h>
86 
/*
 * Forward declarations for the file-local functions defined in this file.
 */

/*
 * kqueue internals and file-operation handlers.  The ioctl, fcntl, poll,
 * kqfilter, stat, close, restart and fpathconf handlers are wired into
 * the kqueueops table.
 */
static int	kqueue_scan(file_t *, size_t, struct kevent *,
		    const struct timespec *, register_t *,
		    const struct kevent_ops *, struct kevent *,
		    size_t);
static int	kqueue_ioctl(file_t *, u_long, void *);
static int	kqueue_fcntl(file_t *, u_int, void *);
static int	kqueue_poll(file_t *, int);
static int	kqueue_kqfilter(file_t *, struct knote *);
static int	kqueue_stat(file_t *, struct stat *);
static int	kqueue_close(file_t *);
static void	kqueue_restart(file_t *);
static int	kqueue_fpathconf(file_t *, int, register_t *);
static int	kqueue_register(struct kqueue *, struct kevent *);
static void	kqueue_doclose(struct kqueue *, struct klist *, int);

/* knote life-cycle and queueing helpers. */
static void	knote_detach(struct knote *, filedesc_t *fdp, bool);
static void	knote_enqueue(struct knote *);
static void	knote_activate(struct knote *);
static void	knote_activate_locked(struct knote *);
static void	knote_deactivate_locked(struct knote *);

/*
 * Callbacks for the built-in filters; most of them are referenced from
 * the *_filtops tables defined in this file.
 */
static void	filt_kqdetach(struct knote *);
static int	filt_kqueue(struct knote *, long hint);
static int	filt_procattach(struct knote *);
static void	filt_procdetach(struct knote *);
static int	filt_proc(struct knote *, long hint);
static int	filt_fileattach(struct knote *);
static void	filt_timerexpire(void *x);
static int	filt_timerattach(struct knote *);
static void	filt_timerdetach(struct knote *);
static int	filt_timer(struct knote *, long hint);
static int	filt_timertouch(struct knote *, struct kevent *, long type);
static int	filt_userattach(struct knote *);
static void	filt_userdetach(struct knote *);
static int	filt_user(struct knote *, long hint);
static int	filt_usertouch(struct knote *, struct kevent *, long type);
123 
/*
 * Private knote state that should never be exposed outside
 * of kern_event.c.
 *
 * The public struct knote is embedded as a member (ki_knote); the
 * KIMPL_TO_KNOTE() and KNOTE_TO_KIMPL() macros convert between the
 * public knote pointer and its enclosing knote_impl.
 *
 * Field locking:
 *
 * q	kn_kq->kq_lock
 */
struct knote_impl {
	struct knote	ki_knote;	/* public part, returned by knote_alloc() */
	unsigned int	ki_influx;	/* q: in-flux counter */
	kmutex_t	ki_foplock;	/* for kn_filterops */
};
137 
/*
 * Convert between a struct knote_impl and the struct knote embedded in
 * it.  KNOTE_TO_KIMPL() is only meaningful for a knote that actually
 * lives inside a struct knote_impl.
 */
#define	KIMPL_TO_KNOTE(kip)	(&(kip)->ki_knote)
#define	KNOTE_TO_KIMPL(knp)	container_of((knp), struct knote_impl, ki_knote)
140 
141 static inline struct knote *
knote_alloc(bool sleepok)142 knote_alloc(bool sleepok)
143 {
144           struct knote_impl *ki;
145 
146           ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
147           mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);
148 
149           return KIMPL_TO_KNOTE(ki);
150 }
151 
152 static inline void
knote_free(struct knote * kn)153 knote_free(struct knote *kn)
154 {
155           struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
156 
157           mutex_destroy(&ki->ki_foplock);
158           kmem_free(ki, sizeof(*ki));
159 }
160 
161 static inline void
knote_foplock_enter(struct knote * kn)162 knote_foplock_enter(struct knote *kn)
163 {
164           mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
165 }
166 
167 static inline void
knote_foplock_exit(struct knote * kn)168 knote_foplock_exit(struct knote *kn)
169 {
170           mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
171 }
172 
173 static inline bool __diagused
knote_foplock_owned(struct knote * kn)174 knote_foplock_owned(struct knote *kn)
175 {
176           return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
177 }
178 
/*
 * File operations vector for kqueue descriptors.  Reading and writing
 * are routed to the enxio stub; every other operation is routed to the
 * corresponding kqueue_*() handler in this file.
 */
static const struct fileops kqueueops = {
	.fo_name = "kqueue",
	.fo_read = (void *)enxio,
	.fo_write = (void *)enxio,
	.fo_ioctl = kqueue_ioctl,
	.fo_fcntl = kqueue_fcntl,
	.fo_poll = kqueue_poll,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_restart = kqueue_restart,
	.fo_fpathconf = kqueue_fpathconf,
};
192 
193 static void
filt_nopdetach(struct knote * kn __unused)194 filt_nopdetach(struct knote *kn __unused)
195 {
196 }
197 
198 static int
filt_nopevent(struct knote * kn __unused,long hint __unused)199 filt_nopevent(struct knote *kn __unused, long hint __unused)
200 {
201           return 0;
202 }
203 
/*
 * Do-nothing filter operations, FILTEROP_ISFD flavor: no attach hook,
 * an empty detach and an event callback that always returns 0.
 * NOTE(review): presumably one of the "nop stub" tables that
 * klist_fini() installs on knotes; klist_fini() is not visible in this
 * part of the file -- confirm.
 */
static const struct filterops nop_fd_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_nopdetach,
	.f_event = filt_nopevent,
};
210 
/*
 * Do-nothing filter operations without FILTEROP_ISFD: no attach hook,
 * an empty detach and an event callback that always returns 0.
 */
static const struct filterops nop_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_nopdetach,
	.f_event = filt_nopevent,
};
217 
/*
 * Filter operations built on filt_kqdetach() and filt_kqueue().  There
 * is no attach hook.
 * NOTE(review): presumably installed on the knote by kqueue_kqfilter()
 * when a kqueue descriptor is itself being watched -- confirm.
 */
static const struct filterops kqread_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
};
224 
/* Filter operations for EVFILT_PROC (see sys_kfilters). */
static const struct filterops proc_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
};
231 
/*
 * Filter operations shared by the descriptor-based system filters
 * (EVFILT_READ, EVFILT_WRITE, EVFILT_VNODE and EVFILT_EMPTY in
 * sys_kfilters).  Only f_attach is provided here.
 *
 * file_filtops is not marked MPSAFE because it's going to call
 * fileops::fo_kqfilter(), which might not be.  That function,
 * however, will override the knote's filterops, and thus will
 * inherit the MPSAFE-ness of the back-end at that time.
 */
static const struct filterops file_filtops = {
	.f_flags = FILTEROP_ISFD,
	.f_attach = filt_fileattach,
	.f_detach = NULL,
	.f_event = NULL,
};
244 
/*
 * Filter operations for EVFILT_TIMER (see sys_kfilters).  Besides
 * attach/detach/event, this table also provides f_touch.
 */
static const struct filterops timer_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_timer,
	.f_touch = filt_timertouch,
};
252 
/*
 * Filter operations for EVFILT_USER (see sys_kfilters).  Besides
 * attach/detach/event, this table also provides f_touch.
 */
static const struct filterops user_filtops = {
	.f_flags = FILTEROP_MPSAFE,
	.f_attach = filt_userattach,
	.f_detach = filt_userdetach,
	.f_event = filt_user,
	.f_touch = filt_usertouch,
};
260 
/*
 * NOTE(review): the users of these two variables are not visible in
 * this part of the file.  By name they look like the current number of
 * kqueue callouts and the limit on that number (4096 by default) --
 * confirm against the timer filter code.
 */
static u_int	kq_ncallouts = 0;
static int	kq_calloutmax = (4 * 1024);
263 
#define	KN_HASHSIZE		64		/* XXX should be tunable */
/*
 * Hash a value: XOR val with itself shifted right by 8 bits, then
 * reduce the result with mask.
 *
 * Every use of an argument is parenthesized so that a compound
 * expression such as KN_HASH(a | b, m) hashes the whole value.
 * val is evaluated twice, so it must not have side effects.
 */
#define	KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
266 
/* Filter operations defined in other kernel source files. */
extern const struct filterops fs_filtops;	/* vfs_syscalls.c */
extern const struct filterops sig_filtops;	/* kern_sig.c */
269 
/*
 * Table for all system-defined filters.
 * These should be listed in the numeric order of the EVFILT_* defines.
 * If filtops is NULL, the filter isn't implemented in NetBSD.
 * End of list is when name is NULL.
 *
 * Note that 'refcnt' is meaningless for built-in filters.
 *
 * The same structure describes user-registered filters; for those,
 * kfilter_register() fills in name/namelen via kmem_strdupsize() and
 * points filtops at a private copy of the caller's filterops.
 */
struct kfilter {
	const char	*name;		/* name of filter */
	uint32_t	filter;		/* id of filter */
	unsigned	refcnt;		/* reference count */
	const struct filterops *filtops;/* operations for filter */
	size_t		namelen;	/* length of name string */
};
285 
/*
 * System defined filters.
 *
 * kfilter_byfilter() indexes this array directly with the filter id, so
 * the entry at index i must have filter == i.  The all-NULL entry
 * terminates the list for kfilter_byname_sys().
 */
static struct kfilter sys_kfilters[] = {
	{ "EVFILT_READ",	EVFILT_READ,	0, &file_filtops, 0 },
	{ "EVFILT_WRITE",	EVFILT_WRITE,	0, &file_filtops, 0, },
	{ "EVFILT_AIO",		EVFILT_AIO,	0, NULL, 0 },	/* not implemented */
	{ "EVFILT_VNODE",	EVFILT_VNODE,	0, &file_filtops, 0 },
	{ "EVFILT_PROC",	EVFILT_PROC,	0, &proc_filtops, 0 },
	{ "EVFILT_SIGNAL",	EVFILT_SIGNAL,	0, &sig_filtops, 0 },
	{ "EVFILT_TIMER",	EVFILT_TIMER,	0, &timer_filtops, 0 },
	{ "EVFILT_FS",		EVFILT_FS,	0, &fs_filtops, 0 },
	{ "EVFILT_USER",	EVFILT_USER,	0, &user_filtops, 0 },
	{ "EVFILT_EMPTY",	EVFILT_EMPTY,	0, &file_filtops, 0 },
	{ NULL,			0,		0, NULL, 0 },	/* end of list */
};
300 
/*
 * User defined kfilters.
 *
 * kfilter_register() updates these with kqueue_filter_lock held as a
 * writer; the lookup functions assert that the lock is held.
 */
static struct kfilter	*user_kfilters;		/* array */
static int		user_kfilterc;		/* current offset: slots handed out so far */
static int		user_kfiltermaxc;	/* max size so far, in entries */
static size_t		user_kfiltersz;		/* size of allocated memory, in bytes */
306 
307 /*
308  * Global Locks.
309  *
310  * Lock order:
311  *
312  *        kqueue_filter_lock
313  *        -> kn_kq->kq_fdp->fd_lock
314  *        -> knote foplock (if taken)
315  *        -> object lock (e.g., device driver lock, &c.)
316  *        -> kn_kq->kq_lock
317  *
318  * Locking rules.  ==> indicates the lock is acquired by the backing
319  * object, locks prior are acquired before calling filter ops:
320  *
321  *        f_attach: fdp->fd_lock -> knote foplock ->
322  *          (maybe) KERNEL_LOCK ==> backing object lock
323  *
324  *        f_detach: fdp->fd_lock -> knote foplock ->
325  *           (maybe) KERNEL_LOCK ==> backing object lock
326  *
327  *        f_event via kevent: fdp->fd_lock -> knote foplock ->
328  *           (maybe) KERNEL_LOCK ==> backing object lock
329  *           N.B. NOTE_SUBMIT will never be set in the "hint" argument
330  *           in this case.
331  *
332  *        f_event via knote() (via backing object): Whatever caller guarantees.
333  *        Typically:
334  *                  f_event(NOTE_SUBMIT): caller has already acquired backing
335  *                      object lock.
336  *                  f_event(!NOTE_SUBMIT): caller has not acquired backing object
337  *                      lock, or has possibly acquired KERNEL_LOCK.  Backing object
338  *                      lock may or may not be acquired as-needed.
339  *        N.B. the knote foplock will **not** be acquired in this case.  The
340  *        caller guarantees that klist_fini() will not be called concurrently
341  *        with knote().
342  *
343  *        f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
344  *            N.B. knote foplock is **not** acquired in this case and
345  *            the caller must guarantee that klist_fini() will never
346  *            be called.  kqueue_register() restricts filters that
347  *            provide f_touch to known-safe cases.
348  *
349  *        klist_fini(): Caller must guarantee that no more knotes can
350  *            be attached to the klist, and must **not** hold the backing
351  *            object's lock; klist_fini() itself will acquire the foplock
352  *            of each knote on the klist.
353  *
354  * Locking rules when detaching knotes:
355  *
356  * There are some situations where knote submission may require dropping
357  * locks (see knote_proc_fork()).  In order to support this, it's possible
358  * to mark a knote as being 'in-flux'.  Such a knote is guaranteed not to
359  * be detached while it remains in-flux.  Because it will not be detached,
360  * locks can be dropped so e.g. memory can be allocated, locks on other
361  * data structures can be acquired, etc.  During this time, any attempt to
362  * detach an in-flux knote must wait until the knote is no longer in-flux.
363  * When this happens, the knote is marked for death (KN_WILLDETACH) and the
364  * LWP who gets to finish the detach operation is recorded in the knote's
365  * 'udata' field (which is no longer required for its original purpose once
366  * a knote is so marked).  Code paths that lead to knote_detach() must ensure
367  * that their LWP is the one tasked with its final demise after waiting for
368  * the in-flux status of the knote to clear.  Note that once a knote is
369  * marked KN_WILLDETACH, no code paths may put it into an in-flux state.
370  *
371  * Once the special circumstances have been handled, the locks are re-
372  * acquired in the proper order (object lock -> kq_lock), the knote taken
373  * out of flux, and any waiters are notified.  Because waiters must have
374  * also dropped *their* locks in order to safely block, they must re-
375  * validate all of their assumptions; see knote_detach_quiesce().  See also
376  * the kqueue_register() (EV_ADD, EV_DELETE) and kqueue_scan() (EV_ONESHOT)
377  * cases.
378  *
379  * When kqueue_scan() encounters an in-flux knote, the situation is
380  * treated like another LWP's list marker.
381  *
382  * LISTEN WELL: It is important to not hold knotes in flux for an
383  * extended period of time! In-flux knotes effectively block any
384  * progress of the kqueue_scan() operation.  Any code paths that place
385  * knotes in-flux should be careful to not block for indefinite periods
386  * of time, such as for memory allocation (i.e. KM_NOSLEEP is OK, but
387  * KM_SLEEP is not).
388  */
/*
 * Lock on filter lists.  Initialized in kqueue_init(); taken as a
 * writer by kfilter_register() and asserted held by the kfilter_by*()
 * lookup functions.
 */
static krwlock_t	kqueue_filter_lock;	/* lock on filter lists */
390 
/*
 * Wait for, or announce, a change in the in-flux state of a knote on
 * the given kqueue.
 *
 * KQ_FLUX_WAIT() sleeps on the kqueue's condition variable, passing
 * kq_lock to cv_wait() as the lock to release while asleep; the result
 * is discarded.  KQ_FLUX_WAKEUP() wakes every waiter.
 *
 * The kq argument is parenthesized so that any pointer expression may
 * be passed.  KQ_FLUX_WAIT() evaluates it twice.
 */
#define	KQ_FLUX_WAIT(kq)	(void)cv_wait(&(kq)->kq_cv, &(kq)->kq_lock)
#define	KQ_FLUX_WAKEUP(kq)	cv_broadcast(&(kq)->kq_cv)
393 
394 static inline bool
kn_in_flux(struct knote * kn)395 kn_in_flux(struct knote *kn)
396 {
397           KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
398           return KNOTE_TO_KIMPL(kn)->ki_influx != 0;
399 }
400 
401 static inline bool
kn_enter_flux(struct knote * kn)402 kn_enter_flux(struct knote *kn)
403 {
404           KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
405 
406           if (kn->kn_status & KN_WILLDETACH) {
407                     return false;
408           }
409 
410           struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
411           KASSERT(ki->ki_influx < UINT_MAX);
412           ki->ki_influx++;
413 
414           return true;
415 }
416 
417 static inline bool
kn_leave_flux(struct knote * kn)418 kn_leave_flux(struct knote *kn)
419 {
420           KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
421 
422           struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
423           KASSERT(ki->ki_influx > 0);
424           ki->ki_influx--;
425           return ki->ki_influx == 0;
426 }
427 
428 static void
kn_wait_flux(struct knote * kn,bool can_loop)429 kn_wait_flux(struct knote *kn, bool can_loop)
430 {
431           struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
432           bool loop;
433 
434           KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
435 
436           /*
437            * It may not be safe for us to touch the knote again after
438            * dropping the kq_lock.  The caller has let us know in
439            * 'can_loop'.
440            */
441           for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) {
442                     KQ_FLUX_WAIT(kn->kn_kq);
443           }
444 }
445 
/*
 * Mark a knote for death: set KN_WILLDETACH in its status and record
 * the current LWP in the kevent's udata field.  The kn argument is
 * evaluated twice.
 */
#define	KNOTE_WILLDETACH(kn)						\
do {									\
	(kn)->kn_status |= KN_WILLDETACH;				\
	(kn)->kn_kevent.udata = curlwp;					\
} while (/*CONSTCOND*/0)
451 
/*
 * Wait until the specified knote is in a quiescent state and
 * safe to detach.  Returns true if we potentially blocked (and
 * thus dropped our locks).
 *
 * Must be called with the descriptor table's fd_lock held.
 *
 * On a true return the fd_lock has been released and the caller must
 * go back around and re-validate everything.  On a false return the
 * fd_lock is still held and the knote has been marked KN_WILLDETACH
 * with the current LWP recorded as the one finishing the detach.
 * The kq_lock is taken and released internally in both cases.
 */
static bool
knote_detach_quiesce(struct knote *kn)
{
	struct kqueue *kq = kn->kn_kq;
	filedesc_t *fdp = kq->kq_fdp;

	KASSERT(mutex_owned(&fdp->fd_lock));

	mutex_spin_enter(&kq->kq_lock);
	/*
	 * There are two cases where we might see KN_WILLDETACH here:
	 *
	 * 1. Someone else has already started detaching the knote but
	 *    had to wait for it to settle first.
	 *
	 * 2. We had to wait for it to settle, and had to come back
	 *    around after re-acquiring the locks.
	 *
	 * When KN_WILLDETACH is set, we also set the LWP that claimed
	 * the prize of finishing the detach in the 'udata' field of the
	 * knote (which will never be used again for its usual purpose
	 * once the note is in this state).  If it doesn't point to us,
	 * we must drop the locks and let them in to finish the job.
	 *
	 * Otherwise, once we have claimed the knote for ourselves, we
	 * can finish waiting for it to settle.  This is the only scenario
	 * where touching a detaching knote is safe after dropping the
	 * locks.
	 */
	if ((kn->kn_status & KN_WILLDETACH) != 0 &&
	    kn->kn_kevent.udata != curlwp) {
		/*
		 * N.B. it is NOT safe for us to touch the knote again
		 * after dropping the locks here.  The caller must go
		 * back around and re-validate everything.  However, if
		 * the knote is in-flux, we want to block to minimize
		 * busy-looping.
		 */
		mutex_exit(&fdp->fd_lock);
		if (kn_in_flux(kn)) {
			/* can_loop == false: wait once, never re-read kn. */
			kn_wait_flux(kn, false);
			mutex_spin_exit(&kq->kq_lock);
			return true;
		}
		mutex_spin_exit(&kq->kq_lock);
		preempt_point();
		return true;
	}
	/*
	 * If we get here, we know that we will be claiming the
	 * detach responsibilities, or that we already have and
	 * this is the second attempt after re-validation.
	 */
	KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
	    kn->kn_kevent.udata == curlwp);
	/*
	 * Similarly, if we get here, either we are just claiming it
	 * and may have to wait for it to settle, or if this is the
	 * second attempt after re-validation that no other code paths
	 * have put it in-flux.
	 */
	KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
	    kn_in_flux(kn) == false);
	KNOTE_WILLDETACH(kn);
	if (kn_in_flux(kn)) {
		mutex_exit(&fdp->fd_lock);
		kn_wait_flux(kn, true);
		/*
		 * It is safe for us to touch the knote again after
		 * dropping the locks, but the caller must still
		 * re-validate everything because other aspects of
		 * the environment may have changed while we blocked.
		 */
		KASSERT(kn_in_flux(kn) == false);
		mutex_spin_exit(&kq->kq_lock);
		return true;
	}
	mutex_spin_exit(&kq->kq_lock);

	/* Never blocked: the fd_lock is still held by the caller. */
	return false;
}
538 
539 /*
540  * Calls into the filterops need to be resilient against things which
541  * destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
542  * chasing garbage pointers (to data, or even potentially code in a
543  * module about to be unloaded).  To that end, we acquire the
544  * knote foplock before calling into the filter ops.  When a driver
545  * (or anything else) is tearing down its klist, klist_fini() enumerates
546  * each knote, acquires its foplock, and replaces the filterops with a
547  * nop stub, allowing knote detach (when descriptors are closed) to safely
548  * proceed.
549  */
550 
551 static int
filter_attach(struct knote * kn)552 filter_attach(struct knote *kn)
553 {
554           int rv;
555 
556           KASSERT(knote_foplock_owned(kn));
557           KASSERT(kn->kn_fop != NULL);
558           KASSERT(kn->kn_fop->f_attach != NULL);
559 
560           /*
561            * N.B. that kn->kn_fop may change as the result of calling
562            * f_attach().  After f_attach() returns, kn->kn_fop may not
563            * be modified by code outside of klist_fini().
564            */
565           if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
566                     rv = kn->kn_fop->f_attach(kn);
567           } else {
568                     KERNEL_LOCK(1, NULL);
569                     rv = kn->kn_fop->f_attach(kn);
570                     KERNEL_UNLOCK_ONE(NULL);
571           }
572 
573           return rv;
574 }
575 
576 static void
filter_detach(struct knote * kn)577 filter_detach(struct knote *kn)
578 {
579 
580           KASSERT(knote_foplock_owned(kn));
581           KASSERT(kn->kn_fop != NULL);
582           KASSERT(kn->kn_fop->f_detach != NULL);
583 
584           if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
585                     kn->kn_fop->f_detach(kn);
586           } else {
587                     KERNEL_LOCK(1, NULL);
588                     kn->kn_fop->f_detach(kn);
589                     KERNEL_UNLOCK_ONE(NULL);
590           }
591 }
592 
593 static int
filter_event(struct knote * kn,long hint,bool submitting)594 filter_event(struct knote *kn, long hint, bool submitting)
595 {
596           int rv;
597 
598           /* See knote(). */
599           KASSERT(submitting || knote_foplock_owned(kn));
600           KASSERT(kn->kn_fop != NULL);
601           KASSERT(kn->kn_fop->f_event != NULL);
602 
603           if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
604                     rv = kn->kn_fop->f_event(kn, hint);
605           } else {
606                     KERNEL_LOCK(1, NULL);
607                     rv = kn->kn_fop->f_event(kn, hint);
608                     KERNEL_UNLOCK_ONE(NULL);
609           }
610 
611           return rv;
612 }
613 
614 static int
filter_touch(struct knote * kn,struct kevent * kev,long type)615 filter_touch(struct knote *kn, struct kevent *kev, long type)
616 {
617 
618           /*
619            * XXX We cannot assert that the knote foplock is held here
620            * XXX beause we cannot safely acquire it in all cases
621            * XXX where "touch" will be used in kqueue_scan().  We just
622            * XXX have to assume that f_touch will always be safe to call,
623            * XXX and kqueue_register() allows only the two known-safe
624            * XXX users of that op.
625            */
626 
627           KASSERT(kn->kn_fop != NULL);
628           KASSERT(kn->kn_fop->f_touch != NULL);
629 
630           return kn->kn_fop->f_touch(kn, kev, type);
631 }
632 
/*
 * Handle returned by kauth_listen_scope() for the process-scope
 * listener (kqueue_listener_cb) registered in kqueue_init().
 */
static kauth_listener_t	kqueue_listener;
634 
635 static int
kqueue_listener_cb(kauth_cred_t cred,kauth_action_t action,void * cookie,void * arg0,void * arg1,void * arg2,void * arg3)636 kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
637     void *arg0, void *arg1, void *arg2, void *arg3)
638 {
639           struct proc *p;
640           int result;
641 
642           result = KAUTH_RESULT_DEFER;
643           p = arg0;
644 
645           if (action != KAUTH_PROCESS_KEVENT_FILTER)
646                     return result;
647 
648           if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
649               ISSET(p->p_flag, PK_SUGID)))
650                     return result;
651 
652           result = KAUTH_RESULT_ALLOW;
653 
654           return result;
655 }
656 
657 /*
658  * Initialize the kqueue subsystem.
659  */
660 void
kqueue_init(void)661 kqueue_init(void)
662 {
663 
664           rw_init(&kqueue_filter_lock);
665 
666           kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
667               kqueue_listener_cb, NULL);
668 }
669 
670 /*
671  * Find kfilter entry by name, or NULL if not found.
672  */
673 static struct kfilter *
kfilter_byname_sys(const char * name)674 kfilter_byname_sys(const char *name)
675 {
676           int i;
677 
678           KASSERT(rw_lock_held(&kqueue_filter_lock));
679 
680           for (i = 0; sys_kfilters[i].name != NULL; i++) {
681                     if (strcmp(name, sys_kfilters[i].name) == 0)
682                               return &sys_kfilters[i];
683           }
684           return NULL;
685 }
686 
687 static struct kfilter *
kfilter_byname_user(const char * name)688 kfilter_byname_user(const char *name)
689 {
690           int i;
691 
692           KASSERT(rw_lock_held(&kqueue_filter_lock));
693 
694           /* user filter slots have a NULL name if previously deregistered */
695           for (i = 0; i < user_kfilterc ; i++) {
696                     if (user_kfilters[i].name != NULL &&
697                         strcmp(name, user_kfilters[i].name) == 0)
698                               return &user_kfilters[i];
699           }
700           return NULL;
701 }
702 
703 static struct kfilter *
kfilter_byname(const char * name)704 kfilter_byname(const char *name)
705 {
706           struct kfilter *kfilter;
707 
708           KASSERT(rw_lock_held(&kqueue_filter_lock));
709 
710           if ((kfilter = kfilter_byname_sys(name)) != NULL)
711                     return kfilter;
712 
713           return kfilter_byname_user(name);
714 }
715 
716 /*
717  * Find kfilter entry by filter id, or NULL if not found.
718  * Assumes entries are indexed in filter id order, for speed.
719  */
720 static struct kfilter *
kfilter_byfilter(uint32_t filter)721 kfilter_byfilter(uint32_t filter)
722 {
723           struct kfilter *kfilter;
724 
725           KASSERT(rw_lock_held(&kqueue_filter_lock));
726 
727           if (filter < EVFILT_SYSCOUNT) /* it's a system filter */
728                     kfilter = &sys_kfilters[filter];
729           else if (user_kfilters != NULL &&
730               filter < EVFILT_SYSCOUNT + user_kfilterc)
731                                                   /* it's a user filter */
732                     kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
733           else
734                     return (NULL);                /* out of range */
735           KASSERT(kfilter->filter == filter);     /* sanity check! */
736           return (kfilter);
737 }
738 
739 /*
740  * Register a new kfilter. Stores the entry in user_kfilters.
741  * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
742  * If retfilter != NULL, the new filterid is returned in it.
743  */
744 int
kfilter_register(const char * name,const struct filterops * filtops,int * retfilter)745 kfilter_register(const char *name, const struct filterops *filtops,
746                      int *retfilter)
747 {
748           struct kfilter *kfilter;
749           size_t len;
750           int i;
751 
752           if (name == NULL || name[0] == '\0' || filtops == NULL)
753                     return (EINVAL);    /* invalid args */
754 
755           rw_enter(&kqueue_filter_lock, RW_WRITER);
756           if (kfilter_byname(name) != NULL) {
757                     rw_exit(&kqueue_filter_lock);
758                     return (EEXIST);    /* already exists */
759           }
760           if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
761                     rw_exit(&kqueue_filter_lock);
762                     return (EINVAL);    /* too many */
763           }
764 
765           for (i = 0; i < user_kfilterc; i++) {
766                     kfilter = &user_kfilters[i];
767                     if (kfilter->name == NULL) {
768                               /* Previously deregistered slot.  Reuse. */
769                               goto reuse;
770                     }
771           }
772 
773           /* check if need to grow user_kfilters */
774           if (user_kfilterc + 1 > user_kfiltermaxc) {
775                     /* Grow in KFILTER_EXTENT chunks. */
776                     user_kfiltermaxc += KFILTER_EXTENT;
777                     len = user_kfiltermaxc * sizeof(*kfilter);
778                     kfilter = kmem_alloc(len, KM_SLEEP);
779                     memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
780                     if (user_kfilters != NULL) {
781                               memcpy(kfilter, user_kfilters, user_kfiltersz);
782                               kmem_free(user_kfilters, user_kfiltersz);
783                     }
784                     user_kfiltersz = len;
785                     user_kfilters = kfilter;
786           }
787           /* Adding new slot */
788           kfilter = &user_kfilters[user_kfilterc++];
789 reuse:
790           kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
791 
792           kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
793 
794           kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
795           memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
796 
797           if (retfilter != NULL)
798                     *retfilter = kfilter->filter;
799           rw_exit(&kqueue_filter_lock);
800 
801           return (0);
802 }
803 
804 /*
805  * Unregister a kfilter previously registered with kfilter_register.
806  * This retains the filter id, but clears the name and frees filtops (filter
807  * operations), so that the number isn't reused during a boot.
808  * Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
809  */
810 int
kfilter_unregister(const char * name)811 kfilter_unregister(const char *name)
812 {
813           struct kfilter *kfilter;
814 
815           if (name == NULL || name[0] == '\0')
816                     return (EINVAL);    /* invalid name */
817 
818           rw_enter(&kqueue_filter_lock, RW_WRITER);
819           if (kfilter_byname_sys(name) != NULL) {
820                     rw_exit(&kqueue_filter_lock);
821                     return (EINVAL);    /* can't detach system filters */
822           }
823 
824           kfilter = kfilter_byname_user(name);
825           if (kfilter == NULL) {
826                     rw_exit(&kqueue_filter_lock);
827                     return (ENOENT);
828           }
829           if (kfilter->refcnt != 0) {
830                     rw_exit(&kqueue_filter_lock);
831                     return (EBUSY);
832           }
833 
834           /* Cast away const (but we know it's safe. */
835           kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
836           kfilter->name = NULL;         /* mark as `not implemented' */
837 
838           if (kfilter->filtops != NULL) {
839                     /* Cast away const (but we know it's safe. */
840                     kmem_free(__UNCONST(kfilter->filtops),
841                         sizeof(*kfilter->filtops));
842                     kfilter->filtops = NULL; /* mark as `not implemented' */
843           }
844           rw_exit(&kqueue_filter_lock);
845 
846           return (0);
847 }
848 
849 
850 /*
851  * Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
852  * descriptors. Calls fileops kqfilter method for given file descriptor.
853  */
854 static int
filt_fileattach(struct knote * kn)855 filt_fileattach(struct knote *kn)
856 {
857           file_t *fp;
858 
859           fp = kn->kn_obj;
860 
861           return (*fp->f_ops->fo_kqfilter)(fp, kn);
862 }
863 
864 /*
865  * Filter detach method for EVFILT_READ on kqueue descriptor.
866  */
867 static void
filt_kqdetach(struct knote * kn)868 filt_kqdetach(struct knote *kn)
869 {
870           struct kqueue *kq;
871 
872           kq = ((file_t *)kn->kn_obj)->f_kqueue;
873 
874           mutex_spin_enter(&kq->kq_lock);
875           selremove_knote(&kq->kq_sel, kn);
876           mutex_spin_exit(&kq->kq_lock);
877 }
878 
879 /*
880  * Filter event method for EVFILT_READ on kqueue descriptor.
881  */
882 /*ARGSUSED*/
883 static int
filt_kqueue(struct knote * kn,long hint)884 filt_kqueue(struct knote *kn, long hint)
885 {
886           struct kqueue *kq;
887           int rv;
888 
889           kq = ((file_t *)kn->kn_obj)->f_kqueue;
890 
891           if (hint != NOTE_SUBMIT)
892                     mutex_spin_enter(&kq->kq_lock);
893           kn->kn_data = KQ_COUNT(kq);
894           rv = (kn->kn_data > 0);
895           if (hint != NOTE_SUBMIT)
896                     mutex_spin_exit(&kq->kq_lock);
897 
898           return rv;
899 }
900 
901 /*
902  * Filter attach method for EVFILT_PROC.
903  */
904 static int
filt_procattach(struct knote * kn)905 filt_procattach(struct knote *kn)
906 {
907           struct proc *p;
908 
909           mutex_enter(&proc_lock);
910           p = proc_find(kn->kn_id);
911           if (p == NULL) {
912                     mutex_exit(&proc_lock);
913                     return ESRCH;
914           }
915 
916           /*
917            * Fail if it's not owned by you, or the last exec gave us
918            * setuid/setgid privs (unless you're root).
919            */
920           mutex_enter(p->p_lock);
921           mutex_exit(&proc_lock);
922           if (kauth_authorize_process(curlwp->l_cred,
923               KAUTH_PROCESS_KEVENT_FILTER, p, NULL, NULL, NULL) != 0) {
924                     mutex_exit(p->p_lock);
925                     return EACCES;
926           }
927 
928           kn->kn_obj = p;
929           kn->kn_flags |= EV_CLEAR;     /* automatically set */
930 
931           /*
932            * NOTE_CHILD is only ever generated internally; don't let it
933            * leak in from user-space.  See knote_proc_fork_track().
934            */
935           kn->kn_sfflags &= ~NOTE_CHILD;
936 
937           klist_insert(&p->p_klist, kn);
938           mutex_exit(p->p_lock);
939 
940           return 0;
941 }
942 
943 /*
944  * Filter detach method for EVFILT_PROC.
945  *
946  * The knote may be attached to a different process, which may exit,
947  * leaving nothing for the knote to be attached to.  So when the process
948  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
949  * it will be deleted when read out.  However, as part of the knote deletion,
950  * this routine is called, so a check is needed to avoid actually performing
951  * a detach, because the original process might not exist any more.
952  */
953 static void
filt_procdetach(struct knote * kn)954 filt_procdetach(struct knote *kn)
955 {
956           struct kqueue *kq = kn->kn_kq;
957           struct proc *p;
958 
959           /*
960            * We have to synchronize with knote_proc_exit(), but we
961            * are forced to acquire the locks in the wrong order here
962            * because we can't be sure kn->kn_obj is valid unless
963            * KN_DETACHED is not set.
964            */
965  again:
966           mutex_spin_enter(&kq->kq_lock);
967           if ((kn->kn_status & KN_DETACHED) == 0) {
968                     p = kn->kn_obj;
969                     if (!mutex_tryenter(p->p_lock)) {
970                               mutex_spin_exit(&kq->kq_lock);
971                               preempt_point();
972                               goto again;
973                     }
974                     kn->kn_status |= KN_DETACHED;
975                     klist_remove(&p->p_klist, kn);
976                     mutex_exit(p->p_lock);
977           }
978           mutex_spin_exit(&kq->kq_lock);
979 }
980 
981 /*
982  * Filter event method for EVFILT_PROC.
983  *
984  * Due to some of the complexities of process locking, we have special
985  * entry points for delivering knote submissions.  filt_proc() is used
986  * only to check for activation from kqueue_register() and kqueue_scan().
987  */
988 static int
filt_proc(struct knote * kn,long hint)989 filt_proc(struct knote *kn, long hint)
990 {
991           struct kqueue *kq = kn->kn_kq;
992           uint32_t fflags;
993 
994           /*
995            * Because we share the same klist with signal knotes, just
996            * ensure that we're not being invoked for the proc-related
997            * submissions.
998            */
999           KASSERT((hint & (NOTE_EXEC | NOTE_EXIT | NOTE_FORK)) == 0);
1000 
1001           mutex_spin_enter(&kq->kq_lock);
1002           fflags = kn->kn_fflags;
1003           mutex_spin_exit(&kq->kq_lock);
1004 
1005           return fflags != 0;
1006 }
1007 
1008 void
knote_proc_exec(struct proc * p)1009 knote_proc_exec(struct proc *p)
1010 {
1011           struct knote *kn, *tmpkn;
1012           struct kqueue *kq;
1013           uint32_t fflags;
1014 
1015           mutex_enter(p->p_lock);
1016 
1017           SLIST_FOREACH_SAFE(kn, &p->p_klist, kn_selnext, tmpkn) {
1018                     /* N.B. EVFILT_SIGNAL knotes are on this same list. */
1019                     if (kn->kn_fop == &sig_filtops) {
1020                               continue;
1021                     }
1022                     KASSERT(kn->kn_fop == &proc_filtops);
1023 
1024                     kq = kn->kn_kq;
1025                     mutex_spin_enter(&kq->kq_lock);
1026                     fflags = (kn->kn_fflags |= (kn->kn_sfflags & NOTE_EXEC));
1027                     if (fflags) {
1028                               knote_activate_locked(kn);
1029                     }
1030                     mutex_spin_exit(&kq->kq_lock);
1031           }
1032 
1033           mutex_exit(p->p_lock);
1034 }
1035 
1036 static int __noinline
knote_proc_fork_track(struct proc * p1,struct proc * p2,struct knote * okn)1037 knote_proc_fork_track(struct proc *p1, struct proc *p2, struct knote *okn)
1038 {
1039           struct kqueue *kq = okn->kn_kq;
1040 
1041           KASSERT(mutex_owned(&kq->kq_lock));
1042           KASSERT(mutex_owned(p1->p_lock));
1043 
1044           /*
1045            * We're going to put this knote into flux while we drop
1046            * the locks and create and attach a new knote to track the
1047            * child.  If we are not able to enter flux, then this knote
1048            * is about to go away, so skip the notification.
1049            */
1050           if (!kn_enter_flux(okn)) {
1051                     return 0;
1052           }
1053 
1054           mutex_spin_exit(&kq->kq_lock);
1055           mutex_exit(p1->p_lock);
1056 
1057           /*
1058            * We actually have to register *two* new knotes:
1059            *
1060            * ==> One for the NOTE_CHILD notification.  This is a forced
1061            *     ONESHOT note.
1062            *
1063            * ==> One to actually track the child process as it subsequently
1064            *     forks, execs, and, ultimately, exits.
1065            *
1066            * If we only register a single knote, then it's possible for
1067            * for the NOTE_CHILD and NOTE_EXIT to be collapsed into a single
1068            * notification if the child exits before the tracking process
1069            * has received the NOTE_CHILD notification, which applications
1070            * aren't expecting (the event's 'data' field would be clobbered,
1071            * for example).
1072            *
1073            * To do this, what we have here is an **extremely** stripped-down
1074            * version of kqueue_register() that has the following properties:
1075            *
1076            * ==> Does not block to allocate memory.  If we are unable
1077            *     to allocate memory, we return ENOMEM.
1078            *
1079            * ==> Does not search for existing knotes; we know there
1080            *     are not any because this is a new process that isn't
1081            *     even visible to other processes yet.
1082            *
1083            * ==> Assumes that the knhash for our kq's descriptor table
1084            *     already exists (after all, we're already tracking
1085            *     processes with knotes if we got here).
1086            *
1087            * ==> Directly attaches the new tracking knote to the child
1088            *     process.
1089            *
1090            * The whole point is to do the minimum amount of work while the
1091            * knote is held in-flux, and to avoid doing extra work in general
1092            * (we already have the new child process; why bother looking it
1093            * up again?).
1094            */
1095           filedesc_t *fdp = kq->kq_fdp;
1096           struct knote *knchild, *kntrack;
1097           int error = 0;
1098 
1099           knchild = knote_alloc(false);
1100           kntrack = knote_alloc(false);
1101           if (__predict_false(knchild == NULL || kntrack == NULL)) {
1102                     error = ENOMEM;
1103                     goto out;
1104           }
1105 
1106           kntrack->kn_obj = p2;
1107           kntrack->kn_id = p2->p_pid;
1108           kntrack->kn_kq = kq;
1109           kntrack->kn_fop = okn->kn_fop;
1110           kntrack->kn_kfilter = okn->kn_kfilter;
1111           kntrack->kn_sfflags = okn->kn_sfflags;
1112           kntrack->kn_sdata = p1->p_pid;
1113 
1114           kntrack->kn_kevent.ident = p2->p_pid;
1115           kntrack->kn_kevent.filter = okn->kn_filter;
1116           kntrack->kn_kevent.flags =
1117               okn->kn_flags | EV_ADD | EV_ENABLE | EV_CLEAR;
1118           kntrack->kn_kevent.fflags = 0;
1119           kntrack->kn_kevent.data = 0;
1120           kntrack->kn_kevent.udata = okn->kn_kevent.udata; /* preserve udata */
1121 
1122           /*
1123            * The child note does not need to be attached to the
1124            * new proc's klist at all.
1125            */
1126           *knchild = *kntrack;
1127           knchild->kn_status = KN_DETACHED;
1128           knchild->kn_sfflags = 0;
1129           knchild->kn_kevent.flags |= EV_ONESHOT;
1130           knchild->kn_kevent.fflags = NOTE_CHILD;
1131           knchild->kn_kevent.data = p1->p_pid;               /* parent */
1132 
1133           mutex_enter(&fdp->fd_lock);
1134 
1135           /*
1136            * We need to check to see if the kq is closing, and skip
1137            * attaching the knote if so.  Normally, this isn't necessary
1138            * when coming in the front door because the file descriptor
1139            * layer will synchronize this.
1140            *
1141            * It's safe to test KQ_CLOSING without taking the kq_lock
1142            * here because that flag is only ever set when the fd_lock
1143            * is also held.
1144            */
1145           if (__predict_false(kq->kq_count & KQ_CLOSING)) {
1146                     mutex_exit(&fdp->fd_lock);
1147                     goto out;
1148           }
1149 
1150           /*
1151            * We do the "insert into FD table" and "attach to klist" steps
1152            * in the opposite order of kqueue_register() here to avoid
1153            * having to take p2->p_lock twice.  But this is OK because we
1154            * hold fd_lock across the entire operation.
1155            */
1156 
1157           mutex_enter(p2->p_lock);
1158           error = kauth_authorize_process(curlwp->l_cred,
1159               KAUTH_PROCESS_KEVENT_FILTER, p2, NULL, NULL, NULL);
1160           if (__predict_false(error != 0)) {
1161                     mutex_exit(p2->p_lock);
1162                     mutex_exit(&fdp->fd_lock);
1163                     error = EACCES;
1164                     goto out;
1165           }
1166           klist_insert(&p2->p_klist, kntrack);
1167           mutex_exit(p2->p_lock);
1168 
1169           KASSERT(fdp->fd_knhashmask != 0);
1170           KASSERT(fdp->fd_knhash != NULL);
1171           struct klist *list = &fdp->fd_knhash[KN_HASH(kntrack->kn_id,
1172               fdp->fd_knhashmask)];
1173           SLIST_INSERT_HEAD(list, kntrack, kn_link);
1174           SLIST_INSERT_HEAD(list, knchild, kn_link);
1175 
1176           /* This adds references for knchild *and* kntrack. */
1177           atomic_add_int(&kntrack->kn_kfilter->refcnt, 2);
1178 
1179           knote_activate(knchild);
1180 
1181           kntrack = NULL;
1182           knchild = NULL;
1183 
1184           mutex_exit(&fdp->fd_lock);
1185 
1186  out:
1187           if (__predict_false(knchild != NULL)) {
1188                     knote_free(knchild);
1189           }
1190           if (__predict_false(kntrack != NULL)) {
1191                     knote_free(kntrack);
1192           }
1193           mutex_enter(p1->p_lock);
1194           mutex_spin_enter(&kq->kq_lock);
1195 
1196           if (kn_leave_flux(okn)) {
1197                     KQ_FLUX_WAKEUP(kq);
1198           }
1199 
1200           return error;
1201 }
1202 
1203 void
knote_proc_fork(struct proc * p1,struct proc * p2)1204 knote_proc_fork(struct proc *p1, struct proc *p2)
1205 {
1206           struct knote *kn;
1207           struct kqueue *kq;
1208           uint32_t fflags;
1209 
1210           mutex_enter(p1->p_lock);
1211 
1212           /*
1213            * N.B. We DO NOT use SLIST_FOREACH_SAFE() here because we
1214            * don't want to pre-fetch the next knote; in the event we
1215            * have to drop p_lock, we will have put the knote in-flux,
1216            * meaning that no one will be able to detach it until we
1217            * have taken the knote out of flux.  However, that does
1218            * NOT stop someone else from detaching the next note in the
1219            * list while we have it unlocked.  Thus, we want to fetch
1220            * the next note in the list only after we have re-acquired
1221            * the lock, and using SLIST_FOREACH() will satisfy that.
1222            */
1223           SLIST_FOREACH(kn, &p1->p_klist, kn_selnext) {
1224                     /* N.B. EVFILT_SIGNAL knotes are on this same list. */
1225                     if (kn->kn_fop == &sig_filtops) {
1226                               continue;
1227                     }
1228                     KASSERT(kn->kn_fop == &proc_filtops);
1229 
1230                     kq = kn->kn_kq;
1231                     mutex_spin_enter(&kq->kq_lock);
1232                     kn->kn_fflags |= (kn->kn_sfflags & NOTE_FORK);
1233                     if (__predict_false(kn->kn_sfflags & NOTE_TRACK)) {
1234                               /*
1235                                * This will drop kq_lock and p_lock and
1236                                * re-acquire them before it returns.
1237                                */
1238                               if (knote_proc_fork_track(p1, p2, kn)) {
1239                                         kn->kn_fflags |= NOTE_TRACKERR;
1240                               }
1241                               KASSERT(mutex_owned(p1->p_lock));
1242                               KASSERT(mutex_owned(&kq->kq_lock));
1243                     }
1244                     fflags = kn->kn_fflags;
1245                     if (fflags) {
1246                               knote_activate_locked(kn);
1247                     }
1248                     mutex_spin_exit(&kq->kq_lock);
1249           }
1250 
1251           mutex_exit(p1->p_lock);
1252 }
1253 
1254 void
knote_proc_exit(struct proc * p)1255 knote_proc_exit(struct proc *p)
1256 {
1257           struct knote *kn;
1258           struct kqueue *kq;
1259 
1260           KASSERT(mutex_owned(p->p_lock));
1261 
1262           while (!SLIST_EMPTY(&p->p_klist)) {
1263                     kn = SLIST_FIRST(&p->p_klist);
1264                     kq = kn->kn_kq;
1265 
1266                     KASSERT(kn->kn_obj == p);
1267 
1268                     mutex_spin_enter(&kq->kq_lock);
1269                     kn->kn_data = P_WAITSTATUS(p);
1270                     /*
1271                      * Mark as ONESHOT, so that the knote is g/c'ed
1272                      * when read.
1273                      */
1274                     kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1275                     kn->kn_fflags |= kn->kn_sfflags & NOTE_EXIT;
1276 
1277                     /*
1278                      * Detach the knote from the process and mark it as such.
1279                      * N.B. EVFILT_SIGNAL are also on p_klist, but by the
1280                      * time we get here, all open file descriptors for this
1281                      * process have been released, meaning that signal knotes
1282                      * will have already been detached.
1283                      *
1284                      * We need to synchronize this with filt_procdetach().
1285                      */
1286                     KASSERT(kn->kn_fop == &proc_filtops);
1287                     if ((kn->kn_status & KN_DETACHED) == 0) {
1288                               kn->kn_status |= KN_DETACHED;
1289                               SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext);
1290                     }
1291 
1292                     /*
1293                      * Always activate the knote for NOTE_EXIT regardless
1294                      * of whether or not the listener cares about it.
1295                      * This matches historical behavior.
1296                      */
1297                     knote_activate_locked(kn);
1298                     mutex_spin_exit(&kq->kq_lock);
1299           }
1300 }
1301 
1302 #define   FILT_TIMER_NOSCHED  ((uintptr_t)-1)
1303 
1304 static int
filt_timercompute(struct kevent * kev,uintptr_t * tticksp)1305 filt_timercompute(struct kevent *kev, uintptr_t *tticksp)
1306 {
1307           struct timespec ts;
1308           uintptr_t tticks;
1309 
1310           if (kev->fflags & ~(NOTE_TIMER_UNITMASK | NOTE_ABSTIME)) {
1311                     return EINVAL;
1312           }
1313 
1314           /*
1315            * Convert the event 'data' to a timespec, then convert the
1316            * timespec to callout ticks.
1317            */
1318           switch (kev->fflags & NOTE_TIMER_UNITMASK) {
1319           case NOTE_SECONDS:
1320                     ts.tv_sec = kev->data;
1321                     ts.tv_nsec = 0;
1322                     break;
1323 
1324           case NOTE_MSECONDS:           /* == historical value 0 */
1325                     ts.tv_sec = kev->data / 1000;
1326                     ts.tv_nsec = (kev->data % 1000) * 1000000;
1327                     break;
1328 
1329           case NOTE_USECONDS:
1330                     ts.tv_sec = kev->data / 1000000;
1331                     ts.tv_nsec = (kev->data % 1000000) * 1000;
1332                     break;
1333 
1334           case NOTE_NSECONDS:
1335                     ts.tv_sec = kev->data / 1000000000;
1336                     ts.tv_nsec = kev->data % 1000000000;
1337                     break;
1338 
1339           default:
1340                     return EINVAL;
1341           }
1342 
1343           if (kev->fflags & NOTE_ABSTIME) {
1344                     struct timespec deadline = ts;
1345 
1346                     /*
1347                      * Get current time.
1348                      *
1349                      * XXX This is CLOCK_REALTIME.  There is no way to
1350                      * XXX specify CLOCK_MONOTONIC.
1351                      */
1352                     nanotime(&ts);
1353 
1354                     /* Absolute timers do not repeat. */
1355                     kev->data = FILT_TIMER_NOSCHED;
1356 
1357                     /* If we're past the deadline, then the event will fire. */
1358                     if (timespeccmp(&deadline, &ts, <=)) {
1359                               tticks = FILT_TIMER_NOSCHED;
1360                               goto out;
1361                     }
1362 
1363                     /* Calculate how much time is left. */
1364                     timespecsub(&deadline, &ts, &ts);
1365           } else {
1366                     /* EV_CLEAR automatically set for relative timers. */
1367                     kev->flags |= EV_CLEAR;
1368           }
1369 
1370           tticks = tstohz(&ts);
1371 
1372           /* if the supplied value is under our resolution, use 1 tick */
1373           if (tticks == 0) {
1374                     if (kev->data == 0)
1375                               return EINVAL;
1376                     tticks = 1;
1377           } else if (tticks > INT_MAX) {
1378                     return EINVAL;
1379           }
1380 
1381           if ((kev->flags & EV_ONESHOT) != 0) {
1382                     /* Timer does not repeat. */
1383                     kev->data = FILT_TIMER_NOSCHED;
1384           } else {
1385                     KASSERT((uintptr_t)tticks != FILT_TIMER_NOSCHED);
1386                     kev->data = tticks;
1387           }
1388 
1389  out:
1390           *tticksp = tticks;
1391 
1392           return 0;
1393 }
1394 
1395 static void
filt_timerexpire(void * knx)1396 filt_timerexpire(void *knx)
1397 {
1398           struct knote *kn = knx;
1399           struct kqueue *kq = kn->kn_kq;
1400 
1401           mutex_spin_enter(&kq->kq_lock);
1402           kn->kn_data++;
1403           knote_activate_locked(kn);
1404           if (kn->kn_sdata != FILT_TIMER_NOSCHED) {
1405                     KASSERT(kn->kn_sdata > 0);
1406                     KASSERT(kn->kn_sdata <= INT_MAX);
1407                     callout_schedule((callout_t *)kn->kn_hook,
1408                         (int)kn->kn_sdata);
1409           }
1410           mutex_spin_exit(&kq->kq_lock);
1411 }
1412 
1413 static inline void
filt_timerstart(struct knote * kn,uintptr_t tticks)1414 filt_timerstart(struct knote *kn, uintptr_t tticks)
1415 {
1416           callout_t *calloutp = kn->kn_hook;
1417 
1418           KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
1419           KASSERT(!callout_pending(calloutp));
1420 
1421           if (__predict_false(tticks == FILT_TIMER_NOSCHED)) {
1422                     kn->kn_data = 1;
1423           } else {
1424                     KASSERT(tticks <= INT_MAX);
1425                     callout_reset(calloutp, (int)tticks, filt_timerexpire, kn);
1426           }
1427 }
1428 
1429 static int
filt_timerattach(struct knote * kn)1430 filt_timerattach(struct knote *kn)
1431 {
1432           callout_t *calloutp;
1433           struct kqueue *kq;
1434           uintptr_t tticks;
1435           int error;
1436 
1437           struct kevent kev = {
1438                     .flags = kn->kn_flags,
1439                     .fflags = kn->kn_sfflags,
1440                     .data = kn->kn_sdata,
1441           };
1442 
1443           error = filt_timercompute(&kev, &tticks);
1444           if (error) {
1445                     return error;
1446           }
1447 
1448           if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
1449               (calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
1450                     atomic_dec_uint(&kq_ncallouts);
1451                     return ENOMEM;
1452           }
1453           callout_init(calloutp, CALLOUT_MPSAFE);
1454 
1455           kq = kn->kn_kq;
1456           mutex_spin_enter(&kq->kq_lock);
1457 
1458           kn->kn_sdata = kev.data;
1459           kn->kn_flags = kev.flags;
1460           KASSERT(kn->kn_sfflags == kev.fflags);
1461           kn->kn_hook = calloutp;
1462 
1463           filt_timerstart(kn, tticks);
1464 
1465           mutex_spin_exit(&kq->kq_lock);
1466 
1467           return (0);
1468 }
1469 
1470 static void
filt_timerdetach(struct knote * kn)1471 filt_timerdetach(struct knote *kn)
1472 {
1473           callout_t *calloutp;
1474           struct kqueue *kq = kn->kn_kq;
1475 
1476           /* prevent rescheduling when we expire */
1477           mutex_spin_enter(&kq->kq_lock);
1478           kn->kn_sdata = FILT_TIMER_NOSCHED;
1479           mutex_spin_exit(&kq->kq_lock);
1480 
1481           calloutp = (callout_t *)kn->kn_hook;
1482 
1483           /*
1484            * Attempt to stop the callout.  This will block if it's
1485            * already running.
1486            */
1487           callout_halt(calloutp, NULL);
1488 
1489           callout_destroy(calloutp);
1490           kmem_free(calloutp, sizeof(*calloutp));
1491           atomic_dec_uint(&kq_ncallouts);
1492 }
1493 
1494 static int
filt_timertouch(struct knote * kn,struct kevent * kev,long type)1495 filt_timertouch(struct knote *kn, struct kevent *kev, long type)
1496 {
1497           struct kqueue *kq = kn->kn_kq;
1498           callout_t *calloutp;
1499           uintptr_t tticks;
1500           int error;
1501 
1502           KASSERT(mutex_owned(&kq->kq_lock));
1503 
1504           switch (type) {
1505           case EVENT_REGISTER:
1506                     /* Only relevant for EV_ADD. */
1507                     if ((kev->flags & EV_ADD) == 0) {
1508                               return 0;
1509                     }
1510 
1511                     /*
1512                      * Stop the timer, under the assumption that if
1513                      * an application is re-configuring the timer,
1514                      * they no longer care about the old one.  We
1515                      * can safely drop the kq_lock while we wait
1516                      * because fdp->fd_lock will be held throughout,
1517                      * ensuring that no one can sneak in with an
1518                      * EV_DELETE or close the kq.
1519                      */
1520                     KASSERT(mutex_owned(&kq->kq_fdp->fd_lock));
1521 
1522                     calloutp = kn->kn_hook;
1523                     callout_halt(calloutp, &kq->kq_lock);
1524                     KASSERT(mutex_owned(&kq->kq_lock));
1525                     knote_deactivate_locked(kn);
1526                     kn->kn_data = 0;
1527 
1528                     error = filt_timercompute(kev, &tticks);
1529                     if (error) {
1530                               return error;
1531                     }
1532                     kn->kn_sdata = kev->data;
1533                     kn->kn_flags = kev->flags;
1534                     kn->kn_sfflags = kev->fflags;
1535                     filt_timerstart(kn, tticks);
1536                     break;
1537 
1538           case EVENT_PROCESS:
1539                     *kev = kn->kn_kevent;
1540                     break;
1541 
1542           default:
1543                     panic("%s: invalid type (%ld)", __func__, type);
1544           }
1545 
1546           return 0;
1547 }
1548 
1549 static int
filt_timer(struct knote * kn,long hint)1550 filt_timer(struct knote *kn, long hint)
1551 {
1552           struct kqueue *kq = kn->kn_kq;
1553           int rv;
1554 
1555           mutex_spin_enter(&kq->kq_lock);
1556           rv = (kn->kn_data != 0);
1557           mutex_spin_exit(&kq->kq_lock);
1558 
1559           return rv;
1560 }
1561 
1562 static int
filt_userattach(struct knote * kn)1563 filt_userattach(struct knote *kn)
1564 {
1565           struct kqueue *kq = kn->kn_kq;
1566 
1567           /*
1568            * EVFILT_USER knotes are not attached to anything in the kernel.
1569            */
1570           mutex_spin_enter(&kq->kq_lock);
1571           kn->kn_hook = NULL;
1572           if (kn->kn_fflags & NOTE_TRIGGER)
1573                     kn->kn_hookid = 1;
1574           else
1575                     kn->kn_hookid = 0;
1576           mutex_spin_exit(&kq->kq_lock);
1577           return (0);
1578 }
1579 
/*
 * Detach method for EVFILT_USER.  Intentionally empty: a user knote is
 * not attached to anything in the kernel, so there is nothing to undo.
 */
static void
filt_userdetach(struct knote *kn)
{
}
1588 
1589 static int
filt_user(struct knote * kn,long hint)1590 filt_user(struct knote *kn, long hint)
1591 {
1592           struct kqueue *kq = kn->kn_kq;
1593           int hookid;
1594 
1595           mutex_spin_enter(&kq->kq_lock);
1596           hookid = kn->kn_hookid;
1597           mutex_spin_exit(&kq->kq_lock);
1598 
1599           return hookid;
1600 }
1601 
1602 static int
filt_usertouch(struct knote * kn,struct kevent * kev,long type)1603 filt_usertouch(struct knote *kn, struct kevent *kev, long type)
1604 {
1605           int ffctrl;
1606 
1607           KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
1608 
1609           switch (type) {
1610           case EVENT_REGISTER:
1611                     if (kev->fflags & NOTE_TRIGGER)
1612                               kn->kn_hookid = 1;
1613 
1614                     ffctrl = kev->fflags & NOTE_FFCTRLMASK;
1615                     kev->fflags &= NOTE_FFLAGSMASK;
1616                     switch (ffctrl) {
1617                     case NOTE_FFNOP:
1618                               break;
1619 
1620                     case NOTE_FFAND:
1621                               kn->kn_sfflags &= kev->fflags;
1622                               break;
1623 
1624                     case NOTE_FFOR:
1625                               kn->kn_sfflags |= kev->fflags;
1626                               break;
1627 
1628                     case NOTE_FFCOPY:
1629                               kn->kn_sfflags = kev->fflags;
1630                               break;
1631 
1632                     default:
1633                               /* XXX Return error? */
1634                               break;
1635                     }
1636                     kn->kn_sdata = kev->data;
1637                     if (kev->flags & EV_CLEAR) {
1638                               kn->kn_hookid = 0;
1639                               kn->kn_data = 0;
1640                               kn->kn_fflags = 0;
1641                     }
1642                     break;
1643 
1644           case EVENT_PROCESS:
1645                     *kev = kn->kn_kevent;
1646                     kev->fflags = kn->kn_sfflags;
1647                     kev->data = kn->kn_sdata;
1648                     if (kn->kn_flags & EV_CLEAR) {
1649                               kn->kn_hookid = 0;
1650                               kn->kn_data = 0;
1651                               kn->kn_fflags = 0;
1652                     }
1653                     break;
1654 
1655           default:
1656                     panic("filt_usertouch() - invalid type (%ld)", type);
1657                     break;
1658           }
1659 
1660           return 0;
1661 }
1662 
1663 /*
1664  * filt_seltrue:
1665  *
1666  *        This filter "event" routine simulates seltrue().
1667  */
1668 int
filt_seltrue(struct knote * kn,long hint)1669 filt_seltrue(struct knote *kn, long hint)
1670 {
1671 
1672           /*
1673            * We don't know how much data can be read/written,
1674            * but we know that it *can* be.  This is about as
1675            * good as select/poll does as well.
1676            */
1677           kn->kn_data = 0;
1678           return (1);
1679 }
1680 
/*
 * filt_seltruedetach:
 *
 *	Detach hook paired with filt_seltrue().  Together they provide a
 *	full kqfilter entry for device switch tables, with the same
 *	effect as a filter that uses filt_seltrue() as its event method.
 *	There is no state to tear down.
 */
static void
filt_seltruedetach(struct knote *kn)
{

	/* Intentionally empty. */
}
1690 
1691 const struct filterops seltrue_filtops = {
1692           .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
1693           .f_attach = NULL,
1694           .f_detach = filt_seltruedetach,
1695           .f_event = filt_seltrue,
1696 };
1697 
1698 int
seltrue_kqfilter(dev_t dev,struct knote * kn)1699 seltrue_kqfilter(dev_t dev, struct knote *kn)
1700 {
1701           switch (kn->kn_filter) {
1702           case EVFILT_READ:
1703           case EVFILT_WRITE:
1704                     kn->kn_fop = &seltrue_filtops;
1705                     break;
1706           default:
1707                     return (EINVAL);
1708           }
1709 
1710           /* Nothing more to do */
1711           return (0);
1712 }
1713 
1714 /*
1715  * kqueue(2) system call.
1716  */
1717 static int
kqueue1(struct lwp * l,int flags,register_t * retval)1718 kqueue1(struct lwp *l, int flags, register_t *retval)
1719 {
1720           struct kqueue *kq;
1721           file_t *fp;
1722           int fd, error;
1723 
1724           if ((error = fd_allocfile(&fp, &fd)) != 0)
1725                     return error;
1726           fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
1727           fp->f_type = DTYPE_KQUEUE;
1728           fp->f_ops = &kqueueops;
1729           kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
1730           mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
1731           cv_init(&kq->kq_cv, "kqueue");
1732           selinit(&kq->kq_sel);
1733           TAILQ_INIT(&kq->kq_head);
1734           fp->f_kqueue = kq;
1735           *retval = fd;
1736           kq->kq_fdp = curlwp->l_fd;
1737           fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
1738           fd_affix(curproc, fp, fd);
1739           return error;
1740 }
1741 
1742 /*
1743  * kqueue(2) system call.
1744  */
1745 int
sys_kqueue(struct lwp * l,const void * v,register_t * retval)1746 sys_kqueue(struct lwp *l, const void *v, register_t *retval)
1747 {
1748           return kqueue1(l, 0, retval);
1749 }
1750 
1751 int
sys_kqueue1(struct lwp * l,const struct sys_kqueue1_args * uap,register_t * retval)1752 sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
1753     register_t *retval)
1754 {
1755           /* {
1756                     syscallarg(int) flags;
1757           } */
1758           return kqueue1(l, SCARG(uap, flags), retval);
1759 }
1760 
1761 /*
1762  * kevent(2) system call.
1763  */
1764 int
kevent_fetch_changes(void * ctx,const struct kevent * changelist,struct kevent * changes,size_t index,int n)1765 kevent_fetch_changes(void *ctx, const struct kevent *changelist,
1766     struct kevent *changes, size_t index, int n)
1767 {
1768 
1769           return copyin(changelist + index, changes, n * sizeof(*changes));
1770 }
1771 
1772 int
kevent_put_events(void * ctx,struct kevent * events,struct kevent * eventlist,size_t index,int n)1773 kevent_put_events(void *ctx, struct kevent *events,
1774     struct kevent *eventlist, size_t index, int n)
1775 {
1776 
1777           return copyout(events, eventlist + index, n * sizeof(*events));
1778 }
1779 
1780 static const struct kevent_ops kevent_native_ops = {
1781           .keo_private = NULL,
1782           .keo_fetch_timeout = copyin,
1783           .keo_fetch_changes = kevent_fetch_changes,
1784           .keo_put_events = kevent_put_events,
1785 };
1786 
1787 int
sys___kevent100(struct lwp * l,const struct sys___kevent100_args * uap,register_t * retval)1788 sys___kevent100(struct lwp *l, const struct sys___kevent100_args *uap,
1789     register_t *retval)
1790 {
1791           /* {
1792                     syscallarg(int) fd;
1793                     syscallarg(const struct kevent *) changelist;
1794                     syscallarg(size_t) nchanges;
1795                     syscallarg(struct kevent *) eventlist;
1796                     syscallarg(size_t) nevents;
1797                     syscallarg(const struct timespec *) timeout;
1798           } */
1799 
1800           return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
1801               SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
1802               SCARG(uap, timeout), &kevent_native_ops);
1803 }
1804 
1805 int
kevent1(register_t * retval,int fd,const struct kevent * changelist,size_t nchanges,struct kevent * eventlist,size_t nevents,const struct timespec * timeout,const struct kevent_ops * keops)1806 kevent1(register_t *retval, int fd,
1807           const struct kevent *changelist, size_t nchanges,
1808           struct kevent *eventlist, size_t nevents,
1809           const struct timespec *timeout,
1810           const struct kevent_ops *keops)
1811 {
1812           struct kevent *kevp;
1813           struct kqueue *kq;
1814           struct timespec     ts;
1815           size_t i, n, ichange;
1816           int nerrors, error;
1817           struct kevent kevbuf[KQ_NEVENTS];       /* approx 300 bytes on 64-bit */
1818           file_t *fp;
1819 
1820           /* check that we're dealing with a kq */
1821           fp = fd_getfile(fd);
1822           if (fp == NULL)
1823                     return (EBADF);
1824 
1825           if (fp->f_type != DTYPE_KQUEUE) {
1826                     fd_putfile(fd);
1827                     return (EBADF);
1828           }
1829 
1830           if (timeout != NULL) {
1831                     error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
1832                     if (error)
1833                               goto done;
1834                     timeout = &ts;
1835           }
1836 
1837           kq = fp->f_kqueue;
1838           nerrors = 0;
1839           ichange = 0;
1840 
1841           /* traverse list of events to register */
1842           while (nchanges > 0) {
1843                     n = MIN(nchanges, __arraycount(kevbuf));
1844                     error = (*keops->keo_fetch_changes)(keops->keo_private,
1845                         changelist, kevbuf, ichange, n);
1846                     if (error)
1847                               goto done;
1848                     for (i = 0; i < n; i++) {
1849                               kevp = &kevbuf[i];
1850                               kevp->flags &= ~EV_SYSFLAGS;
1851                               /* register each knote */
1852                               error = kqueue_register(kq, kevp);
1853                               if (!error && !(kevp->flags & EV_RECEIPT))
1854                                         continue;
1855                               if (nevents == 0)
1856                                         goto done;
1857                               kevp->flags = EV_ERROR;
1858                               kevp->data = error;
1859                               error = (*keops->keo_put_events)
1860                                         (keops->keo_private, kevp,
1861                                          eventlist, nerrors, 1);
1862                               if (error)
1863                                         goto done;
1864                               nevents--;
1865                               nerrors++;
1866                     }
1867                     nchanges -= n;      /* update the results */
1868                     ichange += n;
1869           }
1870           if (nerrors) {
1871                     *retval = nerrors;
1872                     error = 0;
1873                     goto done;
1874           }
1875 
1876           /* actually scan through the events */
1877           error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
1878               kevbuf, __arraycount(kevbuf));
1879  done:
1880           fd_putfile(fd);
1881           return (error);
1882 }
1883 
/*
 * kqueue_register:
 *
 *	Apply a single change request (kev) to the kqueue kq: add a new
 *	knote (EV_ADD), delete an existing one (EV_DELETE), or update an
 *	existing one, then honor EV_DISABLE / EV_ENABLE.
 *
 *	A spare knote is allocated before any lock is taken; it is
 *	consumed only when a new knote is created and is freed on exit
 *	otherwise.  kqueue_filter_lock is held as a reader from the
 *	filter lookup until return.  For FILTEROP_ISFD filters kev->ident
 *	is treated as a file descriptor and a reference is taken with
 *	fd_getfile(); that reference is dropped on exit unless it was
 *	handed over to a newly created knote.
 *
 *	Returns 0 on success.  Errors set directly here are EINVAL
 *	(unknown or unimplemented filter), EBADF (bad or closing
 *	descriptor), ENOENT (no matching knote and EV_ADD not set) and
 *	ENOTSUP (filter has f_touch but is not on the allowed list);
 *	errors from filter_attach() and filter_touch() are passed
 *	through.  Note that kev->fflags and kev->data are zeroed when a
 *	new knote is created.
 */
1884 /*
1885  * Register a given kevent kev onto the kqueue
1886  */
1887 static int
kqueue_register(struct kqueue * kq,struct kevent * kev)1888 kqueue_register(struct kqueue *kq, struct kevent *kev)
1889 {
1890           struct kfilter *kfilter;
1891           filedesc_t *fdp;
1892           file_t *fp;
1893           fdfile_t *ff;
1894           struct knote *kn, *newkn;
1895           struct klist *list;
1896           int error, fd, rv;
1897 
1898           fdp = kq->kq_fdp;
1899           fp = NULL;
1900           kn = NULL;
1901           error = 0;
1902           fd = 0;
1903 
          /* Allocated up front; only the EV_ADD path below consumes it. */
1904           newkn = knote_alloc(true);
1905 
1906           rw_enter(&kqueue_filter_lock, RW_READER);
1907           kfilter = kfilter_byfilter(kev->filter);
1908           if (kfilter == NULL || kfilter->filtops == NULL) {
1909                     /* filter not found nor implemented */
1910                     rw_exit(&kqueue_filter_lock);
1911                     knote_free(newkn);
1912                     return (EINVAL);
1913           }
1914 
1915           /* search if knote already exists */
1916           if (kfilter->filtops->f_flags & FILTEROP_ISFD) {
1917                     /* monitoring a file descriptor */
1918                     /* validate descriptor */
1919                     if (kev->ident > INT_MAX
1920                         || (fp = fd_getfile(fd = kev->ident)) == NULL) {
1921                               rw_exit(&kqueue_filter_lock);
1922                               knote_free(newkn);
1923                               return EBADF;
1924                     }
1925                     mutex_enter(&fdp->fd_lock);
1926                     ff = fdp->fd_dt->dt_ff[fd];
1927                     if (ff->ff_refcnt & FR_CLOSING) {
1928                               error = EBADF;
1929                               goto doneunlock;
1930                     }
                    /*
                     * Descriptors above fd_lastkqfile are not searched;
                     * kn stays NULL for them.
                     */
1931                     if (fd <= fdp->fd_lastkqfile) {
1932                               SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
1933                                         if (kq == kn->kn_kq &&
1934                                             kev->filter == kn->kn_filter)
1935                                                   break;
1936                               }
1937                     }
1938           } else {
1939                     /*
1940                      * not monitoring a file descriptor, so
1941                      * lookup knotes in internal hash table
1942                      */
1943                     mutex_enter(&fdp->fd_lock);
1944                     if (fdp->fd_knhashmask != 0) {
1945                               list = &fdp->fd_knhash[
1946                                   KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
1947                               SLIST_FOREACH(kn, list, kn_link) {
1948                                         if (kev->ident == kn->kn_id &&
1949                                             kq == kn->kn_kq &&
1950                                             kev->filter == kn->kn_filter)
1951                                                   break;
1952                               }
1953                     }
1954           }
1955 
1956           /* It's safe to test KQ_CLOSING while holding only the fd_lock. */
1957           KASSERT(mutex_owned(&fdp->fd_lock));
1958           KASSERT((kq->kq_count & KQ_CLOSING) == 0);
1959 
1960           /*
1961            * kn now contains the matching knote, or NULL if no match
1962            */
1963           if (kn == NULL) {
1964                     if (kev->flags & EV_ADD) {
1965                               /* create new knote */
1966                               kn = newkn;
1967                               newkn = NULL;
1968                               kn->kn_obj = fp;
1969                               kn->kn_id = kev->ident;
1970                               kn->kn_kq = kq;
1971                               kn->kn_fop = kfilter->filtops;
1972                               kn->kn_kfilter = kfilter;
                              /*
                               * The caller's fflags/data are saved in the
                               * knote; the copy kept in kn_kevent has both
                               * fields zeroed.
                               */
1973                               kn->kn_sfflags = kev->fflags;
1974                               kn->kn_sdata = kev->data;
1975                               kev->fflags = 0;
1976                               kev->data = 0;
1977                               kn->kn_kevent = *kev;
1978 
1979                               KASSERT(kn->kn_fop != NULL);
1980                               /*
1981                                * XXX Allow only known-safe users of f_touch.
1982                                * XXX See filter_touch() for details.
1983                                */
1984                               if (kn->kn_fop->f_touch != NULL &&
1985                                   kn->kn_fop != &timer_filtops &&
1986                                   kn->kn_fop != &user_filtops) {
1987                                         error = ENOTSUP;
1988                                         goto fail_ev_add;
1989                               }
1990 
1991                               /*
1992                                * apply reference count to knote structure, and
1993                                * do not release it at the end of this routine.
1994                                */
1995                               fp = NULL;
1996 
1997                               if (!(kn->kn_fop->f_flags & FILTEROP_ISFD)) {
1998                                         /*
1999                                          * If knote is not on an fd, store on
2000                                          * internal hash table.
2001                                          */
2002                                         if (fdp->fd_knhashmask == 0) {
2003                                                   /* XXXAD can block with fd_lock held */
2004                                                   fdp->fd_knhash = hashinit(KN_HASHSIZE,
2005                                                       HASH_LIST, true,
2006                                                       &fdp->fd_knhashmask);
2007                                         }
2008                                         list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
2009                                             fdp->fd_knhashmask)];
2010                               } else {
2011                                         /* Otherwise, knote is on an fd. */
2012                                         list = (struct klist *)
2013                                             &fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
2014                                         if ((int)kn->kn_id > fdp->fd_lastkqfile)
2015                                                   fdp->fd_lastkqfile = kn->kn_id;
2016                               }
2017                               SLIST_INSERT_HEAD(list, kn, kn_link);
2018 
2019                               /*
2020                                * N.B. kn->kn_fop may change as the result
2021                                * of filter_attach()!
2022                                */
2023                               knote_foplock_enter(kn);
2024                               error = filter_attach(kn);
2025                               if (error != 0) {
2026 #ifdef DEBUG
2027                                         struct proc *p = curlwp->l_proc;
2028                                         const file_t *ft = kn->kn_obj;
2029                                         printf("%s: %s[%d]: event type %d not "
2030                                             "supported for file type %d/%s "
2031                                             "(error %d)\n", __func__,
2032                                             p->p_comm, p->p_pid,
2033                                             kn->kn_filter, ft ? ft->f_type : -1,
2034                                             ft ? ft->f_ops->fo_name : "?", error);
2035 #endif
2036 
                                        /*
                                         * NOTE(review): this label is also
                                         * reached by the ENOTSUP goto above,
                                         * which jumps here before
                                         * knote_foplock_enter() has run -
                                         * confirm knote_foplock_exit() is
                                         * safe in that case.
                                         */
2037  fail_ev_add:
2038                                         /*
2039                                          * N.B. no need to check for this note to
2040                                          * be in-flux, since it was never visible
2041                                          * to the monitored object.
2042                                          *
2043                                          * knote_detach() drops fdp->fd_lock
2044                                          */
2045                                         knote_foplock_exit(kn);
2046                                         mutex_enter(&kq->kq_lock);
2047                                         KNOTE_WILLDETACH(kn);
2048                                         KASSERT(kn_in_flux(kn) == false);
2049                                         mutex_exit(&kq->kq_lock);
2050                                         knote_detach(kn, fdp, false);
2051                                         goto done;
2052                               }
                              /* Attach succeeded: count this knote as a user of the filter. */
2053                               atomic_inc_uint(&kfilter->refcnt);
2054                               goto done_ev_add;
2055                     } else {
2056                               /* No matching knote and the EV_ADD flag is not set. */
2057                               error = ENOENT;
2058                               goto doneunlock;
2059                     }
2060           }
2061 
2062           if (kev->flags & EV_DELETE) {
2063                     /*
2064                      * Let the world know that this knote is about to go
2065                      * away, and wait for it to settle if it's currently
2066                      * in-flux.
2067                      */
2068                     mutex_spin_enter(&kq->kq_lock);
2069                     if (kn->kn_status & KN_WILLDETACH) {
2070                               /*
2071                                * This knote is already on its way out,
2072                                * so just be done.
2073                                */
2074                               mutex_spin_exit(&kq->kq_lock);
2075                               goto doneunlock;
2076                     }
2077                     KNOTE_WILLDETACH(kn);
2078                     if (kn_in_flux(kn)) {
                              /*
                               * fd_lock is released for the wait and taken
                               * again once kq_lock has been dropped.
                               */
2079                               mutex_exit(&fdp->fd_lock);
2080                               /*
2081                                * It's safe for us to conclusively wait for
2082                                * this knote to settle because we know we'll
2083                                * be completing the detach.
2084                                */
2085                               kn_wait_flux(kn, true);
2086                               KASSERT(kn_in_flux(kn) == false);
2087                               mutex_spin_exit(&kq->kq_lock);
2088                               mutex_enter(&fdp->fd_lock);
2089                     } else {
2090                               mutex_spin_exit(&kq->kq_lock);
2091                     }
2092 
2093                     /* knote_detach() drops fdp->fd_lock */
2094                     knote_detach(kn, fdp, true);
2095                     goto done;
2096           }
2097 
2098           /*
2099            * The user may change some filter values after the
2100            * initial EV_ADD, but doing so will not reset any
2101            * filter which have already been triggered.
2102            */
2103           knote_foplock_enter(kn);
2104           kn->kn_kevent.udata = kev->udata;
2105           KASSERT(kn->kn_fop != NULL);
          /*
           * Non-fd filters that supply f_touch get to interpret the
           * update themselves; everything else just has fflags/data
           * overwritten.
           */
2106           if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
2107               kn->kn_fop->f_touch != NULL) {
2108                     mutex_spin_enter(&kq->kq_lock);
2109                     error = filter_touch(kn, kev, EVENT_REGISTER);
2110                     mutex_spin_exit(&kq->kq_lock);
2111                     if (__predict_false(error != 0)) {
2112                               /* Never a new knote (which would consume newkn). */
2113                               KASSERT(newkn != NULL);
2114                               knote_foplock_exit(kn);
2115                               goto doneunlock;
2116                     }
2117           } else {
2118                     kn->kn_sfflags = kev->fflags;
2119                     kn->kn_sdata = kev->data;
2120           }
2121 
2122           /*
2123            * We can get here if we are trying to attach
2124            * an event to a file descriptor that does not
2125            * support events, and the attach routine is
2126            * broken and does not return an error.
2127            */
          /* Reached with the knote's foplock held on both paths. */
2128  done_ev_add:
2129           rv = filter_event(kn, 0, false);
2130           if (rv)
2131                     knote_activate(kn);
2132 
2133           knote_foplock_exit(kn);
2134 
2135           /* disable knote */
2136           if ((kev->flags & EV_DISABLE)) {
2137                     mutex_spin_enter(&kq->kq_lock);
2138                     if ((kn->kn_status & KN_DISABLED) == 0)
2139                               kn->kn_status |= KN_DISABLED;
2140                     mutex_spin_exit(&kq->kq_lock);
2141           }
2142 
2143           /* enable knote */
2144           if ((kev->flags & EV_ENABLE)) {
2145                     knote_enqueue(kn);
2146           }
          /* Exit path for code that still holds fdp->fd_lock. */
2147  doneunlock:
2148           mutex_exit(&fdp->fd_lock);
          /*
           * Exit path for code that no longer holds fd_lock (per the
           * comments above, knote_detach() has dropped it).
           */
2149  done:
2150           rw_exit(&kqueue_filter_lock);
2151           if (newkn != NULL)
2152                     knote_free(newkn);
2153           if (fp != NULL)
2154                     fd_putfile(fd);
2155           return (error);
2156 }
2157 
/*
 * KN_FMT(buf, kn):
 *
 *	Format kn->kn_status into buf with snprintb() using
 *	__KN_FLAG_BITS, and evaluate to buf so the result can be used
 *	directly as a string argument.  The size passed to snprintb()
 *	is sizeof(buf), so buf must be an array, not a pointer.
 */
2158 #define KN_FMT(buf, kn) \
2159     (snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
2160 
#if defined(DDB)
/*
 * kqueue_printit:
 *
 *	Debugger helper: print a summary of kq followed by one entry for
 *	every knote on its queue, using the supplied printf-like
 *	function pr.  Entries whose kn_kq does not point back at kq are
 *	flagged, as is a mismatch between the number of non-marker
 *	knotes seen and KQ_COUNT(kq).
 *
 *	The `full' argument is currently not consulted.
 */
void
kqueue_printit(struct kqueue *kq, bool full, void (*pr)(const char *, ...))
{
	const struct knote *kn;
	u_int count = 0;
	char buf[128];

	(*pr)("kqueue %p (restart=%d count=%u):\n", kq,
	    !!(kq->kq_count & KQ_RESTART), KQ_COUNT(kq));
	(*pr)("  Queued knotes:\n");
	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		/* Markers are printed but left out of the tally. */
		if ((kn->kn_status & KN_MARKER) == 0)
			count++;

		(*pr)("    knote %p: kq=%p status=%s\n",
		    kn, kn->kn_kq, KN_FMT(buf, kn));
		(*pr)("      id=0x%lx (%lu) filter=%d\n",
		    (u_long)kn->kn_id, (u_long)kn->kn_id, kn->kn_filter);
		if (kn->kn_kq != kq)
			(*pr)("      !!! kn->kn_kq != kq\n");
	}
	if (count != KQ_COUNT(kq)) {
		(*pr)("  !!! count(%u) != KQ_COUNT(%u)\n",
		    count, KQ_COUNT(kq));
	}
}
#endif /* DDB */
2196 
#if defined(DEBUG)
/*
 * kqueue_check:
 *
 *	Consistency check of a kqueue's queue; panics on the first
 *	violation found.  The caller must hold kq->kq_lock (asserted).
 *	func and line identify the call site in the panic message.
 *
 *	Checked for every entry: it is a marker or is flagged KN_QUEUED.
 *	Checked for every non-marker entry: its kn_kq points back at kq,
 *	it is KN_ACTIVE, and the running tally of such entries never
 *	exceeds KQ_COUNT(kq).
 *
 *	Use through the kq_check() macro, which supplies the call site
 *	and expands to nothing when DEBUG is not defined.
 */
static void
kqueue_check(const char *func, size_t line, const struct kqueue *kq)
{
	const struct knote *kn;
	u_int count = 0;
	int nmarker = 0;
	char buf[128];

	KASSERT(mutex_owned(&kq->kq_lock));

	TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
		if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
			panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
			    func, line, kq, kn, KN_FMT(buf, kn));
		}

		if ((kn->kn_status & KN_MARKER) != 0) {
			nmarker++;
			continue;
		}

		if (kn->kn_kq != kq) {
			panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
			    func, line, kq, kn, kn->kn_kq,
			    KN_FMT(buf, kn));
		}
		if ((kn->kn_status & KN_ACTIVE) == 0) {
			panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
			    func, line, kq, kn, KN_FMT(buf, kn));
		}
		count++;
		if (count > KQ_COUNT(kq)) {
			/* count is unsigned, so it is printed with %u. */
			panic("%s,%zu: kq=%p kq->kq_count(%u) != "
			    "count(%u), nmarker=%d",
			    func, line, kq, KQ_COUNT(kq), count,
			    nmarker);
		}
	}
}
#define kq_check(a) kqueue_check(__func__, __LINE__, (a))
#else /* defined(DEBUG) */
#define	kq_check(a)	/* nothing */
#endif /* defined(DEBUG) */
2241 
2242 static void
kqueue_restart(file_t * fp)2243 kqueue_restart(file_t *fp)
2244 {
2245           struct kqueue *kq = fp->f_kqueue;
2246           KASSERT(kq != NULL);
2247 
2248           mutex_spin_enter(&kq->kq_lock);
2249           kq->kq_count |= KQ_RESTART;
2250           cv_broadcast(&kq->kq_cv);
2251           mutex_spin_exit(&kq->kq_lock);
2252 }
2253 
2254 static int
kqueue_fpathconf(struct file * fp,int name,register_t * retval)2255 kqueue_fpathconf(struct file *fp, int name, register_t *retval)
2256 {
2257 
2258           return EINVAL;
2259 }
2260 
2261 /*
2262  * Scan through the list of events on fp (for a maximum of maxevents),
2263  * returning the results into ulistp.  The timeout is determined by tsp:
2264  * if NULL, wait indefinitely; if zero-valued, perform a poll; otherwise
2265  * wait as appropriate.
2266  */
2267 static int
kqueue_scan(file_t * fp,size_t maxevents,struct kevent * ulistp,const struct timespec * tsp,register_t * retval,const struct kevent_ops * keops,struct kevent * kevbuf,size_t kevcnt)2268 kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
2269               const struct timespec *tsp, register_t *retval,
2270               const struct kevent_ops *keops, struct kevent *kevbuf,
2271               size_t kevcnt)
2272 {
2273           struct kqueue       *kq;
2274           struct kevent       *kevp;
2275           struct timespec     ats, sleepts;
2276           struct knote        *kn, *marker;
2277           struct knote_impl morker;
2278           size_t              count, nkev, nevents;
2279           int                 timeout, error, touch, rv, influx;
2280           filedesc_t          *fdp;
2281 
2282           fdp = curlwp->l_fd;
2283           kq = fp->f_kqueue;
2284           count = maxevents;
2285           nkev = nevents = error = 0;
2286           if (count == 0) {
2287                     *retval = 0;
2288                     return 0;
2289           }
2290 
2291           if (tsp) {                                        /* timeout supplied */
2292                     ats = *tsp;
2293                     if (inittimeleft(&ats, &sleepts) == -1) {
2294                               *retval = maxevents;
2295                               return EINVAL;
2296                     }
2297                     timeout = tstohz(&ats);
2298                     if (timeout <= 0)
2299                               timeout = -1;           /* do poll */
2300           } else {
2301                     /* no timeout, wait forever */
2302                     timeout = 0;
2303           }
2304 
2305           memset(&morker, 0, sizeof(morker));
2306           marker = &morker.ki_knote;
2307           marker->kn_kq = kq;
2308           marker->kn_status = KN_MARKER;
2309           mutex_spin_enter(&kq->kq_lock);
2310  retry:
2311           kevp = kevbuf;
2312           if (KQ_COUNT(kq) == 0) {
2313                     if (timeout >= 0) {
2314                               error = cv_timedwait_sig(&kq->kq_cv,
2315                                   &kq->kq_lock, timeout);
2316                               if (error == 0) {
2317                                         if (KQ_COUNT(kq) == 0 &&
2318                                             (kq->kq_count & KQ_RESTART)) {
2319                                                   /* return to clear file reference */
2320                                                   error = ERESTART;
2321                                         } else if (tsp == NULL || (timeout =
2322                                             gettimeleft(&ats, &sleepts)) > 0) {
2323                                                   goto retry;
2324                                         }
2325                               } else {
2326                                         /* don't restart after signals... */
2327                                         if (error == ERESTART)
2328                                                   error = EINTR;
2329                                         if (error == EWOULDBLOCK)
2330                                                   error = 0;
2331                               }
2332                     }
2333                     mutex_spin_exit(&kq->kq_lock);
2334                     goto done;
2335           }
2336 
2337           /* mark end of knote list */
2338           TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
2339           influx = 0;
2340 
2341           /*
2342            * Acquire the fdp->fd_lock interlock to avoid races with
2343            * file creation/destruction from other threads.
2344            */
2345           mutex_spin_exit(&kq->kq_lock);
2346 relock:
2347           mutex_enter(&fdp->fd_lock);
2348           mutex_spin_enter(&kq->kq_lock);
2349 
2350           while (count != 0) {
2351                     /*
2352                      * Get next knote.  We are guaranteed this will never
2353                      * be NULL because of the marker we inserted above.
2354                      */
2355                     kn = TAILQ_FIRST(&kq->kq_head);
2356 
2357                     bool kn_is_other_marker =
2358                         (kn->kn_status & KN_MARKER) != 0 && kn != marker;
2359                     bool kn_is_detaching = (kn->kn_status & KN_WILLDETACH) != 0;
2360                     bool kn_is_in_flux = kn_in_flux(kn);
2361 
2362                     /*
2363                      * If we found a marker that's not ours, or this knote
2364                      * is in a state of flux, then wait for everything to
2365                      * settle down and go around again.
2366                      */
2367                     if (kn_is_other_marker || kn_is_detaching || kn_is_in_flux) {
2368                               if (influx) {
2369                                         influx = 0;
2370                                         KQ_FLUX_WAKEUP(kq);
2371                               }
2372                               mutex_exit(&fdp->fd_lock);
2373                               if (kn_is_other_marker || kn_is_in_flux) {
2374                                         KQ_FLUX_WAIT(kq);
2375                                         mutex_spin_exit(&kq->kq_lock);
2376                               } else {
2377                                         /*
2378                                          * Detaching but not in-flux?  Someone is
2379                                          * actively trying to finish the job; just
2380                                          * go around and try again.
2381                                          */
2382                                         KASSERT(kn_is_detaching);
2383                                         mutex_spin_exit(&kq->kq_lock);
2384                                         preempt_point();
2385                               }
2386                               goto relock;
2387                     }
2388 
2389                     TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2390                     if (kn == marker) {
2391                               /* it's our marker, stop */
2392                               KQ_FLUX_WAKEUP(kq);
2393                               if (count == maxevents) {
2394                                         mutex_exit(&fdp->fd_lock);
2395                                         goto retry;
2396                               }
2397                               break;
2398                     }
2399                     KASSERT((kn->kn_status & KN_BUSY) == 0);
2400 
2401                     kq_check(kq);
2402                     kn->kn_status &= ~KN_QUEUED;
2403                     kn->kn_status |= KN_BUSY;
2404                     kq_check(kq);
2405                     if (kn->kn_status & KN_DISABLED) {
2406                               kn->kn_status &= ~KN_BUSY;
2407                               kq->kq_count--;
2408                               /* don't want disabled events */
2409                               continue;
2410                     }
2411                     if ((kn->kn_flags & EV_ONESHOT) == 0) {
2412                               mutex_spin_exit(&kq->kq_lock);
2413                               KASSERT(mutex_owned(&fdp->fd_lock));
2414                               knote_foplock_enter(kn);
2415                               rv = filter_event(kn, 0, false);
2416                               knote_foplock_exit(kn);
2417                               mutex_spin_enter(&kq->kq_lock);
2418                               /* Re-poll if note was re-enqueued. */
2419                               if ((kn->kn_status & KN_QUEUED) != 0) {
2420                                         kn->kn_status &= ~KN_BUSY;
2421                                         /* Re-enqueue raised kq_count, lower it again */
2422                                         kq->kq_count--;
2423                                         influx = 1;
2424                                         continue;
2425                               }
2426                               if (rv == 0) {
2427                                         /*
2428                                          * non-ONESHOT event that hasn't triggered
2429                                          * again, so it will remain de-queued.
2430                                          */
2431                                         kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
2432                                         kq->kq_count--;
2433                                         influx = 1;
2434                                         continue;
2435                               }
2436                     } else {
2437                               /*
2438                                * Must NOT drop kq_lock until we can do
2439                                * the KNOTE_WILLDETACH() below.
2440                                */
2441                     }
2442                     KASSERT(kn->kn_fop != NULL);
2443                     touch = (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
2444                                         kn->kn_fop->f_touch != NULL);
2445                     /* XXXAD should be got from f_event if !oneshot. */
2446                     KASSERT((kn->kn_status & KN_WILLDETACH) == 0);
2447                     if (touch) {
2448                               (void)filter_touch(kn, kevp, EVENT_PROCESS);
2449                     } else {
2450                               *kevp = kn->kn_kevent;
2451                     }
2452                     kevp++;
2453                     nkev++;
2454                     influx = 1;
2455                     if (kn->kn_flags & EV_ONESHOT) {
2456                               /* delete ONESHOT events after retrieval */
2457                               KNOTE_WILLDETACH(kn);
2458                               kn->kn_status &= ~KN_BUSY;
2459                               kq->kq_count--;
2460                               KASSERT(kn_in_flux(kn) == false);
2461                               KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
2462                               KASSERT(kn->kn_kevent.udata == curlwp);
2463                               mutex_spin_exit(&kq->kq_lock);
2464                               knote_detach(kn, fdp, true);
2465                               mutex_enter(&fdp->fd_lock);
2466                               mutex_spin_enter(&kq->kq_lock);
2467                     } else if (kn->kn_flags & EV_CLEAR) {
2468                               /* clear state after retrieval */
2469                               kn->kn_data = 0;
2470                               kn->kn_fflags = 0;
2471                               /*
2472                                * Manually clear knotes who weren't
2473                                * 'touch'ed.
2474                                */
2475                               if (touch == 0) {
2476                                         kn->kn_data = 0;
2477                                         kn->kn_fflags = 0;
2478                               }
2479                               kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
2480                               kq->kq_count--;
2481                     } else if (kn->kn_flags & EV_DISPATCH) {
2482                               kn->kn_status |= KN_DISABLED;
2483                               kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
2484                               kq->kq_count--;
2485                     } else {
2486                               /* add event back on list */
2487                               kq_check(kq);
2488                               kn->kn_status |= KN_QUEUED;
2489                               kn->kn_status &= ~KN_BUSY;
2490                               TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2491                               kq_check(kq);
2492                     }
2493 
2494                     if (nkev == kevcnt) {
2495                               /* do copyouts in kevcnt chunks */
2496                               influx = 0;
2497                               KQ_FLUX_WAKEUP(kq);
2498                               mutex_spin_exit(&kq->kq_lock);
2499                               mutex_exit(&fdp->fd_lock);
2500                               error = (*keops->keo_put_events)
2501                                   (keops->keo_private,
2502                                   kevbuf, ulistp, nevents, nkev);
2503                               mutex_enter(&fdp->fd_lock);
2504                               mutex_spin_enter(&kq->kq_lock);
2505                               nevents += nkev;
2506                               nkev = 0;
2507                               kevp = kevbuf;
2508                     }
2509                     count--;
2510                     if (error != 0 || count == 0) {
2511                               /* remove marker */
2512                               TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
2513                               break;
2514                     }
2515           }
2516           KQ_FLUX_WAKEUP(kq);
2517           mutex_spin_exit(&kq->kq_lock);
2518           mutex_exit(&fdp->fd_lock);
2519 
2520 done:
2521           if (nkev != 0) {
2522                     /* copyout remaining events */
2523                     error = (*keops->keo_put_events)(keops->keo_private,
2524                         kevbuf, ulistp, nevents, nkev);
2525           }
2526           *retval = maxevents - count;
2527 
2528           return error;
2529 }
2530 
2531 /*
2532  * fileops ioctl method for a kqueue descriptor.
2533  *
2534  * Two ioctls are currently supported. They both use struct kfilter_mapping:
2535  *        KFILTER_BYNAME                find name for filter, and return result in
2536  *                                      name, which is of size len.
2537  *        KFILTER_BYFILTER    find filter for name. len is ignored.
2538  */
2539 /*ARGSUSED*/
2540 static int
kqueue_ioctl(file_t * fp,u_long com,void * data)2541 kqueue_ioctl(file_t *fp, u_long com, void *data)
2542 {
2543           struct kfilter_mapping        *km;
2544           const struct kfilter          *kfilter;
2545           char                          *name;
2546           int                           error;
2547 
2548           km = data;
2549           error = 0;
2550           name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
2551 
2552           switch (com) {
2553           case KFILTER_BYFILTER:        /* convert filter -> name */
2554                     rw_enter(&kqueue_filter_lock, RW_READER);
2555                     kfilter = kfilter_byfilter(km->filter);
2556                     if (kfilter != NULL) {
2557                               strlcpy(name, kfilter->name, KFILTER_MAXNAME);
2558                               rw_exit(&kqueue_filter_lock);
2559                               error = copyoutstr(name, km->name, km->len, NULL);
2560                     } else {
2561                               rw_exit(&kqueue_filter_lock);
2562                               error = ENOENT;
2563                     }
2564                     break;
2565 
2566           case KFILTER_BYNAME:          /* convert name -> filter */
2567                     error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
2568                     if (error) {
2569                               break;
2570                     }
2571                     rw_enter(&kqueue_filter_lock, RW_READER);
2572                     kfilter = kfilter_byname(name);
2573                     if (kfilter != NULL)
2574                               km->filter = kfilter->filter;
2575                     else
2576                               error = ENOENT;
2577                     rw_exit(&kqueue_filter_lock);
2578                     break;
2579 
2580           default:
2581                     error = ENOTTY;
2582                     break;
2583 
2584           }
2585           kmem_free(name, KFILTER_MAXNAME);
2586           return (error);
2587 }
2588 
2589 /*
2590  * fileops fcntl method for a kqueue descriptor.
2591  */
2592 static int
kqueue_fcntl(file_t * fp,u_int com,void * data)2593 kqueue_fcntl(file_t *fp, u_int com, void *data)
2594 {
2595 
2596           return (ENOTTY);
2597 }
2598 
2599 /*
2600  * fileops poll method for a kqueue descriptor.
2601  * Determine if kqueue has events pending.
2602  */
2603 static int
kqueue_poll(file_t * fp,int events)2604 kqueue_poll(file_t *fp, int events)
2605 {
2606           struct kqueue       *kq;
2607           int                 revents;
2608 
2609           kq = fp->f_kqueue;
2610 
2611           revents = 0;
2612           if (events & (POLLIN | POLLRDNORM)) {
2613                     mutex_spin_enter(&kq->kq_lock);
2614                     if (KQ_COUNT(kq) != 0) {
2615                               revents |= events & (POLLIN | POLLRDNORM);
2616                     } else {
2617                               selrecord(curlwp, &kq->kq_sel);
2618                     }
2619                     kq_check(kq);
2620                     mutex_spin_exit(&kq->kq_lock);
2621           }
2622 
2623           return revents;
2624 }
2625 
2626 /*
2627  * fileops stat method for a kqueue descriptor.
2628  * Returns dummy info, with st_size being number of events pending.
2629  */
2630 static int
kqueue_stat(file_t * fp,struct stat * st)2631 kqueue_stat(file_t *fp, struct stat *st)
2632 {
2633           struct kqueue *kq;
2634 
2635           kq = fp->f_kqueue;
2636 
2637           memset(st, 0, sizeof(*st));
2638           st->st_size = KQ_COUNT(kq);
2639           st->st_blksize = sizeof(struct kevent);
2640           st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
2641           st->st_blocks = 1;
2642           st->st_uid = kauth_cred_geteuid(fp->f_cred);
2643           st->st_gid = kauth_cred_getegid(fp->f_cred);
2644 
2645           return 0;
2646 }
2647 
2648 static void
kqueue_doclose(struct kqueue * kq,struct klist * list,int fd)2649 kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
2650 {
2651           struct knote *kn;
2652           filedesc_t *fdp;
2653 
2654           fdp = kq->kq_fdp;
2655 
2656           KASSERT(mutex_owned(&fdp->fd_lock));
2657 
2658  again:
2659           for (kn = SLIST_FIRST(list); kn != NULL;) {
2660                     if (kq != kn->kn_kq) {
2661                               kn = SLIST_NEXT(kn, kn_link);
2662                               continue;
2663                     }
2664                     if (knote_detach_quiesce(kn)) {
2665                               mutex_enter(&fdp->fd_lock);
2666                               goto again;
2667                     }
2668                     knote_detach(kn, fdp, true);
2669                     mutex_enter(&fdp->fd_lock);
2670                     kn = SLIST_FIRST(list);
2671           }
2672 }
2673 
2674 /*
2675  * fileops close method for a kqueue descriptor.
2676  */
2677 static int
kqueue_close(file_t * fp)2678 kqueue_close(file_t *fp)
2679 {
2680           struct kqueue *kq;
2681           filedesc_t *fdp;
2682           fdfile_t *ff;
2683           int i;
2684 
2685           kq = fp->f_kqueue;
2686           fp->f_kqueue = NULL;
2687           fp->f_type = 0;
2688           fdp = curlwp->l_fd;
2689 
2690           KASSERT(kq->kq_fdp == fdp);
2691 
2692           mutex_enter(&fdp->fd_lock);
2693 
2694           /*
2695            * We're doing to drop the fd_lock multiple times while
2696            * we detach knotes.  During this time, attempts to register
2697            * knotes via the back door (e.g. knote_proc_fork_track())
2698            * need to fail, lest they sneak in to attach a knote after
2699            * we've already drained the list it's destined for.
2700            *
2701            * We must acquire kq_lock here to set KQ_CLOSING (to serialize
2702            * with other code paths that modify kq_count without holding
2703            * the fd_lock), but once this bit is set, it's only safe to
2704            * test it while holding the fd_lock, and holding kq_lock while
2705            * doing so is not necessary.
2706            */
2707           mutex_enter(&kq->kq_lock);
2708           kq->kq_count |= KQ_CLOSING;
2709           mutex_exit(&kq->kq_lock);
2710 
2711           for (i = 0; i <= fdp->fd_lastkqfile; i++) {
2712                     if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
2713                               continue;
2714                     kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
2715           }
2716           if (fdp->fd_knhashmask != 0) {
2717                     for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
2718                               kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
2719                     }
2720           }
2721 
2722           mutex_exit(&fdp->fd_lock);
2723 
2724 #if defined(DEBUG)
2725           mutex_enter(&kq->kq_lock);
2726           kq_check(kq);
2727           mutex_exit(&kq->kq_lock);
2728 #endif /* DEBUG */
2729           KASSERT(TAILQ_EMPTY(&kq->kq_head));
2730           KASSERT(KQ_COUNT(kq) == 0);
2731           mutex_destroy(&kq->kq_lock);
2732           cv_destroy(&kq->kq_cv);
2733           seldestroy(&kq->kq_sel);
2734           kmem_free(kq, sizeof(*kq));
2735 
2736           return (0);
2737 }
2738 
2739 /*
2740  * struct fileops kqfilter method for a kqueue descriptor.
2741  * Event triggered when monitored kqueue changes.
2742  */
2743 static int
kqueue_kqfilter(file_t * fp,struct knote * kn)2744 kqueue_kqfilter(file_t *fp, struct knote *kn)
2745 {
2746           struct kqueue *kq;
2747 
2748           kq = ((file_t *)kn->kn_obj)->f_kqueue;
2749 
2750           KASSERT(fp == kn->kn_obj);
2751 
2752           if (kn->kn_filter != EVFILT_READ)
2753                     return EINVAL;
2754 
2755           kn->kn_fop = &kqread_filtops;
2756           mutex_enter(&kq->kq_lock);
2757           selrecord_knote(&kq->kq_sel, kn);
2758           mutex_exit(&kq->kq_lock);
2759 
2760           return 0;
2761 }
2762 
2763 
2764 /*
2765  * Walk down a list of knotes, activating them if their event has
2766  * triggered.  The caller's object lock (e.g. device driver lock)
2767  * must be held.
2768  */
2769 void
knote(struct klist * list,long hint)2770 knote(struct klist *list, long hint)
2771 {
2772           struct knote *kn, *tmpkn;
2773 
2774           SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
2775                     /*
2776                      * We assume here that the backing object's lock is
2777                      * already held if we're traversing the klist, and
2778                      * so acquiring the knote foplock would create a
2779                      * deadlock scenario.  But we also know that the klist
2780                      * won't disappear on us while we're here, so not
2781                      * acquiring it is safe.
2782                      */
2783                     if (filter_event(kn, hint, true)) {
2784                               knote_activate(kn);
2785                     }
2786           }
2787 }
2788 
2789 /*
2790  * Remove all knotes referencing a specified fd
2791  */
2792 void
knote_fdclose(int fd)2793 knote_fdclose(int fd)
2794 {
2795           struct klist *list;
2796           struct knote *kn;
2797           filedesc_t *fdp;
2798 
2799  again:
2800           fdp = curlwp->l_fd;
2801           mutex_enter(&fdp->fd_lock);
2802           list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
2803           while ((kn = SLIST_FIRST(list)) != NULL) {
2804                     if (knote_detach_quiesce(kn)) {
2805                               goto again;
2806                     }
2807                     knote_detach(kn, fdp, true);
2808                     mutex_enter(&fdp->fd_lock);
2809           }
2810           mutex_exit(&fdp->fd_lock);
2811 }
2812 
2813 /*
2814  * Drop knote.  Called with fdp->fd_lock held, and will drop before
2815  * returning.
2816  */
2817 static void
knote_detach(struct knote * kn,filedesc_t * fdp,bool dofop)2818 knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
2819 {
2820           struct klist *list;
2821           struct kqueue *kq;
2822 
2823           kq = kn->kn_kq;
2824 
2825           KASSERT((kn->kn_status & KN_MARKER) == 0);
2826           KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
2827           KASSERT(kn->kn_fop != NULL);
2828           KASSERT(mutex_owned(&fdp->fd_lock));
2829 
2830           /* Remove from monitored object. */
2831           if (dofop) {
2832                     knote_foplock_enter(kn);
2833                     filter_detach(kn);
2834                     knote_foplock_exit(kn);
2835           }
2836 
2837           /* Remove from descriptor table. */
2838           if (kn->kn_fop->f_flags & FILTEROP_ISFD)
2839                     list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
2840           else
2841                     list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
2842 
2843           SLIST_REMOVE(list, kn, knote, kn_link);
2844 
2845           /* Remove from kqueue. */
2846 again:
2847           mutex_spin_enter(&kq->kq_lock);
2848           KASSERT(kn_in_flux(kn) == false);
2849           if ((kn->kn_status & KN_QUEUED) != 0) {
2850                     kq_check(kq);
2851                     KASSERT(KQ_COUNT(kq) != 0);
2852                     kq->kq_count--;
2853                     TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2854                     kn->kn_status &= ~KN_QUEUED;
2855                     kq_check(kq);
2856           } else if (kn->kn_status & KN_BUSY) {
2857                     mutex_spin_exit(&kq->kq_lock);
2858                     goto again;
2859           }
2860           mutex_spin_exit(&kq->kq_lock);
2861 
2862           mutex_exit(&fdp->fd_lock);
2863           if (kn->kn_fop->f_flags & FILTEROP_ISFD)
2864                     fd_putfile(kn->kn_id);
2865           atomic_dec_uint(&kn->kn_kfilter->refcnt);
2866           knote_free(kn);
2867 }
2868 
2869 /*
2870  * Queue new event for knote.
2871  */
2872 static void
knote_enqueue(struct knote * kn)2873 knote_enqueue(struct knote *kn)
2874 {
2875           struct kqueue *kq;
2876 
2877           KASSERT((kn->kn_status & KN_MARKER) == 0);
2878 
2879           kq = kn->kn_kq;
2880 
2881           mutex_spin_enter(&kq->kq_lock);
2882           if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
2883                     /* Don't bother enqueueing a dying knote. */
2884                     goto out;
2885           }
2886           if ((kn->kn_status & KN_DISABLED) != 0) {
2887                     kn->kn_status &= ~KN_DISABLED;
2888           }
2889           if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
2890                     kq_check(kq);
2891                     kn->kn_status |= KN_QUEUED;
2892                     TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2893                     KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
2894                     kq->kq_count++;
2895                     kq_check(kq);
2896                     cv_broadcast(&kq->kq_cv);
2897                     selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
2898           }
2899  out:
2900           mutex_spin_exit(&kq->kq_lock);
2901 }
2902 /*
2903  * Queue new event for knote.
2904  */
2905 static void
knote_activate_locked(struct knote * kn)2906 knote_activate_locked(struct knote *kn)
2907 {
2908           struct kqueue *kq;
2909 
2910           KASSERT((kn->kn_status & KN_MARKER) == 0);
2911 
2912           kq = kn->kn_kq;
2913 
2914           if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
2915                     /* Don't bother enqueueing a dying knote. */
2916                     return;
2917           }
2918           kn->kn_status |= KN_ACTIVE;
2919           if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
2920                     kq_check(kq);
2921                     kn->kn_status |= KN_QUEUED;
2922                     TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2923                     KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
2924                     kq->kq_count++;
2925                     kq_check(kq);
2926                     cv_broadcast(&kq->kq_cv);
2927                     selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
2928           }
2929 }
2930 
2931 static void
knote_activate(struct knote * kn)2932 knote_activate(struct knote *kn)
2933 {
2934           struct kqueue *kq = kn->kn_kq;
2935 
2936           mutex_spin_enter(&kq->kq_lock);
2937           knote_activate_locked(kn);
2938           mutex_spin_exit(&kq->kq_lock);
2939 }
2940 
2941 static void
knote_deactivate_locked(struct knote * kn)2942 knote_deactivate_locked(struct knote *kn)
2943 {
2944           struct kqueue *kq = kn->kn_kq;
2945 
2946           if (kn->kn_status & KN_QUEUED) {
2947                     kq_check(kq);
2948                     kn->kn_status &= ~KN_QUEUED;
2949                     TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2950                     KASSERT(KQ_COUNT(kq) > 0);
2951                     kq->kq_count--;
2952                     kq_check(kq);
2953           }
2954           kn->kn_status &= ~KN_ACTIVE;
2955 }
2956 
2957 /*
2958  * Set EV_EOF on the specified knote.  Also allows additional
2959  * EV_* flags to be set (e.g. EV_ONESHOT).
2960  */
2961 void
knote_set_eof(struct knote * kn,uint32_t flags)2962 knote_set_eof(struct knote *kn, uint32_t flags)
2963 {
2964           struct kqueue *kq = kn->kn_kq;
2965 
2966           mutex_spin_enter(&kq->kq_lock);
2967           kn->kn_flags |= EV_EOF | flags;
2968           mutex_spin_exit(&kq->kq_lock);
2969 }
2970 
2971 /*
2972  * Clear EV_EOF on the specified knote.
2973  */
2974 void
knote_clear_eof(struct knote * kn)2975 knote_clear_eof(struct knote *kn)
2976 {
2977           struct kqueue *kq = kn->kn_kq;
2978 
2979           mutex_spin_enter(&kq->kq_lock);
2980           kn->kn_flags &= ~EV_EOF;
2981           mutex_spin_exit(&kq->kq_lock);
2982 }
2983 
/*
 * Initialize a klist so that it is an empty singly-linked list of
 * knotes.
 */
void
klist_init(struct klist *list)
{

	SLIST_INIT(list);
}
2992 
2993 /*
2994  * Finalize a klist.
2995  */
2996 void
klist_fini(struct klist * list)2997 klist_fini(struct klist *list)
2998 {
2999           struct knote *kn;
3000 
3001           /*
3002            * Neuter all existing knotes on the klist because the list is
3003            * being destroyed.  The caller has guaranteed that no additional
3004            * knotes will be added to the list, that the backing object's
3005            * locks are not held (otherwise there is a locking order issue
3006            * with acquiring the knote foplock ), and that we can traverse
3007            * the list safely in this state.
3008            */
3009           SLIST_FOREACH(kn, list, kn_selnext) {
3010                     knote_foplock_enter(kn);
3011                     KASSERT(kn->kn_fop != NULL);
3012                     if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
3013                               kn->kn_fop = &nop_fd_filtops;
3014                     } else {
3015                               kn->kn_fop = &nop_filtops;
3016                     }
3017                     knote_foplock_exit(kn);
3018           }
3019 }
3020 
3021 /*
3022  * Insert a knote into a klist.
3023  */
3024 void
klist_insert(struct klist * list,struct knote * kn)3025 klist_insert(struct klist *list, struct knote *kn)
3026 {
3027           SLIST_INSERT_HEAD(list, kn, kn_selnext);
3028 }
3029 
3030 /*
3031  * Remove a knote from a klist.  Returns true if the last
3032  * knote was removed and the list is now empty.
3033  */
3034 bool
klist_remove(struct klist * list,struct knote * kn)3035 klist_remove(struct klist *list, struct knote *kn)
3036 {
3037           SLIST_REMOVE(list, kn, knote, kn_selnext);
3038           return SLIST_EMPTY(list);
3039 }
3040