xref: /dragonfly/sys/kern/kern_event.c (revision cc8e70bd591c943565dd618d131dcee0027ded02)
1 /*-
2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD: src/sys/kern/kern_event.c,v 1.2.2.10 2004/04/04 07:03:14 cperciva Exp $
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/proc.h>
33 #include <sys/malloc.h>
34 #include <sys/unistd.h>
35 #include <sys/file.h>
36 #include <sys/lock.h>
37 #include <sys/fcntl.h>
38 #include <sys/queue.h>
39 #include <sys/event.h>
40 #include <sys/eventvar.h>
41 #include <sys/protosw.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/stat.h>
45 #include <sys/sysctl.h>
46 #include <sys/sysmsg.h>
47 #include <sys/thread.h>
48 #include <sys/uio.h>
49 #include <sys/signalvar.h>
50 #include <sys/filio.h>
51 #include <sys/ktr.h>
52 #include <sys/spinlock.h>
53 
54 #include <sys/thread2.h>
55 #include <sys/file2.h>
56 #include <sys/mplock2.h>
57 #include <sys/spinlock2.h>
58 
/*
 * Operation selectors passed as the 'type' argument of a filter touch
 * routine (see filt_usertouch()).
 */
#define	EVENT_REGISTER		1	/* apply a caller-supplied kevent */
#define	EVENT_PROCESS		2	/* export the knote into a kevent */
61 
62 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
63 
/*
 * Cursor state handed (as the opaque 'arg') to kevent_copyin() and
 * kevent_copyout() while one kevent request is being serviced.
 */
struct kevent_copyin_args {
	const struct kevent_args *ka;		/* request arguments (nchanges) */
	struct kevent		*eventlist;	/* next output slot to fill */
	const struct kevent	*changelist;	/* next input change to read */
	int			 pchanges;	/* changes consumed so far */
};
70 
71 #define KNOTE_CACHE_MAX                 64
72 
73 struct knote_cache_list {
74           struct klist                  knote_cache;
75           int                           knote_cache_cnt;
76 } __cachealign;
77 
78 static int          kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
79                         struct knote *marker, int closedcounter, int flags);
80 static int          kqueue_read(struct file *fp, struct uio *uio,
81                         struct ucred *cred, int flags);
82 static int          kqueue_write(struct file *fp, struct uio *uio,
83                         struct ucred *cred, int flags);
84 static int          kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
85                         struct ucred *cred, struct sysmsg *msg);
86 static int          kqueue_kqfilter(struct file *fp, struct knote *kn);
87 static int          kqueue_stat(struct file *fp, struct stat *st,
88                         struct ucred *cred);
89 static int          kqueue_close(struct file *fp);
90 static void         kqueue_wakeup(struct kqueue *kq);
91 static int          filter_attach(struct knote *kn);
92 static int          filter_event(struct knote *kn, long hint);
93 
94 /*
95  * MPSAFE
96  */
97 static struct fileops kqueueops = {
98           .fo_read = kqueue_read,
99           .fo_write = kqueue_write,
100           .fo_ioctl = kqueue_ioctl,
101           .fo_kqfilter = kqueue_kqfilter,
102           .fo_stat = kqueue_stat,
103           .fo_close = kqueue_close,
104           .fo_shutdown = nofo_shutdown,
105           .fo_seek = badfo_seek
106 };
107 
108 static void         knote_attach(struct knote *kn);
109 static void         knote_drop(struct knote *kn);
110 static void         knote_detach_and_drop(struct knote *kn);
111 static void         knote_enqueue(struct knote *kn);
112 static void         knote_dequeue(struct knote *kn);
113 static struct       knote *knote_alloc(void);
114 static void         knote_free(struct knote *kn);
115 
116 static void         precise_sleep_intr(systimer_t info, int in_ipi,
117                                            struct intrframe *frame);
118 static int          precise_sleep(void *ident, int flags, const char *wmesg,
119                                     int us);
120 
121 static void         filt_kqdetach(struct knote *kn);
122 static int          filt_kqueue(struct knote *kn, long hint);
123 static int          filt_procattach(struct knote *kn);
124 static void         filt_procdetach(struct knote *kn);
125 static int          filt_proc(struct knote *kn, long hint);
126 static int          filt_fileattach(struct knote *kn);
127 static void         filt_timerexpire(void *knx);
128 static int          filt_timerattach(struct knote *kn);
129 static void         filt_timerdetach(struct knote *kn);
130 static int          filt_timer(struct knote *kn, long hint);
131 static int          filt_userattach(struct knote *kn);
132 static void         filt_userdetach(struct knote *kn);
133 static int          filt_user(struct knote *kn, long hint);
134 static void         filt_usertouch(struct knote *kn, struct kevent *kev,
135                                         u_long type);
136 static int          filt_fsattach(struct knote *kn);
137 static void         filt_fsdetach(struct knote *kn);
138 static int          filt_fs(struct knote *kn, long hint);
139 
140 static struct filterops file_filtops =
141           { FILTEROP_ISFD | FILTEROP_MPSAFE, filt_fileattach, NULL, NULL };
142 static struct filterops kqread_filtops =
143           { FILTEROP_ISFD | FILTEROP_MPSAFE, NULL, filt_kqdetach, filt_kqueue };
144 static struct filterops proc_filtops =
145           { FILTEROP_MPSAFE, filt_procattach, filt_procdetach, filt_proc };
146 static struct filterops timer_filtops =
147           { FILTEROP_MPSAFE, filt_timerattach, filt_timerdetach, filt_timer };
148 static struct filterops user_filtops =
149           { FILTEROP_MPSAFE, filt_userattach, filt_userdetach, filt_user };
150 static struct filterops fs_filtops =
151           { FILTEROP_MPSAFE, filt_fsattach, filt_fsdetach, filt_fs };
152 
153 static int                    kq_ncallouts = 0;
154 static int                    kq_calloutmax = 65536;
155 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
156     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
157 static int                    kq_checkloop = 1000000;
158 SYSCTL_INT(_kern, OID_AUTO, kq_checkloop, CTLFLAG_RW,
159     &kq_checkloop, 0, "Maximum number of loops for kqueue scan");
160 static int                    kq_sleep_threshold = 20000;
161 SYSCTL_INT(_kern, OID_AUTO, kq_sleep_threshold, CTLFLAG_RW,
162     &kq_sleep_threshold, 0, "Minimum sleep duration without busy-looping");
163 
/*
 * Flag a knote as active and, unless it is already queued or has been
 * disabled, hand it to knote_enqueue().
 *
 * The argument is parenthesized at every use so that any pointer-valued
 * expression (e.g. *pp) can be passed safely.  It is still evaluated more
 * than once, so it must not have side effects.
 */
#define KNOTE_ACTIVATE(kn) do {						\
	(kn)->kn_status |= KN_ACTIVE;					\
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)	\
		knote_enqueue(kn);					\
} while(0)
169 
#define	KN_HASHSIZE		64		/* XXX should be tunable */

/*
 * Fold 'val' with itself shifted right by 8 bits and reduce with 'mask'.
 * Both arguments are fully parenthesized so that expressions containing
 * low-precedence operators (e.g. a | b) hash the same as their value.
 */
#define	KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
172 
173 extern struct filterops aio_filtops;
174 extern struct filterops sig_filtops;
175 
176 /*
177  * Table for for all system-defined filters.
178  */
179 static struct filterops *sysfilt_ops[] = {
180           &file_filtops,                          /* EVFILT_READ */
181           &file_filtops,                          /* EVFILT_WRITE */
182           &aio_filtops,                           /* EVFILT_AIO */
183           &file_filtops,                          /* EVFILT_VNODE */
184           &proc_filtops,                          /* EVFILT_PROC */
185           &sig_filtops,                           /* EVFILT_SIGNAL */
186           &timer_filtops,                         /* EVFILT_TIMER */
187           &file_filtops,                          /* EVFILT_EXCEPT */
188           &user_filtops,                          /* EVFILT_USER */
189           &fs_filtops,                            /* EVFILT_FS */
190 };
191 
192 static struct knote_cache_list          knote_cache_lists[MAXCPU];
193 
194 /*
195  * Acquire a knote, return non-zero on success, 0 on failure.
196  *
197  * If we cannot acquire the knote we sleep and return 0.  The knote
198  * may be stale on return in this case and the caller must restart
199  * whatever loop they are in.
200  *
201  * Related kq token must be held.
202  */
203 static __inline int
knote_acquire(struct knote * kn)204 knote_acquire(struct knote *kn)
205 {
206           if (kn->kn_status & KN_PROCESSING) {
207                     kn->kn_status |= KN_WAITING | KN_REPROCESS;
208                     tsleep(kn, 0, "kqepts", hz);
209                     /* knote may be stale now */
210                     return(0);
211           }
212           kn->kn_status |= KN_PROCESSING;
213           return(1);
214 }
215 
216 /*
217  * Release an acquired knote, clearing KN_PROCESSING and handling any
218  * KN_REPROCESS events.
219  *
220  * Caller must be holding the related kq token
221  *
222  * Non-zero is returned if the knote is destroyed or detached.
223  */
224 static __inline int
knote_release(struct knote * kn)225 knote_release(struct knote *kn)
226 {
227           int ret;
228 
229           while (kn->kn_status & KN_REPROCESS) {
230                     kn->kn_status &= ~KN_REPROCESS;
231                     if (kn->kn_status & KN_WAITING) {
232                               kn->kn_status &= ~KN_WAITING;
233                               wakeup(kn);
234                     }
235                     if (kn->kn_status & KN_DELETING) {
236                               knote_detach_and_drop(kn);
237                               return(1);
238                               /* NOT REACHED */
239                     }
240                     if (filter_event(kn, 0))
241                               KNOTE_ACTIVATE(kn);
242           }
243           if (kn->kn_status & KN_DETACHED)
244                     ret = 1;
245           else
246                     ret = 0;
247           kn->kn_status &= ~KN_PROCESSING;
248           /* kn should not be accessed anymore */
249           return ret;
250 }
251 
252 static int
filt_fileattach(struct knote * kn)253 filt_fileattach(struct knote *kn)
254 {
255           return (fo_kqfilter(kn->kn_fp, kn));
256 }
257 
258 /*
259  * MPSAFE
260  */
261 static int
kqueue_kqfilter(struct file * fp,struct knote * kn)262 kqueue_kqfilter(struct file *fp, struct knote *kn)
263 {
264           struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
265 
266           if (kn->kn_filter != EVFILT_READ)
267                     return (EOPNOTSUPP);
268 
269           kn->kn_fop = &kqread_filtops;
270           knote_insert(&kq->kq_kqinfo.ki_note, kn);
271           return (0);
272 }
273 
274 static void
filt_kqdetach(struct knote * kn)275 filt_kqdetach(struct knote *kn)
276 {
277           struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
278 
279           knote_remove(&kq->kq_kqinfo.ki_note, kn);
280 }
281 
282 /*ARGSUSED*/
283 static int
filt_kqueue(struct knote * kn,long hint)284 filt_kqueue(struct knote *kn, long hint)
285 {
286           struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
287 
288           kn->kn_data = kq->kq_count;
289           return (kn->kn_data > 0);
290 }
291 
292 static int
filt_procattach(struct knote * kn)293 filt_procattach(struct knote *kn)
294 {
295           struct proc *p;
296           int immediate;
297 
298           immediate = 0;
299           p = pfind(kn->kn_id);
300           if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
301                     p = zpfind(kn->kn_id);
302                     immediate = 1;
303           }
304           if (p == NULL) {
305                     return (ESRCH);
306           }
307           if (!PRISON_CHECK(curthread->td_ucred, p->p_ucred)) {
308                     if (p)
309                               PRELE(p);
310                     return (EACCES);
311           }
312 
313           lwkt_gettoken(&p->p_token);
314           kn->kn_ptr.p_proc = p;
315           kn->kn_flags |= EV_CLEAR;               /* automatically set */
316 
317           /*
318            * internal flag indicating registration done by kernel
319            */
320           if (kn->kn_flags & EV_FLAG1) {
321                     kn->kn_data = kn->kn_sdata;             /* ppid */
322                     kn->kn_fflags = NOTE_CHILD;
323                     kn->kn_flags &= ~EV_FLAG1;
324           }
325 
326           knote_insert(&p->p_klist, kn);
327 
328           /*
329            * Immediately activate any exit notes if the target process is a
330            * zombie.  This is necessary to handle the case where the target
331            * process, e.g. a child, dies before the kevent is negistered.
332            */
333           if (immediate && filt_proc(kn, NOTE_EXIT))
334                     KNOTE_ACTIVATE(kn);
335           lwkt_reltoken(&p->p_token);
336           PRELE(p);
337 
338           return (0);
339 }
340 
341 /*
342  * The knote may be attached to a different process, which may exit,
343  * leaving nothing for the knote to be attached to.  So when the process
344  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
345  * it will be deleted when read out.  However, as part of the knote deletion,
346  * this routine is called, so a check is needed to avoid actually performing
347  * a detach, because the original process does not exist any more.
348  */
349 static void
filt_procdetach(struct knote * kn)350 filt_procdetach(struct knote *kn)
351 {
352           struct proc *p;
353 
354           if (kn->kn_status & KN_DETACHED)
355                     return;
356           p = kn->kn_ptr.p_proc;
357           knote_remove(&p->p_klist, kn);
358 }
359 
360 static int
filt_proc(struct knote * kn,long hint)361 filt_proc(struct knote *kn, long hint)
362 {
363           u_int event;
364 
365           /*
366            * mask off extra data
367            */
368           event = (u_int)hint & NOTE_PCTRLMASK;
369 
370           /*
371            * if the user is interested in this event, record it.
372            */
373           if (kn->kn_sfflags & event)
374                     kn->kn_fflags |= event;
375 
376           /*
377            * Process is gone, so flag the event as finished.  Detach the
378            * knote from the process now because the process will be poof,
379            * gone later on.
380            */
381           if (event == NOTE_EXIT) {
382                     struct proc *p = kn->kn_ptr.p_proc;
383                     if ((kn->kn_status & KN_DETACHED) == 0) {
384                               PHOLD(p);
385                               knote_remove(&p->p_klist, kn);
386                               kn->kn_status |= KN_DETACHED;
387                               kn->kn_data = p->p_xstat;
388                               kn->kn_ptr.p_proc = NULL;
389                               PRELE(p);
390                     }
391                     kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
392                     return (1);
393           }
394 
395           /*
396            * process forked, and user wants to track the new process,
397            * so attach a new knote to it, and immediately report an
398            * event with the parent's pid.
399            */
400           if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
401                     struct kevent kev;
402                     int error;
403                     int n;
404 
405                     /*
406                      * register knote with new process.
407                      */
408                     kev.ident = hint & NOTE_PDATAMASK;      /* pid */
409                     kev.filter = kn->kn_filter;
410                     kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
411                     kev.fflags = kn->kn_sfflags;
412                     kev.data = kn->kn_id;                             /* parent */
413                     kev.udata = kn->kn_kevent.udata;        /* preserve udata */
414                     n = 1;
415                     error = kqueue_register(kn->kn_kq, &kev, &n, 0);
416                     if (error)
417                               kn->kn_fflags |= NOTE_TRACKERR;
418           }
419 
420           return (kn->kn_fflags != 0);
421 }
422 
423 static void
filt_timerreset(struct knote * kn)424 filt_timerreset(struct knote *kn)
425 {
426           struct callout *calloutp;
427           struct timeval tv;
428           int tticks;
429 
430           tv.tv_sec = kn->kn_sdata / 1000;
431           tv.tv_usec = (kn->kn_sdata % 1000) * 1000;
432           tticks = tvtohz_high(&tv);
433           calloutp = (struct callout *)kn->kn_hook;
434           callout_reset(calloutp, tticks, filt_timerexpire, kn);
435 }
436 
437 /*
438  * The callout interlocks with callout_stop() but can still
439  * race a deletion so if KN_DELETING is set we just don't touch
440  * the knote.
441  */
442 static void
filt_timerexpire(void * knx)443 filt_timerexpire(void *knx)
444 {
445           struct knote *kn = knx;
446           struct kqueue *kq = kn->kn_kq;
447 
448           lwkt_getpooltoken(kq);
449 
450           /*
451            * Open knote_acquire(), since we can't sleep in callout,
452            * however, we do need to record this expiration.
453            */
454           kn->kn_data++;
455           if (kn->kn_status & KN_PROCESSING) {
456                     kn->kn_status |= KN_REPROCESS;
457                     if ((kn->kn_status & KN_DELETING) == 0 &&
458                         (kn->kn_flags & EV_ONESHOT) == 0)
459                               filt_timerreset(kn);
460                     lwkt_relpooltoken(kq);
461                     return;
462           }
463           KASSERT((kn->kn_status & KN_DELETING) == 0,
464               ("acquire a deleting knote %#x", kn->kn_status));
465           kn->kn_status |= KN_PROCESSING;
466 
467           KNOTE_ACTIVATE(kn);
468           if ((kn->kn_flags & EV_ONESHOT) == 0)
469                     filt_timerreset(kn);
470 
471           knote_release(kn);
472 
473           lwkt_relpooltoken(kq);
474 }
475 
476 /*
477  * data contains amount of time to sleep, in milliseconds
478  */
479 static int
filt_timerattach(struct knote * kn)480 filt_timerattach(struct knote *kn)
481 {
482           struct callout *calloutp;
483           int prev_ncallouts;
484 
485           prev_ncallouts = atomic_fetchadd_int(&kq_ncallouts, 1);
486           if (prev_ncallouts >= kq_calloutmax) {
487                     atomic_subtract_int(&kq_ncallouts, 1);
488                     kn->kn_hook = NULL;
489                     return (ENOMEM);
490           }
491 
492           kn->kn_flags |= EV_CLEAR;               /* automatically set */
493           calloutp = kmalloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
494           callout_init_mp(calloutp);
495           kn->kn_hook = (caddr_t)calloutp;
496 
497           filt_timerreset(kn);
498           return (0);
499 }
500 
501 /*
502  * This function is called with the knote flagged locked but it is
503  * still possible to race a callout event due to the callback blocking.
504  */
505 static void
filt_timerdetach(struct knote * kn)506 filt_timerdetach(struct knote *kn)
507 {
508           struct callout *calloutp;
509 
510           calloutp = (struct callout *)kn->kn_hook;
511           callout_terminate(calloutp);
512           kn->kn_hook = NULL;
513           kfree(calloutp, M_KQUEUE);
514           atomic_subtract_int(&kq_ncallouts, 1);
515 }
516 
517 static int
filt_timer(struct knote * kn,long hint)518 filt_timer(struct knote *kn, long hint)
519 {
520           return (kn->kn_data != 0);
521 }
522 
523 /*
524  * EVFILT_USER
525  */
526 static int
filt_userattach(struct knote * kn)527 filt_userattach(struct knote *kn)
528 {
529           u_int ffctrl;
530 
531           kn->kn_hook = NULL;
532           if (kn->kn_sfflags & NOTE_TRIGGER)
533                     kn->kn_ptr.hookid = 1;
534           else
535                     kn->kn_ptr.hookid = 0;
536 
537           ffctrl = kn->kn_sfflags & NOTE_FFCTRLMASK;
538           kn->kn_sfflags &= NOTE_FFLAGSMASK;
539           switch (ffctrl) {
540           case NOTE_FFNOP:
541                     break;
542 
543           case NOTE_FFAND:
544                     kn->kn_fflags &= kn->kn_sfflags;
545                     break;
546 
547           case NOTE_FFOR:
548                     kn->kn_fflags |= kn->kn_sfflags;
549                     break;
550 
551           case NOTE_FFCOPY:
552                     kn->kn_fflags = kn->kn_sfflags;
553                     break;
554 
555           default:
556                     /* XXX Return error? */
557                     break;
558           }
559           /* We just happen to copy this value as well. Undocumented. */
560           kn->kn_data = kn->kn_sdata;
561 
562           return 0;
563 }
564 
/*
 * Detach an EVFILT_USER knote.  filt_userattach() only initializes
 * fields inside the knote itself, so there is nothing to undo.
 */
static void
filt_userdetach(struct knote *kn)
{
	/* nothing to do */
}
570 
571 static int
filt_user(struct knote * kn,long hint)572 filt_user(struct knote *kn, long hint)
573 {
574           return (kn->kn_ptr.hookid);
575 }
576 
577 static void
filt_usertouch(struct knote * kn,struct kevent * kev,u_long type)578 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
579 {
580           u_int ffctrl;
581 
582           switch (type) {
583           case EVENT_REGISTER:
584                     if (kev->fflags & NOTE_TRIGGER)
585                               kn->kn_ptr.hookid = 1;
586 
587                     ffctrl = kev->fflags & NOTE_FFCTRLMASK;
588                     kev->fflags &= NOTE_FFLAGSMASK;
589                     switch (ffctrl) {
590                     case NOTE_FFNOP:
591                               break;
592 
593                     case NOTE_FFAND:
594                               kn->kn_fflags &= kev->fflags;
595                               break;
596 
597                     case NOTE_FFOR:
598                               kn->kn_fflags |= kev->fflags;
599                               break;
600 
601                     case NOTE_FFCOPY:
602                               kn->kn_fflags = kev->fflags;
603                               break;
604 
605                     default:
606                               /* XXX Return error? */
607                               break;
608                     }
609                     /* We just happen to copy this value as well. Undocumented. */
610                     kn->kn_data = kev->data;
611 
612                     /*
613                      * This is not the correct use of EV_CLEAR in an event
614                      * modification, it should have been passed as a NOTE instead.
615                      * But we need to maintain compatibility with Apple & FreeBSD.
616                      *
617                      * Note however that EV_CLEAR can still be used when doing
618                      * the initial registration of the event and works as expected
619                      * (clears the event on reception).
620                      */
621                     if (kev->flags & EV_CLEAR) {
622                               kn->kn_ptr.hookid = 0;
623                               /*
624                                * Clearing kn->kn_data is fine, since it gets set
625                                * every time anyway. We just shouldn't clear
626                                * kn->kn_fflags here, since that would limit the
627                                * possible uses of this API. NOTE_FFAND or
628                                * NOTE_FFCOPY should be used for explicitly clearing
629                                * kn->kn_fflags.
630                                */
631                               kn->kn_data = 0;
632                     }
633                     break;
634 
635         case EVENT_PROCESS:
636                     *kev = kn->kn_kevent;
637                     kev->fflags = kn->kn_fflags;
638                     kev->data = kn->kn_data;
639                     if (kn->kn_flags & EV_CLEAR) {
640                               kn->kn_ptr.hookid = 0;
641                               /* kn_data, kn_fflags handled by parent */
642                     }
643                     break;
644 
645           default:
646                     panic("filt_usertouch() - invalid type (%ld)", type);
647                     break;
648           }
649 }
650 
651 /*
652  * EVFILT_FS
653  */
654 struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);
655 
656 static int
filt_fsattach(struct knote * kn)657 filt_fsattach(struct knote *kn)
658 {
659           kn->kn_flags |= EV_CLEAR;
660           knote_insert(&fs_klist, kn);
661 
662           return (0);
663 }
664 
665 static void
filt_fsdetach(struct knote * kn)666 filt_fsdetach(struct knote *kn)
667 {
668           knote_remove(&fs_klist, kn);
669 }
670 
671 static int
filt_fs(struct knote * kn,long hint)672 filt_fs(struct knote *kn, long hint)
673 {
674           kn->kn_fflags |= hint;
675           return (kn->kn_fflags != 0);
676 }
677 
678 /*
679  * Initialize a kqueue.
680  *
681  * NOTE: The lwp/proc code initializes a kqueue for select/poll ops.
682  */
683 void
kqueue_init(struct kqueue * kq,struct filedesc * fdp)684 kqueue_init(struct kqueue *kq, struct filedesc *fdp)
685 {
686           bzero(kq, sizeof(*kq));
687           TAILQ_INIT(&kq->kq_knpend);
688           TAILQ_INIT(&kq->kq_knlist);
689           kq->kq_fdp = fdp;
690           SLIST_INIT(&kq->kq_kqinfo.ki_note);
691 }
692 
693 /*
694  * Terminate a kqueue.  Freeing the actual kq itself is left up to the
695  * caller (it might be embedded in a lwp so we don't do it here).
696  *
697  * The kq's knlist must be completely eradicated so block on any
698  * processing races.
699  */
700 void
kqueue_terminate(struct kqueue * kq)701 kqueue_terminate(struct kqueue *kq)
702 {
703           struct knote *kn;
704 
705           lwkt_getpooltoken(kq);
706           while ((kn = TAILQ_FIRST(&kq->kq_knlist)) != NULL) {
707                     if (knote_acquire(kn))
708                               knote_detach_and_drop(kn);
709           }
710           lwkt_relpooltoken(kq);
711 
712           if (kq->kq_knhash) {
713                     hashdestroy(kq->kq_knhash, M_KQUEUE, kq->kq_knhashmask);
714                     kq->kq_knhash = NULL;
715                     kq->kq_knhashmask = 0;
716           }
717 }
718 
719 /*
720  * MPSAFE
721  */
722 int
sys_kqueue(struct sysmsg * sysmsg,const struct kqueue_args * uap)723 sys_kqueue(struct sysmsg *sysmsg, const struct kqueue_args *uap)
724 {
725           struct thread *td = curthread;
726           struct kqueue *kq;
727           struct file *fp;
728           int fd, error;
729 
730           error = falloc(td->td_lwp, &fp, &fd);
731           if (error)
732                     return (error);
733           fp->f_flag = FREAD | FWRITE;
734           fp->f_type = DTYPE_KQUEUE;
735           fp->f_ops = &kqueueops;
736 
737           kq = kmalloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO);
738           kqueue_init(kq, td->td_proc->p_fd);
739           fp->f_data = kq;
740 
741           fsetfd(kq->kq_fdp, fp, fd);
742           sysmsg->sysmsg_result = fd;
743           fdrop(fp);
744           return (0);
745 }
746 
747 /*
748  * Copy 'count' items into the destination list pointed to by uap->eventlist.
749  */
750 static int
kevent_copyout(void * arg,struct kevent * kevp,int count,int * res)751 kevent_copyout(void *arg, struct kevent *kevp, int count, int *res)
752 {
753           struct kevent_copyin_args *kap;
754           int error;
755 
756           kap = (struct kevent_copyin_args *)arg;
757 
758           error = copyout(kevp, kap->eventlist, count * sizeof(*kevp));
759           if (error == 0) {
760                     kap->eventlist += count;
761                     *res += count;
762           } else {
763                     *res = -1;
764           }
765 
766           return (error);
767 }
768 
769 /*
770  * Copy at most 'max' items from the list pointed to by kap->changelist,
771  * return number of items in 'events'.
772  */
773 static int
kevent_copyin(void * arg,struct kevent * kevp,int max,int * events)774 kevent_copyin(void *arg, struct kevent *kevp, int max, int *events)
775 {
776           struct kevent_copyin_args *kap;
777           int error, count;
778 
779           kap = (struct kevent_copyin_args *)arg;
780 
781           count = min(kap->ka->nchanges - kap->pchanges, max);
782           error = copyin(kap->changelist, kevp, count * sizeof *kevp);
783           if (error == 0) {
784                     kap->changelist += count;
785                     kap->pchanges += count;
786                     *events = count;
787           }
788 
789           return (error);
790 }
791 
792 /*
793  * MPSAFE
794  */
795 int
kern_kevent(struct kqueue * kq,int nevents,int * res,void * uap,k_copyin_fn kevent_copyinfn,k_copyout_fn kevent_copyoutfn,struct timespec * tsp_in,int flags)796 kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
797               k_copyin_fn kevent_copyinfn, k_copyout_fn kevent_copyoutfn,
798               struct timespec *tsp_in, int flags)
799 {
800           struct kevent *kevp;
801           struct timespec *tsp, ats;
802           int i, n, total, error, nerrors = 0;
803           int gobbled;
804           int lres;
805           int limit = kq_checkloop;
806           int closedcounter;
807           struct kevent kev[KQ_NEVENTS];
808           struct knote marker;
809           struct lwkt_token *tok;
810 
811           if (tsp_in == NULL || tsp_in->tv_sec || tsp_in->tv_nsec)
812                     atomic_set_int(&curthread->td_mpflags, TDF_MP_BATCH_DEMARC);
813 
814           tsp = tsp_in;
815           *res = 0;
816 
817           closedcounter = kq->kq_fdp->fd_closedcounter;
818 
819           for (;;) {
820                     n = 0;
821                     error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
822                     if (error)
823                               return error;
824                     if (n == 0)
825                               break;
826                     for (i = 0; i < n; ++i)
827                               kev[i].flags &= ~EV_SYSFLAGS;
828                     for (i = 0; i < n; ++i) {
829                               gobbled = n - i;
830 
831                               error = kqueue_register(kq, &kev[i], &gobbled, flags);
832                               i += gobbled - 1;
833                               kevp = &kev[i];
834 
835                               /*
836                                * If a registration returns an error we
837                                * immediately post the error.  The kevent()
838                                * call itself will fail with the error if
839                                * no space is available for posting.
840                                *
841                                * Such errors normally bypass the timeout/blocking
842                                * code.  However, if the copyoutfn function refuses
843                                * to post the error (see sys_poll()), then we
844                                * ignore it too.
845                                */
846                               if (error || (kevp->flags & EV_RECEIPT)) {
847                                         kevp->flags = EV_ERROR;
848                                         kevp->data = error;
849                                         lres = *res;
850                                         kevent_copyoutfn(uap, kevp, 1, res);
851                                         if (*res < 0) {
852                                                   return error;
853                                         } else if (lres != *res) {
854                                                   nevents--;
855                                                   nerrors++;
856                                         }
857                               }
858                     }
859           }
860           if (nerrors)
861                     return 0;
862 
863           /*
864            * Acquire/wait for events - setup timeout
865            *
866            * If no timeout specified clean up the run path by clearing the
867            * PRECISE flag.
868            */
869           if (tsp != NULL) {
870                     if (tsp->tv_sec || tsp->tv_nsec) {
871                               getnanouptime(&ats);
872                               timespecadd(tsp, &ats, tsp);  /* tsp = target time */
873                     }
874           } else {
875                     flags &= ~KEVENT_TIMEOUT_PRECISE;
876           }
877 
878           /*
879            * Loop as required.
880            *
881            * Collect as many events as we can. Sleeping on successive
882            * loops is disabled if copyoutfn has incremented (*res).
883            *
884            * The loop stops if an error occurs, all events have been
885            * scanned (the marker has been reached), or fewer than the
886            * maximum number of events is found.
887            *
888            * The copyoutfn function does not have to increment (*res) in
889            * order for the loop to continue.
890            *
891            * NOTE: doselect() usually passes 0x7FFFFFFF for nevents.
892            */
893           total = 0;
894           error = 0;
895           marker.kn_filter = EVFILT_MARKER;
896           marker.kn_status = KN_PROCESSING;
897 
898           tok = lwkt_token_pool_lookup(kq);
899           flags = (flags & ~KEVENT_SCAN_MASK) | KEVENT_SCAN_INSERT_MARKER;
900 
901           while ((n = nevents - total) > 0) {
902                     if (n > KQ_NEVENTS)
903                               n = KQ_NEVENTS;
904 
905                     /*
906                      * Process all received events
907                      * Account for all non-spurious events in our total
908                      */
909                     i = kqueue_scan(kq, kev, n, &marker, closedcounter, flags);
910                     flags = (flags & ~KEVENT_SCAN_MASK) | KEVENT_SCAN_KEEP_MARKER;
911                     if (i) {
912                               lres = *res;
913                               error = kevent_copyoutfn(uap, kev, i, res);
914                               total += *res - lres;
915                               if (error)
916                                         break;
917                     }
918                     if (limit && --limit == 0)
919                               panic("kqueue: checkloop failed i=%d", i);
920 
921                     /*
922                      * Normally when fewer events are returned than requested
923                      * we can stop.  However, if only spurious events were
924                      * collected the copyout will not bump (*res) and we have
925                      * to continue.
926                      */
927                     if (i < n && *res)
928                               break;
929 
930                     /*
931                      * If no events were recorded (no events happened or the events
932                      * that did happen were all spurious), block until an event
933                      * occurs or the timeout occurs and reload the marker.
934                      *
935                      * If we saturated n (i == n) loop up without sleeping to
936                      * continue processing the list.
937                      */
938                     if (i != n && kq->kq_count == 0 && *res == 0) {
939                               int timeout;
940                               int ustimeout;
941 
942                               if (tsp == NULL) {
943                                         timeout = 0;
944                                         ustimeout = 0;
945                               } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
946                                         error = EWOULDBLOCK;
947                                         break;
948                               } else {
949                                         struct timespec atx = *tsp;
950 
951                                         getnanouptime(&ats);
952                                         timespecsub(&atx, &ats, &atx);
953                                         if (atx.tv_sec < 0 ||
954                                             (atx.tv_sec == 0 && atx.tv_nsec <= 0)) {
955                                                   error = EWOULDBLOCK;
956                                                   break;
957                                         }
958                                         if (flags & KEVENT_TIMEOUT_PRECISE) {
959                                                   if (atx.tv_sec == 0 &&
960                                                       atx.tv_nsec < kq_sleep_threshold) {
961                                                             ustimeout = kq_sleep_threshold /
962                                                                           1000;
963                                                   } else if (atx.tv_sec < 60) {
964                                                             ustimeout =
965                                                                       atx.tv_sec * 1000000 +
966                                                                       atx.tv_nsec / 1000;
967                                                   } else {
968                                                             ustimeout = 60 * 1000000;
969                                                   }
970                                                   if (ustimeout == 0)
971                                                             ustimeout = 1;
972                                                   timeout = 0;
973                                         } else if (atx.tv_sec > 60 * 60) {
974                                                   timeout = 60 * 60 * hz;
975                                                   ustimeout = 0;
976                                         } else {
977                                                   timeout = tstohz_high(&atx);
978                                                   ustimeout = 0;
979                                         }
980                               }
981 
982                               lwkt_gettoken(tok);
983                               if (kq->kq_count == 0) {
984                                         kq->kq_sleep_cnt++;
985                                         if (__predict_false(kq->kq_sleep_cnt == 0)) {
986                                                   /*
987                                                    * Guard against possible wrapping.  And
988                                                    * set it to 2, so that kqueue_wakeup()
989                                                    * can wake everyone up.
990                                                    */
991                                                   kq->kq_sleep_cnt = 2;
992                                         }
993                                         if (flags & KEVENT_TIMEOUT_PRECISE) {
994                                                   error = precise_sleep(kq, PCATCH,
995                                                                       "kqread", ustimeout);
996                                         } else {
997                                                   error = tsleep(kq, PCATCH,
998                                                                       "kqread", timeout);
999                                         }
1000 
1001                                         /* don't restart after signals... */
1002                                         if (error == ERESTART)
1003                                                   error = EINTR;
1004                                         if (error == EWOULDBLOCK)
1005                                                   error = 0;
1006                                         if (error) {
1007                                                   lwkt_reltoken(tok);
1008                                                   break;
1009                                         }
1010                                         flags = (flags & ~KEVENT_SCAN_MASK) |
1011                                                   KEVENT_SCAN_RELOAD_MARKER;
1012                               }
1013                               lwkt_reltoken(tok);
1014                     }
1015 
1016                     /*
1017                      * Deal with an edge case where spurious events can cause
1018                      * a loop to occur without moving the marker.  This can
1019                      * prevent kqueue_scan() from picking up new events which
1020                      * race us.  We must be sure to move the marker for this
1021                      * case.
1022                      *
1023                      * NOTE: We do not want to move the marker if events
1024                      *         were scanned because normal kqueue operations
1025                      *         may reactivate events.  Moving the marker in
1026                      *         that case could result in duplicates for the
1027                      *         same event.
1028                      */
1029                     if (i == 0) {
1030                               flags = (flags & ~KEVENT_SCAN_MASK) |
1031                                         KEVENT_SCAN_RELOAD_MARKER;
1032                     }
1033           }
1034 
1035           /*
1036            * Remove the marker
1037            */
1038           if ((flags & KEVENT_SCAN_INSERT_MARKER) == 0) {
1039                     lwkt_gettoken(tok);
1040                     TAILQ_REMOVE(&kq->kq_knpend, &marker, kn_tqe);
1041                     lwkt_reltoken(tok);
1042           }
1043 
1044           /* Timeouts do not return EWOULDBLOCK. */
1045           if (error == EWOULDBLOCK)
1046                     error = 0;
1047           return error;
1048 }
1049 
1050 /*
1051  * MPALMOSTSAFE
1052  */
1053 int
sys_kevent(struct sysmsg * sysmsg,const struct kevent_args * uap)1054 sys_kevent(struct sysmsg *sysmsg, const struct kevent_args *uap)
1055 {
1056           struct thread *td = curthread;
1057           struct timespec ts, *tsp;
1058           struct kqueue *kq;
1059           struct file *fp = NULL;
1060           struct kevent_copyin_args *kap, ka;
1061           int error;
1062 
1063           if (uap->timeout) {
1064                     error = copyin(uap->timeout, &ts, sizeof(ts));
1065                     if (error)
1066                               return (error);
1067                     tsp = &ts;
1068           } else {
1069                     tsp = NULL;
1070           }
1071           fp = holdfp(td, uap->fd, -1);
1072           if (fp == NULL)
1073                     return (EBADF);
1074           if (fp->f_type != DTYPE_KQUEUE) {
1075                     fdrop(fp);
1076                     return (EBADF);
1077           }
1078 
1079           kq = (struct kqueue *)fp->f_data;
1080 
1081           kap = &ka;
1082           kap->ka = uap;
1083           kap->pchanges = 0;
1084           kap->eventlist = uap->eventlist;
1085           kap->changelist = uap->changelist;
1086 
1087           error = kern_kevent(kq, uap->nevents, &sysmsg->sysmsg_result, kap,
1088                                   kevent_copyin, kevent_copyout, tsp, 0);
1089 
1090           dropfp(td, uap->fd, fp);
1091 
1092           return (error);
1093 }
1094 
1095 /*
1096  * Efficiently load multiple file pointers.  This significantly reduces
1097  * threaded overhead.  When doing simple polling we can depend on the
1098  * per-thread (fd,fp) cache.  With more descriptors, we batch.
1099  */
1100 static
1101 void
floadkevfps(thread_t td,struct filedesc * fdp,struct kevent * kev,struct file ** fp,int climit)1102 floadkevfps(thread_t td, struct filedesc *fdp, struct kevent *kev,
1103               struct file **fp, int climit)
1104 {
1105           struct filterops *fops;
1106           int tdcache;
1107 
1108           if (climit <= 2 && td->td_proc && td->td_proc->p_fd == fdp) {
1109                     tdcache = 1;
1110           } else {
1111                     tdcache = 0;
1112                     spin_lock_shared(&fdp->fd_spin);
1113           }
1114 
1115           while (climit) {
1116                     *fp = NULL;
1117                     if (kev->filter < 0 &&
1118                         kev->filter + EVFILT_SYSCOUNT >= 0) {
1119                               fops = sysfilt_ops[~kev->filter];
1120                               if (fops->f_flags & FILTEROP_ISFD) {
1121                                         if (tdcache) {
1122                                                   *fp = holdfp(td, kev->ident, -1);
1123                                         } else {
1124                                                   *fp = holdfp_fdp_locked(fdp,
1125                                                                                 kev->ident, -1);
1126                                         }
1127                               }
1128                     }
1129                     --climit;
1130                     ++fp;
1131                     ++kev;
1132           }
1133           if (tdcache == 0)
1134                     spin_unlock_shared(&fdp->fd_spin);
1135 }
1136 
1137 /*
1138  * Register up to *countp kev's.  Always registers at least 1.
1139  *
1140  * The number registered is returned in *countp.
1141  *
1142  * If an error occurs or a kev is flagged EV_RECEIPT, it is
1143  * processed and included in *countp, and processing then
1144  * stops.
1145  *
1146  * If flags contains KEVENT_UNIQUE_NOTES, kev->data contains an identifier
1147  * to further distinguish knotes which might otherwise have the same kq,
1148  * ident, and filter (used by *poll() because multiple pfds are allowed to
1149  * reference the same descriptor and implied kq filter).  kev->data is
1150  * implied to be zero for event processing when this flag is set.
1151  */
1152 int
kqueue_register(struct kqueue * kq,struct kevent * kev,int * countp,int flags)1153 kqueue_register(struct kqueue *kq, struct kevent *kev, int *countp, int flags)
1154 {
1155           struct filedesc *fdp = kq->kq_fdp;
1156           struct klist *list = NULL;
1157           struct filterops *fops;
1158           struct file *fp[KQ_NEVENTS];
1159           struct knote *kn = NULL;
1160           struct thread *td;
1161           int error;
1162           int count;
1163           int climit;
1164           int closedcounter;
1165           int uniqifier = 0;
1166           struct knote_cache_list *cache_list;
1167 
1168           td = curthread;
1169           climit = *countp;
1170           if (climit > KQ_NEVENTS)
1171                     climit = KQ_NEVENTS;
1172           closedcounter = fdp->fd_closedcounter;
1173           floadkevfps(td, fdp, kev, fp, climit);
1174 
1175           lwkt_getpooltoken(kq);
1176           count = 0;
1177           error = 0;
1178 
1179           /*
1180            * To avoid races, only one thread can register events on this
1181            * kqueue at a time.
1182            */
1183           while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) {
1184                     kq->kq_state |= KQ_REGWAIT;
1185                     tsleep(&kq->kq_regtd, 0, "kqreg", 0);
1186           }
1187           if (__predict_false(kq->kq_regtd != NULL)) {
1188                     /* Recursive calling of kqueue_register() */
1189                     td = NULL;
1190           } else {
1191                     /* Owner of the kq_regtd, i.e. td != NULL */
1192                     kq->kq_regtd = td;
1193           }
1194 
1195 loop:
1196           /*
1197            * knote uniqifiers are used by *poll() because there may be
1198            * multiple pfd[] entries for the same descriptor and filter.
1199            * The unique id is stored in kev->data and kev->data for the
1200            * kevent is implied to be zero.
1201            */
1202           if (flags & KEVENT_UNIQUE_NOTES) {
1203                     uniqifier = kev->data;
1204                     kev->data = 0;
1205           }
1206 
1207           if (kev->filter < 0) {
1208                     if (kev->filter + EVFILT_SYSCOUNT < 0) {
1209                               error = EINVAL;
1210                               ++count;
1211                               goto done;
1212                     }
1213                     fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
1214           } else {
1215                     /*
1216                      * XXX
1217                      * filter attach routine is responsible for insuring that
1218                      * the identifier can be attached to it.
1219                      */
1220                     error = EINVAL;
1221                     ++count;
1222                     goto done;
1223           }
1224 
1225           if (fops->f_flags & FILTEROP_ISFD) {
1226                     /* validate descriptor */
1227                     if (fp[count] == NULL) {
1228                               error = EBADF;
1229                               ++count;
1230                               goto done;
1231                     }
1232           }
1233 
1234           cache_list = &knote_cache_lists[mycpuid];
1235           if (SLIST_EMPTY(&cache_list->knote_cache)) {
1236                     struct knote *new_kn;
1237 
1238                     new_kn = knote_alloc();
1239                     crit_enter();
1240                     SLIST_INSERT_HEAD(&cache_list->knote_cache, new_kn, kn_link);
1241                     cache_list->knote_cache_cnt++;
1242                     crit_exit();
1243           }
1244 
1245           if (fp[count] != NULL) {
1246                     list = &fp[count]->f_klist;
1247           } else if (kq->kq_knhashmask) {
1248                     list = &kq->kq_knhash[
1249                                   KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1250           }
1251           if (list != NULL) {
1252                     lwkt_getpooltoken(list);
1253 again:
1254                     SLIST_FOREACH(kn, list, kn_link) {
1255                               if (kn->kn_kq == kq &&
1256                                   kn->kn_filter == kev->filter &&
1257                                   kn->kn_id == kev->ident &&
1258                                   kn->kn_uniqifier == uniqifier)
1259                               {
1260                                         if (knote_acquire(kn) == 0)
1261                                                   goto again;
1262                                         break;
1263                               }
1264                     }
1265                     lwkt_relpooltoken(list);
1266           }
1267 
1268           /*
1269            * NOTE: At this point if kn is non-NULL we will have acquired
1270            *         it and set KN_PROCESSING.
1271            */
1272           if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
1273                     error = ENOENT;
1274                     ++count;
1275                     goto done;
1276           }
1277 
1278           /*
1279            * kn now contains the matching knote, or NULL if no match
1280            */
1281           if (kev->flags & EV_ADD) {
1282                     if (kn == NULL) {
1283                               crit_enter();
1284                               kn = SLIST_FIRST(&cache_list->knote_cache);
1285                               if (kn == NULL) {
1286                                         crit_exit();
1287                                         kn = knote_alloc();
1288                               } else {
1289                                         SLIST_REMOVE_HEAD(&cache_list->knote_cache,
1290                                             kn_link);
1291                                         cache_list->knote_cache_cnt--;
1292                                         crit_exit();
1293                               }
1294                               kn->kn_fp = fp[count];
1295                               kn->kn_kq = kq;
1296                               kn->kn_fop = fops;
1297                               kn->kn_uniqifier = uniqifier;
1298 
1299                               /*
1300                                * apply reference count to knote structure, and
1301                                * do not release it at the end of this routine.
1302                                */
1303                               fp[count] = NULL;   /* safety */
1304 
1305                               kn->kn_sfflags = kev->fflags;
1306                               kn->kn_sdata = kev->data;
1307                               kev->fflags = 0;
1308                               kev->data = 0;
1309                               kn->kn_kevent = *kev;
1310 
1311                               /*
1312                                * KN_PROCESSING prevents the knote from getting
1313                                * ripped out from under us while we are trying
1314                                * to attach it, in case the attach blocks.
1315                                */
1316                               kn->kn_status = KN_PROCESSING;
1317                               knote_attach(kn);
1318                               if ((error = filter_attach(kn)) != 0) {
1319                                         kn->kn_status |= KN_DELETING | KN_REPROCESS;
1320                                         knote_drop(kn);
1321                                         ++count;
1322                                         goto done;
1323                               }
1324 
1325                               /*
1326                                * Interlock against close races which either tried
1327                                * to remove our knote while we were blocked or missed
1328                                * it entirely prior to our attachment.  We do not
1329                                * want to end up with a knote on a closed descriptor.
1330                                */
1331                               if ((fops->f_flags & FILTEROP_ISFD) &&
1332                                   checkfdclosed(curthread, fdp, kev->ident, kn->kn_fp,
1333                                                     closedcounter)) {
1334                                         kn->kn_status |= KN_DELETING | KN_REPROCESS;
1335                               }
1336                     } else {
1337                               /*
1338                                * The user may change some filter values after the
1339                                * initial EV_ADD, but doing so will not reset any
1340                                * filter which have already been triggered.
1341                                */
1342                               KKASSERT(kn->kn_status & KN_PROCESSING);
1343                               if (fops == &user_filtops) {
1344                                         filt_usertouch(kn, kev, EVENT_REGISTER);
1345                               } else {
1346                                         kn->kn_sfflags = kev->fflags;
1347                                         kn->kn_sdata = kev->data;
1348                                         kn->kn_kevent.udata = kev->udata;
1349                               }
1350                     }
1351 
1352                     /*
1353                      * Execute the filter event to immediately activate the
1354                      * knote if necessary.  If reprocessing events are pending
1355                      * due to blocking above we do not run the filter here
1356                      * but instead let knote_release() do it.  Otherwise we
1357                      * might run the filter on a deleted event.
1358                      */
1359                     if ((kn->kn_status & KN_REPROCESS) == 0) {
1360                               if (filter_event(kn, 0))
1361                                         KNOTE_ACTIVATE(kn);
1362                     }
1363           } else if (kev->flags & EV_DELETE) {
1364                     /*
1365                      * Delete the existing knote
1366                      */
1367                     knote_detach_and_drop(kn);
1368                     error = 0;
1369                     ++count;
1370                     goto done;
1371           } else {
1372                     /*
1373                      * Modify an existing event.
1374                      *
1375                      * The user may change some filter values after the
1376                      * initial EV_ADD, but doing so will not reset any
1377                      * filter which have already been triggered.
1378                      */
1379                     KKASSERT(kn->kn_status & KN_PROCESSING);
1380                     if (fops == &user_filtops) {
1381                               filt_usertouch(kn, kev, EVENT_REGISTER);
1382                     } else {
1383                               kn->kn_sfflags = kev->fflags;
1384                               kn->kn_sdata = kev->data;
1385                               kn->kn_kevent.udata = kev->udata;
1386                     }
1387 
1388                     /*
1389                      * Execute the filter event to immediately activate the
1390                      * knote if necessary.  If reprocessing events are pending
1391                      * due to blocking above we do not run the filter here
1392                      * but instead let knote_release() do it.  Otherwise we
1393                      * might run the filter on a deleted event.
1394                      */
1395                     if ((kn->kn_status & KN_REPROCESS) == 0) {
1396                               if (filter_event(kn, 0))
1397                                         KNOTE_ACTIVATE(kn);
1398                     }
1399           }
1400 
1401           /*
1402            * Disablement does not deactivate a knote here.
1403            */
1404           if ((kev->flags & EV_DISABLE) &&
1405               ((kn->kn_status & KN_DISABLED) == 0))
1406           {
1407                     kn->kn_status |= KN_DISABLED;
1408           }
1409 
1410           /*
1411            * Re-enablement may have to immediately enqueue an active knote.
1412            */
1413           if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
1414                     kn->kn_status &= ~KN_DISABLED;
1415                     if ((kn->kn_status & KN_ACTIVE) &&
1416                         ((kn->kn_status & KN_QUEUED) == 0))
1417                     {
1418                               knote_enqueue(kn);
1419                     }
1420           }
1421 
1422           /*
1423            * Handle any required reprocessing
1424            */
1425           knote_release(kn);
1426           /* kn may be invalid now */
1427 
1428           /*
1429            * Loop control.  We stop on errors (above), and also stop after
1430            * processing EV_RECEIPT, so the caller can process it.
1431            */
1432           ++count;
1433           if (kev->flags & EV_RECEIPT) {
1434                     error = 0;
1435                     goto done;
1436           }
1437           ++kev;
1438           if (count < climit) {
1439                     if (fp[count-1])              /* drop unprocessed fp */
1440                               fdrop(fp[count-1]);
1441                     goto loop;
1442           }
1443 
1444           /*
1445            * Cleanup
1446            */
1447 done:
1448           if (td != NULL) { /* Owner of the kq_regtd */
1449                     kq->kq_regtd = NULL;
1450                     if (__predict_false(kq->kq_state & KQ_REGWAIT)) {
1451                               kq->kq_state &= ~KQ_REGWAIT;
1452                               wakeup(&kq->kq_regtd);
1453                     }
1454           }
1455           lwkt_relpooltoken(kq);
1456 
1457           /*
1458            * Drop unprocessed file pointers
1459            */
1460           *countp = count;
1461           if (count && fp[count-1])
1462                     fdrop(fp[count-1]);
1463           while (count < climit) {
1464                     if (fp[count])
1465                               fdrop(fp[count]);
1466                     ++count;
1467           }
1468           return (error);
1469 }
1470 
1471 /*
1472  * Scan the kqueue, return the number of active events placed in kevp up
1473  * to count.
1474  *
1475  * Continuous mode events may get recycled, do not continue scanning past
1476  * marker unless no events have been collected.
1477  */
1478 static int
kqueue_scan(struct kqueue * kq,struct kevent * kevp,int count,struct knote * marker,int closedcounter,int flags)1479 kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
1480             struct knote *marker, int closedcounter, int flags)
1481 {
1482           struct knote *kn, local_marker;
1483           thread_t td = curthread;
1484           int total;
1485 
1486           total = 0;
1487           local_marker.kn_filter = EVFILT_MARKER;
1488           local_marker.kn_status = KN_PROCESSING;
1489 
1490           lwkt_getpooltoken(kq);
1491 
1492           /*
1493            * Adjust marker, insert initial marker, or leave the marker alone.
1494            *
1495            * Also setup our local_marker.
1496            */
1497           switch(flags & KEVENT_SCAN_MASK) {
1498           case KEVENT_SCAN_RELOAD_MARKER:
1499                     TAILQ_REMOVE(&kq->kq_knpend, marker, kn_tqe);
1500                     /* fall through */
1501           case KEVENT_SCAN_INSERT_MARKER:
1502                     TAILQ_INSERT_TAIL(&kq->kq_knpend, marker, kn_tqe);
1503                     break;
1504           }
1505           TAILQ_INSERT_HEAD(&kq->kq_knpend, &local_marker, kn_tqe);
1506 
1507           /*
1508            * Collect events.
1509            */
1510           while (count) {
1511                     kn = TAILQ_NEXT(&local_marker, kn_tqe);
1512                     if (kn->kn_filter == EVFILT_MARKER) {
1513                               /* Marker reached, we are done */
1514                               if (kn == marker)
1515                                         break;
1516 
1517                               /* Move local marker past some other threads marker */
1518                               kn = TAILQ_NEXT(kn, kn_tqe);
1519                               TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
1520                               TAILQ_INSERT_BEFORE(kn, &local_marker, kn_tqe);
1521                               continue;
1522                     }
1523 
1524                     /*
1525                      * We can't skip a knote undergoing processing, otherwise
1526                      * we risk not returning it when the user process expects
1527                      * it should be returned.  Sleep and retry.
1528                      */
1529                     if (knote_acquire(kn) == 0)
1530                               continue;
1531 
1532                     /*
1533                      * Remove the event for processing.
1534                      *
1535                      * WARNING!  We must leave KN_QUEUED set to prevent the
1536                      *             event from being KNOTE_ACTIVATE()d while
1537                      *             the queue state is in limbo, in case we
1538                      *             block.
1539                      */
1540                     TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
1541                     kq->kq_count--;
1542 
1543                     /*
1544                      * Kernel select() and poll() functions cache previous
1545                      * operations on the assumption that future operations
1546                      * will use similr descriptor sets.  This removes any
1547                      * stale entries in a way that does not require a descriptor
1548                      * lookup and is thus not affected by close() races.
1549                      *
1550                      * Do not report to *_copyout()
1551                      */
1552                     if (flags & KEVENT_AUTO_STALE) {
1553                               if ((uint64_t)kn->kn_kevent.udata <
1554                                   curthread->td_lwp->lwp_kqueue_serial)
1555                               {
1556                                         kn->kn_status |= KN_DELETING | KN_REPROCESS |
1557                                                              KN_DISABLED;
1558                               }
1559                     }
1560 
1561                     /*
1562                      * If a descriptor is close()d out from under a poll/select,
1563                      * we want to report the event but delete the note because
1564                      * the note can wind up being 'stuck' on kq_knpend.
1565                      */
1566                     if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
1567                         checkfdclosed(td, kq->kq_fdp, kn->kn_kevent.ident,
1568                                           kn->kn_fp, closedcounter))
1569                     {
1570                               kn->kn_status |= KN_DELETING | KN_REPROCESS;
1571                     }
1572 
1573                     if (kn->kn_status & KN_DISABLED) {
1574                               /*
1575                                * If disabled we ensure the event is not queued
1576                                * but leave its active bit set.  On re-enablement
1577                                * the event may be immediately triggered.
1578                                */
1579                               kn->kn_status &= ~KN_QUEUED;
1580                     } else if ((kn->kn_flags & EV_ONESHOT) == 0 &&
1581                                  (kn->kn_status & KN_DELETING) == 0 &&
1582                                  filter_event(kn, 0) == 0) {
1583                               /*
1584                                * If not running in one-shot mode and the event
1585                                * is no longer present we ensure it is removed
1586                                * from the queue and ignore it.
1587                                */
1588                               kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1589                     } else {
1590                               /*
1591                                * Post the event
1592                                */
1593                               if (kn->kn_fop == &user_filtops)
1594                                         filt_usertouch(kn, kevp, EVENT_PROCESS);
1595                               else
1596                                         *kevp = kn->kn_kevent;
1597                               ++kevp;
1598                               ++total;
1599                               --count;
1600 
1601                               if (kn->kn_flags & EV_ONESHOT) {
1602                                         kn->kn_status &= ~KN_QUEUED;
1603                                         kn->kn_status |= KN_DELETING | KN_REPROCESS;
1604                               } else {
1605                                         if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
1606                                                   if (kn->kn_flags & EV_CLEAR) {
1607                                                             kn->kn_data = 0;
1608                                                             kn->kn_fflags = 0;
1609                                                   }
1610                                                   if (kn->kn_flags & EV_DISPATCH) {
1611                                                             kn->kn_status |= KN_DISABLED;
1612                                                   }
1613                                                   kn->kn_status &= ~(KN_QUEUED |
1614                                                                          KN_ACTIVE);
1615                                         } else {
1616                                                   TAILQ_INSERT_TAIL(&kq->kq_knpend,
1617                                                                         kn,
1618                                                                         kn_tqe);
1619                                                   kq->kq_count++;
1620                                         }
1621                               }
1622                     }
1623 
1624                     /*
1625                      * Handle any post-processing states
1626                      */
1627                     knote_release(kn);
1628           }
1629           TAILQ_REMOVE(&kq->kq_knpend, &local_marker, kn_tqe);
1630 
1631           lwkt_relpooltoken(kq);
1632           return (total);
1633 }
1634 
1635 /*
1636  * XXX
1637  * This could be expanded to call kqueue_scan, if desired.
1638  *
1639  * MPSAFE
1640  */
1641 static int
kqueue_read(struct file * fp,struct uio * uio,struct ucred * cred,int flags)1642 kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
1643 {
1644           return (ENXIO);
1645 }
1646 
1647 /*
1648  * MPSAFE
1649  */
1650 static int
kqueue_write(struct file * fp,struct uio * uio,struct ucred * cred,int flags)1651 kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
1652 {
1653           return (ENXIO);
1654 }
1655 
1656 /*
1657  * MPALMOSTSAFE
1658  */
1659 static int
kqueue_ioctl(struct file * fp,u_long com,caddr_t data,struct ucred * cred,struct sysmsg * msg)1660 kqueue_ioctl(struct file *fp, u_long com, caddr_t data,
1661                struct ucred *cred, struct sysmsg *msg)
1662 {
1663           struct kqueue *kq;
1664           int error;
1665 
1666           kq = (struct kqueue *)fp->f_data;
1667           lwkt_getpooltoken(kq);
1668           switch(com) {
1669           case FIOASYNC:
1670                     if (*(int *)data)
1671                               kq->kq_state |= KQ_ASYNC;
1672                     else
1673                               kq->kq_state &= ~KQ_ASYNC;
1674                     error = 0;
1675                     break;
1676           case FIOSETOWN:
1677                     error = fsetown(*(int *)data, &kq->kq_sigio);
1678                     break;
1679           default:
1680                     error = ENOTTY;
1681                     break;
1682           }
1683           lwkt_relpooltoken(kq);
1684           return (error);
1685 }
1686 
1687 /*
1688  * MPSAFE
1689  */
1690 static int
kqueue_stat(struct file * fp,struct stat * st,struct ucred * cred)1691 kqueue_stat(struct file *fp, struct stat *st, struct ucred *cred)
1692 {
1693           struct kqueue *kq = (struct kqueue *)fp->f_data;
1694 
1695           bzero((void *)st, sizeof(*st));
1696           st->st_size = kq->kq_count;
1697           st->st_blksize = sizeof(struct kevent);
1698           st->st_mode = S_IFIFO;
1699           return (0);
1700 }
1701 
1702 /*
1703  * MPSAFE
1704  */
1705 static int
kqueue_close(struct file * fp)1706 kqueue_close(struct file *fp)
1707 {
1708           struct kqueue *kq = (struct kqueue *)fp->f_data;
1709 
1710           kqueue_terminate(kq);
1711 
1712           fp->f_data = NULL;
1713           funsetown(&kq->kq_sigio);
1714 
1715           kfree(kq, M_KQUEUE);
1716           return (0);
1717 }
1718 
1719 static void
kqueue_wakeup(struct kqueue * kq)1720 kqueue_wakeup(struct kqueue *kq)
1721 {
1722           if (kq->kq_sleep_cnt) {
1723                     u_int sleep_cnt = kq->kq_sleep_cnt;
1724 
1725                     kq->kq_sleep_cnt = 0;
1726                     if (sleep_cnt == 1)
1727                               wakeup_one(kq);
1728                     else
1729                               wakeup(kq);
1730           }
1731           KNOTE(&kq->kq_kqinfo.ki_note, 0);
1732 }
1733 
1734 /*
1735  * Calls filterops f_attach function, acquiring mplock if filter is not
1736  * marked as FILTEROP_MPSAFE.
1737  *
1738  * Caller must be holding the related kq token
1739  */
1740 static int
filter_attach(struct knote * kn)1741 filter_attach(struct knote *kn)
1742 {
1743           int ret;
1744 
1745           if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1746                     ret = kn->kn_fop->f_attach(kn);
1747           } else {
1748                     get_mplock();
1749                     ret = kn->kn_fop->f_attach(kn);
1750                     rel_mplock();
1751           }
1752           return (ret);
1753 }
1754 
1755 /*
1756  * Detach the knote and drop it, destroying the knote.
1757  *
1758  * Calls filterops f_detach function, acquiring mplock if filter is not
1759  * marked as FILTEROP_MPSAFE.
1760  *
1761  * Caller must be holding the related kq token
1762  */
1763 static void
knote_detach_and_drop(struct knote * kn)1764 knote_detach_and_drop(struct knote *kn)
1765 {
1766           kn->kn_status |= KN_DELETING | KN_REPROCESS;
1767           if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1768                     kn->kn_fop->f_detach(kn);
1769           } else {
1770                     get_mplock();
1771                     kn->kn_fop->f_detach(kn);
1772                     rel_mplock();
1773           }
1774           knote_drop(kn);
1775 }
1776 
1777 /*
1778  * Calls filterops f_event function, acquiring mplock if filter is not
1779  * marked as FILTEROP_MPSAFE.
1780  *
1781  * If the knote is in the middle of being created or deleted we cannot
1782  * safely call the filter op.
1783  *
1784  * Caller must be holding the related kq token
1785  */
1786 static int
filter_event(struct knote * kn,long hint)1787 filter_event(struct knote *kn, long hint)
1788 {
1789           int ret;
1790 
1791           if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
1792                     ret = kn->kn_fop->f_event(kn, hint);
1793           } else {
1794                     get_mplock();
1795                     ret = kn->kn_fop->f_event(kn, hint);
1796                     rel_mplock();
1797           }
1798           return (ret);
1799 }
1800 
1801 /*
1802  * Walk down a list of knotes, activating them if their event has triggered.
1803  *
1804  * If we encounter any knotes which are undergoing processing we just mark
1805  * them for reprocessing and do not try to [re]activate the knote.  However,
1806  * if a hint is being passed we have to wait and that makes things a bit
1807  * sticky.
1808  */
1809 void
knote(struct klist * list,long hint)1810 knote(struct klist *list, long hint)
1811 {
1812           struct kqueue *kq;
1813           struct knote *kn;
1814           struct knote *kntmp;
1815 
1816           lwkt_getpooltoken(list);
1817 restart:
1818           SLIST_FOREACH(kn, list, kn_next) {
1819                     kq = kn->kn_kq;
1820                     lwkt_getpooltoken(kq);
1821 
1822                     /* temporary verification hack */
1823                     SLIST_FOREACH(kntmp, list, kn_next) {
1824                               if (kn == kntmp)
1825                                         break;
1826                     }
1827                     if (kn != kntmp || kn->kn_kq != kq) {
1828                               lwkt_relpooltoken(kq);
1829                               goto restart;
1830                     }
1831 
1832                     if (kn->kn_status & KN_PROCESSING) {
1833                               /*
1834                                * Someone else is processing the knote, ask the
1835                                * other thread to reprocess it and don't mess
1836                                * with it otherwise.
1837                                */
1838                               if (hint == 0) {
1839                                         kn->kn_status |= KN_REPROCESS;
1840                                         lwkt_relpooltoken(kq);
1841                                         continue;
1842                               }
1843 
1844                               /*
1845                                * If the hint is non-zero we have to wait or risk
1846                                * losing the state the caller is trying to update.
1847                                *
1848                                * XXX This is a real problem, certain process
1849                                *     and signal filters will bump kn_data for
1850                                *     already-processed notes more than once if
1851                                *     we restart the list scan.  FIXME.
1852                                */
1853                               kn->kn_status |= KN_WAITING | KN_REPROCESS;
1854                               tsleep(kn, 0, "knotec", hz);
1855                               lwkt_relpooltoken(kq);
1856                               goto restart;
1857                     }
1858 
1859                     /*
1860                      * Become the reprocessing master ourselves.
1861                      *
1862                      * If hint is non-zero running the event is mandatory
1863                      * when not deleting so do it whether reprocessing is
1864                      * set or not.
1865                      */
1866                     kn->kn_status |= KN_PROCESSING;
1867                     if ((kn->kn_status & KN_DELETING) == 0) {
1868                               if (filter_event(kn, hint))
1869                                         KNOTE_ACTIVATE(kn);
1870                     }
1871                     if (knote_release(kn)) {
1872                               lwkt_relpooltoken(kq);
1873                               goto restart;
1874                     }
1875                     lwkt_relpooltoken(kq);
1876           }
1877           lwkt_relpooltoken(list);
1878 }
1879 
1880 /*
1881  * Insert knote at head of klist.
1882  *
1883  * This function may only be called via a filter function and thus
1884  * kq_token should already be held and marked for processing.
1885  */
1886 void
knote_insert(struct klist * klist,struct knote * kn)1887 knote_insert(struct klist *klist, struct knote *kn)
1888 {
1889           lwkt_getpooltoken(klist);
1890           KKASSERT(kn->kn_status & KN_PROCESSING);
1891           SLIST_INSERT_HEAD(klist, kn, kn_next);
1892           lwkt_relpooltoken(klist);
1893 }
1894 
1895 /*
1896  * Remove knote from a klist
1897  *
1898  * This function may only be called via a filter function and thus
1899  * kq_token should already be held and marked for processing.
1900  */
1901 void
knote_remove(struct klist * klist,struct knote * kn)1902 knote_remove(struct klist *klist, struct knote *kn)
1903 {
1904           lwkt_getpooltoken(klist);
1905           KKASSERT(kn->kn_status & KN_PROCESSING);
1906           SLIST_REMOVE(klist, kn, knote, kn_next);
1907           lwkt_relpooltoken(klist);
1908 }
1909 
1910 void
knote_assume_knotes(struct kqinfo * src,struct kqinfo * dst,struct filterops * ops,void * hook)1911 knote_assume_knotes(struct kqinfo *src, struct kqinfo *dst,
1912                         struct filterops *ops, void *hook)
1913 {
1914           struct kqueue *kq;
1915           struct knote *kn;
1916 
1917           lwkt_getpooltoken(&src->ki_note);
1918           lwkt_getpooltoken(&dst->ki_note);
1919           while ((kn = SLIST_FIRST(&src->ki_note)) != NULL) {
1920                     kq = kn->kn_kq;
1921                     lwkt_getpooltoken(kq);
1922                     if (SLIST_FIRST(&src->ki_note) != kn || kn->kn_kq != kq) {
1923                               lwkt_relpooltoken(kq);
1924                               continue;
1925                     }
1926                     if (knote_acquire(kn)) {
1927                               knote_remove(&src->ki_note, kn);
1928                               kn->kn_fop = ops;
1929                               kn->kn_hook = hook;
1930                               knote_insert(&dst->ki_note, kn);
1931                               knote_release(kn);
1932                               /* kn may be invalid now */
1933                     }
1934                     lwkt_relpooltoken(kq);
1935           }
1936           lwkt_relpooltoken(&dst->ki_note);
1937           lwkt_relpooltoken(&src->ki_note);
1938 }
1939 
1940 /*
1941  * Remove all knotes referencing a specified fd
1942  */
1943 void
knote_fdclose(struct file * fp,struct filedesc * fdp,int fd)1944 knote_fdclose(struct file *fp, struct filedesc *fdp, int fd)
1945 {
1946           struct kqueue *kq;
1947           struct knote *kn;
1948           struct knote *kntmp;
1949 
1950           lwkt_getpooltoken(&fp->f_klist);
1951 restart:
1952           SLIST_FOREACH(kn, &fp->f_klist, kn_link) {
1953                     if (kn->kn_kq->kq_fdp == fdp && kn->kn_id == fd) {
1954                               kq = kn->kn_kq;
1955                               lwkt_getpooltoken(kq);
1956 
1957                               /* temporary verification hack */
1958                               SLIST_FOREACH(kntmp, &fp->f_klist, kn_link) {
1959                                         if (kn == kntmp)
1960                                                   break;
1961                               }
1962                               if (kn != kntmp || kn->kn_kq->kq_fdp != fdp ||
1963                                   kn->kn_id != fd || kn->kn_kq != kq) {
1964                                         lwkt_relpooltoken(kq);
1965                                         goto restart;
1966                               }
1967                               if (knote_acquire(kn))
1968                                         knote_detach_and_drop(kn);
1969                               lwkt_relpooltoken(kq);
1970                               goto restart;
1971                     }
1972           }
1973           lwkt_relpooltoken(&fp->f_klist);
1974 }
1975 
1976 /*
1977  * Low level attach function.
1978  *
1979  * The knote should already be marked for processing.
1980  * Caller must hold the related kq token.
1981  */
1982 static void
knote_attach(struct knote * kn)1983 knote_attach(struct knote *kn)
1984 {
1985           struct klist *list;
1986           struct kqueue *kq = kn->kn_kq;
1987 
1988           if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
1989                     KKASSERT(kn->kn_fp);
1990                     list = &kn->kn_fp->f_klist;
1991           } else {
1992                     if (kq->kq_knhashmask == 0)
1993                               kq->kq_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1994                                                              &kq->kq_knhashmask);
1995                     list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1996           }
1997           lwkt_getpooltoken(list);
1998           SLIST_INSERT_HEAD(list, kn, kn_link);
1999           lwkt_relpooltoken(list);
2000           TAILQ_INSERT_HEAD(&kq->kq_knlist, kn, kn_kqlink);
2001 }
2002 
2003 /*
2004  * Low level drop function.
2005  *
2006  * The knote should already be marked for processing.
2007  * Caller must hold the related kq token.
2008  */
2009 static void
knote_drop(struct knote * kn)2010 knote_drop(struct knote *kn)
2011 {
2012           struct kqueue *kq;
2013           struct klist *list;
2014 
2015           kq = kn->kn_kq;
2016 
2017           if (kn->kn_fop->f_flags & FILTEROP_ISFD)
2018                     list = &kn->kn_fp->f_klist;
2019           else
2020                     list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2021 
2022           lwkt_getpooltoken(list);
2023           SLIST_REMOVE(list, kn, knote, kn_link);
2024           lwkt_relpooltoken(list);
2025           TAILQ_REMOVE(&kq->kq_knlist, kn, kn_kqlink);
2026           if (kn->kn_status & KN_QUEUED)
2027                     knote_dequeue(kn);
2028           if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
2029                     fdrop(kn->kn_fp);
2030                     kn->kn_fp = NULL;
2031           }
2032           knote_free(kn);
2033 }
2034 
2035 /*
2036  * Low level enqueue function.
2037  *
2038  * The knote should already be marked for processing.
2039  * Caller must be holding the kq token
2040  */
2041 static void
knote_enqueue(struct knote * kn)2042 knote_enqueue(struct knote *kn)
2043 {
2044           struct kqueue *kq = kn->kn_kq;
2045 
2046           KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2047           TAILQ_INSERT_TAIL(&kq->kq_knpend, kn, kn_tqe);
2048           kn->kn_status |= KN_QUEUED;
2049           ++kq->kq_count;
2050 
2051           /*
2052            * Send SIGIO on request (typically set up as a mailbox signal)
2053            */
2054           if (kq->kq_sigio && (kq->kq_state & KQ_ASYNC) && kq->kq_count == 1)
2055                     pgsigio(kq->kq_sigio, SIGIO, 0);
2056 
2057           kqueue_wakeup(kq);
2058 }
2059 
2060 /*
2061  * Low level dequeue function.
2062  *
2063  * The knote should already be marked for processing.
2064  * Caller must be holding the kq token
2065  */
2066 static void
knote_dequeue(struct knote * kn)2067 knote_dequeue(struct knote *kn)
2068 {
2069           struct kqueue *kq = kn->kn_kq;
2070 
2071           KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2072           TAILQ_REMOVE(&kq->kq_knpend, kn, kn_tqe);
2073           kn->kn_status &= ~KN_QUEUED;
2074           kq->kq_count--;
2075 }
2076 
2077 static struct knote *
knote_alloc(void)2078 knote_alloc(void)
2079 {
2080           return kmalloc(sizeof(struct knote), M_KQUEUE, M_WAITOK);
2081 }
2082 
2083 static void
knote_free(struct knote * kn)2084 knote_free(struct knote *kn)
2085 {
2086           struct knote_cache_list *cache_list;
2087 
2088           cache_list = &knote_cache_lists[mycpuid];
2089           if (cache_list->knote_cache_cnt < KNOTE_CACHE_MAX) {
2090                     crit_enter();
2091                     SLIST_INSERT_HEAD(&cache_list->knote_cache, kn, kn_link);
2092                     cache_list->knote_cache_cnt++;
2093                     crit_exit();
2094                     return;
2095           }
2096           kfree(kn, M_KQUEUE);
2097 }
2098 
/*
 * State shared between precise_sleep() and its systimer callback.
 */
struct sleepinfo {
	void	*ident;		/* wakeup channel the sleeper blocks on */
	int	timedout;	/* set to 1 by the callback when it fires */
};
2103 
2104 static void
precise_sleep_intr(systimer_t info,int in_ipi,struct intrframe * frame)2105 precise_sleep_intr(systimer_t info, int in_ipi, struct intrframe *frame)
2106 {
2107           struct sleepinfo *si;
2108 
2109           si = info->data;
2110           si->timedout = 1;
2111           wakeup(si->ident);
2112 }
2113 
2114 static int
precise_sleep(void * ident,int flags,const char * wmesg,int us)2115 precise_sleep(void *ident, int flags, const char *wmesg, int us)
2116 {
2117           struct systimer info;
2118           struct sleepinfo si = {
2119                     .ident = ident,
2120                     .timedout = 0,
2121           };
2122           int r;
2123 
2124           tsleep_interlock(ident, flags);
2125           systimer_init_oneshot(&info, precise_sleep_intr, &si, us);
2126           r = tsleep(ident, flags | PINTERLOCKED, wmesg, 0);
2127           systimer_del(&info);
2128           if (si.timedout)
2129                     r = EWOULDBLOCK;
2130 
2131           return r;
2132 }
2133