1 /*-
2 * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3 * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
4 * Copyright (c) 2009 Apple, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include "opt_ktrace.h"
33 #include "opt_compat_mach.h"
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/capsicum.h>
38 #include <sys/kernel.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/rwlock.h>
42 #include <sys/proc.h>
43 #include <sys/malloc.h>
44 #include <sys/unistd.h>
45 #include <sys/file.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/kthread.h>
50 #include <sys/selinfo.h>
51 #include <sys/stdatomic.h>
52 #include <sys/queue.h>
53 #include <sys/event.h>
54 #include <sys/eventvar.h>
55 #include <sys/poll.h>
56 #include <sys/protosw.h>
57 #include <sys/sigio.h>
58 #include <sys/signalvar.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/stat.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysproto.h>
64 #include <sys/syscallsubr.h>
65 #include <sys/taskqueue.h>
66 #include <sys/uio.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70
71 #include <vm/uma.h>
72
73 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
74
75 /*
76 * This lock is used if multiple kq locks are required. This possibly
77 * should be made into a per proc lock.
78 */
79 static struct mtx kq_global;
80 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
81 #define KQ_GLOBAL_LOCK(lck, haslck) do { \
82 if (!haslck) \
83 mtx_lock(lck); \
84 haslck = 1; \
85 } while (0)
86 #define KQ_GLOBAL_UNLOCK(lck, haslck) do { \
87 if (haslck) \
88 mtx_unlock(lck); \
89 haslck = 0; \
90 } while (0)
91
92 TASKQUEUE_DEFINE_THREAD(kqueue);
93
94 static int kevent_copyout(void *arg, void *kevp, int count);
95 static int kevent_copyin(void *arg, void *kevp, int count);
96 static int kevent64_copyout(void *arg, void *kevp, int count);
97 static int kevent64_copyin(void *arg,void *kevp, int count);
98 static int kqueue_register(struct kqueue *kq, struct kevent64_s *kev,
99 struct thread *td, int waitok);
100 static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
101 static void kqueue_release(struct kqueue *kq, int locked);
102 static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
103 uintptr_t ident, int waitok);
104 static void kqueue_task(void *arg, int pending);
105 static int kqueue_scan(struct kqueue *kq, int maxevents,
106 struct kevent_copyops *k_ops,
107 const struct timespec *timeout,
108 struct kevent64_s *keva, struct thread *td);
109 static void kqueue_wakeup(struct kqueue *kq);
110 static struct filterops *kqueue_fo_find(int filt);
111 static void kqueue_fo_release(int filt);
112
113 static fo_rdwr_t kqueue_read;
114 static fo_rdwr_t kqueue_write;
115 static fo_truncate_t kqueue_truncate;
116 static fo_ioctl_t kqueue_ioctl;
117 static fo_poll_t kqueue_poll;
118 static fo_kqfilter_t kqueue_kqfilter;
119 static fo_stat_t kqueue_stat;
120 static fo_close_t kqueue_close;
121
122 static struct fileops kqueueops = {
123 .fo_read = kqueue_read,
124 .fo_write = kqueue_write,
125 .fo_truncate = kqueue_truncate,
126 .fo_ioctl = kqueue_ioctl,
127 .fo_poll = kqueue_poll,
128 .fo_kqfilter = kqueue_kqfilter,
129 .fo_stat = kqueue_stat,
130 .fo_close = kqueue_close,
131 .fo_chmod = invfo_chmod,
132 .fo_chown = invfo_chown,
133 .fo_sendfile = invfo_sendfile,
134 };
135
136 static int knote_attach(struct knote *kn, struct kqueue *kq);
137 static void knote_drop(struct knote *kn, struct thread *td);
138 void knote_enqueue(struct knote *kn);
139 static void knote_dequeue(struct knote *kn);
140 static void knote_init(void);
141 static struct knote *knote_alloc(int waitok);
142 static void knote_free(struct knote *kn);
143
144 static void filt_kqdetach(struct knote *kn);
145 static int filt_kqueue(struct knote *kn, long hint);
146 static int filt_procattach(struct knote *kn);
147 static void filt_procdetach(struct knote *kn);
148 static int filt_proc(struct knote *kn, long hint);
149 static int filt_fileattach(struct knote *kn);
150 static void filt_timerexpire(void *knx);
151 static int filt_timerattach(struct knote *kn);
152 static void filt_timerdetach(struct knote *kn);
153 static int filt_timer(struct knote *kn, long hint);
154 static int filt_userattach(struct knote *kn);
155 static void filt_userdetach(struct knote *kn);
156 static int filt_user(struct knote *kn, long hint);
157 static void filt_usertouch(struct knote *kn, struct kevent64_s *kev,
158 u_long type);
159
160
161 static struct filterops file_filtops = {
162 .f_isfd = 1,
163 .f_attach = filt_fileattach,
164 };
165 static struct filterops kqread_filtops = {
166 .f_isfd = 1,
167 .f_detach = filt_kqdetach,
168 .f_event = filt_kqueue,
169 };
170 /* XXX - move to kern_proc.c? */
171 static struct filterops proc_filtops = {
172 .f_isfd = 0,
173 .f_attach = filt_procattach,
174 .f_detach = filt_procdetach,
175 .f_event = filt_proc,
176 };
177 static struct filterops timer_filtops = {
178 .f_isfd = 0,
179 .f_attach = filt_timerattach,
180 .f_detach = filt_timerdetach,
181 .f_event = filt_timer,
182 };
183 static struct filterops user_filtops = {
184 .f_attach = filt_userattach,
185 .f_detach = filt_userdetach,
186 .f_event = filt_user,
187 .f_touch = filt_usertouch,
188 };
189
190 static uma_zone_t knote_zone;
191 static atomic_uint kq_ncallouts = ATOMIC_VAR_INIT(0);
192 static unsigned int kq_calloutmax = 4 * 1024;
193 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
194 &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
195
/*
 * Flag a knote as active and, unless it is already queued or disabled,
 * hand it to knote_enqueue().  `islock' states whether the caller already
 * owns the kq lock: if so ownership is asserted, otherwise the lock is
 * taken and dropped around the update.
 *
 * XXX - ensure not KN_INFLUX??
 */
#define KNOTE_ACTIVATE(kn, islock) do { \
	if ((islock)) \
		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
	else \
		KQ_LOCK((kn)->kn_kq); \
	(kn)->kn_status |= KN_ACTIVE; \
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
		knote_enqueue((kn)); \
	if (!(islock)) \
		KQ_UNLOCK((kn)->kn_kq); \
} while (0)

/* Acquire the per-kqueue mutex. */
#define KQ_LOCK(kq) do { \
	mtx_lock(&(kq)->kq_lock); \
} while (0)

/* Wake threads sleeping on the kq if one announced itself via KQ_FLUXWAIT. */
#define KQ_FLUX_WAKEUP(kq) do { \
	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
		(kq)->kq_state &= ~KQ_FLUXWAIT; \
		wakeup((kq)); \
	} \
} while (0)

/* Release the kq mutex after delivering any pending flux wakeup. */
#define KQ_UNLOCK_FLUX(kq) do { \
	KQ_FLUX_WAKEUP(kq); \
	mtx_unlock(&(kq)->kq_lock); \
} while (0)

/* Release the per-kqueue mutex. */
#define KQ_UNLOCK(kq) do { \
	mtx_unlock(&(kq)->kq_lock); \
} while (0)

/* Assert that the current thread owns / does not own the kq mutex. */
#define KQ_OWNED(kq) do { \
	mtx_assert(&(kq)->kq_lock, MA_OWNED); \
} while (0)
#define KQ_NOTOWNED(kq) do { \
	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
} while (0)

/*
 * Lock / unlock the knlist a knote is linked on, through the callbacks
 * stored in the list; a knote without a list is left alone.  The macro
 * argument is parenthesized so that any pointer expression may be passed.
 */
#define KN_LIST_LOCK(kn) do { \
	if ((kn)->kn_knlist != NULL) \
		(kn)->kn_knlist->kl_lock((kn)->kn_knlist->kl_lockarg); \
} while (0)
#define KN_LIST_UNLOCK(kn) do { \
	if ((kn)->kn_knlist != NULL) \
		(kn)->kn_knlist->kl_unlock((kn)->kn_knlist->kl_lockarg); \
} while (0)

/* Assert the knlist lock state the caller claims through `islocked'. */
#define KNL_ASSERT_LOCK(knl, islocked) do { \
	if ((islocked)) \
		KNL_ASSERT_LOCKED(knl); \
	else \
		KNL_ASSERT_UNLOCKED(knl); \
} while (0)

/* The knlist assertions compile to nothing unless INVARIANTS is defined. */
#ifdef INVARIANTS
#define KNL_ASSERT_LOCKED(knl) do { \
	(knl)->kl_assert_locked((knl)->kl_lockarg); \
} while (0)
#define KNL_ASSERT_UNLOCKED(knl) do { \
	(knl)->kl_assert_unlocked((knl)->kl_lockarg); \
} while (0)
#else /* !INVARIANTS */
#define KNL_ASSERT_LOCKED(knl) do {} while (0)
#define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */
255
#define KN_HASHSIZE 64 /* XXX should be tunable */
/*
 * Fold bits 8 and up of `val' into its low byte and reduce with `mask'.
 * Every use of `val' is parenthesized so that compound expressions
 * (e.g. "a | b") hash the same as their computed value.
 */
#define KN_HASH(val, mask) (((val) ^ ((val) >> 8)) & (mask))
258
/*
 * Attach routine of null_filtops: rejects every attach with ENXIO.
 * The knote argument is not examined.
 */
static int
filt_nullattach(struct knote *kn)
{

	return (ENXIO);
}
265
266 struct filterops null_filtops = {
267 .f_isfd = 0,
268 .f_attach = filt_nullattach,
269 };
270
271 /* XXX - make SYSINIT to add these, and move into respective modules. */
272 extern struct filterops sig_filtops;
273 extern struct filterops fs_filtops;
274
275 /*
276 * Table for for all system-defined filters.
277 */
278 static struct mtx filterops_lock;
279 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
280 MTX_DEF);
281 static struct {
282 struct filterops *for_fop;
283 int for_refcnt;
284 } sysfilt_ops[EVFILT_SYSCOUNT] = {
285 { &file_filtops }, /* EVFILT_READ */
286 { &file_filtops }, /* EVFILT_WRITE */
287 { &null_filtops }, /* EVFILT_AIO */
288 { &file_filtops }, /* EVFILT_VNODE */
289 { &proc_filtops }, /* EVFILT_PROC */
290 { &sig_filtops }, /* EVFILT_SIGNAL */
291 { &timer_filtops }, /* EVFILT_TIMER */
292 { &null_filtops }, /* former EVFILT_NETDEV */
293 { &fs_filtops }, /* EVFILT_FS */
294 { &null_filtops }, /* EVFILT_LIO */
295 { &user_filtops, 1 }, /* EVFILT_USER */
296 { &null_filtops }, /* EVFILT_SENDFILE */
297 { &null_filtops }, /* EVFILT_MACHPORT */
298 #ifdef HAVE_EVFILT_VM
299 { &vm_filtops }, /* EVFILT_VM */
300 #else
301 { &null_filtops },
302 #endif
303 };
304
305 /*
306 * Simple redirection for all cdevsw style objects to call their fo_kqfilter
307 * method.
308 */
309 static int
filt_fileattach(struct knote * kn)310 filt_fileattach(struct knote *kn)
311 {
312
313 return (fo_kqfilter(kn->kn_fp, kn));
314 }
315
316 /*ARGSUSED*/
317 static int
kqueue_kqfilter(struct file * fp,struct knote * kn)318 kqueue_kqfilter(struct file *fp, struct knote *kn)
319 {
320 struct kqueue *kq = kn->kn_fp->f_data;
321
322 if (kn->kn_filter != EVFILT_READ)
323 return (EINVAL);
324
325 kn->kn_status |= KN_KQUEUE;
326 kn->kn_fop = &kqread_filtops;
327 knlist_add(&kq->kq_sel.si_note, kn, 0);
328
329 return (0);
330 }
331
332 static void
filt_kqdetach(struct knote * kn)333 filt_kqdetach(struct knote *kn)
334 {
335 struct kqueue *kq = kn->kn_fp->f_data;
336
337 knlist_remove(&kq->kq_sel.si_note, kn, 0);
338 }
339
340 /*ARGSUSED*/
341 static int
filt_kqueue(struct knote * kn,long hint)342 filt_kqueue(struct knote *kn, long hint)
343 {
344 struct kqueue *kq = kn->kn_fp->f_data;
345
346 kn->kn_data = kq->kq_count;
347 return (kn->kn_data > 0);
348 }
349
350 /* XXX - move to kern_proc.c? */
351 static int
filt_procattach(struct knote * kn)352 filt_procattach(struct knote *kn)
353 {
354 struct proc *p;
355 int immediate;
356 int error;
357
358 immediate = 0;
359 p = pfind(kn->kn_id);
360 if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
361 p = zpfind(kn->kn_id);
362 immediate = 1;
363 } else if (p != NULL && (p->p_flag & P_WEXIT)) {
364 immediate = 1;
365 }
366
367 if (p == NULL)
368 return (ESRCH);
369 if ((error = p_cansee(curthread, p))) {
370 PROC_UNLOCK(p);
371 return (error);
372 }
373
374 kn->kn_ptr.p_proc = p;
375 kn->kn_flags |= EV_CLEAR; /* automatically set */
376
377 /*
378 * internal flag indicating registration done by kernel
379 */
380 if (kn->kn_flags & EV_FLAG1) {
381 kn->kn_data = kn->kn_sdata; /* ppid */
382 kn->kn_fflags = NOTE_CHILD;
383 kn->kn_flags &= ~EV_FLAG1;
384 }
385
386 if (immediate == 0)
387 knlist_add(&p->p_klist, kn, 1);
388
389 /*
390 * Immediately activate any exit notes if the target process is a
391 * zombie. This is necessary to handle the case where the target
392 * process, e.g. a child, dies before the kevent is registered.
393 */
394 if (immediate && filt_proc(kn, NOTE_EXIT))
395 KNOTE_ACTIVATE(kn, 0);
396
397 PROC_UNLOCK(p);
398
399 return (0);
400 }
401
402 /*
403 * The knote may be attached to a different process, which may exit,
404 * leaving nothing for the knote to be attached to. So when the process
405 * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
406 * it will be deleted when read out. However, as part of the knote deletion,
407 * this routine is called, so a check is needed to avoid actually performing
408 * a detach, because the original process does not exist any more.
409 */
410 /* XXX - move to kern_proc.c? */
411 static void
filt_procdetach(struct knote * kn)412 filt_procdetach(struct knote *kn)
413 {
414 struct proc *p;
415
416 p = kn->kn_ptr.p_proc;
417 knlist_remove(&p->p_klist, kn, 0);
418 kn->kn_ptr.p_proc = NULL;
419 }
420
421 /* XXX - move to kern_proc.c? */
422 static int
filt_proc(struct knote * kn,long hint)423 filt_proc(struct knote *kn, long hint)
424 {
425 struct proc *p = kn->kn_ptr.p_proc;
426 u_int event;
427
428 /*
429 * mask off extra data
430 */
431 event = (u_int)hint & NOTE_PCTRLMASK;
432
433 /*
434 * if the user is interested in this event, record it.
435 */
436 if (kn->kn_sfflags & event)
437 kn->kn_fflags |= event;
438
439 /*
440 * process is gone, so flag the event as finished.
441 */
442 if (event == NOTE_EXIT) {
443 if (!(kn->kn_status & KN_DETACHED))
444 knlist_remove_inevent(&p->p_klist, kn);
445 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
446 kn->kn_ptr.p_proc = NULL;
447 if (kn->kn_fflags & NOTE_EXIT) {
448 kn->kn_data = p->p_xstat;
449 /* OS X compatibility */
450 if (kn->kn_sfflags & NOTE_EXITSTATUS)
451 kn->kn_fflags |= NOTE_EXITSTATUS;
452 }
453 if (kn->kn_fflags == 0)
454 kn->kn_flags |= EV_DROP;
455 return (1);
456 }
457
458 return (kn->kn_fflags != 0);
459 }
460
461 /*
462 * Called when the process forked. It mostly does the same as the
463 * knote(), activating all knotes registered to be activated when the
464 * process forked. Additionally, for each knote attached to the
465 * parent, check whether user wants to track the new process. If so
466 * attach a new knote to it, and immediately report an event with the
467 * child's pid.
468 */
469 void
knote_fork(struct knlist * list,int pid)470 knote_fork(struct knlist *list, int pid)
471 {
472 struct kqueue *kq;
473 struct knote *kn;
474 struct kevent64_s kev;
475 int error;
476
477 if (list == NULL)
478 return;
479 list->kl_lock(list->kl_lockarg);
480
481 SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
482 if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
483 continue;
484 kq = kn->kn_kq;
485 KQ_LOCK(kq);
486 if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
487 KQ_UNLOCK(kq);
488 continue;
489 }
490
491 /*
492 * The same as knote(), activate the event.
493 */
494 if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
495 kn->kn_status |= KN_HASKQLOCK;
496 if (kn->kn_fop->f_event(kn, NOTE_FORK))
497 KNOTE_ACTIVATE(kn, 1);
498 kn->kn_status &= ~KN_HASKQLOCK;
499 KQ_UNLOCK(kq);
500 continue;
501 }
502
503 /*
504 * The NOTE_TRACK case. In addition to the activation
505 * of the event, we need to register new event to
506 * track the child. Drop the locks in preparation for
507 * the call to kqueue_register().
508 */
509 kn->kn_status |= KN_INFLUX;
510 KQ_UNLOCK(kq);
511 list->kl_unlock(list->kl_lockarg);
512
513 /*
514 * Activate existing knote and register a knote with
515 * new process.
516 */
517 kev.ident = pid;
518 kev.filter = kn->kn_filter;
519 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
520 kev.fflags = kn->kn_sfflags;
521 kev.data = kn->kn_id; /* parent */
522 kev.udata = kn->kn_kevent.udata;/* preserve udata */
523 error = kqueue_register(kq, &kev, NULL, 0);
524 if (error)
525 kn->kn_fflags |= NOTE_TRACKERR;
526 if (kn->kn_fop->f_event(kn, NOTE_FORK))
527 KNOTE_ACTIVATE(kn, 0);
528 KQ_LOCK(kq);
529 kn->kn_status &= ~KN_INFLUX;
530 KQ_UNLOCK_FLUX(kq);
531 list->kl_lock(list->kl_lockarg);
532 }
533 list->kl_unlock(list->kl_lockarg);
534 }
535
536 /*
537 * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
538 * interval timer support code.
539 */
540
541 #define NOTE_TIMER_PRECMASK (NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS| \
542 NOTE_NSECONDS)
543
544 static __inline sbintime_t
timer2sbintime(intptr_t data,int flags)545 timer2sbintime(intptr_t data, int flags)
546 {
547 sbintime_t modifier;
548
549 switch (flags & NOTE_TIMER_PRECMASK) {
550 case NOTE_SECONDS:
551 modifier = SBT_1S;
552 break;
553 case NOTE_MSECONDS: /* FALLTHROUGH */
554 case 0:
555 modifier = SBT_1MS;
556 break;
557 case NOTE_USECONDS:
558 modifier = SBT_1US;
559 break;
560 case NOTE_NSECONDS:
561 modifier = SBT_1NS;
562 break;
563 default:
564 return (-1);
565 }
566
567 #ifdef __LP64__
568 if (data > SBT_MAX / modifier)
569 return (SBT_MAX);
570 #endif
571 return (modifier * data);
572 }
573
574 static void
filt_timerexpire(void * knx)575 filt_timerexpire(void *knx)
576 {
577 struct callout *calloutp;
578 struct knote *kn;
579
580 kn = knx;
581 kn->kn_data++;
582 KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
583
584 if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
585 calloutp = (struct callout *)kn->kn_hook;
586 *kn->kn_ptr.p_nexttime += timer2sbintime(kn->kn_sdata,
587 kn->kn_sfflags);
588 callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
589 filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
590 }
591 }
592
593 /*
594 * data contains amount of time to sleep
595 */
596 static int
filt_timerattach(struct knote * kn)597 filt_timerattach(struct knote *kn)
598 {
599 struct callout *calloutp;
600 sbintime_t to;
601 unsigned int ncallouts;
602
603 if ((intptr_t)kn->kn_sdata < 0)
604 return (EINVAL);
605 if ((intptr_t)kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
606 kn->kn_sdata = 1;
607 /* Only precision unit are supported in flags so far */
608 if (kn->kn_sfflags & ~NOTE_TIMER_PRECMASK)
609 return (EINVAL);
610
611 to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
612 if (to < 0)
613 return (EINVAL);
614
615 ncallouts = atomic_load_explicit(&kq_ncallouts, memory_order_relaxed);
616 do {
617 if (ncallouts >= kq_calloutmax)
618 return (ENOMEM);
619 } while (!atomic_compare_exchange_weak_explicit(&kq_ncallouts,
620 &ncallouts, ncallouts + 1, memory_order_relaxed,
621 memory_order_relaxed));
622
623 kn->kn_flags |= EV_CLEAR; /* automatically set */
624 kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */
625 kn->kn_ptr.p_nexttime = malloc(sizeof(sbintime_t), M_KQUEUE, M_WAITOK);
626 calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
627 callout_init(calloutp, CALLOUT_MPSAFE);
628 kn->kn_hook = calloutp;
629 *kn->kn_ptr.p_nexttime = to + sbinuptime();
630 callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
631 filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
632
633 return (0);
634 }
635
636 static void
filt_timerdetach(struct knote * kn)637 filt_timerdetach(struct knote *kn)
638 {
639 struct callout *calloutp;
640 unsigned int old;
641
642 calloutp = (struct callout *)kn->kn_hook;
643 callout_drain(calloutp);
644 free(calloutp, M_KQUEUE);
645 free(kn->kn_ptr.p_nexttime, M_KQUEUE);
646 old = atomic_fetch_sub_explicit(&kq_ncallouts, 1, memory_order_relaxed);
647 KASSERT(old > 0, ("Number of callouts cannot become negative"));
648 kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */
649 }
650
651 static int
filt_timer(struct knote * kn,long hint)652 filt_timer(struct knote *kn, long hint)
653 {
654
655 return (kn->kn_data != 0);
656 }
657
658 static int
filt_userattach(struct knote * kn)659 filt_userattach(struct knote *kn)
660 {
661
662 /*
663 * EVFILT_USER knotes are not attached to anything in the kernel.
664 */
665 kn->kn_hook = NULL;
666 if (kn->kn_fflags & NOTE_TRIGGER)
667 kn->kn_hookid = 1;
668 else
669 kn->kn_hookid = 0;
670 return (0);
671 }
672
673 static void
filt_userdetach(__unused struct knote * kn)674 filt_userdetach(__unused struct knote *kn)
675 {
676
677 /*
678 * EVFILT_USER knotes are not attached to anything in the kernel.
679 */
680 }
681
682 static int
filt_user(struct knote * kn,__unused long hint)683 filt_user(struct knote *kn, __unused long hint)
684 {
685
686 return (kn->kn_hookid);
687 }
688
689 static void
filt_usertouch(struct knote * kn,struct kevent64_s * kev,u_long type)690 filt_usertouch(struct knote *kn, struct kevent64_s *kev, u_long type)
691 {
692 u_int ffctrl;
693
694 switch (type) {
695 case EVENT_REGISTER:
696 if (kev->fflags & NOTE_TRIGGER)
697 kn->kn_hookid = 1;
698
699 ffctrl = kev->fflags & NOTE_FFCTRLMASK;
700 kev->fflags &= NOTE_FFLAGSMASK;
701 switch (ffctrl) {
702 case NOTE_FFNOP:
703 break;
704
705 case NOTE_FFAND:
706 kn->kn_sfflags &= kev->fflags;
707 break;
708
709 case NOTE_FFOR:
710 kn->kn_sfflags |= kev->fflags;
711 break;
712
713 case NOTE_FFCOPY:
714 kn->kn_sfflags = kev->fflags;
715 break;
716
717 default:
718 /* XXX Return error? */
719 break;
720 }
721 kn->kn_sdata = kev->data;
722 if (kev->flags & EV_CLEAR) {
723 kn->kn_hookid = 0;
724 kn->kn_data = 0;
725 kn->kn_fflags = 0;
726 }
727 break;
728
729 case EVENT_PROCESS:
730 *kev = kn->kn_kevent;
731 kev->fflags = kn->kn_sfflags;
732 kev->data = kn->kn_sdata;
733 if (kn->kn_flags & EV_CLEAR) {
734 kn->kn_hookid = 0;
735 kn->kn_data = 0;
736 kn->kn_fflags = 0;
737 }
738 break;
739
740 default:
741 panic("filt_usertouch() - invalid type (%ld)", type);
742 break;
743 }
744 }
745
746 int
sys_kqueue(struct thread * td,struct kqueue_args * uap)747 sys_kqueue(struct thread *td, struct kqueue_args *uap)
748 {
749 struct filedesc *fdp;
750 struct kqueue *kq;
751 struct file *fp;
752 int fd, error;
753
754 fdp = td->td_proc->p_fd;
755 error = falloc(td, &fp, &fd, 0);
756 if (error)
757 goto done2;
758
759 /* An extra reference on `fp' has been held for us by falloc(). */
760 kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
761 mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
762 TAILQ_INIT(&kq->kq_head);
763 kq->kq_fdp = fdp;
764 knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
765 TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
766
767 FILEDESC_XLOCK(fdp);
768 TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
769 FILEDESC_XUNLOCK(fdp);
770
771 finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
772 fdrop(fp, td);
773
774 td->td_retval[0] = fd;
775 done2:
776 return (error);
777 }
778
779 #ifndef _SYS_SYSPROTO_H_
780 struct kevent64_args {
781 int fd;
782 struct kevent64_s *changelist;
783 int nchanges;
784 struct kevent64_s *eventlist;
785 int nevents;
786 const struct timespec *timeout;
787 };
788 #endif
789 int
sys_kevent64(struct thread * td,struct kevent64_args * uap)790 sys_kevent64(struct thread *td, struct kevent64_args *uap)
791 {
792 struct timespec ts, *tsp;
793 struct kevent_copyops k_ops = { uap,
794 kevent64_copyout,
795 kevent64_copyin};
796 int error;
797
798 if (uap->timeout != NULL) {
799 error = copyin(uap->timeout, &ts, sizeof(ts));
800 if (error)
801 return (error);
802 tsp = &ts;
803 } else
804 tsp = NULL;
805
806 error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
807 &k_ops, tsp, 1);
808
809 return (error);
810 }
811
812 #ifndef _SYS_SYSPROTO_H_
813 struct kevent_args {
814 int fd;
815 const struct kevent *changelist;
816 int nchanges;
817 struct kevent *eventlist;
818 int nevents;
819 const struct timespec *timeout;
820 };
821 #endif
822 int
sys_kevent(struct thread * td,struct kevent_args * uap)823 sys_kevent(struct thread *td, struct kevent_args *uap)
824 {
825 struct timespec ts, *tsp;
826 struct kevent_copyops k_ops = { uap,
827 kevent_copyout,
828 kevent_copyin};
829 int error;
830 #ifdef KTRACE
831 struct uio ktruio;
832 struct iovec ktriov;
833 struct uio *ktruioin = NULL;
834 struct uio *ktruioout = NULL;
835 #endif
836
837 if (uap->timeout != NULL) {
838 error = copyin(uap->timeout, &ts, sizeof(ts));
839 if (error)
840 return (error);
841 tsp = &ts;
842 } else
843 tsp = NULL;
844
845 #ifdef KTRACE
846 if (KTRPOINT(td, KTR_GENIO)) {
847 ktriov.iov_base = uap->changelist;
848 ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
849 ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
850 .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
851 .uio_td = td };
852 ktruioin = cloneuio(&ktruio);
853 ktriov.iov_base = uap->eventlist;
854 ktriov.iov_len = uap->nevents * sizeof(struct kevent);
855 ktruioout = cloneuio(&ktruio);
856 }
857 #endif
858
859 error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
860 &k_ops, tsp, 0);
861
862 #ifdef KTRACE
863 if (ktruioin != NULL) {
864 ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
865 ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
866 ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
867 ktrgenio(uap->fd, UIO_READ, ktruioout, error);
868 }
869 #endif
870
871 return (error);
872 }
873
874 /*
875 * Copy 'count' items into the destination list pointed to by uap->eventlist.
876 */
877 static int
kevent64_copyout(void * arg,void * kevp,int count)878 kevent64_copyout(void *arg, void *kevp, int count)
879 {
880 struct kevent64_args *uap;
881 int error;
882
883 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
884 uap = (struct kevent64_args *)arg;
885
886 error = copyout(kevp, uap->eventlist, count * sizeof(struct kevent64_s));
887 if (error == 0)
888 uap->eventlist += count;
889 return (error);
890 }
891
892 static int
kevent_copyout(void * arg,void * kevp,int count)893 kevent_copyout(void *arg, void *kevp, int count)
894 {
895 struct kevent64_s *kev;
896 struct kevent_args *uap;
897 int error, i;
898
899 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
900 uap = (struct kevent_args *)arg;
901 kev = (struct kevent64_s *)kevp;
902
903 for (i = 0; i < count; i++) {
904 error = copyout((const void *)&kev[i], uap->eventlist, sizeof(struct kevent));
905 if (error == 0)
906 uap->eventlist++;
907 }
908
909 return (error);
910 }
911
912 /*
913 * Copy 'count' items from the list pointed to by uap->changelist.
914 */
915 static int
kevent64_copyin(void * arg,void * kevp,int count)916 kevent64_copyin(void *arg, void *kevp, int count)
917 {
918 struct kevent64_args *uap;
919 int error;
920
921 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
922 uap = (struct kevent64_args *)arg;
923
924 error = copyin(uap->changelist, kevp, count * sizeof(struct kevent64_s));
925 if (error == 0)
926 uap->changelist += count;
927 return (error);
928 }
929
930 static int
kevent_copyin(void * arg,void * kevp,int count)931 kevent_copyin(void *arg, void *kevp, int count)
932 {
933 struct kevent_args *uap;
934 int error;
935
936 KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
937 uap = (struct kevent_args *)arg;
938
939 error = copyin(uap->changelist, kevp, count * sizeof(struct kevent));
940 if (error == 0)
941 uap->changelist += count;
942 return (error);
943 }
944
945 int
kern_kevent(struct thread * td,int fd,int nchanges,int nevents,struct kevent_copyops * k_ops,const struct timespec * timeout,int v1)946 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
947 struct kevent_copyops *k_ops, const struct timespec *timeout,
948 int v1)
949 {
950 struct kevent keva[KQ_NEVENTS];
951 struct kevent64_s keva64[KQ_NEVENTS];
952 struct kevent *changes;
953 struct kevent64_s kevtmp, *kevp;
954 struct kqueue *kq;
955 struct file *fp;
956 cap_rights_t rights;
957 int i, n, nerrors, error;
958
959 cap_rights_init(&rights);
960 if (nchanges > 0)
961 cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
962 if (nevents > 0)
963 cap_rights_set(&rights, CAP_KQUEUE_EVENT);
964 error = fget(td, fd, &rights, &fp);
965 if (error != 0)
966 return (error);
967
968 error = kqueue_acquire(fp, &kq);
969 if (error != 0)
970 goto done_norel;
971
972 nerrors = 0;
973 if (v1) {
974 struct kevent64_s *changes;
975 while (nchanges > 0) {
976 n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
977 error = k_ops->k_copyin(k_ops->arg, keva64, n);
978 if (error)
979 goto done;
980 changes = keva64;
981 for (i = 0; i < n; i++) {
982 kevp = &changes[i];
983 if (!kevp->filter)
984 continue;
985 kevp->flags &= ~EV_SYSFLAGS;
986 error = kqueue_register(kq, kevp, td, 1);
987 if (error || (kevp->flags & EV_RECEIPT)) {
988 if (nevents != 0) {
989 kevp->flags = EV_ERROR;
990 kevp->data = error;
991 (void) k_ops->k_copyout(k_ops->arg,
992 kevp, 1);
993 nevents--;
994 nerrors++;
995 } else {
996 goto done;
997 }
998 }
999 }
1000 nchanges -= n;
1001 }
1002 goto check_errors;
1003 }
1004 kevtmp.ext[0] = 0;
1005 kevtmp.ext[1] = 0;
1006 while (nchanges > 0) {
1007 n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
1008 error = k_ops->k_copyin(k_ops->arg, keva, n);
1009 if (error)
1010 goto done;
1011 changes = keva;
1012 for (i = 0; i < n; i++) {
1013 kevp = (struct kevent64_s *)&changes[i];
1014 if (!kevp->filter)
1015 continue;
1016 EV_SET64(&kevtmp, kevp->ident, kevp->filter, kevp->flags, kevp->fflags, kevp->data, kevp->udata, 0, 0);
1017 kevp = &kevtmp;
1018 kevp->flags &= ~EV_SYSFLAGS;
1019 error = kqueue_register(kq, kevp, td, 1);
1020 if (error || (kevp->flags & EV_RECEIPT)) {
1021 if (nevents != 0) {
1022 kevp->flags = EV_ERROR;
1023 kevp->data = error;
1024 (void) k_ops->k_copyout(k_ops->arg,
1025 kevp, 1);
1026 nevents--;
1027 nerrors++;
1028 } else {
1029 goto done;
1030 }
1031 }
1032 }
1033 nchanges -= n;
1034 }
1035 check_errors:
1036 if (nerrors) {
1037 td->td_retval[0] = nerrors;
1038 error = 0;
1039 goto done;
1040 }
1041 if (v1 == 0)
1042 for (i = 0; i < KQ_NEVENTS; i++) {
1043 struct kevent *k = &keva[i];
1044 EV_SET64(&keva64[i], k->ident, k->filter, k->flags, k->fflags, k->data, (uint64_t)k->udata, 0, 0);
1045 }
1046 error = kqueue_scan(kq, nevents, k_ops, timeout, keva64, td);
1047 done:
1048 kqueue_release(kq, 0);
1049 done_norel:
1050 fdrop(fp, td);
1051 return (error);
1052 }
1053
1054 int
kqueue_add_filteropts(int filt,struct filterops * filtops)1055 kqueue_add_filteropts(int filt, struct filterops *filtops)
1056 {
1057 int error;
1058
1059 error = 0;
1060 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1061 printf(
1062 "trying to add a filterop that is out of range: %d is beyond %d\n",
1063 ~filt, EVFILT_SYSCOUNT);
1064 return EINVAL;
1065 }
1066 mtx_lock(&filterops_lock);
1067 if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1068 sysfilt_ops[~filt].for_fop != NULL)
1069 error = EEXIST;
1070 else {
1071 sysfilt_ops[~filt].for_fop = filtops;
1072 sysfilt_ops[~filt].for_refcnt = 0;
1073 }
1074 mtx_unlock(&filterops_lock);
1075
1076 return (error);
1077 }
1078
1079 int
kqueue_del_filteropts(int filt)1080 kqueue_del_filteropts(int filt)
1081 {
1082 int error;
1083
1084 error = 0;
1085 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1086 return EINVAL;
1087
1088 mtx_lock(&filterops_lock);
1089 if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1090 sysfilt_ops[~filt].for_fop == NULL)
1091 error = EINVAL;
1092 else if (sysfilt_ops[~filt].for_refcnt != 0)
1093 error = EBUSY;
1094 else {
1095 sysfilt_ops[~filt].for_fop = &null_filtops;
1096 sysfilt_ops[~filt].for_refcnt = 0;
1097 }
1098 mtx_unlock(&filterops_lock);
1099
1100 return error;
1101 }
1102
1103 static struct filterops *
kqueue_fo_find(int filt)1104 kqueue_fo_find(int filt)
1105 {
1106
1107 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1108 return NULL;
1109
1110 mtx_lock(&filterops_lock);
1111 sysfilt_ops[~filt].for_refcnt++;
1112 if (sysfilt_ops[~filt].for_fop == NULL)
1113 sysfilt_ops[~filt].for_fop = &null_filtops;
1114 mtx_unlock(&filterops_lock);
1115
1116 return sysfilt_ops[~filt].for_fop;
1117 }
1118
1119 static void
kqueue_fo_release(int filt)1120 kqueue_fo_release(int filt)
1121 {
1122
1123 if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1124 return;
1125
1126 mtx_lock(&filterops_lock);
1127 KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1128 ("filter object refcount not valid on release"));
1129 sysfilt_ops[~filt].for_refcnt--;
1130 mtx_unlock(&filterops_lock);
1131 }
1132
/*
 * Apply one change record (kev) to kq: look up the knote identified by
 * (kev->ident, kev->filter) and add, delete, update, enable or disable it
 * according to kev->flags.
 *
 * A ref to kq (obtained via kqueue_acquire) must be held.  waitok will
 * influence if memory allocation should wait.  Make sure it is 0 if you
 * hold any mutexes.
 *
 * Returns 0 on success, otherwise an errno value: EINVAL when
 * kqueue_fo_find() rejects the filter or when kq is asked to watch itself,
 * ENOMEM when no knote could be allocated, ENOENT when no matching knote
 * exists and EV_ADD is not set, or the error reported by fget(),
 * kqueue_expand(), knote_attach() or the filter's f_attach.
 */
static int
kqueue_register(struct kqueue *kq, struct kevent64_s *kev, struct thread *td,
    int waitok)
{
	struct filterops *fops;
	struct file *fp;
	struct knote *kn, *tkn;
	cap_rights_t rights;
	int error, filt, event;
	int haskqglobal, filedesc_unlock;

	fp = NULL;
	kn = NULL;
	error = 0;
	haskqglobal = 0;
	filedesc_unlock = 0;

	/* Takes a reference on the filter slot; released at "done". */
	filt = kev->filter;
	fops = kqueue_fo_find(filt);
	if (fops == NULL)
		return EINVAL;

	/* Spare knote, allocated up front; only consumed by a new EV_ADD. */
	tkn = knote_alloc(waitok);		/* prevent waiting with locks */

findkn:
	if (fops->f_isfd) {
		/* fd-based filter: kev->ident is a descriptor number. */
		KASSERT(td != NULL, ("td is NULL"));
		error = fget(td, kev->ident,
		    cap_rights_init(&rights, CAP_EVENT), &fp);
		if (error)
			goto done;

		/*
		 * First try to grow the knote list without sleeping; if
		 * that fails, drop the file reference, retry honoring
		 * waitok and start over.
		 */
		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
		    kev->ident, 0) != 0) {
			/* try again */
			fdrop(fp, td);
			fp = NULL;
			error = kqueue_expand(kq, fops, kev->ident, waitok);
			if (error)
				goto done;
			goto findkn;
		}

		if (fp->f_type == DTYPE_KQUEUE) {
			/*
			 * if we add some intelligence about what we are doing,
			 * we should be able to support events on ourselves.
			 * We need to know when we are doing this to prevent
			 * getting both the knlist lock and the kq lock since
			 * they are the same thing.
			 */
			if (fp->f_data == kq) {
				error = EINVAL;
				goto done;
			}

			/*
			 * Pre-lock the filedesc before the global
			 * lock mutex, see the comment in
			 * kqueue_close().
			 */
			FILEDESC_XLOCK(td->td_proc->p_fd);
			filedesc_unlock = 1;
			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
		}

		/* Search the per-descriptor list for a matching filter. */
		KQ_LOCK(kq);
		if (kev->ident < kq->kq_knlistsize) {
			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
				if (kev->filter == kn->kn_filter)
					break;
		}
	} else {
		/* Non-fd filter: knotes are kept in the hash table. */
		if ((kev->flags & EV_ADD) == EV_ADD)
			kqueue_expand(kq, fops, kev->ident, waitok);

		KQ_LOCK(kq);
		if (kq->kq_knhashmask != 0) {
			struct klist *list;

			list = &kq->kq_knhash[
			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
			SLIST_FOREACH(kn, list, kn_link)
				if (kev->ident == kn->kn_id &&
				    kev->filter == kn->kn_filter)
					break;
		}
	}

	/* knote is in the process of changing, wait for it to stabilize. */
	if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
		if (filedesc_unlock) {
			FILEDESC_XUNLOCK(td->td_proc->p_fd);
			filedesc_unlock = 0;
		}
		kq->kq_state |= KQ_FLUXWAIT;
		/* PDROP: the kq lock is not held again after the sleep. */
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
		if (fp != NULL) {
			fdrop(fp, td);
			fp = NULL;
		}
		goto findkn;
	}

	/*
	 * kn now contains the matching knote, or NULL if no match
	 */
	if (kn == NULL) {
		if (kev->flags & EV_ADD) {
			/* Consume the preallocated knote. */
			kn = tkn;
			tkn = NULL;
			if (kn == NULL) {
				KQ_UNLOCK(kq);
				error = ENOMEM;
				goto done;
			}
			kn->kn_fp = fp;
			kn->kn_kq = kq;
			kn->kn_fop = fops;
			/*
			 * apply reference counts to knote structure, and
			 * do not release it at the end of this routine.
			 */
			fops = NULL;
			fp = NULL;

			/*
			 * The requested fflags/data are saved in the knote;
			 * the copy stored in kn_kevent starts with both
			 * cleared.
			 */
			kn->kn_sfflags = kev->fflags;
			kn->kn_sdata = kev->data;
			kev->fflags = 0;
			kev->data = 0;
			kn->kn_kevent = *kev;
			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
			    EV_ENABLE | EV_DISABLE);
			kn->kn_status = KN_INFLUX|KN_DETACHED;

			error = knote_attach(kn, kq);
			KQ_UNLOCK(kq);
			if (error != 0) {
				/* Hand the knote back so "done" frees it. */
				tkn = kn;
				goto done;
			}

			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
				knote_drop(kn, td);
				goto done;
			}
			KN_LIST_LOCK(kn);
			goto done_ev_add;
		} else {
			/* No matching knote and the EV_ADD flag is not set. */
			KQ_UNLOCK(kq);
			error = ENOENT;
			goto done;
		}
	}

	if (kev->flags & EV_DELETE) {
		/* Mark influx so nobody else touches it while we detach. */
		kn->kn_status |= KN_INFLUX;
		KQ_UNLOCK(kq);
		if (!(kn->kn_status & KN_DETACHED))
			kn->kn_fop->f_detach(kn);
		knote_drop(kn, td);
		goto done;
	}

	/*
	 * The user may change some filter values after the initial EV_ADD,
	 * but doing so will not reset any filter which has already been
	 * triggered.
	 */
	kn->kn_status |= KN_INFLUX | KN_SCAN;
	KQ_UNLOCK(kq);
	KN_LIST_LOCK(kn);
	kn->kn_kevent.udata = kev->udata;
	/* Non-fd filters may supply f_touch to apply the update themselves. */
	if (!fops->f_isfd && fops->f_touch != NULL) {
		fops->f_touch(kn, kev, EVENT_REGISTER);
	} else {
		kn->kn_sfflags = kev->fflags;
		kn->kn_sdata = kev->data;
		kn->kn_kevent.ext[0] = kev->ext[0];
		kn->kn_kevent.ext[1] = kev->ext[1];
	}

	/*
	 * We can get here with kn->kn_knlist == NULL.  This can happen when
	 * the initial attach event decides that the event is "completed"
	 * already.  i.e. filt_procattach is called on a zombie process.  It
	 * will call filt_proc which will remove it from the list, and NULL
	 * kn_knlist.
	 */
done_ev_add:
	if ((kev->flags & EV_DISABLE) &&
	    ((kn->kn_status & KN_DISABLED) == 0)) {
		kn->kn_status |= KN_DISABLED;
	}

	/* Poll the filter once unless the knote is (and stays) disabled. */
	if ((kn->kn_status & KN_DISABLED) == 0 || (kev->flags & EV_ENABLE))
		event = kn->kn_fop->f_event(kn, 0);
	else
		event = 0;
	KQ_LOCK(kq);
	if (event)
		KNOTE_ACTIVATE(kn, 1);
	kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
	KN_LIST_UNLOCK(kn);
#ifdef KN_DEBUG
#define IS_KN_DISABLED(kn) (!!(kn->kn_status & KN_DISABLED))
#define IS_KN_QUEUED(kn) (!!(kn->kn_status & KN_QUEUED))
#define IS_KN_ACTIVE(kn) (!!(kn->kn_status & KN_ACTIVE))
	if ((kev->flags & EV_ENABLE)) {
		printf("KN_DISABLED=%d KN_ACTIVE=%d KN_QUEUED=%d\n",
		    IS_KN_DISABLED(kn), IS_KN_ACTIVE(kn), IS_KN_QUEUED(kn));
	}
#endif
	/* Re-enable: queue the knote if it is active but not yet queued. */
	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
		kn->kn_status &= ~KN_DISABLED;
		if ((kn->kn_status & KN_ACTIVE) &&
		    ((kn->kn_status & KN_QUEUED) == 0))
			knote_enqueue(kn);
	}
	KQ_UNLOCK_FLUX(kq);

done:
	/* Common exit: release whatever this call still owns. */
	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
	if (filedesc_unlock)
		FILEDESC_XUNLOCK(td->td_proc->p_fd);
	if (fp != NULL)
		fdrop(fp, td);
	if (tkn != NULL)
		knote_free(tkn);
	if (fops != NULL)
		kqueue_fo_release(filt);
	return (error);
}
1372
1373 static int
kqueue_acquire(struct file * fp,struct kqueue ** kqp)1374 kqueue_acquire(struct file *fp, struct kqueue **kqp)
1375 {
1376 int error;
1377 struct kqueue *kq;
1378
1379 error = 0;
1380
1381 kq = fp->f_data;
1382 if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1383 return (EBADF);
1384 *kqp = kq;
1385 KQ_LOCK(kq);
1386 if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1387 KQ_UNLOCK(kq);
1388 return (EBADF);
1389 }
1390 kq->kq_refcnt++;
1391 KQ_UNLOCK(kq);
1392
1393 return error;
1394 }
1395
1396 static void
kqueue_release(struct kqueue * kq,int locked)1397 kqueue_release(struct kqueue *kq, int locked)
1398 {
1399 if (locked)
1400 KQ_OWNED(kq);
1401 else
1402 KQ_LOCK(kq);
1403 kq->kq_refcnt--;
1404 if (kq->kq_refcnt == 1)
1405 wakeup(&kq->kq_refcnt);
1406 if (!locked)
1407 KQ_UNLOCK(kq);
1408 }
1409
1410 static void
kqueue_schedtask(struct kqueue * kq)1411 kqueue_schedtask(struct kqueue *kq)
1412 {
1413
1414 KQ_OWNED(kq);
1415 KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1416 ("scheduling kqueue task while draining"));
1417
1418 if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1419 taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
1420 kq->kq_state |= KQ_TASKSCHED;
1421 }
1422 }
1423
1424 /*
1425 * Expand the kq to make sure we have storage for fops/ident pair.
1426 *
1427 * Return 0 on success (or no work necessary), return errno on failure.
1428 *
1429 * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
1430 * If kqueue_register is called from a non-fd context, there usually/should
1431 * be no locks held.
1432 */
1433 static int
kqueue_expand(struct kqueue * kq,struct filterops * fops,uintptr_t ident,int waitok)1434 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
1435 int waitok)
1436 {
1437 struct klist *list, *tmp_knhash, *to_free;
1438 u_long tmp_knhashmask;
1439 int size;
1440 int fd;
1441 int mflag = waitok ? M_WAITOK : M_NOWAIT;
1442
1443 KQ_NOTOWNED(kq);
1444
1445 to_free = NULL;
1446 if (fops->f_isfd) {
1447 fd = ident;
1448 if (kq->kq_knlistsize <= fd) {
1449 size = kq->kq_knlistsize;
1450 while (size <= fd)
1451 size += KQEXTENT;
1452 list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1453 if (list == NULL)
1454 return ENOMEM;
1455 KQ_LOCK(kq);
1456 if (kq->kq_knlistsize > fd) {
1457 to_free = list;
1458 list = NULL;
1459 } else {
1460 if (kq->kq_knlist != NULL) {
1461 bcopy(kq->kq_knlist, list,
1462 kq->kq_knlistsize * sizeof(*list));
1463 to_free = kq->kq_knlist;
1464 kq->kq_knlist = NULL;
1465 }
1466 bzero((caddr_t)list +
1467 kq->kq_knlistsize * sizeof(*list),
1468 (size - kq->kq_knlistsize) * sizeof(*list));
1469 kq->kq_knlistsize = size;
1470 kq->kq_knlist = list;
1471 }
1472 KQ_UNLOCK(kq);
1473 }
1474 } else {
1475 if (kq->kq_knhashmask == 0) {
1476 tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1477 &tmp_knhashmask);
1478 if (tmp_knhash == NULL)
1479 return ENOMEM;
1480 KQ_LOCK(kq);
1481 if (kq->kq_knhashmask == 0) {
1482 kq->kq_knhash = tmp_knhash;
1483 kq->kq_knhashmask = tmp_knhashmask;
1484 } else {
1485 to_free = tmp_knhash;
1486 }
1487 KQ_UNLOCK(kq);
1488 }
1489 }
1490 free(to_free, M_KQUEUE);
1491
1492 KQ_NOTOWNED(kq);
1493 return 0;
1494 }
1495
1496 static void
kqueue_task(void * arg,int pending)1497 kqueue_task(void *arg, int pending)
1498 {
1499 struct kqueue *kq;
1500 int haskqglobal;
1501
1502 haskqglobal = 0;
1503 kq = arg;
1504
1505 KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1506 KQ_LOCK(kq);
1507
1508 KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1509
1510 kq->kq_state &= ~KQ_TASKSCHED;
1511 if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1512 wakeup(&kq->kq_state);
1513 }
1514 KQ_UNLOCK(kq);
1515 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1516 }
1517
/*
 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
 * We treat KN_MARKER knotes as if they are INFLUX.
 *
 * Up to maxevents events are collected in keva (flushed through
 * k_ops->k_copyout every KQ_NEVENTS entries).  tsp selects the wait:
 * NULL leaves asbt at 0 for the sleep (presumably "no timeout"), a zero
 * timespec polls without sleeping, anything else is turned into an
 * absolute deadline.
 *
 * The number of events delivered is stored in td->td_retval[0]; the return
 * value is 0 or an errno (EINVAL for a malformed timespec, ENOMEM if the
 * marker could not be allocated, EINTR on a signal, or a copyout error).
 */
static int
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
    const struct timespec *tsp, struct kevent64_s *keva, struct thread *td)
{
	struct kevent64_s *kevp;
	struct knote *kn, *marker;
	sbintime_t asbt, rsbt;
	int count, error, haskqglobal, influx, nkev, touch;

	count = maxevents;
	nkev = 0;
	error = 0;
	haskqglobal = 0;

	if (maxevents == 0)
		goto done_nl;

	/*
	 * Work out the sleep deadline (asbt) and precision (rsbt).
	 * asbt == -1 is used below as "do not sleep at all".
	 */
	rsbt = 0;
	if (tsp != NULL) {
		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
		    tsp->tv_nsec >= 1000000000) {
			error = EINVAL;
			goto done_nl;
		}
		if (timespecisset(tsp)) {
			if (tsp->tv_sec <= INT32_MAX) {
				rsbt = tstosbt(*tsp);
				if (TIMESEL(&asbt, rsbt))
					asbt += tc_tick_sbt;
				/* Guard the addition against overflow. */
				if (asbt <= INT64_MAX - rsbt)
					asbt += rsbt;
				else
					asbt = 0;
				rsbt >>= tc_precexp;
			} else
				asbt = 0;
		} else
			asbt = -1;
	} else
		asbt = 0;
	/*
	 * The marker is appended to the queue so the loop below can tell
	 * when it has seen every knote that was queued at the start of
	 * this pass.
	 */
	marker = knote_alloc(1);
	if (marker == NULL) {
		error = ENOMEM;
		goto done_nl;
	}
	marker->kn_status = KN_MARKER;
	KQ_LOCK(kq);

retry:
	kevp = keva;
	if (kq->kq_count == 0) {
		/* Nothing queued: either return at once or go to sleep. */
		if (asbt == -1) {
			error = EWOULDBLOCK;
		} else {
			kq->kq_state |= KQ_SLEEP;
			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
			    "kqread", asbt, rsbt, C_ABSOLUTE);
		}
		if (error == 0)
			goto retry;
		/* don't restart after signals... */
		if (error == ERESTART)
			error = EINTR;
		else if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
	influx = 0;
	while (count) {
		KQ_OWNED(kq);
		kn = TAILQ_FIRST(&kq->kq_head);

		/*
		 * Another scanner's marker or an influx knote is at the
		 * head: wait for it to settle, then look again.
		 */
		if ((kn->kn_status == KN_MARKER && kn != marker) ||
		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
			if (influx) {
				influx = 0;
				KQ_FLUX_WAKEUP(kq);
			}
			kq->kq_state |= KQ_FLUXWAIT;
			error = msleep(kq, &kq->kq_lock, PSOCK,
			    "kqflxwt", 0);
			continue;
		}

		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
		/* Disabled knotes are simply dequeued and not reported. */
		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
			kn->kn_status &= ~KN_QUEUED;
			kq->kq_count--;
			continue;
		}
		if (kn == marker) {
			/*
			 * Reached our own marker: the pass is complete.
			 * If nothing was collected, start over (and
			 * possibly sleep); otherwise finish.
			 */
			KQ_FLUX_WAKEUP(kq);
			if (count == maxevents)
				goto retry;
			goto done;
		}
		KASSERT((kn->kn_status & KN_INFLUX) == 0,
		    ("KN_INFLUX set when not suppose to be"));

		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
			/* Dropped without being reported to the caller. */
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_INFLUX;
			kq->kq_count--;
			KQ_UNLOCK(kq);
			/*
			 * We don't need to lock the list since we've marked
			 * it _INFLUX.
			 */
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
			continue;
		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
			/* Report once, then detach and drop the knote. */
			kn->kn_status &= ~KN_QUEUED;
			kn->kn_status |= KN_INFLUX;
			kq->kq_count--;
			KQ_UNLOCK(kq);
			/*
			 * We don't need to lock the list since we've marked
			 * it _INFLUX.
			 */
			*kevp = kn->kn_kevent;
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
			kn = NULL;
		} else {
			/*
			 * Regular knote: re-check the filter with the kq
			 * lock dropped and the knote list locked.
			 */
			kn->kn_status |= KN_INFLUX | KN_SCAN;
			KQ_UNLOCK(kq);
			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
			KN_LIST_LOCK(kn);
			if (kn->kn_fop->f_event(kn, 0) == 0) {
				/* No longer triggered: deactivate, skip. */
				KQ_LOCK(kq);
				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
				kn->kn_status &=
				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
				    KN_SCAN);
				kq->kq_count--;
				KN_LIST_UNLOCK(kn);
				influx = 1;
				continue;
			}
			/* Non-fd filters may fill in the event themselves. */
			touch = (!kn->kn_fop->f_isfd &&
			    kn->kn_fop->f_touch != NULL);
			if (touch)
				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
			else
				*kevp = kn->kn_kevent;
			KQ_LOCK(kq);
			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
				/*
				 * Manually clear knotes who weren't
				 * 'touch'ed.
				 */
				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
					kn->kn_data = 0;
					kn->kn_fflags = 0;
				}
				if (kn->kn_flags & EV_DISPATCH)
					kn->kn_status |= KN_DISABLED;
				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
				kq->kq_count--;
			} else
				/* Stays active: requeue behind the marker. */
				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);

			kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
			KN_LIST_UNLOCK(kn);
			influx = 1;
		}

		/* we are returning a copy to the user */
		kevp++;
		nkev++;
		count--;

		/* Staging buffer full: flush it with the kq lock dropped. */
		if (nkev == KQ_NEVENTS) {
			influx = 0;
			KQ_UNLOCK_FLUX(kq);
			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
			nkev = 0;
			kevp = keva;
			KQ_LOCK(kq);
			if (error)
				break;
		}
	}
	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
done:
	KQ_OWNED(kq);
	KQ_UNLOCK_FLUX(kq);
	knote_free(marker);
done_nl:
	KQ_NOTOWNED(kq);
	/* Flush whatever is left in the staging buffer. */
	if (nkev != 0)
		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
	td->td_retval[0] = maxevents - count;
	return (error);
}
1726
/*
 * Read entry point for a kqueue file.  Reading is not implemented: every
 * argument is ignored and ENXIO is returned unconditionally.
 *
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (ENXIO);
}
1738
/*
 * Write entry point for a kqueue file.  Writing is not implemented: every
 * argument is ignored and ENXIO is returned unconditionally.
 */
/*ARGSUSED*/
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (ENXIO);
}
1746
/*
 * Truncate entry point for a kqueue file.  Not supported: every argument
 * is ignored and EINVAL is returned unconditionally.
 */
/*ARGSUSED*/
static int
kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}
1755
1756 /*ARGSUSED*/
1757 static int
kqueue_ioctl(struct file * fp,u_long cmd,void * data,struct ucred * active_cred,struct thread * td)1758 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
1759 struct ucred *active_cred, struct thread *td)
1760 {
1761 /*
1762 * Enabling sigio causes two major problems:
1763 * 1) infinite recursion:
1764 * Synopsys: kevent is being used to track signals and have FIOASYNC
1765 * set. On receipt of a signal this will cause a kqueue to recurse
1766 * into itself over and over. Sending the sigio causes the kqueue
1767 * to become ready, which in turn posts sigio again, forever.
1768 * Solution: this can be solved by setting a flag in the kqueue that
1769 * we have a SIGIO in progress.
1770 * 2) locking problems:
1771 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
1772 * us above the proc and pgrp locks.
1773 * Solution: Post a signal using an async mechanism, being sure to
1774 * record a generation count in the delivery so that we do not deliver
1775 * a signal to the wrong process.
1776 *
1777 * Note, these two mechanisms are somewhat mutually exclusive!
1778 */
1779 #if 0
1780 struct kqueue *kq;
1781
1782 kq = fp->f_data;
1783 switch (cmd) {
1784 case FIOASYNC:
1785 if (*(int *)data) {
1786 kq->kq_state |= KQ_ASYNC;
1787 } else {
1788 kq->kq_state &= ~KQ_ASYNC;
1789 }
1790 return (0);
1791
1792 case FIOSETOWN:
1793 return (fsetown(*(int *)data, &kq->kq_sigio));
1794
1795 case FIOGETOWN:
1796 *(int *)data = fgetown(&kq->kq_sigio);
1797 return (0);
1798 }
1799 #endif
1800
1801 return (ENOTTY);
1802 }
1803
1804 /*ARGSUSED*/
1805 static int
kqueue_poll(struct file * fp,int events,struct ucred * active_cred,struct thread * td)1806 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
1807 struct thread *td)
1808 {
1809 struct kqueue *kq;
1810 int revents = 0;
1811 int error;
1812
1813 if ((error = kqueue_acquire(fp, &kq)))
1814 return POLLERR;
1815
1816 KQ_LOCK(kq);
1817 if (events & (POLLIN | POLLRDNORM)) {
1818 if (kq->kq_count) {
1819 revents |= events & (POLLIN | POLLRDNORM);
1820 } else {
1821 selrecord(td, &kq->kq_sel);
1822 if (SEL_WAITING(&kq->kq_sel))
1823 kq->kq_state |= KQ_SEL;
1824 }
1825 }
1826 kqueue_release(kq, 1);
1827 KQ_UNLOCK(kq);
1828 return (revents);
1829 }
1830
1831 /*ARGSUSED*/
1832 static int
kqueue_stat(struct file * fp,struct stat * st,struct ucred * active_cred,struct thread * td)1833 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
1834 struct thread *td)
1835 {
1836
1837 bzero((void *)st, sizeof *st);
1838 /*
1839 * We no longer return kq_count because the unlocked value is useless.
1840 * If you spent all this time getting the count, why not spend your
1841 * syscall better by calling kevent?
1842 *
1843 * XXX - This is needed for libc_r.
1844 */
1845 st->st_mode = S_IFIFO;
1846 return (0);
1847 }
1848
/*
 * Close entry point for a kqueue file: tear the kqueue down and free it.
 *
 * Marks the kqueue KQ_CLOSING, waits for other references to go away,
 * detaches and drops every knote on both the per-descriptor lists and the
 * hash table, drains a scheduled task, wakes selectors, unlinks the kqueue
 * from its filedesc, and finally releases all memory.  Returns 0, or the
 * error from kqueue_acquire().
 */
/*ARGSUSED*/
static int
kqueue_close(struct file *fp, struct thread *td)
{
	struct kqueue *kq = fp->f_data;
	struct filedesc *fdp;
	struct knote *kn;
	int i;
	int error;
	int filedesc_unlock;

	if ((error = kqueue_acquire(fp, &kq)))
		return error;

	filedesc_unlock = 0;
	KQ_LOCK(kq);

	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
	    ("kqueue already closing"));
	/* From here on kqueue_acquire() refuses new references. */
	kq->kq_state |= KQ_CLOSING;
	/* kqueue_release() wakes us when only our own reference is left. */
	if (kq->kq_refcnt > 1)
		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);

	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
	fdp = kq->kq_fdp;

	KASSERT(knlist_empty(&kq->kq_sel.si_note),
	    ("kqueue's knlist not empty"));

	/* Drop all knotes attached to descriptors. */
	for (i = 0; i < kq->kq_knlistsize; i++) {
		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
			/* Somebody else is working on it: wait and retry. */
			if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
				kq->kq_state |= KQ_FLUXWAIT;
				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
				continue;
			}
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			if (!(kn->kn_status & KN_DETACHED))
				kn->kn_fop->f_detach(kn);
			knote_drop(kn, td);
			KQ_LOCK(kq);
		}
	}
	/* Same again for the knotes kept in the hash table. */
	if (kq->kq_knhashmask != 0) {
		for (i = 0; i <= kq->kq_knhashmask; i++) {
			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
				if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
					kq->kq_state |= KQ_FLUXWAIT;
					msleep(kq, &kq->kq_lock, PSOCK,
					    "kqclo2", 0);
					continue;
				}
				kn->kn_status |= KN_INFLUX;
				KQ_UNLOCK(kq);
				if (!(kn->kn_status & KN_DETACHED))
					kn->kn_fop->f_detach(kn);
				knote_drop(kn, td);
				KQ_LOCK(kq);
			}
		}
	}

	/* Wait for a scheduled kqueue_task() run to finish. */
	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
		kq->kq_state |= KQ_TASKDRAIN;
		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
	}

	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
		selwakeuppri(&kq->kq_sel, PSOCK);
		if (!SEL_WAITING(&kq->kq_sel))
			kq->kq_state &= ~KQ_SEL;
	}

	KQ_UNLOCK(kq);

	/*
	 * We could be called due to the knote_drop() doing fdrop(),
	 * called from kqueue_register().  In this case the global
	 * lock is owned, and filedesc sx is locked before, to not
	 * take the sleepable lock after non-sleepable.
	 */
	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
		FILEDESC_XLOCK(fdp);
		filedesc_unlock = 1;
	} else
		filedesc_unlock = 0;
	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
	if (filedesc_unlock)
		FILEDESC_XUNLOCK(fdp);

	/* Nothing can reach the kqueue any more; release its resources. */
	seldrain(&kq->kq_sel);
	knlist_destroy(&kq->kq_sel.si_note);
	mtx_destroy(&kq->kq_lock);
	kq->kq_fdp = NULL;

	if (kq->kq_knhash != NULL)
		free(kq->kq_knhash, M_KQUEUE);
	if (kq->kq_knlist != NULL)
		free(kq->kq_knlist, M_KQUEUE);

	funsetown(&kq->kq_sigio);
	free(kq, M_KQUEUE);
	fp->f_data = NULL;

	return (0);
}
1956
1957 static void
kqueue_wakeup(struct kqueue * kq)1958 kqueue_wakeup(struct kqueue *kq)
1959 {
1960 KQ_OWNED(kq);
1961
1962 if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
1963 kq->kq_state &= ~KQ_SLEEP;
1964 wakeup(kq);
1965 }
1966 if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1967 selwakeuppri(&kq->kq_sel, PSOCK);
1968 if (!SEL_WAITING(&kq->kq_sel))
1969 kq->kq_state &= ~KQ_SEL;
1970 }
1971 if (!knlist_empty(&kq->kq_sel.si_note))
1972 kqueue_schedtask(kq);
1973 if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
1974 pgsigio(&kq->kq_sigio, SIGIO, 0);
1975 }
1976 }
1977
/*
 * Walk down a list of knotes, activating them if their event has triggered.
 *
 * `hint' is passed through to each filter's f_event.  `lockflags' is a mask:
 * KNF_LISTLOCKED means the caller already holds the list lock (otherwise it
 * is taken and released here); KNF_NOKQLOCK means f_event is to be called
 * without the kq lock held.  A NULL list is ignored.
 *
 * There is a possibility to optimize in the case of one kq watching another.
 * Instead of scheduling a task to wake it up, you could pass enough state
 * down the chain to make up the parent kqueue.  Make this code functional
 * first.
 */
void
knote(struct knlist *list, long hint, int lockflags)
{
	struct kqueue *kq;
	struct knote *kn;
	int error;

	if (list == NULL)
		return;

	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);

	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_lock(list->kl_lockarg);

	/*
	 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
	 * the kqueue scheduling, but this will introduce four
	 * lock/unlock's for each knote to test.  If we do, continue to use
	 * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
	 * only safe if you want to remove the current item, which we are
	 * not doing.
	 */
	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
			/*
			 * Do not process the influx notes, except for
			 * the influx coming from the kq unlock in the
			 * kqueue_scan().  In the later case, we do
			 * not interfere with the scan, since the code
			 * fragment in kqueue_scan() locks the knlist,
			 * and cannot proceed until we finished.
			 */
			KQ_UNLOCK(kq);
		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
			/*
			 * Call the filter with the kq lock dropped; the
			 * knote is marked influx for the duration.
			 */
			kn->kn_status |= KN_INFLUX;
			KQ_UNLOCK(kq);
			error = kn->kn_fop->f_event(kn, hint);
			KQ_LOCK(kq);
			kn->kn_status &= ~KN_INFLUX;
			if (error)
				KNOTE_ACTIVATE(kn, 1);
			KQ_UNLOCK_FLUX(kq);
		} else {
			/*
			 * Call the filter with the kq lock held;
			 * KN_HASKQLOCK records that for the duration of
			 * the call (knlist_remove_inevent() tests it).
			 */
			kn->kn_status |= KN_HASKQLOCK;
			if (kn->kn_fop->f_event(kn, hint))
				KNOTE_ACTIVATE(kn, 1);
			kn->kn_status &= ~KN_HASKQLOCK;
			KQ_UNLOCK(kq);
		}
	}
	if ((lockflags & KNF_LISTLOCKED) == 0)
		list->kl_unlock(list->kl_lockarg);
}
2042
2043 /*
2044 * add a knote to a knlist
2045 */
2046 void
knlist_add(struct knlist * knl,struct knote * kn,int islocked)2047 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
2048 {
2049 KNL_ASSERT_LOCK(knl, islocked);
2050 KQ_NOTOWNED(kn->kn_kq);
2051 KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
2052 (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
2053 if (!islocked)
2054 knl->kl_lock(knl->kl_lockarg);
2055 SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
2056 if (!islocked)
2057 knl->kl_unlock(knl->kl_lockarg);
2058 KQ_LOCK(kn->kn_kq);
2059 kn->kn_knlist = knl;
2060 kn->kn_status &= ~KN_DETACHED;
2061 KQ_UNLOCK(kn->kn_kq);
2062 }
2063
2064 static void
knlist_remove_kq(struct knlist * knl,struct knote * kn,int knlislocked,int kqislocked)2065 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
2066 {
2067 KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
2068 KNL_ASSERT_LOCK(knl, knlislocked);
2069 mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
2070 if (!kqislocked)
2071 KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
2072 ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
2073 if (!knlislocked)
2074 knl->kl_lock(knl->kl_lockarg);
2075 SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
2076 kn->kn_knlist = NULL;
2077 if (!knlislocked)
2078 knl->kl_unlock(knl->kl_lockarg);
2079 if (!kqislocked)
2080 KQ_LOCK(kn->kn_kq);
2081 kn->kn_status |= KN_DETACHED;
2082 if (!kqislocked)
2083 KQ_UNLOCK(kn->kn_kq);
2084 }
2085
/*
 * Remove a knote from the given knlist.  `islocked' says whether the
 * caller holds the list lock; the kq lock is taken by the callee
 * (knlist_remove_kq is called with kqislocked == 0).
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{
	const int kq_is_locked = 0;

	knlist_remove_kq(knl, kn, islocked, kq_is_locked);
}
2095
2096 /*
2097 * remove knote from the specified knlist while in f_event handler.
2098 */
2099 void
knlist_remove_inevent(struct knlist * knl,struct knote * kn)2100 knlist_remove_inevent(struct knlist *knl, struct knote *kn)
2101 {
2102
2103 knlist_remove_kq(knl, kn, 1,
2104 (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
2105 }
2106
2107 int
knlist_empty(struct knlist * knl)2108 knlist_empty(struct knlist *knl)
2109 {
2110
2111 KNL_ASSERT_LOCKED(knl);
2112 return SLIST_EMPTY(&knl->kl_list);
2113 }
2114
/*
 * Shared fallback mutex: knlist_init() points kl_lockarg at it when the
 * caller passes no lock of its own.
 */
static struct mtx	knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
	MTX_DEF);
/* Forward declarations of the default mutex-based lock callbacks. */
static void knlist_mtx_lock(void *arg);
static void knlist_mtx_unlock(void *arg);
2120
/*
 * knlist lock callback for mutex-protected lists: arg is the struct mtx
 * to acquire.
 */
static void
knlist_mtx_lock(void *arg)
{
	struct mtx *m;

	m = arg;
	mtx_lock(m);
}
2127
/*
 * knlist unlock callback for mutex-protected lists: arg is the struct mtx
 * to release.
 */
static void
knlist_mtx_unlock(void *arg)
{
	struct mtx *m;

	m = arg;
	mtx_unlock(m);
}
2134
2135 static void
knlist_mtx_assert_locked(void * arg)2136 knlist_mtx_assert_locked(void *arg)
2137 {
2138
2139 mtx_assert((struct mtx *)arg, MA_OWNED);
2140 }
2141
2142 static void
knlist_mtx_assert_unlocked(void * arg)2143 knlist_mtx_assert_unlocked(void *arg)
2144 {
2145
2146 mtx_assert((struct mtx *)arg, MA_NOTOWNED);
2147 }
2148
/*
 * knlist lock callback for rwlock-protected lists: take the struct rwlock
 * passed as arg in read mode.
 */
static void
knlist_rw_rlock(void *arg)
{
	struct rwlock *rw;

	rw = arg;
	rw_rlock(rw);
}
2155
/*
 * knlist unlock callback for rwlock-protected lists: release a read hold
 * on the struct rwlock passed as arg.
 */
static void
knlist_rw_runlock(void *arg)
{
	struct rwlock *rw;

	rw = arg;
	rw_runlock(rw);
}
2162
2163 static void
knlist_rw_assert_locked(void * arg)2164 knlist_rw_assert_locked(void *arg)
2165 {
2166
2167 rw_assert((struct rwlock *)arg, RA_LOCKED);
2168 }
2169
2170 static void
knlist_rw_assert_unlocked(void * arg)2171 knlist_rw_assert_unlocked(void *arg)
2172 {
2173
2174 rw_assert((struct rwlock *)arg, RA_UNLOCKED);
2175 }
2176
2177 void
knlist_init(struct knlist * knl,void * lock,void (* kl_lock)(void *),void (* kl_unlock)(void *),void (* kl_assert_locked)(void *),void (* kl_assert_unlocked)(void *))2178 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
2179 void (*kl_unlock)(void *),
2180 void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
2181 {
2182
2183 if (lock == NULL)
2184 knl->kl_lockarg = &knlist_lock;
2185 else
2186 knl->kl_lockarg = lock;
2187
2188 if (kl_lock == NULL)
2189 knl->kl_lock = knlist_mtx_lock;
2190 else
2191 knl->kl_lock = kl_lock;
2192 if (kl_unlock == NULL)
2193 knl->kl_unlock = knlist_mtx_unlock;
2194 else
2195 knl->kl_unlock = kl_unlock;
2196 if (kl_assert_locked == NULL)
2197 knl->kl_assert_locked = knlist_mtx_assert_locked;
2198 else
2199 knl->kl_assert_locked = kl_assert_locked;
2200 if (kl_assert_unlocked == NULL)
2201 knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
2202 else
2203 knl->kl_assert_unlocked = kl_assert_unlocked;
2204
2205 SLIST_INIT(&knl->kl_list);
2206 }
2207
2208 void
knlist_init_mtx(struct knlist * knl,struct mtx * lock)2209 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
2210 {
2211
2212 knlist_init(knl, lock, NULL, NULL, NULL, NULL);
2213 }
2214
2215 void
knlist_init_rw_reader(struct knlist * knl,struct rwlock * lock)2216 knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
2217 {
2218
2219 knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
2220 knlist_rw_assert_locked, knlist_rw_assert_unlocked);
2221 }
2222
2223 void
knlist_destroy(struct knlist * knl)2224 knlist_destroy(struct knlist *knl)
2225 {
2226
2227 #ifdef INVARIANTS
2228 /*
2229 * if we run across this error, we need to find the offending
2230 * driver and have it call knlist_clear or knlist_delete.
2231 */
2232 if (!SLIST_EMPTY(&knl->kl_list))
2233 printf("WARNING: destroying knlist w/ knotes on it!\n");
2234 #endif
2235
2236 knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
2237 SLIST_INIT(&knl->kl_list);
2238 }
2239
/*
 * Remove every knote from knl.  With killkn set the knotes are dropped
 * outright; otherwise they are only detached and flagged EV_EOF |
 * EV_ONESHOT.
 *
 * `islocked' says whether the caller holds the list lock on entry; the
 * lock state is the same on return.
 *
 * Even if we are locked, we may need to drop the lock to allow any influx
 * knotes time to "settle".
 */
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
	struct knote *kn, *kn2;
	struct kqueue *kq;

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		KNL_ASSERT_UNLOCKED(knl);
again:		/* need to reacquire lock since we have dropped it */
		knl->kl_lock(knl->kl_lockarg);
	}

	/* First pass: take care of every knote that is not influx. */
	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		if ((kn->kn_status & KN_INFLUX)) {
			/* Busy elsewhere; it is handled after the loop. */
			KQ_UNLOCK(kq);
			continue;
		}
		knlist_remove_kq(knl, kn, 1, 1);
		if (killkn) {
			kn->kn_status |= KN_INFLUX | KN_DETACHED;
			KQ_UNLOCK(kq);
			knote_drop(kn, td);
		} else {
			/* Make sure cleared knotes disappear soon */
			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
			KQ_UNLOCK(kq);
		}
		kq = NULL;
	}

	if (!SLIST_EMPTY(&knl->kl_list)) {
		/* there are still KN_INFLUX remaining */
		kn = SLIST_FIRST(&knl->kl_list);
		kq = kn->kn_kq;
		KQ_LOCK(kq);
		KASSERT(kn->kn_status & KN_INFLUX,
		    ("knote removed w/o list lock"));
		/*
		 * Drop the list lock, sleep until the kqueue signals that
		 * flux has settled (PDROP: the kq lock is not retaken),
		 * then start over.  The jump target re-takes the list lock
		 * regardless of islocked.
		 */
		knl->kl_unlock(knl->kl_lockarg);
		kq->kq_state |= KQ_FLUXWAIT;
		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
		kq = NULL;
		goto again;
	}

	if (islocked)
		KNL_ASSERT_LOCKED(knl);
	else {
		knl->kl_unlock(knl->kl_lockarg);
		KNL_ASSERT_UNLOCKED(knl);
	}
}
2299
2300 /*
2301 * Remove all knotes referencing a specified fd must be called with FILEDESC
2302 * lock. This prevents a race where a new fd comes along and occupies the
2303 * entry and we attach a knote to the fd.
2304 */
2305 void
knote_fdclose(struct thread * td,int fd)2306 knote_fdclose(struct thread *td, int fd)
2307 {
2308 struct filedesc *fdp = td->td_proc->p_fd;
2309 struct kqueue *kq;
2310 struct knote *kn;
2311 int influx;
2312
2313 FILEDESC_XLOCK_ASSERT(fdp);
2314
2315 /*
2316 * We shouldn't have to worry about new kevents appearing on fd
2317 * since filedesc is locked.
2318 */
2319 TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2320 KQ_LOCK(kq);
2321
2322 again:
2323 influx = 0;
2324 while (kq->kq_knlistsize > fd &&
2325 (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2326 if (kn->kn_status & KN_INFLUX) {
2327 /* someone else might be waiting on our knote */
2328 if (influx)
2329 wakeup(kq);
2330 kq->kq_state |= KQ_FLUXWAIT;
2331 msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2332 goto again;
2333 }
2334 kn->kn_status |= KN_INFLUX;
2335 KQ_UNLOCK(kq);
2336 if (!(kn->kn_status & KN_DETACHED))
2337 kn->kn_fop->f_detach(kn);
2338 knote_drop(kn, td);
2339 influx = 1;
2340 KQ_LOCK(kq);
2341 }
2342 KQ_UNLOCK_FLUX(kq);
2343 }
2344 }
2345
2346 static int
knote_attach(struct knote * kn,struct kqueue * kq)2347 knote_attach(struct knote *kn, struct kqueue *kq)
2348 {
2349 struct klist *list;
2350
2351 KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
2352 KQ_OWNED(kq);
2353
2354 if (kn->kn_fop->f_isfd) {
2355 if (kn->kn_id >= kq->kq_knlistsize)
2356 return ENOMEM;
2357 list = &kq->kq_knlist[kn->kn_id];
2358 } else {
2359 if (kq->kq_knhash == NULL)
2360 return ENOMEM;
2361 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2362 }
2363
2364 SLIST_INSERT_HEAD(list, kn, kn_link);
2365
2366 return 0;
2367 }
2368
2369 /*
2370 * knote must already have been detached using the f_detach method.
2371 * no lock need to be held, it is assumed that the KN_INFLUX flag is set
2372 * to prevent other removal.
2373 */
2374 static void
knote_drop(struct knote * kn,struct thread * td)2375 knote_drop(struct knote *kn, struct thread *td)
2376 {
2377 struct kqueue *kq;
2378 struct klist *list;
2379
2380 kq = kn->kn_kq;
2381
2382 KQ_NOTOWNED(kq);
2383 KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
2384 ("knote_drop called without KN_INFLUX set in kn_status"));
2385
2386 KQ_LOCK(kq);
2387 if (kn->kn_fop->f_isfd)
2388 list = &kq->kq_knlist[kn->kn_id];
2389 else
2390 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2391
2392 if (!SLIST_EMPTY(list))
2393 SLIST_REMOVE(list, kn, knote, kn_link);
2394 if (kn->kn_status & KN_QUEUED)
2395 knote_dequeue(kn);
2396 KQ_UNLOCK_FLUX(kq);
2397
2398 if (kn->kn_fop->f_isfd) {
2399 fdrop(kn->kn_fp, td);
2400 kn->kn_fp = NULL;
2401 }
2402 kqueue_fo_release(kn->kn_kevent.filter);
2403 kn->kn_fop = NULL;
2404 knote_free(kn);
2405 }
2406
2407 void
knote_enqueue(struct knote * kn)2408 knote_enqueue(struct knote *kn)
2409 {
2410 struct kqueue *kq = kn->kn_kq;
2411
2412 KQ_OWNED(kn->kn_kq);
2413 KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2414
2415 TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2416 kn->kn_status |= KN_QUEUED;
2417 kq->kq_count++;
2418 kqueue_wakeup(kq);
2419 }
2420
2421 static void
knote_dequeue(struct knote * kn)2422 knote_dequeue(struct knote *kn)
2423 {
2424 struct kqueue *kq = kn->kn_kq;
2425
2426 KQ_OWNED(kn->kn_kq);
2427 KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2428
2429 TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2430 kn->kn_status &= ~KN_QUEUED;
2431 kq->kq_count--;
2432 }
2433
2434 static void
knote_init(void)2435 knote_init(void)
2436 {
2437
2438 knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2439 NULL, NULL, UMA_ALIGN_PTR, 0);
2440 }
2441 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2442
2443 static struct knote *
knote_alloc(int waitok)2444 knote_alloc(int waitok)
2445 {
2446 return ((struct knote *)uma_zalloc(knote_zone,
2447 (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
2448 }
2449
2450 static void
knote_free(struct knote * kn)2451 knote_free(struct knote *kn)
2452 {
2453 if (kn != NULL)
2454 uma_zfree(knote_zone, kn);
2455 }
2456
2457 /*
2458 * Register the kev w/ the kq specified by fd.
2459 */
2460 int
kqfd_register(int fd,struct kevent64_s * kev,struct thread * td,int waitok)2461 kqfd_register(int fd, struct kevent64_s *kev, struct thread *td, int waitok)
2462 {
2463 struct kqueue *kq;
2464 struct file *fp;
2465 cap_rights_t rights;
2466 int error;
2467
2468 error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
2469 if (error != 0)
2470 return (error);
2471 if ((error = kqueue_acquire(fp, &kq)) != 0)
2472 goto noacquire;
2473
2474 error = kqueue_register(kq, kev, td, waitok);
2475
2476 kqueue_release(kq, 0);
2477
2478 noacquire:
2479 fdrop(fp, td);
2480
2481 return error;
2482 }
2483