xref: /trueos/sys/kern/kern_event.c (revision f9482c6581e4206e7e778f05c3676dc594712b51)
1 /*-
2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
3  * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
4  * Copyright (c) 2009 Apple, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include "opt_ktrace.h"
33 #include "opt_compat_mach.h"
34 
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/capsicum.h>
38 #include <sys/kernel.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/rwlock.h>
42 #include <sys/proc.h>
43 #include <sys/malloc.h>
44 #include <sys/unistd.h>
45 #include <sys/file.h>
46 #include <sys/filedesc.h>
47 #include <sys/filio.h>
48 #include <sys/fcntl.h>
49 #include <sys/kthread.h>
50 #include <sys/selinfo.h>
51 #include <sys/stdatomic.h>
52 #include <sys/queue.h>
53 #include <sys/event.h>
54 #include <sys/eventvar.h>
55 #include <sys/poll.h>
56 #include <sys/protosw.h>
57 #include <sys/sigio.h>
58 #include <sys/signalvar.h>
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/stat.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysproto.h>
64 #include <sys/syscallsubr.h>
65 #include <sys/taskqueue.h>
66 #include <sys/uio.h>
67 #ifdef KTRACE
68 #include <sys/ktrace.h>
69 #endif
70 
71 #include <vm/uma.h>
72 
/* Malloc type used for the kqueue allocations made in this file. */
static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");

/*
 * This lock is used if multiple kq locks are required.  This possibly
 * should be made into a per proc lock.
 */
static struct mtx	kq_global;
/* Registers kq_global as an MTX_DEF mutex named "kqueue order". */
MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
/*
 * Conditionally take/drop the mutex `lck', tracking ownership in the
 * caller-supplied lvalue `haslck'.
 *
 * KQ_GLOBAL_LOCK locks only when `haslck' is zero and always leaves it
 * set to 1; KQ_GLOBAL_UNLOCK unlocks only when `haslck' is non-zero and
 * always leaves it set to 0.  Both may therefore be invoked repeatedly
 * without double-locking or double-unlocking.
 *
 * `haslck' is evaluated more than once, so it must be free of side
 * effects.  Arguments are parenthesized so that any lvalue expression
 * expands correctly.
 */
#define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
	if (!(haslck))				\
		mtx_lock(lck);			\
	(haslck) = 1;				\
} while (0)
#define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
	if (haslck)				\
		mtx_unlock(lck);		\
	(haslck) = 0;				\
} while (0)
91 
/*
 * Dedicated taskqueue named "kqueue".
 * NOTE(review): presumably where kq_task (set up in sys_kqueue) is
 * dispatched; the enqueue site is outside this view -- confirm.
 */
TASKQUEUE_DEFINE_THREAD(kqueue);

/* Copy-in/copy-out callbacks handed to kern_kevent() by the syscalls. */
static int	kevent_copyout(void *arg, void *kevp, int count);
static int	kevent_copyin(void *arg, void *kevp, int count);
static int	kevent64_copyout(void *arg, void *kevp, int count);
static int	kevent64_copyin(void *arg,void *kevp, int count);
/* Core kqueue operations. */
static int	kqueue_register(struct kqueue *kq, struct kevent64_s *kev,
		    struct thread *td, int waitok);
static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
static void	kqueue_release(struct kqueue *kq, int locked);
static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
		    uintptr_t ident, int waitok);
static void	kqueue_task(void *arg, int pending);
static int	kqueue_scan(struct kqueue *kq, int maxevents,
		    struct kevent_copyops *k_ops,
		    const struct timespec *timeout,
		    struct kevent64_s *keva, struct thread *td);
static void 	kqueue_wakeup(struct kqueue *kq);
/* Lookup/release of entries in the sysfilt_ops[] table. */
static struct filterops *kqueue_fo_find(int filt);
static void	kqueue_fo_release(int filt);

/* File operations implemented for kqueue descriptors. */
static fo_rdwr_t	kqueue_read;
static fo_rdwr_t	kqueue_write;
static fo_truncate_t	kqueue_truncate;
static fo_ioctl_t	kqueue_ioctl;
static fo_poll_t	kqueue_poll;
static fo_kqfilter_t	kqueue_kqfilter;
static fo_stat_t	kqueue_stat;
static fo_close_t	kqueue_close;

/*
 * fileops vector installed on every kqueue file by sys_kqueue() via
 * finit().  chmod, chown and sendfile are routed to the invfo_* stubs.
 */
static struct fileops kqueueops = {
	.fo_read = kqueue_read,
	.fo_write = kqueue_write,
	.fo_truncate = kqueue_truncate,
	.fo_ioctl = kqueue_ioctl,
	.fo_poll = kqueue_poll,
	.fo_kqfilter = kqueue_kqfilter,
	.fo_stat = kqueue_stat,
	.fo_close = kqueue_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
};

/*
 * knote life-cycle helpers.  Note that knote_enqueue() is the only one
 * declared without `static'.
 */
static int 	knote_attach(struct knote *kn, struct kqueue *kq);
static void 	knote_drop(struct knote *kn, struct thread *td);
void 	knote_enqueue(struct knote *kn);
static void 	knote_dequeue(struct knote *kn);
static void 	knote_init(void);
static struct 	knote *knote_alloc(int waitok);
static void 	knote_free(struct knote *kn);

/* Filter callbacks referenced by the filterops tables below. */
static void	filt_kqdetach(struct knote *kn);
static int	filt_kqueue(struct knote *kn, long hint);
static int	filt_procattach(struct knote *kn);
static void	filt_procdetach(struct knote *kn);
static int	filt_proc(struct knote *kn, long hint);
static int	filt_fileattach(struct knote *kn);
static void	filt_timerexpire(void *knx);
static int	filt_timerattach(struct knote *kn);
static void	filt_timerdetach(struct knote *kn);
static int	filt_timer(struct knote *kn, long hint);
static int	filt_userattach(struct knote *kn);
static void	filt_userdetach(struct knote *kn);
static int	filt_user(struct knote *kn, long hint);
static void	filt_usertouch(struct knote *kn, struct kevent64_s *kev,
		    u_long type);
159 
160 
/*
 * Filter for descriptor-based events: attach is forwarded to the file's
 * own fo_kqfilter method by filt_fileattach().
 */
static struct filterops file_filtops = {
	.f_isfd = 1,
	.f_attach = filt_fileattach,
};
/* Installed by kqueue_kqfilter() on knotes that watch another kqueue. */
static struct filterops kqread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_kqdetach,
	.f_event = filt_kqueue,
};
/* XXX - move to kern_proc.c?  */
static struct filterops proc_filtops = {
	.f_isfd = 0,
	.f_attach = filt_procattach,
	.f_detach = filt_procdetach,
	.f_event = filt_proc,
};
/* EVFILT_TIMER: callout-driven timers, see filt_timerattach(). */
static struct filterops timer_filtops = {
	.f_isfd = 0,
	.f_attach = filt_timerattach,
	.f_detach = filt_timerdetach,
	.f_event = filt_timer,
};
/* EVFILT_USER: the only table here that provides an f_touch method. */
static struct filterops user_filtops = {
	.f_attach = filt_userattach,
	.f_detach = filt_userdetach,
	.f_event = filt_user,
	.f_touch = filt_usertouch,
};

/* UMA zone handle for knotes; set up outside this view. */
static uma_zone_t	knote_zone;
/*
 * Number of timer callouts currently allocated.  Incremented in
 * filt_timerattach() (bounded by kq_calloutmax) and decremented in
 * filt_timerdetach().
 */
static atomic_uint	kq_ncallouts = ATOMIC_VAR_INIT(0);
static unsigned int 	kq_calloutmax = 4 * 1024;
SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
    &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
195 
/* XXX - ensure not KN_INFLUX?? */
/*
 * Mark `kn' active and, unless it is already queued or is disabled, put
 * it on its kqueue.  When `islock' is non-zero the caller must already
 * hold the kq lock (asserted); otherwise the macro takes and drops the
 * lock itself.  `kn' and `islock' are evaluated several times.
 */
#define KNOTE_ACTIVATE(kn, islock) do { 				\
	if ((islock))							\
		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
	else								\
		KQ_LOCK((kn)->kn_kq);					\
	(kn)->kn_status |= KN_ACTIVE;					\
	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
		knote_enqueue((kn));					\
	if (!(islock))							\
		KQ_UNLOCK((kn)->kn_kq);					\
} while(0)
/* Acquire the per-kqueue mutex. */
#define KQ_LOCK(kq) do {						\
	mtx_lock(&(kq)->kq_lock);					\
} while (0)
/*
 * If KQ_FLUXWAIT is set in kq_state, clear it and wakeup() sleepers on
 * `kq'.  Does nothing otherwise.
 */
#define KQ_FLUX_WAKEUP(kq) do {						\
	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
		wakeup((kq));						\
	}								\
} while (0)
/* KQ_FLUX_WAKEUP() followed by release of the per-kqueue mutex. */
#define KQ_UNLOCK_FLUX(kq) do {						\
	KQ_FLUX_WAKEUP(kq);						\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)
/* Release the per-kqueue mutex. */
#define KQ_UNLOCK(kq) do {						\
	mtx_unlock(&(kq)->kq_lock);					\
} while (0)
/* Assert that the current thread holds the per-kqueue mutex. */
#define KQ_OWNED(kq) do {						\
	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
} while (0)
/* Assert that the current thread does not hold the per-kqueue mutex. */
#define KQ_NOTOWNED(kq) do {						\
	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
} while (0)
/*
 * Lock/unlock the knlist a knote is attached to, through the knlist's own
 * kl_lock/kl_unlock callbacks.  A knote that is not on any knlist
 * (kn_knlist == NULL) is silently ignored.
 *
 * The argument is parenthesized at every use so that expressions such as
 * `base + i' expand correctly; it is evaluated more than once.
 */
#define KN_LIST_LOCK(kn) do {						\
	if ((kn)->kn_knlist != NULL)					\
		(kn)->kn_knlist->kl_lock((kn)->kn_knlist->kl_lockarg);	\
} while (0)
#define KN_LIST_UNLOCK(kn) do {						\
	if ((kn)->kn_knlist != NULL) 					\
		(kn)->kn_knlist->kl_unlock((kn)->kn_knlist->kl_lockarg); \
} while (0)
/*
 * Assert that the knlist lock is held (`islocked' non-zero) or not held
 * (`islocked' zero).  Expands to nothing unless INVARIANTS is defined.
 */
#define	KNL_ASSERT_LOCK(knl, islocked) do {				\
	if (islocked)							\
		KNL_ASSERT_LOCKED(knl);				\
	else								\
		KNL_ASSERT_UNLOCKED(knl);				\
} while (0)
#ifdef INVARIANTS
#define	KNL_ASSERT_LOCKED(knl) do {					\
	(knl)->kl_assert_locked((knl)->kl_lockarg);			\
} while (0)
#define	KNL_ASSERT_UNLOCKED(knl) do {					\
	(knl)->kl_assert_unlocked((knl)->kl_lockarg);			\
} while (0)
#else /* !INVARIANTS */
#define	KNL_ASSERT_LOCKED(knl) do {} while(0)
#define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */
255 
#define	KN_HASHSIZE		64		/* XXX should be tunable */
/*
 * Fold bits 8 and up of `val' into its low bits, then reduce with `mask'.
 * Every use of `val' is parenthesized so that an argument containing a
 * low-precedence operator (e.g. `a | b') is shifted as a whole.
 */
#define KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
258 
/*
 * Attach routine of the placeholder filter: always refuses the knote
 * with ENXIO.
 */
static int
filt_nullattach(struct knote *kn)
{

	return (ENXIO);
}
265 
/*
 * Placeholder filter for unimplemented or unregistered slots; its attach
 * routine always fails with ENXIO.
 */
struct filterops null_filtops = {
	.f_isfd = 0,
	.f_attach = filt_nullattach,
};

/* XXX - make SYSINIT to add these, and move into respective modules. */
extern struct filterops sig_filtops;
extern struct filterops fs_filtops;

/*
 * Table for all system-defined filters.
 *
 * Indexed by ~filt, i.e. filter -1 is slot 0 (see kqueue_fo_find()).
 * for_refcnt counts outstanding kqueue_fo_find() references; entries
 * that omit it start at zero.  Access is serialized by filterops_lock.
 */
static struct mtx	filterops_lock;
MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
	MTX_DEF);
static struct {
	struct filterops *for_fop;
	int for_refcnt;
} sysfilt_ops[EVFILT_SYSCOUNT] = {
	{ &file_filtops },			/* EVFILT_READ */
	{ &file_filtops },			/* EVFILT_WRITE */
	{ &null_filtops },			/* EVFILT_AIO */
	{ &file_filtops },			/* EVFILT_VNODE */
	{ &proc_filtops },			/* EVFILT_PROC */
	{ &sig_filtops },			/* EVFILT_SIGNAL */
	{ &timer_filtops },			/* EVFILT_TIMER */
	{ &null_filtops },			/* former EVFILT_NETDEV */
	{ &fs_filtops },			/* EVFILT_FS */
	{ &null_filtops },			/* EVFILT_LIO */
	/* Starts with a reference count of 1, unlike the other entries. */
	{ &user_filtops, 1 },			/* EVFILT_USER */
	{ &null_filtops },			/* EVFILT_SENDFILE */
	{ &null_filtops },		/* EVFILT_MACHPORT */
#ifdef HAVE_EVFILT_VM
	{ &vm_filtops },			/* EVFILT_VM */
#else
	{ &null_filtops },			/* EVFILT_VM slot, filter not built */
#endif
};
304 
305 /*
306  * Simple redirection for all cdevsw style objects to call their fo_kqfilter
307  * method.
308  */
309 static int
filt_fileattach(struct knote * kn)310 filt_fileattach(struct knote *kn)
311 {
312 
313 	return (fo_kqfilter(kn->kn_fp, kn));
314 }
315 
316 /*ARGSUSED*/
317 static int
kqueue_kqfilter(struct file * fp,struct knote * kn)318 kqueue_kqfilter(struct file *fp, struct knote *kn)
319 {
320 	struct kqueue *kq = kn->kn_fp->f_data;
321 
322 	if (kn->kn_filter != EVFILT_READ)
323 		return (EINVAL);
324 
325 	kn->kn_status |= KN_KQUEUE;
326 	kn->kn_fop = &kqread_filtops;
327 	knlist_add(&kq->kq_sel.si_note, kn, 0);
328 
329 	return (0);
330 }
331 
332 static void
filt_kqdetach(struct knote * kn)333 filt_kqdetach(struct knote *kn)
334 {
335 	struct kqueue *kq = kn->kn_fp->f_data;
336 
337 	knlist_remove(&kq->kq_sel.si_note, kn, 0);
338 }
339 
340 /*ARGSUSED*/
341 static int
filt_kqueue(struct knote * kn,long hint)342 filt_kqueue(struct knote *kn, long hint)
343 {
344 	struct kqueue *kq = kn->kn_fp->f_data;
345 
346 	kn->kn_data = kq->kq_count;
347 	return (kn->kn_data > 0);
348 }
349 
350 /* XXX - move to kern_proc.c?  */
351 static int
filt_procattach(struct knote * kn)352 filt_procattach(struct knote *kn)
353 {
354 	struct proc *p;
355 	int immediate;
356 	int error;
357 
358 	immediate = 0;
359 	p = pfind(kn->kn_id);
360 	if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
361 		p = zpfind(kn->kn_id);
362 		immediate = 1;
363 	} else if (p != NULL && (p->p_flag & P_WEXIT)) {
364 		immediate = 1;
365 	}
366 
367 	if (p == NULL)
368 		return (ESRCH);
369 	if ((error = p_cansee(curthread, p))) {
370 		PROC_UNLOCK(p);
371 		return (error);
372 	}
373 
374 	kn->kn_ptr.p_proc = p;
375 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
376 
377 	/*
378 	 * internal flag indicating registration done by kernel
379 	 */
380 	if (kn->kn_flags & EV_FLAG1) {
381 		kn->kn_data = kn->kn_sdata;		/* ppid */
382 		kn->kn_fflags = NOTE_CHILD;
383 		kn->kn_flags &= ~EV_FLAG1;
384 	}
385 
386 	if (immediate == 0)
387 		knlist_add(&p->p_klist, kn, 1);
388 
389 	/*
390 	 * Immediately activate any exit notes if the target process is a
391 	 * zombie.  This is necessary to handle the case where the target
392 	 * process, e.g. a child, dies before the kevent is registered.
393 	 */
394 	if (immediate && filt_proc(kn, NOTE_EXIT))
395 		KNOTE_ACTIVATE(kn, 0);
396 
397 	PROC_UNLOCK(p);
398 
399 	return (0);
400 }
401 
402 /*
403  * The knote may be attached to a different process, which may exit,
404  * leaving nothing for the knote to be attached to.  So when the process
405  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
406  * it will be deleted when read out.  However, as part of the knote deletion,
407  * this routine is called, so a check is needed to avoid actually performing
408  * a detach, because the original process does not exist any more.
409  */
410 /* XXX - move to kern_proc.c?  */
411 static void
filt_procdetach(struct knote * kn)412 filt_procdetach(struct knote *kn)
413 {
414 	struct proc *p;
415 
416 	p = kn->kn_ptr.p_proc;
417 	knlist_remove(&p->p_klist, kn, 0);
418 	kn->kn_ptr.p_proc = NULL;
419 }
420 
421 /* XXX - move to kern_proc.c?  */
422 static int
filt_proc(struct knote * kn,long hint)423 filt_proc(struct knote *kn, long hint)
424 {
425 	struct proc *p = kn->kn_ptr.p_proc;
426 	u_int event;
427 
428 	/*
429 	 * mask off extra data
430 	 */
431 	event = (u_int)hint & NOTE_PCTRLMASK;
432 
433 	/*
434 	 * if the user is interested in this event, record it.
435 	 */
436 	if (kn->kn_sfflags & event)
437 		kn->kn_fflags |= event;
438 
439 	/*
440 	 * process is gone, so flag the event as finished.
441 	 */
442 	if (event == NOTE_EXIT) {
443 		if (!(kn->kn_status & KN_DETACHED))
444 			knlist_remove_inevent(&p->p_klist, kn);
445 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
446 		kn->kn_ptr.p_proc = NULL;
447 		if (kn->kn_fflags & NOTE_EXIT) {
448 			kn->kn_data = p->p_xstat;
449 			/* OS X compatibility */
450 			if (kn->kn_sfflags & NOTE_EXITSTATUS)
451 				kn->kn_fflags |= NOTE_EXITSTATUS;
452 		}
453 		if (kn->kn_fflags == 0)
454 			kn->kn_flags |= EV_DROP;
455 		return (1);
456 	}
457 
458 	return (kn->kn_fflags != 0);
459 }
460 
461 /*
462  * Called when the process forked. It mostly does the same as the
463  * knote(), activating all knotes registered to be activated when the
464  * process forked. Additionally, for each knote attached to the
465  * parent, check whether user wants to track the new process. If so
466  * attach a new knote to it, and immediately report an event with the
467  * child's pid.
468  */
469 void
knote_fork(struct knlist * list,int pid)470 knote_fork(struct knlist *list, int pid)
471 {
472 	struct kqueue *kq;
473 	struct knote *kn;
474 	struct kevent64_s kev;
475 	int error;
476 
477 	if (list == NULL)
478 		return;
479 	list->kl_lock(list->kl_lockarg);
480 
481 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
482 		if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
483 			continue;
484 		kq = kn->kn_kq;
485 		KQ_LOCK(kq);
486 		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
487 			KQ_UNLOCK(kq);
488 			continue;
489 		}
490 
491 		/*
492 		 * The same as knote(), activate the event.
493 		 */
494 		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
495 			kn->kn_status |= KN_HASKQLOCK;
496 			if (kn->kn_fop->f_event(kn, NOTE_FORK))
497 				KNOTE_ACTIVATE(kn, 1);
498 			kn->kn_status &= ~KN_HASKQLOCK;
499 			KQ_UNLOCK(kq);
500 			continue;
501 		}
502 
503 		/*
504 		 * The NOTE_TRACK case. In addition to the activation
505 		 * of the event, we need to register new event to
506 		 * track the child. Drop the locks in preparation for
507 		 * the call to kqueue_register().
508 		 */
509 		kn->kn_status |= KN_INFLUX;
510 		KQ_UNLOCK(kq);
511 		list->kl_unlock(list->kl_lockarg);
512 
513 		/*
514 		 * Activate existing knote and register a knote with
515 		 * new process.
516 		 */
517 		kev.ident = pid;
518 		kev.filter = kn->kn_filter;
519 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
520 		kev.fflags = kn->kn_sfflags;
521 		kev.data = kn->kn_id;		/* parent */
522 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
523 		error = kqueue_register(kq, &kev, NULL, 0);
524 		if (error)
525 			kn->kn_fflags |= NOTE_TRACKERR;
526 		if (kn->kn_fop->f_event(kn, NOTE_FORK))
527 			KNOTE_ACTIVATE(kn, 0);
528 		KQ_LOCK(kq);
529 		kn->kn_status &= ~KN_INFLUX;
530 		KQ_UNLOCK_FLUX(kq);
531 		list->kl_lock(list->kl_lockarg);
532 	}
533 	list->kl_unlock(list->kl_lockarg);
534 }
535 
536 /*
537  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
538  * interval timer support code.
539  */
540 
541 #define NOTE_TIMER_PRECMASK	(NOTE_SECONDS|NOTE_MSECONDS|NOTE_USECONDS| \
542 				NOTE_NSECONDS)
543 
544 static __inline sbintime_t
timer2sbintime(intptr_t data,int flags)545 timer2sbintime(intptr_t data, int flags)
546 {
547 	sbintime_t modifier;
548 
549 	switch (flags & NOTE_TIMER_PRECMASK) {
550 	case NOTE_SECONDS:
551 		modifier = SBT_1S;
552 		break;
553 	case NOTE_MSECONDS: /* FALLTHROUGH */
554 	case 0:
555 		modifier = SBT_1MS;
556 		break;
557 	case NOTE_USECONDS:
558 		modifier = SBT_1US;
559 		break;
560 	case NOTE_NSECONDS:
561 		modifier = SBT_1NS;
562 		break;
563 	default:
564 		return (-1);
565 	}
566 
567 #ifdef __LP64__
568 	if (data > SBT_MAX / modifier)
569 		return (SBT_MAX);
570 #endif
571 	return (modifier * data);
572 }
573 
574 static void
filt_timerexpire(void * knx)575 filt_timerexpire(void *knx)
576 {
577 	struct callout *calloutp;
578 	struct knote *kn;
579 
580 	kn = knx;
581 	kn->kn_data++;
582 	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
583 
584 	if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
585 		calloutp = (struct callout *)kn->kn_hook;
586 		*kn->kn_ptr.p_nexttime += timer2sbintime(kn->kn_sdata,
587 		    kn->kn_sfflags);
588 		callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
589 		    filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
590 	}
591 }
592 
593 /*
594  * data contains amount of time to sleep
595  */
596 static int
filt_timerattach(struct knote * kn)597 filt_timerattach(struct knote *kn)
598 {
599 	struct callout *calloutp;
600 	sbintime_t to;
601 	unsigned int ncallouts;
602 
603 	if ((intptr_t)kn->kn_sdata < 0)
604 		return (EINVAL);
605 	if ((intptr_t)kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
606 		kn->kn_sdata = 1;
607 	/* Only precision unit are supported in flags so far */
608 	if (kn->kn_sfflags & ~NOTE_TIMER_PRECMASK)
609 		return (EINVAL);
610 
611 	to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
612 	if (to < 0)
613 		return (EINVAL);
614 
615 	ncallouts = atomic_load_explicit(&kq_ncallouts, memory_order_relaxed);
616 	do {
617 		if (ncallouts >= kq_calloutmax)
618 			return (ENOMEM);
619 	} while (!atomic_compare_exchange_weak_explicit(&kq_ncallouts,
620 	    &ncallouts, ncallouts + 1, memory_order_relaxed,
621 	    memory_order_relaxed));
622 
623 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
624 	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
625 	kn->kn_ptr.p_nexttime = malloc(sizeof(sbintime_t), M_KQUEUE, M_WAITOK);
626 	calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
627 	callout_init(calloutp, CALLOUT_MPSAFE);
628 	kn->kn_hook = calloutp;
629 	*kn->kn_ptr.p_nexttime = to + sbinuptime();
630 	callout_reset_sbt_on(calloutp, *kn->kn_ptr.p_nexttime, 0,
631 	    filt_timerexpire, kn, PCPU_GET(cpuid), C_ABSOLUTE);
632 
633 	return (0);
634 }
635 
636 static void
filt_timerdetach(struct knote * kn)637 filt_timerdetach(struct knote *kn)
638 {
639 	struct callout *calloutp;
640 	unsigned int old;
641 
642 	calloutp = (struct callout *)kn->kn_hook;
643 	callout_drain(calloutp);
644 	free(calloutp, M_KQUEUE);
645 	free(kn->kn_ptr.p_nexttime, M_KQUEUE);
646 	old = atomic_fetch_sub_explicit(&kq_ncallouts, 1, memory_order_relaxed);
647 	KASSERT(old > 0, ("Number of callouts cannot become negative"));
648 	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
649 }
650 
651 static int
filt_timer(struct knote * kn,long hint)652 filt_timer(struct knote *kn, long hint)
653 {
654 
655 	return (kn->kn_data != 0);
656 }
657 
658 static int
filt_userattach(struct knote * kn)659 filt_userattach(struct knote *kn)
660 {
661 
662 	/*
663 	 * EVFILT_USER knotes are not attached to anything in the kernel.
664 	 */
665 	kn->kn_hook = NULL;
666 	if (kn->kn_fflags & NOTE_TRIGGER)
667 		kn->kn_hookid = 1;
668 	else
669 		kn->kn_hookid = 0;
670 	return (0);
671 }
672 
673 static void
filt_userdetach(__unused struct knote * kn)674 filt_userdetach(__unused struct knote *kn)
675 {
676 
677 	/*
678 	 * EVFILT_USER knotes are not attached to anything in the kernel.
679 	 */
680 }
681 
682 static int
filt_user(struct knote * kn,__unused long hint)683 filt_user(struct knote *kn, __unused long hint)
684 {
685 
686 	return (kn->kn_hookid);
687 }
688 
689 static void
filt_usertouch(struct knote * kn,struct kevent64_s * kev,u_long type)690 filt_usertouch(struct knote *kn, struct kevent64_s *kev, u_long type)
691 {
692 	u_int ffctrl;
693 
694 	switch (type) {
695 	case EVENT_REGISTER:
696 		if (kev->fflags & NOTE_TRIGGER)
697 			kn->kn_hookid = 1;
698 
699 		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
700 		kev->fflags &= NOTE_FFLAGSMASK;
701 		switch (ffctrl) {
702 		case NOTE_FFNOP:
703 			break;
704 
705 		case NOTE_FFAND:
706 			kn->kn_sfflags &= kev->fflags;
707 			break;
708 
709 		case NOTE_FFOR:
710 			kn->kn_sfflags |= kev->fflags;
711 			break;
712 
713 		case NOTE_FFCOPY:
714 			kn->kn_sfflags = kev->fflags;
715 			break;
716 
717 		default:
718 			/* XXX Return error? */
719 			break;
720 		}
721 		kn->kn_sdata = kev->data;
722 		if (kev->flags & EV_CLEAR) {
723 			kn->kn_hookid = 0;
724 			kn->kn_data = 0;
725 			kn->kn_fflags = 0;
726 		}
727 		break;
728 
729         case EVENT_PROCESS:
730 		*kev = kn->kn_kevent;
731 		kev->fflags = kn->kn_sfflags;
732 		kev->data = kn->kn_sdata;
733 		if (kn->kn_flags & EV_CLEAR) {
734 			kn->kn_hookid = 0;
735 			kn->kn_data = 0;
736 			kn->kn_fflags = 0;
737 		}
738 		break;
739 
740 	default:
741 		panic("filt_usertouch() - invalid type (%ld)", type);
742 		break;
743 	}
744 }
745 
746 int
sys_kqueue(struct thread * td,struct kqueue_args * uap)747 sys_kqueue(struct thread *td, struct kqueue_args *uap)
748 {
749 	struct filedesc *fdp;
750 	struct kqueue *kq;
751 	struct file *fp;
752 	int fd, error;
753 
754 	fdp = td->td_proc->p_fd;
755 	error = falloc(td, &fp, &fd, 0);
756 	if (error)
757 		goto done2;
758 
759 	/* An extra reference on `fp' has been held for us by falloc(). */
760 	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
761 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
762 	TAILQ_INIT(&kq->kq_head);
763 	kq->kq_fdp = fdp;
764 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
765 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
766 
767 	FILEDESC_XLOCK(fdp);
768 	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
769 	FILEDESC_XUNLOCK(fdp);
770 
771 	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
772 	fdrop(fp, td);
773 
774 	td->td_retval[0] = fd;
775 done2:
776 	return (error);
777 }
778 
779 #ifndef _SYS_SYSPROTO_H_
780 struct kevent64_args {
781 	int fd;
782 	struct kevent64_s *changelist;
783 	int nchanges;
784 	struct kevent64_s *eventlist;
785 	int nevents;
786 	const struct timespec *timeout;
787 };
788 #endif
789 int
sys_kevent64(struct thread * td,struct kevent64_args * uap)790 sys_kevent64(struct thread *td, struct kevent64_args *uap)
791 {
792 	struct timespec ts, *tsp;
793 	struct kevent_copyops k_ops = { uap,
794 					kevent64_copyout,
795 					kevent64_copyin};
796 	int error;
797 
798 	if (uap->timeout != NULL) {
799 		error = copyin(uap->timeout, &ts, sizeof(ts));
800 		if (error)
801 			return (error);
802 		tsp = &ts;
803 	} else
804 		tsp = NULL;
805 
806 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
807 			&k_ops, tsp, 1);
808 
809 	return (error);
810 }
811 
812 #ifndef _SYS_SYSPROTO_H_
813 struct kevent_args {
814 	int	fd;
815 	const struct kevent *changelist;
816 	int	nchanges;
817 	struct	kevent *eventlist;
818 	int	nevents;
819 	const struct timespec *timeout;
820 };
821 #endif
822 int
sys_kevent(struct thread * td,struct kevent_args * uap)823 sys_kevent(struct thread *td, struct kevent_args *uap)
824 {
825 	struct timespec ts, *tsp;
826 	struct kevent_copyops k_ops = { uap,
827 					kevent_copyout,
828 					kevent_copyin};
829 	int error;
830 #ifdef KTRACE
831 	struct uio ktruio;
832 	struct iovec ktriov;
833 	struct uio *ktruioin = NULL;
834 	struct uio *ktruioout = NULL;
835 #endif
836 
837 	if (uap->timeout != NULL) {
838 		error = copyin(uap->timeout, &ts, sizeof(ts));
839 		if (error)
840 			return (error);
841 		tsp = &ts;
842 	} else
843 		tsp = NULL;
844 
845 #ifdef KTRACE
846 	if (KTRPOINT(td, KTR_GENIO)) {
847 		ktriov.iov_base = uap->changelist;
848 		ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
849 		ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
850 		    .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
851 		    .uio_td = td };
852 		ktruioin = cloneuio(&ktruio);
853 		ktriov.iov_base = uap->eventlist;
854 		ktriov.iov_len = uap->nevents * sizeof(struct kevent);
855 		ktruioout = cloneuio(&ktruio);
856 	}
857 #endif
858 
859 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
860 			&k_ops, tsp, 0);
861 
862 #ifdef KTRACE
863 	if (ktruioin != NULL) {
864 		ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
865 		ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
866 		ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
867 		ktrgenio(uap->fd, UIO_READ, ktruioout, error);
868 	}
869 #endif
870 
871 	return (error);
872 }
873 
874 /*
875  * Copy 'count' items into the destination list pointed to by uap->eventlist.
876  */
877 static int
kevent64_copyout(void * arg,void * kevp,int count)878 kevent64_copyout(void *arg, void *kevp, int count)
879 {
880 	struct kevent64_args *uap;
881 	int error;
882 
883 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
884 	uap = (struct kevent64_args *)arg;
885 
886 	error = copyout(kevp, uap->eventlist, count * sizeof(struct kevent64_s));
887 	if (error == 0)
888 		uap->eventlist += count;
889 	return (error);
890 }
891 
892 static int
kevent_copyout(void * arg,void * kevp,int count)893 kevent_copyout(void *arg, void *kevp, int count)
894 {
895 	struct kevent64_s *kev;
896 	struct kevent_args *uap;
897 	int error, i;
898 
899 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
900 	uap = (struct kevent_args *)arg;
901 	kev = (struct kevent64_s *)kevp;
902 
903 	for (i = 0; i < count; i++) {
904 		error = copyout((const void *)&kev[i], uap->eventlist, sizeof(struct kevent));
905 		if (error == 0)
906 			uap->eventlist++;
907 	}
908 
909 	return (error);
910 }
911 
912 /*
913  * Copy 'count' items from the list pointed to by uap->changelist.
914  */
915 static int
kevent64_copyin(void * arg,void * kevp,int count)916 kevent64_copyin(void *arg, void *kevp, int count)
917 {
918 	struct kevent64_args *uap;
919 	int error;
920 
921 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
922 	uap = (struct kevent64_args *)arg;
923 
924 	error = copyin(uap->changelist, kevp, count * sizeof(struct kevent64_s));
925 	if (error == 0)
926 		uap->changelist += count;
927 	return (error);
928 }
929 
930 static int
kevent_copyin(void * arg,void * kevp,int count)931 kevent_copyin(void *arg, void *kevp, int count)
932 {
933 	struct kevent_args *uap;
934 	int error;
935 
936 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
937 	uap = (struct kevent_args *)arg;
938 
939 	error = copyin(uap->changelist, kevp, count * sizeof(struct kevent));
940 	if (error == 0)
941 		uap->changelist += count;
942 	return (error);
943 }
944 
945 int
kern_kevent(struct thread * td,int fd,int nchanges,int nevents,struct kevent_copyops * k_ops,const struct timespec * timeout,int v1)946 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
947 	struct kevent_copyops *k_ops, const struct timespec *timeout,
948 	int v1)
949 {
950 	struct kevent keva[KQ_NEVENTS];
951 	struct kevent64_s keva64[KQ_NEVENTS];
952 	struct kevent *changes;
953 	struct kevent64_s kevtmp, *kevp;
954 	struct kqueue *kq;
955 	struct file *fp;
956 	cap_rights_t rights;
957 	int i, n, nerrors, error;
958 
959 	cap_rights_init(&rights);
960 	if (nchanges > 0)
961 		cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
962 	if (nevents > 0)
963 		cap_rights_set(&rights, CAP_KQUEUE_EVENT);
964 	error = fget(td, fd, &rights, &fp);
965 	if (error != 0)
966 		return (error);
967 
968 	error = kqueue_acquire(fp, &kq);
969 	if (error != 0)
970 		goto done_norel;
971 
972 	nerrors = 0;
973 	if (v1) {
974 		struct kevent64_s *changes;
975 		while (nchanges > 0) {
976 			n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
977 			error = k_ops->k_copyin(k_ops->arg, keva64, n);
978 			if (error)
979 				goto done;
980 			changes = keva64;
981 			for (i = 0; i < n; i++) {
982 				kevp = &changes[i];
983 				if (!kevp->filter)
984 					continue;
985 				kevp->flags &= ~EV_SYSFLAGS;
986 				error = kqueue_register(kq, kevp, td, 1);
987 				if (error || (kevp->flags & EV_RECEIPT)) {
988 					if (nevents != 0) {
989 						kevp->flags = EV_ERROR;
990 						kevp->data = error;
991 						(void) k_ops->k_copyout(k_ops->arg,
992 												kevp, 1);
993 						nevents--;
994 						nerrors++;
995 					} else {
996 						goto done;
997 					}
998 				}
999 			}
1000 			nchanges -= n;
1001 		}
1002 		goto check_errors;
1003 	}
1004 	kevtmp.ext[0] = 0;
1005 	kevtmp.ext[1] = 0;
1006 	while (nchanges > 0) {
1007 		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
1008 		error = k_ops->k_copyin(k_ops->arg, keva, n);
1009 		if (error)
1010 			goto done;
1011 		changes = keva;
1012 		for (i = 0; i < n; i++) {
1013 			kevp = (struct kevent64_s *)&changes[i];
1014 			if (!kevp->filter)
1015 				continue;
1016 			EV_SET64(&kevtmp, kevp->ident, kevp->filter, kevp->flags, kevp->fflags, kevp->data, kevp->udata, 0, 0);
1017 			kevp = &kevtmp;
1018 			kevp->flags &= ~EV_SYSFLAGS;
1019 			error = kqueue_register(kq, kevp, td, 1);
1020 			if (error || (kevp->flags & EV_RECEIPT)) {
1021 				if (nevents != 0) {
1022 					kevp->flags = EV_ERROR;
1023 					kevp->data = error;
1024 					(void) k_ops->k_copyout(k_ops->arg,
1025 					    kevp, 1);
1026 					nevents--;
1027 					nerrors++;
1028 				} else {
1029 					goto done;
1030 				}
1031 			}
1032 		}
1033 		nchanges -= n;
1034 	}
1035 check_errors:
1036 	if (nerrors) {
1037 		td->td_retval[0] = nerrors;
1038 		error = 0;
1039 		goto done;
1040 	}
1041 	if (v1 == 0)
1042 		for (i = 0; i < KQ_NEVENTS; i++) {
1043 			struct kevent *k = &keva[i];
1044 			EV_SET64(&keva64[i], k->ident, k->filter, k->flags, k->fflags, k->data, (uint64_t)k->udata, 0, 0);
1045 		}
1046 	error = kqueue_scan(kq, nevents, k_ops, timeout, keva64, td);
1047 done:
1048 	kqueue_release(kq, 0);
1049 done_norel:
1050 	fdrop(fp, td);
1051 	return (error);
1052 }
1053 
1054 int
kqueue_add_filteropts(int filt,struct filterops * filtops)1055 kqueue_add_filteropts(int filt, struct filterops *filtops)
1056 {
1057 	int error;
1058 
1059 	error = 0;
1060 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
1061 		printf(
1062 "trying to add a filterop that is out of range: %d is beyond %d\n",
1063 		    ~filt, EVFILT_SYSCOUNT);
1064 		return EINVAL;
1065 	}
1066 	mtx_lock(&filterops_lock);
1067 	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
1068 	    sysfilt_ops[~filt].for_fop != NULL)
1069 		error = EEXIST;
1070 	else {
1071 		sysfilt_ops[~filt].for_fop = filtops;
1072 		sysfilt_ops[~filt].for_refcnt = 0;
1073 	}
1074 	mtx_unlock(&filterops_lock);
1075 
1076 	return (error);
1077 }
1078 
1079 int
kqueue_del_filteropts(int filt)1080 kqueue_del_filteropts(int filt)
1081 {
1082 	int error;
1083 
1084 	error = 0;
1085 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1086 		return EINVAL;
1087 
1088 	mtx_lock(&filterops_lock);
1089 	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
1090 	    sysfilt_ops[~filt].for_fop == NULL)
1091 		error = EINVAL;
1092 	else if (sysfilt_ops[~filt].for_refcnt != 0)
1093 		error = EBUSY;
1094 	else {
1095 		sysfilt_ops[~filt].for_fop = &null_filtops;
1096 		sysfilt_ops[~filt].for_refcnt = 0;
1097 	}
1098 	mtx_unlock(&filterops_lock);
1099 
1100 	return error;
1101 }
1102 
1103 static struct filterops *
kqueue_fo_find(int filt)1104 kqueue_fo_find(int filt)
1105 {
1106 
1107 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1108 		return NULL;
1109 
1110 	mtx_lock(&filterops_lock);
1111 	sysfilt_ops[~filt].for_refcnt++;
1112 	if (sysfilt_ops[~filt].for_fop == NULL)
1113 		sysfilt_ops[~filt].for_fop = &null_filtops;
1114 	mtx_unlock(&filterops_lock);
1115 
1116 	return sysfilt_ops[~filt].for_fop;
1117 }
1118 
1119 static void
kqueue_fo_release(int filt)1120 kqueue_fo_release(int filt)
1121 {
1122 
1123 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
1124 		return;
1125 
1126 	mtx_lock(&filterops_lock);
1127 	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
1128 	    ("filter object refcount not valid on release"));
1129 	sysfilt_ops[~filt].for_refcnt--;
1130 	mtx_unlock(&filterops_lock);
1131 }
1132 
1133 /*
1134  * A ref to kq (obtained via kqueue_acquire) must be held.  waitok will
1135  * influence if memory allocation should wait.  Make sure it is 0 if you
1136  * hold any mutexes.
1137  */
1138 static int
kqueue_register(struct kqueue * kq,struct kevent64_s * kev,struct thread * td,int waitok)1139 kqueue_register(struct kqueue *kq, struct kevent64_s *kev, struct thread *td, int waitok)
1140 {
1141 	struct filterops *fops;
1142 	struct file *fp;
1143 	struct knote *kn, *tkn;
1144 	cap_rights_t rights;
1145 	int error, filt, event;
1146 	int haskqglobal, filedesc_unlock;
1147 
1148 	fp = NULL;
1149 	kn = NULL;
1150 	error = 0;
1151 	haskqglobal = 0;
1152 	filedesc_unlock = 0;
1153 
1154 	filt = kev->filter;
1155 	fops = kqueue_fo_find(filt);
1156 	if (fops == NULL)
1157 		return EINVAL;
1158 
1159 	tkn = knote_alloc(waitok);		/* prevent waiting with locks */
1160 
1161 findkn:
1162 	if (fops->f_isfd) {
1163 		KASSERT(td != NULL, ("td is NULL"));
1164 		error = fget(td, kev->ident,
1165 		    cap_rights_init(&rights, CAP_EVENT), &fp);
1166 		if (error)
1167 			goto done;
1168 
1169 		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
1170 		    kev->ident, 0) != 0) {
1171 			/* try again */
1172 			fdrop(fp, td);
1173 			fp = NULL;
1174 			error = kqueue_expand(kq, fops, kev->ident, waitok);
1175 			if (error)
1176 				goto done;
1177 			goto findkn;
1178 		}
1179 
1180 		if (fp->f_type == DTYPE_KQUEUE) {
1181 			/*
1182 			 * if we add some inteligence about what we are doing,
1183 			 * we should be able to support events on ourselves.
1184 			 * We need to know when we are doing this to prevent
1185 			 * getting both the knlist lock and the kq lock since
1186 			 * they are the same thing.
1187 			 */
1188 			if (fp->f_data == kq) {
1189 				error = EINVAL;
1190 				goto done;
1191 			}
1192 
1193 			/*
1194 			 * Pre-lock the filedesc before the global
1195 			 * lock mutex, see the comment in
1196 			 * kqueue_close().
1197 			 */
1198 			FILEDESC_XLOCK(td->td_proc->p_fd);
1199 			filedesc_unlock = 1;
1200 			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1201 		}
1202 
1203 		KQ_LOCK(kq);
1204 		if (kev->ident < kq->kq_knlistsize) {
1205 			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
1206 				if (kev->filter == kn->kn_filter)
1207 					break;
1208 		}
1209 	} else {
1210 		if ((kev->flags & EV_ADD) == EV_ADD)
1211 			kqueue_expand(kq, fops, kev->ident, waitok);
1212 
1213 		KQ_LOCK(kq);
1214 		if (kq->kq_knhashmask != 0) {
1215 			struct klist *list;
1216 
1217 			list = &kq->kq_knhash[
1218 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
1219 			SLIST_FOREACH(kn, list, kn_link)
1220 				if (kev->ident == kn->kn_id &&
1221 				    kev->filter == kn->kn_filter)
1222 					break;
1223 		}
1224 	}
1225 
1226 	/* knote is in the process of changing, wait for it to stablize. */
1227 	if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1228 		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1229 		if (filedesc_unlock) {
1230 			FILEDESC_XUNLOCK(td->td_proc->p_fd);
1231 			filedesc_unlock = 0;
1232 		}
1233 		kq->kq_state |= KQ_FLUXWAIT;
1234 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
1235 		if (fp != NULL) {
1236 			fdrop(fp, td);
1237 			fp = NULL;
1238 		}
1239 		goto findkn;
1240 	}
1241 
1242 	/*
1243 	 * kn now contains the matching knote, or NULL if no match
1244 	 */
1245 	if (kn == NULL) {
1246 		if (kev->flags & EV_ADD) {
1247 			kn = tkn;
1248 			tkn = NULL;
1249 			if (kn == NULL) {
1250 				KQ_UNLOCK(kq);
1251 				error = ENOMEM;
1252 				goto done;
1253 			}
1254 			kn->kn_fp = fp;
1255 			kn->kn_kq = kq;
1256 			kn->kn_fop = fops;
1257 			/*
1258 			 * apply reference counts to knote structure, and
1259 			 * do not release it at the end of this routine.
1260 			 */
1261 			fops = NULL;
1262 			fp = NULL;
1263 
1264 			kn->kn_sfflags = kev->fflags;
1265 			kn->kn_sdata = kev->data;
1266 			kev->fflags = 0;
1267 			kev->data = 0;
1268 			kn->kn_kevent = *kev;
1269 			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
1270 			    EV_ENABLE | EV_DISABLE);
1271 			kn->kn_status = KN_INFLUX|KN_DETACHED;
1272 
1273 			error = knote_attach(kn, kq);
1274 			KQ_UNLOCK(kq);
1275 			if (error != 0) {
1276 				tkn = kn;
1277 				goto done;
1278 			}
1279 
1280 			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
1281 				knote_drop(kn, td);
1282 				goto done;
1283 			}
1284 			KN_LIST_LOCK(kn);
1285 			goto done_ev_add;
1286 		} else {
1287 			/* No matching knote and the EV_ADD flag is not set. */
1288 			KQ_UNLOCK(kq);
1289 			error = ENOENT;
1290 			goto done;
1291 		}
1292 	}
1293 
1294 	if (kev->flags & EV_DELETE) {
1295 		kn->kn_status |= KN_INFLUX;
1296 		KQ_UNLOCK(kq);
1297 		if (!(kn->kn_status & KN_DETACHED))
1298 			kn->kn_fop->f_detach(kn);
1299 		knote_drop(kn, td);
1300 		goto done;
1301 	}
1302 
1303 	/*
1304 	 * The user may change some filter values after the initial EV_ADD,
1305 	 * but doing so will not reset any filter which has already been
1306 	 * triggered.
1307 	 */
1308 	kn->kn_status |= KN_INFLUX | KN_SCAN;
1309 	KQ_UNLOCK(kq);
1310 	KN_LIST_LOCK(kn);
1311 	kn->kn_kevent.udata = kev->udata;
1312 	if (!fops->f_isfd && fops->f_touch != NULL) {
1313 		fops->f_touch(kn, kev, EVENT_REGISTER);
1314 	} else {
1315 		kn->kn_sfflags = kev->fflags;
1316 		kn->kn_sdata = kev->data;
1317 		kn->kn_kevent.ext[0] = kev->ext[0];
1318 		kn->kn_kevent.ext[1] = kev->ext[1];
1319 	}
1320 
1321 	/*
1322 	 * We can get here with kn->kn_knlist == NULL.  This can happen when
1323 	 * the initial attach event decides that the event is "completed"
1324 	 * already.  i.e. filt_procattach is called on a zombie process.  It
1325 	 * will call filt_proc which will remove it from the list, and NULL
1326 	 * kn_knlist.
1327 	 */
1328 done_ev_add:
1329 	if ((kev->flags & EV_DISABLE) &&
1330 	    ((kn->kn_status & KN_DISABLED) == 0)) {
1331 		kn->kn_status |= KN_DISABLED;
1332 	}
1333 
1334 	if ((kn->kn_status & KN_DISABLED) == 0 || (kev->flags & EV_ENABLE))
1335 		event = kn->kn_fop->f_event(kn, 0);
1336 	else
1337 		event = 0;
1338 	KQ_LOCK(kq);
1339 	if (event)
1340 		KNOTE_ACTIVATE(kn, 1);
1341 	kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
1342 	KN_LIST_UNLOCK(kn);
1343 #ifdef KN_DEBUG
1344 #define IS_KN_DISABLED(kn) (!!(kn->kn_status & KN_DISABLED))
1345 #define IS_KN_QUEUED(kn) (!!(kn->kn_status & KN_QUEUED))
1346 #define IS_KN_ACTIVE(kn) (!!(kn->kn_status & KN_ACTIVE))
1347 	if ((kev->flags & EV_ENABLE)) {
1348 		printf("KN_DISABLED=%d KN_ACTIVE=%d KN_QUEUED=%d\n",
1349 			   IS_KN_DISABLED(kn), IS_KN_ACTIVE(kn), IS_KN_QUEUED(kn));
1350 	}
1351 #endif
1352 	if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
1353 		kn->kn_status &= ~KN_DISABLED;
1354 		if ((kn->kn_status & KN_ACTIVE) &&
1355 		    ((kn->kn_status & KN_QUEUED) == 0))
1356 			knote_enqueue(kn);
1357 	}
1358 	KQ_UNLOCK_FLUX(kq);
1359 
1360 done:
1361 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1362 	if (filedesc_unlock)
1363 		FILEDESC_XUNLOCK(td->td_proc->p_fd);
1364 	if (fp != NULL)
1365 		fdrop(fp, td);
1366 	if (tkn != NULL)
1367 		knote_free(tkn);
1368 	if (fops != NULL)
1369 		kqueue_fo_release(filt);
1370 	return (error);
1371 }
1372 
1373 static int
kqueue_acquire(struct file * fp,struct kqueue ** kqp)1374 kqueue_acquire(struct file *fp, struct kqueue **kqp)
1375 {
1376 	int error;
1377 	struct kqueue *kq;
1378 
1379 	error = 0;
1380 
1381 	kq = fp->f_data;
1382 	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
1383 		return (EBADF);
1384 	*kqp = kq;
1385 	KQ_LOCK(kq);
1386 	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
1387 		KQ_UNLOCK(kq);
1388 		return (EBADF);
1389 	}
1390 	kq->kq_refcnt++;
1391 	KQ_UNLOCK(kq);
1392 
1393 	return error;
1394 }
1395 
1396 static void
kqueue_release(struct kqueue * kq,int locked)1397 kqueue_release(struct kqueue *kq, int locked)
1398 {
1399 	if (locked)
1400 		KQ_OWNED(kq);
1401 	else
1402 		KQ_LOCK(kq);
1403 	kq->kq_refcnt--;
1404 	if (kq->kq_refcnt == 1)
1405 		wakeup(&kq->kq_refcnt);
1406 	if (!locked)
1407 		KQ_UNLOCK(kq);
1408 }
1409 
1410 static void
kqueue_schedtask(struct kqueue * kq)1411 kqueue_schedtask(struct kqueue *kq)
1412 {
1413 
1414 	KQ_OWNED(kq);
1415 	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
1416 	    ("scheduling kqueue task while draining"));
1417 
1418 	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
1419 		taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
1420 		kq->kq_state |= KQ_TASKSCHED;
1421 	}
1422 }
1423 
1424 /*
1425  * Expand the kq to make sure we have storage for fops/ident pair.
1426  *
1427  * Return 0 on success (or no work necessary), return errno on failure.
1428  *
1429  * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
1430  * If kqueue_register is called from a non-fd context, there usually/should
1431  * be no locks held.
1432  */
1433 static int
kqueue_expand(struct kqueue * kq,struct filterops * fops,uintptr_t ident,int waitok)1434 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
1435 	int waitok)
1436 {
1437 	struct klist *list, *tmp_knhash, *to_free;
1438 	u_long tmp_knhashmask;
1439 	int size;
1440 	int fd;
1441 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
1442 
1443 	KQ_NOTOWNED(kq);
1444 
1445 	to_free = NULL;
1446 	if (fops->f_isfd) {
1447 		fd = ident;
1448 		if (kq->kq_knlistsize <= fd) {
1449 			size = kq->kq_knlistsize;
1450 			while (size <= fd)
1451 				size += KQEXTENT;
1452 			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
1453 			if (list == NULL)
1454 				return ENOMEM;
1455 			KQ_LOCK(kq);
1456 			if (kq->kq_knlistsize > fd) {
1457 				to_free = list;
1458 				list = NULL;
1459 			} else {
1460 				if (kq->kq_knlist != NULL) {
1461 					bcopy(kq->kq_knlist, list,
1462 					    kq->kq_knlistsize * sizeof(*list));
1463 					to_free = kq->kq_knlist;
1464 					kq->kq_knlist = NULL;
1465 				}
1466 				bzero((caddr_t)list +
1467 				    kq->kq_knlistsize * sizeof(*list),
1468 				    (size - kq->kq_knlistsize) * sizeof(*list));
1469 				kq->kq_knlistsize = size;
1470 				kq->kq_knlist = list;
1471 			}
1472 			KQ_UNLOCK(kq);
1473 		}
1474 	} else {
1475 		if (kq->kq_knhashmask == 0) {
1476 			tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1477 			    &tmp_knhashmask);
1478 			if (tmp_knhash == NULL)
1479 				return ENOMEM;
1480 			KQ_LOCK(kq);
1481 			if (kq->kq_knhashmask == 0) {
1482 				kq->kq_knhash = tmp_knhash;
1483 				kq->kq_knhashmask = tmp_knhashmask;
1484 			} else {
1485 				to_free = tmp_knhash;
1486 			}
1487 			KQ_UNLOCK(kq);
1488 		}
1489 	}
1490 	free(to_free, M_KQUEUE);
1491 
1492 	KQ_NOTOWNED(kq);
1493 	return 0;
1494 }
1495 
1496 static void
kqueue_task(void * arg,int pending)1497 kqueue_task(void *arg, int pending)
1498 {
1499 	struct kqueue *kq;
1500 	int haskqglobal;
1501 
1502 	haskqglobal = 0;
1503 	kq = arg;
1504 
1505 	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1506 	KQ_LOCK(kq);
1507 
1508 	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1509 
1510 	kq->kq_state &= ~KQ_TASKSCHED;
1511 	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1512 		wakeup(&kq->kq_state);
1513 	}
1514 	KQ_UNLOCK(kq);
1515 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1516 }
1517 
1518 /*
1519  * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
1520  * We treat KN_MARKER knotes as if they are INFLUX.
1521  */
1522 static int
kqueue_scan(struct kqueue * kq,int maxevents,struct kevent_copyops * k_ops,const struct timespec * tsp,struct kevent64_s * keva,struct thread * td)1523 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
1524     const struct timespec *tsp, struct kevent64_s *keva, struct thread *td)
1525 {
1526 	struct kevent64_s *kevp;
1527 	struct knote *kn, *marker;
1528 	sbintime_t asbt, rsbt;
1529 	int count, error, haskqglobal, influx, nkev, touch;
1530 
1531 	count = maxevents;
1532 	nkev = 0;
1533 	error = 0;
1534 	haskqglobal = 0;
1535 
1536 	if (maxevents == 0)
1537 		goto done_nl;
1538 
1539 	rsbt = 0;
1540 	if (tsp != NULL) {
1541 		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
1542 		    tsp->tv_nsec >= 1000000000) {
1543 			error = EINVAL;
1544 			goto done_nl;
1545 		}
1546 		if (timespecisset(tsp)) {
1547 			if (tsp->tv_sec <= INT32_MAX) {
1548 				rsbt = tstosbt(*tsp);
1549 				if (TIMESEL(&asbt, rsbt))
1550 					asbt += tc_tick_sbt;
1551 				if (asbt <= INT64_MAX - rsbt)
1552 					asbt += rsbt;
1553 				else
1554 					asbt = 0;
1555 				rsbt >>= tc_precexp;
1556 			} else
1557 				asbt = 0;
1558 		} else
1559 			asbt = -1;
1560 	} else
1561 		asbt = 0;
1562 	marker = knote_alloc(1);
1563 	if (marker == NULL) {
1564 		error = ENOMEM;
1565 		goto done_nl;
1566 	}
1567 	marker->kn_status = KN_MARKER;
1568 	KQ_LOCK(kq);
1569 
1570 retry:
1571 	kevp = keva;
1572 	if (kq->kq_count == 0) {
1573 		if (asbt == -1) {
1574 			error = EWOULDBLOCK;
1575 		} else {
1576 			kq->kq_state |= KQ_SLEEP;
1577 			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
1578 			    "kqread", asbt, rsbt, C_ABSOLUTE);
1579 		}
1580 		if (error == 0)
1581 			goto retry;
1582 		/* don't restart after signals... */
1583 		if (error == ERESTART)
1584 			error = EINTR;
1585 		else if (error == EWOULDBLOCK)
1586 			error = 0;
1587 		goto done;
1588 	}
1589 
1590 	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
1591 	influx = 0;
1592 	while (count) {
1593 		KQ_OWNED(kq);
1594 		kn = TAILQ_FIRST(&kq->kq_head);
1595 
1596 		if ((kn->kn_status == KN_MARKER && kn != marker) ||
1597 		    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1598 			if (influx) {
1599 				influx = 0;
1600 				KQ_FLUX_WAKEUP(kq);
1601 			}
1602 			kq->kq_state |= KQ_FLUXWAIT;
1603 			error = msleep(kq, &kq->kq_lock, PSOCK,
1604 			    "kqflxwt", 0);
1605 			continue;
1606 		}
1607 
1608 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
1609 		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
1610 			kn->kn_status &= ~KN_QUEUED;
1611 			kq->kq_count--;
1612 			continue;
1613 		}
1614 		if (kn == marker) {
1615 			KQ_FLUX_WAKEUP(kq);
1616 			if (count == maxevents)
1617 				goto retry;
1618 			goto done;
1619 		}
1620 		KASSERT((kn->kn_status & KN_INFLUX) == 0,
1621 		    ("KN_INFLUX set when not suppose to be"));
1622 
1623 		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
1624 			kn->kn_status &= ~KN_QUEUED;
1625 			kn->kn_status |= KN_INFLUX;
1626 			kq->kq_count--;
1627 			KQ_UNLOCK(kq);
1628 			/*
1629 			 * We don't need to lock the list since we've marked
1630 			 * it _INFLUX.
1631 			 */
1632 			if (!(kn->kn_status & KN_DETACHED))
1633 				kn->kn_fop->f_detach(kn);
1634 			knote_drop(kn, td);
1635 			KQ_LOCK(kq);
1636 			continue;
1637 		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
1638 			kn->kn_status &= ~KN_QUEUED;
1639 			kn->kn_status |= KN_INFLUX;
1640 			kq->kq_count--;
1641 			KQ_UNLOCK(kq);
1642 			/*
1643 			 * We don't need to lock the list since we've marked
1644 			 * it _INFLUX.
1645 			 */
1646 			*kevp = kn->kn_kevent;
1647 			if (!(kn->kn_status & KN_DETACHED))
1648 				kn->kn_fop->f_detach(kn);
1649 			knote_drop(kn, td);
1650 			KQ_LOCK(kq);
1651 			kn = NULL;
1652 		} else {
1653 			kn->kn_status |= KN_INFLUX | KN_SCAN;
1654 			KQ_UNLOCK(kq);
1655 			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
1656 				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1657 			KN_LIST_LOCK(kn);
1658 			if (kn->kn_fop->f_event(kn, 0) == 0) {
1659 				KQ_LOCK(kq);
1660 				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1661 				kn->kn_status &=
1662 				    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
1663 				    KN_SCAN);
1664 				kq->kq_count--;
1665 				KN_LIST_UNLOCK(kn);
1666 				influx = 1;
1667 				continue;
1668 			}
1669 			touch = (!kn->kn_fop->f_isfd &&
1670 			    kn->kn_fop->f_touch != NULL);
1671 			if (touch)
1672 				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
1673 			else
1674 				*kevp = kn->kn_kevent;
1675 			KQ_LOCK(kq);
1676 			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1677 			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
1678 				/*
1679 				 * Manually clear knotes who weren't
1680 				 * 'touch'ed.
1681 				 */
1682 				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
1683 					kn->kn_data = 0;
1684 					kn->kn_fflags = 0;
1685 				}
1686 				if (kn->kn_flags & EV_DISPATCH)
1687 					kn->kn_status |= KN_DISABLED;
1688 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
1689 				kq->kq_count--;
1690 			} else
1691 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
1692 
1693 			kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
1694 			KN_LIST_UNLOCK(kn);
1695 			influx = 1;
1696 		}
1697 
1698 		/* we are returning a copy to the user */
1699 		kevp++;
1700 		nkev++;
1701 		count--;
1702 
1703 		if (nkev == KQ_NEVENTS) {
1704 			influx = 0;
1705 			KQ_UNLOCK_FLUX(kq);
1706 			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1707 			nkev = 0;
1708 			kevp = keva;
1709 			KQ_LOCK(kq);
1710 			if (error)
1711 				break;
1712 		}
1713 	}
1714 	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
1715 done:
1716 	KQ_OWNED(kq);
1717 	KQ_UNLOCK_FLUX(kq);
1718 	knote_free(marker);
1719 done_nl:
1720 	KQ_NOTOWNED(kq);
1721 	if (nkev != 0)
1722 		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
1723 	td->td_retval[0] = maxevents - count;
1724 	return (error);
1725 }
1726 
/*
 * read(2) is not supported on a kqueue descriptor: always fails with
 * ENXIO.
 *
 * XXX
 * This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (ENXIO);
}
1738 
/*
 * write(2) is not supported on a kqueue descriptor: always fails with
 * ENXIO.
 */
/*ARGSUSED*/
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (ENXIO);
}
1746 
/*
 * A kqueue descriptor cannot be truncated: always fails with EINVAL,
 * whatever length is requested.
 */
/*ARGSUSED*/
static int
kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}
1755 
1756 /*ARGSUSED*/
1757 static int
kqueue_ioctl(struct file * fp,u_long cmd,void * data,struct ucred * active_cred,struct thread * td)1758 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
1759 	struct ucred *active_cred, struct thread *td)
1760 {
1761 	/*
1762 	 * Enabling sigio causes two major problems:
1763 	 * 1) infinite recursion:
1764 	 * Synopsys: kevent is being used to track signals and have FIOASYNC
1765 	 * set.  On receipt of a signal this will cause a kqueue to recurse
1766 	 * into itself over and over.  Sending the sigio causes the kqueue
1767 	 * to become ready, which in turn posts sigio again, forever.
1768 	 * Solution: this can be solved by setting a flag in the kqueue that
1769 	 * we have a SIGIO in progress.
1770 	 * 2) locking problems:
1771 	 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
1772 	 * us above the proc and pgrp locks.
1773 	 * Solution: Post a signal using an async mechanism, being sure to
1774 	 * record a generation count in the delivery so that we do not deliver
1775 	 * a signal to the wrong process.
1776 	 *
1777 	 * Note, these two mechanisms are somewhat mutually exclusive!
1778 	 */
1779 #if 0
1780 	struct kqueue *kq;
1781 
1782 	kq = fp->f_data;
1783 	switch (cmd) {
1784 	case FIOASYNC:
1785 		if (*(int *)data) {
1786 			kq->kq_state |= KQ_ASYNC;
1787 		} else {
1788 			kq->kq_state &= ~KQ_ASYNC;
1789 		}
1790 		return (0);
1791 
1792 	case FIOSETOWN:
1793 		return (fsetown(*(int *)data, &kq->kq_sigio));
1794 
1795 	case FIOGETOWN:
1796 		*(int *)data = fgetown(&kq->kq_sigio);
1797 		return (0);
1798 	}
1799 #endif
1800 
1801 	return (ENOTTY);
1802 }
1803 
1804 /*ARGSUSED*/
1805 static int
kqueue_poll(struct file * fp,int events,struct ucred * active_cred,struct thread * td)1806 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
1807 	struct thread *td)
1808 {
1809 	struct kqueue *kq;
1810 	int revents = 0;
1811 	int error;
1812 
1813 	if ((error = kqueue_acquire(fp, &kq)))
1814 		return POLLERR;
1815 
1816 	KQ_LOCK(kq);
1817 	if (events & (POLLIN | POLLRDNORM)) {
1818 		if (kq->kq_count) {
1819 			revents |= events & (POLLIN | POLLRDNORM);
1820 		} else {
1821 			selrecord(td, &kq->kq_sel);
1822 			if (SEL_WAITING(&kq->kq_sel))
1823 				kq->kq_state |= KQ_SEL;
1824 		}
1825 	}
1826 	kqueue_release(kq, 1);
1827 	KQ_UNLOCK(kq);
1828 	return (revents);
1829 }
1830 
1831 /*ARGSUSED*/
1832 static int
kqueue_stat(struct file * fp,struct stat * st,struct ucred * active_cred,struct thread * td)1833 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
1834 	struct thread *td)
1835 {
1836 
1837 	bzero((void *)st, sizeof *st);
1838 	/*
1839 	 * We no longer return kq_count because the unlocked value is useless.
1840 	 * If you spent all this time getting the count, why not spend your
1841 	 * syscall better by calling kevent?
1842 	 *
1843 	 * XXX - This is needed for libc_r.
1844 	 */
1845 	st->st_mode = S_IFIFO;
1846 	return (0);
1847 }
1848 
1849 /*ARGSUSED*/
1850 static int
kqueue_close(struct file * fp,struct thread * td)1851 kqueue_close(struct file *fp, struct thread *td)
1852 {
1853 	struct kqueue *kq = fp->f_data;
1854 	struct filedesc *fdp;
1855 	struct knote *kn;
1856 	int i;
1857 	int error;
1858 	int filedesc_unlock;
1859 
1860 	if ((error = kqueue_acquire(fp, &kq)))
1861 		return error;
1862 
1863 	filedesc_unlock = 0;
1864 	KQ_LOCK(kq);
1865 
1866 	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
1867 	    ("kqueue already closing"));
1868 	kq->kq_state |= KQ_CLOSING;
1869 	if (kq->kq_refcnt > 1)
1870 		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
1871 
1872 	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
1873 	fdp = kq->kq_fdp;
1874 
1875 	KASSERT(knlist_empty(&kq->kq_sel.si_note),
1876 	    ("kqueue's knlist not empty"));
1877 
1878 	for (i = 0; i < kq->kq_knlistsize; i++) {
1879 		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
1880 			if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1881 				kq->kq_state |= KQ_FLUXWAIT;
1882 				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
1883 				continue;
1884 			}
1885 			kn->kn_status |= KN_INFLUX;
1886 			KQ_UNLOCK(kq);
1887 			if (!(kn->kn_status & KN_DETACHED))
1888 				kn->kn_fop->f_detach(kn);
1889 			knote_drop(kn, td);
1890 			KQ_LOCK(kq);
1891 		}
1892 	}
1893 	if (kq->kq_knhashmask != 0) {
1894 		for (i = 0; i <= kq->kq_knhashmask; i++) {
1895 			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
1896 				if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
1897 					kq->kq_state |= KQ_FLUXWAIT;
1898 					msleep(kq, &kq->kq_lock, PSOCK,
1899 					       "kqclo2", 0);
1900 					continue;
1901 				}
1902 				kn->kn_status |= KN_INFLUX;
1903 				KQ_UNLOCK(kq);
1904 				if (!(kn->kn_status & KN_DETACHED))
1905 					kn->kn_fop->f_detach(kn);
1906 				knote_drop(kn, td);
1907 				KQ_LOCK(kq);
1908 			}
1909 		}
1910 	}
1911 
1912 	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
1913 		kq->kq_state |= KQ_TASKDRAIN;
1914 		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
1915 	}
1916 
1917 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1918 		selwakeuppri(&kq->kq_sel, PSOCK);
1919 		if (!SEL_WAITING(&kq->kq_sel))
1920 			kq->kq_state &= ~KQ_SEL;
1921 	}
1922 
1923 	KQ_UNLOCK(kq);
1924 
1925 	/*
1926 	 * We could be called due to the knote_drop() doing fdrop(),
1927 	 * called from kqueue_register().  In this case the global
1928 	 * lock is owned, and filedesc sx is locked before, to not
1929 	 * take the sleepable lock after non-sleepable.
1930 	 */
1931 	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
1932 		FILEDESC_XLOCK(fdp);
1933 		filedesc_unlock = 1;
1934 	} else
1935 		filedesc_unlock = 0;
1936 	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
1937 	if (filedesc_unlock)
1938 		FILEDESC_XUNLOCK(fdp);
1939 
1940 	seldrain(&kq->kq_sel);
1941 	knlist_destroy(&kq->kq_sel.si_note);
1942 	mtx_destroy(&kq->kq_lock);
1943 	kq->kq_fdp = NULL;
1944 
1945 	if (kq->kq_knhash != NULL)
1946 		free(kq->kq_knhash, M_KQUEUE);
1947 	if (kq->kq_knlist != NULL)
1948 		free(kq->kq_knlist, M_KQUEUE);
1949 
1950 	funsetown(&kq->kq_sigio);
1951 	free(kq, M_KQUEUE);
1952 	fp->f_data = NULL;
1953 
1954 	return (0);
1955 }
1956 
1957 static void
kqueue_wakeup(struct kqueue * kq)1958 kqueue_wakeup(struct kqueue *kq)
1959 {
1960 	KQ_OWNED(kq);
1961 
1962 	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
1963 		kq->kq_state &= ~KQ_SLEEP;
1964 		wakeup(kq);
1965 	}
1966 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1967 		selwakeuppri(&kq->kq_sel, PSOCK);
1968 		if (!SEL_WAITING(&kq->kq_sel))
1969 			kq->kq_state &= ~KQ_SEL;
1970 	}
1971 	if (!knlist_empty(&kq->kq_sel.si_note))
1972 		kqueue_schedtask(kq);
1973 	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
1974 		pgsigio(&kq->kq_sigio, SIGIO, 0);
1975 	}
1976 }
1977 
1978 /*
1979  * Walk down a list of knotes, activating them if their event has triggered.
1980  *
1981  * There is a possibility to optimize in the case of one kq watching another.
1982  * Instead of scheduling a task to wake it up, you could pass enough state
1983  * down the chain to make up the parent kqueue.  Make this code functional
1984  * first.
1985  */
1986 void
knote(struct knlist * list,long hint,int lockflags)1987 knote(struct knlist *list, long hint, int lockflags)
1988 {
1989 	struct kqueue *kq;
1990 	struct knote *kn;
1991 	int error;
1992 
1993 	if (list == NULL)
1994 		return;
1995 
1996 	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
1997 
1998 	if ((lockflags & KNF_LISTLOCKED) == 0)
1999 		list->kl_lock(list->kl_lockarg);
2000 
2001 	/*
2002 	 * If we unlock the list lock (and set KN_INFLUX), we can eliminate
2003 	 * the kqueue scheduling, but this will introduce four
2004 	 * lock/unlock's for each knote to test.  If we do, continue to use
2005 	 * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
2006 	 * only safe if you want to remove the current item, which we are
2007 	 * not doing.
2008 	 */
2009 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
2010 		kq = kn->kn_kq;
2011 		KQ_LOCK(kq);
2012 		if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
2013 			/*
2014 			 * Do not process the influx notes, except for
2015 			 * the influx coming from the kq unlock in the
2016 			 * kqueue_scan().  In the later case, we do
2017 			 * not interfere with the scan, since the code
2018 			 * fragment in kqueue_scan() locks the knlist,
2019 			 * and cannot proceed until we finished.
2020 			 */
2021 			KQ_UNLOCK(kq);
2022 		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
2023 			kn->kn_status |= KN_INFLUX;
2024 			KQ_UNLOCK(kq);
2025 			error = kn->kn_fop->f_event(kn, hint);
2026 			KQ_LOCK(kq);
2027 			kn->kn_status &= ~KN_INFLUX;
2028 			if (error)
2029 				KNOTE_ACTIVATE(kn, 1);
2030 			KQ_UNLOCK_FLUX(kq);
2031 		} else {
2032 			kn->kn_status |= KN_HASKQLOCK;
2033 			if (kn->kn_fop->f_event(kn, hint))
2034 				KNOTE_ACTIVATE(kn, 1);
2035 			kn->kn_status &= ~KN_HASKQLOCK;
2036 			KQ_UNLOCK(kq);
2037 		}
2038 	}
2039 	if ((lockflags & KNF_LISTLOCKED) == 0)
2040 		list->kl_unlock(list->kl_lockarg);
2041 }
2042 
2043 /*
2044  * add a knote to a knlist
2045  */
2046 void
knlist_add(struct knlist * knl,struct knote * kn,int islocked)2047 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
2048 {
2049 	KNL_ASSERT_LOCK(knl, islocked);
2050 	KQ_NOTOWNED(kn->kn_kq);
2051 	KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
2052 	    (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
2053 	if (!islocked)
2054 		knl->kl_lock(knl->kl_lockarg);
2055 	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
2056 	if (!islocked)
2057 		knl->kl_unlock(knl->kl_lockarg);
2058 	KQ_LOCK(kn->kn_kq);
2059 	kn->kn_knlist = knl;
2060 	kn->kn_status &= ~KN_DETACHED;
2061 	KQ_UNLOCK(kn->kn_kq);
2062 }
2063 
2064 static void
knlist_remove_kq(struct knlist * knl,struct knote * kn,int knlislocked,int kqislocked)2065 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
2066 {
2067 	KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
2068 	KNL_ASSERT_LOCK(knl, knlislocked);
2069 	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
2070 	if (!kqislocked)
2071 		KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
2072     ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
2073 	if (!knlislocked)
2074 		knl->kl_lock(knl->kl_lockarg);
2075 	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
2076 	kn->kn_knlist = NULL;
2077 	if (!knlislocked)
2078 		knl->kl_unlock(knl->kl_lockarg);
2079 	if (!kqislocked)
2080 		KQ_LOCK(kn->kn_kq);
2081 	kn->kn_status |= KN_DETACHED;
2082 	if (!kqislocked)
2083 		KQ_UNLOCK(kn->kn_kq);
2084 }
2085 
/*
 * Remove a knote from the specified knlist.  islocked says whether the
 * caller holds the knlist lock; the kq lock must not be held.
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{
	const int kqislocked = 0;

	knlist_remove_kq(knl, kn, islocked, kqislocked);
}
2095 
2096 /*
2097  * remove knote from the specified knlist while in f_event handler.
2098  */
2099 void
knlist_remove_inevent(struct knlist * knl,struct knote * kn)2100 knlist_remove_inevent(struct knlist *knl, struct knote *kn)
2101 {
2102 
2103 	knlist_remove_kq(knl, kn, 1,
2104 	    (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
2105 }
2106 
2107 int
knlist_empty(struct knlist * knl)2108 knlist_empty(struct knlist *knl)
2109 {
2110 
2111 	KNL_ASSERT_LOCKED(knl);
2112 	return SLIST_EMPTY(&knl->kl_list);
2113 }
2114 
2115 static struct mtx	knlist_lock;
2116 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
2117 	MTX_DEF);
2118 static void knlist_mtx_lock(void *arg);
2119 static void knlist_mtx_unlock(void *arg);
2120 
/* Lock hook for mutex-backed knlists: arg is the struct mtx to acquire. */
static void
knlist_mtx_lock(void *arg)
{
	struct mtx *m = arg;

	mtx_lock(m);
}
2127 
/* Unlock hook for mutex-backed knlists: arg is the struct mtx to release. */
static void
knlist_mtx_unlock(void *arg)
{
	struct mtx *m = arg;

	mtx_unlock(m);
}
2134 
2135 static void
knlist_mtx_assert_locked(void * arg)2136 knlist_mtx_assert_locked(void *arg)
2137 {
2138 
2139 	mtx_assert((struct mtx *)arg, MA_OWNED);
2140 }
2141 
2142 static void
knlist_mtx_assert_unlocked(void * arg)2143 knlist_mtx_assert_unlocked(void *arg)
2144 {
2145 
2146 	mtx_assert((struct mtx *)arg, MA_NOTOWNED);
2147 }
2148 
/* Lock hook for rwlock-backed knlists: read-locks the rwlock in arg. */
static void
knlist_rw_rlock(void *arg)
{
	struct rwlock *rw = arg;

	rw_rlock(rw);
}
2155 
/* Unlock hook for rwlock-backed knlists: drops the read lock on arg. */
static void
knlist_rw_runlock(void *arg)
{
	struct rwlock *rw = arg;

	rw_runlock(rw);
}
2162 
2163 static void
knlist_rw_assert_locked(void * arg)2164 knlist_rw_assert_locked(void *arg)
2165 {
2166 
2167 	rw_assert((struct rwlock *)arg, RA_LOCKED);
2168 }
2169 
2170 static void
knlist_rw_assert_unlocked(void * arg)2171 knlist_rw_assert_unlocked(void *arg)
2172 {
2173 
2174 	rw_assert((struct rwlock *)arg, RA_UNLOCKED);
2175 }
2176 
2177 void
knlist_init(struct knlist * knl,void * lock,void (* kl_lock)(void *),void (* kl_unlock)(void *),void (* kl_assert_locked)(void *),void (* kl_assert_unlocked)(void *))2178 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
2179     void (*kl_unlock)(void *),
2180     void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
2181 {
2182 
2183 	if (lock == NULL)
2184 		knl->kl_lockarg = &knlist_lock;
2185 	else
2186 		knl->kl_lockarg = lock;
2187 
2188 	if (kl_lock == NULL)
2189 		knl->kl_lock = knlist_mtx_lock;
2190 	else
2191 		knl->kl_lock = kl_lock;
2192 	if (kl_unlock == NULL)
2193 		knl->kl_unlock = knlist_mtx_unlock;
2194 	else
2195 		knl->kl_unlock = kl_unlock;
2196 	if (kl_assert_locked == NULL)
2197 		knl->kl_assert_locked = knlist_mtx_assert_locked;
2198 	else
2199 		knl->kl_assert_locked = kl_assert_locked;
2200 	if (kl_assert_unlocked == NULL)
2201 		knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
2202 	else
2203 		knl->kl_assert_unlocked = kl_assert_unlocked;
2204 
2205 	SLIST_INIT(&knl->kl_list);
2206 }
2207 
2208 void
knlist_init_mtx(struct knlist * knl,struct mtx * lock)2209 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
2210 {
2211 
2212 	knlist_init(knl, lock, NULL, NULL, NULL, NULL);
2213 }
2214 
2215 void
knlist_init_rw_reader(struct knlist * knl,struct rwlock * lock)2216 knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
2217 {
2218 
2219 	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
2220 	    knlist_rw_assert_locked, knlist_rw_assert_unlocked);
2221 }
2222 
2223 void
knlist_destroy(struct knlist * knl)2224 knlist_destroy(struct knlist *knl)
2225 {
2226 
2227 #ifdef INVARIANTS
2228 	/*
2229 	 * if we run across this error, we need to find the offending
2230 	 * driver and have it call knlist_clear or knlist_delete.
2231 	 */
2232 	if (!SLIST_EMPTY(&knl->kl_list))
2233 		printf("WARNING: destroying knlist w/ knotes on it!\n");
2234 #endif
2235 
2236 	knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
2237 	SLIST_INIT(&knl->kl_list);
2238 }
2239 
2240 /*
2241  * Even if we are locked, we may need to drop the lock to allow any influx
2242  * knotes time to "settle".
2243  */
2244 void
knlist_cleardel(struct knlist * knl,struct thread * td,int islocked,int killkn)2245 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
2246 {
2247 	struct knote *kn, *kn2;
2248 	struct kqueue *kq;
2249 
2250 	if (islocked)
2251 		KNL_ASSERT_LOCKED(knl);
2252 	else {
2253 		KNL_ASSERT_UNLOCKED(knl);
2254 again:		/* need to reacquire lock since we have dropped it */
2255 		knl->kl_lock(knl->kl_lockarg);
2256 	}
2257 
2258 	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
2259 		kq = kn->kn_kq;
2260 		KQ_LOCK(kq);
2261 		if ((kn->kn_status & KN_INFLUX)) {
2262 			KQ_UNLOCK(kq);
2263 			continue;
2264 		}
2265 		knlist_remove_kq(knl, kn, 1, 1);
2266 		if (killkn) {
2267 			kn->kn_status |= KN_INFLUX | KN_DETACHED;
2268 			KQ_UNLOCK(kq);
2269 			knote_drop(kn, td);
2270 		} else {
2271 			/* Make sure cleared knotes disappear soon */
2272 			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2273 			KQ_UNLOCK(kq);
2274 		}
2275 		kq = NULL;
2276 	}
2277 
2278 	if (!SLIST_EMPTY(&knl->kl_list)) {
2279 		/* there are still KN_INFLUX remaining */
2280 		kn = SLIST_FIRST(&knl->kl_list);
2281 		kq = kn->kn_kq;
2282 		KQ_LOCK(kq);
2283 		KASSERT(kn->kn_status & KN_INFLUX,
2284 		    ("knote removed w/o list lock"));
2285 		knl->kl_unlock(knl->kl_lockarg);
2286 		kq->kq_state |= KQ_FLUXWAIT;
2287 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
2288 		kq = NULL;
2289 		goto again;
2290 	}
2291 
2292 	if (islocked)
2293 		KNL_ASSERT_LOCKED(knl);
2294 	else {
2295 		knl->kl_unlock(knl->kl_lockarg);
2296 		KNL_ASSERT_UNLOCKED(knl);
2297 	}
2298 }
2299 
2300 /*
2301  * Remove all knotes referencing a specified fd must be called with FILEDESC
2302  * lock.  This prevents a race where a new fd comes along and occupies the
2303  * entry and we attach a knote to the fd.
2304  */
2305 void
knote_fdclose(struct thread * td,int fd)2306 knote_fdclose(struct thread *td, int fd)
2307 {
2308 	struct filedesc *fdp = td->td_proc->p_fd;
2309 	struct kqueue *kq;
2310 	struct knote *kn;
2311 	int influx;
2312 
2313 	FILEDESC_XLOCK_ASSERT(fdp);
2314 
2315 	/*
2316 	 * We shouldn't have to worry about new kevents appearing on fd
2317 	 * since filedesc is locked.
2318 	 */
2319 	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
2320 		KQ_LOCK(kq);
2321 
2322 again:
2323 		influx = 0;
2324 		while (kq->kq_knlistsize > fd &&
2325 		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
2326 			if (kn->kn_status & KN_INFLUX) {
2327 				/* someone else might be waiting on our knote */
2328 				if (influx)
2329 					wakeup(kq);
2330 				kq->kq_state |= KQ_FLUXWAIT;
2331 				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
2332 				goto again;
2333 			}
2334 			kn->kn_status |= KN_INFLUX;
2335 			KQ_UNLOCK(kq);
2336 			if (!(kn->kn_status & KN_DETACHED))
2337 				kn->kn_fop->f_detach(kn);
2338 			knote_drop(kn, td);
2339 			influx = 1;
2340 			KQ_LOCK(kq);
2341 		}
2342 		KQ_UNLOCK_FLUX(kq);
2343 	}
2344 }
2345 
2346 static int
knote_attach(struct knote * kn,struct kqueue * kq)2347 knote_attach(struct knote *kn, struct kqueue *kq)
2348 {
2349 	struct klist *list;
2350 
2351 	KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
2352 	KQ_OWNED(kq);
2353 
2354 	if (kn->kn_fop->f_isfd) {
2355 		if (kn->kn_id >= kq->kq_knlistsize)
2356 			return ENOMEM;
2357 		list = &kq->kq_knlist[kn->kn_id];
2358 	} else {
2359 		if (kq->kq_knhash == NULL)
2360 			return ENOMEM;
2361 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2362 	}
2363 
2364 	SLIST_INSERT_HEAD(list, kn, kn_link);
2365 
2366 	return 0;
2367 }
2368 
2369 /*
2370  * knote must already have been detached using the f_detach method.
2371  * no lock need to be held, it is assumed that the KN_INFLUX flag is set
2372  * to prevent other removal.
2373  */
2374 static void
knote_drop(struct knote * kn,struct thread * td)2375 knote_drop(struct knote *kn, struct thread *td)
2376 {
2377 	struct kqueue *kq;
2378 	struct klist *list;
2379 
2380 	kq = kn->kn_kq;
2381 
2382 	KQ_NOTOWNED(kq);
2383 	KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
2384 	    ("knote_drop called without KN_INFLUX set in kn_status"));
2385 
2386 	KQ_LOCK(kq);
2387 	if (kn->kn_fop->f_isfd)
2388 		list = &kq->kq_knlist[kn->kn_id];
2389 	else
2390 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
2391 
2392 	if (!SLIST_EMPTY(list))
2393 		SLIST_REMOVE(list, kn, knote, kn_link);
2394 	if (kn->kn_status & KN_QUEUED)
2395 		knote_dequeue(kn);
2396 	KQ_UNLOCK_FLUX(kq);
2397 
2398 	if (kn->kn_fop->f_isfd) {
2399 		fdrop(kn->kn_fp, td);
2400 		kn->kn_fp = NULL;
2401 	}
2402 	kqueue_fo_release(kn->kn_kevent.filter);
2403 	kn->kn_fop = NULL;
2404 	knote_free(kn);
2405 }
2406 
2407 void
knote_enqueue(struct knote * kn)2408 knote_enqueue(struct knote *kn)
2409 {
2410 	struct kqueue *kq = kn->kn_kq;
2411 
2412 	KQ_OWNED(kn->kn_kq);
2413 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
2414 
2415 	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
2416 	kn->kn_status |= KN_QUEUED;
2417 	kq->kq_count++;
2418 	kqueue_wakeup(kq);
2419 }
2420 
2421 static void
knote_dequeue(struct knote * kn)2422 knote_dequeue(struct knote *kn)
2423 {
2424 	struct kqueue *kq = kn->kn_kq;
2425 
2426 	KQ_OWNED(kn->kn_kq);
2427 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
2428 
2429 	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
2430 	kn->kn_status &= ~KN_QUEUED;
2431 	kq->kq_count--;
2432 }
2433 
2434 static void
knote_init(void)2435 knote_init(void)
2436 {
2437 
2438 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
2439 	    NULL, NULL, UMA_ALIGN_PTR, 0);
2440 }
2441 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
2442 
2443 static struct knote *
knote_alloc(int waitok)2444 knote_alloc(int waitok)
2445 {
2446 	return ((struct knote *)uma_zalloc(knote_zone,
2447 	    (waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
2448 }
2449 
2450 static void
knote_free(struct knote * kn)2451 knote_free(struct knote *kn)
2452 {
2453 	if (kn != NULL)
2454 		uma_zfree(knote_zone, kn);
2455 }
2456 
2457 /*
2458  * Register the kev w/ the kq specified by fd.
2459  */
2460 int
kqfd_register(int fd,struct kevent64_s * kev,struct thread * td,int waitok)2461 kqfd_register(int fd, struct kevent64_s *kev, struct thread *td, int waitok)
2462 {
2463 	struct kqueue *kq;
2464 	struct file *fp;
2465 	cap_rights_t rights;
2466 	int error;
2467 
2468 	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
2469 	if (error != 0)
2470 		return (error);
2471 	if ((error = kqueue_acquire(fp, &kq)) != 0)
2472 		goto noacquire;
2473 
2474 	error = kqueue_register(kq, kev, td, waitok);
2475 
2476 	kqueue_release(kq, 0);
2477 
2478 noacquire:
2479 	fdrop(fp, td);
2480 
2481 	return error;
2482 }
2483