1 /*        $NetBSD: linux_inotify.c,v 1.7 2024/10/01 16:41:29 riastradh Exp $    */
2 
3 /*-
4  * Copyright (c) 2023 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Theodore Preduta.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 #include <sys/cdefs.h>
32 __KERNEL_RCSID(0, "$NetBSD: linux_inotify.c,v 1.7 2024/10/01 16:41:29 riastradh Exp $");
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/bitops.h>
37 #include <sys/dirent.h>
38 #include <sys/event.h>
39 #include <sys/eventvar.h>
40 #include <sys/errno.h>
41 #include <sys/file.h>
42 #include <sys/filedesc.h>
43 #include <sys/fcntl.h>
44 #include <sys/poll.h>
45 #include <sys/proc.h>
46 #include <sys/selinfo.h>
47 #include <sys/select.h>
48 #include <sys/signal.h>
49 #include <sys/vnode.h>
50 
51 #include <sys/syscallargs.h>
52 
53 #include <compat/linux/common/linux_machdep.h>
54 #include <compat/linux/common/linux_fcntl.h>
55 #include <compat/linux/common/linux_inotify.h>
56 #include <compat/linux/common/linux_ipc.h>
57 #include <compat/linux/common/linux_sched.h>
58 #include <compat/linux/common/linux_sem.h>
59 #include <compat/linux/common/linux_signal.h>
60 
61 #include <compat/linux/linux_syscallargs.h>
62 
63 /*
64  * inotify(2).  This interface allows the user to get file system
65  * events and (unlike kqueue(2)) their order is strictly preserved.
66  * While nice, the API has sufficient gotchas that mean we don't want
67  * to add native entry points for it.  They are:
68  *
69  * - Because data is returned via read(2), this API is prone to
70  *   unaligned memory accesses.  There is a note in the Linux man page
71  *   that says the name field of struct linux_inotify_event *can* be
72  *   used for alignment purposes.  In practice, even Linux doesn't
73  *   always do this, so for simplicity, we don't ever do this.
74  */
75 
/*
 * Limits used by the inotify emulation.
 *
 * LINUX_INOTIFY_MAX_QUEUED: not referenced in this part of the file;
 * presumably the cap on the number of pending events (ifd_qcount) --
 * NOTE(review): confirm against the code that enqueues events.
 *
 * LINUX_INOTIFY_MAX_FROM_KEVENT: upper bound on the number of
 * inotify_entry records that may be produced into one output buffer;
 * do_kevent_to_inotify() asserts that the running count stays below it.
 */
#define   LINUX_INOTIFY_MAX_QUEUED      16384
#define   LINUX_INOTIFY_MAX_FROM_KEVENT 3

/*
 * Debug printf.  The argument is a complete, parenthesized uprintf
 * argument list, i.e. DPRINTF(("fmt", args...)).  Expands to nothing
 * unless DEBUG_LINUX is set.
 */
#if DEBUG_LINUX
#define   DPRINTF(x) uprintf x
#else
#define   DPRINTF(x) __nothing
#endif
84 
/*
 * One inotify event, as kept on an inotifyfd's queue (ifd_qhead).
 * Filled in by do_kevent_to_inotify().
 */
struct inotify_entry {
          /* linkage on the TAILQ of inotify_entry records */
          TAILQ_ENTRY(inotify_entry)    ie_entries;
          /* NUL-terminated name belonging to the event, if any */
          char                                    ie_name[NAME_MAX + 1];
          /* wd/mask/cookie/len; len is strlen(ie_name) + 1 when a name is set */
          struct linux_inotify_event    ie_event;
};
90 
/*
 * Variable-length record stored per watch descriptor in ifd_wds[].
 * ide_count is the number of elements in the trailing ide_entries[]
 * array; it is what INOTIFY_DIR_ENTRIES_SIZE() is fed when the record
 * is freed.
 */
struct inotify_dir_entries {
	size_t	ide_count;
	struct inotify_dir_entry {
		char	name[NAME_MAX + 1];
		ino_t	fileno;
	} ide_entries[];
};

/*
 * Number of bytes occupied by a struct inotify_dir_entries holding
 * `count' entries.  The argument is parenthesized so that expressions
 * such as INOTIFY_DIR_ENTRIES_SIZE(a + b) are sized correctly.
 */
#define	INOTIFY_DIR_ENTRIES_SIZE(count)	(sizeof(struct inotify_dir_entries) \
    + (count) * sizeof(struct inotify_dir_entry))
100 
/*
 * Per-instance state of an inotify descriptor.  Allocated (zeroed) by
 * do_inotify_init() and hung off the file's f_data.  Two locks are
 * used: ifd_lock covers the watch-descriptor table, ifd_qlock covers
 * the pending-event queue.
 */
struct inotifyfd {
          int                 ifd_kqfd; /* kqueue fd used by this inotify */
                                                  /* instance */
          struct selinfo      ifd_sel;  /* for EVFILT_READ by epoll */
          kmutex_t  ifd_lock; /* lock for ifd_sel, ifd_wds and */
                                                  /* ifd_nwds */

          struct inotify_dir_entries **ifd_wds;
                                                  /* keeps track of watch descriptors */
                                                  /* for directories: snapshot of the */
                                                  /* directory state */
                                                  /* for files: an inotify_dir_entries */
                                                  /* with ide_count == 0 */
          size_t              ifd_nwds; /* max watch descriptor that can be */
                                                  /* stored in ifd_wds + 1 */

        TAILQ_HEAD(, inotify_entry) ifd_qhead;    /* queue of pending events */
          size_t              ifd_qcount;         /* number of pending events */
          kcondvar_t          ifd_qcv;  /* condvar for blocking reads */
          kmutex_t  ifd_qlock;          /* lock for ifd_q* and interlock */
                                                  /* for ifd_qcv */
};
123 
/*
 * One row of a flag translation table: a set of inotify mask bits and
 * the kevent fflags bits they correspond to.
 */
struct inotify_kevent_mask_pair {
          uint32_t inotify;
          uint32_t kevent;
};
128 
/* Forward declarations: internal helpers. */
static int          inotify_kev_fetch_changes(void *, const struct kevent *,
    struct kevent *, size_t, int);
static int          do_inotify_init(struct lwp *, register_t *, int);
static int          inotify_close_wd(struct inotifyfd *, int);
static uint32_t     inotify_mask_to_kevent_fflags(uint32_t, enum vtype);
static void         do_kevent_to_inotify(int32_t, uint32_t, uint32_t,
    struct inotify_entry *, size_t *, char *);
static int          kevent_to_inotify(struct inotifyfd *, int, enum vtype, uint32_t,
    uint32_t, struct inotify_entry *, size_t *);
static int          inotify_readdir(file_t *, struct dirent *, int *, bool);
static struct inotify_dir_entries *get_inotify_dir_entries(int, bool);

/* Forward declarations: filterops callbacks. */
static int          inotify_filt_attach(struct knote *);
static void         inotify_filt_detach(struct knote *);
static int          inotify_filt_event(struct knote *, long);
static void         inotify_read_filt_detach(struct knote *);
static int          inotify_read_filt_event(struct knote *, long);

/* Forward declarations: fileops callbacks. */
static int          inotify_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int          inotify_close(file_t *);
static int          inotify_poll(file_t *, int);
static int          inotify_kqfilter(file_t *, struct knote *);
static void         inotify_restart(file_t *);
152 
/*
 * Name under which the custom filter is registered and unregistered,
 * and the filter id that kfilter_register() stores for it (see
 * linux_inotify_init()).  The id is what inotify_add_watch uses as the
 * kevent filter when adding a watch.
 */
static const char inotify_filtname[] = "LINUX_INOTIFY";
static int inotify_filtid;
155 
/*
 * "fake" EVFILT_VNODE filter, registered under inotify_filtname.  Its
 * attach routine rewrites kn_filter to EVFILT_VNODE and hooks the knote
 * onto the watched vnode.
 *
 * NOTE(review): the previous comment said this "gets attached to
 * ifd_deps"; no such member exists in struct inotifyfd in this file,
 * so that wording looks stale.
 */
static const struct filterops inotify_filtops = {
          .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
          .f_attach = inotify_filt_attach,
          .f_detach = inotify_filt_detach,
          .f_event = inotify_filt_event,
          .f_touch = NULL,
};
164 
/*
 * EVFILT_READ attached to inotifyfd (to support watching via epoll).
 * There is no f_attach: knotes using these ops are set up through the
 * file's fo_kqfilter hook (inotify_kqfilter).
 */
static const struct filterops inotify_read_filtops = {
          .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
          .f_attach = NULL, /* attached via .fo_kqfilter */
          .f_detach = inotify_read_filt_detach,
          .f_event = inotify_read_filt_event,
          .f_touch = NULL,
};
173 
/*
 * File operations installed on every inotify descriptor by
 * do_inotify_init().  The address of this table is also how the
 * add_watch/rm_watch syscalls recognise an inotify descriptor
 * (fp->f_ops == &inotify_fileops).  Write, ioctl and stat are wired to
 * the fbadop_* stubs.
 */
static const struct fileops inotify_fileops = {
          .fo_name = "inotify",
          .fo_read = inotify_read,
          .fo_write = fbadop_write,
          .fo_ioctl = fbadop_ioctl,
          .fo_fcntl = fnullop_fcntl,
          .fo_poll = inotify_poll,
          .fo_stat = fbadop_stat,
          .fo_close = inotify_close,
          .fo_kqfilter = inotify_kqfilter,
          .fo_restart = inotify_restart,
          .fo_fpathconf = (void *)eopnotsupp,
};
187 
/*
 * basic flag translations
 *
 * inotify mask -> kevent fflags tables, consumed by
 * inotify_mask_to_kevent_fflags().  For each row whose .inotify bits
 * intersect the requested mask, the row's .kevent bits are ORed into
 * the resulting fflags.
 */

/* Applied to every supported vnode type (VREG, VDIR, VLNK). */
static const struct inotify_kevent_mask_pair common_inotify_to_kevent[] = {
          { .inotify = LINUX_IN_ATTRIB,           .kevent = NOTE_ATTRIB, },
          { .inotify = LINUX_IN_CLOSE_NOWRITE,    .kevent = NOTE_CLOSE, },
          { .inotify = LINUX_IN_OPEN,             .kevent = NOTE_OPEN, },
          { .inotify = LINUX_IN_MOVE_SELF,        .kevent = NOTE_RENAME, },
};
static const size_t common_inotify_to_kevent_len =
    __arraycount(common_inotify_to_kevent);

/* Applied in addition to the common table when the vnode is VREG. */
static const struct inotify_kevent_mask_pair vreg_inotify_to_kevent[] = {
          { .inotify = LINUX_IN_ACCESS,           .kevent = NOTE_READ, },
          { .inotify = LINUX_IN_ATTRIB,           .kevent = NOTE_ATTRIB|NOTE_LINK, },
          { .inotify = LINUX_IN_CLOSE_WRITE,      .kevent = NOTE_CLOSE_WRITE, },
          { .inotify = LINUX_IN_MODIFY,           .kevent = NOTE_WRITE, },
};
static const size_t vreg_inotify_to_kevent_len =
    __arraycount(vreg_inotify_to_kevent);

/*
 * Applied in addition to the common table when the vnode is VDIR.
 * All four directory-content events map onto the single NOTE_WRITE.
 */
static const struct inotify_kevent_mask_pair vdir_inotify_to_kevent[] = {
          { .inotify = LINUX_IN_ACCESS,           .kevent = NOTE_READ, },
          { .inotify = LINUX_IN_CREATE,           .kevent = NOTE_WRITE, },
          { .inotify = LINUX_IN_DELETE,           .kevent = NOTE_WRITE, },
          { .inotify = LINUX_IN_MOVED_FROM,       .kevent = NOTE_WRITE, },
          { .inotify = LINUX_IN_MOVED_TO,                   .kevent = NOTE_WRITE, },
};
static const size_t vdir_inotify_to_kevent_len =
    __arraycount(vdir_inotify_to_kevent);
216 
/*
 * kevent fflags -> inotify mask tables (the reverse direction of the
 * tables above).  NOTE(review): their consumer is not part of this
 * excerpt; presumably kevent_to_inotify() -- confirm there how rows
 * with several .kevent bits (e.g. NOTE_DELETE|NOTE_LINK) are matched.
 */
static const struct inotify_kevent_mask_pair common_kevent_to_inotify[] = {
          { .kevent = NOTE_ATTRIB,      .inotify = LINUX_IN_ATTRIB, },
          { .kevent = NOTE_CLOSE,                 .inotify = LINUX_IN_CLOSE_NOWRITE, },
          { .kevent = NOTE_CLOSE_WRITE, .inotify = LINUX_IN_CLOSE_WRITE, },
          { .kevent = NOTE_OPEN,                  .inotify = LINUX_IN_OPEN, },
          { .kevent = NOTE_READ,                  .inotify = LINUX_IN_ACCESS, },
          { .kevent = NOTE_RENAME,      .inotify = LINUX_IN_MOVE_SELF, },
          { .kevent = NOTE_REVOKE,      .inotify = LINUX_IN_UNMOUNT, },
};
static const size_t common_kevent_to_inotify_len =
    __arraycount(common_kevent_to_inotify);

/* Rows named for regular files (vreg). */
static const struct inotify_kevent_mask_pair vreg_kevent_to_inotify[] = {
          { .kevent = NOTE_DELETE|NOTE_LINK, .inotify = LINUX_IN_ATTRIB, },
          { .kevent = NOTE_WRITE,                 .inotify = LINUX_IN_MODIFY, },
};
static const size_t vreg_kevent_to_inotify_len =
    __arraycount(vreg_kevent_to_inotify);
235 
236 /*
237  * Register the custom kfilter for inotify.
238  */
239 int
linux_inotify_init(void)240 linux_inotify_init(void)
241 {
242           return kfilter_register(inotify_filtname, &inotify_filtops,
243               &inotify_filtid);
244 }
245 
246 /*
247  * Unregister the custom kfilter for inotify.
248  */
249 int
linux_inotify_fini(void)250 linux_inotify_fini(void)
251 {
252           return kfilter_unregister(inotify_filtname);
253 }
254 
255 /*
256  * Copyin callback used by kevent.  This copies already converted
257  * filters from kernel memory to the kevent internal kernel memory.
258  * Hence the memcpy instead of copyin.
259  */
260 static int
inotify_kev_fetch_changes(void * ctx,const struct kevent * changelist,struct kevent * changes,size_t index,int n)261 inotify_kev_fetch_changes(void *ctx, const struct kevent *changelist,
262     struct kevent *changes, size_t index, int n)
263 {
264           memcpy(changes, changelist + index, n * sizeof(*changes));
265 
266           return 0;
267 }
268 
269 /*
270  * Initialize a new inotify fd.
271  */
272 static int
do_inotify_init(struct lwp * l,register_t * retval,int flags)273 do_inotify_init(struct lwp *l, register_t *retval, int flags)
274 {
275           file_t *fp;
276           int error, fd;
277           struct proc *p = l->l_proc;
278           struct inotifyfd *ifd;
279           struct sys_kqueue1_args kqa;
280 
281           if (flags & ~(LINUX_IN_ALL_FLAGS))
282                     return EINVAL;
283 
284           ifd = kmem_zalloc(sizeof(*ifd), KM_SLEEP);
285           mutex_init(&ifd->ifd_lock, MUTEX_DEFAULT, IPL_NONE);
286           mutex_init(&ifd->ifd_qlock, MUTEX_DEFAULT, IPL_NONE);
287           cv_init(&ifd->ifd_qcv, "inotify");
288           selinit(&ifd->ifd_sel);
289           TAILQ_INIT(&ifd->ifd_qhead);
290 
291           ifd->ifd_nwds = 1;
292           ifd->ifd_wds = kmem_zalloc(ifd->ifd_nwds * sizeof(*ifd->ifd_wds),
293           KM_SLEEP);
294 
295           SCARG(&kqa, flags) = 0;
296           if (flags & LINUX_IN_NONBLOCK)
297                     SCARG(&kqa, flags) |= O_NONBLOCK;
298           error = sys_kqueue1(l, &kqa, retval);
299           if (error != 0)
300                     goto leave0;
301           ifd->ifd_kqfd = *retval;
302 
303           error = fd_allocfile(&fp, &fd);
304           if (error != 0)
305                     goto leave1;
306 
307           fp->f_flag = FREAD;
308           if (flags & LINUX_IN_NONBLOCK)
309                     fp->f_flag |= FNONBLOCK;
310           fp->f_type = DTYPE_MISC;
311           fp->f_ops = &inotify_fileops;
312           fp->f_data = ifd;
313           fd_set_exclose(l, fd, (flags & LINUX_IN_CLOEXEC) != 0);
314           fd_affix(p, fp, fd);
315 
316           *retval = fd;
317           return 0;
318 
319 leave1:
320           KASSERT(fd_getfile(ifd->ifd_kqfd) != NULL);
321           fd_close(ifd->ifd_kqfd);
322 leave0:
323           kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
324           kmem_free(ifd, sizeof(*ifd));
325 
326           mutex_destroy(&ifd->ifd_lock);
327           mutex_destroy(&ifd->ifd_qlock);
328           cv_destroy(&ifd->ifd_qcv);
329           seldestroy(&ifd->ifd_sel);
330 
331           return error;
332 }
333 
334 #ifndef __aarch64__
335 /*
336  * inotify_init(2).  Initialize a new inotify fd with flags=0.
337  */
338 int
linux_sys_inotify_init(struct lwp * l,const void * v,register_t * retval)339 linux_sys_inotify_init(struct lwp *l, const void *v, register_t *retval)
340 {
341           return do_inotify_init(l, retval, 0);
342 }
343 #endif
344 
345 /*
346  * inotify_init(2).  Initialize a new inotify fd with the given flags.
347  */
348 int
linux_sys_inotify_init1(struct lwp * l,const struct linux_sys_inotify_init1_args * uap,register_t * retval)349 linux_sys_inotify_init1(struct lwp *l,
350     const struct linux_sys_inotify_init1_args *uap, register_t *retval)
351 {
352           /* {
353                     syscallarg(int) flags;
354           } */
355 
356           return do_inotify_init(l, retval, SCARG(uap, flags));
357 }
358 
359 /*
360  * Convert inotify mask to the fflags of an equivalent kevent.
361  */
362 static uint32_t
inotify_mask_to_kevent_fflags(uint32_t mask,enum vtype type)363 inotify_mask_to_kevent_fflags(uint32_t mask, enum vtype type)
364 {
365           const struct inotify_kevent_mask_pair *type_inotify_to_kevent;
366           uint32_t fflags;
367           size_t i, type_inotify_to_kevent_len;
368 
369           switch (type) {
370           case VREG:
371           case VDIR:
372           case VLNK:
373                     break;
374 
375           default:
376                     return 0;
377           }
378 
379           /* flags that all watches could have */
380           fflags = NOTE_DELETE|NOTE_REVOKE;
381           for (i = 0; i < common_inotify_to_kevent_len; i++)
382                     if (mask & common_inotify_to_kevent[i].inotify)
383                               fflags |= common_inotify_to_kevent[i].kevent;
384 
385           /* flags that depend on type */
386           switch (type) {
387           case VREG:
388                     type_inotify_to_kevent = vreg_inotify_to_kevent;
389                     type_inotify_to_kevent_len = vreg_inotify_to_kevent_len;
390                     break;
391 
392           case VDIR:
393                     type_inotify_to_kevent = vdir_inotify_to_kevent;
394                     type_inotify_to_kevent_len = vdir_inotify_to_kevent_len;
395                     break;
396 
397           default:
398                     type_inotify_to_kevent_len = 0;
399                     break;
400           }
401           for (i = 0; i < type_inotify_to_kevent_len; i++)
402                     if (mask & type_inotify_to_kevent[i].inotify)
403                               fflags |= type_inotify_to_kevent[i].kevent;
404 
405           return fflags;
406 }
407 
408 /*
409  * inotify_add_watch(2).  Open a fd for pathname (if desired by mask)
410  * track it and add an equivalent kqueue event for it in
411  * ifd->ifd_kqfd.
412  */
413 int
linux_sys_inotify_add_watch(struct lwp * l,const struct linux_sys_inotify_add_watch_args * uap,register_t * retval)414 linux_sys_inotify_add_watch(struct lwp *l,
415     const struct linux_sys_inotify_add_watch_args *uap, register_t *retval)
416 {
417           /* {
418                     syscallarg(int) fd;
419                     syscallarg(const char *) pathname;
420                     syscallarg(uint32_t) mask;
421           } */
422           int wd, i, error = 0;
423           file_t *fp, *wp, *cur_fp;
424           struct inotifyfd *ifd;
425           struct inotify_dir_entries **new_wds;
426           struct knote *kn, *tmpkn;
427           struct sys_open_args oa;
428           struct kevent kev;
429           struct vnode *wvp;
430           namei_simple_flags_t sflags;
431           struct kevent_ops k_ops = {
432                     .keo_private = NULL,
433                     .keo_fetch_timeout = NULL,
434                     .keo_fetch_changes = inotify_kev_fetch_changes,
435                     .keo_put_events = NULL,
436           };
437           const int fd = SCARG(uap, fd);
438           const uint32_t mask = SCARG(uap, mask);
439 
440           if (mask & ~LINUX_IN_ADD_KNOWN)
441                     return EINVAL;
442 
443           fp = fd_getfile(fd);
444           if (fp == NULL)
445                     return EBADF;
446 
447           if (fp->f_ops != &inotify_fileops) {
448                     /* not an inotify fd */
449                     error = EBADF;
450                     goto leave0;
451           }
452 
453           ifd = fp->f_data;
454 
455           mutex_enter(&ifd->ifd_lock);
456 
457           if (mask & LINUX_IN_DONT_FOLLOW)
458                     sflags = NSM_NOFOLLOW_TRYEMULROOT;
459           else
460                     sflags = NSM_FOLLOW_TRYEMULROOT;
461           error = namei_simple_user(SCARG(uap, pathname), sflags, &wvp);
462           if (error != 0)
463                     goto leave1;
464 
465           /* Check to see if we already have a descriptor to wd's file. */
466         wd = -1;
467           for (i = 0; i < ifd->ifd_nwds; i++) {
468                     if (ifd->ifd_wds[i] != NULL) {
469                               cur_fp = fd_getfile(i);
470                               if (cur_fp == NULL) {
471                                         DPRINTF(("%s: wd=%d was closed externally\n",
472                                             __func__, i));
473                                         error = EBADF;
474                                         goto leave1;
475                               }
476                               if (cur_fp->f_type != DTYPE_VNODE) {
477                                         DPRINTF(("%s: wd=%d was replaced "
478                                             "with a non-vnode\n", __func__, i));
479                                         error = EBADF;
480                               }
481                               if (error == 0 && cur_fp->f_vnode == wvp)
482                                         wd = i;
483                               fd_putfile(i);
484                               if (error != 0)
485                                         goto leave1;
486 
487                               if (wd != -1)
488                                         break;
489                     }
490           }
491 
492           if (wd == -1) {
493                     /*
494                      * If we do not have a descriptor to wd's file, we
495                      * need to open the watch descriptor.
496                      */
497                     SCARG(&oa, path) = SCARG(uap, pathname);
498                     SCARG(&oa, mode) = 0;
499                     SCARG(&oa, flags) = O_RDONLY;
500                     if (mask & LINUX_IN_DONT_FOLLOW)
501                               SCARG(&oa, flags) |= O_NOFOLLOW;
502                     if (mask & LINUX_IN_ONLYDIR)
503                               SCARG(&oa, flags) |= O_DIRECTORY;
504 
505                     error = sys_open(l, &oa, retval);
506                     if (error != 0)
507                               goto leave1;
508                     wd = *retval;
509                     wp = fd_getfile(wd);
510                   KASSERT(wp != NULL);
511                     KASSERT(wp->f_type == DTYPE_VNODE);
512 
513                     /* translate the flags */
514                     memset(&kev, 0, sizeof(kev));
515                     EV_SET(&kev, wd, inotify_filtid, EV_ADD|EV_ENABLE,
516                         NOTE_DELETE|NOTE_REVOKE, 0, ifd);
517                     if (mask & LINUX_IN_ONESHOT)
518                               kev.flags |= EV_ONESHOT;
519                     kev.fflags |= inotify_mask_to_kevent_fflags(mask,
520                         wp->f_vnode->v_type);
521 
522                     error = kevent1(retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL,
523                         &k_ops);
524                     if (error != 0) {
525                               KASSERT(fd_getfile(wd) != NULL);
526                               fd_close(wd);
527                     } else {
528                               /* Success! */
529                               *retval = wd;
530 
531                               /* Resize ifd_nwds to accommodate wd. */
532                               if (wd+1 > ifd->ifd_nwds) {
533                                         new_wds = kmem_zalloc(
534                                             (wd+1) * sizeof(*ifd->ifd_wds), KM_SLEEP);
535                                         memcpy(new_wds, ifd->ifd_wds,
536                                             ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
537 
538                                         kmem_free(ifd->ifd_wds,
539                                             ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
540 
541                                         ifd->ifd_wds = new_wds;
542                                         ifd->ifd_nwds = wd+1;
543                               }
544 
545                               ifd->ifd_wds[wd] = get_inotify_dir_entries(wd, true);
546                     }
547           } else {
548                     /*
549                      * If we do have a descriptor to wd's file, try to edit
550                      * the relevant knote.
551                      */
552                     if (mask & LINUX_IN_MASK_CREATE) {
553                               error = EEXIST;
554                               goto leave1;
555                     }
556 
557                     wp = fd_getfile(wd);
558                     if (wp == NULL) {
559                               DPRINTF(("%s: wd=%d was closed externally "
560                                   "(race, probably)\n", __func__, wd));
561                               error = EBADF;
562                               goto leave1;
563                     }
564                     if (wp->f_type != DTYPE_VNODE) {
565                               DPRINTF(("%s: wd=%d was replace with a non-vnode "
566                                   "(race, probably)\n", __func__, wd));
567                               error = EBADF;
568                               goto leave2;
569                     }
570 
571                     kev.fflags = NOTE_DELETE | NOTE_REVOKE
572                         | inotify_mask_to_kevent_fflags(mask, wp->f_vnode->v_type);
573 
574                     mutex_enter(wp->f_vnode->v_interlock);
575 
576                     /*
577                      * XXX We are forced to find the appropriate knote
578                      * manually because we cannot create a custom f_touch
579                      * function for inotify_filtops.  See filter_touch()
580                      * in kern_event.c for details.
581                      */
582                   SLIST_FOREACH_SAFE(kn, &wp->f_vnode->v_klist->vk_klist,
583                         kn_selnext, tmpkn) {
584                               if (kn->kn_fop == &inotify_filtops
585                                   && ifd == kn->kn_kevent.udata) {
586                                         mutex_enter(&kn->kn_kq->kq_lock);
587                                         if (mask & LINUX_IN_MASK_ADD)
588                                                   kn->kn_sfflags |= kev.fflags;
589                                         else
590                                                   kn->kn_sfflags = kev.fflags;
591                                         wp->f_vnode->v_klist->vk_interest |=
592                                             kn->kn_sfflags;
593                                         mutex_exit(&kn->kn_kq->kq_lock);
594                               }
595                     }
596 
597                     mutex_exit(wp->f_vnode->v_interlock);
598 
599                     /* Success! */
600                     *retval = wd;
601           }
602 
603 leave2:
604           fd_putfile(wd);
605 leave1:
606           mutex_exit(&ifd->ifd_lock);
607 leave0:
608           fd_putfile(fd);
609           return error;
610 }
611 
612 /*
613  * Remove a wd from ifd and close wd.
614  */
615 static int
inotify_close_wd(struct inotifyfd * ifd,int wd)616 inotify_close_wd(struct inotifyfd *ifd, int wd)
617 {
618           file_t *wp;
619           int error;
620           register_t retval;
621           struct kevent kev;
622           struct kevent_ops k_ops = {
623                     .keo_private = NULL,
624                     .keo_fetch_timeout = NULL,
625                     .keo_fetch_changes = inotify_kev_fetch_changes,
626                     .keo_put_events = NULL,
627           };
628 
629           mutex_enter(&ifd->ifd_lock);
630 
631           KASSERT(0 <= wd && wd < ifd->ifd_nwds && ifd->ifd_wds[wd] != NULL);
632 
633           kmem_free(ifd->ifd_wds[wd],
634               INOTIFY_DIR_ENTRIES_SIZE(ifd->ifd_wds[wd]->ide_count));
635           ifd->ifd_wds[wd] = NULL;
636 
637           mutex_exit(&ifd->ifd_lock);
638 
639           wp = fd_getfile(wd);
640           if (wp == NULL) {
641                     DPRINTF(("%s: wd=%d is already closed\n", __func__, wd));
642                     return 0;
643           }
644           KASSERT(!mutex_owned(wp->f_vnode->v_interlock));
645 
646           memset(&kev, 0, sizeof(kev));
647           EV_SET(&kev, wd, EVFILT_VNODE, EV_DELETE, 0, 0, 0);
648           error = kevent1(&retval, ifd->ifd_kqfd, &kev, 1, NULL, 0, NULL, &k_ops);
649           if (error != 0)
650                     DPRINTF(("%s: attempt to disable all events for wd=%d "
651                         "had error=%d\n", __func__, wd, error));
652 
653           return fd_close(wd);
654 }
655 
656 /*
657  * inotify_rm_watch(2).  Close wd and remove it from ifd->ifd_wds.
658  */
659 int
linux_sys_inotify_rm_watch(struct lwp * l,const struct linux_sys_inotify_rm_watch_args * uap,register_t * retval)660 linux_sys_inotify_rm_watch(struct lwp *l,
661     const struct linux_sys_inotify_rm_watch_args *uap, register_t *retval)
662 {
663           /* {
664                     syscallarg(int) fd;
665                     syscallarg(int) wd;
666           } */
667           struct inotifyfd *ifd;
668           file_t *fp;
669           int error = 0;
670           const int fd = SCARG(uap, fd);
671           const int wd = SCARG(uap, wd);
672 
673           fp = fd_getfile(fd);
674           if (fp == NULL)
675                     return EBADF;
676           if (fp->f_ops != &inotify_fileops) {
677                     /* not an inotify fd */
678                     error = EINVAL;
679                     goto leave;
680           }
681 
682           ifd = fp->f_data;
683           if (wd < 0 || wd >= ifd->ifd_nwds || ifd->ifd_wds[wd] == NULL) {
684                     error = EINVAL;
685                     goto leave;
686           }
687 
688           error = inotify_close_wd(ifd, wd);
689 
690 leave:
691           fd_putfile(fd);
692           return error;
693 }
694 
695 /*
696  * Attach the inotify filter.
697  */
698 static int
inotify_filt_attach(struct knote * kn)699 inotify_filt_attach(struct knote *kn)
700 {
701           file_t *fp = kn->kn_obj;
702           struct vnode *vp;
703 
704           KASSERT(fp->f_type == DTYPE_VNODE);
705           vp = fp->f_vnode;
706 
707           /*
708            * Needs to be set so that we get the same event handling as
709            * EVFILT_VNODE.  Otherwise we don't get any events.
710            *
711            * A consequence of this is that modifications/removals of
712            * this knote need to specify EVFILT_VNODE rather than
713            * inotify_filtid.
714            */
715           kn->kn_filter = EVFILT_VNODE;
716 
717           kn->kn_fop = &inotify_filtops;
718           kn->kn_hook = vp;
719           vn_knote_attach(vp, kn);
720 
721           return 0;
722 }
723 
724 /*
725  * Detach the inotify filter.
726  */
727 static void
inotify_filt_detach(struct knote * kn)728 inotify_filt_detach(struct knote *kn)
729 {
730           struct vnode *vp = (struct vnode *)kn->kn_hook;
731 
732           vn_knote_detach(vp, kn);
733 }
734 
735 /*
736  * Create a single inotify event.
737  */
738 static void
do_kevent_to_inotify(int32_t wd,uint32_t mask,uint32_t cookie,struct inotify_entry * buf,size_t * nbuf,char * name)739 do_kevent_to_inotify(int32_t wd, uint32_t mask, uint32_t cookie,
740     struct inotify_entry *buf, size_t *nbuf, char *name)
741 {
742           KASSERT(*nbuf < LINUX_INOTIFY_MAX_FROM_KEVENT);
743 
744           buf += *nbuf;
745 
746           memset(buf, 0, sizeof(*buf));
747 
748           buf->ie_event.wd = wd;
749           buf->ie_event.mask = mask;
750           buf->ie_event.cookie = cookie;
751 
752           if (name != NULL) {
753                     buf->ie_event.len = strlen(name) + 1;
754                     KASSERT(buf->ie_event.len < sizeof(buf->ie_name));
755                     strcpy(buf->ie_name, name);
756           }
757 
758           ++(*nbuf);
759 }
760 
/*
 * Like vn_readdir(), but with vnode locking only if needs_lock is
 * true (to avoid double locking in some situations).
 *
 * Reads directory entries from fp (which must be a directory vnode)
 * into the single struct dirent-sized buffer dep, starting at the
 * file's current f_offset, and writes the resulting offset back to
 * f_offset.  *done receives the number of bytes VOP_READDIR() put
 * into dep.  Returns the error from VOP_READDIR().
 *
 * needs_lock == true:  the vnode is share-locked around VOP_READDIR().
 * needs_lock == false: vp->v_interlock is released around
 *     VOP_READDIR() and re-entered afterwards, so the caller is
 *     expected to hold it on entry.
 */
static int
inotify_readdir(file_t *fp, struct dirent *dep, int *done, bool needs_lock)
{
          struct vnode *vp;
          struct iovec iov;
          struct uio uio;
          int error, eofflag;

          KASSERT(fp->f_type == DTYPE_VNODE);
          vp = fp->f_vnode;
          KASSERT(vp->v_type == VDIR);

          /* kernel-space uio describing exactly one struct dirent buffer */
          iov.iov_base = dep;
          iov.iov_len = sizeof(*dep);

          uio.uio_iov = &iov;
          uio.uio_iovcnt = 1;
          uio.uio_rw = UIO_READ;
          uio.uio_resid = sizeof(*dep);
          UIO_SETUP_SYSSPACE(&uio);

          /* resume where the previous read on this file left off */
          mutex_enter(&fp->f_lock);
          uio.uio_offset = fp->f_offset;
          mutex_exit(&fp->f_lock);

          /* XXX: should pass whether to lock or not */
          if (needs_lock)
                    vn_lock(vp, LK_SHARED | LK_RETRY);
          else
                    /*
                     * XXX We need to temporarily drop v_interlock because
                     * it may be temporarily acquired by biowait().
                     */
                    mutex_exit(vp->v_interlock);
          KASSERT(!mutex_owned(vp->v_interlock));
          error = VOP_READDIR(vp, &uio, fp->f_cred, &eofflag, NULL, NULL);
          if (needs_lock)
                    VOP_UNLOCK(vp);
          else
                    mutex_enter(vp->v_interlock);

          /* remember the new position even if VOP_READDIR() failed */
          mutex_enter(&fp->f_lock);
          fp->f_offset = uio.uio_offset;
          mutex_exit(&fp->f_lock);

          *done = sizeof(*dep) - uio.uio_resid;
          return error;
}
813 
814 /*
815  * Create (and allocate) an appropriate inotify_dir_entries struct for wd to be
816  * used on ifd_wds of inotifyfd.  If the entries on a directory fail to be read,
817  * NULL is returned.  needs_lock indicates if the vnode's lock is not already
818  * owned.
819  */
820 static struct inotify_dir_entries *
get_inotify_dir_entries(int wd,bool needs_lock)821 get_inotify_dir_entries(int wd, bool needs_lock)
822 {
823           struct dirent de;
824           struct dirent *currdep;
825           struct inotify_dir_entries *idep = NULL;
826           file_t *wp;
827           int done, error;
828           size_t i, decount;
829 
830           wp = fd_getfile(wd);
831           if (wp == NULL)
832                     return NULL;
833           if (wp->f_type != DTYPE_VNODE)
834                     goto leave;
835 
836           /* for non-directories, we have 0 entries. */
837           if (wp->f_vnode->v_type != VDIR) {
838                     idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(0), KM_SLEEP);
839                     goto leave;
840           }
841 
842           mutex_enter(&wp->f_lock);
843           wp->f_offset = 0;
844           mutex_exit(&wp->f_lock);
845           decount = 0;
846           for (;;) {
847                     error = inotify_readdir(wp, &de, &done, needs_lock);
848                     if (error != 0)
849                               goto leave;
850                     if (done == 0)
851                               break;
852 
853                     currdep = &de;
854                   while ((char *)currdep < ((char *)&de) + done) {
855                               decount++;
856                               currdep = _DIRENT_NEXT(currdep);
857                     }
858           }
859 
860           idep = kmem_zalloc(INOTIFY_DIR_ENTRIES_SIZE(decount), KM_SLEEP);
861           idep->ide_count = decount;
862 
863           mutex_enter(&wp->f_lock);
864           wp->f_offset = 0;
865           mutex_exit(&wp->f_lock);
866           for (i = 0; i < decount;) {
867                     error = inotify_readdir(wp, &de, &done, needs_lock);
868                     if (error != 0 || done == 0) {
869                               kmem_free(idep, INOTIFY_DIR_ENTRIES_SIZE(decount));
870                               idep = NULL;
871                               goto leave;
872                     }
873 
874                     currdep = &de;
875                     while ((char *)currdep < ((char *)&de) + done) {
876                               idep->ide_entries[i].fileno = currdep->d_fileno;
877                               strcpy(idep->ide_entries[i].name, currdep->d_name);
878 
879                               currdep = _DIRENT_NEXT(currdep);
880                               i++;
881                     }
882           }
883 
884 leave:
885           fd_putfile(wd);
886           return idep;
887 }
888 
889 static size_t
find_entry(struct inotify_dir_entries * i1,struct inotify_dir_entries * i2)890 find_entry(struct inotify_dir_entries *i1, struct inotify_dir_entries *i2)
891 {
892           for (size_t i = 0; i < i2->ide_count; i++)
893                     if (i2->ide_entries[i].fileno != i1->ide_entries[i].fileno)
894                               return i;
895           KASSERTMSG(0, "Entry not found");
896           return -1;
897 }
898 
899 static void
handle_write(struct inotifyfd * ifd,int wd,struct inotify_entry * buf,size_t * nbuf)900 handle_write(struct inotifyfd *ifd, int wd, struct inotify_entry *buf,
901     size_t *nbuf)
902 {
903           struct inotify_dir_entries *old_idep, *new_idep;
904           size_t i;
905 
906           mutex_enter(&ifd->ifd_lock);
907 
908           old_idep = ifd->ifd_wds[wd];
909           KASSERT(old_idep != NULL);
910           new_idep = get_inotify_dir_entries(wd, false);
911           if (new_idep == NULL) {
912                     DPRINTF(("%s: directory for wd=%d could not be read\n",
913                         __func__, wd));
914                     mutex_exit(&ifd->ifd_lock);
915                     return;
916           }
917 
918 
919           if (old_idep->ide_count < new_idep->ide_count) {
920                     KASSERT(old_idep->ide_count + 1 == new_idep->ide_count);
921 
922                     /* Find the new entry. */
923                     i = find_entry(new_idep, old_idep);
924                     do_kevent_to_inotify(wd, LINUX_IN_CREATE, 0,
925                         buf, nbuf, new_idep->ide_entries[i].name);
926                     goto out;
927           }
928 
929           if (old_idep->ide_count > new_idep->ide_count) {
930                     KASSERT(old_idep->ide_count == new_idep->ide_count + 1);
931 
932                     /* Find the deleted entry. */
933                     i = find_entry(old_idep, new_idep);
934 
935                     do_kevent_to_inotify(wd, LINUX_IN_DELETE, 0,
936                         buf, nbuf, old_idep->ide_entries[i].name);
937                     goto out;
938           }
939 
940           /*
941            * XXX Because we are not watching the entire
942            * file system, the only time we know for sure
943            * that the event is a LINUX_IN_MOVED_FROM/
944            * LINUX_IN_MOVED_TO is when the move happens
945            * within a single directory...  ie. the number
946            * of directory entries has not changed.
947            *
948            * Otherwise all we can say for sure is that
949            * something was created/deleted.  So we issue a
950            * LINUX_IN_CREATE/LINUX_IN_DELETE.
951            */
952           ino_t changed = new_idep->ide_entries[new_idep->ide_count - 1].fileno;
953 
954           /* Find the deleted entry. */
955           for (i = 0; i < old_idep->ide_count; i++)
956                     if (old_idep->ide_entries[i].fileno == changed)
957                               break;
958           KASSERT(i != old_idep->ide_count);
959 
960           do_kevent_to_inotify(wd, LINUX_IN_MOVED_FROM, changed, buf, nbuf,
961               old_idep->ide_entries[i].name);
962 
963           do_kevent_to_inotify(wd, LINUX_IN_MOVED_TO, changed, buf, nbuf,
964               new_idep->ide_entries[new_idep->ide_count - 1].name);
965 
966 out:
967           ifd->ifd_wds[wd] = new_idep;
968           mutex_exit(&ifd->ifd_lock);
969 }
970 
971 /*
972  * Convert a kevent flags and fflags for EVFILT_VNODE to some number
973  * of inotify events.
974  */
975 static int
kevent_to_inotify(struct inotifyfd * ifd,int wd,enum vtype wtype,uint32_t flags,uint32_t fflags,struct inotify_entry * buf,size_t * nbuf)976 kevent_to_inotify(struct inotifyfd *ifd, int wd, enum vtype wtype,
977     uint32_t flags, uint32_t fflags, struct inotify_entry *buf,
978     size_t *nbuf)
979 {
980           struct stat st;
981           file_t *wp;
982           size_t i;
983           int error = 0;
984 
985           for (i = 0; i < common_kevent_to_inotify_len; i++)
986                     if (fflags & common_kevent_to_inotify[i].kevent)
987                               do_kevent_to_inotify(wd,
988                                   common_kevent_to_inotify[i].inotify, 0, buf, nbuf,
989                                   NULL);
990 
991           if (wtype == VREG) {
992                     for (i = 0; i < vreg_kevent_to_inotify_len; i++)
993                               if (fflags & vreg_kevent_to_inotify[i].kevent)
994                                         do_kevent_to_inotify(wd,
995                                             vreg_kevent_to_inotify[i].inotify, 0,
996                                             buf, nbuf, NULL);
997           } else if (wtype == VDIR) {
998                     for (i = 0; i < *nbuf; i++)
999                               if (buf[i].ie_event.mask &
1000                                   (LINUX_IN_ACCESS|LINUX_IN_ATTRIB
1001                                 |LINUX_IN_CLOSE|LINUX_IN_OPEN))
1002                                         buf[i].ie_event.mask |= LINUX_IN_ISDIR;
1003 
1004                     /* Need to disambiguate the possible NOTE_WRITEs. */
1005                     if (fflags & NOTE_WRITE)
1006                               handle_write(ifd, wd, buf, nbuf);
1007           }
1008 
1009           /*
1010            * Need to check if wd is actually has a link count of 0 to issue a
1011            * LINUX_IN_DELETE_SELF.
1012            */
1013           if (fflags & NOTE_DELETE) {
1014                     wp = fd_getfile(wd);
1015                     KASSERT(wp != NULL);
1016                     KASSERT(wp->f_type == DTYPE_VNODE);
1017                     vn_stat(wp->f_vnode, &st);
1018                     fd_putfile(wd);
1019 
1020                     if (st.st_nlink == 0)
1021                               do_kevent_to_inotify(wd, LINUX_IN_DELETE_SELF, 0,
1022                                   buf, nbuf, NULL);
1023           }
1024 
1025           /* LINUX_IN_IGNORED must be the last event issued for wd. */
1026           if ((flags & EV_ONESHOT) || (fflags & (NOTE_REVOKE|NOTE_DELETE))) {
1027                     do_kevent_to_inotify(wd, LINUX_IN_IGNORED, 0, buf, nbuf, NULL);
1028                     /*
1029                      * XXX in theory we could call inotify_close_wd(ifd, wd) but if
1030                      * we get here we must already be holding v_interlock for
1031                      * wd... so we can't.
1032                      *
1033                      * For simplicity we do nothing, and so wd will only be closed
1034                      * when the inotify fd is closed.
1035                      */
1036           }
1037 
1038           return error;
1039 }
1040 
1041 /*
1042  * Handle an event.  Unlike EVFILT_VNODE, we translate the event to a
1043  * linux_inotify_event and put it in our own custom queue.
1044  */
1045 static int
inotify_filt_event(struct knote * kn,long hint)1046 inotify_filt_event(struct knote *kn, long hint)
1047 {
1048         struct vnode *vp = (struct vnode *)kn->kn_hook;
1049           struct inotifyfd *ifd;
1050           struct inotify_entry *cur_ie;
1051           size_t nbuf, i;
1052           uint32_t status;
1053           struct inotify_entry buf[LINUX_INOTIFY_MAX_FROM_KEVENT];
1054 
1055           /*
1056            * If KN_WILLDETACH is set then
1057            * 1. kn->kn_kevent.udata has already been trashed with a
1058            *    struct lwp *, so we don't have access to a real ifd
1059            *    anymore, and
1060            * 2. we're about to detach anyways, so we don't really care
1061            *    about the events.
1062            * (Also because of this we need to get ifd under the same
1063            * lock as kn->kn_status.)
1064            */
1065           mutex_enter(&kn->kn_kq->kq_lock);
1066           status = kn->kn_status;
1067           ifd = kn->kn_kevent.udata;
1068           mutex_exit(&kn->kn_kq->kq_lock);
1069           if (status & KN_WILLDETACH)
1070                     return 0;
1071 
1072           /*
1073            * If we don't care about the NOTEs in hint, we don't generate
1074            * any events.
1075            */
1076           hint &= kn->kn_sfflags;
1077           if (hint == 0)
1078                     return 0;
1079 
1080           KASSERT(mutex_owned(vp->v_interlock));
1081           KASSERT(!mutex_owned(&ifd->ifd_lock));
1082 
1083           mutex_enter(&ifd->ifd_qlock);
1084 
1085           /*
1086            * early out: there's no point even traslating the event if we
1087            * have nowhere to put it (and an LINUX_IN_Q_OVERFLOW has
1088            * already been added).
1089            */
1090           if (ifd->ifd_qcount >= LINUX_INOTIFY_MAX_QUEUED)
1091                     goto leave;
1092 
1093           nbuf = 0;
1094           (void)kevent_to_inotify(ifd, kn->kn_id, vp->v_type, kn->kn_flags,
1095               hint, buf, &nbuf);
1096           for (i = 0; i < nbuf && ifd->ifd_qcount < LINUX_INOTIFY_MAX_QUEUED-1;
1097                i++) {
1098                     cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
1099                     memcpy(cur_ie, &buf[i], sizeof(*cur_ie));
1100 
1101                     TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries);
1102                     ifd->ifd_qcount++;
1103           }
1104           /* handle early overflow, by adding an overflow event to the end */
1105           if (i != nbuf) {
1106                     nbuf = 0;
1107                     cur_ie = kmem_zalloc(sizeof(*cur_ie), KM_SLEEP);
1108                     do_kevent_to_inotify(-1, LINUX_IN_Q_OVERFLOW, 0,
1109                         cur_ie, &nbuf, NULL);
1110 
1111                     TAILQ_INSERT_TAIL(&ifd->ifd_qhead, cur_ie, ie_entries);
1112                     ifd->ifd_qcount++;
1113           }
1114 
1115           if (nbuf > 0) {
1116                     cv_signal(&ifd->ifd_qcv);
1117 
1118                     mutex_enter(&ifd->ifd_lock);
1119                     selnotify(&ifd->ifd_sel, 0, NOTE_LOWAT);
1120                     mutex_exit(&ifd->ifd_lock);
1121           } else
1122                     DPRINTF(("%s: hint=%lx resulted in 0 inotify events\n",
1123                         __func__, hint));
1124 
1125 leave:
1126           mutex_exit(&ifd->ifd_qlock);
1127           return 0;
1128 }
1129 
1130 /*
1131  * Read inotify events from the queue.
1132  */
1133 static int
inotify_read(file_t * fp,off_t * offp,struct uio * uio,kauth_cred_t cred,int flags)1134 inotify_read(file_t *fp, off_t *offp, struct uio *uio, kauth_cred_t cred,
1135     int flags)
1136 {
1137           struct inotify_entry *cur_iep;
1138           size_t cur_size, nread;
1139           int error = 0;
1140           struct inotifyfd *ifd = fp->f_data;
1141 
1142           mutex_enter(&ifd->ifd_qlock);
1143 
1144           if (ifd->ifd_qcount == 0) {
1145                     if (fp->f_flag & O_NONBLOCK) {
1146                               error = EAGAIN;
1147                               goto leave;
1148                     }
1149 
1150                     while (ifd->ifd_qcount == 0) {
1151                               /* wait until there is an event to read */
1152                               error = cv_wait_sig(&ifd->ifd_qcv, &ifd->ifd_qlock);
1153                               if (error != 0) {
1154                                         error = EINTR;
1155                                         goto leave;
1156                               }
1157                     }
1158           }
1159 
1160           KASSERT(ifd->ifd_qcount > 0);
1161           KASSERT(mutex_owned(&ifd->ifd_qlock));
1162 
1163           nread = 0;
1164           while (ifd->ifd_qcount > 0) {
1165                     cur_iep = TAILQ_FIRST(&ifd->ifd_qhead);
1166                     KASSERT(cur_iep != NULL);
1167 
1168                     cur_size = sizeof(cur_iep->ie_event) + cur_iep->ie_event.len;
1169                     if (cur_size > uio->uio_resid) {
1170                               if (nread == 0)
1171                                         error = EINVAL;
1172                               break;
1173                     }
1174 
1175                     error = uiomove(&cur_iep->ie_event, sizeof(cur_iep->ie_event),
1176                         uio);
1177                     if (error != 0)
1178                               break;
1179                     error = uiomove(&cur_iep->ie_name, cur_iep->ie_event.len, uio);
1180                     if (error != 0)
1181                               break;
1182 
1183                     /* cleanup */
1184                     TAILQ_REMOVE(&ifd->ifd_qhead, cur_iep, ie_entries);
1185                     kmem_free(cur_iep, sizeof(*cur_iep));
1186 
1187                     nread++;
1188                     ifd->ifd_qcount--;
1189           }
1190 
1191 leave:
1192           /* Wake up the next reader, if the queue is not empty. */
1193           if (ifd->ifd_qcount > 0)
1194                     cv_signal(&ifd->ifd_qcv);
1195 
1196           mutex_exit(&ifd->ifd_qlock);
1197           return error;
1198 }
1199 
1200 /*
1201  * Close all the file descriptors associated with fp.
1202  */
1203 static int
inotify_close(file_t * fp)1204 inotify_close(file_t *fp)
1205 {
1206           int error;
1207           size_t i;
1208           file_t *kqfp;
1209           struct inotifyfd *ifd = fp->f_data;
1210 
1211           for (i = 0; i < ifd->ifd_nwds; i++) {
1212                     if (ifd->ifd_wds[i] != NULL) {
1213                               error = inotify_close_wd(ifd, i);
1214                               if (error != 0)
1215                                         return error;
1216                     }
1217           }
1218 
1219           /* the reference we need to hold is ifd->ifd_kqfp */
1220           kqfp = fd_getfile(ifd->ifd_kqfd);
1221           if (kqfp == NULL) {
1222                     DPRINTF(("%s: kqfp=%d is already closed\n", __func__,
1223                         ifd->ifd_kqfd));
1224           } else {
1225                     error = fd_close(ifd->ifd_kqfd);
1226                     if (error != 0)
1227                               return error;
1228           }
1229 
1230           mutex_destroy(&ifd->ifd_lock);
1231           mutex_destroy(&ifd->ifd_qlock);
1232           cv_destroy(&ifd->ifd_qcv);
1233           seldestroy(&ifd->ifd_sel);
1234 
1235           kmem_free(ifd->ifd_wds, ifd->ifd_nwds * sizeof(*ifd->ifd_wds));
1236           kmem_free(ifd, sizeof(*ifd));
1237           fp->f_data = NULL;
1238 
1239           return 0;
1240 }
1241 
1242 /*
1243  * Check if there are pending read events.
1244  */
1245 static int
inotify_poll(file_t * fp,int events)1246 inotify_poll(file_t *fp, int events)
1247 {
1248           int revents;
1249           struct inotifyfd *ifd = fp->f_data;
1250 
1251           revents = 0;
1252           if (events & (POLLIN|POLLRDNORM)) {
1253                     mutex_enter(&ifd->ifd_qlock);
1254 
1255                     if (ifd->ifd_qcount > 0)
1256                               revents |= events & (POLLIN|POLLRDNORM);
1257 
1258                     mutex_exit(&ifd->ifd_qlock);
1259           }
1260 
1261           return revents;
1262 }
1263 
1264 /*
1265  * Attach EVFILT_READ to the inotify instance in fp.
1266  *
1267  * This is so you can watch inotify with epoll.  No other kqueue
1268  * filter needs to be supported.
1269  */
1270 static int
inotify_kqfilter(file_t * fp,struct knote * kn)1271 inotify_kqfilter(file_t *fp, struct knote *kn)
1272 {
1273           struct inotifyfd *ifd = fp->f_data;
1274 
1275           KASSERT(fp == kn->kn_obj);
1276 
1277           if (kn->kn_filter != EVFILT_READ)
1278                     return EINVAL;
1279 
1280           kn->kn_fop = &inotify_read_filtops;
1281           mutex_enter(&ifd->ifd_lock);
1282           selrecord_knote(&ifd->ifd_sel, kn);
1283           mutex_exit(&ifd->ifd_lock);
1284 
1285           return 0;
1286 }
1287 
1288 /*
1289  * Detach a filter from an inotify instance.
1290  */
1291 static void
inotify_read_filt_detach(struct knote * kn)1292 inotify_read_filt_detach(struct knote *kn)
1293 {
1294           struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data;
1295 
1296           mutex_enter(&ifd->ifd_lock);
1297           selremove_knote(&ifd->ifd_sel, kn);
1298           mutex_exit(&ifd->ifd_lock);
1299 }
1300 
1301 /*
1302  * Handle EVFILT_READ events.  Note that nothing is put in kn_data.
1303  */
1304 static int
inotify_read_filt_event(struct knote * kn,long hint)1305 inotify_read_filt_event(struct knote *kn, long hint)
1306 {
1307           struct inotifyfd *ifd = ((file_t *)kn->kn_obj)->f_data;
1308 
1309           if (hint != 0) {
1310                     KASSERT(mutex_owned(&ifd->ifd_lock));
1311                     KASSERT(mutex_owned(&ifd->ifd_qlock));
1312                     KASSERT(hint == NOTE_LOWAT);
1313 
1314                     kn->kn_data = ifd->ifd_qcount;
1315           }
1316 
1317           return kn->kn_data > 0;
1318 }
1319 
1320 /*
1321  * Restart the inotify instance.
1322  */
1323 static void
inotify_restart(file_t * fp)1324 inotify_restart(file_t *fp)
1325 {
1326           struct inotifyfd *ifd = fp->f_data;
1327 
1328           mutex_enter(&ifd->ifd_qlock);
1329           cv_broadcast(&ifd->ifd_qcv);
1330           mutex_exit(&ifd->ifd_qlock);
1331 }
1332