1 /*        $NetBSD: sys_eventfd.c,v 1.11 2023/11/19 17:16:00 riastradh Exp $     */
2 
3 /*-
4  * Copyright (c) 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_eventfd.c,v 1.11 2023/11/19 17:16:00 riastradh Exp $");
34 
35 /*
36  * eventfd
37  *
38  * Eventfd objects present a simple counting object associated with a
39  * file descriptor.  Writes and reads to this file descriptor increment
40  * and decrement the count, respectively.  When the count is non-zero,
41  * the descriptor is considered "readable", and when less than the max
42  * value (EVENTFD_MAXVAL), is considered "writable".
43  *
44  * This implementation is API compatible with the Linux eventfd(2)
45  * interface.
46  */
47 
48 #include <sys/param.h>
49 #include <sys/types.h>
50 #include <sys/condvar.h>
51 #include <sys/eventfd.h>
52 #include <sys/file.h>
53 #include <sys/filedesc.h>
54 #include <sys/kauth.h>
55 #include <sys/mutex.h>
56 #include <sys/poll.h>
57 #include <sys/proc.h>
58 #include <sys/select.h>
59 #include <sys/stat.h>
60 #include <sys/syscallargs.h>
61 #include <sys/uio.h>
62 
63 struct eventfd {
64           kmutex_t  efd_lock;
65           kcondvar_t          efd_read_wait;
66           kcondvar_t          efd_write_wait;
67           struct selinfo      efd_read_sel;
68           struct selinfo      efd_write_sel;
69           eventfd_t efd_val;
70           int64_t             efd_nwaiters;
71           bool                efd_restarting;
72           bool                efd_is_semaphore;
73 
74           /*
75            * Information kept for stat(2).
76            */
77           struct timespec efd_btime;    /* time created */
78           struct timespec     efd_mtime;          /* last write */
79           struct timespec     efd_atime;          /* last read */
80 };
81 
82 #define   EVENTFD_MAXVAL      (UINT64_MAX - 1)
83 
84 /*
85  * eventfd_create:
86  *
87  *        Create an eventfd object.
88  */
89 static struct eventfd *
eventfd_create(unsigned int const val,int const flags)90 eventfd_create(unsigned int const val, int const flags)
91 {
92           struct eventfd * const efd = kmem_zalloc(sizeof(*efd), KM_SLEEP);
93 
94           mutex_init(&efd->efd_lock, MUTEX_DEFAULT, IPL_NONE);
95           cv_init(&efd->efd_read_wait, "efdread");
96           cv_init(&efd->efd_write_wait, "efdwrite");
97           selinit(&efd->efd_read_sel);
98           selinit(&efd->efd_write_sel);
99           efd->efd_val = val;
100           efd->efd_is_semaphore = !!(flags & EFD_SEMAPHORE);
101           getnanotime(&efd->efd_btime);
102 
103           /* Caller deals with EFD_CLOEXEC and EFD_NONBLOCK. */
104 
105           return efd;
106 }
107 
108 /*
109  * eventfd_destroy:
110  *
111  *        Destroy an eventfd object.
112  */
113 static void
eventfd_destroy(struct eventfd * const efd)114 eventfd_destroy(struct eventfd * const efd)
115 {
116 
117           KASSERT(efd->efd_nwaiters == 0);
118 
119           cv_destroy(&efd->efd_read_wait);
120           cv_destroy(&efd->efd_write_wait);
121 
122           seldestroy(&efd->efd_read_sel);
123           seldestroy(&efd->efd_write_sel);
124 
125           mutex_destroy(&efd->efd_lock);
126 
127           kmem_free(efd, sizeof(*efd));
128 }
129 
130 /*
131  * eventfd_wait:
132  *
133  *        Block on an eventfd.  Handles non-blocking, as well as
134  *        the restart cases.
135  */
136 static int
eventfd_wait(struct eventfd * const efd,int const fflag,bool const is_write)137 eventfd_wait(struct eventfd * const efd, int const fflag, bool const is_write)
138 {
139           kcondvar_t *waitcv;
140           int error;
141 
142           if (fflag & FNONBLOCK) {
143                     return EAGAIN;
144           }
145 
146           /*
147            * We're going to block.  Check if we need to return ERESTART.
148            */
149           if (efd->efd_restarting) {
150                     return ERESTART;
151           }
152 
153           if (is_write) {
154                     waitcv = &efd->efd_write_wait;
155           } else {
156                     waitcv = &efd->efd_read_wait;
157           }
158 
159           efd->efd_nwaiters++;
160           KASSERT(efd->efd_nwaiters > 0);
161           error = cv_wait_sig(waitcv, &efd->efd_lock);
162           efd->efd_nwaiters--;
163           KASSERT(efd->efd_nwaiters >= 0);
164 
165           /*
166            * If a restart was triggered while we were asleep, we need
167            * to return ERESTART if no other error was returned.
168            */
169           if (efd->efd_restarting) {
170                     if (error == 0) {
171                               error = ERESTART;
172                     }
173           }
174 
175           return error;
176 }
177 
178 /*
179  * eventfd_wake:
180  *
181  *        Wake LWPs block on an eventfd.
182  */
183 static void
eventfd_wake(struct eventfd * const efd,bool const is_write)184 eventfd_wake(struct eventfd * const efd, bool const is_write)
185 {
186           kcondvar_t *waitcv = NULL;
187           struct selinfo *sel;
188           int pollev;
189 
190           if (is_write) {
191                     waitcv = &efd->efd_read_wait;
192                     sel = &efd->efd_read_sel;
193                     pollev = POLLIN | POLLRDNORM;
194           } else {
195                     waitcv = &efd->efd_write_wait;
196                     sel = &efd->efd_write_sel;
197                     pollev = POLLOUT | POLLWRNORM;
198           }
199           cv_broadcast(waitcv);
200           selnotify(sel, pollev, NOTE_SUBMIT);
201 }
202 
203 /*
204  * eventfd file operations
205  */
206 
207 static int
eventfd_fop_read(file_t * const fp,off_t * const offset,struct uio * const uio,kauth_cred_t const cred,int const flags)208 eventfd_fop_read(file_t * const fp, off_t * const offset,
209     struct uio * const uio, kauth_cred_t const cred, int const flags)
210 {
211           struct eventfd * const efd = fp->f_eventfd;
212           int const fflag = fp->f_flag;
213           eventfd_t return_value;
214           int error;
215 
216           if (uio->uio_resid < sizeof(eventfd_t)) {
217                     return EINVAL;
218           }
219 
220           mutex_enter(&efd->efd_lock);
221 
222           while (efd->efd_val == 0) {
223                     if ((error = eventfd_wait(efd, fflag, false)) != 0) {
224                               mutex_exit(&efd->efd_lock);
225                               return error;
226                     }
227           }
228 
229           if (efd->efd_is_semaphore) {
230                     return_value = 1;
231                     efd->efd_val--;
232           } else {
233                     return_value = efd->efd_val;
234                     efd->efd_val = 0;
235           }
236 
237           getnanotime(&efd->efd_atime);
238           eventfd_wake(efd, false);
239 
240           mutex_exit(&efd->efd_lock);
241 
242           error = uiomove(&return_value, sizeof(return_value), uio);
243 
244           return error;
245 }
246 
247 static int
eventfd_fop_write(file_t * const fp,off_t * const offset,struct uio * const uio,kauth_cred_t const cred,int const flags)248 eventfd_fop_write(file_t * const fp, off_t * const offset,
249     struct uio * const uio, kauth_cred_t const cred, int const flags)
250 {
251           struct eventfd * const efd = fp->f_eventfd;
252           int const fflag = fp->f_flag;
253           eventfd_t write_value;
254           int error;
255 
256           if (uio->uio_resid < sizeof(eventfd_t)) {
257                     return EINVAL;
258           }
259 
260           if ((error = uiomove(&write_value, sizeof(write_value), uio)) != 0) {
261                     return error;
262           }
263 
264           if (write_value > EVENTFD_MAXVAL) {
265                     error = EINVAL;
266                     goto out;
267           }
268 
269           mutex_enter(&efd->efd_lock);
270 
271           KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
272           while ((EVENTFD_MAXVAL - efd->efd_val) < write_value) {
273                     if ((error = eventfd_wait(efd, fflag, true)) != 0) {
274                               mutex_exit(&efd->efd_lock);
275                               goto out;
276                     }
277           }
278 
279           efd->efd_val += write_value;
280           KASSERT(efd->efd_val <= EVENTFD_MAXVAL);
281 
282           getnanotime(&efd->efd_mtime);
283           eventfd_wake(efd, true);
284 
285           mutex_exit(&efd->efd_lock);
286 
287  out:
288           if (error) {
289                     /*
290                      * Undo the effect of uiomove() so that the error
291                      * gets reported correctly; see dofilewrite().
292                      */
293                     uio->uio_resid += sizeof(write_value);
294           }
295           return error;
296 }
297 
298 static int
eventfd_ioctl(file_t * const fp,u_long const cmd,void * const data)299 eventfd_ioctl(file_t * const fp, u_long const cmd, void * const data)
300 {
301           struct eventfd * const efd = fp->f_eventfd;
302 
303           switch (cmd) {
304           case FIONBIO:
305                     return 0;
306 
307           case FIONREAD:
308                     mutex_enter(&efd->efd_lock);
309                     *(int *)data = efd->efd_val != 0 ? sizeof(eventfd_t) : 0;
310                     mutex_exit(&efd->efd_lock);
311                     return 0;
312 
313           case FIONWRITE:
314                     *(int *)data = 0;
315                     return 0;
316 
317           case FIONSPACE:
318                     /*
319                      * FIONSPACE doesn't really work for eventfd, because the
320                      * writability depends on the contents (value) being written.
321                      */
322                     break;
323 
324           default:
325                     break;
326           }
327 
328           return EPASSTHROUGH;
329 }
330 
331 static int
eventfd_fop_poll(file_t * const fp,int const events)332 eventfd_fop_poll(file_t * const fp, int const events)
333 {
334           struct eventfd * const efd = fp->f_eventfd;
335           int revents = 0;
336 
337           /*
338            * Note that Linux will return POLLERR if the eventfd count
339            * overflows, but that is not possible in the normal read/write
340            * API, only with Linux kernel-internal interfaces.  So, this
341            * implementation never returns POLLERR.
342            *
343            * Also note that the Linux eventfd(2) man page does not
344            * specifically discuss returning POLLRDNORM, but we check
345            * for that event in addition to POLLIN.
346            */
347 
348           mutex_enter(&efd->efd_lock);
349 
350           if (events & (POLLIN | POLLRDNORM)) {
351                     if (efd->efd_val != 0) {
352                               revents |= events & (POLLIN | POLLRDNORM);
353                     } else {
354                               selrecord(curlwp, &efd->efd_read_sel);
355                     }
356           }
357 
358           if (events & (POLLOUT | POLLWRNORM)) {
359                     if (efd->efd_val < EVENTFD_MAXVAL) {
360                               revents |= events & (POLLOUT | POLLWRNORM);
361                     } else {
362                               selrecord(curlwp, &efd->efd_write_sel);
363                     }
364           }
365 
366           mutex_exit(&efd->efd_lock);
367 
368           return revents;
369 }
370 
371 static int
eventfd_fop_stat(file_t * const fp,struct stat * const st)372 eventfd_fop_stat(file_t * const fp, struct stat * const st)
373 {
374           struct eventfd * const efd = fp->f_eventfd;
375 
376           memset(st, 0, sizeof(*st));
377 
378           mutex_enter(&efd->efd_lock);
379           st->st_size = (off_t)efd->efd_val;
380           st->st_blksize = sizeof(eventfd_t);
381           st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
382           st->st_blocks = 1;
383           st->st_birthtimespec = st->st_ctimespec = efd->efd_btime;
384           st->st_atimespec = efd->efd_atime;
385           st->st_mtimespec = efd->efd_mtime;
386           st->st_uid = kauth_cred_geteuid(fp->f_cred);
387           st->st_gid = kauth_cred_getegid(fp->f_cred);
388           mutex_exit(&efd->efd_lock);
389 
390           return 0;
391 }
392 
393 static int
eventfd_fop_close(file_t * const fp)394 eventfd_fop_close(file_t * const fp)
395 {
396           struct eventfd * const efd = fp->f_eventfd;
397 
398           fp->f_eventfd = NULL;
399           eventfd_destroy(efd);
400 
401           return 0;
402 }
403 
404 static void
eventfd_filt_read_detach(struct knote * const kn)405 eventfd_filt_read_detach(struct knote * const kn)
406 {
407           struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
408 
409           mutex_enter(&efd->efd_lock);
410           KASSERT(kn->kn_hook == efd);
411           selremove_knote(&efd->efd_read_sel, kn);
412           mutex_exit(&efd->efd_lock);
413 }
414 
415 static int
eventfd_filt_read(struct knote * const kn,long const hint)416 eventfd_filt_read(struct knote * const kn, long const hint)
417 {
418           struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
419           int rv;
420 
421           if (hint & NOTE_SUBMIT) {
422                     KASSERT(mutex_owned(&efd->efd_lock));
423           } else {
424                     mutex_enter(&efd->efd_lock);
425           }
426 
427           kn->kn_data = (int64_t)efd->efd_val;
428           rv = (eventfd_t)kn->kn_data > 0;
429 
430           if ((hint & NOTE_SUBMIT) == 0) {
431                     mutex_exit(&efd->efd_lock);
432           }
433 
434           return rv;
435 }
436 
437 static const struct filterops eventfd_read_filterops = {
438           .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
439           .f_detach = eventfd_filt_read_detach,
440           .f_event = eventfd_filt_read,
441 };
442 
443 static void
eventfd_filt_write_detach(struct knote * const kn)444 eventfd_filt_write_detach(struct knote * const kn)
445 {
446           struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
447 
448           mutex_enter(&efd->efd_lock);
449           KASSERT(kn->kn_hook == efd);
450           selremove_knote(&efd->efd_write_sel, kn);
451           mutex_exit(&efd->efd_lock);
452 }
453 
454 static int
eventfd_filt_write(struct knote * const kn,long const hint)455 eventfd_filt_write(struct knote * const kn, long const hint)
456 {
457           struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
458           int rv;
459 
460           if (hint & NOTE_SUBMIT) {
461                     KASSERT(mutex_owned(&efd->efd_lock));
462           } else {
463                     mutex_enter(&efd->efd_lock);
464           }
465 
466           kn->kn_data = (int64_t)efd->efd_val;
467           rv = (eventfd_t)kn->kn_data < EVENTFD_MAXVAL;
468 
469           if ((hint & NOTE_SUBMIT) == 0) {
470                     mutex_exit(&efd->efd_lock);
471           }
472 
473           return rv;
474 }
475 
476 static const struct filterops eventfd_write_filterops = {
477           .f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
478           .f_detach = eventfd_filt_write_detach,
479           .f_event = eventfd_filt_write,
480 };
481 
482 static int
eventfd_fop_kqfilter(file_t * const fp,struct knote * const kn)483 eventfd_fop_kqfilter(file_t * const fp, struct knote * const kn)
484 {
485           struct eventfd * const efd = ((file_t *)kn->kn_obj)->f_eventfd;
486           struct selinfo *sel;
487 
488           switch (kn->kn_filter) {
489           case EVFILT_READ:
490                     sel = &efd->efd_read_sel;
491                     kn->kn_fop = &eventfd_read_filterops;
492                     break;
493 
494           case EVFILT_WRITE:
495                     sel = &efd->efd_write_sel;
496                     kn->kn_fop = &eventfd_write_filterops;
497                     break;
498 
499           default:
500                     return EINVAL;
501           }
502 
503           kn->kn_hook = efd;
504 
505           mutex_enter(&efd->efd_lock);
506           selrecord_knote(sel, kn);
507           mutex_exit(&efd->efd_lock);
508 
509           return 0;
510 }
511 
512 static void
eventfd_fop_restart(file_t * const fp)513 eventfd_fop_restart(file_t * const fp)
514 {
515           struct eventfd * const efd = fp->f_eventfd;
516 
517           /*
518            * Unblock blocked reads/writes in order to allow close() to complete.
519            * System calls return ERESTART so that the fd is revalidated.
520            */
521 
522           mutex_enter(&efd->efd_lock);
523 
524           if (efd->efd_nwaiters != 0) {
525                     efd->efd_restarting = true;
526                     cv_broadcast(&efd->efd_read_wait);
527                     cv_broadcast(&efd->efd_write_wait);
528           }
529 
530           mutex_exit(&efd->efd_lock);
531 }
532 
533 static const struct fileops eventfd_fileops = {
534           .fo_name = "eventfd",
535           .fo_read = eventfd_fop_read,
536           .fo_write = eventfd_fop_write,
537           .fo_ioctl = eventfd_ioctl,
538           .fo_fcntl = fnullop_fcntl,
539           .fo_poll = eventfd_fop_poll,
540           .fo_stat = eventfd_fop_stat,
541           .fo_close = eventfd_fop_close,
542           .fo_kqfilter = eventfd_fop_kqfilter,
543           .fo_restart = eventfd_fop_restart,
544 };
545 
546 /*
547  * eventfd(2) system call
548  */
549 int
do_eventfd(struct lwp * const l,unsigned int const val,int const flags,register_t * retval)550 do_eventfd(struct lwp * const l, unsigned int const val, int const flags,
551     register_t *retval)
552 {
553           file_t *fp;
554           int fd, error;
555 
556           if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK | EFD_SEMAPHORE)) {
557                     return EINVAL;
558           }
559 
560           if ((error = fd_allocfile(&fp, &fd)) != 0) {
561                     return error;
562           }
563 
564           fp->f_flag = FREAD | FWRITE;
565           if (flags & EFD_NONBLOCK) {
566                     fp->f_flag |= FNONBLOCK;
567           }
568           fp->f_type = DTYPE_EVENTFD;
569           fp->f_ops = &eventfd_fileops;
570           fp->f_eventfd = eventfd_create(val, flags);
571           fd_set_exclose(l, fd, !!(flags & EFD_CLOEXEC));
572           fd_affix(curproc, fp, fd);
573 
574           *retval = fd;
575           return 0;
576 }
577 
578 int
sys_eventfd(struct lwp * l,const struct sys_eventfd_args * uap,register_t * retval)579 sys_eventfd(struct lwp *l, const struct sys_eventfd_args *uap,
580     register_t *retval)
581 {
582           /* {
583                     syscallarg(unsigned int) val;
584                     syscallarg(int) flags;
585           } */
586 
587           return do_eventfd(l, SCARG(uap, val), SCARG(uap, flags), retval);
588 }
589