1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2007 Roman Divacky
5 * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/callout.h>
32 #include <sys/capsicum.h>
33 #include <sys/errno.h>
34 #include <sys/event.h>
35 #include <sys/eventfd.h>
36 #include <sys/file.h>
37 #include <sys/filedesc.h>
38 #include <sys/filio.h>
39 #include <sys/limits.h>
40 #include <sys/lock.h>
41 #include <sys/mutex.h>
42 #include <sys/poll.h>
43 #include <sys/proc.h>
44 #include <sys/selinfo.h>
45 #include <sys/specialfd.h>
46 #include <sys/sx.h>
47 #include <sys/syscallsubr.h>
48 #include <sys/timespec.h>
49 #include <sys/user.h>
50
51 #ifdef COMPAT_LINUX32
52 #include <machine/../linux32/linux.h>
53 #include <machine/../linux32/linux32_proto.h>
54 #else
55 #include <machine/../linux/linux.h>
56 #include <machine/../linux/linux_proto.h>
57 #endif
58
59 #include <compat/linux/linux_emul.h>
60 #include <compat/linux/linux_event.h>
61 #include <compat/linux/linux_file.h>
62 #include <compat/linux/linux_signal.h>
63 #include <compat/linux/linux_time.h>
64 #include <compat/linux/linux_util.h>
65
/* 64-bit user value stored with an epoll registration and echoed back. */
typedef uint64_t epoll_udata_t;

/*
 * In-kernel image of the epoll event record that is moved to and from
 * userspace with copyin()/copyout().  It is packed on amd64 only.
 * NOTE(review): presumably this mirrors the Linux ABI layout -- confirm.
 */
struct epoll_event {
	uint32_t events;	/* LINUX_EPOLL* event mask */
	epoll_udata_t data;	/* user data, kept in kevent ext[0] */
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;

/* Largest maxevents for which the event array size still fits in an int. */
#define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
78
/* Conversion between Linux epoll events and native kevents. */
static int epoll_to_kevent(struct thread *td, int fd,
    struct epoll_event *l_event, struct kevent *kevent,
    int *nkevents);
static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
/* Copy callbacks plugged into struct kevent_copyops. */
static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
/* Helpers that submit, probe or delete kevents on an epoll's kqueue. */
static int epoll_register_kevent(struct thread *td, struct file *epfp,
    int fd, int filter, unsigned int flags);
static int epoll_fd_registered(struct thread *td, struct file *epfp,
    int fd);
static int epoll_delete_all_events(struct thread *td, struct file *epfp,
    int fd);
91
/* Cursor consumed by epoll_kev_copyin(): next in-kernel kevent to submit. */
struct epoll_copyin_args {
	struct kevent *changelist;
};

/* State shared between linux_epoll_wait_ts() and epoll_kev_copyout(). */
struct epoll_copyout_args {
	struct epoll_event *leventlist;	/* next user slot to fill */
	struct proc *p;			/* process of the waiting thread */
	uint32_t count;			/* events copied out so far */
	int error;			/* first copyout() failure, 0 if none */
};
102
/* timerfd */
/* Expiration counter type; this is the value handed out by timerfd_read(). */
typedef uint64_t timerfd_t;

/* File operation handlers for timerfd descriptors, defined further down. */
static fo_rdwr_t timerfd_read;
static fo_ioctl_t timerfd_ioctl;
static fo_poll_t timerfd_poll;
static fo_kqfilter_t timerfd_kqfilter;
static fo_stat_t timerfd_stat;
static fo_close_t timerfd_close;
static fo_fill_kinfo_t timerfd_fill_kinfo;

/*
 * File operations vector installed by linux_timerfd_create().  Write,
 * truncate, chmod, chown and sendfile are routed to the generic invfo_*
 * handlers (presumably "operation not supported" stubs -- confirm).
 */
static struct fileops timerfdops = {
	.fo_read = timerfd_read,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = timerfd_ioctl,
	.fo_poll = timerfd_poll,
	.fo_kqfilter = timerfd_kqfilter,
	.fo_stat = timerfd_stat,
	.fo_close = timerfd_close,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = timerfd_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE
};
129
static void filt_timerfddetach(struct knote *kn);
static int filt_timerfdread(struct knote *kn, long hint);

/* EVFILT_READ filter operations; attached to knotes by timerfd_kqfilter(). */
static struct filterops timerfd_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_timerfddetach,
	.f_event = filt_timerfdread
};
138
/* Per-descriptor timerfd state; hangs off the file's f_data. */
struct timerfd {
	clockid_t tfd_clockid;		/* CLOCK_REALTIME or CLOCK_MONOTONIC */
	struct itimerspec tfd_time;	/* absolute next expiry and interval */
	struct callout tfd_callout;	/* fires linux_timerfd_expire() */
	timerfd_t tfd_count;		/* expirations not yet read */
	bool tfd_canceled;		/* set when the timer is disarmed */
	struct selinfo tfd_sel;		/* poll/kqueue waiters */
	struct mtx tfd_lock;		/* lock for timer state, callout, knlist */
};

/* Callout handler and remaining-time helper, defined at the end of the file. */
static void linux_timerfd_expire(void *);
static void linux_timerfd_curval(struct timerfd *, struct itimerspec *);
151
152 static int
epoll_create_common(struct thread * td,int flags)153 epoll_create_common(struct thread *td, int flags)
154 {
155
156 return (kern_kqueue(td, flags, NULL));
157 }
158
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * epoll_create(): create an epoll instance.  Returns EINVAL for a
 * non-positive size, otherwise the result of epoll_create_common().
 */
int
linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
{

	/*
	 * The size argument is only validated; like Linux, we do not
	 * use its value for anything else.
	 */
	if (args->size > 0)
		return (epoll_create_common(td, 0));

	return (EINVAL);
}
#endif
174
175 int
linux_epoll_create1(struct thread * td,struct linux_epoll_create1_args * args)176 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
177 {
178 int flags;
179
180 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
181 return (EINVAL);
182
183 flags = 0;
184 if ((args->flags & LINUX_O_CLOEXEC) != 0)
185 flags |= O_CLOEXEC;
186
187 return (epoll_create_common(td, flags));
188 }
189
190 /* Structure converting function from epoll to kevent. */
191 static int
epoll_to_kevent(struct thread * td,int fd,struct epoll_event * l_event,struct kevent * kevent,int * nkevents)192 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
193 struct kevent *kevent, int *nkevents)
194 {
195 uint32_t levents = l_event->events;
196 struct linux_pemuldata *pem;
197 struct proc *p;
198 unsigned short kev_flags = EV_ADD | EV_ENABLE;
199
200 /* flags related to how event is registered */
201 if ((levents & LINUX_EPOLLONESHOT) != 0)
202 kev_flags |= EV_DISPATCH;
203 if ((levents & LINUX_EPOLLET) != 0)
204 kev_flags |= EV_CLEAR;
205 if ((levents & LINUX_EPOLLERR) != 0)
206 kev_flags |= EV_ERROR;
207 if ((levents & LINUX_EPOLLRDHUP) != 0)
208 kev_flags |= EV_EOF;
209
210 /* flags related to what event is registered */
211 if ((levents & LINUX_EPOLL_EVRD) != 0) {
212 EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
213 kevent->ext[0] = l_event->data;
214 ++kevent;
215 ++(*nkevents);
216 }
217 if ((levents & LINUX_EPOLL_EVWR) != 0) {
218 EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
219 kevent->ext[0] = l_event->data;
220 ++kevent;
221 ++(*nkevents);
222 }
223 /* zero event mask is legal */
224 if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
225 EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
226 ++(*nkevents);
227 }
228
229 if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
230 p = td->td_proc;
231
232 pem = pem_find(p);
233 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
234
235 LINUX_PEM_XLOCK(pem);
236 if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
237 pem->flags |= LINUX_XUNSUP_EPOLL;
238 LINUX_PEM_XUNLOCK(pem);
239 linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
240 levents);
241 } else
242 LINUX_PEM_XUNLOCK(pem);
243 return (EINVAL);
244 }
245
246 return (0);
247 }
248
249 /*
250 * Structure converting function from kevent to epoll. In a case
251 * this is called on error in registration we store the error in
252 * event->data and pick it up later in linux_epoll_ctl().
253 */
254 static void
kevent_to_epoll(struct kevent * kevent,struct epoll_event * l_event)255 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
256 {
257
258 l_event->data = kevent->ext[0];
259
260 if ((kevent->flags & EV_ERROR) != 0) {
261 l_event->events = LINUX_EPOLLERR;
262 return;
263 }
264
265 /* XXX EPOLLPRI, EPOLLHUP */
266 switch (kevent->filter) {
267 case EVFILT_READ:
268 l_event->events = LINUX_EPOLLIN;
269 if ((kevent->flags & EV_EOF) != 0)
270 l_event->events |= LINUX_EPOLLRDHUP;
271 break;
272 case EVFILT_WRITE:
273 l_event->events = LINUX_EPOLLOUT;
274 break;
275 }
276 }
277
278 /*
279 * Copyout callback used by kevent. This converts kevent
280 * events to epoll events and copies them back to the
281 * userspace. This is also called on error on registering
282 * of the filter.
283 */
284 static int
epoll_kev_copyout(void * arg,struct kevent * kevp,int count)285 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
286 {
287 struct epoll_copyout_args *args;
288 struct epoll_event *eep;
289 int error, i;
290
291 args = (struct epoll_copyout_args*) arg;
292 eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
293
294 for (i = 0; i < count; i++)
295 kevent_to_epoll(&kevp[i], &eep[i]);
296
297 error = copyout(eep, args->leventlist, count * sizeof(*eep));
298 if (error == 0) {
299 args->leventlist += count;
300 args->count += count;
301 } else if (args->error == 0)
302 args->error = error;
303
304 free(eep, M_EPOLL);
305 return (error);
306 }
307
308 /*
309 * Copyin callback used by kevent. This copies already
310 * converted filters from kernel memory to the kevent
311 * internal kernel memory. Hence the memcpy instead of
312 * copyin.
313 */
314 static int
epoll_kev_copyin(void * arg,struct kevent * kevp,int count)315 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
316 {
317 struct epoll_copyin_args *args;
318
319 args = (struct epoll_copyin_args*) arg;
320
321 memcpy(kevp, args->changelist, count * sizeof(*kevp));
322 args->changelist += count;
323
324 return (0);
325 }
326
327 /*
328 * Load epoll filter, convert it to kevent filter
329 * and load it into kevent subsystem.
330 */
331 int
linux_epoll_ctl(struct thread * td,struct linux_epoll_ctl_args * args)332 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
333 {
334 struct file *epfp, *fp;
335 struct epoll_copyin_args ciargs;
336 struct kevent kev[2];
337 struct kevent_copyops k_ops = { &ciargs,
338 NULL,
339 epoll_kev_copyin};
340 struct epoll_event le;
341 cap_rights_t rights;
342 int nchanges = 0;
343 int error;
344
345 if (args->op != LINUX_EPOLL_CTL_DEL) {
346 error = copyin(args->event, &le, sizeof(le));
347 if (error != 0)
348 return (error);
349 }
350
351 error = fget(td, args->epfd,
352 cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
353 if (error != 0)
354 return (error);
355 if (epfp->f_type != DTYPE_KQUEUE) {
356 error = EINVAL;
357 goto leave1;
358 }
359
360 /* Protect user data vector from incorrectly supplied fd. */
361 error = fget(td, args->fd,
362 cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
363 if (error != 0)
364 goto leave1;
365
366 /* Linux disallows spying on himself */
367 if (epfp == fp) {
368 error = EINVAL;
369 goto leave0;
370 }
371
372 ciargs.changelist = kev;
373
374 if (args->op != LINUX_EPOLL_CTL_DEL) {
375 error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
376 if (error != 0)
377 goto leave0;
378 }
379
380 switch (args->op) {
381 case LINUX_EPOLL_CTL_MOD:
382 error = epoll_delete_all_events(td, epfp, args->fd);
383 if (error != 0)
384 goto leave0;
385 break;
386
387 case LINUX_EPOLL_CTL_ADD:
388 if (epoll_fd_registered(td, epfp, args->fd)) {
389 error = EEXIST;
390 goto leave0;
391 }
392 break;
393
394 case LINUX_EPOLL_CTL_DEL:
395 /* CTL_DEL means unregister this fd with this epoll */
396 error = epoll_delete_all_events(td, epfp, args->fd);
397 goto leave0;
398
399 default:
400 error = EINVAL;
401 goto leave0;
402 }
403
404 error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
405
406 leave0:
407 fdrop(fp, td);
408
409 leave1:
410 fdrop(epfp, td);
411 return (error);
412 }
413
414 /*
415 * Wait for a filter to be triggered on the epoll file descriptor.
416 */
417
418 static int
linux_epoll_wait_ts(struct thread * td,int epfd,struct epoll_event * events,int maxevents,struct timespec * tsp,sigset_t * uset)419 linux_epoll_wait_ts(struct thread *td, int epfd, struct epoll_event *events,
420 int maxevents, struct timespec *tsp, sigset_t *uset)
421 {
422 struct epoll_copyout_args coargs;
423 struct kevent_copyops k_ops = { &coargs,
424 epoll_kev_copyout,
425 NULL};
426 cap_rights_t rights;
427 struct file *epfp;
428 sigset_t omask;
429 int error;
430
431 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
432 return (EINVAL);
433
434 error = fget(td, epfd,
435 cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
436 if (error != 0)
437 return (error);
438 if (epfp->f_type != DTYPE_KQUEUE) {
439 error = EINVAL;
440 goto leave;
441 }
442 if (uset != NULL) {
443 error = kern_sigprocmask(td, SIG_SETMASK, uset,
444 &omask, 0);
445 if (error != 0)
446 goto leave;
447 td->td_pflags |= TDP_OLDMASK;
448 /*
449 * Make sure that ast() is called on return to
450 * usermode and TDP_OLDMASK is cleared, restoring old
451 * sigmask.
452 */
453 thread_lock(td);
454 td->td_flags |= TDF_ASTPENDING;
455 thread_unlock(td);
456 }
457
458 coargs.leventlist = events;
459 coargs.p = td->td_proc;
460 coargs.count = 0;
461 coargs.error = 0;
462
463 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
464 if (error == 0 && coargs.error != 0)
465 error = coargs.error;
466
467 /*
468 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
469 * Maybe we should translate that but I don't think it matters at all.
470 */
471 if (error == 0)
472 td->td_retval[0] = coargs.count;
473
474 if (uset != NULL)
475 error = kern_sigprocmask(td, SIG_SETMASK, &omask,
476 NULL, 0);
477 leave:
478 fdrop(epfp, td);
479 return (error);
480 }
481
482 static int
linux_epoll_wait_common(struct thread * td,int epfd,struct epoll_event * events,int maxevents,int timeout,sigset_t * uset)483 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
484 int maxevents, int timeout, sigset_t *uset)
485 {
486 struct timespec ts, *tsp;
487
488 /*
489 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
490 * to block indefinitely. Real implementation does it if any negative
491 * timeout value is passed.
492 */
493 if (timeout >= 0) {
494 /* Convert from milliseconds to timespec. */
495 ts.tv_sec = timeout / 1000;
496 ts.tv_nsec = (timeout % 1000) * 1000000;
497 tsp = &ts;
498 } else {
499 tsp = NULL;
500 }
501 return (linux_epoll_wait_ts(td, epfd, events, maxevents, tsp, uset));
502
503 }
504
#ifdef LINUX_LEGACY_SYSCALLS
/* epoll_wait(): wait with a millisecond timeout and no signal mask. */
int
linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
{
	int error;

	error = linux_epoll_wait_common(td, args->epfd, args->events,
	    args->maxevents, args->timeout, NULL);
	return (error);
}
#endif
514
515 int
linux_epoll_pwait(struct thread * td,struct linux_epoll_pwait_args * args)516 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
517 {
518 sigset_t mask, *pmask;
519 int error;
520
521 error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
522 &mask, &pmask);
523 if (error != 0)
524 return (error);
525
526 return (linux_epoll_wait_common(td, args->epfd, args->events,
527 args->maxevents, args->timeout, pmask));
528 }
529
530 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
531 int
linux_epoll_pwait2_64(struct thread * td,struct linux_epoll_pwait2_64_args * args)532 linux_epoll_pwait2_64(struct thread *td, struct linux_epoll_pwait2_64_args *args)
533 {
534 struct timespec ts, *tsa;
535 sigset_t mask, *pmask;
536 int error;
537
538 error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
539 &mask, &pmask);
540 if (error != 0)
541 return (error);
542
543 if (args->timeout) {
544 error = linux_get_timespec64(&ts, args->timeout);
545 if (error != 0)
546 return (error);
547 tsa = &ts;
548 } else
549 tsa = NULL;
550
551 return (linux_epoll_wait_ts(td, args->epfd, args->events,
552 args->maxevents, tsa, pmask));
553 }
554 #else
555 int
linux_epoll_pwait2(struct thread * td,struct linux_epoll_pwait2_args * args)556 linux_epoll_pwait2(struct thread *td, struct linux_epoll_pwait2_args *args)
557 {
558 struct timespec ts, *tsa;
559 sigset_t mask, *pmask;
560 int error;
561
562 error = linux_copyin_sigset(td, args->mask, sizeof(l_sigset_t),
563 &mask, &pmask);
564 if (error != 0)
565 return (error);
566
567 if (args->timeout) {
568 error = linux_get_timespec(&ts, args->timeout);
569 if (error != 0)
570 return (error);
571 tsa = &ts;
572 } else
573 tsa = NULL;
574
575 return (linux_epoll_wait_ts(td, args->epfd, args->events,
576 args->maxevents, tsa, pmask));
577 }
578 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
579
580 static int
epoll_register_kevent(struct thread * td,struct file * epfp,int fd,int filter,unsigned int flags)581 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
582 unsigned int flags)
583 {
584 struct epoll_copyin_args ciargs;
585 struct kevent kev;
586 struct kevent_copyops k_ops = { &ciargs,
587 NULL,
588 epoll_kev_copyin};
589
590 ciargs.changelist = &kev;
591 EV_SET(&kev, fd, filter, flags, 0, 0, 0);
592
593 return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
594 }
595
596 static int
epoll_fd_registered(struct thread * td,struct file * epfp,int fd)597 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
598 {
599 /*
600 * Set empty filter flags to avoid accidental modification of already
601 * registered events. In the case of event re-registration:
602 * 1. If event does not exists kevent() does nothing and returns ENOENT
603 * 2. If event does exists, it's enabled/disabled state is preserved
604 * but fflags, data and udata fields are overwritten. So we can not
605 * set socket lowats and store user's context pointer in udata.
606 */
607 if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
608 epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
609 return (1);
610
611 return (0);
612 }
613
614 static int
epoll_delete_all_events(struct thread * td,struct file * epfp,int fd)615 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
616 {
617 int error1, error2;
618
619 error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
620 error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
621
622 /* return 0 if at least one result positive */
623 return (error1 == 0 ? 0 : error2);
624 }
625
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * eventfd(): create an eventfd with the given initial value and no
 * flags, using the native SPECIALFD_EVENTFD facility.
 */
int
linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
{
	struct specialfd_eventfd ae;
	int error;

	bzero(&ae, sizeof(ae));
	ae.initval = args->initval;

	error = kern_specialfd(td, SPECIALFD_EVENTFD, &ae);
	return (error);
}
#endif
637
638 int
linux_eventfd2(struct thread * td,struct linux_eventfd2_args * args)639 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
640 {
641 struct specialfd_eventfd ae;
642 int flags;
643
644 if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
645 LINUX_EFD_SEMAPHORE)) != 0)
646 return (EINVAL);
647 flags = 0;
648 if ((args->flags & LINUX_O_CLOEXEC) != 0)
649 flags |= EFD_CLOEXEC;
650 if ((args->flags & LINUX_O_NONBLOCK) != 0)
651 flags |= EFD_NONBLOCK;
652 if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
653 flags |= EFD_SEMAPHORE;
654
655 bzero(&ae, sizeof(ae));
656 ae.flags = flags;
657 ae.initval = args->initval;
658 return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
659 }
660
661 int
linux_timerfd_create(struct thread * td,struct linux_timerfd_create_args * args)662 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
663 {
664 struct timerfd *tfd;
665 struct file *fp;
666 clockid_t clockid;
667 int fflags, fd, error;
668
669 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
670 return (EINVAL);
671
672 error = linux_to_native_clockid(&clockid, args->clockid);
673 if (error != 0)
674 return (error);
675 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
676 return (EINVAL);
677
678 fflags = 0;
679 if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
680 fflags |= O_CLOEXEC;
681
682 error = falloc(td, &fp, &fd, fflags);
683 if (error != 0)
684 return (error);
685
686 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
687 tfd->tfd_clockid = clockid;
688 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
689
690 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
691 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
692
693 fflags = FREAD;
694 if ((args->flags & LINUX_O_NONBLOCK) != 0)
695 fflags |= FNONBLOCK;
696
697 finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
698 fdrop(fp, td);
699
700 td->td_retval[0] = fd;
701 return (error);
702 }
703
704 static int
timerfd_close(struct file * fp,struct thread * td)705 timerfd_close(struct file *fp, struct thread *td)
706 {
707 struct timerfd *tfd;
708
709 tfd = fp->f_data;
710 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
711 return (EINVAL);
712
713 timespecclear(&tfd->tfd_time.it_value);
714 timespecclear(&tfd->tfd_time.it_interval);
715
716 callout_drain(&tfd->tfd_callout);
717
718 seldrain(&tfd->tfd_sel);
719 knlist_destroy(&tfd->tfd_sel.si_note);
720
721 fp->f_ops = &badfileops;
722 mtx_destroy(&tfd->tfd_lock);
723 free(tfd, M_EPOLL);
724
725 return (0);
726 }
727
728 static int
timerfd_read(struct file * fp,struct uio * uio,struct ucred * active_cred,int flags,struct thread * td)729 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
730 int flags, struct thread *td)
731 {
732 struct timerfd *tfd;
733 timerfd_t count;
734 int error;
735
736 tfd = fp->f_data;
737 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
738 return (EINVAL);
739
740 if (uio->uio_resid < sizeof(timerfd_t))
741 return (EINVAL);
742
743 error = 0;
744 mtx_lock(&tfd->tfd_lock);
745 retry:
746 if (tfd->tfd_canceled) {
747 tfd->tfd_count = 0;
748 mtx_unlock(&tfd->tfd_lock);
749 return (ECANCELED);
750 }
751 if (tfd->tfd_count == 0) {
752 if ((fp->f_flag & FNONBLOCK) != 0) {
753 mtx_unlock(&tfd->tfd_lock);
754 return (EAGAIN);
755 }
756 error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
757 if (error == 0)
758 goto retry;
759 }
760 if (error == 0) {
761 count = tfd->tfd_count;
762 tfd->tfd_count = 0;
763 mtx_unlock(&tfd->tfd_lock);
764 error = uiomove(&count, sizeof(timerfd_t), uio);
765 } else
766 mtx_unlock(&tfd->tfd_lock);
767
768 return (error);
769 }
770
771 static int
timerfd_poll(struct file * fp,int events,struct ucred * active_cred,struct thread * td)772 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
773 struct thread *td)
774 {
775 struct timerfd *tfd;
776 int revents = 0;
777
778 tfd = fp->f_data;
779 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
780 return (POLLERR);
781
782 mtx_lock(&tfd->tfd_lock);
783 if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
784 revents |= events & (POLLIN|POLLRDNORM);
785 if (revents == 0)
786 selrecord(td, &tfd->tfd_sel);
787 mtx_unlock(&tfd->tfd_lock);
788
789 return (revents);
790 }
791
792 static int
timerfd_kqfilter(struct file * fp,struct knote * kn)793 timerfd_kqfilter(struct file *fp, struct knote *kn)
794 {
795 struct timerfd *tfd;
796
797 tfd = fp->f_data;
798 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
799 return (EINVAL);
800
801 if (kn->kn_filter == EVFILT_READ)
802 kn->kn_fop = &timerfd_rfiltops;
803 else
804 return (EINVAL);
805
806 kn->kn_hook = tfd;
807 knlist_add(&tfd->tfd_sel.si_note, kn, 0);
808
809 return (0);
810 }
811
812 static void
filt_timerfddetach(struct knote * kn)813 filt_timerfddetach(struct knote *kn)
814 {
815 struct timerfd *tfd = kn->kn_hook;
816
817 mtx_lock(&tfd->tfd_lock);
818 knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
819 mtx_unlock(&tfd->tfd_lock);
820 }
821
822 static int
filt_timerfdread(struct knote * kn,long hint)823 filt_timerfdread(struct knote *kn, long hint)
824 {
825 struct timerfd *tfd = kn->kn_hook;
826
827 return (tfd->tfd_count > 0);
828 }
829
830 static int
timerfd_ioctl(struct file * fp,u_long cmd,void * data,struct ucred * active_cred,struct thread * td)831 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
832 struct ucred *active_cred, struct thread *td)
833 {
834
835 if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
836 return (EINVAL);
837
838 switch (cmd) {
839 case FIONBIO:
840 case FIOASYNC:
841 return (0);
842 }
843
844 return (ENOTTY);
845 }
846
847 static int
timerfd_stat(struct file * fp,struct stat * st,struct ucred * active_cred,struct thread * td)848 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
849 struct thread *td)
850 {
851
852 return (ENXIO);
853 }
854
855 static int
timerfd_fill_kinfo(struct file * fp,struct kinfo_file * kif,struct filedesc * fdp)856 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
857 {
858
859 kif->kf_type = KF_TYPE_UNKNOWN;
860 return (0);
861 }
862
863 static void
linux_timerfd_clocktime(struct timerfd * tfd,struct timespec * ts)864 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
865 {
866
867 if (tfd->tfd_clockid == CLOCK_REALTIME)
868 getnanotime(ts);
869 else /* CLOCK_MONOTONIC */
870 getnanouptime(ts);
871 }
872
873 static void
linux_timerfd_curval(struct timerfd * tfd,struct itimerspec * ots)874 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
875 {
876 struct timespec cts;
877
878 linux_timerfd_clocktime(tfd, &cts);
879 *ots = tfd->tfd_time;
880 if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
881 timespecsub(&ots->it_value, &cts, &ots->it_value);
882 if (ots->it_value.tv_sec < 0 ||
883 (ots->it_value.tv_sec == 0 &&
884 ots->it_value.tv_nsec == 0)) {
885 ots->it_value.tv_sec = 0;
886 ots->it_value.tv_nsec = 1;
887 }
888 }
889 }
890
891 static int
linux_timerfd_gettime_common(struct thread * td,int fd,struct itimerspec * ots)892 linux_timerfd_gettime_common(struct thread *td, int fd, struct itimerspec *ots)
893 {
894 struct timerfd *tfd;
895 struct file *fp;
896 int error;
897
898 error = fget(td, fd, &cap_read_rights, &fp);
899 if (error != 0)
900 return (error);
901 tfd = fp->f_data;
902 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
903 error = EINVAL;
904 goto out;
905 }
906
907 mtx_lock(&tfd->tfd_lock);
908 linux_timerfd_curval(tfd, ots);
909 mtx_unlock(&tfd->tfd_lock);
910
911 out:
912 fdrop(fp, td);
913 return (error);
914 }
915
916 int
linux_timerfd_gettime(struct thread * td,struct linux_timerfd_gettime_args * args)917 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
918 {
919 struct l_itimerspec lots;
920 struct itimerspec ots;
921 int error;
922
923 error = linux_timerfd_gettime_common(td, args->fd, &ots);
924 if (error != 0)
925 return (error);
926 error = native_to_linux_itimerspec(&lots, &ots);
927 if (error == 0)
928 error = copyout(&lots, args->old_value, sizeof(lots));
929 return (error);
930 }
931
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * timerfd_gettime64(): same as linux_timerfd_gettime() but the result
 * is copied out in the 64-bit Linux itimerspec layout.
 */
int
linux_timerfd_gettime64(struct thread *td, struct linux_timerfd_gettime64_args *args)
{
	struct l_itimerspec64 lots;
	struct itimerspec ots;
	int error;

	error = linux_timerfd_gettime_common(td, args->fd, &ots);
	if (error != 0)
		return (error);

	error = native_to_linux_itimerspec64(&lots, &ots);
	if (error != 0)
		return (error);

	return (copyout(&lots, args->old_value, sizeof(lots)));
}
#endif
949
950 static int
linux_timerfd_settime_common(struct thread * td,int fd,int flags,struct itimerspec * nts,struct itimerspec * oval)951 linux_timerfd_settime_common(struct thread *td, int fd, int flags,
952 struct itimerspec *nts, struct itimerspec *oval)
953 {
954 struct timespec cts, ts;
955 struct timerfd *tfd;
956 struct timeval tv;
957 struct file *fp;
958 int error;
959
960 if ((flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
961 return (EINVAL);
962
963 error = fget(td, fd, &cap_write_rights, &fp);
964 if (error != 0)
965 return (error);
966 tfd = fp->f_data;
967 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
968 error = EINVAL;
969 goto out;
970 }
971
972 mtx_lock(&tfd->tfd_lock);
973 if (!timespecisset(&nts->it_value))
974 timespecclear(&nts->it_interval);
975 if (oval != NULL)
976 linux_timerfd_curval(tfd, oval);
977
978 bcopy(nts, &tfd->tfd_time, sizeof(*nts));
979 tfd->tfd_count = 0;
980 if (timespecisset(&nts->it_value)) {
981 linux_timerfd_clocktime(tfd, &cts);
982 ts = nts->it_value;
983 if ((flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
984 timespecadd(&tfd->tfd_time.it_value, &cts,
985 &tfd->tfd_time.it_value);
986 } else {
987 timespecsub(&ts, &cts, &ts);
988 }
989 TIMESPEC_TO_TIMEVAL(&tv, &ts);
990 callout_reset(&tfd->tfd_callout, tvtohz(&tv),
991 linux_timerfd_expire, tfd);
992 tfd->tfd_canceled = false;
993 } else {
994 tfd->tfd_canceled = true;
995 callout_stop(&tfd->tfd_callout);
996 }
997 mtx_unlock(&tfd->tfd_lock);
998
999 out:
1000 fdrop(fp, td);
1001 return (error);
1002 }
1003
1004 int
linux_timerfd_settime(struct thread * td,struct linux_timerfd_settime_args * args)1005 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
1006 {
1007 struct l_itimerspec lots;
1008 struct itimerspec nts, ots, *pots;
1009 int error;
1010
1011 error = copyin(args->new_value, &lots, sizeof(lots));
1012 if (error != 0)
1013 return (error);
1014 error = linux_to_native_itimerspec(&nts, &lots);
1015 if (error != 0)
1016 return (error);
1017 pots = (args->old_value != NULL ? &ots : NULL);
1018 error = linux_timerfd_settime_common(td, args->fd, args->flags,
1019 &nts, pots);
1020 if (error == 0 && args->old_value != NULL) {
1021 error = native_to_linux_itimerspec(&lots, &ots);
1022 if (error == 0)
1023 error = copyout(&lots, args->old_value, sizeof(lots));
1024 }
1025 return (error);
1026 }
1027
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * timerfd_settime64(): same as linux_timerfd_settime() but both
 * settings use the 64-bit Linux itimerspec layout.
 */
int
linux_timerfd_settime64(struct thread *td, struct linux_timerfd_settime64_args *args)
{
	struct l_itimerspec64 lots;
	struct itimerspec nts, ots, *pots;
	int error;

	error = copyin(args->new_value, &lots, sizeof(lots));
	if (error != 0)
		return (error);
	error = linux_to_native_itimerspec64(&nts, &lots);
	if (error != 0)
		return (error);

	/* Only ask for the old value when the caller wants it. */
	pots = NULL;
	if (args->old_value != NULL)
		pots = &ots;

	error = linux_timerfd_settime_common(td, args->fd, args->flags,
	    &nts, pots);
	if (error != 0 || pots == NULL)
		return (error);

	error = native_to_linux_itimerspec64(&lots, &ots);
	if (error == 0)
		error = copyout(&lots, args->old_value, sizeof(lots));

	return (error);
}
#endif
1053
1054 static void
linux_timerfd_expire(void * arg)1055 linux_timerfd_expire(void *arg)
1056 {
1057 struct timespec cts, ts;
1058 struct timeval tv;
1059 struct timerfd *tfd;
1060
1061 tfd = (struct timerfd *)arg;
1062
1063 linux_timerfd_clocktime(tfd, &cts);
1064 if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
1065 if (timespecisset(&tfd->tfd_time.it_interval))
1066 timespecadd(&tfd->tfd_time.it_value,
1067 &tfd->tfd_time.it_interval,
1068 &tfd->tfd_time.it_value);
1069 else
1070 /* single shot timer */
1071 timespecclear(&tfd->tfd_time.it_value);
1072 if (timespecisset(&tfd->tfd_time.it_value)) {
1073 timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1074 TIMESPEC_TO_TIMEVAL(&tv, &ts);
1075 callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1076 linux_timerfd_expire, tfd);
1077 }
1078 tfd->tfd_count++;
1079 KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
1080 selwakeup(&tfd->tfd_sel);
1081 wakeup(&tfd->tfd_count);
1082 } else if (timespecisset(&tfd->tfd_time.it_value)) {
1083 timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
1084 TIMESPEC_TO_TIMEVAL(&tv, &ts);
1085 callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1086 linux_timerfd_expire, tfd);
1087 }
1088 }
1089