1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
5 * Copyright (c) 2008 Roman Divacky
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/imgact.h>
32 #include <sys/imgact_elf.h>
33 #include <sys/ktr.h>
34 #include <sys/lock.h>
35 #include <sys/mutex.h>
36 #include <sys/priv.h>
37 #include <sys/proc.h>
38 #include <sys/sched.h>
39 #include <sys/sysent.h>
40 #include <sys/vnode.h>
41 #include <sys/umtxvar.h>
42
43 #ifdef COMPAT_LINUX32
44 #include <machine/../linux32/linux.h>
45 #include <machine/../linux32/linux32_proto.h>
46 #else
47 #include <machine/../linux/linux.h>
48 #include <machine/../linux/linux_proto.h>
49 #endif
50 #include <compat/linux/linux_emul.h>
51 #include <compat/linux/linux_futex.h>
52 #include <compat/linux/linux_misc.h>
53 #include <compat/linux/linux_time.h>
54 #include <compat/linux/linux_util.h>
55
#define	FUTEX_SHARED	0x8     /* shared futex */
#define	FUTEX_UNOWNED	0

/*
 * Select the umtx sharing mode for the futex request pointed to by `a':
 * AUTO_SHARE when its FUTEX_SHARED flag is set, THREAD_SHARE otherwise.
 * Both the argument and the whole expansion are parenthesized so that the
 * macro can be given any pointer expression and used inside a larger
 * expression without changing its meaning.
 */
#define	GET_SHARED(a)							\
	((((a)->flags & FUTEX_SHARED) != 0) ? AUTO_SHARE : THREAD_SHARE)
60
/* Forward declarations: WAKE_OP decoder and robust-list helpers. */
static int	futex_atomic_op(struct thread *td, int encoded_op,
		    uint32_t *uaddr, int *res);
static int	handle_futex_death(struct thread *td,
		    struct linux_emuldata *em, uint32_t *uaddr,
		    unsigned int pi, bool pending_op);
static int	fetch_robust_entry(struct linux_robust_list **entry,
		    struct linux_robust_list **head, unsigned int *pi);
66
/*
 * In-kernel form of one futex request, filled in by the syscall entry
 * points (or by futex_wake()/futex_wake_pi()) and consumed by linux_futex()
 * and the per-operation handlers.
 */
struct linux_futex_args {
	uint32_t	*uaddr;		/* user address of the futex word */
	int32_t		op;		/* operation; modifier bits are stripped by linux_futex() */
	uint32_t	flags;		/* FUTEX_SHARED or 0 */
	bool		clockrt;	/* measure the timeout against CLOCK_REALTIME */
	uint32_t	val;		/* expected value (wait) or wake count */
	struct timespec	*ts;		/* timeout, or a raw integer argument for requeue/wake_op */
	uint32_t	*uaddr2;	/* second futex word (requeue, wake_op) */
	uint32_t	val3;		/* bitset, expected value or encoded wake_op */
	bool		val3_compare;	/* requeue: compare *uaddr against val3 first */
	struct timespec	kts;		/* kernel copy of the timeout that ts points at */
};
79
/* Forward declarations: key lookup, timeout setup and per-operation handlers. */
static inline int futex_key_get(const void *uaddr, int type, int share,
		    struct umtx_key *key);
static void	linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
		    struct linux_futex_args *args);
static int	linux_futex(struct thread *td, struct linux_futex_args *args);
static int	linux_futex_wait(struct thread *td,
		    struct linux_futex_args *args);
static int	linux_futex_wake(struct thread *td,
		    struct linux_futex_args *args);
static int	linux_futex_requeue(struct thread *td,
		    struct linux_futex_args *args);
static int	linux_futex_wakeop(struct thread *td,
		    struct linux_futex_args *args);
static int	linux_futex_lock_pi(struct thread *td, bool try,
		    struct linux_futex_args *args);
static int	linux_futex_unlock_pi(struct thread *td, bool rb,
		    struct linux_futex_args *args);
static int	futex_wake_pi(struct thread *td, uint32_t *uaddr,
		    bool shared);
92
/*
 * Build the umtx key for the futex word at user address uaddr.
 *
 * Returns EINVAL when uaddr is not aligned to sizeof(uint32_t); otherwise
 * returns whatever umtx_key_get() returns for the same arguments.
 */
static int
futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
{

	if (__is_aligned(uaddr, sizeof(uint32_t)))
		return (umtx_key_get(uaddr, type, share, key));

	/* A futex word must be 32-bit aligned. */
	return (EINVAL);
}
102
103 int
futex_wake(struct thread * td,uint32_t * uaddr,int val,bool shared)104 futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
105 {
106 struct linux_futex_args args;
107
108 bzero(&args, sizeof(args));
109 args.op = LINUX_FUTEX_WAKE;
110 args.uaddr = uaddr;
111 args.flags = shared == true ? FUTEX_SHARED : 0;
112 args.val = val;
113 args.val3 = FUTEX_BITSET_MATCH_ANY;
114
115 return (linux_futex_wake(td, &args));
116 }
117
118 static int
futex_wake_pi(struct thread * td,uint32_t * uaddr,bool shared)119 futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
120 {
121 struct linux_futex_args args;
122
123 bzero(&args, sizeof(args));
124 args.op = LINUX_FUTEX_UNLOCK_PI;
125 args.uaddr = uaddr;
126 args.flags = shared == true ? FUTEX_SHARED : 0;
127
128 return (linux_futex_unlock_pi(td, true, &args));
129 }
130
131 static int
futex_atomic_op(struct thread * td,int encoded_op,uint32_t * uaddr,int * res)132 futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
133 int *res)
134 {
135 int op = (encoded_op >> 28) & 7;
136 int cmp = (encoded_op >> 24) & 15;
137 int oparg = (encoded_op << 8) >> 20;
138 int cmparg = (encoded_op << 20) >> 20;
139 int oldval = 0, ret;
140
141 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
142 oparg = 1 << oparg;
143
144 switch (op) {
145 case FUTEX_OP_SET:
146 ret = futex_xchgl(oparg, uaddr, &oldval);
147 break;
148 case FUTEX_OP_ADD:
149 ret = futex_addl(oparg, uaddr, &oldval);
150 break;
151 case FUTEX_OP_OR:
152 ret = futex_orl(oparg, uaddr, &oldval);
153 break;
154 case FUTEX_OP_ANDN:
155 ret = futex_andl(~oparg, uaddr, &oldval);
156 break;
157 case FUTEX_OP_XOR:
158 ret = futex_xorl(oparg, uaddr, &oldval);
159 break;
160 default:
161 ret = ENOSYS;
162 break;
163 }
164
165 if (ret != 0)
166 return (ret);
167
168 switch (cmp) {
169 case FUTEX_OP_CMP_EQ:
170 *res = (oldval == cmparg);
171 break;
172 case FUTEX_OP_CMP_NE:
173 *res = (oldval != cmparg);
174 break;
175 case FUTEX_OP_CMP_LT:
176 *res = (oldval < cmparg);
177 break;
178 case FUTEX_OP_CMP_GE:
179 *res = (oldval >= cmparg);
180 break;
181 case FUTEX_OP_CMP_LE:
182 *res = (oldval <= cmparg);
183 break;
184 case FUTEX_OP_CMP_GT:
185 *res = (oldval > cmparg);
186 break;
187 default:
188 ret = ENOSYS;
189 }
190
191 return (ret);
192 }
193
194 static int
linux_futex(struct thread * td,struct linux_futex_args * args)195 linux_futex(struct thread *td, struct linux_futex_args *args)
196 {
197 struct linux_pemuldata *pem;
198 struct proc *p;
199
200 if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
201 args->flags = 0;
202 args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
203 } else
204 args->flags = FUTEX_SHARED;
205
206 args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
207 args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;
208
209 if (args->clockrt &&
210 args->op != LINUX_FUTEX_WAIT_BITSET &&
211 args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
212 args->op != LINUX_FUTEX_LOCK_PI2)
213 return (ENOSYS);
214
215 switch (args->op) {
216 case LINUX_FUTEX_WAIT:
217 args->val3 = FUTEX_BITSET_MATCH_ANY;
218 /* FALLTHROUGH */
219
220 case LINUX_FUTEX_WAIT_BITSET:
221 LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
222 args->uaddr, args->val, args->val3);
223
224 return (linux_futex_wait(td, args));
225
226 case LINUX_FUTEX_WAKE:
227 args->val3 = FUTEX_BITSET_MATCH_ANY;
228 /* FALLTHROUGH */
229
230 case LINUX_FUTEX_WAKE_BITSET:
231 LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
232 args->uaddr, args->val, args->val3);
233
234 return (linux_futex_wake(td, args));
235
236 case LINUX_FUTEX_REQUEUE:
237 /*
238 * Glibc does not use this operation since version 2.3.3,
239 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
240 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
241 * FUTEX_REQUEUE returned EINVAL.
242 */
243 pem = pem_find(td->td_proc);
244 if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
245 linux_msg(td, "unsupported FUTEX_REQUEUE");
246 pem->flags |= LINUX_XDEPR_REQUEUEOP;
247 }
248
249 /*
250 * The above is true, however musl libc does make use of the
251 * futex requeue operation, allow operation for brands which
252 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
253 */
254 p = td->td_proc;
255 Elf_Brandinfo *bi = p->p_elf_brandinfo;
256 if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
257 return (EINVAL);
258 args->val3_compare = false;
259 /* FALLTHROUGH */
260
261 case LINUX_FUTEX_CMP_REQUEUE:
262 LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
263 "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
264 args->uaddr, args->val, args->val3, args->uaddr2,
265 args->ts);
266
267 return (linux_futex_requeue(td, args));
268
269 case LINUX_FUTEX_WAKE_OP:
270 LINUX_CTR5(sys_futex, "WAKE_OP "
271 "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
272 args->uaddr, args->val, args->uaddr2, args->val3,
273 args->ts);
274
275 return (linux_futex_wakeop(td, args));
276
277 case LINUX_FUTEX_LOCK_PI:
278 args->clockrt = true;
279 /* FALLTHROUGH */
280
281 case LINUX_FUTEX_LOCK_PI2:
282 LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
283 args->uaddr, args->val);
284
285 return (linux_futex_lock_pi(td, false, args));
286
287 case LINUX_FUTEX_UNLOCK_PI:
288 LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
289 args->uaddr);
290
291 return (linux_futex_unlock_pi(td, false, args));
292
293 case LINUX_FUTEX_TRYLOCK_PI:
294 LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
295 args->uaddr);
296
297 return (linux_futex_lock_pi(td, true, args));
298
299 /*
300 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
301 * can't be used anymore to implement conditional variables.
302 * A detailed explanation can be found here:
303 *
304 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
305 * and here http://austingroupbugs.net/view.php?id=609
306 *
307 * And since commit
308 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
309 * glibc does not use them.
310 */
311 case LINUX_FUTEX_WAIT_REQUEUE_PI:
312 /* not yet implemented */
313 pem = pem_find(td->td_proc);
314 if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
315 linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
316 pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
317 }
318 return (ENOSYS);
319
320 case LINUX_FUTEX_CMP_REQUEUE_PI:
321 /* not yet implemented */
322 pem = pem_find(td->td_proc);
323 if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
324 linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
325 pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
326 }
327 return (ENOSYS);
328
329 default:
330 linux_msg(td, "unsupported futex op %d", args->op);
331 return (ENOSYS);
332 }
333 }
334
/*
 * PI protocol:
 * - A futex word value of 0 means unlocked.
 * - A futex word value equal to a TID means locked by that thread.
 * Userspace uses atomic ops to lock/unlock these futexes without entering the
 * kernel. If the lock-acquire fast path fails (the transition from 0 to TID
 * fails), FUTEX_LOCK_PI is called.
 * The kernel atomically sets the FUTEX_WAITERS bit in the futex word value; if
 * no other waiters exist, it looks up the thread that owns the futex (the one
 * that has put its own TID into the futex value) and makes that thread the
 * owner of the internal pi-aware lock object (mutex). Then the kernel tries to
 * lock the internal lock object, on which it blocks. Once that returns, it has
 * acquired the mutex, sets the futex value to its own TID and returns (the
 * futex value then contains FUTEX_WAITERS|TID).
 * The unlock fast path then fails (because the FUTEX_WAITERS bit is set) and
 * FUTEX_UNLOCK_PI is called.
 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
 * bit of the futex word and wakes up the next futex waiter (if any); the
 * WAITERS bit is preserved (if set).
 * If the OWNER_DIED bit is set, the kernel sanity checks the futex word value
 * against the internal futex state and, if it is correct, acquires the futex.
 */
/*
 * Acquire the priority-inheritance futex at args->uaddr for the calling
 * thread (FUTEX_LOCK_PI, FUTEX_LOCK_PI2 and FUTEX_TRYLOCK_PI).
 *
 * try:  when true, do not sleep on a futex owned by another thread but
 *       fail with EBUSY instead.
 * args: request; uaddr, flags and ts are used, and clockrt/op are consumed
 *       by linux_umtx_abs_timeout_init() when a timeout is given.
 *
 * Returns 0 once the futex word holds the caller's TID.  Errors set here:
 * EFAULT (user access failed), EDEADLK (the caller already owns the futex),
 * EINVAL (OWNER_DIED set together with a TID, or the owning thread was not
 * found), EBUSY (try lock on an owned futex).  Errors from futex_key_get(),
 * thread_check_susp(), umtx_pi_claim() and umtxq_sleep_pi() are passed on.
 */
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	/* timo is only initialized, and only used below, when ts is set. */
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	/*
	 * Find the umtx_pi for this key or insert a new one.  If the
	 * M_NOWAIT allocation fails, the queue lock is dropped around the
	 * M_WAITOK allocation, so the lookup has to be repeated afterwards.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	/*
	 * error is 0 on the first pass; on later passes it may hold the
	 * result of an interrupted umtxq_sleep_pi(), which is why several
	 * checks below test it before overwriting it.
	 */
	for (;;) {
		/* Try uncontested case first. */
		rv = casueword32(args->uaddr, FUTEX_UNOWNED, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Nobody owns it, but the acquire failed. This can happen
		 * with ll/sc atomic.
		 */
		if (owner == FUTEX_UNOWNED) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			continue;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
		 * and clear tid. Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == FUTEX_UNOWNED) {
			/* Keep the WAITERS/OWNER_DIED bits, install our TID. */
			old_owner = owner;
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed the lock could
				 * changed, restart.
				 */
				continue;
			}

			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
		 * Linux does some checks of futex state, we return EINVAL,
		 * as the user space can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != FUTEX_UNOWNED) {
			error = EINVAL;
			break;
		}

		/* Owned by another thread: a try lock gives up here. */
		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/* Mark the queue busy while the user word is updated. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock. If this fails either some
		 * one else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute Linux thread id by native thread id to
		 * avoid refactoring code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			owner = td1->td_tid;
			PROC_UNLOCK(td1->td_proc);
		} else {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		/* On a sleep error, retry the acquire once with error kept. */
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	/* Drop the reference taken above and release the key. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
568
569 static int
linux_futex_unlock_pi(struct thread * td,bool rb,struct linux_futex_args * args)570 linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
571 {
572 struct linux_emuldata *em;
573 struct umtx_key key;
574 uint32_t old, owner, new_owner;
575 int count, error;
576
577 em = em_find(td);
578
579 /*
580 * Make sure we own this mtx.
581 */
582 error = fueword32(args->uaddr, &owner);
583 if (error == -1)
584 return (EFAULT);
585 if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
586 return (EPERM);
587
588 error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
589 if (error != 0)
590 return (error);
591 umtxq_lock(&key);
592 umtxq_busy(&key);
593 error = umtx_pi_drop(td, &key, rb, &count);
594 if (error != 0 || rb) {
595 umtxq_unbusy(&key);
596 umtxq_unlock(&key);
597 umtx_key_release(&key);
598 return (error);
599 }
600 umtxq_unlock(&key);
601
602 /*
603 * When unlocking the futex, it must be marked as unowned if
604 * there is zero or one thread only waiting for it.
605 * Otherwise, it must be marked as contested.
606 */
607 if (count > 1)
608 new_owner = FUTEX_WAITERS;
609 else
610 new_owner = FUTEX_UNOWNED;
611
612 again:
613 error = casueword32(args->uaddr, owner, &old, new_owner);
614 if (error == 1) {
615 error = thread_check_susp(td, false);
616 if (error == 0)
617 goto again;
618 }
619 umtxq_unbusy_unlocked(&key);
620 umtx_key_release(&key);
621 if (error == -1)
622 return (EFAULT);
623 if (error == 0 && old != owner)
624 return (EINVAL);
625 return (error);
626 }
627
628 static int
linux_futex_wakeop(struct thread * td,struct linux_futex_args * args)629 linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
630 {
631 struct umtx_key key, key2;
632 int nrwake, op_ret, ret;
633 int error, count;
634
635 if (args->uaddr == args->uaddr2)
636 return (EINVAL);
637
638 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
639 if (error != 0)
640 return (error);
641 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
642 if (error != 0) {
643 umtx_key_release(&key);
644 return (error);
645 }
646 umtxq_lock(&key);
647 umtxq_busy(&key);
648 umtxq_unlock(&key);
649 error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret);
650 umtxq_lock(&key);
651 umtxq_unbusy(&key);
652 if (error != 0)
653 goto out;
654 ret = umtxq_signal_mask(&key, args->val, args->val3);
655 if (op_ret > 0) {
656 nrwake = (int)(unsigned long)args->ts;
657 umtxq_lock(&key2);
658 count = umtxq_count(&key2);
659 if (count > 0)
660 ret += umtxq_signal_mask(&key2, nrwake, args->val3);
661 else
662 ret += umtxq_signal_mask(&key, nrwake, args->val3);
663 umtxq_unlock(&key2);
664 }
665 td->td_retval[0] = ret;
666 out:
667 umtxq_unlock(&key);
668 umtx_key_release(&key2);
669 umtx_key_release(&key);
670 return (error);
671 }
672
673 static int
linux_futex_requeue(struct thread * td,struct linux_futex_args * args)674 linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
675 {
676 int nrwake, nrrequeue;
677 struct umtx_key key, key2;
678 int error;
679 uint32_t uval;
680
681 /*
682 * Linux allows this, we would not, it is an incorrect
683 * usage of declared ABI, so return EINVAL.
684 */
685 if (args->uaddr == args->uaddr2)
686 return (EINVAL);
687
688 nrrequeue = (int)(unsigned long)args->ts;
689 nrwake = args->val;
690 /*
691 * Sanity check to prevent signed integer overflow,
692 * see Linux CVE-2018-6927
693 */
694 if (nrwake < 0 || nrrequeue < 0)
695 return (EINVAL);
696
697 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
698 if (error != 0)
699 return (error);
700 error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
701 if (error != 0) {
702 umtx_key_release(&key);
703 return (error);
704 }
705 umtxq_lock(&key);
706 umtxq_busy(&key);
707 umtxq_unlock(&key);
708 error = fueword32(args->uaddr, &uval);
709 if (error != 0)
710 error = EFAULT;
711 else if (args->val3_compare == true && uval != args->val3)
712 error = EWOULDBLOCK;
713 umtxq_lock(&key);
714 umtxq_unbusy(&key);
715 if (error == 0) {
716 umtxq_lock(&key2);
717 td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
718 umtxq_unlock(&key2);
719 }
720 umtxq_unlock(&key);
721 umtx_key_release(&key2);
722 umtx_key_release(&key);
723 return (error);
724 }
725
726 static int
linux_futex_wake(struct thread * td,struct linux_futex_args * args)727 linux_futex_wake(struct thread *td, struct linux_futex_args *args)
728 {
729 struct umtx_key key;
730 int error;
731
732 if (args->val3 == 0)
733 return (EINVAL);
734
735 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
736 if (error != 0)
737 return (error);
738 umtxq_lock(&key);
739 td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
740 umtxq_unlock(&key);
741 umtx_key_release(&key);
742 return (0);
743 }
744
745 static int
linux_futex_wait(struct thread * td,struct linux_futex_args * args)746 linux_futex_wait(struct thread *td, struct linux_futex_args *args)
747 {
748 struct umtx_abs_timeout timo;
749 struct umtx_q *uq;
750 uint32_t uval;
751 int error;
752
753 if (args->val3 == 0)
754 error = EINVAL;
755
756 uq = td->td_umtxq;
757 error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
758 &uq->uq_key);
759 if (error != 0)
760 return (error);
761 if (args->ts != NULL)
762 linux_umtx_abs_timeout_init(&timo, args);
763 umtxq_lock(&uq->uq_key);
764 umtxq_busy(&uq->uq_key);
765 uq->uq_bitset = args->val3;
766 umtxq_insert(uq);
767 umtxq_unlock(&uq->uq_key);
768 error = fueword32(args->uaddr, &uval);
769 if (error != 0)
770 error = EFAULT;
771 else if (uval != args->val)
772 error = EWOULDBLOCK;
773 umtxq_lock(&uq->uq_key);
774 umtxq_unbusy(&uq->uq_key);
775 if (error == 0) {
776 error = umtxq_sleep(uq, "futex",
777 args->ts == NULL ? NULL : &timo);
778 if ((uq->uq_flags & UQF_UMTXQ) == 0)
779 error = 0;
780 else
781 umtxq_remove(uq);
782 } else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
783 umtxq_remove(uq);
784 }
785 umtxq_unlock(&uq->uq_key);
786 umtx_key_release(&uq->uq_key);
787 if (error == ERESTART)
788 error = EINTR;
789 return (error);
790 }
791
792 static void
linux_umtx_abs_timeout_init(struct umtx_abs_timeout * timo,struct linux_futex_args * args)793 linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
794 struct linux_futex_args *args)
795 {
796 int clockid, absolute;
797
798 /*
799 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
800 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
801 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
802 * futex operations timeout is interpreted as an absolute value.
803 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
804 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
805 * measures the timeout against the CLOCK_MONOTONIC clock.
806 */
807 clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
808 absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
809 umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
810 }
811
812 int
linux_sys_futex(struct thread * td,struct linux_sys_futex_args * args)813 linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
814 {
815 struct linux_futex_args fargs = {
816 .uaddr = args->uaddr,
817 .op = args->op,
818 .val = args->val,
819 .ts = NULL,
820 .uaddr2 = args->uaddr2,
821 .val3 = args->val3,
822 .val3_compare = true,
823 };
824 int error;
825
826 switch (args->op & LINUX_FUTEX_CMD_MASK) {
827 case LINUX_FUTEX_WAIT:
828 case LINUX_FUTEX_WAIT_BITSET:
829 case LINUX_FUTEX_LOCK_PI:
830 case LINUX_FUTEX_LOCK_PI2:
831 if (args->timeout != NULL) {
832 error = linux_get_timespec(&fargs.kts, args->timeout);
833 if (error != 0)
834 return (error);
835 fargs.ts = &fargs.kts;
836 }
837 break;
838 default:
839 fargs.ts = PTRIN(args->timeout);
840 }
841 return (linux_futex(td, &fargs));
842 }
843
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * futex_time64(2) syscall entry point: same as linux_sys_futex(), except
 * that the timeout is read with linux_get_timespec64().
 */
int
linux_sys_futex_time64(struct thread *td,
    struct linux_sys_futex_time64_args *args)
{
	int error;
	struct linux_futex_args kargs = {
		.uaddr = args->uaddr,
		.uaddr2 = args->uaddr2,
		.op = args->op,
		.val = args->val,
		.val3 = args->val3,
		.val3_compare = true,
		.ts = NULL,
	};

	switch (args->op & LINUX_FUTEX_CMD_MASK) {
	default:
		/* Not a timeout here; forward the argument unchanged. */
		kargs.ts = PTRIN(args->timeout);
		break;
	case LINUX_FUTEX_WAIT:
	case LINUX_FUTEX_WAIT_BITSET:
	case LINUX_FUTEX_LOCK_PI:
	case LINUX_FUTEX_LOCK_PI2:
		if (args->timeout == NULL)
			break;
		error = linux_get_timespec64(&kargs.kts, args->timeout);
		if (error != 0)
			return (error);
		kargs.ts = &kargs.kts;
		break;
	}

	return (linux_futex(td, &kargs));
}
#endif
878
879 int
linux_set_robust_list(struct thread * td,struct linux_set_robust_list_args * args)880 linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
881 {
882 struct linux_emuldata *em;
883
884 if (args->len != sizeof(struct linux_robust_list_head))
885 return (EINVAL);
886
887 em = em_find(td);
888 em->robust_futexes = args->head;
889
890 return (0);
891 }
892
893 int
linux_get_robust_list(struct thread * td,struct linux_get_robust_list_args * args)894 linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
895 {
896 struct linux_emuldata *em;
897 struct linux_robust_list_head *head;
898 l_size_t len;
899 struct thread *td2;
900 int error;
901
902 if (!args->pid) {
903 em = em_find(td);
904 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
905 head = em->robust_futexes;
906 } else {
907 td2 = linux_tdfind(td, args->pid, -1);
908 if (td2 == NULL)
909 return (ESRCH);
910 if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
911 PROC_UNLOCK(td2->td_proc);
912 return (EPERM);
913 }
914
915 em = em_find(td2);
916 KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
917 /* XXX: ptrace? */
918 if (priv_check(td, PRIV_CRED_SETUID) ||
919 priv_check(td, PRIV_CRED_SETEUID) ||
920 p_candebug(td, td2->td_proc)) {
921 PROC_UNLOCK(td2->td_proc);
922 return (EPERM);
923 }
924 head = em->robust_futexes;
925
926 PROC_UNLOCK(td2->td_proc);
927 }
928
929 len = sizeof(struct linux_robust_list_head);
930 error = copyout(&len, args->len, sizeof(l_size_t));
931 if (error != 0)
932 return (EFAULT);
933
934 return (copyout(&head, args->head, sizeof(l_uintptr_t)));
935 }
936
937 static int
handle_futex_death(struct thread * td,struct linux_emuldata * em,uint32_t * uaddr,unsigned int pi,bool pending_op)938 handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
939 unsigned int pi, bool pending_op)
940 {
941 uint32_t uval, nval, mval;
942 int error;
943
944 retry:
945 error = fueword32(uaddr, &uval);
946 if (error != 0)
947 return (EFAULT);
948
949 /*
950 * Special case for regular (non PI) futexes. The unlock path in
951 * user space has two race scenarios:
952 *
953 * 1. The unlock path releases the user space futex value and
954 * before it can execute the futex() syscall to wake up
955 * waiters it is killed.
956 *
957 * 2. A woken up waiter is killed before it can acquire the
958 * futex in user space.
959 *
960 * In both cases the TID validation below prevents a wakeup of
961 * potential waiters which can cause these waiters to block
962 * forever.
963 *
964 * In both cases it is safe to attempt waking up a potential
965 * waiter without touching the user space futex value and trying
966 * to set the OWNER_DIED bit.
967 */
968 if (pending_op && !pi && !uval) {
969 (void)futex_wake(td, uaddr, 1, true);
970 return (0);
971 }
972
973 if ((uval & FUTEX_TID_MASK) == em->em_tid) {
974 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
975 error = casueword32(uaddr, uval, &nval, mval);
976 if (error == -1)
977 return (EFAULT);
978 if (error == 1) {
979 error = thread_check_susp(td, false);
980 if (error != 0)
981 return (error);
982 goto retry;
983 }
984
985 if (!pi && (uval & FUTEX_WAITERS)) {
986 error = futex_wake(td, uaddr, 1, true);
987 if (error != 0)
988 return (error);
989 } else if (pi && (uval & FUTEX_WAITERS)) {
990 error = futex_wake_pi(td, uaddr, true);
991 if (error != 0)
992 return (error);
993 }
994 }
995
996 return (0);
997 }
998
999 static int
fetch_robust_entry(struct linux_robust_list ** entry,struct linux_robust_list ** head,unsigned int * pi)1000 fetch_robust_entry(struct linux_robust_list **entry,
1001 struct linux_robust_list **head, unsigned int *pi)
1002 {
1003 l_ulong uentry;
1004 int error;
1005
1006 error = copyin((const void *)head, &uentry, sizeof(uentry));
1007 if (error != 0)
1008 return (EFAULT);
1009
1010 *entry = (void *)(uentry & ~1UL);
1011 *pi = uentry & 1;
1012
1013 return (0);
1014 }
1015
1016 #define LINUX_HANDLE_DEATH_PENDING true
1017 #define LINUX_HANDLE_DEATH_LIST false
1018
1019 /* This walks the list of robust futexes releasing them. */
1020 void
release_futexes(struct thread * td,struct linux_emuldata * em)1021 release_futexes(struct thread *td, struct linux_emuldata *em)
1022 {
1023 struct linux_robust_list_head *head;
1024 struct linux_robust_list *entry, *next_entry, *pending;
1025 unsigned int limit = 2048, pi, next_pi, pip;
1026 uint32_t *uaddr;
1027 l_long futex_offset;
1028 int error;
1029
1030 head = em->robust_futexes;
1031 if (head == NULL)
1032 return;
1033
1034 if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
1035 return;
1036
1037 error = copyin(&head->futex_offset, &futex_offset,
1038 sizeof(futex_offset));
1039 if (error != 0)
1040 return;
1041
1042 if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
1043 return;
1044
1045 while (entry != &head->list) {
1046 error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
1047 &next_pi);
1048
1049 /*
1050 * A pending lock might already be on the list, so
1051 * don't process it twice.
1052 */
1053 if (entry != pending) {
1054 uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
1055 if (handle_futex_death(td, em, uaddr, pi,
1056 LINUX_HANDLE_DEATH_LIST))
1057 return;
1058 }
1059 if (error != 0)
1060 return;
1061
1062 entry = next_entry;
1063 pi = next_pi;
1064
1065 if (!--limit)
1066 break;
1067
1068 sched_relinquish(curthread);
1069 }
1070
1071 if (pending) {
1072 uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
1073 (void)handle_futex_death(td, em, uaddr, pip,
1074 LINUX_HANDLE_DEATH_PENDING);
1075 }
1076 }
1077