1 /*        $NetBSD: sys_futex.c,v 1.26 2025/03/05 14:01:55 riastradh Exp $       */
2 
3 /*-
4  * Copyright (c) 2018, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Taylor R. Campbell and Jason R. Thorpe.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: sys_futex.c,v 1.26 2025/03/05 14:01:55 riastradh Exp $");
34 
35 /*
36  * Futexes
37  *
38  *        The futex system call coordinates notifying threads waiting for
39  *        changes on a 32-bit word of memory.  The word can be managed by
40  *        CPU atomic operations in userland, without system calls, as long
41  *        as there is no contention.
42  *
43  *        The simplest use case demonstrating the utility is:
44  *
45  *                  // 32-bit word of memory shared among threads or
46  *                  // processes in userland.  lock & 1 means owned;
47  *                  // lock & 2 means there are waiters waiting.
48  *                  volatile int lock = 0;
49  *
50  *                  int v;
51  *
52  *                  // Acquire a lock.
53  *                  do {
54  *                            v = lock;
55  *                            if (v & 1) {
56  *                                      // Lock is held.  Set a bit to say that
57  *                                      // there are waiters, and wait for lock
58  *                                      // to change to anything other than v;
59  *                                      // then retry.
60  *                                      if (atomic_cas_uint(&lock, v, v | 2) != v)
61  *                                                continue;
62  *                                      futex(&lock, FUTEX_WAIT, v | 2, NULL, NULL, 0,
63  *                                          0);
64  *                                      continue;
65  *                            }
66  *                  } while (atomic_cas_uint(&lock, v, v | 1) != v);
67  *                  membar_acquire();
68  *
69  *                  ...
70  *
71  *                  // Release the lock.  Optimistically assume there are
72  *                  // no waiters first until demonstrated otherwise.
73  *                  membar_release();
74  *                  if (atomic_cas_uint(&lock, 1, 0) != 1) {
75  *                            // There may be waiters.
76  *                            v = atomic_swap_uint(&lock, 0);
77  *                            // If there are still waiters, wake one.
78  *                            if (v & 2)
79  *                                      futex(&lock, FUTEX_WAKE, 1, NULL, NULL, 0, 0);
80  *                  }
81  *
82  *        The goal is to avoid the futex system call unless there is
83  *        contention; then if there is contention, to guarantee no missed
84  *        wakeups.
85  *
86  *        For a simple implementation, futex(FUTEX_WAIT) could queue
87  *        itself to be woken, double-check the lock word, and then sleep;
88  *        spurious wakeups are generally a fact of life, so any
89  *        FUTEX_WAKE could just wake every FUTEX_WAIT in the system.
90  *
91  *        If this were all there is to it, we could then increase
92  *        parallelism by refining the approximation: partition the
93  *        waiters into buckets by hashing the lock addresses to reduce
94  *        the incidence of spurious wakeups.  But this is not all.
95  *
96  *        The futex(&lock, FUTEX_CMP_REQUEUE, n, timeout, &lock2, m, val)
97  *        operation not only wakes n waiters on lock if lock == val, but
98  *        also _transfers_ m additional waiters to lock2.  Unless wakeups
99  *        on lock2 also trigger wakeups on lock, we cannot move waiters
100  *        to lock2 if they merely share the same hash as waiters on lock.
101  *        Thus, we can't approximately distribute waiters into queues by
102  *        a hash function; we must distinguish futex queues exactly by
103  *        lock address.
104  *
105  *        For now, we use a global red/black tree to index futexes.  This
106  *        should be replaced by a lockless radix tree with a thread to
107  *        free entries no longer in use once all lookups on all CPUs have
108  *        completed.
109  *
110  *        Specifically, we maintain two maps:
111  *
112  *        futex_tab.va[vmspace, va] for private futexes
113  *        futex_tab.oa[uvm_voaddr] for shared futexes
114  *
115  *        This implementation does not support priority inheritance.
116  */
117 
118 #include <sys/param.h>
119 #include <sys/types.h>
120 #include <sys/atomic.h>
121 #include <sys/condvar.h>
122 #include <sys/futex.h>
123 #include <sys/mutex.h>
124 #include <sys/rbtree.h>
125 #include <sys/queue.h>
126 
127 #include <sys/syscall.h>
128 #include <sys/syscallargs.h>
129 #include <sys/syscallvar.h>
130 
131 #include <uvm/uvm_extern.h>
132 
133 /*
134  * Lock order:
135  *
136  *        futex_tab.lock
137  *        futex::fx_qlock                         ordered by kva of struct futex
138  *         -> futex_wait::fw_lock                 only one at a time
139  *        futex_wait::fw_lock           only one at a time
140  *         -> futex::fx_abortlock                 only one at a time
141  */
142 
143 /*
144  * union futex_key
145  *
146  *        A futex is addressed either by a vmspace+va (private) or by
147  *        a uvm_voaddr (shared).
148  */
149 union futex_key {
150           struct {
151                     struct vmspace                          *vmspace;
152                     vaddr_t                                 va;
153           }                             fk_private;
154           struct uvm_voaddr   fk_shared;
155 };
156 
157 /*
158  * struct futex
159  *
160  *        Kernel state for a futex located at a particular address in a
161  *        particular virtual address space.
162  *
163  *        N.B. fx_refcnt is an unsigned long because we need to be able
164  *        to operate on it atomically on all systems while at the same
165  *        time rendering practically impossible the chance of it reaching
166  *        its max value.  In practice, we're limited by the number of LWPs
167  *        that can be present on the system at any given time, and the
168  *        assumption is that limit will be good enough on a 32-bit platform.
169  *        See futex_wake() for why overflow needs to be avoided.
170  */
171 struct futex {
172           union futex_key               fx_key;
173           unsigned long                 fx_refcnt;
174           bool                          fx_shared;
175           bool                          fx_on_tree;
176           struct rb_node                fx_node;
177 
178           kmutex_t                      fx_qlock;
179           TAILQ_HEAD(, futex_wait)      fx_queue;
180 
181           kmutex_t                      fx_abortlock;
182           LIST_HEAD(, futex_wait)                 fx_abortlist;
183           kcondvar_t                              fx_abortcv;
184 };
185 
186 /*
187  * struct futex_wait
188  *
189  *        State for a thread to wait on a futex.  Threads wait on fw_cv
190  *        for fw_bitset to be set to zero.  The thread may transition to
191  *        a different futex queue at any time under the futex's lock.
192  */
193 struct futex_wait {
194           kmutex_t            fw_lock;
195           kcondvar_t                    fw_cv;
196           struct futex                  *fw_futex;
197           TAILQ_ENTRY(futex_wait)       fw_entry; /* queue lock */
198           LIST_ENTRY(futex_wait)        fw_abort; /* queue abortlock */
199           int                           fw_bitset;
200           bool                          fw_aborting;        /* fw_lock */
201 };
202 
203 /*
204  * futex_tab
205  *
206  *        Global trees of futexes by vmspace/va and VM object address.
207  *
208  *        XXX This obviously doesn't scale in parallel.  We could use a
209  *        pserialize-safe data structure, but there may be a high cost to
210  *        frequent deletion since we don't cache futexes after we're done
211  *        with them.  We could use hashed locks.  But for now, just make
212  *        sure userland can't DoS the serial performance, by using a
213  *        balanced binary tree for lookup.
214  *
215  *        XXX We could use a per-process tree for the table indexed by
216  *        virtual address to reduce contention between processes.
217  */
218 static struct {
219           kmutex_t  lock;
220           struct rb_tree      va;
221           struct rb_tree      oa;
222 } futex_tab __cacheline_aligned;
223 
224 static int
compare_futex_key(void * cookie,const void * n,const void * k)225 compare_futex_key(void *cookie, const void *n, const void *k)
226 {
227           const struct futex *fa = n;
228           const union futex_key *fka = &fa->fx_key;
229           const union futex_key *fkb = k;
230 
231           if ((uintptr_t)fka->fk_private.vmspace <
232               (uintptr_t)fkb->fk_private.vmspace)
233                     return -1;
234           if ((uintptr_t)fka->fk_private.vmspace >
235               (uintptr_t)fkb->fk_private.vmspace)
236                     return +1;
237           if (fka->fk_private.va < fkb->fk_private.va)
238                     return -1;
239           if (fka->fk_private.va > fkb->fk_private.va)
240                     return +1;
241           return 0;
242 }
243 
244 static int
compare_futex(void * cookie,const void * na,const void * nb)245 compare_futex(void *cookie, const void *na, const void *nb)
246 {
247           const struct futex *fa = na;
248           const struct futex *fb = nb;
249 
250           return compare_futex_key(cookie, fa, &fb->fx_key);
251 }
252 
253 static const rb_tree_ops_t futex_rb_ops = {
254           .rbto_compare_nodes = compare_futex,
255           .rbto_compare_key = compare_futex_key,
256           .rbto_node_offset = offsetof(struct futex, fx_node),
257 };
258 
259 static int
compare_futex_shared_key(void * cookie,const void * n,const void * k)260 compare_futex_shared_key(void *cookie, const void *n, const void *k)
261 {
262           const struct futex *fa = n;
263           const union futex_key *fka = &fa->fx_key;
264           const union futex_key *fkb = k;
265 
266           return uvm_voaddr_compare(&fka->fk_shared, &fkb->fk_shared);
267 }
268 
269 static int
compare_futex_shared(void * cookie,const void * na,const void * nb)270 compare_futex_shared(void *cookie, const void *na, const void *nb)
271 {
272           const struct futex *fa = na;
273           const struct futex *fb = nb;
274 
275           return compare_futex_shared_key(cookie, fa, &fb->fx_key);
276 }
277 
278 static const rb_tree_ops_t futex_shared_rb_ops = {
279           .rbto_compare_nodes = compare_futex_shared,
280           .rbto_compare_key = compare_futex_shared_key,
281           .rbto_node_offset = offsetof(struct futex, fx_node),
282 };
283 
284 static void         futex_wait_dequeue(struct futex_wait *, struct futex *);
285 
286 /*
287  * futex_load(uaddr, kaddr)
288  *
289  *        Perform a single atomic load to read *uaddr, and return the
290  *        result in *kaddr.  Return 0 on success, EFAULT if uaddr is not
291  *        mapped.
292  */
293 static inline int
futex_load(int * uaddr,int * kaddr)294 futex_load(int *uaddr, int *kaddr)
295 {
296           return ufetch_int((u_int *)uaddr, (u_int *)kaddr);
297 }
298 
299 /*
300  * futex_test(uaddr, expected)
301  *
302  *        True if *uaddr == expected.  False if *uaddr != expected, or if
303  *        uaddr is not mapped.
304  */
305 static bool
futex_test(int * uaddr,int expected)306 futex_test(int *uaddr, int expected)
307 {
308           int val;
309           int error;
310 
311           error = futex_load(uaddr, &val);
312           if (error)
313                     return false;
314           return val == expected;
315 }
316 
317 /*
318  * futex_sys_init()
319  *
320  *        Initialize the futex subsystem.
321  */
322 void
futex_sys_init(void)323 futex_sys_init(void)
324 {
325 
326           mutex_init(&futex_tab.lock, MUTEX_DEFAULT, IPL_NONE);
327           rb_tree_init(&futex_tab.va, &futex_rb_ops);
328           rb_tree_init(&futex_tab.oa, &futex_shared_rb_ops);
329 }
330 
331 /*
332  * futex_sys_fini()
333  *
334  *        Finalize the futex subsystem.
335  */
336 void
futex_sys_fini(void)337 futex_sys_fini(void)
338 {
339 
340           KASSERT(RB_TREE_MIN(&futex_tab.oa) == NULL);
341           KASSERT(RB_TREE_MIN(&futex_tab.va) == NULL);
342           mutex_destroy(&futex_tab.lock);
343 }
344 
345 /*
346  * futex_queue_init(f)
347  *
348  *        Initialize the futex queue.  Caller must call futex_queue_fini
349  *        when done.
350  *
351  *        Never sleeps.
352  */
353 static void
futex_queue_init(struct futex * f)354 futex_queue_init(struct futex *f)
355 {
356 
357           mutex_init(&f->fx_qlock, MUTEX_DEFAULT, IPL_NONE);
358           mutex_init(&f->fx_abortlock, MUTEX_DEFAULT, IPL_NONE);
359           cv_init(&f->fx_abortcv, "fqabort");
360           LIST_INIT(&f->fx_abortlist);
361           TAILQ_INIT(&f->fx_queue);
362 }
363 
364 /*
365  * futex_queue_drain(f)
366  *
367  *        Wait for any aborting waiters in f; then empty the queue of
368  *        any stragglers and wake them.  Caller must guarantee no new
369  *        references to f.
370  *
371  *        May sleep.
372  */
373 static void
futex_queue_drain(struct futex * f)374 futex_queue_drain(struct futex *f)
375 {
376           struct futex_wait *fw, *fw_next;
377 
378           mutex_enter(&f->fx_abortlock);
379           while (!LIST_EMPTY(&f->fx_abortlist))
380                     cv_wait(&f->fx_abortcv, &f->fx_abortlock);
381           mutex_exit(&f->fx_abortlock);
382 
383           mutex_enter(&f->fx_qlock);
384           TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
385                     mutex_enter(&fw->fw_lock);
386                     futex_wait_dequeue(fw, f);
387                     cv_broadcast(&fw->fw_cv);
388                     mutex_exit(&fw->fw_lock);
389           }
390           mutex_exit(&f->fx_qlock);
391 }
392 
393 /*
394  * futex_queue_fini(fq)
395  *
396  *        Finalize the futex queue initialized by futex_queue_init.  Queue
397  *        must be empty.  Caller must not use f again until a subsequent
398  *        futex_queue_init.
399  */
400 static void
futex_queue_fini(struct futex * f)401 futex_queue_fini(struct futex *f)
402 {
403 
404           KASSERT(TAILQ_EMPTY(&f->fx_queue));
405           KASSERT(LIST_EMPTY(&f->fx_abortlist));
406           mutex_destroy(&f->fx_qlock);
407           mutex_destroy(&f->fx_abortlock);
408           cv_destroy(&f->fx_abortcv);
409 }
410 
411 /*
412  * futex_key_init(key, vm, va, shared)
413  *
414  *        Initialize a futex key for lookup, etc.
415  */
416 static int
futex_key_init(union futex_key * fk,struct vmspace * vm,vaddr_t va,bool shared)417 futex_key_init(union futex_key *fk, struct vmspace *vm, vaddr_t va, bool shared)
418 {
419           int error = 0;
420 
421           if (__predict_false(shared)) {
422                     if (!uvm_voaddr_acquire(&vm->vm_map, va, &fk->fk_shared))
423                               error = EFAULT;
424           } else {
425                     fk->fk_private.vmspace = vm;
426                     fk->fk_private.va = va;
427           }
428 
429           return error;
430 }
431 
432 /*
433  * futex_key_fini(key, shared)
434  *
435  *        Release a futex key.
436  */
437 static void
futex_key_fini(union futex_key * fk,bool shared)438 futex_key_fini(union futex_key *fk, bool shared)
439 {
440           if (__predict_false(shared))
441                     uvm_voaddr_release(&fk->fk_shared);
442           memset(fk, 0, sizeof(*fk));
443 }
444 
445 /*
446  * futex_create(fk, shared)
447  *
448  *        Create a futex.  Initial reference count is 1, representing the
449  *        caller.  Returns NULL on failure.  Always takes ownership of the
450  *        key, either transferring it to the newly-created futex, or releasing
451  *        the key if creation fails.
452  *
453  *        Never sleeps for memory, but may sleep to acquire a lock.
454  */
455 static struct futex *
futex_create(union futex_key * fk,bool shared)456 futex_create(union futex_key *fk, bool shared)
457 {
458           struct futex *f;
459 
460           f = kmem_alloc(sizeof(*f), KM_NOSLEEP);
461           if (f == NULL) {
462                     futex_key_fini(fk, shared);
463                     return NULL;
464           }
465           f->fx_key = *fk;
466           f->fx_refcnt = 1;
467           f->fx_shared = shared;
468           f->fx_on_tree = false;
469           futex_queue_init(f);
470 
471           return f;
472 }
473 
474 /*
475  * futex_destroy(f)
476  *
477  *        Destroy a futex created with futex_create.  Reference count
478  *        must be zero.
479  *
480  *        May sleep.
481  */
482 static void
futex_destroy(struct futex * f)483 futex_destroy(struct futex *f)
484 {
485 
486           ASSERT_SLEEPABLE();
487 
488           KASSERT(atomic_load_relaxed(&f->fx_refcnt) == 0);
489           KASSERT(!f->fx_on_tree);
490 
491           /* Drain and destroy the private queue.  */
492           futex_queue_drain(f);
493           futex_queue_fini(f);
494 
495           futex_key_fini(&f->fx_key, f->fx_shared);
496 
497           kmem_free(f, sizeof(*f));
498 }
499 
500 /*
501  * futex_hold(f)
502  *
503  *        Attempt to acquire a reference to f.  Return 0 on success,
504  *        ENFILE on too many references.
505  *
506  *        Never sleeps.
507  */
508 static int
futex_hold(struct futex * f)509 futex_hold(struct futex *f)
510 {
511           unsigned long refcnt;
512 
513           do {
514                     refcnt = atomic_load_relaxed(&f->fx_refcnt);
515                     if (refcnt == ULONG_MAX)
516                               return ENFILE;
517           } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt + 1) != refcnt);
518 
519           return 0;
520 }
521 
522 /*
523  * futex_rele(f)
524  *
525  *        Release a reference to f acquired with futex_create or
526  *        futex_hold.
527  *
528  *        May sleep to free f.
529  */
530 static void
futex_rele(struct futex * f)531 futex_rele(struct futex *f)
532 {
533           unsigned long refcnt;
534 
535           ASSERT_SLEEPABLE();
536 
537           do {
538                     refcnt = atomic_load_relaxed(&f->fx_refcnt);
539                     if (refcnt == 1)
540                               goto trylast;
541                     membar_release();
542           } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
543           return;
544 
545 trylast:
546           mutex_enter(&futex_tab.lock);
547           if (atomic_dec_ulong_nv(&f->fx_refcnt) == 0) {
548                     membar_acquire();
549                     if (f->fx_on_tree) {
550                               if (__predict_false(f->fx_shared))
551                                         rb_tree_remove_node(&futex_tab.oa, f);
552                               else
553                                         rb_tree_remove_node(&futex_tab.va, f);
554                               f->fx_on_tree = false;
555                     }
556           } else {
557                     /* References remain -- don't destroy it.  */
558                     f = NULL;
559           }
560           mutex_exit(&futex_tab.lock);
561           if (f != NULL)
562                     futex_destroy(f);
563 }
564 
565 /*
566  * futex_rele_not_last(f)
567  *
568  *        Release a reference to f acquired with futex_create or
569  *        futex_hold.
570  *
571  *        This version asserts that we are not dropping the last
572  *        reference to f.
573  */
574 static void
futex_rele_not_last(struct futex * f)575 futex_rele_not_last(struct futex *f)
576 {
577           unsigned long refcnt;
578 
579           do {
580                     refcnt = atomic_load_relaxed(&f->fx_refcnt);
581                     KASSERT(refcnt > 1);
582           } while (atomic_cas_ulong(&f->fx_refcnt, refcnt, refcnt - 1) != refcnt);
583 }
584 
585 /*
586  * futex_lookup_by_key(key, shared, &f)
587  *
588  *        Try to find an existing futex va reference in the specified key
589  *        On success, return 0, set f to found futex or to NULL if not found,
590  *        and increment f's reference count if found.
591  *
592  *        Return ENFILE if reference count too high.
593  *
594  *        Internal lookup routine shared by futex_lookup() and
595  *        futex_lookup_create().
596  */
597 static int
futex_lookup_by_key(union futex_key * fk,bool shared,struct futex ** fp)598 futex_lookup_by_key(union futex_key *fk, bool shared, struct futex **fp)
599 {
600           struct futex *f;
601           int error = 0;
602 
603           mutex_enter(&futex_tab.lock);
604           if (__predict_false(shared)) {
605                     f = rb_tree_find_node(&futex_tab.oa, fk);
606           } else {
607                     f = rb_tree_find_node(&futex_tab.va, fk);
608           }
609           if (f) {
610                     error = futex_hold(f);
611                     if (error)
612                               f = NULL;
613           }
614           *fp = f;
615           mutex_exit(&futex_tab.lock);
616 
617           return error;
618 }
619 
620 /*
621  * futex_insert(f, fp)
622  *
623  *        Try to insert the futex f into the tree by va.  If there
624  *        already is a futex for its va, acquire a reference to it, and
625  *        store it in *fp; otherwise store f in *fp.
626  *
627  *        Return 0 on success, ENFILE if there already is a futex but its
628  *        reference count is too high.
629  */
630 static int
futex_insert(struct futex * f,struct futex ** fp)631 futex_insert(struct futex *f, struct futex **fp)
632 {
633           struct futex *f0;
634           int error;
635 
636           KASSERT(atomic_load_relaxed(&f->fx_refcnt) != 0);
637           KASSERT(!f->fx_on_tree);
638 
639           mutex_enter(&futex_tab.lock);
640           if (__predict_false(f->fx_shared))
641                     f0 = rb_tree_insert_node(&futex_tab.oa, f);
642           else
643                     f0 = rb_tree_insert_node(&futex_tab.va, f);
644           if (f0 == f) {
645                     f->fx_on_tree = true;
646                     error = 0;
647           } else {
648                     KASSERT(atomic_load_relaxed(&f0->fx_refcnt) != 0);
649                     KASSERT(f0->fx_on_tree);
650                     error = futex_hold(f0);
651                     if (error)
652                               goto out;
653           }
654           *fp = f0;
655 out:      mutex_exit(&futex_tab.lock);
656 
657           return error;
658 }
659 
660 /*
661  * futex_lookup(uaddr, shared, &f)
662  *
663  *        Find a futex at the userland pointer uaddr in the current
664  *        process's VM space.  On success, return the futex in f and
665  *        increment its reference count.
666  *
667  *        Caller must call futex_rele when done.
668  */
669 static int
futex_lookup(int * uaddr,bool shared,struct futex ** fp)670 futex_lookup(int *uaddr, bool shared, struct futex **fp)
671 {
672           union futex_key fk;
673           struct vmspace *vm = curproc->p_vmspace;
674           vaddr_t va = (vaddr_t)uaddr;
675           int error;
676 
677           /*
678            * Reject unaligned user pointers so we don't cross page
679            * boundaries and so atomics will work.
680            */
681           if ((va & 3) != 0)
682                     return EINVAL;
683 
684           /* Look it up. */
685           error = futex_key_init(&fk, vm, va, shared);
686           if (error)
687                     return error;
688 
689           error = futex_lookup_by_key(&fk, shared, fp);
690           futex_key_fini(&fk, shared);
691           if (error)
692                     return error;
693 
694           KASSERT(*fp == NULL || (*fp)->fx_shared == shared);
695           KASSERT(*fp == NULL || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
696 
697           /*
698            * Success!  (Caller must still check whether we found
699            * anything, but nothing went _wrong_ like trying to use
700            * unmapped memory.)
701            */
702           KASSERT(error == 0);
703 
704           return error;
705 }
706 
707 /*
708  * futex_lookup_create(uaddr, shared, &f)
709  *
710  *        Find or create a futex at the userland pointer uaddr in the
711  *        current process's VM space.  On success, return the futex in f
712  *        and increment its reference count.
713  *
714  *        Caller must call futex_rele when done.
715  */
716 static int
futex_lookup_create(int * uaddr,bool shared,struct futex ** fp)717 futex_lookup_create(int *uaddr, bool shared, struct futex **fp)
718 {
719           union futex_key fk;
720           struct vmspace *vm = curproc->p_vmspace;
721           struct futex *f = NULL;
722           vaddr_t va = (vaddr_t)uaddr;
723           int error;
724 
725           /*
726            * Reject unaligned user pointers so we don't cross page
727            * boundaries and so atomics will work.
728            */
729           if ((va & 3) != 0)
730                     return EINVAL;
731 
732           error = futex_key_init(&fk, vm, va, shared);
733           if (error)
734                     return error;
735 
736           /*
737            * Optimistically assume there already is one, and try to find
738            * it.
739            */
740           error = futex_lookup_by_key(&fk, shared, fp);
741           if (error || *fp != NULL) {
742                     /*
743                      * We either found one, or there was an error.
744                      * In either case, we are done with the key.
745                      */
746                     futex_key_fini(&fk, shared);
747                     goto out;
748           }
749 
750           /*
751            * Create a futex record.  This transfers ownership of the key
752            * in all cases.
753            */
754           f = futex_create(&fk, shared);
755           if (f == NULL) {
756                     error = ENOMEM;
757                     goto out;
758           }
759 
760           /*
761            * Insert our new futex, or use existing if someone else beat
762            * us to it.
763            */
764           error = futex_insert(f, fp);
765           if (error)
766                     goto out;
767           if (*fp == f)
768                     f = NULL; /* don't release on exit */
769 
770           /* Success!  */
771           KASSERT(error == 0);
772 
773 out:      if (f != NULL)
774                     futex_rele(f);
775           KASSERT(error || *fp != NULL);
776           KASSERT(error || atomic_load_relaxed(&(*fp)->fx_refcnt) != 0);
777           return error;
778 }
779 
780 /*
781  * futex_wait_init(fw, bitset)
782  *
783  *        Initialize a record for a thread to wait on a futex matching
784  *        the specified bit set.  Should be passed to futex_wait_enqueue
785  *        before futex_wait, and should be passed to futex_wait_fini when
786  *        done.
787  */
788 static void
futex_wait_init(struct futex_wait * fw,int bitset)789 futex_wait_init(struct futex_wait *fw, int bitset)
790 {
791 
792           KASSERT(bitset);
793 
794           mutex_init(&fw->fw_lock, MUTEX_DEFAULT, IPL_NONE);
795           cv_init(&fw->fw_cv, "futex");
796           fw->fw_futex = NULL;
797           fw->fw_bitset = bitset;
798           fw->fw_aborting = false;
799 }
800 
801 /*
802  * futex_wait_fini(fw)
803  *
804  *        Finalize a record for a futex waiter.  Must not be on any
805  *        futex's queue.
806  */
807 static void
futex_wait_fini(struct futex_wait * fw)808 futex_wait_fini(struct futex_wait *fw)
809 {
810 
811           KASSERT(fw->fw_futex == NULL);
812 
813           cv_destroy(&fw->fw_cv);
814           mutex_destroy(&fw->fw_lock);
815 }
816 
817 /*
818  * futex_wait_enqueue(fw, f)
819  *
820  *        Put fw on the futex queue.  Must be done before futex_wait.
821  *        Caller must hold fw's lock and f's lock, and fw must not be on
822  *        any existing futex's waiter list.
823  */
824 static void
futex_wait_enqueue(struct futex_wait * fw,struct futex * f)825 futex_wait_enqueue(struct futex_wait *fw, struct futex *f)
826 {
827 
828           KASSERT(mutex_owned(&f->fx_qlock));
829           KASSERT(mutex_owned(&fw->fw_lock));
830           KASSERT(fw->fw_futex == NULL);
831           KASSERT(!fw->fw_aborting);
832 
833           fw->fw_futex = f;
834           TAILQ_INSERT_TAIL(&f->fx_queue, fw, fw_entry);
835 }
836 
837 /*
838  * futex_wait_dequeue(fw, f)
839  *
840  *        Remove fw from the futex queue.  Precludes subsequent
841  *        futex_wait until a futex_wait_enqueue.  Caller must hold fw's
842  *        lock and f's lock, and fw must be on f.
843  */
844 static void
futex_wait_dequeue(struct futex_wait * fw,struct futex * f)845 futex_wait_dequeue(struct futex_wait *fw, struct futex *f)
846 {
847 
848           KASSERT(mutex_owned(&f->fx_qlock));
849           KASSERT(mutex_owned(&fw->fw_lock));
850           KASSERT(fw->fw_futex == f);
851 
852           TAILQ_REMOVE(&f->fx_queue, fw, fw_entry);
853           fw->fw_futex = NULL;
854 }
855 
856 /*
857  * futex_wait_abort(fw)
858  *
859  *        Caller is no longer waiting for fw.  Remove it from any queue
860  *        if it was on one.  Caller must hold fw->fw_lock.
861  */
862 static void
futex_wait_abort(struct futex_wait * fw)863 futex_wait_abort(struct futex_wait *fw)
864 {
865           struct futex *f;
866 
867           KASSERT(mutex_owned(&fw->fw_lock));
868 
869           /*
870            * Grab the futex queue.  It can't go away as long as we hold
871            * fw_lock.  However, we can't take the queue lock because
872            * that's a lock order reversal.
873            */
874           f = fw->fw_futex;
875 
876           /* Put us on the abort list so that fq won't go away.  */
877           mutex_enter(&f->fx_abortlock);
878           LIST_INSERT_HEAD(&f->fx_abortlist, fw, fw_abort);
879           mutex_exit(&f->fx_abortlock);
880 
881           /*
882            * Mark fw as aborting so it won't lose wakeups and won't be
883            * transferred to any other queue.
884            */
885           fw->fw_aborting = true;
886 
887           /* f is now stable, so we can release fw_lock.  */
888           mutex_exit(&fw->fw_lock);
889 
890           /* Now we can remove fw under the queue lock.  */
891           mutex_enter(&f->fx_qlock);
892           mutex_enter(&fw->fw_lock);
893           futex_wait_dequeue(fw, f);
894           mutex_exit(&fw->fw_lock);
895           mutex_exit(&f->fx_qlock);
896 
897           /*
898            * Finally, remove us from the abort list and notify anyone
899            * waiting for the abort to complete if we were the last to go.
900            */
901           mutex_enter(&f->fx_abortlock);
902           LIST_REMOVE(fw, fw_abort);
903           if (LIST_EMPTY(&f->fx_abortlist))
904                     cv_broadcast(&f->fx_abortcv);
905           mutex_exit(&f->fx_abortlock);
906 
907           /*
908            * Release our reference to the futex now that we are not
909            * waiting for it.
910            */
911           futex_rele(f);
912 
913           /*
914            * Reacquire the fw lock as caller expects.  Verify that we're
915            * aborting and no longer associated with a futex.
916            */
917           mutex_enter(&fw->fw_lock);
918           KASSERT(fw->fw_aborting);
919           KASSERT(fw->fw_futex == NULL);
920 }
921 
922 /*
923  * futex_wait(fw, deadline, clkid)
924  *
925  *        fw must be a waiter on a futex's queue.  Wait until deadline on
926  *        the clock clkid, or forever if deadline is NULL, for a futex
927  *        wakeup.  Return 0 on explicit wakeup or destruction of futex,
928  *        ETIMEDOUT on timeout, EINTR/ERESTART on signal.  Either way, fw
929  *        will no longer be on a futex queue on return.
930  */
931 static int
futex_wait(struct futex_wait * fw,const struct timespec * deadline,clockid_t clkid)932 futex_wait(struct futex_wait *fw, const struct timespec *deadline,
933     clockid_t clkid)
934 {
935           int error = 0;
936 
937           /* Test and wait under the wait lock.  */
938           mutex_enter(&fw->fw_lock);
939 
940           for (;;) {
941                     /* If we're done yet, stop and report success.  */
942                     if (fw->fw_bitset == 0 || fw->fw_futex == NULL) {
943                               error = 0;
944                               break;
945                     }
946 
947                     /* If anything went wrong in the last iteration, stop.  */
948                     if (error)
949                               break;
950 
951                     /* Not done yet.  Wait.  */
952                     if (deadline) {
953                               struct timespec ts;
954 
955                               /* Check our watch.  */
956                               error = clock_gettime1(clkid, &ts);
957                               if (error)
958                                         break;
959 
960                               /* If we're past the deadline, ETIMEDOUT.  */
961                               if (timespeccmp(deadline, &ts, <=)) {
962                                         error = ETIMEDOUT;
963                                         break;
964                               }
965 
966                               /* Count how much time is left.  */
967                               timespecsub(deadline, &ts, &ts);
968 
969                               /*
970                                * Wait for that much time, allowing signals.
971                                * Ignore EWOULDBLOCK, however: we will detect
972                                * timeout ourselves on the next iteration of
973                                * the loop -- and the timeout may have been
974                                * truncated by tstohz, anyway.
975                                */
976                               error = cv_timedwait_sig(&fw->fw_cv, &fw->fw_lock,
977                                   MAX(1, tstohz(&ts)));
978                               if (error == EWOULDBLOCK)
979                                         error = 0;
980                     } else {
981                               /* Wait indefinitely, allowing signals. */
982                               error = cv_wait_sig(&fw->fw_cv, &fw->fw_lock);
983                     }
984           }
985 
986           /*
987            * If we were woken up, the waker will have removed fw from the
988            * queue.  But if anything went wrong, we must remove fw from
989            * the queue ourselves.
990            */
991           if (error)
992                     futex_wait_abort(fw);
993 
994           mutex_exit(&fw->fw_lock);
995 
996           return error;
997 }
998 
999 /*
1000  * futex_wake(f, nwake, f2, nrequeue, bitset)
1001  *
1002  *        Wake up to nwake waiters on f matching bitset; then, if f2 is
1003  *        provided, move up to nrequeue remaining waiters on f matching
1004  *        bitset to f2.  Return the number of waiters actually woken or
1005  *        requeued.  Caller must hold the locks of f and f2, if provided.
1006  */
1007 static unsigned
futex_wake(struct futex * f,unsigned nwake,struct futex * f2,unsigned nrequeue,int bitset)1008 futex_wake(struct futex *f, unsigned nwake, struct futex *f2,
1009     unsigned nrequeue, int bitset)
1010 {
1011           struct futex_wait *fw, *fw_next;
1012           unsigned nwoken_or_requeued = 0;
1013           int hold_error __diagused;
1014 
1015           KASSERT(mutex_owned(&f->fx_qlock));
1016           KASSERT(f2 == NULL || mutex_owned(&f2->fx_qlock));
1017 
1018           /* Wake up to nwake waiters, and count the number woken.  */
1019           TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
1020                     if ((fw->fw_bitset & bitset) == 0)
1021                               continue;
1022                     if (nwake > 0) {
1023                               mutex_enter(&fw->fw_lock);
1024                               if (__predict_false(fw->fw_aborting)) {
1025                                         mutex_exit(&fw->fw_lock);
1026                                         continue;
1027                               }
1028                               futex_wait_dequeue(fw, f);
1029                               fw->fw_bitset = 0;
1030                               cv_broadcast(&fw->fw_cv);
1031                               mutex_exit(&fw->fw_lock);
1032                               nwake--;
1033                               nwoken_or_requeued++;
1034                               /*
1035                                * Drop the futex reference on behalf of the
1036                                * waiter.  We assert this is not the last
1037                                * reference on the futex (our caller should
1038                                * also have one).
1039                                */
1040                               futex_rele_not_last(f);
1041                     } else {
1042                               break;
1043                     }
1044           }
1045 
1046           if (f2) {
1047                     /* Move up to nrequeue waiters from f's queue to f2's queue. */
1048                     TAILQ_FOREACH_SAFE(fw, &f->fx_queue, fw_entry, fw_next) {
1049                               if ((fw->fw_bitset & bitset) == 0)
1050                                         continue;
1051                               if (nrequeue > 0) {
1052                                         mutex_enter(&fw->fw_lock);
1053                                         if (__predict_false(fw->fw_aborting)) {
1054                                                   mutex_exit(&fw->fw_lock);
1055                                                   continue;
1056                                         }
1057                                         futex_wait_dequeue(fw, f);
1058                                         futex_wait_enqueue(fw, f2);
1059                                         mutex_exit(&fw->fw_lock);
1060                                         nrequeue--;
1061                                         /*
1062                                          * PR kern/59004: Missing constant for upper
1063                                          * bound on systemwide number of lwps
1064                                          */
1065                                         KASSERT(nwoken_or_requeued <
1066                                             MIN(PID_MAX*MAXMAXLWP, FUTEX_TID_MASK));
1067                                         __CTASSERT(UINT_MAX >=
1068                                             MIN(PID_MAX*MAXMAXLWP, FUTEX_TID_MASK));
1069                                         if (++nwoken_or_requeued == 0) /* paranoia */
1070                                                   nwoken_or_requeued = UINT_MAX;
1071                                         /*
1072                                          * Transfer the reference from f to f2.
1073                                          * As above, we assert that we are not
1074                                          * dropping the last reference to f here.
1075                                          *
1076                                          * XXX futex_hold() could theoretically
1077                                          * XXX fail here.
1078                                          */
1079                                         futex_rele_not_last(f);
1080                                         hold_error = futex_hold(f2);
1081                                         KASSERT(hold_error == 0);
1082                               } else {
1083                                         break;
1084                               }
1085                     }
1086           } else {
1087                     KASSERT(nrequeue == 0);
1088           }
1089 
1090           /* Return the number of waiters woken or requeued.  */
1091           return nwoken_or_requeued;
1092 }
1093 
1094 /*
1095  * futex_queue_lock(f)
1096  *
1097  *        Acquire the queue lock of f.  Pair with futex_queue_unlock.  Do
1098  *        not use if caller needs to acquire two locks; use
1099  *        futex_queue_lock2 instead.
1100  */
1101 static void
futex_queue_lock(struct futex * f)1102 futex_queue_lock(struct futex *f)
1103 {
1104           mutex_enter(&f->fx_qlock);
1105 }
1106 
1107 /*
1108  * futex_queue_unlock(f)
1109  *
1110  *        Release the queue lock of f.
1111  */
1112 static void
futex_queue_unlock(struct futex * f)1113 futex_queue_unlock(struct futex *f)
1114 {
1115           mutex_exit(&f->fx_qlock);
1116 }
1117 
1118 /*
1119  * futex_queue_lock2(f, f2)
1120  *
1121  *        Acquire the queue locks of both f and f2, which may be null, or
1122  *        which may have the same underlying queue.  If they are
1123  *        distinct, an arbitrary total order is chosen on the locks.
1124  *
1125  *        Callers should only ever acquire multiple queue locks
1126  *        simultaneously using futex_queue_lock2.
1127  */
1128 static void
futex_queue_lock2(struct futex * f,struct futex * f2)1129 futex_queue_lock2(struct futex *f, struct futex *f2)
1130 {
1131 
1132           /*
1133            * If both are null, do nothing; if one is null and the other
1134            * is not, lock the other and be done with it.
1135            */
1136           if (f == NULL && f2 == NULL) {
1137                     return;
1138           } else if (f == NULL) {
1139                     mutex_enter(&f2->fx_qlock);
1140                     return;
1141           } else if (f2 == NULL) {
1142                     mutex_enter(&f->fx_qlock);
1143                     return;
1144           }
1145 
1146           /* If both futexes are the same, acquire only one. */
1147           if (f == f2) {
1148                     mutex_enter(&f->fx_qlock);
1149                     return;
1150           }
1151 
1152           /* Otherwise, use the ordering on the kva of the futex pointer.  */
1153           if ((uintptr_t)f < (uintptr_t)f2) {
1154                     mutex_enter(&f->fx_qlock);
1155                     mutex_enter(&f2->fx_qlock);
1156           } else {
1157                     mutex_enter(&f2->fx_qlock);
1158                     mutex_enter(&f->fx_qlock);
1159           }
1160 }
1161 
1162 /*
1163  * futex_queue_unlock2(f, f2)
1164  *
1165  *        Release the queue locks of both f and f2, which may be null, or
1166  *        which may have the same underlying queue.
1167  */
1168 static void
futex_queue_unlock2(struct futex * f,struct futex * f2)1169 futex_queue_unlock2(struct futex *f, struct futex *f2)
1170 {
1171 
1172           /*
1173            * If both are null, do nothing; if one is null and the other
1174            * is not, unlock the other and be done with it.
1175            */
1176           if (f == NULL && f2 == NULL) {
1177                     return;
1178           } else if (f == NULL) {
1179                     mutex_exit(&f2->fx_qlock);
1180                     return;
1181           } else if (f2 == NULL) {
1182                     mutex_exit(&f->fx_qlock);
1183                     return;
1184           }
1185 
1186           /* If both futexes are the same, release only one. */
1187           if (f == f2) {
1188                     mutex_exit(&f->fx_qlock);
1189                     return;
1190           }
1191 
1192           /* Otherwise, use the ordering on the kva of the futex pointer.  */
1193           if ((uintptr_t)f < (uintptr_t)f2) {
1194                     mutex_exit(&f2->fx_qlock);
1195                     mutex_exit(&f->fx_qlock);
1196           } else {
1197                     mutex_exit(&f->fx_qlock);
1198                     mutex_exit(&f2->fx_qlock);
1199           }
1200 }
1201 
1202 /*
1203  * futex_func_wait(uaddr, cmpval@val, bitset@val3, timeout, clkid, clkflags,
1204  *     retval)
1205  *
1206  *        Implement futex(FUTEX_WAIT) and futex(FUTEX_WAIT_BITSET): If
1207  *        *uaddr == cmpval, wait until futex-woken on any of the bits in
1208  *        bitset.  But if *uaddr != cmpval, fail with EAGAIN.
1209  *
1210  *        For FUTEX_WAIT, bitset has all bits set and val3 is ignored.
1211  */
1212 static int
futex_func_wait(bool shared,int * uaddr,int cmpval,int bitset,const struct timespec * timeout,clockid_t clkid,int clkflags,register_t * retval)1213 futex_func_wait(bool shared, int *uaddr, int cmpval, int bitset,
1214     const struct timespec *timeout, clockid_t clkid, int clkflags,
1215     register_t *retval)
1216 {
1217           struct futex *f;
1218           struct futex_wait wait, *fw = &wait;
1219           struct timespec ts;
1220           const struct timespec *deadline;
1221           int error;
1222 
1223           /*
1224            * If there's nothing to wait for, and nobody will ever wake
1225            * us, then don't set anything up to wait -- just stop here.
1226            */
1227           if (bitset == 0)
1228                     return EINVAL;
1229 
1230           /* Optimistically test before anything else.  */
1231           if (!futex_test(uaddr, cmpval))
1232                     return EAGAIN;
1233 
1234           /* Determine a deadline on the specified clock.  */
1235           if (timeout == NULL || (clkflags & TIMER_ABSTIME) == TIMER_ABSTIME) {
1236                     deadline = timeout;
1237           } else {
1238                     error = clock_gettime1(clkid, &ts);
1239                     if (error)
1240                               return error;
1241                     timespecadd(&ts, timeout, &ts);
1242                     deadline = &ts;
1243           }
1244 
1245           /* Get the futex, creating it if necessary.  */
1246           error = futex_lookup_create(uaddr, shared, &f);
1247           if (error)
1248                     return error;
1249           KASSERT(f);
1250 
1251           /* Get ready to wait.  */
1252           futex_wait_init(fw, bitset);
1253 
1254           /*
1255            * Under the queue lock, check the value again: if it has
1256            * already changed, EAGAIN; otherwise enqueue the waiter.
1257            * Since FUTEX_WAKE will use the same lock and be done after
1258            * modifying the value, the order in which we check and enqueue
1259            * is immaterial.
1260            */
1261           futex_queue_lock(f);
1262           if (!futex_test(uaddr, cmpval)) {
1263                     futex_queue_unlock(f);
1264                     error = EAGAIN;
1265                     goto out;
1266           }
1267           mutex_enter(&fw->fw_lock);
1268           futex_wait_enqueue(fw, f);
1269           mutex_exit(&fw->fw_lock);
1270           futex_queue_unlock(f);
1271 
1272           /*
1273            * We cannot drop our reference to the futex here, because
1274            * we might be enqueued on a different one when we are awakened.
1275            * The references will be managed on our behalf in the requeue
1276            * and wake cases.
1277            */
1278           f = NULL;
1279 
1280           /* Wait. */
1281           error = futex_wait(fw, deadline, clkid);
1282           if (error)
1283                     goto out;
1284 
1285           /* Return 0 on success, error on failure. */
1286           *retval = 0;
1287 
1288 out:      if (f != NULL)
1289                     futex_rele(f);
1290           futex_wait_fini(fw);
1291           return error;
1292 }
1293 
1294 /*
1295  * futex_func_wake(uaddr, nwake@val, bitset@val3, retval)
1296  *
1297  *        Implement futex(FUTEX_WAKE) and futex(FUTEX_WAKE_BITSET): Wake
1298  *        up to nwake waiters on uaddr waiting on any of the bits in
1299  *        bitset.
1300  *
1301  *        Return the number of waiters woken.
1302  *
1303  *        For FUTEX_WAKE, bitset has all bits set and val3 is ignored.
1304  */
1305 static int
futex_func_wake(bool shared,int * uaddr,int nwake,int bitset,register_t * retval)1306 futex_func_wake(bool shared, int *uaddr, int nwake, int bitset,
1307     register_t *retval)
1308 {
1309           struct futex *f;
1310           unsigned int nwoken = 0;
1311           int error = 0;
1312 
1313           /* Reject negative number of wakeups.  */
1314           if (nwake < 0) {
1315                     error = EINVAL;
1316                     goto out;
1317           }
1318 
1319           /* Look up the futex, if any.  */
1320           error = futex_lookup(uaddr, shared, &f);
1321           if (error)
1322                     goto out;
1323 
1324           /* If there's no futex, there are no waiters to wake.  */
1325           if (f == NULL)
1326                     goto out;
1327 
1328           /*
1329            * Under f's queue lock, wake the waiters and remember the
1330            * number woken.
1331            */
1332           futex_queue_lock(f);
1333           nwoken = futex_wake(f, nwake, NULL, /*nrequeue*/0, bitset);
1334           futex_queue_unlock(f);
1335 
1336           /* Release the futex.  */
1337           futex_rele(f);
1338 
1339 out:
1340           /* Return the number of waiters woken.  */
1341           *retval = nwoken;
1342 
1343           /* Success!  */
1344           return error;
1345 }
1346 
1347 /*
1348  * futex_func_requeue(op, uaddr, nwake@val, uaddr2, nrequeue@val2,
1349  *     cmpval@val3, retval)
1350  *
1351  *        Implement futex(FUTEX_REQUEUE) and futex(FUTEX_CMP_REQUEUE): If
1352  *        *uaddr == cmpval or if op == FUTEX_REQUEUE, wake up to nwake
1353  *        waiters at uaddr and then requeue up to nrequeue waiters from
1354  *        uaddr to uaddr2.
1355  *
1356  *        Return the number of waiters woken or requeued.
1357  *
1358  *        For FUTEX_CMP_REQUEUE, if *uaddr != cmpval, fail with EAGAIN
1359  *        and no wakeups.
1360  */
1361 static int
futex_func_requeue(bool shared,int op,int * uaddr,int nwake,int * uaddr2,int nrequeue,int cmpval,register_t * retval)1362 futex_func_requeue(bool shared, int op, int *uaddr, int nwake, int *uaddr2,
1363     int nrequeue, int cmpval, register_t *retval)
1364 {
1365           struct futex *f = NULL, *f2 = NULL;
1366           unsigned nwoken_or_requeued = 0; /* default to zero on early return */
1367           int error;
1368 
1369           /* Reject negative number of wakeups or requeues. */
1370           if (nwake < 0 || nrequeue < 0) {
1371                     error = EINVAL;
1372                     goto out;
1373           }
1374 
1375           /*
1376            * Look up or create the source futex.  For FUTEX_CMP_REQUEUE,
1377            * we always create it, rather than bail if it has no waiters,
1378            * because FUTEX_CMP_REQUEUE always tests the futex word in
1379            * order to report EAGAIN.
1380            */
1381           error = (op == FUTEX_CMP_REQUEUE
1382               ? futex_lookup_create(uaddr, shared, &f)
1383               : futex_lookup(uaddr, shared, &f));
1384           if (error)
1385                     goto out;
1386 
1387           /* If there is none for FUTEX_REQUEUE, nothing to do. */
1388           if (f == NULL) {
1389                     KASSERT(op != FUTEX_CMP_REQUEUE);
1390                     goto out;
1391           }
1392 
1393           /*
1394            * We may need to create the destination futex because it's
1395            * entirely possible it does not currently have any waiters.
1396            */
1397           error = futex_lookup_create(uaddr2, shared, &f2);
1398           if (error)
1399                     goto out;
1400 
1401           /*
1402            * Under the futexes' queue locks, check the value; if
1403            * unchanged from cmpval, or if this is the unconditional
1404            * FUTEX_REQUEUE operation, wake the waiters.
1405            */
1406           futex_queue_lock2(f, f2);
1407           if (op == FUTEX_CMP_REQUEUE && !futex_test(uaddr, cmpval)) {
1408                     error = EAGAIN;
1409           } else {
1410                     error = 0;
1411                     nwoken_or_requeued = futex_wake(f, nwake, f2, nrequeue,
1412                         FUTEX_BITSET_MATCH_ANY);
1413           }
1414           futex_queue_unlock2(f, f2);
1415 
1416 out:
1417           /* Return the number of waiters woken or requeued.  */
1418           *retval = nwoken_or_requeued;
1419 
1420           /* Release the futexes if we got them.  */
1421           if (f2)
1422                     futex_rele(f2);
1423           if (f)
1424                     futex_rele(f);
1425           return error;
1426 }
1427 
1428 /*
1429  * futex_opcmp_arg(arg)
1430  *
1431  *        arg is either the oparg or cmparg field of a FUTEX_WAKE_OP
1432  *        operation, a 12-bit string in either case.  Map it to a numeric
1433  *        argument value by sign-extending it in two's-complement
1434  *        representation.
1435  */
1436 static int
futex_opcmp_arg(int arg)1437 futex_opcmp_arg(int arg)
1438 {
1439 
1440           KASSERT(arg == (arg & __BITS(11,0)));
1441           return arg - 0x1000*__SHIFTOUT(arg, __BIT(11));
1442 }
1443 
1444 /*
1445  * futex_validate_op_cmp(opcmp)
1446  *
1447  *        Validate an op/cmp argument for FUTEX_WAKE_OP.
1448  */
1449 static int
futex_validate_op_cmp(int opcmp)1450 futex_validate_op_cmp(int opcmp)
1451 {
1452           int op = __SHIFTOUT(opcmp, FUTEX_OP_OP_MASK);
1453           int cmp = __SHIFTOUT(opcmp, FUTEX_OP_CMP_MASK);
1454 
1455           if (op & FUTEX_OP_OPARG_SHIFT) {
1456                     int oparg =
1457                         futex_opcmp_arg(__SHIFTOUT(opcmp, FUTEX_OP_OPARG_MASK));
1458                     if (oparg < 0)
1459                               return EINVAL;
1460                     if (oparg >= 32)
1461                               return EINVAL;
1462                     op &= ~FUTEX_OP_OPARG_SHIFT;
1463           }
1464 
1465           switch (op) {
1466           case FUTEX_OP_SET:
1467           case FUTEX_OP_ADD:
1468           case FUTEX_OP_OR:
1469           case FUTEX_OP_ANDN:
1470           case FUTEX_OP_XOR:
1471                     break;
1472           default:
1473                     return EINVAL;
1474           }
1475 
1476           switch (cmp) {
1477           case FUTEX_OP_CMP_EQ:
1478           case FUTEX_OP_CMP_NE:
1479           case FUTEX_OP_CMP_LT:
1480           case FUTEX_OP_CMP_LE:
1481           case FUTEX_OP_CMP_GT:
1482           case FUTEX_OP_CMP_GE:
1483                     break;
1484           default:
1485                     return EINVAL;
1486           }
1487 
1488           return 0;
1489 }
1490 
1491 /*
1492  * futex_compute_op(oldval, opcmp)
1493  *
1494  *        Apply a FUTEX_WAKE_OP operation to oldval.
1495  */
1496 static int
futex_compute_op(int oldval,int opcmp)1497 futex_compute_op(int oldval, int opcmp)
1498 {
1499           int op = __SHIFTOUT(opcmp, FUTEX_OP_OP_MASK);
1500           int oparg = futex_opcmp_arg(__SHIFTOUT(opcmp, FUTEX_OP_OPARG_MASK));
1501 
1502           if (op & FUTEX_OP_OPARG_SHIFT) {
1503                     KASSERT(oparg >= 0);
1504                     KASSERT(oparg < 32);
1505                     oparg = 1u << oparg;
1506                     op &= ~FUTEX_OP_OPARG_SHIFT;
1507           }
1508 
1509           switch (op) {
1510           case FUTEX_OP_SET:
1511                     return oparg;
1512 
1513           case FUTEX_OP_ADD:
1514                     /*
1515                      * Avoid signed arithmetic overflow by doing
1516                      * arithmetic unsigned and converting back to signed
1517                      * at the end.
1518                      */
1519                     return (int)((unsigned)oldval + (unsigned)oparg);
1520 
1521           case FUTEX_OP_OR:
1522                     return oldval | oparg;
1523 
1524           case FUTEX_OP_ANDN:
1525                     return oldval & ~oparg;
1526 
1527           case FUTEX_OP_XOR:
1528                     return oldval ^ oparg;
1529 
1530           default:
1531                     panic("invalid futex op");
1532           }
1533 }
1534 
1535 /*
1536  * futex_compute_cmp(oldval, opcmp)
1537  *
1538  *        Apply a FUTEX_WAKE_OP comparison to oldval.
1539  */
1540 static bool
futex_compute_cmp(int oldval,int opcmp)1541 futex_compute_cmp(int oldval, int opcmp)
1542 {
1543           int cmp = __SHIFTOUT(opcmp, FUTEX_OP_CMP_MASK);
1544           int cmparg = futex_opcmp_arg(__SHIFTOUT(opcmp, FUTEX_OP_CMPARG_MASK));
1545 
1546           switch (cmp) {
1547           case FUTEX_OP_CMP_EQ:
1548                     return (oldval == cmparg);
1549 
1550           case FUTEX_OP_CMP_NE:
1551                     return (oldval != cmparg);
1552 
1553           case FUTEX_OP_CMP_LT:
1554                     return (oldval < cmparg);
1555 
1556           case FUTEX_OP_CMP_LE:
1557                     return (oldval <= cmparg);
1558 
1559           case FUTEX_OP_CMP_GT:
1560                     return (oldval > cmparg);
1561 
1562           case FUTEX_OP_CMP_GE:
1563                     return (oldval >= cmparg);
1564 
1565           default:
1566                     panic("invalid futex cmp operation");
1567           }
1568 }
1569 
1570 /*
1571  * futex_func_wake_op(uaddr, nwake@val, uaddr2, nwake2@val2, opcmp@val3,
1572  *     retval)
1573  *
1574  *        Implement futex(FUTEX_WAKE_OP):
1575  *
1576  *        1. Update *uaddr2 according to the r/m/w operation specified in
1577  *           opcmp.
1578  *
1579  *        2. If uaddr is nonnull, wake up to nwake waiters at uaddr.
1580  *
1581  *        3. If what was previously at *uaddr2 matches the comparison
1582  *           operation specified in opcmp, additionally wake up to nwake2
1583  *           waiters at uaddr2.
1584  */
1585 static int
futex_func_wake_op(bool shared,int * uaddr,int nwake,int * uaddr2,int nwake2,int opcmp,register_t * retval)1586 futex_func_wake_op(bool shared, int *uaddr, int nwake, int *uaddr2, int nwake2,
1587     int opcmp, register_t *retval)
1588 {
1589           struct futex *f = NULL, *f2 = NULL;
1590           int oldval, newval, actual;
1591           unsigned nwoken = 0;
1592           int error;
1593 
1594           /* Reject negative number of wakeups.  */
1595           if (nwake < 0 || nwake2 < 0) {
1596                     error = EINVAL;
1597                     goto out;
1598           }
1599 
1600           /* Reject invalid operations before we start doing things.  */
1601           if ((error = futex_validate_op_cmp(opcmp)) != 0)
1602                     goto out;
1603 
1604           /* Look up the first futex, if any.  */
1605           error = futex_lookup(uaddr, shared, &f);
1606           if (error)
1607                     goto out;
1608 
1609           /* Look up the second futex, if any.  */
1610           error = futex_lookup(uaddr2, shared, &f2);
1611           if (error)
1612                     goto out;
1613 
1614           /*
1615            * Under the queue locks:
1616            *
1617            * 1. Read/modify/write: *uaddr2 op= oparg, as in opcmp.
1618            * 2. Unconditionally wake uaddr.
1619            * 3. Conditionally wake uaddr2, if it previously matched the
1620            *    comparison in opcmp.
1621            */
1622           futex_queue_lock2(f, f2);
1623           do {
1624                     error = futex_load(uaddr2, &oldval);
1625                     if (error)
1626                               goto out_unlock;
1627                     newval = futex_compute_op(oldval, opcmp);
1628                     error = ucas_int(uaddr2, oldval, newval, &actual);
1629                     if (error)
1630                               goto out_unlock;
1631           } while (actual != oldval);
1632           if (f == NULL) {
1633                     nwoken = 0;
1634           } else {
1635                     nwoken = futex_wake(f, nwake, NULL, /*nrequeue*/0,
1636                         FUTEX_BITSET_MATCH_ANY);
1637           }
1638           if (f2 && futex_compute_cmp(oldval, opcmp)) {
1639                     nwoken += futex_wake(f2, nwake2, NULL, /*nrequeue*/0,
1640                         FUTEX_BITSET_MATCH_ANY);
1641           }
1642 
1643           /* Success! */
1644           error = 0;
1645 out_unlock:
1646           futex_queue_unlock2(f, f2);
1647 
1648 out:
1649           /* Return the number of waiters woken. */
1650           *retval = nwoken;
1651 
1652           /* Release the futexes, if we got them. */
1653           if (f2)
1654                     futex_rele(f2);
1655           if (f)
1656                     futex_rele(f);
1657           return error;
1658 }
1659 
1660 /*
1661  * do_futex(uaddr, op, val, timeout, uaddr2, val2, val3)
1662  *
1663  *        Implement the futex system call with all the parameters
1664  *        parsed out.
1665  */
1666 int
do_futex(int * uaddr,int op,int val,const struct timespec * timeout,int * uaddr2,int val2,int val3,register_t * retval)1667 do_futex(int *uaddr, int op, int val, const struct timespec *timeout,
1668     int *uaddr2, int val2, int val3, register_t *retval)
1669 {
1670           const bool shared = (op & FUTEX_PRIVATE_FLAG) ? false : true;
1671           const clockid_t clkid = (op & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME
1672                                                                           : CLOCK_MONOTONIC;
1673 
1674           op &= FUTEX_CMD_MASK;
1675 
1676           switch (op) {
1677           case FUTEX_WAIT: {
1678                     const int cmpval = val;
1679                     const int bitset = FUTEX_BITSET_MATCH_ANY;
1680 
1681                     return futex_func_wait(shared, uaddr, cmpval, bitset, timeout,
1682                         clkid, TIMER_RELTIME, retval);
1683           }
1684           case FUTEX_WAKE: {
1685                     const int nwake = val;
1686                     const int bitset = FUTEX_BITSET_MATCH_ANY;
1687 
1688                     return futex_func_wake(shared, uaddr, nwake, bitset, retval);
1689           }
1690           case FUTEX_WAKE_BITSET: {
1691                     const int nwake = val;
1692                     const int bitset = val3;
1693 
1694                     return futex_func_wake(shared, uaddr, nwake, bitset, retval);
1695           }
1696           case FUTEX_REQUEUE:
1697           case FUTEX_CMP_REQUEUE: {
1698                     const int nwake = val;
1699                     const int nrequeue = val2;
1700                     const int cmpval = val3; /* ignored if op=FUTEX_REQUEUE */
1701 
1702                     return futex_func_requeue(shared, op, uaddr, nwake, uaddr2,
1703                         nrequeue, cmpval, retval);
1704           }
1705           case FUTEX_WAIT_BITSET: {
1706                     const int cmpval = val;
1707                     const int bitset = val3;
1708 
1709                     return futex_func_wait(shared, uaddr, cmpval, bitset, timeout,
1710                         clkid, TIMER_ABSTIME, retval);
1711           }
1712           case FUTEX_WAKE_OP: {
1713                     const int nwake = val;
1714                     const int nwake2 = val2;
1715                     const int opcmp = val3;
1716 
1717                     return futex_func_wake_op(shared, uaddr, nwake, uaddr2, nwake2,
1718                         opcmp, retval);
1719           }
1720           case FUTEX_FD:
1721           default:
1722                     return ENOSYS;
1723           }
1724 }
1725 
1726 /*
1727  * sys___futex(l, uap, retval)
1728  *
1729  *        __futex(2) system call: generic futex operations.
1730  */
1731 int
sys___futex(struct lwp * l,const struct sys___futex_args * uap,register_t * retval)1732 sys___futex(struct lwp *l, const struct sys___futex_args *uap,
1733     register_t *retval)
1734 {
1735           /* {
1736                     syscallarg(int *) uaddr;
1737                     syscallarg(int) op;
1738                     syscallarg(int) val;
1739                     syscallarg(const struct timespec *) timeout;
1740                     syscallarg(int *) uaddr2;
1741                     syscallarg(int) val2;
1742                     syscallarg(int) val3;
1743           } */
1744           struct timespec ts, *tsp;
1745           int error;
1746 
1747           /*
1748            * Copy in the timeout argument, if specified.
1749            */
1750           if (SCARG(uap, timeout)) {
1751                     error = copyin(SCARG(uap, timeout), &ts, sizeof(ts));
1752                     if (error)
1753                               return error;
1754                     tsp = &ts;
1755           } else {
1756                     tsp = NULL;
1757           }
1758 
1759           return do_futex(SCARG(uap, uaddr), SCARG(uap, op), SCARG(uap, val),
1760               tsp, SCARG(uap, uaddr2), SCARG(uap, val2), SCARG(uap, val3),
1761               retval);
1762 }
1763 
1764 /*
1765  * sys___futex_set_robust_list(l, uap, retval)
1766  *
1767  *        __futex_set_robust_list(2) system call for robust futexes.
1768  */
1769 int
sys___futex_set_robust_list(struct lwp * l,const struct sys___futex_set_robust_list_args * uap,register_t * retval)1770 sys___futex_set_robust_list(struct lwp *l,
1771     const struct sys___futex_set_robust_list_args *uap, register_t *retval)
1772 {
1773           /* {
1774                     syscallarg(void *) head;
1775                     syscallarg(size_t) len;
1776           } */
1777           void *head = SCARG(uap, head);
1778 
1779           if (SCARG(uap, len) != _FUTEX_ROBUST_HEAD_SIZE)
1780                     return EINVAL;
1781           if ((uintptr_t)head % sizeof(u_long))
1782                     return EINVAL;
1783 
1784           l->l_robust_head = (uintptr_t)head;
1785 
1786           return 0;
1787 }
1788 
1789 /*
1790  * sys___futex_get_robust_list(l, uap, retval)
1791  *
1792  *        __futex_get_robust_list(2) system call for robust futexes.
1793  */
1794 int
sys___futex_get_robust_list(struct lwp * l,const struct sys___futex_get_robust_list_args * uap,register_t * retval)1795 sys___futex_get_robust_list(struct lwp *l,
1796     const struct sys___futex_get_robust_list_args *uap, register_t *retval)
1797 {
1798           /* {
1799                     syscallarg(lwpid_t) lwpid;
1800                     syscallarg(void **) headp;
1801                     syscallarg(size_t *) lenp;
1802           } */
1803           void *head;
1804           const size_t len = _FUTEX_ROBUST_HEAD_SIZE;
1805           int error;
1806 
1807           error = futex_robust_head_lookup(l, SCARG(uap, lwpid), &head);
1808           if (error)
1809                     return error;
1810 
1811           /* Copy out the head pointer and the head structure length. */
1812           error = copyout(&head, SCARG(uap, headp), sizeof(head));
1813           if (__predict_true(error == 0)) {
1814                     error = copyout(&len, SCARG(uap, lenp), sizeof(len));
1815           }
1816 
1817           return error;
1818 }
1819 
1820 /*
1821  * release_futex(uva, tid)
1822  *
1823  *        Try to release the robust futex at uva in the current process
1824  *        on lwp exit.  If anything goes wrong, silently fail.  It is the
1825  *        userland program's obligation to arrange correct behaviour.
1826  */
1827 static void
release_futex(uintptr_t const uptr,lwpid_t const tid,bool const is_pi,bool const is_pending)1828 release_futex(uintptr_t const uptr, lwpid_t const tid, bool const is_pi,
1829     bool const is_pending)
1830 {
1831           int *uaddr;
1832           struct futex *f;
1833           int oldval, newval, actual;
1834           int error;
1835 
1836           /* If it's misaligned, tough.  */
1837           if (__predict_false(uptr & 3))
1838                     return;
1839           uaddr = (int *)uptr;
1840 
1841           error = futex_load(uaddr, &oldval);
1842           if (__predict_false(error))
1843                     return;
1844 
1845           /*
1846            * There are two race conditions we need to handle here:
1847            *
1848            * 1. User space cleared the futex word but died before
1849            *    being able to issue the wakeup.  No wakeups will
1850            *    ever be issued, oops!
1851            *
1852            * 2. Awakened waiter died before being able to acquire
1853            *    the futex in user space.  Any other waiters are
1854            *    now stuck, oops!
1855            *
1856            * In both of these cases, the futex word will be 0 (because
1857            * it's updated before the wake is issued).  The best we can
1858            * do is detect this situation if it's the pending futex and
1859            * issue a wake without modifying the futex word.
1860            *
1861            * XXX eventual PI handling?
1862            */
1863           if (__predict_false(is_pending && (oldval & ~FUTEX_WAITERS) == 0)) {
1864                     register_t retval;
1865                     (void) futex_func_wake(/*shared*/true, uaddr, 1,
1866                         FUTEX_BITSET_MATCH_ANY, &retval);
1867                     return;
1868           }
1869 
1870           /* Optimistically test whether we need to do anything at all.  */
1871           if ((oldval & FUTEX_TID_MASK) != tid)
1872                     return;
1873 
1874           /*
1875            * We need to handle the case where this thread owned the futex,
1876            * but it was uncontended.  In this case, there won't be any
1877            * kernel state to look up.  All we can do is mark the futex
1878            * as a zombie to be mopped up the next time another thread
1879            * attempts to acquire it.
1880            *
1881            * N.B. It's important to ensure to set FUTEX_OWNER_DIED in
1882            * this loop, even if waiters appear while we're are doing
1883            * so.  This is beause FUTEX_WAITERS is set by user space
1884            * before calling __futex() to wait, and the futex needs
1885            * to be marked as a zombie when the new waiter gets into
1886            * the kernel.
1887            */
1888           if ((oldval & FUTEX_WAITERS) == 0) {
1889                     do {
1890                               error = futex_load(uaddr, &oldval);
1891                               if (error)
1892                                         return;
1893                               if ((oldval & FUTEX_TID_MASK) != tid)
1894                                         return;
1895                               newval = oldval | FUTEX_OWNER_DIED;
1896                               error = ucas_int(uaddr, oldval, newval, &actual);
1897                               if (error)
1898                                         return;
1899                     } while (actual != oldval);
1900 
1901                     /*
1902                      * If where is still no indication of waiters, then there is
1903                      * no more work for us to do.
1904                      */
1905                     if ((oldval & FUTEX_WAITERS) == 0)
1906                               return;
1907           }
1908 
1909           /*
1910            * Look for a shared futex since we have no positive indication
1911            * it is private.  If we can't, tough.
1912            */
1913           error = futex_lookup(uaddr, /*shared*/true, &f);
1914           if (error)
1915                     return;
1916 
1917           /*
1918            * If there's no kernel state for this futex, there's nothing to
1919            * release.
1920            */
1921           if (f == NULL)
1922                     return;
1923 
1924           /* Work under the futex queue lock.  */
1925           futex_queue_lock(f);
1926 
1927           /*
1928            * Fetch the word: if the tid doesn't match ours, skip;
1929            * otherwise, set the owner-died bit, atomically.
1930            */
1931           do {
1932                     error = futex_load(uaddr, &oldval);
1933                     if (error)
1934                               goto out;
1935                     if ((oldval & FUTEX_TID_MASK) != tid)
1936                               goto out;
1937                     newval = oldval | FUTEX_OWNER_DIED;
1938                     error = ucas_int(uaddr, oldval, newval, &actual);
1939                     if (error)
1940                               goto out;
1941           } while (actual != oldval);
1942 
1943           /*
1944            * If there may be waiters, try to wake one.  If anything goes
1945            * wrong, tough.
1946            *
1947            * XXX eventual PI handling?
1948            */
1949           if (oldval & FUTEX_WAITERS) {
1950                     (void)futex_wake(f, /*nwake*/1, NULL, /*nrequeue*/0,
1951                         FUTEX_BITSET_MATCH_ANY);
1952           }
1953 
1954           /* Unlock the queue and release the futex.  */
1955 out:      futex_queue_unlock(f);
1956           futex_rele(f);
1957 }
1958 
1959 /*
1960  * futex_robust_head_lookup(l, lwpid)
1961  *
1962  *        Helper function to look up a robust head by LWP ID.
1963  */
1964 int
futex_robust_head_lookup(struct lwp * l,lwpid_t lwpid,void ** headp)1965 futex_robust_head_lookup(struct lwp *l, lwpid_t lwpid, void **headp)
1966 {
1967           struct proc *p = l->l_proc;
1968 
1969           /* Find the other lwp, if requested; otherwise use our robust head.  */
1970           if (lwpid) {
1971                     mutex_enter(p->p_lock);
1972                     l = lwp_find(p, lwpid);
1973                     if (l == NULL) {
1974                               mutex_exit(p->p_lock);
1975                               return ESRCH;
1976                     }
1977                     *headp = (void *)l->l_robust_head;
1978                     mutex_exit(p->p_lock);
1979           } else {
1980                     *headp = (void *)l->l_robust_head;
1981           }
1982           return 0;
1983 }
1984 
1985 /*
1986  * futex_fetch_robust_head(uaddr)
1987  *
1988  *        Helper routine to fetch the futex robust list head that
1989  *        handles 32-bit binaries running on 64-bit kernels.
1990  */
1991 static int
futex_fetch_robust_head(uintptr_t uaddr,u_long * rhead)1992 futex_fetch_robust_head(uintptr_t uaddr, u_long *rhead)
1993 {
1994 #ifdef _LP64
1995           if (curproc->p_flag & PK_32) {
1996                     uint32_t rhead32[_FUTEX_ROBUST_HEAD_NWORDS];
1997                     int error;
1998 
1999                     error = copyin((void *)uaddr, rhead32, sizeof(rhead32));
2000                     if (__predict_true(error == 0)) {
2001                               for (int i = 0; i < _FUTEX_ROBUST_HEAD_NWORDS; i++) {
2002                                         if (i == _FUTEX_ROBUST_HEAD_OFFSET) {
2003                                                   /*
2004                                                    * Make sure the offset is sign-
2005                                                    * extended.
2006                                                    */
2007                                                   rhead[i] = (int32_t)rhead32[i];
2008                                         } else {
2009                                                   rhead[i] = rhead32[i];
2010                                         }
2011                               }
2012                     }
2013                     return error;
2014           }
2015 #endif /* _L64 */
2016 
2017           return copyin((void *)uaddr, rhead,
2018               sizeof(*rhead) * _FUTEX_ROBUST_HEAD_NWORDS);
2019 }
2020 
2021 /*
2022  * futex_decode_robust_word(word)
2023  *
2024  *        Decode a robust futex list word into the entry and entry
2025  *        properties.
2026  */
2027 static inline void
futex_decode_robust_word(uintptr_t const word,uintptr_t * const entry,bool * const is_pi)2028 futex_decode_robust_word(uintptr_t const word, uintptr_t * const entry,
2029     bool * const is_pi)
2030 {
2031           *is_pi = (word & _FUTEX_ROBUST_ENTRY_PI) ? true : false;
2032           *entry = word & ~_FUTEX_ROBUST_ENTRY_PI;
2033 }
2034 
2035 /*
2036  * futex_fetch_robust_entry(uaddr)
2037  *
2038  *        Helper routine to fetch and decode a robust futex entry
2039  *        that handles 32-bit binaries running on 64-bit kernels.
2040  */
2041 static int
futex_fetch_robust_entry(uintptr_t const uaddr,uintptr_t * const valp,bool * const is_pi)2042 futex_fetch_robust_entry(uintptr_t const uaddr, uintptr_t * const valp,
2043     bool * const is_pi)
2044 {
2045           uintptr_t val = 0;
2046           int error = 0;
2047 
2048 #ifdef _LP64
2049           if (curproc->p_flag & PK_32) {
2050                     uint32_t val32;
2051 
2052                     error = ufetch_32((uint32_t *)uaddr, &val32);
2053                     if (__predict_true(error == 0))
2054                               val = val32;
2055           } else
2056 #endif /* _LP64 */
2057                     error = ufetch_long((u_long *)uaddr, (u_long *)&val);
2058           if (__predict_false(error))
2059                     return error;
2060 
2061           futex_decode_robust_word(val, valp, is_pi);
2062           return 0;
2063 }
2064 
2065 /*
2066  * futex_release_all_lwp(l, tid)
2067  *
2068  *        Release all l's robust futexes.  If anything looks funny in
2069  *        the process, give up -- it's userland's responsibility to dot
2070  *        the i's and cross the t's.
2071  */
2072 void
futex_release_all_lwp(struct lwp * const l)2073 futex_release_all_lwp(struct lwp * const l)
2074 {
2075           u_long rhead[_FUTEX_ROBUST_HEAD_NWORDS];
2076           int limit = 1000000;
2077           int error;
2078 
2079           /* If there's no robust list there's nothing to do. */
2080           if (l->l_robust_head == 0)
2081                     return;
2082 
2083           KASSERT((l->l_lid & FUTEX_TID_MASK) == l->l_lid);
2084 
2085           /* Read the final snapshot of the robust list head. */
2086           error = futex_fetch_robust_head(l->l_robust_head, rhead);
2087           if (error) {
2088                     printf("WARNING: pid %jd (%s) lwp %jd:"
2089                         " unmapped robust futex list head\n",
2090                         (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2091                         (uintmax_t)l->l_lid);
2092                     return;
2093           }
2094 
2095           const long offset = (long)rhead[_FUTEX_ROBUST_HEAD_OFFSET];
2096 
2097           uintptr_t next, pending;
2098           bool is_pi, pending_is_pi;
2099 
2100           futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_LIST],
2101               &next, &is_pi);
2102           futex_decode_robust_word(rhead[_FUTEX_ROBUST_HEAD_PENDING],
2103               &pending, &pending_is_pi);
2104 
2105           /*
2106            * Walk down the list of locked futexes and release them, up
2107            * to one million of them before we give up.
2108            */
2109 
2110           while (next != l->l_robust_head && limit-- > 0) {
2111                     /* pending handled below. */
2112                     if (next != pending)
2113                               release_futex(next + offset, l->l_lid, is_pi, false);
2114                     error = futex_fetch_robust_entry(next, &next, &is_pi);
2115                     if (error)
2116                               break;
2117                     preempt_point();
2118           }
2119           if (limit <= 0) {
2120                     printf("WARNING: pid %jd (%s) lwp %jd:"
2121                         " exhausted robust futex limit\n",
2122                         (uintmax_t)l->l_proc->p_pid, l->l_proc->p_comm,
2123                         (uintmax_t)l->l_lid);
2124           }
2125 
2126           /* If there's a pending futex, it may need to be released too. */
2127           if (pending != 0) {
2128                     release_futex(pending + offset, l->l_lid, pending_is_pi, true);
2129           }
2130 }
2131