1 /*      $NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $      */
2 
3 /*
4  * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __KERNEL_RCSID(0, "$NetBSD: scheduler.c,v 1.55 2023/10/05 19:41:07 ad Exp $");
30 
31 #include <sys/param.h>
32 #include <sys/atomic.h>
33 #include <sys/cpu.h>
34 #include <sys/kmem.h>
35 #include <sys/mutex.h>
36 #include <sys/namei.h>
37 #include <sys/queue.h>
38 #include <sys/select.h>
39 #include <sys/systm.h>
40 
41 #include <rump-sys/kern.h>
42 
43 #include <rump/rumpuser.h>
44 
45 static struct rumpcpu {
46           /* needed in fastpath */
47           struct cpu_info *rcpu_ci;
48           void *rcpu_prevlwp;
49 
50           /* needed in slowpath */
51           struct rumpuser_mtx *rcpu_mtx;
52           struct rumpuser_cv *rcpu_cv;
53           int rcpu_wanted;
54 
55           /* offset 20 (P=4) or 36 (P=8) here */
56 
57           /*
58            * Some stats.  Not really that necessary, but we should
59            * have room.  Note that these overflow quite fast, so need
60            * to be collected often.
61            */
62           unsigned int rcpu_fastpath;
63           unsigned int rcpu_slowpath;
64           unsigned int rcpu_migrated;
65 
66           /* offset 32 (P=4) or 50 (P=8) */
67 
68           int rcpu_align[0] __aligned(CACHE_LINE_SIZE);
69 } rcpu_storage[MAXCPUS];
70 
71 static inline struct rumpcpu *
cpuinfo_to_rumpcpu(struct cpu_info * ci)72 cpuinfo_to_rumpcpu(struct cpu_info *ci)
73 {
74 
75           return &rcpu_storage[cpu_index(ci)];
76 }
77 
78 struct cpu_info rump_bootcpu;
79 
80 #define RCPULWP_BUSY          ((void *)-1)
81 #define RCPULWP_WANTED        ((void *)-2)
82 
83 static struct rumpuser_mtx *lwp0mtx;
84 static struct rumpuser_cv *lwp0cv;
85 static unsigned nextcpu;
86 
87 kmutex_t unruntime_lock; /* unruntime lwp lock.  practically unused */
88 
89 static bool lwp0isbusy = false;
90 
91 /*
92  * Keep some stats.
93  *
94  * Keeping track of there is not really critical for speed, unless
95  * stats happen to be on a different cache line (CACHE_LINE_SIZE is
96  * really just a coarse estimate), so default for the performant case
97  * (i.e. no stats).
98  */
99 #ifdef RUMPSCHED_STATS
100 #define SCHED_FASTPATH(rcpu) rcpu->rcpu_fastpath++;
101 #define SCHED_SLOWPATH(rcpu) rcpu->rcpu_slowpath++;
102 #define SCHED_MIGRATED(rcpu) rcpu->rcpu_migrated++;
103 #else
104 #define SCHED_FASTPATH(rcpu)
105 #define SCHED_SLOWPATH(rcpu)
106 #define SCHED_MIGRATED(rcpu)
107 #endif
108 
109 struct cpu_info *
cpu_lookup(u_int index)110 cpu_lookup(u_int index)
111 {
112 
113           return rcpu_storage[index].rcpu_ci;
114 }
115 
116 static inline struct rumpcpu *
getnextcpu(void)117 getnextcpu(void)
118 {
119           unsigned newcpu;
120 
121           newcpu = atomic_inc_uint_nv(&nextcpu);
122           if (__predict_false(ncpu > UINT_MAX/2))
123                     atomic_and_uint(&nextcpu, 0);
124           newcpu = newcpu % ncpu;
125 
126           return &rcpu_storage[newcpu];
127 }
128 
129 /* this could/should be mi_attach_cpu? */
130 void
rump_cpus_bootstrap(int * nump)131 rump_cpus_bootstrap(int *nump)
132 {
133           int num = *nump;
134 
135           if (num > MAXCPUS) {
136                     aprint_verbose("CPU limit: %d wanted, %d (MAXCPUS) "
137                         "available (adjusted)\n", num, MAXCPUS);
138                     num = MAXCPUS;
139           }
140 
141           cpu_setmodel("rumpcore (virtual)");
142 
143           mi_cpu_init();
144 
145           /* attach first cpu for bootstrap */
146           rump_cpu_attach(&rump_bootcpu);
147           ncpu = 1;
148           *nump = num;
149 }
150 
151 void
rump_scheduler_init(int numcpu)152 rump_scheduler_init(int numcpu)
153 {
154           struct rumpcpu *rcpu;
155           struct cpu_info *ci;
156           int i;
157 
158           rumpuser_mutex_init(&lwp0mtx, RUMPUSER_MTX_SPIN);
159           rumpuser_cv_init(&lwp0cv);
160           for (i = 0; i < numcpu; i++) {
161                     if (i == 0) {
162                               ci = &rump_bootcpu;
163                     } else {
164                               ci = kmem_zalloc(sizeof(*ci), KM_SLEEP);
165                               ci->ci_index = i;
166                     }
167 
168                     rcpu = &rcpu_storage[i];
169                     rcpu->rcpu_ci = ci;
170                     rcpu->rcpu_wanted = 0;
171                     rumpuser_cv_init(&rcpu->rcpu_cv);
172                     rumpuser_mutex_init(&rcpu->rcpu_mtx, RUMPUSER_MTX_SPIN);
173 
174                     ci->ci_schedstate.spc_mutex =
175                         mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
176                     ci->ci_schedstate.spc_flags = SPCF_RUNNING;
177           }
178 
179           mutex_init(&unruntime_lock, MUTEX_DEFAULT, IPL_SCHED);
180 }
181 
182 void
rump_schedlock_cv_signal(struct cpu_info * ci,struct rumpuser_cv * cv)183 rump_schedlock_cv_signal(struct cpu_info *ci, struct rumpuser_cv *cv)
184 {
185           struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(ci);
186 
187           rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
188           rumpuser_cv_signal(cv);
189           rumpuser_mutex_exit(rcpu->rcpu_mtx);
190 }
191 
192 /*
193  * condvar ops using scheduler lock as the rumpuser interlock.
194  */
195 void
rump_schedlock_cv_wait(struct rumpuser_cv * cv)196 rump_schedlock_cv_wait(struct rumpuser_cv *cv)
197 {
198           struct lwp *l = curlwp;
199           struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);
200 
201           /* mutex will be taken and released in cpu schedule/unschedule */
202           rumpuser_cv_wait(cv, rcpu->rcpu_mtx);
203 }
204 
205 int
rump_schedlock_cv_timedwait(struct rumpuser_cv * cv,const struct timespec * ts)206 rump_schedlock_cv_timedwait(struct rumpuser_cv *cv, const struct timespec *ts)
207 {
208           struct lwp *l = curlwp;
209           struct rumpcpu *rcpu = cpuinfo_to_rumpcpu(l->l_cpu);
210 
211           /* mutex will be taken and released in cpu schedule/unschedule */
212           return rumpuser_cv_timedwait(cv, rcpu->rcpu_mtx,
213               ts->tv_sec, ts->tv_nsec);
214 }
215 
216 static void
lwp0busy(void)217 lwp0busy(void)
218 {
219 
220           /* busy lwp0 */
221           KASSERT(curlwp == NULL || curlwp->l_stat != LSONPROC);
222           rumpuser_mutex_enter_nowrap(lwp0mtx);
223           while (lwp0isbusy)
224                     rumpuser_cv_wait_nowrap(lwp0cv, lwp0mtx);
225           lwp0isbusy = true;
226           rumpuser_mutex_exit(lwp0mtx);
227 }
228 
229 static void
lwp0rele(void)230 lwp0rele(void)
231 {
232 
233           rumpuser_mutex_enter_nowrap(lwp0mtx);
234           KASSERT(lwp0isbusy == true);
235           lwp0isbusy = false;
236           rumpuser_cv_signal(lwp0cv);
237           rumpuser_mutex_exit(lwp0mtx);
238 }
239 
240 /*
241  * rump_schedule: ensure that the calling host thread has a valid lwp context.
242  * ie. ensure that curlwp != NULL.  Also, ensure that there
243  * a 1:1 mapping between the lwp and rump kernel cpu.
244  */
245 void
rump_schedule()246 rump_schedule()
247 {
248           struct lwp *l;
249 
250           /*
251            * If there is no dedicated lwp, allocate a temp one and
252            * set it to be free'd upon unschedule().  Use lwp0 context
253            * for reserving the necessary resources.  Don't optimize
254            * for this case -- anyone who cares about performance will
255            * start a real thread.
256            */
257           if (__predict_true((l = curlwp) != NULL)) {
258                     struct proc *p = l->l_proc;
259                     rump_schedule_cpu(l);
260                     if (l->l_cred != p->p_cred) {
261                               kauth_cred_t oc = l->l_cred;
262                               mutex_enter(p->p_lock);
263                               l->l_cred = kauth_cred_hold(p->p_cred);
264                               mutex_exit(p->p_lock);
265                               kauth_cred_free(oc);
266                     }
267           } else {
268                     lwp0busy();
269 
270                     /* schedule cpu and use lwp0 */
271                     rump_schedule_cpu(&lwp0);
272                     rump_lwproc_curlwp_set(&lwp0);
273 
274                     /* allocate thread, switch to it, and release lwp0 */
275                     l = rump__lwproc_alloclwp(initproc);
276                     rump_lwproc_switch(l);
277                     lwp0rele();
278 
279                     /*
280                      * mark new thread dead-on-unschedule.  this
281                      * means that we'll be running with l_refcnt == 0.
282                      * relax, it's fine.
283                      */
284                     rump_lwproc_releaselwp();
285           }
286 }
287 
288 void
rump_schedule_cpu(struct lwp * l)289 rump_schedule_cpu(struct lwp *l)
290 {
291 
292           rump_schedule_cpu_interlock(l, NULL);
293 }
294 
295 /*
296  * Schedule a CPU.  This optimizes for the case where we schedule
297  * the same thread often, and we have nCPU >= nFrequently-Running-Thread
298  * (where CPU is virtual rump cpu, not host CPU).
299  */
300 void
rump_schedule_cpu_interlock(struct lwp * l,void * interlock)301 rump_schedule_cpu_interlock(struct lwp *l, void *interlock)
302 {
303           struct rumpcpu *rcpu;
304           struct cpu_info *ci;
305           void *old;
306           bool domigrate;
307           bool bound = l->l_pflag & LP_BOUND;
308 
309           l->l_stat = LSRUN;
310 
311           /*
312            * First, try fastpath: if we were the previous user of the
313            * CPU, everything is in order cachewise and we can just
314            * proceed to use it.
315            *
316            * If we are a different thread (i.e. CAS fails), we must go
317            * through a memory barrier to ensure we get a truthful
318            * view of the world.
319            */
320 
321           KASSERT(l->l_target_cpu != NULL);
322           rcpu = cpuinfo_to_rumpcpu(l->l_target_cpu);
323           if (atomic_cas_ptr(&rcpu->rcpu_prevlwp, l, RCPULWP_BUSY) == l) {
324                     if (interlock == rcpu->rcpu_mtx)
325                               rumpuser_mutex_exit(rcpu->rcpu_mtx);
326                     SCHED_FASTPATH(rcpu);
327                     /* jones, you're the man */
328                     goto fastlane;
329           }
330 
331           /*
332            * Else, it's the slowpath for us.  First, determine if we
333            * can migrate.
334            */
335           if (ncpu == 1)
336                     domigrate = false;
337           else
338                     domigrate = true;
339 
340           /* Take lock.  This acts as a load barrier too. */
341           if (interlock != rcpu->rcpu_mtx)
342                     rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
343 
344           for (;;) {
345                     SCHED_SLOWPATH(rcpu);
346                     old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, RCPULWP_WANTED);
347 
348                     /* CPU is free? */
349                     if (old != RCPULWP_BUSY && old != RCPULWP_WANTED) {
350                               if (atomic_cas_ptr(&rcpu->rcpu_prevlwp,
351                                   RCPULWP_WANTED, RCPULWP_BUSY) == RCPULWP_WANTED) {
352                                         break;
353                               }
354                     }
355 
356                     /*
357                      * Do we want to migrate once?
358                      * This may need a slightly better algorithm, or we
359                      * might cache pingpong eternally for non-frequent
360                      * threads.
361                      */
362                     if (domigrate && !bound) {
363                               domigrate = false;
364                               SCHED_MIGRATED(rcpu);
365                               rumpuser_mutex_exit(rcpu->rcpu_mtx);
366                               rcpu = getnextcpu();
367                               rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
368                               continue;
369                     }
370 
371                     /* Want CPU, wait until it's released an retry */
372                     rcpu->rcpu_wanted++;
373                     rumpuser_cv_wait_nowrap(rcpu->rcpu_cv, rcpu->rcpu_mtx);
374                     rcpu->rcpu_wanted--;
375           }
376           rumpuser_mutex_exit(rcpu->rcpu_mtx);
377 
378  fastlane:
379           ci = rcpu->rcpu_ci;
380           l->l_cpu = l->l_target_cpu = ci;
381           l->l_mutex = rcpu->rcpu_ci->ci_schedstate.spc_mutex;
382           l->l_ru.ru_nvcsw++;
383           l->l_stat = LSONPROC;
384 
385           /*
386            * No interrupts, so ci_curlwp === cpu_onproc.
387            * Okay, we could make an attempt to not set cpu_onproc
388            * in the case that an interrupt is scheduled immediately
389            * after a user proc, but leave that for later.
390            */
391           ci->ci_curlwp = ci->ci_onproc = l;
392 }
393 
394 void
rump_unschedule()395 rump_unschedule()
396 {
397           struct lwp *l = curlwp;
398 #ifdef DIAGNOSTIC
399           int nlock;
400 
401           KERNEL_UNLOCK_ALL(l, &nlock);
402           KASSERT(nlock == 0);
403 #endif
404 
405           KASSERT(l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex);
406           rump_unschedule_cpu(l);
407           l->l_mutex = &unruntime_lock;
408           l->l_stat = LSSTOP;
409 
410           /*
411            * Check special conditions:
412            *  1) do we need to free the lwp which just unscheduled?
413            *     (locking order: lwp0, cpu)
414            *  2) do we want to clear curlwp for the current host thread
415            */
416           if (__predict_false(l->l_flag & LW_WEXIT)) {
417                     lwp0busy();
418 
419                     /* Now that we have lwp0, we can schedule a CPU again */
420                     rump_schedule_cpu(l);
421 
422                     /* switch to lwp0.  this frees the old thread */
423                     KASSERT(l->l_flag & LW_WEXIT);
424                     rump_lwproc_switch(&lwp0);
425 
426                     /* release lwp0 */
427                     rump_unschedule_cpu(&lwp0);
428                     lwp0.l_mutex = &unruntime_lock;
429                     lwp0.l_pflag &= ~LP_RUNNING;
430                     lwp0rele();
431                     rump_lwproc_curlwp_clear(&lwp0);
432 
433           } else if (__predict_false(l->l_flag & LW_RUMP_CLEAR)) {
434                     rump_lwproc_curlwp_clear(l);
435                     l->l_flag &= ~LW_RUMP_CLEAR;
436           }
437 }
438 
439 void
rump_unschedule_cpu(struct lwp * l)440 rump_unschedule_cpu(struct lwp *l)
441 {
442 
443           rump_unschedule_cpu_interlock(l, NULL);
444 }
445 
446 void
rump_unschedule_cpu_interlock(struct lwp * l,void * interlock)447 rump_unschedule_cpu_interlock(struct lwp *l, void *interlock)
448 {
449 
450           if ((l->l_pflag & LP_INTR) == 0)
451                     rump_softint_run(l->l_cpu);
452           rump_unschedule_cpu1(l, interlock);
453 }
454 
455 void
rump_unschedule_cpu1(struct lwp * l,void * interlock)456 rump_unschedule_cpu1(struct lwp *l, void *interlock)
457 {
458           struct rumpcpu *rcpu;
459           struct cpu_info *ci;
460           void *old;
461 
462           ci = l->l_cpu;
463           ci->ci_curlwp = ci->ci_onproc = NULL;
464           rcpu = cpuinfo_to_rumpcpu(ci);
465 
466           KASSERT(rcpu->rcpu_ci == ci);
467 
468           /*
469            * Make sure all stores are seen before the CPU release.  This
470            * is relevant only in the non-fastpath scheduling case, but
471            * we don't know here if that's going to happen, so need to
472            * expect the worst.
473            *
474            * If the scheduler interlock was requested by the caller, we
475            * need to obtain it before we release the CPU.  Otherwise, we risk a
476            * race condition where another thread is scheduled onto the
477            * rump kernel CPU before our current thread can
478            * grab the interlock.
479            */
480           if (interlock == rcpu->rcpu_mtx)
481                     rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
482           else
483                     membar_release(); /* XXX what does this pair with? */
484 
485           /* Release the CPU. */
486           old = atomic_swap_ptr(&rcpu->rcpu_prevlwp, l);
487 
488           /* No waiters?  No problems.  We're outta here. */
489           if (old == RCPULWP_BUSY) {
490                     return;
491           }
492 
493           KASSERT(old == RCPULWP_WANTED);
494 
495           /*
496            * Ok, things weren't so snappy.
497            *
498            * Snailpath: take lock and signal anyone waiting for this CPU.
499            */
500 
501           if (interlock != rcpu->rcpu_mtx)
502                     rumpuser_mutex_enter_nowrap(rcpu->rcpu_mtx);
503           if (rcpu->rcpu_wanted)
504                     rumpuser_cv_broadcast(rcpu->rcpu_cv);
505           if (interlock != rcpu->rcpu_mtx)
506                     rumpuser_mutex_exit(rcpu->rcpu_mtx);
507 }
508 
509 /* Give up and retake CPU (perhaps a different one) */
510 void
yield()511 yield()
512 {
513           struct lwp *l = curlwp;
514           int nlocks;
515 
516           KERNEL_UNLOCK_ALL(l, &nlocks);
517           rump_unschedule_cpu(l);
518           rump_schedule_cpu(l);
519           KERNEL_LOCK(nlocks, l);
520 }
521 
522 void
preempt()523 preempt()
524 {
525 
526           yield();
527 }
528 
529 bool
kpreempt(uintptr_t where)530 kpreempt(uintptr_t where)
531 {
532 
533           return false;
534 }
535 
536 /*
537  * There is no kernel thread preemption in rump currently.  But call
538  * the implementing macros anyway in case they grow some side-effects
539  * down the road.
540  */
541 void
kpreempt_disable(void)542 kpreempt_disable(void)
543 {
544 
545           KPREEMPT_DISABLE(curlwp);
546 }
547 
548 void
kpreempt_enable(void)549 kpreempt_enable(void)
550 {
551 
552           KPREEMPT_ENABLE(curlwp);
553 }
554 
555 bool
kpreempt_disabled(void)556 kpreempt_disabled(void)
557 {
558 #if 0
559           const lwp_t *l = curlwp;
560 
561           return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
562               (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
563 #endif
564           /* XXX: emulate cpu_kpreempt_disabled() */
565           return true;
566 }
567 
568 void
suspendsched(void)569 suspendsched(void)
570 {
571 
572           /*
573            * Could wait until everyone is out and block further entries,
574            * but skip that for now.
575            */
576 }
577 
578 void
sched_nice(struct proc * p,int level)579 sched_nice(struct proc *p, int level)
580 {
581 
582           /* nothing to do for now */
583 }
584 
585 void
setrunnable(struct lwp * l)586 setrunnable(struct lwp *l)
587 {
588 
589           sched_enqueue(l);
590 }
591 
592 void
sched_enqueue(struct lwp * l)593 sched_enqueue(struct lwp *l)
594 {
595 
596           rump_thread_allow(l);
597 }
598 
599 void
sched_resched_cpu(struct cpu_info * ci,pri_t pri,bool unlock)600 sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
601 {
602 
603 }
604 
605 void
sched_resched_lwp(struct lwp * l,bool unlock)606 sched_resched_lwp(struct lwp *l, bool unlock)
607 {
608 
609 }
610 
611 void
sched_dequeue(struct lwp * l)612 sched_dequeue(struct lwp *l)
613 {
614 
615           panic("sched_dequeue not implemented");
616 }
617 
618 void
preempt_point(void)619 preempt_point(void)
620 {
621 
622 }
623 
624 bool
preempt_needed(void)625 preempt_needed(void)
626 {
627 
628           return false;
629 }
630