1 /*        $NetBSD: sys_pset.c,v 1.24 2020/05/23 23:42:43 ad Exp $     */
2 
3 /*
4  * Copyright (c) 2008, Mindaugas Rasiukevicius <rmind at NetBSD org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * Implementation of the Processor Sets.
31  *
32  * Locking
 *  The array of the processor-set structures and its members are protected
 *  by the global cpu_lock.  Note that the scheduler may read an LWP's
 *  l_psid value without holding that lock.
36  */
37 
38 #include <sys/cdefs.h>
39 __KERNEL_RCSID(0, "$NetBSD: sys_pset.c,v 1.24 2020/05/23 23:42:43 ad Exp $");
40 
41 #include <sys/param.h>
42 
43 #include <sys/cpu.h>
44 #include <sys/kauth.h>
45 #include <sys/kmem.h>
46 #include <sys/lwp.h>
47 #include <sys/mutex.h>
48 #include <sys/proc.h>
49 #include <sys/pset.h>
50 #include <sys/sched.h>
51 #include <sys/syscallargs.h>
52 #include <sys/sysctl.h>
53 #include <sys/systm.h>
54 #include <sys/types.h>
55 
56 static pset_info_t **         psets;
57 static u_int                  psets_max;
58 static u_int                  psets_count;
59 static kauth_listener_t       psets_listener;
60 
61 static int          psets_realloc(int);
62 static int          psid_validate(psetid_t, bool);
63 static int          kern_pset_create(psetid_t *);
64 static int          kern_pset_destroy(psetid_t);
65 
66 static int
psets_listener_cb(kauth_cred_t cred,kauth_action_t action,void * cookie,void * arg0,void * arg1,void * arg2,void * arg3)67 psets_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
68     void *arg0, void *arg1, void *arg2, void *arg3)
69 {
70           psetid_t id;
71           enum kauth_system_req req;
72           int result;
73 
74           result = KAUTH_RESULT_DEFER;
75           req = (enum kauth_system_req)(uintptr_t)arg0;
76           id = (psetid_t)(uintptr_t)arg1;
77 
78           if (action != KAUTH_SYSTEM_PSET)
79                     return result;
80 
81           if ((req == KAUTH_REQ_SYSTEM_PSET_ASSIGN) ||
82               (req == KAUTH_REQ_SYSTEM_PSET_BIND)) {
83                     if (id == PS_QUERY)
84                               result = KAUTH_RESULT_ALLOW;
85           }
86 
87           return result;
88 }
89 
90 /*
91  * Initialization of the processor-sets.
92  */
93 void
psets_init(void)94 psets_init(void)
95 {
96 
97           psets_max = uimax(maxcpus, 32);
98           psets = kmem_zalloc(psets_max * sizeof(void *), KM_SLEEP);
99           psets_count = 0;
100 
101           psets_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
102               psets_listener_cb, NULL);
103 }
104 
105 /*
106  * Reallocate the array of the processor-set structures.
107  */
108 static int
psets_realloc(int new_psets_max)109 psets_realloc(int new_psets_max)
110 {
111           pset_info_t **new_psets, **old_psets;
112           const u_int newsize = new_psets_max * sizeof(void *);
113           u_int i, oldsize;
114 
115           if (new_psets_max < 1)
116                     return EINVAL;
117 
118           new_psets = kmem_zalloc(newsize, KM_SLEEP);
119           mutex_enter(&cpu_lock);
120           old_psets = psets;
121           oldsize = psets_max * sizeof(void *);
122 
123           /* Check if we can lower the size of the array */
124           if (new_psets_max < psets_max) {
125                     for (i = new_psets_max; i < psets_max; i++) {
126                               if (psets[i] == NULL)
127                                         continue;
128                               mutex_exit(&cpu_lock);
129                               kmem_free(new_psets, newsize);
130                               return EBUSY;
131                     }
132           }
133 
134           /* Copy all pointers to the new array */
135           memcpy(new_psets, psets, newsize);
136           psets_max = new_psets_max;
137           psets = new_psets;
138           mutex_exit(&cpu_lock);
139 
140           kmem_free(old_psets, oldsize);
141           return 0;
142 }
143 
144 /*
145  * Validate processor-set ID.
146  */
147 static int
psid_validate(psetid_t psid,bool chkps)148 psid_validate(psetid_t psid, bool chkps)
149 {
150 
151           KASSERT(mutex_owned(&cpu_lock));
152 
153           if (chkps && (psid == PS_NONE || psid == PS_QUERY || psid == PS_MYID))
154                     return 0;
155           if (psid <= 0 || psid > psets_max)
156                     return EINVAL;
157           if (psets[psid - 1] == NULL)
158                     return EINVAL;
159 
160           return 0;
161 }
162 
163 /*
164  * Create a processor-set.
165  */
166 static int
kern_pset_create(psetid_t * psid)167 kern_pset_create(psetid_t *psid)
168 {
169           pset_info_t *pi;
170           u_int i;
171 
172           if (psets_count == psets_max)
173                     return ENOMEM;
174 
175           pi = kmem_zalloc(sizeof(pset_info_t), KM_SLEEP);
176 
177           mutex_enter(&cpu_lock);
178           if (psets_count == psets_max) {
179                     mutex_exit(&cpu_lock);
180                     kmem_free(pi, sizeof(pset_info_t));
181                     return ENOMEM;
182           }
183 
184           /* Find a free entry in the array */
185           for (i = 0; i < psets_max; i++)
186                     if (psets[i] == NULL)
187                               break;
188           KASSERT(i != psets_max);
189 
190           psets[i] = pi;
191           psets_count++;
192           mutex_exit(&cpu_lock);
193 
194           *psid = i + 1;
195           return 0;
196 }
197 
198 /*
199  * Destroy a processor-set.
200  */
201 static int
kern_pset_destroy(psetid_t psid)202 kern_pset_destroy(psetid_t psid)
203 {
204           struct cpu_info *ci;
205           struct lwp *l;
206           CPU_INFO_ITERATOR cii;
207           int error;
208 
209           mutex_enter(&cpu_lock);
210           if (psid == PS_MYID) {
211                     /* Use caller's processor-set ID */
212                     psid = curlwp->l_psid;
213           }
214           error = psid_validate(psid, false);
215           if (error) {
216                     mutex_exit(&cpu_lock);
217                     return error;
218           }
219 
220           /* Release the processor-set from all CPUs */
221           for (CPU_INFO_FOREACH(cii, ci)) {
222                     struct schedstate_percpu *spc;
223 
224                     spc = &ci->ci_schedstate;
225                     if (spc->spc_psid != psid)
226                               continue;
227                     spc->spc_psid = PS_NONE;
228           }
229 
230           /* Unmark the processor-set ID from each thread */
231           mutex_enter(&proc_lock);
232           LIST_FOREACH(l, &alllwp, l_list) {
233                     /* Safe to check and set without lock held */
234                     if (l->l_psid != psid)
235                               continue;
236                     l->l_psid = PS_NONE;
237           }
238           mutex_exit(&proc_lock);
239 
240           /* Destroy the processor-set */
241           kmem_free(psets[psid - 1], sizeof(pset_info_t));
242           psets[psid - 1] = NULL;
243           psets_count--;
244           mutex_exit(&cpu_lock);
245 
246           return 0;
247 }
248 
249 /*
250  * General system calls for the processor-sets.
251  */
252 
253 int
sys_pset_create(struct lwp * l,const struct sys_pset_create_args * uap,register_t * retval)254 sys_pset_create(struct lwp *l, const struct sys_pset_create_args *uap,
255     register_t *retval)
256 {
257           /* {
258                     syscallarg(psetid_t) *psid;
259           } */
260           psetid_t psid;
261           int error;
262 
263           /* Available only for super-user */
264           if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET,
265               KAUTH_REQ_SYSTEM_PSET_CREATE, NULL, NULL, NULL))
266                     return EPERM;
267 
268           error = kern_pset_create(&psid);
269           if (error)
270                     return error;
271 
272           error = copyout(&psid, SCARG(uap, psid), sizeof(psetid_t));
273           if (error)
274                     (void)kern_pset_destroy(psid);
275 
276           return error;
277 }
278 
279 int
sys_pset_destroy(struct lwp * l,const struct sys_pset_destroy_args * uap,register_t * retval)280 sys_pset_destroy(struct lwp *l, const struct sys_pset_destroy_args *uap,
281     register_t *retval)
282 {
283           /* {
284                     syscallarg(psetid_t) psid;
285           } */
286 
287           /* Available only for super-user */
288           if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET,
289               KAUTH_REQ_SYSTEM_PSET_DESTROY,
290               KAUTH_ARG(SCARG(uap, psid)), NULL, NULL))
291                     return EPERM;
292 
293           return kern_pset_destroy(SCARG(uap, psid));
294 }
295 
296 int
sys_pset_assign(struct lwp * l,const struct sys_pset_assign_args * uap,register_t * retval)297 sys_pset_assign(struct lwp *l, const struct sys_pset_assign_args *uap,
298     register_t *retval)
299 {
300           /* {
301                     syscallarg(psetid_t) psid;
302                     syscallarg(cpuid_t) cpuid;
303                     syscallarg(psetid_t) *opsid;
304           } */
305           struct cpu_info *ici, *ci = NULL;
306           struct schedstate_percpu *spc = NULL;
307           struct lwp *t;
308           psetid_t psid = SCARG(uap, psid), opsid = 0;
309           CPU_INFO_ITERATOR cii;
310           int error = 0, nnone = 0;
311 
312           /* Available only for super-user, except the case of PS_QUERY */
313           if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET,
314               KAUTH_REQ_SYSTEM_PSET_ASSIGN, KAUTH_ARG(SCARG(uap, psid)), NULL,
315               NULL))
316                     return EPERM;
317 
318           /* Find the target CPU */
319           mutex_enter(&cpu_lock);
320           for (CPU_INFO_FOREACH(cii, ici)) {
321                     struct schedstate_percpu *ispc;
322                     ispc = &ici->ci_schedstate;
323                     if (cpu_index(ici) == SCARG(uap, cpuid)) {
324                               ci = ici;
325                               spc = ispc;
326                     }
327                     nnone += (ispc->spc_psid == PS_NONE);
328           }
329           if (ci == NULL) {
330                     mutex_exit(&cpu_lock);
331                     return EINVAL;
332           }
333           error = psid_validate(psid, true);
334           if (error) {
335                     mutex_exit(&cpu_lock);
336                     return error;
337           }
338           opsid = spc->spc_psid;
339           switch (psid) {
340           case PS_QUERY:
341                     break;
342           case PS_MYID:
343                     psid = curlwp->l_psid;
344                     /* FALLTHROUGH */
345           default:
346                     /*
347                      * Just finish if old and new processor-sets are
348                      * the same.
349                      */
350                     if (spc->spc_psid == psid)
351                               break;
352                     /*
353                      * Ensure at least one CPU stays in the default set,
354                      * and that specified CPU is not offline.
355                      */
356                     if (psid != PS_NONE && ((spc->spc_flags & SPCF_OFFLINE) ||
357                         (nnone == 1 && spc->spc_psid == PS_NONE))) {
358                               mutex_exit(&cpu_lock);
359                               return EBUSY;
360                     }
361                     mutex_enter(&proc_lock);
362                     /*
363                      * Ensure that none of the threads are using affinity mask
364                      * with this target CPU in it.
365                      */
366                     LIST_FOREACH(t, &alllwp, l_list) {
367                               if (t->l_affinity == NULL) {
368                                         continue;
369                               }
370                               lwp_lock(t);
371                               if (t->l_affinity == NULL) {
372                                         lwp_unlock(t);
373                                         continue;
374                               }
375                               if (kcpuset_isset(t->l_affinity, cpu_index(ci))) {
376                                         lwp_unlock(t);
377                                         mutex_exit(&proc_lock);
378                                         mutex_exit(&cpu_lock);
379                                         return EPERM;
380                               }
381                               lwp_unlock(t);
382                     }
383                     /*
384                      * Set the processor-set ID.
385                      * Migrate out any threads running on this CPU.
386                      */
387                     spc->spc_psid = psid;
388 
389                     LIST_FOREACH(t, &alllwp, l_list) {
390                               struct cpu_info *tci;
391                               if (t->l_cpu != ci)
392                                         continue;
393                               if (t->l_pflag & (LP_BOUND | LP_INTR))
394                                         continue;
395                               lwp_lock(t);
396                               tci = sched_takecpu(t);
397                               KASSERT(tci != ci);
398                               lwp_migrate(t, tci);
399                     }
400                     mutex_exit(&proc_lock);
401                     break;
402           }
403           mutex_exit(&cpu_lock);
404 
405           if (SCARG(uap, opsid) != NULL)
406                     error = copyout(&opsid, SCARG(uap, opsid), sizeof(psetid_t));
407 
408           return error;
409 }
410 
411 int
sys__pset_bind(struct lwp * l,const struct sys__pset_bind_args * uap,register_t * retval)412 sys__pset_bind(struct lwp *l, const struct sys__pset_bind_args *uap,
413     register_t *retval)
414 {
415           /* {
416                     syscallarg(idtype_t) idtype;
417                     syscallarg(id_t) first_id;
418                     syscallarg(id_t) second_id;
419                     syscallarg(psetid_t) psid;
420                     syscallarg(psetid_t) *opsid;
421           } */
422           struct cpu_info *ci;
423           struct proc *p;
424           struct lwp *t;
425           id_t id1, id2;
426           pid_t pid = 0;
427           lwpid_t lid = 0;
428           psetid_t psid, opsid;
429           int error = 0, lcnt;
430 
431           psid = SCARG(uap, psid);
432 
433           /* Available only for super-user, except the case of PS_QUERY */
434           if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_PSET,
435               KAUTH_REQ_SYSTEM_PSET_BIND, KAUTH_ARG(SCARG(uap, psid)), NULL,
436               NULL))
437                     return EPERM;
438 
439           mutex_enter(&cpu_lock);
440           error = psid_validate(psid, true);
441           if (error) {
442                     mutex_exit(&cpu_lock);
443                     return error;
444           }
445           if (psid == PS_MYID)
446                     psid = curlwp->l_psid;
447 
448           /*
449            * Get PID and LID from the ID.
450            */
451           p = l->l_proc;
452           id1 = SCARG(uap, first_id);
453           id2 = SCARG(uap, second_id);
454 
455           mutex_enter(&proc_lock);
456           switch (SCARG(uap, idtype)) {
457           case P_PID:
458                     /*
459                      * Process:
460                      *  First ID        - PID;
461                      *  Second ID       - ignored;
462                      */
463                     pid = (id1 == P_MYID) ? p->p_pid : id1;
464                     lid = 0;
465                     break;
466           case P_LWPID:
467                     /*
468                      * Thread (LWP):
469                      *  First ID        - LID;
470                      *  Second ID       - PID;
471                      */
472                     if (id1 == P_MYID) {
473                               pid = p->p_pid;
474                               lid = l->l_lid;
475                               break;
476                     }
477                     lid = id1;
478                     pid = (id2 == P_MYID) ? p->p_pid : id2;
479                     break;
480           default:
481                     error = EINVAL;
482                     goto error;
483           }
484 
485           /* Find the process */
486           p = proc_find(pid);
487           if (p == NULL) {
488                     error = ESRCH;
489                     goto error;
490           }
491           /* Disallow modification of the system processes */
492           if (p->p_flag & PK_SYSTEM) {
493                     error = EPERM;
494                     goto error;
495           }
496 
497           /* Find the LWP(s) */
498           lcnt = 0;
499           ci = NULL;
500           mutex_enter(p->p_lock);
501           LIST_FOREACH(t, &p->p_lwps, l_sibling) {
502                     if (lid && lid != t->l_lid)
503                               continue;
504                     /*
505                      * Bind the thread to the processor-set,
506                      * take some CPU and migrate.
507                      */
508                     lwp_lock(t);
509                     opsid = t->l_psid;
510                     t->l_psid = psid;
511                     ci = sched_takecpu(t);
512                     /* Unlocks LWP */
513                     lwp_migrate(t, ci);
514                     lcnt++;
515           }
516           mutex_exit(p->p_lock);
517           if (lcnt == 0) {
518                     error = ESRCH;
519           }
520 error:
521           mutex_exit(&proc_lock);
522           mutex_exit(&cpu_lock);
523           if (error == 0 && SCARG(uap, opsid))
524                     error = copyout(&opsid, SCARG(uap, opsid), sizeof(psetid_t));
525           return error;
526 }
527 
528 /*
529  * Sysctl nodes and initialization.
530  */
531 
532 static int
sysctl_psets_max(SYSCTLFN_ARGS)533 sysctl_psets_max(SYSCTLFN_ARGS)
534 {
535           struct sysctlnode node;
536           int error, newsize;
537 
538           node = *rnode;
539           node.sysctl_data = &newsize;
540 
541           newsize = psets_max;
542           error = sysctl_lookup(SYSCTLFN_CALL(&node));
543           if (error || newp == NULL)
544                     return error;
545 
546           if (newsize <= 0)
547                     return EINVAL;
548 
549           sysctl_unlock();
550           error = psets_realloc(newsize);
551           sysctl_relock();
552           return error;
553 }
554 
555 static int
sysctl_psets_list(SYSCTLFN_ARGS)556 sysctl_psets_list(SYSCTLFN_ARGS)
557 {
558           const size_t bufsz = 1024;
559           char *buf, tbuf[16];
560           int i, error;
561           size_t len;
562 
563           sysctl_unlock();
564           buf = kmem_alloc(bufsz, KM_SLEEP);
565           snprintf(buf, bufsz, "%d:1", PS_NONE);  /* XXX */
566 
567           mutex_enter(&cpu_lock);
568           for (i = 0; i < psets_max; i++) {
569                     if (psets[i] == NULL)
570                               continue;
571                     snprintf(tbuf, sizeof(tbuf), ",%d:2", i + 1);     /* XXX */
572                     strlcat(buf, tbuf, bufsz);
573           }
574           mutex_exit(&cpu_lock);
575           len = strlen(buf) + 1;
576           error = 0;
577           if (oldp != NULL)
578                     error = copyout(buf, oldp, uimin(len, *oldlenp));
579           *oldlenp = len;
580           kmem_free(buf, bufsz);
581           sysctl_relock();
582           return error;
583 }
584 
585 SYSCTL_SETUP(sysctl_pset_setup, "sysctl kern.pset subtree setup")
586 {
587           const struct sysctlnode *node = NULL;
588 
589           sysctl_createv(clog, 0, NULL, &node,
590                     CTLFLAG_PERMANENT,
591                     CTLTYPE_NODE, "pset",
592                     SYSCTL_DESCR("Processor-set options"),
593                     NULL, 0, NULL, 0,
594                     CTL_KERN, CTL_CREATE, CTL_EOL);
595 
596           if (node == NULL)
597                     return;
598 
599           sysctl_createv(clog, 0, &node, NULL,
600                     CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
601                     CTLTYPE_INT, "psets_max",
602                     SYSCTL_DESCR("Maximal count of the processor-sets"),
603                     sysctl_psets_max, 0, &psets_max, 0,
604                     CTL_CREATE, CTL_EOL);
605           sysctl_createv(clog, 0, &node, NULL,
606                     CTLFLAG_PERMANENT,
607                     CTLTYPE_STRING, "list",
608                     SYSCTL_DESCR("List of active sets"),
609                     sysctl_psets_list, 0, NULL, 0,
610                     CTL_CREATE, CTL_EOL);
611 }
612