1 /*        $NetBSD: kern_resource.c,v 1.195 2023/10/04 20:28:06 ad Exp $         */
2 
3 /*-
4  * Copyright (c) 1982, 1986, 1991, 1993
5  *        The Regents of the University of California.  All rights reserved.
6  * (c) UNIX System Laboratories, Inc.
7  * All or some portions of this file are derived from material licensed
8  * to the University of California by American Telephone and Telegraph
9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10  * the permission of UNIX System Laboratories, Inc.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *        @(#)kern_resource.c 8.8 (Berkeley) 2/14/95
37  */
38 
39 #include <sys/cdefs.h>
40 __KERNEL_RCSID(0, "$NetBSD: kern_resource.c,v 1.195 2023/10/04 20:28:06 ad Exp $");
41 
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/file.h>
46 #include <sys/resourcevar.h>
47 #include <sys/kmem.h>
48 #include <sys/namei.h>
49 #include <sys/pool.h>
50 #include <sys/proc.h>
51 #include <sys/sysctl.h>
52 #include <sys/timevar.h>
53 #include <sys/kauth.h>
54 #include <sys/atomic.h>
55 #include <sys/mount.h>
56 #include <sys/syscallargs.h>
57 #include <sys/atomic.h>
58 
59 #include <uvm/uvm_extern.h>
60 
61 /*
62  * Maximum process data and stack limits.
63  * They are variables so they are patchable.
64  */
65 rlim_t                        maxdmap = MAXDSIZ;
66 rlim_t                        maxsmap = MAXSSIZ;
67 
68 static kauth_listener_t       resource_listener;
69 static struct sysctllog       *proc_sysctllog;
70 
71 static int          donice(struct lwp *, struct proc *, int);
72 static void         sysctl_proc_setup(void);
73 
74 static int
resource_listener_cb(kauth_cred_t cred,kauth_action_t action,void * cookie,void * arg0,void * arg1,void * arg2,void * arg3)75 resource_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
76     void *arg0, void *arg1, void *arg2, void *arg3)
77 {
78           struct proc *p;
79           int result;
80 
81           result = KAUTH_RESULT_DEFER;
82           p = arg0;
83 
84           switch (action) {
85           case KAUTH_PROCESS_NICE:
86                     if (kauth_cred_geteuid(cred) != kauth_cred_geteuid(p->p_cred) &&
87                         kauth_cred_getuid(cred) != kauth_cred_geteuid(p->p_cred)) {
88                               break;
89                     }
90 
91                     if ((u_long)arg1 >= p->p_nice)
92                               result = KAUTH_RESULT_ALLOW;
93 
94                     break;
95 
96           case KAUTH_PROCESS_RLIMIT: {
97                     enum kauth_process_req req;
98 
99                     req = (enum kauth_process_req)(uintptr_t)arg1;
100 
101                     switch (req) {
102                     case KAUTH_REQ_PROCESS_RLIMIT_GET:
103                               result = KAUTH_RESULT_ALLOW;
104                               break;
105 
106                     case KAUTH_REQ_PROCESS_RLIMIT_SET: {
107                               struct rlimit *new_rlimit;
108                               u_long which;
109 
110                               if ((p != curlwp->l_proc) &&
111                                   (proc_uidmatch(cred, p->p_cred) != 0))
112                                         break;
113 
114                               new_rlimit = arg2;
115                               which = (u_long)arg3;
116 
117                               if (new_rlimit->rlim_max <= p->p_rlimit[which].rlim_max)
118                                         result = KAUTH_RESULT_ALLOW;
119 
120                               break;
121                               }
122 
123                     default:
124                               break;
125                     }
126 
127                     break;
128           }
129 
130           default:
131                     break;
132           }
133 
134           return result;
135 }
136 
137 void
resource_init(void)138 resource_init(void)
139 {
140 
141           resource_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
142               resource_listener_cb, NULL);
143 
144           sysctl_proc_setup();
145 }
146 
147 /*
148  * Resource controls and accounting.
149  */
150 
151 int
sys_getpriority(struct lwp * l,const struct sys_getpriority_args * uap,register_t * retval)152 sys_getpriority(struct lwp *l, const struct sys_getpriority_args *uap,
153     register_t *retval)
154 {
155           /* {
156                     syscallarg(int) which;
157                     syscallarg(id_t) who;
158           } */
159           struct proc *curp = l->l_proc, *p;
160           id_t who = SCARG(uap, who);
161           int low = NZERO + PRIO_MAX + 1;
162 
163           mutex_enter(&proc_lock);
164           switch (SCARG(uap, which)) {
165           case PRIO_PROCESS:
166                     p = who ? proc_find(who) : curp;
167                     if (p != NULL)
168                               low = p->p_nice;
169                     break;
170 
171           case PRIO_PGRP: {
172                     struct pgrp *pg;
173 
174                     if (who == 0)
175                               pg = curp->p_pgrp;
176                     else if ((pg = pgrp_find(who)) == NULL)
177                               break;
178                     LIST_FOREACH(p, &pg->pg_members, p_pglist) {
179                               if (p->p_nice < low)
180                                         low = p->p_nice;
181                     }
182                     break;
183           }
184 
185           case PRIO_USER:
186                     if (who == 0)
187                               who = (int)kauth_cred_geteuid(l->l_cred);
188                     PROCLIST_FOREACH(p, &allproc) {
189                               mutex_enter(p->p_lock);
190                               if (kauth_cred_geteuid(p->p_cred) ==
191                                   (uid_t)who && p->p_nice < low)
192                                         low = p->p_nice;
193                               mutex_exit(p->p_lock);
194                     }
195                     break;
196 
197           default:
198                     mutex_exit(&proc_lock);
199                     return EINVAL;
200           }
201           mutex_exit(&proc_lock);
202 
203           if (low == NZERO + PRIO_MAX + 1) {
204                     return ESRCH;
205           }
206           *retval = low - NZERO;
207           return 0;
208 }
209 
210 int
sys_setpriority(struct lwp * l,const struct sys_setpriority_args * uap,register_t * retval)211 sys_setpriority(struct lwp *l, const struct sys_setpriority_args *uap,
212     register_t *retval)
213 {
214           /* {
215                     syscallarg(int) which;
216                     syscallarg(id_t) who;
217                     syscallarg(int) prio;
218           } */
219           struct proc *curp = l->l_proc, *p;
220           id_t who = SCARG(uap, who);
221           int found = 0, error = 0;
222 
223           mutex_enter(&proc_lock);
224           switch (SCARG(uap, which)) {
225           case PRIO_PROCESS:
226                     p = who ? proc_find(who) : curp;
227                     if (p != NULL) {
228                               mutex_enter(p->p_lock);
229                               found++;
230                               error = donice(l, p, SCARG(uap, prio));
231                               mutex_exit(p->p_lock);
232                     }
233                     break;
234 
235           case PRIO_PGRP: {
236                     struct pgrp *pg;
237 
238                     if (who == 0)
239                               pg = curp->p_pgrp;
240                     else if ((pg = pgrp_find(who)) == NULL)
241                               break;
242                     LIST_FOREACH(p, &pg->pg_members, p_pglist) {
243                               mutex_enter(p->p_lock);
244                               found++;
245                               error = donice(l, p, SCARG(uap, prio));
246                               mutex_exit(p->p_lock);
247                               if (error)
248                                         break;
249                     }
250                     break;
251           }
252 
253           case PRIO_USER:
254                     if (who == 0)
255                               who = (int)kauth_cred_geteuid(l->l_cred);
256                     PROCLIST_FOREACH(p, &allproc) {
257                               mutex_enter(p->p_lock);
258                               if (kauth_cred_geteuid(p->p_cred) ==
259                                   (uid_t)SCARG(uap, who)) {
260                                         found++;
261                                         error = donice(l, p, SCARG(uap, prio));
262                               }
263                               mutex_exit(p->p_lock);
264                               if (error)
265                                         break;
266                     }
267                     break;
268 
269           default:
270                     mutex_exit(&proc_lock);
271                     return EINVAL;
272           }
273           mutex_exit(&proc_lock);
274 
275           return (found == 0) ? ESRCH : error;
276 }
277 
278 /*
279  * Renice a process.
280  *
281  * Call with the target process' credentials locked.
282  */
283 static int
donice(struct lwp * l,struct proc * chgp,int n)284 donice(struct lwp *l, struct proc *chgp, int n)
285 {
286           kauth_cred_t cred = l->l_cred;
287 
288           KASSERT(mutex_owned(chgp->p_lock));
289 
290           if (kauth_cred_geteuid(cred) && kauth_cred_getuid(cred) &&
291               kauth_cred_geteuid(cred) != kauth_cred_geteuid(chgp->p_cred) &&
292               kauth_cred_getuid(cred) != kauth_cred_geteuid(chgp->p_cred))
293                     return EPERM;
294 
295           if (n > PRIO_MAX) {
296                     n = PRIO_MAX;
297           }
298           if (n < PRIO_MIN) {
299                     n = PRIO_MIN;
300           }
301           n += NZERO;
302 
303           if (kauth_authorize_process(cred, KAUTH_PROCESS_NICE, chgp,
304               KAUTH_ARG(n), NULL, NULL)) {
305                     return EACCES;
306           }
307 
308           sched_nice(chgp, n);
309           return 0;
310 }
311 
312 int
sys_setrlimit(struct lwp * l,const struct sys_setrlimit_args * uap,register_t * retval)313 sys_setrlimit(struct lwp *l, const struct sys_setrlimit_args *uap,
314     register_t *retval)
315 {
316           /* {
317                     syscallarg(int) which;
318                     syscallarg(const struct rlimit *) rlp;
319           } */
320           int error, which = SCARG(uap, which);
321           struct rlimit alim;
322 
323           error = copyin(SCARG(uap, rlp), &alim, sizeof(struct rlimit));
324           if (error) {
325                     return error;
326           }
327           return dosetrlimit(l, l->l_proc, which, &alim);
328 }
329 
330 int
dosetrlimit(struct lwp * l,struct proc * p,int which,struct rlimit * limp)331 dosetrlimit(struct lwp *l, struct proc *p, int which, struct rlimit *limp)
332 {
333           struct rlimit *alimp;
334           int error;
335 
336           if ((u_int)which >= RLIM_NLIMITS)
337                     return EINVAL;
338 
339           if (limp->rlim_cur > limp->rlim_max) {
340                     /*
341                      * This is programming error. According to SUSv2, we should
342                      * return error in this case.
343                      */
344                     return EINVAL;
345           }
346 
347           alimp = &p->p_rlimit[which];
348           /* if we don't change the value, no need to limcopy() */
349           if (limp->rlim_cur == alimp->rlim_cur &&
350               limp->rlim_max == alimp->rlim_max)
351                     return 0;
352 
353           error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT,
354               p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_SET), limp, KAUTH_ARG(which));
355           if (error)
356                     return error;
357 
358           lim_privatise(p);
359           /* p->p_limit is now unchangeable */
360           alimp = &p->p_rlimit[which];
361 
362           switch (which) {
363 
364           case RLIMIT_DATA:
365                     if (limp->rlim_cur > maxdmap)
366                               limp->rlim_cur = maxdmap;
367                     if (limp->rlim_max > maxdmap)
368                               limp->rlim_max = maxdmap;
369                     break;
370 
371           case RLIMIT_STACK:
372                     if (limp->rlim_cur > maxsmap)
373                               limp->rlim_cur = maxsmap;
374                     if (limp->rlim_max > maxsmap)
375                               limp->rlim_max = maxsmap;
376 
377                     /*
378                      * Return EINVAL if the new stack size limit is lower than
379                      * current usage. Otherwise, the process would get SIGSEGV the
380                      * moment it would try to access anything on its current stack.
381                      * This conforms to SUSv2.
382                      */
383                     if (btoc(limp->rlim_cur) < p->p_vmspace->vm_ssize ||
384                         btoc(limp->rlim_max) < p->p_vmspace->vm_ssize) {
385                               return EINVAL;
386                     }
387 
388                     /*
389                      * Stack is allocated to the max at exec time with
390                      * only "rlim_cur" bytes accessible (In other words,
391                      * allocates stack dividing two contiguous regions at
392                      * "rlim_cur" bytes boundary).
393                      *
394                      * Since allocation is done in terms of page, roundup
395                      * "rlim_cur" (otherwise, contiguous regions
396                      * overlap).  If stack limit is going up make more
397                      * accessible, if going down make inaccessible.
398                      */
399                     limp->rlim_max = round_page(limp->rlim_max);
400                     limp->rlim_cur = round_page(limp->rlim_cur);
401                     if (limp->rlim_cur != alimp->rlim_cur) {
402                               vaddr_t addr;
403                               vsize_t size;
404                               vm_prot_t prot;
405                               char *base, *tmp;
406 
407                               base = p->p_vmspace->vm_minsaddr;
408                               if (limp->rlim_cur > alimp->rlim_cur) {
409                                         prot = VM_PROT_READ | VM_PROT_WRITE;
410                                         size = limp->rlim_cur - alimp->rlim_cur;
411                                         tmp = STACK_GROW(base, alimp->rlim_cur);
412                               } else {
413                                         prot = VM_PROT_NONE;
414                                         size = alimp->rlim_cur - limp->rlim_cur;
415                                         tmp = STACK_GROW(base, limp->rlim_cur);
416                               }
417                               addr = (vaddr_t)STACK_ALLOC(tmp, size);
418                               (void) uvm_map_protect(&p->p_vmspace->vm_map,
419                                   addr, addr + size, prot, false);
420                     }
421                     break;
422 
423           case RLIMIT_NOFILE:
424                     if (limp->rlim_cur > maxfiles)
425                               limp->rlim_cur = maxfiles;
426                     if (limp->rlim_max > maxfiles)
427                               limp->rlim_max = maxfiles;
428                     break;
429 
430           case RLIMIT_NPROC:
431                     if (limp->rlim_cur > maxproc)
432                               limp->rlim_cur = maxproc;
433                     if (limp->rlim_max > maxproc)
434                               limp->rlim_max = maxproc;
435                     break;
436 
437           case RLIMIT_NTHR:
438                     if (limp->rlim_cur > maxlwp)
439                               limp->rlim_cur = maxlwp;
440                     if (limp->rlim_max > maxlwp)
441                               limp->rlim_max = maxlwp;
442                     break;
443           }
444 
445           mutex_enter(&p->p_limit->pl_lock);
446           *alimp = *limp;
447           mutex_exit(&p->p_limit->pl_lock);
448           return 0;
449 }
450 
451 int
sys_getrlimit(struct lwp * l,const struct sys_getrlimit_args * uap,register_t * retval)452 sys_getrlimit(struct lwp *l, const struct sys_getrlimit_args *uap,
453     register_t *retval)
454 {
455           /* {
456                     syscallarg(int) which;
457                     syscallarg(struct rlimit *) rlp;
458           } */
459           struct proc *p = l->l_proc;
460           int which = SCARG(uap, which);
461           struct rlimit rl;
462 
463           if ((u_int)which >= RLIM_NLIMITS)
464                     return EINVAL;
465 
466           mutex_enter(p->p_lock);
467           memcpy(&rl, &p->p_rlimit[which], sizeof(rl));
468           mutex_exit(p->p_lock);
469 
470           return copyout(&rl, SCARG(uap, rlp), sizeof(rl));
471 }
472 
473 void
addrulwp(struct lwp * l,struct bintime * tm)474 addrulwp(struct lwp *l, struct bintime *tm)
475 {
476 
477           lwp_lock(l);
478           bintime_add(tm, &l->l_rtime);
479           if ((l->l_pflag & LP_RUNNING) != 0 &&
480               (l->l_pflag & (LP_INTR | LP_TIMEINTR)) != LP_INTR) {
481                     struct bintime diff;
482                     /*
483                      * Adjust for the current time slice.  This is
484                      * actually fairly important since the error
485                      * here is on the order of a time quantum,
486                      * which is much greater than the sampling
487                      * error.
488                      */
489                     binuptime(&diff);
490                     membar_consumer(); /* for softint_dispatch() */
491                     bintime_sub(&diff, &l->l_stime);
492                     bintime_add(tm, &diff);
493           }
494           lwp_unlock(l);
495 }
496 
497 /*
498  * Transform the running time and tick information in proc p into user,
499  * system, and interrupt time usage.
500  *
501  * Should be called with p->p_lock held unless called from exit1().
502  */
503 void
calcru(struct proc * p,struct timeval * up,struct timeval * sp,struct timeval * ip,struct timeval * rp)504 calcru(struct proc *p, struct timeval *up, struct timeval *sp,
505     struct timeval *ip, struct timeval *rp)
506 {
507           uint64_t u, st, ut, it, tot, dt;
508           struct lwp *l;
509           struct bintime tm;
510           struct timeval tv;
511 
512           KASSERT(p->p_stat == SDEAD || mutex_owned(p->p_lock));
513 
514           mutex_spin_enter(&p->p_stmutex);
515           st = p->p_sticks;
516           ut = p->p_uticks;
517           it = p->p_iticks;
518           mutex_spin_exit(&p->p_stmutex);
519 
520           tm = p->p_rtime;
521 
522           LIST_FOREACH(l, &p->p_lwps, l_sibling) {
523                     addrulwp(l, &tm);
524           }
525 
526           tot = st + ut + it;
527           bintime2timeval(&tm, &tv);
528           u = (uint64_t)tv.tv_sec * 1000000ul + tv.tv_usec;
529 
530           if (tot == 0) {
531                     /* No ticks, so can't use to share time out, split 50-50 */
532                     st = ut = u / 2;
533           } else {
534                     st = (u * st) / tot;
535                     ut = (u * ut) / tot;
536           }
537 
538           /*
539            * Try to avoid lying to the users (too much)
540            *
541            * Of course, user/sys time are based on sampling (ie: statistics)
542            * so that would be impossible, but convincing the mark
543            * that we have used less ?time this call than we had
544            * last time, is beyond reasonable...  (the con fails!)
545            *
546            * Note that since actual used time cannot decrease, either
547            * utime or stime (or both) must be greater now than last time
548            * (or both the same) - if one seems to have decreased, hold
549            * it constant and steal the necessary bump from the other
550            * which must have increased.
551            */
552           if (p->p_xutime > ut) {
553                     dt = p->p_xutime - ut;
554                     st -= uimin(dt, st);
555                     ut = p->p_xutime;
556           } else if (p->p_xstime > st) {
557                     dt = p->p_xstime - st;
558                     ut -= uimin(dt, ut);
559                     st = p->p_xstime;
560           }
561 
562           if (sp != NULL) {
563                     p->p_xstime = st;
564                     sp->tv_sec = st / 1000000;
565                     sp->tv_usec = st % 1000000;
566           }
567           if (up != NULL) {
568                     p->p_xutime = ut;
569                     up->tv_sec = ut / 1000000;
570                     up->tv_usec = ut % 1000000;
571           }
572           if (ip != NULL) {
573                     if (it != 0)                  /* it != 0 --> tot != 0 */
574                               it = (u * it) / tot;
575                     ip->tv_sec = it / 1000000;
576                     ip->tv_usec = it % 1000000;
577           }
578           if (rp != NULL) {
579                     *rp = tv;
580           }
581 }
582 
583 int
sys___getrusage50(struct lwp * l,const struct sys___getrusage50_args * uap,register_t * retval)584 sys___getrusage50(struct lwp *l, const struct sys___getrusage50_args *uap,
585     register_t *retval)
586 {
587           /* {
588                     syscallarg(int) who;
589                     syscallarg(struct rusage *) rusage;
590           } */
591           int error;
592           struct rusage ru;
593           struct proc *p = l->l_proc;
594 
595           error = getrusage1(p, SCARG(uap, who), &ru);
596           if (error != 0)
597                     return error;
598 
599           return copyout(&ru, SCARG(uap, rusage), sizeof(ru));
600 }
601 
602 int
getrusage1(struct proc * p,int who,struct rusage * ru)603 getrusage1(struct proc *p, int who, struct rusage *ru)
604 {
605 
606           switch (who) {
607           case RUSAGE_SELF:
608                     mutex_enter(p->p_lock);
609                     ruspace(p);
610                     memcpy(ru, &p->p_stats->p_ru, sizeof(*ru));
611                     calcru(p, &ru->ru_utime, &ru->ru_stime, NULL, NULL);
612                     rulwps(p, ru);
613                     mutex_exit(p->p_lock);
614                     break;
615           case RUSAGE_CHILDREN:
616                     mutex_enter(p->p_lock);
617                     memcpy(ru, &p->p_stats->p_cru, sizeof(*ru));
618                     mutex_exit(p->p_lock);
619                     break;
620           default:
621                     return EINVAL;
622           }
623 
624           return 0;
625 }
626 
627 void
ruspace(struct proc * p)628 ruspace(struct proc *p)
629 {
630           struct vmspace *vm = p->p_vmspace;
631           struct rusage *ru = &p->p_stats->p_ru;
632 
633           ru->ru_ixrss = vm->vm_tsize << (PAGE_SHIFT - 10);
634           ru->ru_idrss = vm->vm_dsize << (PAGE_SHIFT - 10);
635           ru->ru_isrss = vm->vm_ssize << (PAGE_SHIFT - 10);
636 #ifdef __HAVE_NO_PMAP_STATS
637           /* We don't keep track of the max so we get the current */
638           ru->ru_maxrss = vm_resident_count(vm) << (PAGE_SHIFT - 10);
639 #else
640           ru->ru_maxrss = vm->vm_rssmax << (PAGE_SHIFT - 10);
641 #endif
642 }
643 
644 void
ruadd(struct rusage * ru,struct rusage * ru2)645 ruadd(struct rusage *ru, struct rusage *ru2)
646 {
647           long *ip, *ip2;
648           int i;
649 
650           timeradd(&ru->ru_utime, &ru2->ru_utime, &ru->ru_utime);
651           timeradd(&ru->ru_stime, &ru2->ru_stime, &ru->ru_stime);
652           if (ru->ru_maxrss < ru2->ru_maxrss)
653                     ru->ru_maxrss = ru2->ru_maxrss;
654           ip = &ru->ru_first; ip2 = &ru2->ru_first;
655           for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
656                     *ip++ += *ip2++;
657 }
658 
659 void
rulwps(proc_t * p,struct rusage * ru)660 rulwps(proc_t *p, struct rusage *ru)
661 {
662           lwp_t *l;
663 
664           KASSERT(mutex_owned(p->p_lock));
665 
666           LIST_FOREACH(l, &p->p_lwps, l_sibling) {
667                     ruadd(ru, &l->l_ru);
668           }
669 }
670 
671 /*
672  * lim_copy: make a copy of the plimit structure.
673  *
674  * We use copy-on-write after fork, and copy when a limit is changed.
675  */
676 struct plimit *
lim_copy(struct plimit * lim)677 lim_copy(struct plimit *lim)
678 {
679           struct plimit *newlim;
680           char *corename;
681           size_t alen, len;
682 
683           newlim = kmem_alloc(sizeof(*newlim), KM_SLEEP);
684           mutex_init(&newlim->pl_lock, MUTEX_DEFAULT, IPL_NONE);
685           newlim->pl_writeable = false;
686           newlim->pl_refcnt = 1;
687           newlim->pl_sv_limit = NULL;
688 
689           mutex_enter(&lim->pl_lock);
690           memcpy(newlim->pl_rlimit, lim->pl_rlimit,
691               sizeof(struct rlimit) * RLIM_NLIMITS);
692 
693           /*
694            * Note: the common case is a use of default core name.
695            */
696           alen = 0;
697           corename = NULL;
698           for (;;) {
699                     if (lim->pl_corename == defcorename) {
700                               newlim->pl_corename = defcorename;
701                               newlim->pl_cnlen = 0;
702                               break;
703                     }
704                     len = lim->pl_cnlen;
705                     if (len == alen) {
706                               newlim->pl_corename = corename;
707                               newlim->pl_cnlen = len;
708                               memcpy(corename, lim->pl_corename, len);
709                               corename = NULL;
710                               break;
711                     }
712                     mutex_exit(&lim->pl_lock);
713                     if (corename) {
714                               kmem_free(corename, alen);
715                     }
716                     alen = len;
717                     corename = kmem_alloc(alen, KM_SLEEP);
718                     mutex_enter(&lim->pl_lock);
719           }
720           mutex_exit(&lim->pl_lock);
721 
722           if (corename) {
723                     kmem_free(corename, alen);
724           }
725           return newlim;
726 }
727 
728 void
lim_addref(struct plimit * lim)729 lim_addref(struct plimit *lim)
730 {
731           atomic_inc_uint(&lim->pl_refcnt);
732 }
733 
734 /*
735  * lim_privatise: give a process its own private plimit structure.
736  */
737 void
lim_privatise(proc_t * p)738 lim_privatise(proc_t *p)
739 {
740           struct plimit *lim = p->p_limit, *newlim;
741 
742           if (lim->pl_writeable) {
743                     return;
744           }
745 
746           newlim = lim_copy(lim);
747 
748           mutex_enter(p->p_lock);
749           if (p->p_limit->pl_writeable) {
750                     /* Other thread won the race. */
751                     mutex_exit(p->p_lock);
752                     lim_free(newlim);
753                     return;
754           }
755 
756           /*
757            * Since p->p_limit can be accessed without locked held,
758            * old limit structure must not be deleted yet.
759            */
760           newlim->pl_sv_limit = p->p_limit;
761           newlim->pl_writeable = true;
762           p->p_limit = newlim;
763           mutex_exit(p->p_lock);
764 }
765 
766 void
lim_setcorename(proc_t * p,char * name,size_t len)767 lim_setcorename(proc_t *p, char *name, size_t len)
768 {
769           struct plimit *lim;
770           char *oname;
771           size_t olen;
772 
773           lim_privatise(p);
774           lim = p->p_limit;
775 
776           mutex_enter(&lim->pl_lock);
777           oname = lim->pl_corename;
778           olen = lim->pl_cnlen;
779           lim->pl_corename = name;
780           lim->pl_cnlen = len;
781           mutex_exit(&lim->pl_lock);
782 
783           if (oname != defcorename) {
784                     kmem_free(oname, olen);
785           }
786 }
787 
788 void
lim_free(struct plimit * lim)789 lim_free(struct plimit *lim)
790 {
791           struct plimit *sv_lim;
792 
793           do {
794                     membar_release();
795                     if (atomic_dec_uint_nv(&lim->pl_refcnt) > 0) {
796                               return;
797                     }
798                     membar_acquire();
799                     if (lim->pl_corename != defcorename) {
800                               kmem_free(lim->pl_corename, lim->pl_cnlen);
801                     }
802                     sv_lim = lim->pl_sv_limit;
803                     mutex_destroy(&lim->pl_lock);
804                     kmem_free(lim, sizeof(*lim));
805           } while ((lim = sv_lim) != NULL);
806 }
807 
808 struct pstats *
pstatscopy(struct pstats * ps)809 pstatscopy(struct pstats *ps)
810 {
811           struct pstats *nps;
812           size_t len;
813 
814           nps = kmem_alloc(sizeof(*nps), KM_SLEEP);
815 
816           len = (char *)&nps->pstat_endzero - (char *)&nps->pstat_startzero;
817           memset(&nps->pstat_startzero, 0, len);
818 
819           len = (char *)&nps->pstat_endcopy - (char *)&nps->pstat_startcopy;
820           memcpy(&nps->pstat_startcopy, &ps->pstat_startcopy, len);
821 
822           return nps;
823 }
824 
825 void
pstatsfree(struct pstats * ps)826 pstatsfree(struct pstats *ps)
827 {
828 
829           kmem_free(ps, sizeof(*ps));
830 }
831 
832 /*
833  * sysctl_proc_findproc: a routine for sysctl proc subtree helpers that
834  * need to pick a valid process by PID.
835  *
836  * => Hold a reference on the process, on success.
837  */
838 static int
sysctl_proc_findproc(lwp_t * l,pid_t pid,proc_t ** p2)839 sysctl_proc_findproc(lwp_t *l, pid_t pid, proc_t **p2)
840 {
841           proc_t *p;
842           int error;
843 
844           if (pid == PROC_CURPROC) {
845                     p = l->l_proc;
846           } else {
847                     mutex_enter(&proc_lock);
848                     p = proc_find(pid);
849                     if (p == NULL) {
850                               mutex_exit(&proc_lock);
851                               return ESRCH;
852                     }
853           }
854           error = rw_tryenter(&p->p_reflock, RW_READER) ? 0 : EBUSY;
855           if (pid != PROC_CURPROC) {
856                     mutex_exit(&proc_lock);
857           }
858           *p2 = p;
859           return error;
860 }
861 
862 /*
863  * sysctl_proc_paxflags: helper routine to get process's paxctl flags
864  */
865 static int
sysctl_proc_paxflags(SYSCTLFN_ARGS)866 sysctl_proc_paxflags(SYSCTLFN_ARGS)
867 {
868           struct proc *p;
869           struct sysctlnode node;
870           int paxflags;
871           int error;
872 
873           /* First, validate the request. */
874           if (namelen != 0 || name[-1] != PROC_PID_PAXFLAGS)
875                     return EINVAL;
876 
877           /* Find the process.  Hold a reference (p_reflock), if found. */
878           error = sysctl_proc_findproc(l, (pid_t)name[-2], &p);
879           if (error)
880                     return error;
881 
882           /* XXX-elad */
883           error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p,
884               KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
885           if (error) {
886                     rw_exit(&p->p_reflock);
887                     return error;
888           }
889 
890           /* Retrieve the limits. */
891           node = *rnode;
892           paxflags = p->p_pax;
893           node.sysctl_data = &paxflags;
894 
895           error = sysctl_lookup(SYSCTLFN_CALL(&node));
896 
897           /* If attempting to write new value, it's an error */
898           if (error == 0 && newp != NULL)
899                     error = EACCES;
900 
901           rw_exit(&p->p_reflock);
902           return error;
903 }
904 
905 /*
906  * sysctl_proc_corename: helper routine to get or set the core file name
907  * for a process specified by PID.
908  */
909 static int
sysctl_proc_corename(SYSCTLFN_ARGS)910 sysctl_proc_corename(SYSCTLFN_ARGS)
911 {
912           struct proc *p;
913           struct plimit *lim;
914           char *cnbuf, *cname;
915           struct sysctlnode node;
916           size_t len;
917           int error;
918 
919           /* First, validate the request. */
920           if (namelen != 0 || name[-1] != PROC_PID_CORENAME)
921                     return EINVAL;
922 
923           /* Find the process.  Hold a reference (p_reflock), if found. */
924           error = sysctl_proc_findproc(l, (pid_t)name[-2], &p);
925           if (error)
926                     return error;
927 
928           /* XXX-elad */
929           error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p,
930               KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
931           if (error) {
932                     rw_exit(&p->p_reflock);
933                     return error;
934           }
935 
936           cnbuf = PNBUF_GET();
937 
938           if (oldp) {
939                     /* Get case: copy the core name into the buffer. */
940                     error = kauth_authorize_process(l->l_cred,
941                         KAUTH_PROCESS_CORENAME, p,
942                         KAUTH_ARG(KAUTH_REQ_PROCESS_CORENAME_GET), NULL, NULL);
943                     if (error) {
944                               goto done;
945                     }
946                     lim = p->p_limit;
947                     mutex_enter(&lim->pl_lock);
948                     strlcpy(cnbuf, lim->pl_corename, MAXPATHLEN);
949                     mutex_exit(&lim->pl_lock);
950           }
951 
952           node = *rnode;
953           node.sysctl_data = cnbuf;
954           error = sysctl_lookup(SYSCTLFN_CALL(&node));
955 
956           /* Return if error, or if caller is only getting the core name. */
957           if (error || newp == NULL) {
958                     goto done;
959           }
960 
961           /*
962            * Set case.  Check permission and then validate new core name.
963            * It must be either "core", "/core", or end in ".core".
964            */
965           error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CORENAME,
966               p, KAUTH_ARG(KAUTH_REQ_PROCESS_CORENAME_SET), cnbuf, NULL);
967           if (error) {
968                     goto done;
969           }
970           len = strlen(cnbuf);
971           if ((len < 4 || strcmp(cnbuf + len - 4, "core") != 0) ||
972               (len > 4 && cnbuf[len - 5] != '/' && cnbuf[len - 5] != '.')) {
973                     error = EINVAL;
974                     goto done;
975           }
976 
977           /* Allocate, copy and set the new core name for plimit structure. */
978           cname = kmem_alloc(++len, KM_NOSLEEP);
979           if (cname == NULL) {
980                     error = ENOMEM;
981                     goto done;
982           }
983           memcpy(cname, cnbuf, len);
984           lim_setcorename(p, cname, len);
985 done:
986           rw_exit(&p->p_reflock);
987           PNBUF_PUT(cnbuf);
988           return error;
989 }
990 
991 /*
992  * sysctl_proc_stop: helper routine for checking/setting the stop flags.
993  */
994 static int
sysctl_proc_stop(SYSCTLFN_ARGS)995 sysctl_proc_stop(SYSCTLFN_ARGS)
996 {
997           struct proc *p;
998           int isset, flag, error = 0;
999           struct sysctlnode node;
1000 
1001           if (namelen != 0)
1002                     return EINVAL;
1003 
1004           /* Find the process.  Hold a reference (p_reflock), if found. */
1005           error = sysctl_proc_findproc(l, (pid_t)name[-2], &p);
1006           if (error)
1007                     return error;
1008 
1009           /* XXX-elad */
1010           error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p,
1011               KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
1012           if (error) {
1013                     goto out;
1014           }
1015 
1016           /* Determine the flag. */
1017           switch (rnode->sysctl_num) {
1018           case PROC_PID_STOPFORK:
1019                     flag = PS_STOPFORK;
1020                     break;
1021           case PROC_PID_STOPEXEC:
1022                     flag = PS_STOPEXEC;
1023                     break;
1024           case PROC_PID_STOPEXIT:
1025                     flag = PS_STOPEXIT;
1026                     break;
1027           default:
1028                     error = EINVAL;
1029                     goto out;
1030           }
1031           isset = (p->p_flag & flag) ? 1 : 0;
1032           node = *rnode;
1033           node.sysctl_data = &isset;
1034           error = sysctl_lookup(SYSCTLFN_CALL(&node));
1035 
1036           /* Return if error, or if callers is only getting the flag. */
1037           if (error || newp == NULL) {
1038                     goto out;
1039           }
1040 
1041           /* Check if caller can set the flags. */
1042           error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_STOPFLAG,
1043               p, KAUTH_ARG(flag), NULL, NULL);
1044           if (error) {
1045                     goto out;
1046           }
1047           mutex_enter(p->p_lock);
1048           if (isset) {
1049                     p->p_sflag |= flag;
1050           } else {
1051                     p->p_sflag &= ~flag;
1052           }
1053           mutex_exit(p->p_lock);
1054 out:
1055           rw_exit(&p->p_reflock);
1056           return error;
1057 }
1058 
1059 /*
1060  * sysctl_proc_plimit: helper routine to get/set rlimits of a process.
1061  */
1062 static int
sysctl_proc_plimit(SYSCTLFN_ARGS)1063 sysctl_proc_plimit(SYSCTLFN_ARGS)
1064 {
1065           struct proc *p;
1066           u_int limitno;
1067           int which, error = 0;
1068         struct rlimit alim;
1069           struct sysctlnode node;
1070 
1071           if (namelen != 0)
1072                     return EINVAL;
1073 
1074           which = name[-1];
1075           if (which != PROC_PID_LIMIT_TYPE_SOFT &&
1076               which != PROC_PID_LIMIT_TYPE_HARD)
1077                     return EINVAL;
1078 
1079           limitno = name[-2] - 1;
1080           if (limitno >= RLIM_NLIMITS)
1081                     return EINVAL;
1082 
1083           if (name[-3] != PROC_PID_LIMIT)
1084                     return EINVAL;
1085 
1086           /* Find the process.  Hold a reference (p_reflock), if found. */
1087           error = sysctl_proc_findproc(l, (pid_t)name[-4], &p);
1088           if (error)
1089                     return error;
1090 
1091           /* XXX-elad */
1092           error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE, p,
1093               KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
1094           if (error)
1095                     goto out;
1096 
1097           /* Check if caller can retrieve the limits. */
1098           if (newp == NULL) {
1099                     error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT,
1100                         p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_GET), &alim,
1101                         KAUTH_ARG(which));
1102                     if (error)
1103                               goto out;
1104           }
1105 
1106           /* Retrieve the limits. */
1107           node = *rnode;
1108           memcpy(&alim, &p->p_rlimit[limitno], sizeof(alim));
1109           if (which == PROC_PID_LIMIT_TYPE_HARD) {
1110                     node.sysctl_data = &alim.rlim_max;
1111           } else {
1112                     node.sysctl_data = &alim.rlim_cur;
1113           }
1114           error = sysctl_lookup(SYSCTLFN_CALL(&node));
1115 
1116           /* Return if error, or if we are only retrieving the limits. */
1117           if (error || newp == NULL) {
1118                     goto out;
1119           }
1120           error = dosetrlimit(l, p, limitno, &alim);
1121 out:
1122           rw_exit(&p->p_reflock);
1123           return error;
1124 }
1125 
1126 /*
1127  * Setup sysctl nodes.
1128  */
1129 static void
sysctl_proc_setup(void)1130 sysctl_proc_setup(void)
1131 {
1132 
1133           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,
1134                            CTLFLAG_PERMANENT|CTLFLAG_ANYNUMBER,
1135                            CTLTYPE_NODE, "curproc",
1136                            SYSCTL_DESCR("Per-process settings"),
1137                            NULL, 0, NULL, 0,
1138                            CTL_PROC, PROC_CURPROC, CTL_EOL);
1139 
1140           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,
1141                            CTLFLAG_PERMANENT|CTLFLAG_READONLY,
1142                            CTLTYPE_INT, "paxflags",
1143                            SYSCTL_DESCR("Process PAX control flags"),
1144                            sysctl_proc_paxflags, 0, NULL, 0,
1145                            CTL_PROC, PROC_CURPROC, PROC_PID_PAXFLAGS, CTL_EOL);
1146 
1147           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,
1148                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE,
1149                            CTLTYPE_STRING, "corename",
1150                            SYSCTL_DESCR("Core file name"),
1151                            sysctl_proc_corename, 0, NULL, MAXPATHLEN,
1152                            CTL_PROC, PROC_CURPROC, PROC_PID_CORENAME, CTL_EOL);
1153           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,
1154                            CTLFLAG_PERMANENT,
1155                            CTLTYPE_NODE, "rlimit",
1156                            SYSCTL_DESCR("Process limits"),
1157                            NULL, 0, NULL, 0,
1158                            CTL_PROC, PROC_CURPROC, PROC_PID_LIMIT, CTL_EOL);
1159 
1160 #define create_proc_plimit(s, n) do {                                           \
1161           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,                        \
1162                            CTLFLAG_PERMANENT,                                   \
1163                            CTLTYPE_NODE, s,                                               \
1164                            SYSCTL_DESCR("Process " s " limits"),                \
1165                            NULL, 0, NULL, 0,                                    \
1166                            CTL_PROC, PROC_CURPROC, PROC_PID_LIMIT, n, \
1167                            CTL_EOL);                                            \
1168           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,                        \
1169                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE, \
1170                            CTLTYPE_QUAD, "soft",                                \
1171                            SYSCTL_DESCR("Process soft " s " limit"),  \
1172                            sysctl_proc_plimit, 0, NULL, 0,                      \
1173                            CTL_PROC, PROC_CURPROC, PROC_PID_LIMIT, n, \
1174                            PROC_PID_LIMIT_TYPE_SOFT, CTL_EOL);                  \
1175           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,                        \
1176                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE, \
1177                            CTLTYPE_QUAD, "hard",                                \
1178                            SYSCTL_DESCR("Process hard " s " limit"),  \
1179                            sysctl_proc_plimit, 0, NULL, 0,                      \
1180                            CTL_PROC, PROC_CURPROC, PROC_PID_LIMIT, n, \
1181                            PROC_PID_LIMIT_TYPE_HARD, CTL_EOL);                  \
1182           } while (0/*CONSTCOND*/)
1183 
1184           create_proc_plimit("cputime",           PROC_PID_LIMIT_CPU);
1185           create_proc_plimit("filesize",                    PROC_PID_LIMIT_FSIZE);
1186           create_proc_plimit("datasize",                    PROC_PID_LIMIT_DATA);
1187           create_proc_plimit("stacksize",                   PROC_PID_LIMIT_STACK);
1188           create_proc_plimit("coredumpsize",      PROC_PID_LIMIT_CORE);
1189           create_proc_plimit("memoryuse",                   PROC_PID_LIMIT_RSS);
1190           create_proc_plimit("memorylocked",      PROC_PID_LIMIT_MEMLOCK);
1191           create_proc_plimit("maxproc",           PROC_PID_LIMIT_NPROC);
1192           create_proc_plimit("descriptors",       PROC_PID_LIMIT_NOFILE);
1193           create_proc_plimit("sbsize",            PROC_PID_LIMIT_SBSIZE);
1194           create_proc_plimit("vmemoryuse",        PROC_PID_LIMIT_AS);
1195           create_proc_plimit("maxlwp",            PROC_PID_LIMIT_NTHR);
1196 
1197 #undef create_proc_plimit
1198 
1199           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,
1200                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE,
1201                            CTLTYPE_INT, "stopfork",
1202                            SYSCTL_DESCR("Stop process at fork(2)"),
1203                            sysctl_proc_stop, 0, NULL, 0,
1204                            CTL_PROC, PROC_CURPROC, PROC_PID_STOPFORK, CTL_EOL);
1205           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,
1206                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE,
1207                            CTLTYPE_INT, "stopexec",
1208                            SYSCTL_DESCR("Stop process at execve(2)"),
1209                            sysctl_proc_stop, 0, NULL, 0,
1210                            CTL_PROC, PROC_CURPROC, PROC_PID_STOPEXEC, CTL_EOL);
1211           sysctl_createv(&proc_sysctllog, 0, NULL, NULL,
1212                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE|CTLFLAG_ANYWRITE,
1213                            CTLTYPE_INT, "stopexit",
1214                            SYSCTL_DESCR("Stop process before completing exit"),
1215                            sysctl_proc_stop, 0, NULL, 0,
1216                            CTL_PROC, PROC_CURPROC, PROC_PID_STOPEXIT, CTL_EOL);
1217 }
1218