1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 1993, David Greenman
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 #include "opt_capsicum.h"
31 #include "opt_hwpmc_hooks.h"
32 #include "opt_ktrace.h"
33 #include "opt_vm.h"
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/acct.h>
38 #include <sys/asan.h>
39 #include <sys/capsicum.h>
40 #include <sys/compressor.h>
41 #include <sys/eventhandler.h>
42 #include <sys/exec.h>
43 #include <sys/fcntl.h>
44 #include <sys/filedesc.h>
45 #include <sys/imgact.h>
46 #include <sys/imgact_elf.h>
47 #include <sys/kernel.h>
48 #include <sys/lock.h>
49 #include <sys/malloc.h>
50 #include <sys/mman.h>
51 #include <sys/mount.h>
52 #include <sys/mutex.h>
53 #include <sys/namei.h>
54 #include <sys/priv.h>
55 #include <sys/proc.h>
56 #include <sys/ptrace.h>
57 #include <sys/reg.h>
58 #include <sys/resourcevar.h>
59 #include <sys/rwlock.h>
60 #include <sys/sched.h>
61 #include <sys/sdt.h>
62 #include <sys/sf_buf.h>
63 #include <sys/shm.h>
64 #include <sys/signalvar.h>
65 #include <sys/smp.h>
66 #include <sys/stat.h>
67 #include <sys/syscallsubr.h>
68 #include <sys/sysctl.h>
69 #include <sys/sysent.h>
70 #include <sys/sysproto.h>
71 #include <sys/timers.h>
72 #include <sys/umtxvar.h>
73 #include <sys/vnode.h>
74 #include <sys/wait.h>
75 #ifdef KTRACE
76 #include <sys/ktrace.h>
77 #endif
78
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_kern.h>
85 #include <vm/vm_extern.h>
86 #include <vm/vm_object.h>
87 #include <vm/vm_pager.h>
88
89 #ifdef HWPMC_HOOKS
90 #include <sys/pmckern.h>
91 #endif
92
93 #include <security/audit/audit.h>
94 #include <security/mac/mac_framework.h>
95
/*
 * Optional DTrace fasttrap hook.  do_execve() calls it with the process
 * pointer, while the process lock is held, whenever it is non-NULL.
 */
96 #ifdef KDTRACE_HOOKS
97 #include <sys/dtrace_bsd.h>
98 dtrace_execexit_func_t dtrace_fasttrap_exec;
99 #endif
100
/*
 * Static probes of the "proc" provider fired by do_execve():
 *   exec          - fired once before the image lookup, with args->fname.
 *   exec__failure - fired on the failure path, with the error value.
 *   exec__success - fired after the registers are set up, with args->fname
 *                   (which do_execve() may have replaced with an
 *                   interpreter name, or NULL, by then).
 */
101 SDT_PROVIDER_DECLARE(proc);
102 SDT_PROBE_DEFINE1(proc, , , exec, "char *");
103 SDT_PROBE_DEFINE1(proc, , , exec__failure, "int");
104 SDT_PROBE_DEFINE1(proc, , , exec__success, "char *");
105
/*
 * Malloc type for process argument data; in the code visible here it also
 * backs the copy of the binary's final path component (newbinname).
 */
106 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
107
/*
 * kern.coredump_pack_fileinfo / kern.coredump_pack_vmmapinfo: tunables that
 * default to enabled.  NOTE(review): neither variable is referenced in the
 * part of the file visible here; presumably the coredump code consumes them.
 */
108 int coredump_pack_fileinfo = 1;
109 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN,
110 &coredump_pack_fileinfo, 0,
111 "Enable file path packing in 'procstat -f' coredump notes");
112
113 int coredump_pack_vmmapinfo = 1;
114 SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN,
115 &coredump_pack_vmmapinfo, 0,
116 "Enable file path packing in 'procstat -v' coredump notes");
117
/* Forward declarations: sysctl handlers and the execve worker defined below. */
118 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
119 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
120 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
121 static int do_execve(struct thread *td, struct image_args *args,
122 struct mac *mac_p, struct vmspace *oldvmspace);
123
/* Read-only, per-caller values: each handler reports on curproc. */
124 /* XXX This should be vm_size_t. */
125 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD|
126 CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_ps_strings, "LU",
127 "Location of process' ps_strings structure");
128
129 /* XXX This should be vm_size_t. */
130 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
131 CTLFLAG_CAPRD|CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_usrstack, "LU",
132 "Top of process stack");
133
134 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_MPSAFE,
135 NULL, 0, sysctl_kern_stackprot, "I",
136 "Stack memory permissions");
137
/*
 * do_execve() caches a copy of the argument strings only when their length
 * plus sizeof(struct pargs) does not exceed this limit.
 */
138 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
139 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
140 &ps_arg_cache_limit, 0,
141 "Process' command line characters cache limit");
142
/*
 * When non-zero, do_execve() fails with ENOEXEC if the image's osrel major
 * version is greater than that of __FreeBSD_version.
 */
143 static int disallow_high_osrel;
144 SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW,
145 &disallow_high_osrel, 0,
146 "Disallow execution of binaries built for higher version of the world");
147
/* NOTE(review): map_at_zero is not used in the code visible here. */
148 static int map_at_zero = 0;
149 SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RWTUN, &map_at_zero, 0,
150 "Permit processes to map an object at virtual address 0.");
151
/* NOTE(review): core_dump_can_intr is not used in the code visible here. */
152 static int core_dump_can_intr = 1;
153 SYSCTL_INT(_kern, OID_AUTO, core_dump_can_intr, CTLFLAG_RWTUN,
154 &core_dump_can_intr, 0,
155 "Core dumping interruptible with SIGKILL");
156
157 static int
sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)158 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
159 {
160 struct proc *p;
161 vm_offset_t ps_strings;
162
163 p = curproc;
164 #ifdef SCTL_MASK32
165 if (req->flags & SCTL_MASK32) {
166 unsigned int val;
167 val = (unsigned int)PROC_PS_STRINGS(p);
168 return (SYSCTL_OUT(req, &val, sizeof(val)));
169 }
170 #endif
171 ps_strings = PROC_PS_STRINGS(p);
172 return (SYSCTL_OUT(req, &ps_strings, sizeof(ps_strings)));
173 }
174
175 static int
sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)176 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
177 {
178 struct proc *p;
179 vm_offset_t val;
180
181 p = curproc;
182 #ifdef SCTL_MASK32
183 if (req->flags & SCTL_MASK32) {
184 unsigned int val32;
185
186 val32 = round_page((unsigned int)p->p_vmspace->vm_stacktop);
187 return (SYSCTL_OUT(req, &val32, sizeof(val32)));
188 }
189 #endif
190 val = round_page(p->p_vmspace->vm_stacktop);
191 return (SYSCTL_OUT(req, &val, sizeof(val)));
192 }
193
194 static int
sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)195 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
196 {
197 struct proc *p;
198
199 p = curproc;
200 return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
201 sizeof(p->p_sysent->sv_stackprot)));
202 }
203
204 /*
 * Table of image activators.  Each of the items is a pointer to a
 * `const struct execsw', hence the double pointer here.  do_execve()
 * walks the table until it reaches a NULL entry, skipping entries whose
 * ex_imgact is NULL.
 */
208 static const struct execsw **execsw;
209
/*
 * Argument layout consumed by sys_execve().  Only compiled when
 * _SYS_SYSPROTO_H_ is not defined; the three members are handed to
 * exec_copyin_args() as-is.
 */
210 #ifndef _SYS_SYSPROTO_H_
211 struct execve_args {
212 char *fname;
213 char **argv;
214 char **envv;
215 };
216 #endif
217
218 int
sys_execve(struct thread * td,struct execve_args * uap)219 sys_execve(struct thread *td, struct execve_args *uap)
220 {
221 struct image_args args;
222 struct vmspace *oldvmspace;
223 int error;
224
225 error = pre_execve(td, &oldvmspace);
226 if (error != 0)
227 return (error);
228 error = exec_copyin_args(&args, uap->fname, uap->argv, uap->envv);
229 if (error == 0)
230 error = kern_execve(td, &args, NULL, oldvmspace);
231 post_execve(td, error, oldvmspace);
232 AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
233 return (error);
234 }
235
/*
 * Argument layout consumed by sys_fexecve().  Only compiled when
 * _SYS_SYSPROTO_H_ is not defined.  fd is stored into image_args.fd;
 * argv and envv are handed to exec_copyin_args().
 */
236 #ifndef _SYS_SYSPROTO_H_
237 struct fexecve_args {
238 int fd;
239 char **argv;
240 char **envv;
241 };
242 #endif
243 int
sys_fexecve(struct thread * td,struct fexecve_args * uap)244 sys_fexecve(struct thread *td, struct fexecve_args *uap)
245 {
246 struct image_args args;
247 struct vmspace *oldvmspace;
248 int error;
249
250 error = pre_execve(td, &oldvmspace);
251 if (error != 0)
252 return (error);
253 error = exec_copyin_args(&args, NULL, uap->argv, uap->envv);
254 if (error == 0) {
255 args.fd = uap->fd;
256 error = kern_execve(td, &args, NULL, oldvmspace);
257 }
258 post_execve(td, error, oldvmspace);
259 AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
260 return (error);
261 }
262
/*
 * Argument layout consumed by sys___mac_execve().  Only compiled when
 * _SYS_SYSPROTO_H_ is not defined.  mac_p is forwarded to kern_execve()
 * unchanged; the other members go to exec_copyin_args().
 */
263 #ifndef _SYS_SYSPROTO_H_
264 struct __mac_execve_args {
265 char *fname;
266 char **argv;
267 char **envv;
268 struct mac *mac_p;
269 };
270 #endif
271
272 int
sys___mac_execve(struct thread * td,struct __mac_execve_args * uap)273 sys___mac_execve(struct thread *td, struct __mac_execve_args *uap)
274 {
275 #ifdef MAC
276 struct image_args args;
277 struct vmspace *oldvmspace;
278 int error;
279
280 error = pre_execve(td, &oldvmspace);
281 if (error != 0)
282 return (error);
283 error = exec_copyin_args(&args, uap->fname, uap->argv, uap->envv);
284 if (error == 0)
285 error = kern_execve(td, &args, uap->mac_p, oldvmspace);
286 post_execve(td, error, oldvmspace);
287 AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
288 return (error);
289 #else
290 return (ENOSYS);
291 #endif
292 }
293
294 int
pre_execve(struct thread * td,struct vmspace ** oldvmspace)295 pre_execve(struct thread *td, struct vmspace **oldvmspace)
296 {
297 struct proc *p;
298 int error;
299
300 KASSERT(td == curthread, ("non-current thread %p", td));
301 error = 0;
302 p = td->td_proc;
303 if ((p->p_flag & P_HADTHREADS) != 0) {
304 PROC_LOCK(p);
305 if (thread_single(p, SINGLE_BOUNDARY) != 0)
306 error = ERESTART;
307 PROC_UNLOCK(p);
308 }
309 KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0,
310 ("nested execve"));
311 *oldvmspace = p->p_vmspace;
312 return (error);
313 }
314
315 void
post_execve(struct thread * td,int error,struct vmspace * oldvmspace)316 post_execve(struct thread *td, int error, struct vmspace *oldvmspace)
317 {
318 struct proc *p;
319
320 KASSERT(td == curthread, ("non-current thread %p", td));
321 p = td->td_proc;
322 if ((p->p_flag & P_HADTHREADS) != 0) {
323 PROC_LOCK(p);
324 /*
325 * If success, we upgrade to SINGLE_EXIT state to
326 * force other threads to suicide.
327 */
328 if (error == EJUSTRETURN)
329 thread_single(p, SINGLE_EXIT);
330 else
331 thread_single_end(p, SINGLE_BOUNDARY);
332 PROC_UNLOCK(p);
333 }
334 exec_cleanup(td, oldvmspace);
335 }
336
337 /*
338 * kern_execve() has the astonishing property of not always returning to
339 * the caller. If sufficiently bad things happen during the call to
340 * do_execve(), it can end up calling exit1(); as a result, callers must
341 * avoid doing anything which they might need to undo (e.g., allocating
342 * memory).
343 */
344 int
kern_execve(struct thread * td,struct image_args * args,struct mac * mac_p,struct vmspace * oldvmspace)345 kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p,
346 struct vmspace *oldvmspace)
347 {
348
349 TSEXEC(td->td_proc->p_pid, args->begin_argv);
350 AUDIT_ARG_ARGV(args->begin_argv, args->argc,
351 exec_args_get_begin_envv(args) - args->begin_argv);
352 AUDIT_ARG_ENVV(exec_args_get_begin_envv(args), args->envc,
353 args->endp - exec_args_get_begin_envv(args));
354
355 /* Must have at least one argument. */
356 if (args->argc == 0) {
357 exec_free_args(args);
358 return (EINVAL);
359 }
360 return (do_execve(td, args, mac_p, oldvmspace));
361 }
362
363 static void
execve_nosetid(struct image_params * imgp)364 execve_nosetid(struct image_params *imgp)
365 {
366 imgp->credential_setid = false;
367 if (imgp->newcred != NULL) {
368 crfree(imgp->newcred);
369 imgp->newcred = NULL;
370 }
371 }
372
373 /*
 * In-kernel implementation of execve().
 *
 * 'args' holds the path, argument and environment strings that the caller
 * has already gathered; the path is looked up with UIO_SYSSPACE.  'mac_p'
 * is handed unchanged to mac_execve_enter() when MAC is configured.
 * 'oldvmspace' is used only on the fatal path, where it is passed to
 * exec_cleanup() right before exit1().
 *
 * Returns EJUSTRETURN on success and an error number on failure.  'args'
 * is released with exec_free_args() on every path that returns.  If an
 * error occurs after imgp->vmspace_destroyed has been set, the process is
 * terminated with exit1(td, 0, SIGABRT) and this function does not return.
 *
 * Cleanup labels:
 *   exec_fail_dealloc - reached on success (fall-through) and on failures
 *                       that happen once imgp->vp is set; releases the
 *                       image vnode, first-page mapping, object reference
 *                       and path buffers.
 *   exec_fail         - located inside the error branch; clears P_INEXEC
 *                       and fires the exec__failure probe, then joins the
 *                       common tail.
 */
377 static int
do_execve(struct thread * td,struct image_args * args,struct mac * mac_p,struct vmspace * oldvmspace)378 do_execve(struct thread *td, struct image_args *args, struct mac *mac_p,
379 struct vmspace *oldvmspace)
380 {
381 struct proc *p = td->td_proc;
382 struct nameidata nd;
383 struct ucred *oldcred;
384 struct uidinfo *euip = NULL;
385 uintptr_t stack_base;
386 struct image_params image_params, *imgp;
387 struct vattr attr;
388 struct pargs *oldargs = NULL, *newargs = NULL;
389 struct sigacts *oldsigacts = NULL, *newsigacts = NULL;
390 #ifdef KTRACE
391 struct ktr_io_params *kiop;
392 #endif
393 struct vnode *oldtextvp, *newtextvp;
394 struct vnode *oldtextdvp, *newtextdvp;
395 char *oldbinname, *newbinname;
396 bool credential_changing;
397 #ifdef MAC
398 struct label *interpvplabel = NULL;
399 bool will_transition;
400 #endif
401 #ifdef HWPMC_HOOKS
402 struct pmckern_procexec pe;
403 #endif
404 int error, i, orig_osrel;
405 uint32_t orig_fctl0;
406 Elf_Brandinfo *orig_brandinfo;
407 size_t freepath_size;
408 static const char fexecv_proc_title[] = "(fexecv)";
409
410 imgp = &image_params;
411 oldtextvp = oldtextdvp = NULL;
412 newtextvp = newtextdvp = NULL;
413 newbinname = oldbinname = NULL;
414 #ifdef KTRACE
415 kiop = NULL;
416 #endif
417
418 /*
419 * Lock the process and set the P_INEXEC flag to indicate that
420 * it should be left alone until we're done here. This is
421 * necessary to avoid race conditions - e.g. in ptrace() -
422 * that might allow a local user to illicitly obtain elevated
423 * privileges.
424 */
425 PROC_LOCK(p);
426 KASSERT((p->p_flag & P_INEXEC) == 0,
427 ("%s(): process already has P_INEXEC flag", __func__));
428 p->p_flag |= P_INEXEC;
429 PROC_UNLOCK(p);
430
431 /*
432 * Initialize part of the common data
433 */
434 bzero(imgp, sizeof(*imgp));
435 imgp->proc = p;
436 imgp->attr = &attr;
437 imgp->args = args;
438 oldcred = p->p_ucred;
439 orig_osrel = p->p_osrel;
440 orig_fctl0 = p->p_fctl0;
441 orig_brandinfo = p->p_elf_brandinfo;
442
443 #ifdef MAC
444 error = mac_execve_enter(imgp, mac_p);
445 if (error)
446 goto exec_fail;
447 #endif
448
449 SDT_PROBE1(proc, , , exec, args->fname);
450
/*
 * Re-entered with 'goto interpret' after an image activator marks the
 * image as interpreted.  On re-entry args->fname is the interpreter name,
 * or NULL when the activator supplied imgp->interpreter_vp instead.
 */
451 interpret:
452 if (args->fname != NULL) {
453 #ifdef CAPABILITY_MODE
454 if (CAP_TRACING(td))
455 ktrcapfail(CAPFAIL_NAMEI, args->fname);
456 /*
457 * While capability mode can't reach this point via direct
458 * path arguments to execve(), we also don't allow
459 * interpreters to be used in capability mode (for now).
460 * Catch indirect lookups and return a permissions error.
461 */
462 if (IN_CAPABILITY_MODE(td)) {
463 error = ECAPMODE;
464 goto exec_fail;
465 }
466 #endif
467
468 /*
469 * Translate the file name. namei() returns a vnode
470 * pointer in ni_vp among other things.
471 */
472 NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | LOCKSHARED | FOLLOW |
473 AUDITVNODE1 | WANTPARENT, UIO_SYSSPACE,
474 args->fname);
475
476 error = namei(&nd);
477 if (error)
478 goto exec_fail;
479
480 newtextvp = nd.ni_vp;
481 newtextdvp = nd.ni_dvp;
482 nd.ni_dvp = NULL;
483 newbinname = malloc(nd.ni_cnd.cn_namelen + 1, M_PARGS,
484 M_WAITOK);
485 memcpy(newbinname, nd.ni_cnd.cn_nameptr, nd.ni_cnd.cn_namelen);
486 newbinname[nd.ni_cnd.cn_namelen] = '\0';
487 imgp->vp = newtextvp;
488
489 /*
490 * Do the best to calculate the full path to the image file.
491 */
492 if (args->fname[0] == '/') {
493 imgp->execpath = args->fname;
494 } else {
495 VOP_UNLOCK(imgp->vp);
496 freepath_size = MAXPATHLEN;
497 if (vn_fullpath_hardlink(newtextvp, newtextdvp,
498 newbinname, nd.ni_cnd.cn_namelen, &imgp->execpath,
499 &imgp->freepath, &freepath_size) != 0)
500 imgp->execpath = args->fname;
501 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
502 }
503 } else if (imgp->interpreter_vp) {
504 /*
505 * An image activator has already provided an open vnode
506 */
507 newtextvp = imgp->interpreter_vp;
508 imgp->interpreter_vp = NULL;
509 if (vn_fullpath(newtextvp, &imgp->execpath,
510 &imgp->freepath) != 0)
511 imgp->execpath = args->fname;
512 vn_lock(newtextvp, LK_SHARED | LK_RETRY);
513 AUDIT_ARG_VNODE1(newtextvp);
514 imgp->vp = newtextvp;
515 } else {
516 AUDIT_ARG_FD(args->fd);
517
518 /*
519 * If the descriptors was not opened with O_PATH, then
520 * we require that it was opened with O_EXEC or
521 * O_RDONLY. In either case, exec_check_permissions()
522 * below checks _current_ file access mode regardless
523 * of the permissions additionally checked at the
524 * open(2).
525 */
526 error = fgetvp_exec(td, args->fd, &cap_fexecve_rights,
527 &newtextvp);
528 if (error != 0)
529 goto exec_fail;
530
531 if (vn_fullpath(newtextvp, &imgp->execpath,
532 &imgp->freepath) != 0)
533 imgp->execpath = args->fname;
534 vn_lock(newtextvp, LK_SHARED | LK_RETRY);
535 AUDIT_ARG_VNODE1(newtextvp);
536 imgp->vp = newtextvp;
537 }
538
539 /*
540 * Check file permissions. Also 'opens' file and sets its vnode to
541 * text mode.
542 */
543 error = exec_check_permissions(imgp);
544 if (error)
545 goto exec_fail_dealloc;
546
547 imgp->object = imgp->vp->v_object;
548 if (imgp->object != NULL)
549 vm_object_reference(imgp->object);
550
551 error = exec_map_first_page(imgp);
552 if (error)
553 goto exec_fail_dealloc;
554
/*
 * Clear the per-image ABI fields before the image activators run; the
 * values saved in orig_osrel, orig_fctl0 and orig_brandinfo are put back
 * at exec_fail_dealloc if the exec fails.
 */
555 imgp->proc->p_osrel = 0;
556 imgp->proc->p_fctl0 = 0;
557 imgp->proc->p_elf_brandinfo = NULL;
558
559 /*
560 * Implement image setuid/setgid.
561 *
562 * Determine new credentials before attempting image activators
563 * so that it can be used by process_exec handlers to determine
564 * credential/setid changes.
565 *
566 * Don't honor setuid/setgid if the filesystem prohibits it or if
567 * the process is being traced.
568 *
569 * We disable setuid/setgid/etc in capability mode on the basis
570 * that most setugid applications are not written with that
571 * environment in mind, and will therefore almost certainly operate
572 * incorrectly. In principle there's no reason that setugid
573 * applications might not be useful in capability mode, so we may want
574 * to reconsider this conservative design choice in the future.
575 *
576 * XXXMAC: For the time being, use NOSUID to also prohibit
577 * transitions on the file system.
578 */
579 credential_changing = false;
580 credential_changing |= (attr.va_mode & S_ISUID) &&
581 oldcred->cr_uid != attr.va_uid;
582 credential_changing |= (attr.va_mode & S_ISGID) &&
583 oldcred->cr_gid != attr.va_gid;
584 #ifdef MAC
585 will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
586 interpvplabel, imgp) != 0;
587 credential_changing |= will_transition;
588 #endif
589
590 /* Don't inherit PROC_PDEATHSIG_CTL value if setuid/setgid. */
591 if (credential_changing)
592 imgp->proc->p_pdeathsig = 0;
593
594 if (credential_changing &&
595 #ifdef CAPABILITY_MODE
596 ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
597 #endif
598 (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
599 (p->p_flag & P_TRACED) == 0) {
600 imgp->credential_setid = true;
601 VOP_UNLOCK(imgp->vp);
602 imgp->newcred = crdup(oldcred);
603 if (attr.va_mode & S_ISUID) {
604 euip = uifind(attr.va_uid);
605 change_euid(imgp->newcred, euip);
606 }
607 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
608 if (attr.va_mode & S_ISGID)
609 change_egid(imgp->newcred, attr.va_gid);
610 /*
611 * Implement correct POSIX saved-id behavior.
612 *
613 * XXXMAC: Note that the current logic will save the
614 * uid and gid if a MAC domain transition occurs, even
615 * though maybe it shouldn't.
616 */
617 change_svuid(imgp->newcred, imgp->newcred->cr_uid);
618 change_svgid(imgp->newcred, imgp->newcred->cr_gid);
619 } else {
620 /*
621 * Implement correct POSIX saved-id behavior.
622 *
623 * XXX: It's not clear that the existing behavior is
624 * POSIX-compliant. A number of sources indicate that the
625 * saved uid/gid should only be updated if the new ruid is
626 * not equal to the old ruid, or the new euid is not equal
627 * to the old euid and the new euid is not equal to the old
628 * ruid. The FreeBSD code always updates the saved uid/gid.
629 * Also, this code uses the new (replaced) euid and egid as
630 * the source, which may or may not be the right ones to use.
631 */
632 if (oldcred->cr_svuid != oldcred->cr_uid ||
633 oldcred->cr_svgid != oldcred->cr_gid) {
634 VOP_UNLOCK(imgp->vp);
635 imgp->newcred = crdup(oldcred);
636 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
637 change_svuid(imgp->newcred, imgp->newcred->cr_uid);
638 change_svgid(imgp->newcred, imgp->newcred->cr_gid);
639 }
640 }
641 /* The new credentials are installed into the process later. */
642
643 /*
644 * Loop through the list of image activators, calling each one.
645 * An activator returns -1 if there is no match, 0 on success,
646 * and an error otherwise.
647 */
648 error = -1;
649 for (i = 0; error == -1 && execsw[i]; ++i) {
650 if (execsw[i]->ex_imgact == NULL)
651 continue;
652 error = (*execsw[i]->ex_imgact)(imgp);
653 }
654
655 if (error) {
656 if (error == -1)
657 error = ENOEXEC;
658 goto exec_fail_dealloc;
659 }
660
661 /*
662 * Special interpreter operation, cleanup and loop up to try to
663 * activate the interpreter.
664 */
665 if (imgp->interpreted) {
666 exec_unmap_first_page(imgp);
667 /*
668 * The text reference needs to be removed for scripts.
669 * There is a short period before we determine that
670 * something is a script where text reference is active.
671 * The vnode lock is held over this entire period
672 * so nothing should illegitimately be blocked.
673 */
674 MPASS(imgp->textset);
675 VOP_UNSET_TEXT_CHECKED(newtextvp);
676 imgp->textset = false;
677 /* free name buffer and old vnode */
678 #ifdef MAC
679 mac_execve_interpreter_enter(newtextvp, &interpvplabel);
680 #endif
681 if (imgp->opened) {
682 VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td);
683 imgp->opened = false;
684 }
685 vput(newtextvp);
686 imgp->vp = newtextvp = NULL;
687 if (args->fname != NULL) {
688 if (newtextdvp != NULL) {
689 vrele(newtextdvp);
690 newtextdvp = NULL;
691 }
692 NDFREE_PNBUF(&nd);
693 free(newbinname, M_PARGS);
694 newbinname = NULL;
695 }
696 vm_object_deallocate(imgp->object);
697 imgp->object = NULL;
698 execve_nosetid(imgp);
699 imgp->execpath = NULL;
700 free(imgp->freepath, M_TEMP);
701 imgp->freepath = NULL;
702 /* set new name to that of the interpreter */
703 if (imgp->interpreter_vp) {
704 args->fname = NULL;
705 } else {
706 args->fname = imgp->interpreter_name;
707 }
708 goto interpret;
709 }
710
711 /*
712 * NB: We unlock the vnode here because it is believed that none
713 * of the sv_copyout_strings/sv_fixup operations require the vnode.
714 */
715 VOP_UNLOCK(imgp->vp);
716
/*
 * Each failure below re-locks the vnode before jumping, because
 * exec_fail_dealloc releases it with vput() on error.
 */
717 if (disallow_high_osrel &&
718 P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
719 error = ENOEXEC;
720 uprintf("Osrel %d for image %s too high\n", p->p_osrel,
721 imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
722 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
723 goto exec_fail_dealloc;
724 }
725
726 /*
727 * Copy out strings (args and env) and initialize stack base.
728 */
729 error = (*p->p_sysent->sv_copyout_strings)(imgp, &stack_base);
730 if (error != 0) {
731 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
732 goto exec_fail_dealloc;
733 }
734
735 /*
736 * Stack setup.
737 */
738 error = (*p->p_sysent->sv_fixup)(&stack_base, imgp);
739 if (error != 0) {
740 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
741 goto exec_fail_dealloc;
742 }
743
744 /*
745 * For security and other reasons, the file descriptor table cannot be
746 * shared after an exec.
747 */
748 fdunshare(td);
749 pdunshare(td);
750 /* close files on exec */
751 fdcloseexec(td);
752
753 /*
754 * Malloc things before we need locks.
755 */
/* i is the distance from begin_argv to the start of the envv area. */
756 i = exec_args_get_begin_envv(imgp->args) - imgp->args->begin_argv;
757 /* Cache arguments if they fit inside our allowance */
758 if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
759 newargs = pargs_alloc(i);
760 bcopy(imgp->args->begin_argv, newargs->ar_args, i);
761 }
762
763 /*
764 * For security and other reasons, signal handlers cannot
765 * be shared after an exec. The new process gets a copy of the old
766 * handlers. In execsigs(), the new process will have its signals
767 * reset.
768 */
769 if (sigacts_shared(p->p_sigacts)) {
770 oldsigacts = p->p_sigacts;
771 newsigacts = sigacts_alloc();
772 sigacts_copy(newsigacts, oldsigacts);
773 }
774
775 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
776
777 PROC_LOCK(p);
778 if (oldsigacts)
779 p->p_sigacts = newsigacts;
780 /* Stop profiling */
781 stopprofclock(p);
782
783 /* reset caught signals */
784 execsigs(p);
785
786 /* name this process - nameiexec(p, ndp) */
787 bzero(p->p_comm, sizeof(p->p_comm));
788 if (args->fname)
789 bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
790 min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
791 else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0)
792 bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
793 bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
794 #ifdef KTR
795 sched_clear_tdname(td);
796 #endif
797
798 /*
799 * mark as execed, wakeup the process that vforked (if any) and tell
800 * it that it now has its own resources back
801 */
802 p->p_flag |= P_EXEC;
803 if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
804 p->p_flag2 &= ~P2_NOTRACE;
805 if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0)
806 p->p_flag2 &= ~P2_STKGAP_DISABLE;
807 p->p_flag2 &= ~(P2_MEMBAR_PRIVE | P2_MEMBAR_PRIVE_SYNCORE |
808 P2_MEMBAR_GLOBE);
809 if (p->p_flag & P_PPWAIT) {
810 p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
811 cv_broadcast(&p->p_pwait);
812 /* STOPs are no longer ignored, arrange for AST */
813 signotify(td);
814 }
815
/*
 * Drop the prepared set-id credential if the ABI's sv_setid_allowed hook
 * refuses it or the process has P2_NO_NEW_PRIVS set.
 */
816 if ((imgp->sysent->sv_setid_allowed != NULL &&
817 !(*imgp->sysent->sv_setid_allowed)(td, imgp)) ||
818 (p->p_flag2 & P2_NO_NEW_PRIVS) != 0)
819 execve_nosetid(imgp);
820
821 /*
822 * Implement image setuid/setgid installation.
823 */
824 if (imgp->credential_setid) {
825 /*
826 * Turn off syscall tracing for set-id programs, except for
827 * root. Record any set-id flags first to make sure that
828 * we do not regain any tracing during a possible block.
829 */
830 setsugid(p);
831 #ifdef KTRACE
832 kiop = ktrprocexec(p);
833 #endif
834 /*
835 * Close any file descriptors 0..2 that reference procfs,
836 * then make sure file descriptors 0..2 are in use.
837 *
838 * Both fdsetugidsafety() and fdcheckstd() may call functions
839 * taking sleepable locks, so temporarily drop our locks.
840 */
841 PROC_UNLOCK(p);
842 VOP_UNLOCK(imgp->vp);
843 fdsetugidsafety(td);
844 error = fdcheckstd(td);
845 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
846 if (error != 0)
847 goto exec_fail_dealloc;
848 PROC_LOCK(p);
849 #ifdef MAC
850 if (will_transition) {
851 mac_vnode_execve_transition(oldcred, imgp->newcred,
852 imgp->vp, interpvplabel, imgp);
853 }
854 #endif
855 } else {
856 if (oldcred->cr_uid == oldcred->cr_ruid &&
857 oldcred->cr_gid == oldcred->cr_rgid)
858 p->p_flag &= ~P_SUGID;
859 }
860 /*
861 * Set the new credentials.
862 */
863 if (imgp->newcred != NULL) {
864 proc_set_cred(p, imgp->newcred);
865 crfree(oldcred);
866 oldcred = NULL;
867 }
868
869 /*
870 * Store the vp for use in kern.proc.pathname. This vnode was
871 * referenced by namei() or by fexecve variant of fname handling.
872 */
873 oldtextvp = p->p_textvp;
874 p->p_textvp = newtextvp;
875 oldtextdvp = p->p_textdvp;
876 p->p_textdvp = newtextdvp;
877 newtextdvp = NULL;
878 oldbinname = p->p_binname;
879 p->p_binname = newbinname;
880 newbinname = NULL;
881
882 #ifdef KDTRACE_HOOKS
883 /*
884 * Tell the DTrace fasttrap provider about the exec if it
885 * has declared an interest.
886 */
887 if (dtrace_fasttrap_exec)
888 dtrace_fasttrap_exec(p);
889 #endif
890
891 /*
892 * Notify others that we exec'd, and clear the P_INEXEC flag
893 * as we're now a bona fide freshly-execed process.
894 */
895 KNOTE_LOCKED(p->p_klist, NOTE_EXEC);
896 p->p_flag &= ~P_INEXEC;
897
898 /* clear "fork but no exec" flag, as we _are_ execing */
899 p->p_acflag &= ~AFORK;
900
901 /*
902 * Free any previous argument cache and replace it with
903 * the new argument cache, if any.
904 */
905 oldargs = p->p_args;
906 p->p_args = newargs;
907 newargs = NULL;
908
909 PROC_UNLOCK(p);
910
911 #ifdef HWPMC_HOOKS
912 /*
913 * Check if system-wide sampling is in effect or if the
914 * current process is using PMCs. If so, do exec() time
915 * processing. This processing needs to happen AFTER the
916 * P_INEXEC flag is cleared.
917 */
918 if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
919 VOP_UNLOCK(imgp->vp);
920 pe.pm_credentialschanged = credential_changing;
921 pe.pm_baseaddr = imgp->reloc_base;
922 pe.pm_dynaddr = imgp->et_dyn_addr;
923
924 PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
925 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
926 }
927 #endif
928
929 /* Set values passed into the program in registers. */
930 (*p->p_sysent->sv_setregs)(td, imgp, stack_base);
931
932 VOP_MMAPPED(imgp->vp);
933
934 SDT_PROBE1(proc, , , exec__success, args->fname);
935
/*
 * Common release point.  The success path falls through to here with
 * error == 0; in that case the vnode is only unlocked, since its
 * reference now belongs to p->p_textvp.
 */
936 exec_fail_dealloc:
937 if (error != 0) {
938 p->p_osrel = orig_osrel;
939 p->p_fctl0 = orig_fctl0;
940 p->p_elf_brandinfo = orig_brandinfo;
941 }
942
943 if (imgp->firstpage != NULL)
944 exec_unmap_first_page(imgp);
945
946 if (imgp->vp != NULL) {
947 if (imgp->opened)
948 VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
949 if (imgp->textset)
950 VOP_UNSET_TEXT_CHECKED(imgp->vp);
951 if (error != 0)
952 vput(imgp->vp);
953 else
954 VOP_UNLOCK(imgp->vp);
955 if (args->fname != NULL)
956 NDFREE_PNBUF(&nd);
957 if (newtextdvp != NULL)
958 vrele(newtextdvp);
959 free(newbinname, M_PARGS);
960 }
961
962 if (imgp->object != NULL)
963 vm_object_deallocate(imgp->object);
964
965 free(imgp->freepath, M_TEMP);
966
967 if (error == 0) {
968 if (p->p_ptevents & PTRACE_EXEC) {
969 PROC_LOCK(p);
970 if (p->p_ptevents & PTRACE_EXEC)
971 td->td_dbgflags |= TDB_EXEC;
972 PROC_UNLOCK(p);
973 }
974 } else {
/*
 * Early failures jump straight here, before imgp->vp has been set, so
 * there is nothing for the block above to release.
 */
975 exec_fail:
976 /* we're done here, clear P_INEXEC */
977 PROC_LOCK(p);
978 p->p_flag &= ~P_INEXEC;
979 PROC_UNLOCK(p);
980
981 SDT_PROBE1(proc, , , exec__failure, error);
982 }
983
/*
 * oldcred is set to NULL once proc_set_cred() has installed newcred, so a
 * non-NULL oldcred here means newcred was never installed and is dropped.
 */
984 if (imgp->newcred != NULL && oldcred != NULL)
985 crfree(imgp->newcred);
986
987 #ifdef MAC
988 mac_execve_exit(imgp);
989 mac_execve_interpreter_exit(interpvplabel);
990 #endif
991 exec_free_args(args);
992
993 /*
994 * Handle deferred decrement of ref counts.
995 */
996 if (oldtextvp != NULL)
997 vrele(oldtextvp);
998 if (oldtextdvp != NULL)
999 vrele(oldtextdvp);
1000 free(oldbinname, M_PARGS);
1001 #ifdef KTRACE
1002 ktr_io_params_free(kiop);
1003 #endif
1004 pargs_drop(oldargs);
1005 pargs_drop(newargs);
1006 if (oldsigacts != NULL)
1007 sigacts_free(oldsigacts);
1008 if (euip != NULL)
1009 uifree(euip);
1010
1011 if (error && imgp->vmspace_destroyed) {
1012 /* sorry, no more process anymore. exit gracefully */
1013 exec_cleanup(td, oldvmspace);
1014 exit1(td, 0, SIGABRT);
1015 /* NOT REACHED */
1016 }
1017
1018 #ifdef KTRACE
1019 if (error == 0)
1020 ktrprocctor(p);
1021 #endif
1022
1023 /*
1024 * We don't want cpu_set_syscall_retval() to overwrite any of
1025 * the register values put in place by exec_setregs().
1026 * Implementations of cpu_set_syscall_retval() will leave
1027 * registers unmodified when returning EJUSTRETURN.
1028 */
1029 return (error == 0 ? EJUSTRETURN : error);
1030 }
1031
1032 void
exec_cleanup(struct thread * td,struct vmspace * oldvmspace)1033 exec_cleanup(struct thread *td, struct vmspace *oldvmspace)
1034 {
1035 if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
1036 KASSERT(td->td_proc->p_vmspace != oldvmspace,
1037 ("oldvmspace still used"));
1038 vmspace_free(oldvmspace);
1039 td->td_pflags &= ~TDP_EXECVMSPC;
1040 }
1041 }
1042
1043 int
exec_map_first_page(struct image_params * imgp)1044 exec_map_first_page(struct image_params *imgp)
1045 {
1046 vm_object_t object;
1047 vm_page_t m;
1048 int error;
1049
1050 if (imgp->firstpage != NULL)
1051 exec_unmap_first_page(imgp);
1052
1053 object = imgp->vp->v_object;
1054 if (object == NULL)
1055 return (EACCES);
1056 #if VM_NRESERVLEVEL > 0
1057 if ((object->flags & OBJ_COLORED) == 0) {
1058 VM_OBJECT_WLOCK(object);
1059 vm_object_color(object, 0);
1060 VM_OBJECT_WUNLOCK(object);
1061 }
1062 #endif
1063 error = vm_page_grab_valid_unlocked(&m, object, 0,
1064 VM_ALLOC_COUNT(VM_INITIAL_PAGEIN) |
1065 VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED);
1066
1067 if (error != VM_PAGER_OK)
1068 return (EIO);
1069 imgp->firstpage = sf_buf_alloc(m, 0);
1070 imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
1071
1072 return (0);
1073 }
1074
1075 void
exec_unmap_first_page(struct image_params * imgp)1076 exec_unmap_first_page(struct image_params *imgp)
1077 {
1078 vm_page_t m;
1079
1080 if (imgp->firstpage != NULL) {
1081 m = sf_buf_page(imgp->firstpage);
1082 sf_buf_free(imgp->firstpage);
1083 imgp->firstpage = NULL;
1084 vm_page_unwire(m, PQ_ACTIVE);
1085 }
1086 }
1087
/*
 * Calls sigfastblock_clear() for the thread and umtx_exec() for its
 * process.
 *
 * NOTE(review): presumably meant to be installed as a sysentvec
 * sv_onexec_old hook, which exec_new_vmspace() invokes before the old
 * address space is torn down -- confirm against the sysentvec definitions.
 */
void
exec_onexec_old(struct thread *td)
{
	sigfastblock_clear(td);
	umtx_exec(td->td_proc);
}
1094
1095 /*
1096 * This is an optimization which removes the unmanaged shared page
1097 * mapping. In combination with pmap_remove_pages(), which cleans all
1098 * managed mappings in the process' vmspace pmap, no work will be left
1099 * for pmap_remove(min, max).
1100 */
1101 void
exec_free_abi_mappings(struct proc * p)1102 exec_free_abi_mappings(struct proc *p)
1103 {
1104 struct vmspace *vmspace;
1105
1106 vmspace = p->p_vmspace;
1107 if (refcount_load(&vmspace->vm_refcnt) != 1)
1108 return;
1109
1110 if (!PROC_HAS_SHP(p))
1111 return;
1112
1113 pmap_remove(vmspace_pmap(vmspace), vmspace->vm_shp_base,
1114 vmspace->vm_shp_base + p->p_sysent->sv_shared_page_len);
1115 }
1116
/*
 * Run down the current address space and install a new one.
 *
 * imgp - image being activated; imgp->sysent is set to "sv" and
 *        imgp->vmspace_destroyed is set to true.
 * sv   - sysentvec describing the new image's address space bounds.
 *
 * The existing vmspace is emptied and reused when it is not shared, its
 * bounds already match the new ABI, and cpu_exec_vmspace_reuse() agrees;
 * otherwise vmspace_exec() installs a fresh one.
 *
 * Returns the error from vmspace_exec() if it fails, otherwise the result
 * of sv->sv_onexec() when that hook is set, or 0.
 */
int
exec_new_vmspace(struct image_params *imgp, struct sysentvec *sv)
{
	int error;
	struct proc *p = imgp->proc;
	struct vmspace *vmspace = p->p_vmspace;
	struct thread *td = curthread;
	vm_offset_t sv_minuser;
	vm_map_t map;

	/*
	 * Record the point of no return before touching anything: the exec
	 * error path checks this flag and terminates the process rather
	 * than returning to the old image.
	 */
	imgp->vmspace_destroyed = true;
	imgp->sysent = sv;

	/* Let the outgoing ABI clean up, using the old p_sysent. */
	if (p->p_sysent->sv_onexec_old != NULL)
		p->p_sysent->sv_onexec_old(td);
	itimers_exec(p);

	EVENTHANDLER_DIRECT_INVOKE(process_exec, p, imgp);

	/*
	 * Blow away entire process VM, if address space not shared,
	 * otherwise, create a new VM space so that other threads are
	 * not disrupted
	 */
	map = &vmspace->vm_map;
	/* Unless mapping at address zero is allowed, skip the first page. */
	if (map_at_zero)
		sv_minuser = sv->sv_minuser;
	else
		sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
	if (refcount_load(&vmspace->vm_refcnt) == 1 &&
	    vm_map_min(map) == sv_minuser &&
	    vm_map_max(map) == sv->sv_maxuser &&
	    cpu_exec_vmspace_reuse(p, map)) {
		exec_free_abi_mappings(p);
		shmexit(vmspace);
		pmap_remove_pages(vmspace_pmap(vmspace));
		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
		/*
		 * An exec terminates mlockall(MCL_FUTURE).
		 * ASLR and W^X states must be re-evaluated.
		 */
		vm_map_lock(map);
		vm_map_modflags(map, 0, MAP_WIREFUTURE | MAP_ASLR |
		    MAP_ASLR_IGNSTART | MAP_ASLR_STACK | MAP_WXORX);
		vm_map_unlock(map);
	} else {
		error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
		if (error)
			return (error);
		/* vmspace_exec() replaced p_vmspace; pick up the new map. */
		vmspace = p->p_vmspace;
		map = &vmspace->vm_map;
	}
	/* Apply the map flags requested for the new image. */
	map->flags |= imgp->map_flags;

	return (sv->sv_onexec != NULL ? sv->sv_onexec(p, imgp) : 0);
}
1176
1177 /*
1178 * Compute the stack size limit and map the main process stack.
1179 * Map the shared page.
1180 */
1181 int
exec_map_stack(struct image_params * imgp)1182 exec_map_stack(struct image_params *imgp)
1183 {
1184 struct rlimit rlim_stack;
1185 struct sysentvec *sv;
1186 struct proc *p;
1187 vm_map_t map;
1188 struct vmspace *vmspace;
1189 vm_offset_t stack_addr, stack_top;
1190 vm_offset_t sharedpage_addr;
1191 u_long ssiz;
1192 int error, find_space, stack_off;
1193 vm_prot_t stack_prot;
1194 vm_object_t obj;
1195
1196 p = imgp->proc;
1197 sv = p->p_sysent;
1198
1199 if (imgp->stack_sz != 0) {
1200 ssiz = trunc_page(imgp->stack_sz);
1201 PROC_LOCK(p);
1202 lim_rlimit_proc(p, RLIMIT_STACK, &rlim_stack);
1203 PROC_UNLOCK(p);
1204 if (ssiz > rlim_stack.rlim_max)
1205 ssiz = rlim_stack.rlim_max;
1206 if (ssiz > rlim_stack.rlim_cur) {
1207 rlim_stack.rlim_cur = ssiz;
1208 kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
1209 }
1210 } else if (sv->sv_maxssiz != NULL) {
1211 ssiz = *sv->sv_maxssiz;
1212 } else {
1213 ssiz = maxssiz;
1214 }
1215
1216 vmspace = p->p_vmspace;
1217 map = &vmspace->vm_map;
1218
1219 stack_prot = sv->sv_shared_page_obj != NULL && imgp->stack_prot != 0 ?
1220 imgp->stack_prot : sv->sv_stackprot;
1221 if ((map->flags & MAP_ASLR_STACK) != 0) {
1222 stack_addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
1223 lim_max(curthread, RLIMIT_DATA));
1224 find_space = VMFS_ANY_SPACE;
1225 } else {
1226 stack_addr = sv->sv_usrstack - ssiz;
1227 find_space = VMFS_NO_SPACE;
1228 }
1229 error = vm_map_find(map, NULL, 0, &stack_addr, (vm_size_t)ssiz,
1230 sv->sv_usrstack, find_space, stack_prot, VM_PROT_ALL,
1231 MAP_STACK_GROWS_DOWN);
1232 if (error != KERN_SUCCESS) {
1233 uprintf("exec_new_vmspace: mapping stack size %#jx prot %#x "
1234 "failed, mach error %d errno %d\n", (uintmax_t)ssiz,
1235 stack_prot, error, vm_mmap_to_errno(error));
1236 return (vm_mmap_to_errno(error));
1237 }
1238
1239 stack_top = stack_addr + ssiz;
1240 if ((map->flags & MAP_ASLR_STACK) != 0) {
1241 /* Randomize within the first page of the stack. */
1242 arc4rand(&stack_off, sizeof(stack_off), 0);
1243 stack_top -= rounddown2(stack_off & PAGE_MASK, sizeof(void *));
1244 }
1245
1246 /* Map a shared page */
1247 obj = sv->sv_shared_page_obj;
1248 if (obj == NULL) {
1249 sharedpage_addr = 0;
1250 goto out;
1251 }
1252
1253 /*
1254 * If randomization is disabled then the shared page will
1255 * be mapped at address specified in sysentvec.
1256 * Otherwise any address above .data section can be selected.
1257 * Same logic is used for stack address randomization.
1258 * If the address randomization is applied map a guard page
1259 * at the top of UVA.
1260 */
1261 vm_object_reference(obj);
1262 if ((imgp->imgp_flags & IMGP_ASLR_SHARED_PAGE) != 0) {
1263 sharedpage_addr = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
1264 lim_max(curthread, RLIMIT_DATA));
1265
1266 error = vm_map_fixed(map, NULL, 0,
1267 sv->sv_maxuser - PAGE_SIZE, PAGE_SIZE,
1268 VM_PROT_NONE, VM_PROT_NONE, MAP_CREATE_GUARD);
1269 if (error != KERN_SUCCESS) {
1270 /*
1271 * This is not fatal, so let's just print a warning
1272 * and continue.
1273 */
1274 uprintf("%s: Mapping guard page at the top of UVA failed"
1275 " mach error %d errno %d",
1276 __func__, error, vm_mmap_to_errno(error));
1277 }
1278
1279 error = vm_map_find(map, obj, 0,
1280 &sharedpage_addr, sv->sv_shared_page_len,
1281 sv->sv_maxuser, VMFS_ANY_SPACE,
1282 VM_PROT_READ | VM_PROT_EXECUTE,
1283 VM_PROT_READ | VM_PROT_EXECUTE,
1284 MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
1285 } else {
1286 sharedpage_addr = sv->sv_shared_page_base;
1287 vm_map_fixed(map, obj, 0,
1288 sharedpage_addr, sv->sv_shared_page_len,
1289 VM_PROT_READ | VM_PROT_EXECUTE,
1290 VM_PROT_READ | VM_PROT_EXECUTE,
1291 MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
1292 }
1293 if (error != KERN_SUCCESS) {
1294 uprintf("%s: mapping shared page at addr: %p"
1295 "failed, mach error %d errno %d\n", __func__,
1296 (void *)sharedpage_addr, error, vm_mmap_to_errno(error));
1297 vm_object_deallocate(obj);
1298 return (vm_mmap_to_errno(error));
1299 }
1300 out:
1301 /*
1302 * vm_ssize and vm_maxsaddr are somewhat antiquated concepts, but they
1303 * are still used to enforce the stack rlimit on the process stack.
1304 */
1305 vmspace->vm_maxsaddr = (char *)stack_addr;
1306 vmspace->vm_stacktop = stack_top;
1307 vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
1308 vmspace->vm_shp_base = sharedpage_addr;
1309
1310 return (0);
1311 }
1312
1313 /*
1314 * Copy out argument and environment strings from the old process address
1315 * space into the temporary string buffer.
1316 */
1317 int
exec_copyin_args(struct image_args * args,const char * fname,char ** argv,char ** envv)1318 exec_copyin_args(struct image_args *args, const char *fname,
1319 char **argv, char **envv)
1320 {
1321 u_long arg, env;
1322 int error;
1323
1324 bzero(args, sizeof(*args));
1325 if (argv == NULL)
1326 return (EFAULT);
1327
1328 /*
1329 * Allocate demand-paged memory for the file name, argument, and
1330 * environment strings.
1331 */
1332 error = exec_alloc_args(args);
1333 if (error != 0)
1334 return (error);
1335
1336 /*
1337 * Copy the file name.
1338 */
1339 error = exec_args_add_fname(args, fname, UIO_USERSPACE);
1340 if (error != 0)
1341 goto err_exit;
1342
1343 /*
1344 * extract arguments first
1345 */
1346 for (;;) {
1347 error = fueword(argv++, &arg);
1348 if (error == -1) {
1349 error = EFAULT;
1350 goto err_exit;
1351 }
1352 if (arg == 0)
1353 break;
1354 error = exec_args_add_arg(args, (char *)(uintptr_t)arg,
1355 UIO_USERSPACE);
1356 if (error != 0)
1357 goto err_exit;
1358 }
1359
1360 /*
1361 * extract environment strings
1362 */
1363 if (envv) {
1364 for (;;) {
1365 error = fueword(envv++, &env);
1366 if (error == -1) {
1367 error = EFAULT;
1368 goto err_exit;
1369 }
1370 if (env == 0)
1371 break;
1372 error = exec_args_add_env(args,
1373 (char *)(uintptr_t)env, UIO_USERSPACE);
1374 if (error != 0)
1375 goto err_exit;
1376 }
1377 }
1378
1379 return (0);
1380
1381 err_exit:
1382 exec_free_args(args);
1383 return (error);
1384 }
1385
/*
 * One preallocated KVA range used to hold exec argument strings.
 */
struct exec_args_kva {
	vm_offset_t addr;	/* base of the range within exec_map */
	u_int gen;		/* exec_args_gen value at last madvise check */
	SLIST_ENTRY(exec_args_kva) next;	/* free list linkage */
};

/* Per-CPU slot caching at most one free entry; accessed with atomics. */
DPCPU_DEFINE_STATIC(struct exec_args_kva *, exec_args_kva);

/* Global list of free entries, manipulated under exec_args_kva_mtx. */
static SLIST_HEAD(, exec_args_kva) exec_args_kva_freelist;
static struct mtx exec_args_kva_mtx;
/* Generation counter, advanced by the vm_lowmem handler. */
static u_int exec_args_gen;
1397
1398 static void
exec_prealloc_args_kva(void * arg __unused)1399 exec_prealloc_args_kva(void *arg __unused)
1400 {
1401 struct exec_args_kva *argkva;
1402 u_int i;
1403
1404 SLIST_INIT(&exec_args_kva_freelist);
1405 mtx_init(&exec_args_kva_mtx, "exec args kva", NULL, MTX_DEF);
1406 for (i = 0; i < exec_map_entries; i++) {
1407 argkva = malloc(sizeof(*argkva), M_PARGS, M_WAITOK);
1408 argkva->addr = kmap_alloc_wait(exec_map, exec_map_entry_size);
1409 argkva->gen = exec_args_gen;
1410 SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next);
1411 }
1412 }
1413 SYSINIT(exec_args_kva, SI_SUB_EXEC, SI_ORDER_ANY, exec_prealloc_args_kva, NULL);
1414
1415 static vm_offset_t
exec_alloc_args_kva(void ** cookie)1416 exec_alloc_args_kva(void **cookie)
1417 {
1418 struct exec_args_kva *argkva;
1419
1420 argkva = (void *)atomic_readandclear_ptr(
1421 (uintptr_t *)DPCPU_PTR(exec_args_kva));
1422 if (argkva == NULL) {
1423 mtx_lock(&exec_args_kva_mtx);
1424 while ((argkva = SLIST_FIRST(&exec_args_kva_freelist)) == NULL)
1425 (void)mtx_sleep(&exec_args_kva_freelist,
1426 &exec_args_kva_mtx, 0, "execkva", 0);
1427 SLIST_REMOVE_HEAD(&exec_args_kva_freelist, next);
1428 mtx_unlock(&exec_args_kva_mtx);
1429 }
1430 kasan_mark((void *)argkva->addr, exec_map_entry_size,
1431 exec_map_entry_size, 0);
1432 *(struct exec_args_kva **)cookie = argkva;
1433 return (argkva->addr);
1434 }
1435
1436 static void
exec_release_args_kva(struct exec_args_kva * argkva,u_int gen)1437 exec_release_args_kva(struct exec_args_kva *argkva, u_int gen)
1438 {
1439 vm_offset_t base;
1440
1441 base = argkva->addr;
1442 kasan_mark((void *)argkva->addr, 0, exec_map_entry_size,
1443 KASAN_EXEC_ARGS_FREED);
1444 if (argkva->gen != gen) {
1445 (void)vm_map_madvise(exec_map, base, base + exec_map_entry_size,
1446 MADV_FREE);
1447 argkva->gen = gen;
1448 }
1449 if (!atomic_cmpset_ptr((uintptr_t *)DPCPU_PTR(exec_args_kva),
1450 (uintptr_t)NULL, (uintptr_t)argkva)) {
1451 mtx_lock(&exec_args_kva_mtx);
1452 SLIST_INSERT_HEAD(&exec_args_kva_freelist, argkva, next);
1453 wakeup_one(&exec_args_kva_freelist);
1454 mtx_unlock(&exec_args_kva_mtx);
1455 }
1456 }
1457
/*
 * Release the KVA range identified by "cookie" (as stored by
 * exec_alloc_args_kva()), using the current value of exec_args_gen as the
 * generation to compare against.
 */
static void
exec_free_args_kva(void *cookie)
{

	exec_release_args_kva(cookie, exec_args_gen);
}
1464
/*
 * vm_lowmem event handler.  Advances exec_args_gen and pushes every
 * currently free KVA range (global free list and per-CPU slots) through
 * exec_release_args_kva() with the new generation, so that each one gets
 * MADV_FREE applied.
 */
static void
exec_args_kva_lowmem(void *arg __unused)
{
	SLIST_HEAD(, exec_args_kva) head;
	struct exec_args_kva *argkva;
	u_int gen;
	int i;

	/* fetchadd returns the old value; "gen" is the new generation. */
	gen = atomic_fetchadd_int(&exec_args_gen, 1) + 1;

	/*
	 * Force an madvise of each KVA range. Any currently allocated ranges
	 * will have MADV_FREE applied once they are freed.
	 */
	SLIST_INIT(&head);
	/* Detach the whole free list so it can be walked without the mutex. */
	mtx_lock(&exec_args_kva_mtx);
	SLIST_SWAP(&head, &exec_args_kva_freelist, exec_args_kva);
	mtx_unlock(&exec_args_kva_mtx);
	while ((argkva = SLIST_FIRST(&head)) != NULL) {
		SLIST_REMOVE_HEAD(&head, next);
		exec_release_args_kva(argkva, gen);
	}

	/* Do the same for whatever is cached in each CPU's slot. */
	CPU_FOREACH(i) {
		argkva = (void *)atomic_readandclear_ptr(
		    (uintptr_t *)DPCPU_ID_PTR(i, exec_args_kva));
		if (argkva != NULL)
			exec_release_args_kva(argkva, gen);
	}
}
EVENTHANDLER_DEFINE(vm_lowmem, exec_args_kva_lowmem, NULL,
    EVENTHANDLER_PRI_ANY);
1497
/*
 * Allocate temporary demand-paged, zero-filled memory for the file name,
 * argument, and environment strings.
 *
 * The buffer address is stored in args->buf and the allocator's cookie in
 * args->bufkva.  Always returns 0.
 */
int
exec_alloc_args(struct image_args *args)
{

	args->buf = (char *)exec_alloc_args_kva(&args->bufkva);
	return (0);
}
1509
/*
 * Release the resources held by "args": the string buffer obtained by
 * exec_alloc_args() and the separately allocated fname_buf, if present.
 * Each pointer is reset to NULL once released, so calling this again on
 * the same structure does nothing.
 */
void
exec_free_args(struct image_args *args)
{

	if (args->buf != NULL) {
		exec_free_args_kva(args->bufkva);
		args->buf = NULL;
	}
	if (args->fname_buf != NULL) {
		free(args->fname_buf, M_TEMP);
		args->fname_buf = NULL;
	}
}
1523
1524 /*
 * A set of functions to fill struct image_args.
1526 *
1527 * NOTE: exec_args_add_fname() must be called (possibly with a NULL
1528 * fname) before the other functions. All exec_args_add_arg() calls must
1529 * be made before any exec_args_add_env() calls. exec_args_adjust_args()
1530 * may be called any time after exec_args_add_fname().
1531 *
1532 * exec_args_add_fname() - install path to be executed
1533 * exec_args_add_arg() - append an argument string
1534 * exec_args_add_env() - append an env string
1535 * exec_args_adjust_args() - adjust location of the argument list to
1536 * allow new arguments to be prepended
1537 */
1538 int
exec_args_add_fname(struct image_args * args,const char * fname,enum uio_seg segflg)1539 exec_args_add_fname(struct image_args *args, const char *fname,
1540 enum uio_seg segflg)
1541 {
1542 int error;
1543 size_t length;
1544
1545 KASSERT(args->fname == NULL, ("fname already appended"));
1546 KASSERT(args->endp == NULL, ("already appending to args"));
1547
1548 if (fname != NULL) {
1549 args->fname = args->buf;
1550 error = segflg == UIO_SYSSPACE ?
1551 copystr(fname, args->fname, PATH_MAX, &length) :
1552 copyinstr(fname, args->fname, PATH_MAX, &length);
1553 if (error != 0)
1554 return (error == ENAMETOOLONG ? E2BIG : error);
1555 } else
1556 length = 0;
1557
1558 /* Set up for _arg_*()/_env_*() */
1559 args->endp = args->buf + length;
1560 /* begin_argv must be set and kept updated */
1561 args->begin_argv = args->endp;
1562 KASSERT(exec_map_entry_size - length >= ARG_MAX,
1563 ("too little space remaining for arguments %zu < %zu",
1564 exec_map_entry_size - length, (size_t)ARG_MAX));
1565 args->stringspace = ARG_MAX;
1566
1567 return (0);
1568 }
1569
1570 static int
exec_args_add_str(struct image_args * args,const char * str,enum uio_seg segflg,int * countp)1571 exec_args_add_str(struct image_args *args, const char *str,
1572 enum uio_seg segflg, int *countp)
1573 {
1574 int error;
1575 size_t length;
1576
1577 KASSERT(args->endp != NULL, ("endp not initialized"));
1578 KASSERT(args->begin_argv != NULL, ("begin_argp not initialized"));
1579
1580 error = (segflg == UIO_SYSSPACE) ?
1581 copystr(str, args->endp, args->stringspace, &length) :
1582 copyinstr(str, args->endp, args->stringspace, &length);
1583 if (error != 0)
1584 return (error == ENAMETOOLONG ? E2BIG : error);
1585 args->stringspace -= length;
1586 args->endp += length;
1587 (*countp)++;
1588
1589 return (0);
1590 }
1591
/*
 * Append an argument string and count it in args->argc.  Arguments must
 * all be added before the first environment string (asserted).
 * Returns the result of exec_args_add_str().
 */
int
exec_args_add_arg(struct image_args *args, const char *argp,
    enum uio_seg segflg)
{

	KASSERT(args->envc == 0, ("appending args after env"));

	return (exec_args_add_str(args, argp, segflg, &args->argc));
}
1601
/*
 * Append an environment string and count it in args->envc.
 * Returns the result of exec_args_add_str().
 */
int
exec_args_add_env(struct image_args *args, const char *envp,
    enum uio_seg segflg)
{

	/* The first environment string marks where the environment begins. */
	if (args->envc == 0)
		args->begin_envv = args->endp;

	return (exec_args_add_str(args, envp, segflg, &args->envc));
}
1612
1613 int
exec_args_adjust_args(struct image_args * args,size_t consume,ssize_t extend)1614 exec_args_adjust_args(struct image_args *args, size_t consume, ssize_t extend)
1615 {
1616 ssize_t offset;
1617
1618 KASSERT(args->endp != NULL, ("endp not initialized"));
1619 KASSERT(args->begin_argv != NULL, ("begin_argp not initialized"));
1620
1621 offset = extend - consume;
1622 if (args->stringspace < offset)
1623 return (E2BIG);
1624 memmove(args->begin_argv + extend, args->begin_argv + consume,
1625 args->endp - args->begin_argv + consume);
1626 if (args->envc > 0)
1627 args->begin_envv += offset;
1628 args->endp += offset;
1629 args->stringspace -= offset;
1630 return (0);
1631 }
1632
1633 char *
exec_args_get_begin_envv(struct image_args * args)1634 exec_args_get_begin_envv(struct image_args *args)
1635 {
1636
1637 KASSERT(args->endp != NULL, ("endp not initialized"));
1638
1639 if (args->envc > 0)
1640 return (args->begin_envv);
1641 return (args->endp);
1642 }
1643
/*
 * Copy strings out to the new process address space, constructing new arg
 * and env vector tables. Return a pointer to the base so that it can be used
 * as the initial stack pointer.
 *
 * Working downwards from PROC_PS_STRINGS(p), the following are placed in
 * this order: the signal trampoline (only when the ABI has no shared page
 * base), the image path (only with auxargs), the SSP canary, the pagesizes
 * array, the argument/environment strings, room for the ELF auxargs, and
 * finally the argv[]/envv[] pointer tables.  The user addresses of these
 * items are recorded in imgp, and the address of the pointer tables is
 * stored in *stack_base.
 *
 * Returns 0 on success, the copyout() error or EFAULT when a user store
 * fails, or the error returned by the ABI's sv_copyout_auxargs hook.
 */
int
exec_copyout_strings(struct image_params *imgp, uintptr_t *stack_base)
{
	int argc, envc;
	char **vectp;
	char *stringp;
	uintptr_t destp, ustringp;
	struct ps_strings *arginfo;
	struct proc *p;
	struct sysentvec *sysent;
	size_t execpath_len;
	int error, szsigcode;
	char canary[sizeof(long) * 8];

	p = imgp->proc;
	sysent = p->p_sysent;

	/* destp is the running "top of free space" cursor; it only moves down. */
	destp = PROC_PS_STRINGS(p);
	arginfo = imgp->ps_strings = (void *)destp;

	/*
	 * Install sigcode.
	 */
	if (sysent->sv_shared_page_base == 0 && sysent->sv_szsigcode != NULL) {
		szsigcode = *(sysent->sv_szsigcode);
		destp -= szsigcode;
		destp = rounddown2(destp, sizeof(void *));
		error = copyout(sysent->sv_sigcode, (void *)destp, szsigcode);
		if (error != 0)
			return (error);
	}

	/*
	 * Copy the image path for the rtld.
	 */
	if (imgp->execpath != NULL && imgp->auxargs != NULL) {
		execpath_len = strlen(imgp->execpath) + 1;
		destp -= execpath_len;
		destp = rounddown2(destp, sizeof(void *));
		imgp->execpathp = (void *)destp;
		error = copyout(imgp->execpath, imgp->execpathp, execpath_len);
		if (error != 0)
			return (error);
	}

	/*
	 * Prepare the canary for SSP.
	 */
	arc4rand(canary, sizeof(canary), 0);
	destp -= sizeof(canary);
	imgp->canary = (void *)destp;
	error = copyout(canary, imgp->canary, sizeof(canary));
	if (error != 0)
		return (error);
	imgp->canarylen = sizeof(canary);

	/*
	 * Prepare the pagesizes array.
	 */
	imgp->pagesizeslen = sizeof(pagesizes[0]) * MAXPAGESIZES;
	destp -= imgp->pagesizeslen;
	destp = rounddown2(destp, sizeof(void *));
	imgp->pagesizes = (void *)destp;
	error = copyout(pagesizes, imgp->pagesizes, imgp->pagesizeslen);
	if (error != 0)
		return (error);

	/*
	 * Allocate room for the argument and environment strings.
	 */
	/* ARG_MAX - stringspace is the number of string bytes collected. */
	destp -= ARG_MAX - imgp->args->stringspace;
	destp = rounddown2(destp, sizeof(void *));
	ustringp = destp;

	if (imgp->auxargs) {
		/*
		 * Allocate room on the stack for the ELF auxargs
		 * array.  It has up to AT_COUNT entries.
		 */
		destp -= AT_COUNT * sizeof(Elf_Auxinfo);
		destp = rounddown2(destp, sizeof(void *));
	}

	vectp = (char **)destp;

	/*
	 * Allocate room for the argv[] and env vectors including the
	 * terminating NULL pointers.
	 */
	vectp -= imgp->args->argc + 1 + imgp->args->envc + 1;

	/*
	 * vectp also becomes our initial stack base
	 */
	*stack_base = (uintptr_t)vectp;

	stringp = imgp->args->begin_argv;
	argc = imgp->args->argc;
	envc = imgp->args->envc;

	/*
	 * Copy out strings - arguments and environment.
	 */
	error = copyout(stringp, (void *)ustringp,
	    ARG_MAX - imgp->args->stringspace);
	if (error != 0)
		return (error);

	/*
	 * Fill in "ps_strings" struct for ps, w, etc.
	 */
	imgp->argv = vectp;
	if (suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp) != 0 ||
	    suword32(&arginfo->ps_nargvstr, argc) != 0)
		return (EFAULT);

	/*
	 * Fill in argument portion of vector table.
	 */
	for (; argc > 0; --argc) {
		if (suword(vectp++, ustringp) != 0)
			return (EFAULT);
		/*
		 * Walk the kernel copy of the string to advance the user
		 * address past it, including its terminating NUL.
		 */
		while (*stringp++ != 0)
			ustringp++;
		ustringp++;
	}

	/* a null vector table pointer separates the argp's from the envp's */
	if (suword(vectp++, 0) != 0)
		return (EFAULT);

	imgp->envv = vectp;
	if (suword(&arginfo->ps_envstr, (long)(intptr_t)vectp) != 0 ||
	    suword32(&arginfo->ps_nenvstr, envc) != 0)
		return (EFAULT);

	/*
	 * Fill in environment portion of vector table.
	 */
	for (; envc > 0; --envc) {
		if (suword(vectp++, ustringp) != 0)
			return (EFAULT);
		while (*stringp++ != 0)
			ustringp++;
		ustringp++;
	}

	/* end of vector table is a null pointer */
	if (suword(vectp, 0) != 0)
		return (EFAULT);

	if (imgp->auxargs) {
		/* The auxargs go right behind the terminating null pointer. */
		vectp++;
		error = imgp->sysent->sv_copyout_auxargs(imgp,
		    (uintptr_t)vectp);
		if (error != 0)
			return (error);
	}

	return (0);
}
1810
1811 /*
1812 * Check permissions of file to execute.
1813 * Called with imgp->vp locked.
1814 * Return 0 for success or error code on failure.
1815 */
1816 int
exec_check_permissions(struct image_params * imgp)1817 exec_check_permissions(struct image_params *imgp)
1818 {
1819 struct vnode *vp = imgp->vp;
1820 struct vattr *attr = imgp->attr;
1821 struct thread *td;
1822 int error;
1823
1824 td = curthread;
1825
1826 /* Get file attributes */
1827 error = VOP_GETATTR(vp, attr, td->td_ucred);
1828 if (error)
1829 return (error);
1830
1831 #ifdef MAC
1832 error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
1833 if (error)
1834 return (error);
1835 #endif
1836
1837 /*
1838 * 1) Check if file execution is disabled for the filesystem that
1839 * this file resides on.
1840 * 2) Ensure that at least one execute bit is on. Otherwise, a
1841 * privileged user will always succeed, and we don't want this
1842 * to happen unless the file really is executable.
1843 * 3) Ensure that the file is a regular file.
1844 */
1845 if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
1846 (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 ||
1847 (attr->va_type != VREG))
1848 return (EACCES);
1849
1850 /*
1851 * Zero length files can't be exec'd
1852 */
1853 if (attr->va_size == 0)
1854 return (ENOEXEC);
1855
1856 /*
1857 * Check for execute permission to file based on current credentials.
1858 */
1859 error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
1860 if (error)
1861 return (error);
1862
1863 /*
1864 * Check number of open-for-writes on the file and deny execution
1865 * if there are any.
1866 *
1867 * Add a text reference now so no one can write to the
1868 * executable while we're activating it.
1869 *
1870 * Remember if this was set before and unset it in case this is not
1871 * actually an executable image.
1872 */
1873 error = VOP_SET_TEXT(vp);
1874 if (error != 0)
1875 return (error);
1876 imgp->textset = true;
1877
1878 /*
1879 * Call filesystem specific open routine (which does nothing in the
1880 * general case).
1881 */
1882 error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
1883 if (error == 0)
1884 imgp->opened = true;
1885 return (error);
1886 }
1887
1888 /*
1889 * Exec handler registration
1890 */
1891 int
exec_register(const struct execsw * execsw_arg)1892 exec_register(const struct execsw *execsw_arg)
1893 {
1894 const struct execsw **es, **xs, **newexecsw;
1895 u_int count = 2; /* New slot and trailing NULL */
1896
1897 if (execsw)
1898 for (es = execsw; *es; es++)
1899 count++;
1900 newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
1901 xs = newexecsw;
1902 if (execsw)
1903 for (es = execsw; *es; es++)
1904 *xs++ = *es;
1905 *xs++ = execsw_arg;
1906 *xs = NULL;
1907 if (execsw)
1908 free(execsw, M_TEMP);
1909 execsw = newexecsw;
1910 return (0);
1911 }
1912
1913 int
exec_unregister(const struct execsw * execsw_arg)1914 exec_unregister(const struct execsw *execsw_arg)
1915 {
1916 const struct execsw **es, **xs, **newexecsw;
1917 int count = 1;
1918
1919 if (execsw == NULL)
1920 panic("unregister with no handlers left?\n");
1921
1922 for (es = execsw; *es; es++) {
1923 if (*es == execsw_arg)
1924 break;
1925 }
1926 if (*es == NULL)
1927 return (ENOENT);
1928 for (es = execsw; *es; es++)
1929 if (*es != execsw_arg)
1930 count++;
1931 newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
1932 xs = newexecsw;
1933 for (es = execsw; *es; es++)
1934 if (*es != execsw_arg)
1935 *xs++ = *es;
1936 *xs = NULL;
1937 if (execsw)
1938 free(execsw, M_TEMP);
1939 execsw = newexecsw;
1940 return (0);
1941 }
1942
1943 /*
1944 * Write out a core segment to the compression stream.
1945 */
1946 static int
compress_chunk(struct coredump_params * cp,char * base,char * buf,size_t len)1947 compress_chunk(struct coredump_params *cp, char *base, char *buf, size_t len)
1948 {
1949 size_t chunk_len;
1950 int error;
1951
1952 while (len > 0) {
1953 chunk_len = MIN(len, CORE_BUF_SIZE);
1954
1955 /*
1956 * We can get EFAULT error here.
1957 * In that case zero out the current chunk of the segment.
1958 */
1959 error = copyin(base, buf, chunk_len);
1960 if (error != 0)
1961 bzero(buf, chunk_len);
1962 error = compressor_write(cp->comp, buf, chunk_len);
1963 if (error != 0)
1964 break;
1965 base += chunk_len;
1966 len -= chunk_len;
1967 }
1968 return (error);
1969 }
1970
/*
 * Write "len" bytes starting at "base" (an address in segment "seg") to
 * the core file vnode cp->vp at file offset "offset".  The write is issued
 * through vn_rdwr_inchunks() with IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
 * using the credentials and thread recorded in "cp".  "resid" is passed
 * through to vn_rdwr_inchunks().
 *
 * Returns the result of vn_rdwr_inchunks().
 */
int
core_write(struct coredump_params *cp, const void *base, size_t len,
    off_t offset, enum uio_seg seg, size_t *resid)
{

	return (vn_rdwr_inchunks(UIO_WRITE, cp->vp, __DECONST(void *, base),
	    len, offset, seg, IO_UNIT | IO_DIRECT | IO_RANGELOCKED,
	    cp->active_cred, cp->file_cred, resid, cp->td));
}
1980
1981 int
core_output(char * base,size_t len,off_t offset,struct coredump_params * cp,void * tmpbuf)1982 core_output(char *base, size_t len, off_t offset, struct coredump_params *cp,
1983 void *tmpbuf)
1984 {
1985 vm_map_t map;
1986 struct mount *mp;
1987 size_t resid, runlen;
1988 int error;
1989 bool success;
1990
1991 KASSERT((uintptr_t)base % PAGE_SIZE == 0,
1992 ("%s: user address %p is not page-aligned", __func__, base));
1993
1994 if (cp->comp != NULL)
1995 return (compress_chunk(cp, base, tmpbuf, len));
1996
1997 map = &cp->td->td_proc->p_vmspace->vm_map;
1998 for (; len > 0; base += runlen, offset += runlen, len -= runlen) {
1999 /*
2000 * Attempt to page in all virtual pages in the range. If a
2001 * virtual page is not backed by the pager, it is represented as
2002 * a hole in the file. This can occur with zero-filled
2003 * anonymous memory or truncated files, for example.
2004 */
2005 for (runlen = 0; runlen < len; runlen += PAGE_SIZE) {
2006 if (core_dump_can_intr && curproc_sigkilled())
2007 return (EINTR);
2008 error = vm_fault(map, (uintptr_t)base + runlen,
2009 VM_PROT_READ, VM_FAULT_NOFILL, NULL);
2010 if (runlen == 0)
2011 success = error == KERN_SUCCESS;
2012 else if ((error == KERN_SUCCESS) != success)
2013 break;
2014 }
2015
2016 if (success) {
2017 error = core_write(cp, base, runlen, offset,
2018 UIO_USERSPACE, &resid);
2019 if (error != 0) {
2020 if (error != EFAULT)
2021 break;
2022
2023 /*
2024 * EFAULT may be returned if the user mapping
2025 * could not be accessed, e.g., because a mapped
2026 * file has been truncated. Skip the page if no
2027 * progress was made, to protect against a
2028 * hypothetical scenario where vm_fault() was
2029 * successful but core_write() returns EFAULT
2030 * anyway.
2031 */
2032 runlen -= resid;
2033 if (runlen == 0) {
2034 success = false;
2035 runlen = PAGE_SIZE;
2036 }
2037 }
2038 }
2039 if (!success) {
2040 error = vn_start_write(cp->vp, &mp, V_WAIT);
2041 if (error != 0)
2042 break;
2043 vn_lock(cp->vp, LK_EXCLUSIVE | LK_RETRY);
2044 error = vn_truncate_locked(cp->vp, offset + runlen,
2045 false, cp->td->td_ucred);
2046 VOP_UNLOCK(cp->vp);
2047 vn_finished_write(mp);
2048 if (error != 0)
2049 break;
2050 }
2051 }
2052 return (error);
2053 }
2054
2055 /*
2056 * Drain into a core file.
2057 */
2058 int
sbuf_drain_core_output(void * arg,const char * data,int len)2059 sbuf_drain_core_output(void *arg, const char *data, int len)
2060 {
2061 struct coredump_params *cp;
2062 struct proc *p;
2063 int error, locked;
2064
2065 cp = arg;
2066 p = cp->td->td_proc;
2067
2068 /*
2069 * Some kern_proc out routines that print to this sbuf may
2070 * call us with the process lock held. Draining with the
2071 * non-sleepable lock held is unsafe. The lock is needed for
2072 * those routines when dumping a live process. In our case we
2073 * can safely release the lock before draining and acquire
2074 * again after.
2075 */
2076 locked = PROC_LOCKED(p);
2077 if (locked)
2078 PROC_UNLOCK(p);
2079 if (cp->comp != NULL)
2080 error = compressor_write(cp->comp, __DECONST(char *, data),
2081 len);
2082 else
2083 error = core_write(cp, __DECONST(void *, data), len, cp->offset,
2084 UIO_SYSSPACE, NULL);
2085 if (locked)
2086 PROC_LOCK(p);
2087 if (error != 0)
2088 return (-error);
2089 cp->offset += len;
2090 return (len);
2091 }
2092