xref: /freebsd-13-stable/sys/kern/kern_jail.c (revision 73e891526ebfda33ccc6162b0b41af46ee3a689c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 1999 Poul-Henning Kamp.
5  * Copyright (c) 2008 Bjoern A. Zeeb.
6  * Copyright (c) 2009 James Gritton.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include "opt_nfs.h"
36 
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/kernel.h>
40 #include <sys/systm.h>
41 #include <sys/errno.h>
42 #include <sys/sysproto.h>
43 #include <sys/malloc.h>
44 #include <sys/osd.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/taskqueue.h>
48 #include <sys/fcntl.h>
49 #include <sys/jail.h>
50 #include <sys/linker.h>
51 #include <sys/lock.h>
52 #include <sys/mman.h>
53 #include <sys/mutex.h>
54 #include <sys/racct.h>
55 #include <sys/rctl.h>
56 #include <sys/refcount.h>
57 #include <sys/sx.h>
58 #include <sys/sysent.h>
59 #include <sys/namei.h>
60 #include <sys/mount.h>
61 #include <sys/queue.h>
62 #include <sys/socket.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/uuid.h>
66 #include <sys/vnode.h>
67 
68 #include <net/if.h>
69 #include <net/vnet.h>
70 
71 #include <netinet/in.h>
72 
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif /* DDB */
76 
77 #include <security/mac/mac_framework.h>
78 
79 #define	PRISON0_HOSTUUID_MODULE	"hostuuid"
80 
81 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
82 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
83 
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the union of the source-address-selection flags for
 * the address families compiled into this kernel, or 0 when neither INET
 * nor INET6 is configured.  The expansion is parenthesized so that it is
 * always a single operand, whatever operator it is used with.
 */
#if defined(INET) && defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#elif defined(INET)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#elif defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
98 
99 /* prison0 describes what is "real" about the system. */
100 struct prison prison0 = {
101 	.pr_id		= 0,
102 	.pr_name	= "0",
103 	.pr_ref		= 1,
104 	.pr_uref	= 1,
105 	.pr_path	= "/",
106 	.pr_securelevel	= -1,
107 	.pr_devfs_rsnum = 0,
108 	.pr_state	= PRISON_STATE_ALIVE,
109 	.pr_childmax	= JAIL_MAX,
110 	.pr_hostuuid	= DEFAULT_HOSTUUID,
111 	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
112 #ifdef VIMAGE
113 	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
114 #else
115 	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
116 #endif
117 	.pr_allow	= PR_ALLOW_ALL_STATIC,
118 };
119 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
120 
121 struct bool_flags {
122 	const char	*name;
123 	const char	*noname;
124 	volatile u_int	 flag;
125 };
126 struct jailsys_flags {
127 	const char	*name;
128 	unsigned	 disable;
129 	unsigned	 new;
130 };
131 
132 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
133 struct	sx allprison_lock;
134 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
135 struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
136 LIST_HEAD(, prison_racct) allprison_racct;
137 int	lastprid = 0;
138 
139 static int get_next_prid(struct prison **insprp);
140 static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
141 static void prison_complete(void *context, int pending);
142 static void prison_deref(struct prison *pr, int flags);
143 static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
144 static int prison_lock_xlock(struct prison *pr, int flags);
145 static void prison_cleanup(struct prison *pr);
146 static void prison_free_not_last(struct prison *pr);
147 static void prison_proc_free_not_last(struct prison *pr);
148 static void prison_set_allow_locked(struct prison *pr, unsigned flag,
149     int enable);
150 static char *prison_path(struct prison *pr1, struct prison *pr2);
151 #ifdef RACCT
152 static void prison_racct_attach(struct prison *pr);
153 static void prison_racct_modify(struct prison *pr);
154 static void prison_racct_detach(struct prison *pr);
155 #endif
156 
/*
 * Flags for prison_deref.  The operation flags say what to do; the lock
 * flags describe which locks are held.  The two masks are built from their
 * member flags so they cannot drift apart from them.
 */
#define	PD_DEREF	0x01	/* Decrement pr_ref */
#define	PD_DEUREF	0x02	/* Decrement pr_uref */
#define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
#define	PD_LOCKED	0x10	/* pr_mtx is held */
#define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
#define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */
/* Operation flags */
#define	PD_OP_FLAGS	(PD_DEREF | PD_DEUREF | PD_KILL)
/* Lock status flags */
#define	PD_LOCK_FLAGS	(PD_LOCKED | PD_LIST_SLOCKED | PD_LIST_XLOCKED)
166 
167 /*
168  * Parameter names corresponding to PR_* flag values.  Size values are for kvm
169  * as we cannot figure out the size of a sparse array, or an array without a
170  * terminating entry.
171  */
172 static struct bool_flags pr_flag_bool[] = {
173 	{"persist", "nopersist", PR_PERSIST},
174 #ifdef INET
175 	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
176 #endif
177 #ifdef INET6
178 	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
179 #endif
180 };
181 const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
182 
183 static struct jailsys_flags pr_flag_jailsys[] = {
184 	{"host", 0, PR_HOST},
185 #ifdef VIMAGE
186 	{"vnet", 0, PR_VNET},
187 #endif
188 #ifdef INET
189 	{"ip4", PR_IP4_USER, PR_IP4_USER},
190 #endif
191 #ifdef INET6
192 	{"ip6", PR_IP6_USER, PR_IP6_USER},
193 #endif
194 };
195 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
196 
197 /*
198  * Make this array full-size so dynamic parameters can be added.
199  * It is protected by prison0.mtx, but lockless reading is allowed
200  * with an atomic check of the flag values.
201  */
202 static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
203 	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
204 	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
205 	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
206 	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
207 	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
208 	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
209 	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
210 	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
211 	{"allow.reserved_ports", "allow.noreserved_ports",
212 	 PR_ALLOW_RESERVED_PORTS},
213 	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
214 	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
215 	 PR_ALLOW_UNPRIV_DEBUG},
216 	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
217 #ifdef VIMAGE
218 	{"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
219 #endif
220 };
221 static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
222 const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
223 
224 #define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
225 					 PR_ALLOW_RESERVED_PORTS | \
226 					 PR_ALLOW_UNPRIV_DEBUG | \
227 					 PR_ALLOW_SUSER)
228 #define	JAIL_DEFAULT_ENFORCE_STATFS	2
229 #define	JAIL_DEFAULT_DEVFS_RSNUM	0
230 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
231 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
232 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
233 #if defined(INET) || defined(INET6)
234 static unsigned jail_max_af_ips = 255;
235 #endif
236 
237 /*
238  * Initialize the parts of prison0 that can't be static-initialized with
239  * constants.  This is called from proc0_init() after creating thread0 cpuset.
240  */
241 void
prison0_init(void)242 prison0_init(void)
243 {
244 	uint8_t *file, *data;
245 	size_t size;
246 	char buf[sizeof(prison0.pr_hostuuid)];
247 	bool valid;
248 
249 	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
250 	prison0.pr_osreldate = osreldate;
251 	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
252 
253 	/* If we have a preloaded hostuuid, use it. */
254 	file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
255 	if (file != NULL) {
256 		data = preload_fetch_addr(file);
257 		size = preload_fetch_size(file);
258 		if (data != NULL) {
259 			/*
260 			 * The preloaded data may include trailing whitespace, almost
261 			 * certainly a newline; skip over any whitespace or
262 			 * non-printable characters to be safe.
263 			 */
264 			while (size > 0 && data[size - 1] <= 0x20) {
265 				size--;
266 			}
267 
268 			valid = false;
269 
270 			/*
271 			 * Not NUL-terminated when passed from loader, but
272 			 * validate_uuid requires that due to using sscanf (as
273 			 * does the subsequent strlcpy, since it still reads
274 			 * past the given size to return the true length);
275 			 * bounce to a temporary buffer to fix.
276 			 */
277 			if (size >= sizeof(buf))
278 				goto done;
279 
280 			memcpy(buf, data, size);
281 			buf[size] = '\0';
282 
283 			if (validate_uuid(buf, size, NULL, 0) != 0)
284 				goto done;
285 
286 			valid = true;
287 			(void)strlcpy(prison0.pr_hostuuid, buf,
288 			    sizeof(prison0.pr_hostuuid));
289 
290 done:
291 			if (bootverbose && !valid) {
292 				printf("hostuuid: preload data malformed: '%.*s'\n",
293 				    (int)size, data);
294 			}
295 		}
296 	}
297 	if (bootverbose)
298 		printf("hostuuid: using %s\n", prison0.pr_hostuuid);
299 }
300 
301 /*
302  * struct jail_args {
303  *	struct jail *jail;
304  * };
305  */
306 int
sys_jail(struct thread * td,struct jail_args * uap)307 sys_jail(struct thread *td, struct jail_args *uap)
308 {
309 	uint32_t version;
310 	int error;
311 	struct jail j;
312 
313 	error = copyin(uap->jail, &version, sizeof(uint32_t));
314 	if (error)
315 		return (error);
316 
317 	switch (version) {
318 	case 0:
319 	{
320 		struct jail_v0 j0;
321 
322 		/* FreeBSD single IPv4 jails. */
323 		bzero(&j, sizeof(struct jail));
324 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
325 		if (error)
326 			return (error);
327 		j.version = j0.version;
328 		j.path = j0.path;
329 		j.hostname = j0.hostname;
330 		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
331 		break;
332 	}
333 
334 	case 1:
335 		/*
336 		 * Version 1 was used by multi-IPv4 jail implementations
337 		 * that never made it into the official kernel.
338 		 */
339 		return (EINVAL);
340 
341 	case 2:	/* JAIL_API_VERSION */
342 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
343 		error = copyin(uap->jail, &j, sizeof(struct jail));
344 		if (error)
345 			return (error);
346 		break;
347 
348 	default:
349 		/* Sci-Fi jails are not supported, sorry. */
350 		return (EINVAL);
351 	}
352 	return (kern_jail(td, &j));
353 }
354 
355 int
kern_jail(struct thread * td,struct jail * j)356 kern_jail(struct thread *td, struct jail *j)
357 {
358 	struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
359 #ifdef INET
360 			    + 1
361 #endif
362 #ifdef INET6
363 			    + 1
364 #endif
365 			    )];
366 	struct uio opt;
367 	char *u_path, *u_hostname, *u_name;
368 	struct bool_flags *bf;
369 #ifdef INET
370 	uint32_t ip4s;
371 	struct in_addr *u_ip4;
372 #endif
373 #ifdef INET6
374 	struct in6_addr *u_ip6;
375 #endif
376 	size_t tmplen;
377 	int error, enforce_statfs;
378 
379 	bzero(&optiov, sizeof(optiov));
380 	opt.uio_iov = optiov;
381 	opt.uio_iovcnt = 0;
382 	opt.uio_offset = -1;
383 	opt.uio_resid = -1;
384 	opt.uio_segflg = UIO_SYSSPACE;
385 	opt.uio_rw = UIO_READ;
386 	opt.uio_td = td;
387 
388 	/* Set permissions for top-level jails from sysctls. */
389 	if (!jailed(td->td_ucred)) {
390 		for (bf = pr_flag_allow;
391 		     bf < pr_flag_allow + nitems(pr_flag_allow) &&
392 			atomic_load_int(&bf->flag) != 0;
393 		     bf++) {
394 			optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
395 			    (jail_default_allow & bf->flag)
396 			    ? bf->name : bf->noname);
397 			optiov[opt.uio_iovcnt].iov_len =
398 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
399 			opt.uio_iovcnt += 2;
400 		}
401 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
402 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
403 		opt.uio_iovcnt++;
404 		enforce_statfs = jail_default_enforce_statfs;
405 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
406 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
407 		opt.uio_iovcnt++;
408 	}
409 
410 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
411 #ifdef INET
412 	ip4s = (j->version == 0) ? 1 : j->ip4s;
413 	if (ip4s > jail_max_af_ips)
414 		return (EINVAL);
415 	tmplen += ip4s * sizeof(struct in_addr);
416 #else
417 	if (j->ip4s > 0)
418 		return (EINVAL);
419 #endif
420 #ifdef INET6
421 	if (j->ip6s > jail_max_af_ips)
422 		return (EINVAL);
423 	tmplen += j->ip6s * sizeof(struct in6_addr);
424 #else
425 	if (j->ip6s > 0)
426 		return (EINVAL);
427 #endif
428 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
429 	u_hostname = u_path + MAXPATHLEN;
430 	u_name = u_hostname + MAXHOSTNAMELEN;
431 #ifdef INET
432 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
433 #endif
434 #ifdef INET6
435 #ifdef INET
436 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
437 #else
438 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
439 #endif
440 #endif
441 	optiov[opt.uio_iovcnt].iov_base = "path";
442 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
443 	opt.uio_iovcnt++;
444 	optiov[opt.uio_iovcnt].iov_base = u_path;
445 	error = copyinstr(j->path, u_path, MAXPATHLEN,
446 	    &optiov[opt.uio_iovcnt].iov_len);
447 	if (error) {
448 		free(u_path, M_TEMP);
449 		return (error);
450 	}
451 	opt.uio_iovcnt++;
452 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
453 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
454 	opt.uio_iovcnt++;
455 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
456 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
457 	    &optiov[opt.uio_iovcnt].iov_len);
458 	if (error) {
459 		free(u_path, M_TEMP);
460 		return (error);
461 	}
462 	opt.uio_iovcnt++;
463 	if (j->jailname != NULL) {
464 		optiov[opt.uio_iovcnt].iov_base = "name";
465 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
466 		opt.uio_iovcnt++;
467 		optiov[opt.uio_iovcnt].iov_base = u_name;
468 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
469 		    &optiov[opt.uio_iovcnt].iov_len);
470 		if (error) {
471 			free(u_path, M_TEMP);
472 			return (error);
473 		}
474 		opt.uio_iovcnt++;
475 	}
476 #ifdef INET
477 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
478 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
479 	opt.uio_iovcnt++;
480 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
481 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
482 	if (j->version == 0)
483 		u_ip4->s_addr = j->ip4s;
484 	else {
485 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
486 		if (error) {
487 			free(u_path, M_TEMP);
488 			return (error);
489 		}
490 	}
491 	opt.uio_iovcnt++;
492 #endif
493 #ifdef INET6
494 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
495 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
496 	opt.uio_iovcnt++;
497 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
498 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
499 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
500 	if (error) {
501 		free(u_path, M_TEMP);
502 		return (error);
503 	}
504 	opt.uio_iovcnt++;
505 #endif
506 	KASSERT(opt.uio_iovcnt <= nitems(optiov),
507 		("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
508 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
509 	free(u_path, M_TEMP);
510 	return (error);
511 }
512 
513 /*
514  * struct jail_set_args {
515  *	struct iovec *iovp;
516  *	unsigned int iovcnt;
517  *	int flags;
518  * };
519  */
520 int
sys_jail_set(struct thread * td,struct jail_set_args * uap)521 sys_jail_set(struct thread *td, struct jail_set_args *uap)
522 {
523 	struct uio *auio;
524 	int error;
525 
526 	/* Check that we have an even number of iovecs. */
527 	if (uap->iovcnt & 1)
528 		return (EINVAL);
529 
530 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
531 	if (error)
532 		return (error);
533 	error = kern_jail_set(td, auio, uap->flags);
534 	free(auio, M_IOV);
535 	return (error);
536 }
537 
538 int
kern_jail_set(struct thread * td,struct uio * optuio,int flags)539 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
540 {
541 	struct nameidata nd;
542 #ifdef INET
543 	struct in_addr *ip4;
544 #endif
545 #ifdef INET6
546 	struct in6_addr *ip6;
547 #endif
548 	struct vfsopt *opt;
549 	struct vfsoptlist *opts;
550 	struct prison *pr, *deadpr, *inspr, *mypr, *ppr, *tpr;
551 	struct vnode *root;
552 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
553 	char *g_path, *osrelstr;
554 	struct bool_flags *bf;
555 	struct jailsys_flags *jsf;
556 #if defined(INET) || defined(INET6)
557 	struct prison *tppr;
558 	void *op;
559 #endif
560 	unsigned long hid;
561 	size_t namelen, onamelen, pnamelen;
562 	int born, created, cuflags, descend, drflags, enforce;
563 	int error, errmsg_len, errmsg_pos;
564 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
565 	int jid, jsys, len, level;
566 	int childmax, osreldt, rsnum, slevel;
567 #if defined(INET) || defined(INET6)
568 	int ii, ij;
569 #endif
570 #ifdef INET
571 	int ip4s, redo_ip4;
572 #endif
573 #ifdef INET6
574 	int ip6s, redo_ip6;
575 #endif
576 	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
577 	uint64_t pr_allow_diff;
578 	unsigned tallow;
579 	char numbuf[12];
580 
581 	error = priv_check(td, PRIV_JAIL_SET);
582 	if (!error && (flags & JAIL_ATTACH))
583 		error = priv_check(td, PRIV_JAIL_ATTACH);
584 	if (error)
585 		return (error);
586 	mypr = td->td_ucred->cr_prison;
587 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
588 		return (EPERM);
589 	if (flags & ~JAIL_SET_MASK)
590 		return (EINVAL);
591 
592 	/*
593 	 * Check all the parameters before committing to anything.  Not all
594 	 * errors can be caught early, but we may as well try.  Also, this
595 	 * takes care of some expensive stuff (path lookup) before getting
596 	 * the allprison lock.
597 	 *
598 	 * XXX Jails are not filesystems, and jail parameters are not mount
599 	 *     options.  But it makes more sense to re-use the vfsopt code
600 	 *     than duplicate it under a different name.
601 	 */
602 	error = vfs_buildopts(optuio, &opts);
603 	if (error)
604 		return (error);
605 #ifdef INET
606 	ip4 = NULL;
607 #endif
608 #ifdef INET6
609 	ip6 = NULL;
610 #endif
611 	g_path = NULL;
612 
613 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
614 	if (!cuflags) {
615 		error = EINVAL;
616 		vfs_opterror(opts, "no valid operation (create or update)");
617 		goto done_errmsg;
618 	}
619 
620 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
621 	if (error == ENOENT)
622 		jid = 0;
623 	else if (error != 0)
624 		goto done_free;
625 
626 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
627 	if (error == ENOENT)
628 		gotslevel = 0;
629 	else if (error != 0)
630 		goto done_free;
631 	else
632 		gotslevel = 1;
633 
634 	error =
635 	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
636 	if (error == ENOENT)
637 		gotchildmax = 0;
638 	else if (error != 0)
639 		goto done_free;
640 	else
641 		gotchildmax = 1;
642 
643 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
644 	if (error == ENOENT)
645 		gotenforce = 0;
646 	else if (error != 0)
647 		goto done_free;
648 	else if (enforce < 0 || enforce > 2) {
649 		error = EINVAL;
650 		goto done_free;
651 	} else
652 		gotenforce = 1;
653 
654 	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
655 	if (error == ENOENT)
656 		gotrsnum = 0;
657 	else if (error != 0)
658 		goto done_free;
659 	else
660 		gotrsnum = 1;
661 
662 	pr_flags = ch_flags = 0;
663 	for (bf = pr_flag_bool;
664 	     bf < pr_flag_bool + nitems(pr_flag_bool);
665 	     bf++) {
666 		vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
667 		vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
668 	}
669 	ch_flags |= pr_flags;
670 	for (jsf = pr_flag_jailsys;
671 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
672 	     jsf++) {
673 		error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
674 		if (error == ENOENT)
675 			continue;
676 		if (error != 0)
677 			goto done_free;
678 		switch (jsys) {
679 		case JAIL_SYS_DISABLE:
680 			if (!jsf->disable) {
681 				error = EINVAL;
682 				goto done_free;
683 			}
684 			pr_flags |= jsf->disable;
685 			break;
686 		case JAIL_SYS_NEW:
687 			pr_flags |= jsf->new;
688 			break;
689 		case JAIL_SYS_INHERIT:
690 			break;
691 		default:
692 			error = EINVAL;
693 			goto done_free;
694 		}
695 		ch_flags |= jsf->new | jsf->disable;
696 	}
697 	if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
698 	    && !(pr_flags & PR_PERSIST)) {
699 		error = EINVAL;
700 		vfs_opterror(opts, "new jail must persist or attach");
701 		goto done_errmsg;
702 	}
703 #ifdef VIMAGE
704 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
705 		error = EINVAL;
706 		vfs_opterror(opts, "vnet cannot be changed after creation");
707 		goto done_errmsg;
708 	}
709 #endif
710 #ifdef INET
711 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
712 		error = EINVAL;
713 		vfs_opterror(opts, "ip4 cannot be changed after creation");
714 		goto done_errmsg;
715 	}
716 #endif
717 #ifdef INET6
718 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
719 		error = EINVAL;
720 		vfs_opterror(opts, "ip6 cannot be changed after creation");
721 		goto done_errmsg;
722 	}
723 #endif
724 
725 	pr_allow = ch_allow = 0;
726 	for (bf = pr_flag_allow;
727 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
728 		atomic_load_int(&bf->flag) != 0;
729 	     bf++) {
730 		vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
731 		vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
732 	}
733 	ch_allow |= pr_allow;
734 
735 	error = vfs_getopt(opts, "name", (void **)&name, &len);
736 	if (error == ENOENT)
737 		name = NULL;
738 	else if (error != 0)
739 		goto done_free;
740 	else {
741 		if (len == 0 || name[len - 1] != '\0') {
742 			error = EINVAL;
743 			goto done_free;
744 		}
745 		if (len > MAXHOSTNAMELEN) {
746 			error = ENAMETOOLONG;
747 			goto done_free;
748 		}
749 	}
750 
751 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
752 	if (error == ENOENT)
753 		host = NULL;
754 	else if (error != 0)
755 		goto done_free;
756 	else {
757 		ch_flags |= PR_HOST;
758 		pr_flags |= PR_HOST;
759 		if (len == 0 || host[len - 1] != '\0') {
760 			error = EINVAL;
761 			goto done_free;
762 		}
763 		if (len > MAXHOSTNAMELEN) {
764 			error = ENAMETOOLONG;
765 			goto done_free;
766 		}
767 	}
768 
769 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
770 	if (error == ENOENT)
771 		domain = NULL;
772 	else if (error != 0)
773 		goto done_free;
774 	else {
775 		ch_flags |= PR_HOST;
776 		pr_flags |= PR_HOST;
777 		if (len == 0 || domain[len - 1] != '\0') {
778 			error = EINVAL;
779 			goto done_free;
780 		}
781 		if (len > MAXHOSTNAMELEN) {
782 			error = ENAMETOOLONG;
783 			goto done_free;
784 		}
785 	}
786 
787 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
788 	if (error == ENOENT)
789 		uuid = NULL;
790 	else if (error != 0)
791 		goto done_free;
792 	else {
793 		ch_flags |= PR_HOST;
794 		pr_flags |= PR_HOST;
795 		if (len == 0 || uuid[len - 1] != '\0') {
796 			error = EINVAL;
797 			goto done_free;
798 		}
799 		if (len > HOSTUUIDLEN) {
800 			error = ENAMETOOLONG;
801 			goto done_free;
802 		}
803 	}
804 
805 #ifdef COMPAT_FREEBSD32
806 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
807 		uint32_t hid32;
808 
809 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
810 		hid = hid32;
811 	} else
812 #endif
813 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
814 	if (error == ENOENT)
815 		gothid = 0;
816 	else if (error != 0)
817 		goto done_free;
818 	else {
819 		gothid = 1;
820 		ch_flags |= PR_HOST;
821 		pr_flags |= PR_HOST;
822 	}
823 
824 #ifdef INET
825 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
826 	if (error == ENOENT)
827 		ip4s = 0;
828 	else if (error != 0)
829 		goto done_free;
830 	else if (ip4s & (sizeof(*ip4) - 1)) {
831 		error = EINVAL;
832 		goto done_free;
833 	} else {
834 		ch_flags |= PR_IP4_USER;
835 		pr_flags |= PR_IP4_USER;
836 		if (ip4s > 0) {
837 			ip4s /= sizeof(*ip4);
838 			if (ip4s > jail_max_af_ips) {
839 				error = EINVAL;
840 				vfs_opterror(opts, "too many IPv4 addresses");
841 				goto done_errmsg;
842 			}
843 			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
844 			bcopy(op, ip4, ip4s * sizeof(*ip4));
845 			/*
846 			 * IP addresses are all sorted but ip[0] to preserve
847 			 * the primary IP address as given from userland.
848 			 * This special IP is used for unbound outgoing
849 			 * connections as well for "loopback" traffic in case
850 			 * source address selection cannot find any more fitting
851 			 * address to connect from.
852 			 */
853 			if (ip4s > 1)
854 				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4),
855 				    prison_qcmp_v4);
856 			/*
857 			 * Check for duplicate addresses and do some simple
858 			 * zero and broadcast checks. If users give other bogus
859 			 * addresses it is their problem.
860 			 *
861 			 * We do not have to care about byte order for these
862 			 * checks so we will do them in NBO.
863 			 */
864 			for (ii = 0; ii < ip4s; ii++) {
865 				if (ip4[ii].s_addr == INADDR_ANY ||
866 				    ip4[ii].s_addr == INADDR_BROADCAST) {
867 					error = EINVAL;
868 					goto done_free;
869 				}
870 				if ((ii+1) < ip4s &&
871 				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
872 				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
873 					error = EINVAL;
874 					goto done_free;
875 				}
876 			}
877 		}
878 	}
879 #endif
880 
881 #ifdef INET6
882 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
883 	if (error == ENOENT)
884 		ip6s = 0;
885 	else if (error != 0)
886 		goto done_free;
887 	else if (ip6s & (sizeof(*ip6) - 1)) {
888 		error = EINVAL;
889 		goto done_free;
890 	} else {
891 		ch_flags |= PR_IP6_USER;
892 		pr_flags |= PR_IP6_USER;
893 		if (ip6s > 0) {
894 			ip6s /= sizeof(*ip6);
895 			if (ip6s > jail_max_af_ips) {
896 				error = EINVAL;
897 				vfs_opterror(opts, "too many IPv6 addresses");
898 				goto done_errmsg;
899 			}
900 			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
901 			bcopy(op, ip6, ip6s * sizeof(*ip6));
902 			if (ip6s > 1)
903 				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6),
904 				    prison_qcmp_v6);
905 			for (ii = 0; ii < ip6s; ii++) {
906 				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
907 					error = EINVAL;
908 					goto done_free;
909 				}
910 				if ((ii+1) < ip6s &&
911 				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
912 				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
913 				{
914 					error = EINVAL;
915 					goto done_free;
916 				}
917 			}
918 		}
919 	}
920 #endif
921 
922 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
923 	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
924 		error = EINVAL;
925 		vfs_opterror(opts,
926 		    "vnet jails cannot have IP address restrictions");
927 		goto done_errmsg;
928 	}
929 #endif
930 
931 	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
932 	if (error == ENOENT)
933 		osrelstr = NULL;
934 	else if (error != 0)
935 		goto done_free;
936 	else {
937 		if (flags & JAIL_UPDATE) {
938 			error = EINVAL;
939 			vfs_opterror(opts,
940 			    "osrelease cannot be changed after creation");
941 			goto done_errmsg;
942 		}
943 		if (len == 0 || osrelstr[len - 1] != '\0') {
944 			error = EINVAL;
945 			goto done_free;
946 		}
947 		if (len >= OSRELEASELEN) {
948 			error = ENAMETOOLONG;
949 			vfs_opterror(opts,
950 			    "osrelease string must be 1-%d bytes long",
951 			    OSRELEASELEN - 1);
952 			goto done_errmsg;
953 		}
954 	}
955 
956 	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
957 	if (error == ENOENT)
958 		osreldt = 0;
959 	else if (error != 0)
960 		goto done_free;
961 	else {
962 		if (flags & JAIL_UPDATE) {
963 			error = EINVAL;
964 			vfs_opterror(opts,
965 			    "osreldate cannot be changed after creation");
966 			goto done_errmsg;
967 		}
968 		if (osreldt == 0) {
969 			error = EINVAL;
970 			vfs_opterror(opts, "osreldate cannot be 0");
971 			goto done_errmsg;
972 		}
973 	}
974 
975 	root = NULL;
976 	error = vfs_getopt(opts, "path", (void **)&path, &len);
977 	if (error == ENOENT)
978 		path = NULL;
979 	else if (error != 0)
980 		goto done_free;
981 	else {
982 		if (flags & JAIL_UPDATE) {
983 			error = EINVAL;
984 			vfs_opterror(opts,
985 			    "path cannot be changed after creation");
986 			goto done_errmsg;
987 		}
988 		if (len == 0 || path[len - 1] != '\0') {
989 			error = EINVAL;
990 			goto done_free;
991 		}
992 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
993 		    path, td);
994 		error = namei(&nd);
995 		if (error)
996 			goto done_free;
997 		root = nd.ni_vp;
998 		NDFREE(&nd, NDF_ONLY_PNBUF);
999 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1000 		strlcpy(g_path, path, MAXPATHLEN);
1001 		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
1002 		if (error == 0) {
1003 			path = g_path;
1004 		} else {
1005 			/* exit on other errors */
1006 			goto done_free;
1007 		}
1008 		if (root->v_type != VDIR) {
1009 			error = ENOTDIR;
1010 			vput(root);
1011 			goto done_free;
1012 		}
1013 		VOP_UNLOCK(root);
1014 	}
1015 
1016 	/*
1017 	 * Find the specified jail, or at least its parent.
1018 	 * This abuses the file error codes ENOENT and EEXIST.
1019 	 */
1020 	pr = NULL;
1021 	inspr = NULL;
1022 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1023 		namelc = strrchr(name, '.');
1024 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1025 		if (*p != '\0')
1026 			jid = 0;
1027 	}
1028 	sx_xlock(&allprison_lock);
1029 	drflags = PD_LIST_XLOCKED;
1030 	ppr = mypr;
1031 	if (!prison_isalive(ppr)) {
1032 		/* This jail is dying.  This process will surely follow. */
1033 		error = EAGAIN;
1034 		goto done_deref;
1035 	}
1036 	if (jid != 0) {
1037 		if (jid < 0) {
1038 			error = EINVAL;
1039 			vfs_opterror(opts, "negative jid");
1040 			goto done_deref;
1041 		}
1042 		/*
1043 		 * See if a requested jid already exists.  Keep track of
1044 		 * where it can be inserted later.
1045 		 */
1046 		TAILQ_FOREACH(inspr, &allprison, pr_list) {
1047 			if (inspr->pr_id < jid)
1048 				continue;
1049 			if (inspr->pr_id > jid)
1050 				break;
1051 			pr = inspr;
1052 			mtx_lock(&pr->pr_mtx);
1053 			drflags |= PD_LOCKED;
1054 			inspr = NULL;
1055 			break;
1056 		}
1057 		if (pr != NULL) {
1058 			/* Create: jid must not exist. */
1059 			if (cuflags == JAIL_CREATE) {
1060 				/*
1061 				 * Even creators that cannot see the jail will
1062 				 * get EEXIST.
1063 				 */
1064 				error = EEXIST;
1065 				vfs_opterror(opts, "jail %d already exists",
1066 				    jid);
1067 				goto done_deref;
1068 			}
1069 			if (!prison_ischild(mypr, pr)) {
1070 				/*
1071 				 * Updaters get ENOENT if they cannot see the
1072 				 * jail.  This is true even for CREATE | UPDATE,
1073 				 * which normally cannot give this error.
1074 				 */
1075 				error = ENOENT;
1076 				vfs_opterror(opts, "jail %d not found", jid);
1077 				goto done_deref;
1078 			}
1079 			ppr = pr->pr_parent;
1080 			if (!prison_isalive(ppr)) {
1081 				error = ENOENT;
1082 				vfs_opterror(opts, "jail %d is dying",
1083 				    ppr->pr_id);
1084 				goto done_deref;
1085 			}
1086 			if (!prison_isalive(pr)) {
1087 				if (!(flags & JAIL_DYING)) {
1088 					error = ENOENT;
1089 					vfs_opterror(opts, "jail %d is dying",
1090 					    jid);
1091 					goto done_deref;
1092 				}
1093 				if ((flags & JAIL_ATTACH) ||
1094 				    (pr_flags & PR_PERSIST)) {
1095 					/*
1096 					 * A dying jail might be resurrected
1097 					 * (via attach or persist), but first
1098 					 * it must determine if another jail
1099 					 * has claimed its name.  Accomplish
1100 					 * this by implicitly re-setting the
1101 					 * name.
1102 					 */
1103 					if (name == NULL)
1104 						name = prison_name(mypr, pr);
1105 				}
1106 			}
1107 		} else {
1108 			/* Update: jid must exist. */
1109 			if (cuflags == JAIL_UPDATE) {
1110 				error = ENOENT;
1111 				vfs_opterror(opts, "jail %d not found", jid);
1112 				goto done_deref;
1113 			}
1114 		}
1115 	}
1116 	/*
1117 	 * If the caller provided a name, look for a jail by that name.
1118 	 * This has different semantics for creates and updates keyed by jid
1119 	 * (where the name must not already exist in a different jail),
1120 	 * and updates keyed by the name itself (where the name must exist
1121 	 * because that is the jail being updated).
1122 	 */
1123 	namelc = NULL;
1124 	if (name != NULL) {
1125 		namelc = strrchr(name, '.');
1126 		if (namelc == NULL)
1127 			namelc = name;
1128 		else {
1129 			/*
1130 			 * This is a hierarchical name.  Split it into the
1131 			 * parent and child names, and make sure the parent
1132 			 * exists or matches an already found jail.
1133 			 */
1134 			if (pr != NULL) {
1135 				if (strncmp(name, ppr->pr_name, namelc - name)
1136 				    || ppr->pr_name[namelc - name] != '\0') {
1137 					error = EINVAL;
1138 					vfs_opterror(opts,
1139 					    "cannot change jail's parent");
1140 					goto done_deref;
1141 				}
1142 			} else {
1143 				*namelc = '\0';
1144 				ppr = prison_find_name(mypr, name);
1145 				if (ppr == NULL) {
1146 					error = ENOENT;
1147 					vfs_opterror(opts,
1148 					    "jail \"%s\" not found", name);
1149 					goto done_deref;
1150 				}
1151 				mtx_unlock(&ppr->pr_mtx);
1152 				if (!prison_isalive(ppr)) {
1153 					error = ENOENT;
1154 					vfs_opterror(opts,
1155 					    "jail \"%s\" is dying", name);
1156 					goto done_deref;
1157 				}
1158 				*namelc = '.';
1159 			}
1160 			namelc++;
1161 		}
1162 		if (namelc[0] != '\0') {
1163 			pnamelen =
1164 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1165 			deadpr = NULL;
1166 			FOREACH_PRISON_CHILD(ppr, tpr) {
1167 				if (tpr != pr &&
1168 				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
1169 					if (prison_isalive(tpr)) {
1170 						if (pr == NULL &&
1171 						    cuflags != JAIL_CREATE) {
1172 							/*
1173 							 * Use this jail
1174 							 * for updates.
1175 							 */
1176 							pr = tpr;
1177 							mtx_lock(&pr->pr_mtx);
1178 							drflags |= PD_LOCKED;
1179 							break;
1180 						}
1181 						/*
1182 						 * Create, or update(jid):
1183 						 * name must not exist in an
1184 						 * active sibling jail.
1185 						 */
1186 						error = EEXIST;
1187 						vfs_opterror(opts,
1188 						   "jail \"%s\" already exists",
1189 						   name);
1190 						goto done_deref;
1191 					}
1192 					if (pr == NULL &&
1193 					    cuflags != JAIL_CREATE) {
1194 						deadpr = tpr;
1195 					}
1196 				}
1197 			}
1198 			/* If no active jail is found, use a dying one. */
1199 			if (deadpr != NULL && pr == NULL) {
1200 				if (flags & JAIL_DYING) {
1201 					pr = deadpr;
1202 					mtx_lock(&pr->pr_mtx);
1203 					drflags |= PD_LOCKED;
1204 				} else if (cuflags == JAIL_UPDATE) {
1205 					error = ENOENT;
1206 					vfs_opterror(opts,
1207 					    "jail \"%s\" is dying", name);
1208 					goto done_deref;
1209 				}
1210 			}
1211 			/* Update: name must exist if no jid. */
1212 			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1213 				error = ENOENT;
1214 				vfs_opterror(opts, "jail \"%s\" not found",
1215 				    name);
1216 				goto done_deref;
1217 			}
1218 		}
1219 	}
1220 	/* Update: must provide a jid or name. */
1221 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1222 		error = ENOENT;
1223 		vfs_opterror(opts, "update specified no jail");
1224 		goto done_deref;
1225 	}
1226 
1227 	/* If there's no prison to update, create a new one and link it in. */
1228 	created = pr == NULL;
1229 	if (created) {
1230 		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1231 			if (tpr->pr_childcount >= tpr->pr_childmax) {
1232 				error = EPERM;
1233 				vfs_opterror(opts, "prison limit exceeded");
1234 				goto done_deref;
1235 			}
1236 		if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
1237 			error = EAGAIN;
1238 			vfs_opterror(opts, "no available jail IDs");
1239 			goto done_deref;
1240 		}
1241 
1242 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1243 		pr->pr_state = PRISON_STATE_INVALID;
1244 		refcount_init(&pr->pr_ref, 1);
1245 		refcount_init(&pr->pr_uref, 0);
1246 		drflags |= PD_DEREF;
1247 		LIST_INIT(&pr->pr_children);
1248 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1249 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1250 
1251 		pr->pr_id = jid;
1252 		if (inspr != NULL)
1253 			TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
1254 		else
1255 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1256 
1257 		pr->pr_parent = ppr;
1258 		prison_hold(ppr);
1259 		prison_proc_hold(ppr);
1260 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1261 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1262 			tpr->pr_childcount++;
1263 
1264 		/* Set some default values, and inherit some from the parent. */
1265 		if (namelc == NULL)
1266 			namelc = "";
1267 		if (path == NULL) {
1268 			path = "/";
1269 			root = mypr->pr_root;
1270 			vref(root);
1271 		}
1272 		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1273 		pr->pr_flags |= PR_HOST;
1274 #if defined(INET) || defined(INET6)
1275 #ifdef VIMAGE
1276 		if (!(pr_flags & PR_VNET))
1277 #endif
1278 		{
1279 #ifdef INET
1280 			if (!(ch_flags & PR_IP4_USER))
1281 				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1282 			else if (!(pr_flags & PR_IP4_USER)) {
1283 				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1284 				if (ppr->pr_ip4 != NULL) {
1285 					pr->pr_ip4s = ppr->pr_ip4s;
1286 					pr->pr_ip4 = malloc(pr->pr_ip4s *
1287 					    sizeof(struct in_addr), M_PRISON,
1288 					    M_WAITOK);
1289 					bcopy(ppr->pr_ip4, pr->pr_ip4,
1290 					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1291 				}
1292 			}
1293 #endif
1294 #ifdef INET6
1295 			if (!(ch_flags & PR_IP6_USER))
1296 				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1297 			else if (!(pr_flags & PR_IP6_USER)) {
1298 				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1299 				if (ppr->pr_ip6 != NULL) {
1300 					pr->pr_ip6s = ppr->pr_ip6s;
1301 					pr->pr_ip6 = malloc(pr->pr_ip6s *
1302 					    sizeof(struct in6_addr), M_PRISON,
1303 					    M_WAITOK);
1304 					bcopy(ppr->pr_ip6, pr->pr_ip6,
1305 					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1306 				}
1307 			}
1308 #endif
1309 		}
1310 #endif
1311 		/* Source address selection is always on by default. */
1312 		pr->pr_flags |= _PR_IP_SADDRSEL;
1313 
1314 		pr->pr_securelevel = ppr->pr_securelevel;
1315 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1316 		pr->pr_enforce_statfs = jail_default_enforce_statfs;
1317 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1318 
1319 		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1320 		if (osrelstr == NULL)
1321 			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1322 			    sizeof(pr->pr_osrelease));
1323 		else
1324 			strlcpy(pr->pr_osrelease, osrelstr,
1325 			    sizeof(pr->pr_osrelease));
1326 
1327 #ifdef VIMAGE
1328 		/* Allocate a new vnet if specified. */
1329 		pr->pr_vnet = (pr_flags & PR_VNET)
1330 		    ? vnet_alloc() : ppr->pr_vnet;
1331 #endif
1332 		/*
1333 		 * Allocate a dedicated cpuset for each jail.
1334 		 * Unlike other initial settings, this may return an error.
1335 		 */
1336 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1337 		if (error)
1338 			goto done_deref;
1339 
1340 		mtx_lock(&pr->pr_mtx);
1341 		drflags |= PD_LOCKED;
1342 	} else {
1343 		/*
1344 		 * Grab a reference for existing prisons, to ensure they
1345 		 * continue to exist for the duration of the call.
1346 		 */
1347 		prison_hold(pr);
1348 		drflags |= PD_DEREF;
1349 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1350 		if ((pr->pr_flags & PR_VNET) &&
1351 		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1352 			error = EINVAL;
1353 			vfs_opterror(opts,
1354 			    "vnet jails cannot have IP address restrictions");
1355 			goto done_deref;
1356 		}
1357 #endif
1358 #ifdef INET
1359 		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1360 			error = EINVAL;
1361 			vfs_opterror(opts,
1362 			    "ip4 cannot be changed after creation");
1363 			goto done_deref;
1364 		}
1365 #endif
1366 #ifdef INET6
1367 		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1368 			error = EINVAL;
1369 			vfs_opterror(opts,
1370 			    "ip6 cannot be changed after creation");
1371 			goto done_deref;
1372 		}
1373 #endif
1374 	}
1375 
1376 	/* Do final error checking before setting anything. */
1377 	if (gotslevel) {
1378 		if (slevel < ppr->pr_securelevel) {
1379 			error = EPERM;
1380 			goto done_deref;
1381 		}
1382 	}
1383 	if (gotchildmax) {
1384 		if (childmax >= ppr->pr_childmax) {
1385 			error = EPERM;
1386 			goto done_deref;
1387 		}
1388 	}
1389 	if (gotenforce) {
1390 		if (enforce < ppr->pr_enforce_statfs) {
1391 			error = EPERM;
1392 			goto done_deref;
1393 		}
1394 	}
1395 	if (gotrsnum) {
1396 		/*
1397 		 * devfs_rsnum is a uint16_t
1398 		 */
1399 		if (rsnum < 0 || rsnum > 65535) {
1400 			error = EINVAL;
1401 			goto done_deref;
1402 		}
1403 		/*
1404 		 * Nested jails always inherit parent's devfs ruleset
1405 		 */
1406 		if (jailed(td->td_ucred)) {
1407 			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1408 				error = EPERM;
1409 				goto done_deref;
1410 			} else
1411 				rsnum = ppr->pr_devfs_rsnum;
1412 		}
1413 	}
1414 #ifdef INET
1415 	if (ip4s > 0) {
1416 		if (ppr->pr_flags & PR_IP4) {
1417 			/*
1418 			 * Make sure the new set of IP addresses is a
1419 			 * subset of the parent's list.  Don't worry
1420 			 * about the parent being unlocked, as any
1421 			 * setting is done with allprison_lock held.
1422 			 */
1423 			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1424 				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1425 					break;
1426 			if (ij == ppr->pr_ip4s) {
1427 				error = EPERM;
1428 				goto done_deref;
1429 			}
1430 			if (ip4s > 1) {
1431 				for (ii = ij = 1; ii < ip4s; ii++) {
1432 					if (ip4[ii].s_addr ==
1433 					    ppr->pr_ip4[0].s_addr)
1434 						continue;
1435 					for (; ij < ppr->pr_ip4s; ij++)
1436 						if (ip4[ii].s_addr ==
1437 						    ppr->pr_ip4[ij].s_addr)
1438 							break;
1439 					if (ij == ppr->pr_ip4s)
1440 						break;
1441 				}
1442 				if (ij == ppr->pr_ip4s) {
1443 					error = EPERM;
1444 					goto done_deref;
1445 				}
1446 			}
1447 		}
1448 		/*
1449 		 * Check for conflicting IP addresses.  We permit them
1450 		 * if there is no more than one IP on each jail.  If
1451 		 * there is a duplicate on a jail with more than one
1452 		 * IP stop checking and return error.
1453 		 */
1454 #ifdef VIMAGE
1455 		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1456 			if (tppr->pr_flags & PR_VNET)
1457 				break;
1458 #else
1459 		tppr = &prison0;
1460 #endif
1461 		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1462 			if (tpr == pr ||
1463 #ifdef VIMAGE
1464 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1465 #endif
1466 			    !prison_isalive(tpr)) {
1467 				descend = 0;
1468 				continue;
1469 			}
1470 			if (!(tpr->pr_flags & PR_IP4_USER))
1471 				continue;
1472 			descend = 0;
1473 			if (tpr->pr_ip4 == NULL ||
1474 			    (ip4s == 1 && tpr->pr_ip4s == 1))
1475 				continue;
1476 			for (ii = 0; ii < ip4s; ii++) {
1477 				if (prison_check_ip4_locked(tpr, &ip4[ii]) ==
1478 				    0) {
1479 					error = EADDRINUSE;
1480 					vfs_opterror(opts,
1481 					    "IPv4 addresses clash");
1482 					goto done_deref;
1483 				}
1484 			}
1485 		}
1486 	}
1487 #endif
1488 #ifdef INET6
1489 	if (ip6s > 0) {
1490 		if (ppr->pr_flags & PR_IP6) {
1491 			/*
1492 			 * Make sure the new set of IP addresses is a
1493 			 * subset of the parent's list.
1494 			 */
1495 			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1496 				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1497 				    &ppr->pr_ip6[ij]))
1498 					break;
1499 			if (ij == ppr->pr_ip6s) {
1500 				error = EPERM;
1501 				goto done_deref;
1502 			}
1503 			if (ip6s > 1) {
1504 				for (ii = ij = 1; ii < ip6s; ii++) {
1505 					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1506 					     &ppr->pr_ip6[0]))
1507 						continue;
1508 					for (; ij < ppr->pr_ip6s; ij++)
1509 						if (IN6_ARE_ADDR_EQUAL(
1510 						    &ip6[ii], &ppr->pr_ip6[ij]))
1511 							break;
1512 					if (ij == ppr->pr_ip6s)
1513 						break;
1514 				}
1515 				if (ij == ppr->pr_ip6s) {
1516 					error = EPERM;
1517 					goto done_deref;
1518 				}
1519 			}
1520 		}
1521 		/* Check for conflicting IP addresses. */
1522 #ifdef VIMAGE
1523 		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1524 			if (tppr->pr_flags & PR_VNET)
1525 				break;
1526 #else
1527 		tppr = &prison0;
1528 #endif
1529 		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1530 			if (tpr == pr ||
1531 #ifdef VIMAGE
1532 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1533 #endif
1534 			    !prison_isalive(tpr)) {
1535 				descend = 0;
1536 				continue;
1537 			}
1538 			if (!(tpr->pr_flags & PR_IP6_USER))
1539 				continue;
1540 			descend = 0;
1541 			if (tpr->pr_ip6 == NULL ||
1542 			    (ip6s == 1 && tpr->pr_ip6s == 1))
1543 				continue;
1544 			for (ii = 0; ii < ip6s; ii++) {
1545 				if (prison_check_ip6_locked(tpr, &ip6[ii]) ==
1546 				    0) {
1547 					error = EADDRINUSE;
1548 					vfs_opterror(opts,
1549 					    "IPv6 addresses clash");
1550 					goto done_deref;
1551 				}
1552 			}
1553 		}
1554 	}
1555 #endif
1556 	onamelen = namelen = 0;
1557 	if (namelc != NULL) {
1558 		/* Give a default name of the jid.  Also allow the name to be
1559 		 * explicitly the jid - but not any other number, and only in
1560 		 * normal form (no leading zero/etc).
1561 		 */
1562 		if (namelc[0] == '\0')
1563 			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1564 		else if ((strtoul(namelc, &p, 10) != jid ||
1565 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1566 			error = EINVAL;
1567 			vfs_opterror(opts,
1568 			    "name cannot be numeric (unless it is the jid)");
1569 			goto done_deref;
1570 		}
1571 		/*
1572 		 * Make sure the name isn't too long for the prison or its
1573 		 * children.
1574 		 */
1575 		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1576 		onamelen = strlen(pr->pr_name + pnamelen);
1577 		namelen = strlen(namelc);
1578 		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1579 			error = ENAMETOOLONG;
1580 			goto done_deref;
1581 		}
1582 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1583 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1584 			    sizeof(pr->pr_name)) {
1585 				error = ENAMETOOLONG;
1586 				goto done_deref;
1587 			}
1588 		}
1589 	}
1590 	pr_allow_diff = pr_allow & ~ppr->pr_allow;
1591 	if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
1592 		error = EPERM;
1593 		goto done_deref;
1594 	}
1595 
1596 	/*
1597 	 * Let modules check their parameters.  This requires unlocking and
1598 	 * then re-locking the prison, but this is still a valid state as long
1599 	 * as allprison_lock remains xlocked.
1600 	 */
1601 	mtx_unlock(&pr->pr_mtx);
1602 	drflags &= ~PD_LOCKED;
1603 	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1604 	if (error != 0)
1605 		goto done_deref;
1606 	mtx_lock(&pr->pr_mtx);
1607 	drflags |= PD_LOCKED;
1608 
1609 	/* At this point, all valid parameters should have been noted. */
1610 	TAILQ_FOREACH(opt, opts, link) {
1611 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1612 			error = EINVAL;
1613 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1614 			goto done_deref;
1615 		}
1616 	}
1617 
1618 	/* Set the parameters of the prison. */
1619 #ifdef INET
1620 	redo_ip4 = 0;
1621 	if (pr_flags & PR_IP4_USER) {
1622 		pr->pr_flags |= PR_IP4;
1623 		free(pr->pr_ip4, M_PRISON);
1624 		pr->pr_ip4s = ip4s;
1625 		pr->pr_ip4 = ip4;
1626 		ip4 = NULL;
1627 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1628 #ifdef VIMAGE
1629 			if (tpr->pr_flags & PR_VNET) {
1630 				descend = 0;
1631 				continue;
1632 			}
1633 #endif
1634 			if (prison_restrict_ip4(tpr, NULL)) {
1635 				redo_ip4 = 1;
1636 				descend = 0;
1637 			}
1638 		}
1639 	}
1640 #endif
1641 #ifdef INET6
1642 	redo_ip6 = 0;
1643 	if (pr_flags & PR_IP6_USER) {
1644 		pr->pr_flags |= PR_IP6;
1645 		free(pr->pr_ip6, M_PRISON);
1646 		pr->pr_ip6s = ip6s;
1647 		pr->pr_ip6 = ip6;
1648 		ip6 = NULL;
1649 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1650 #ifdef VIMAGE
1651 			if (tpr->pr_flags & PR_VNET) {
1652 				descend = 0;
1653 				continue;
1654 			}
1655 #endif
1656 			if (prison_restrict_ip6(tpr, NULL)) {
1657 				redo_ip6 = 1;
1658 				descend = 0;
1659 			}
1660 		}
1661 	}
1662 #endif
1663 	if (gotslevel) {
1664 		pr->pr_securelevel = slevel;
1665 		/* Set all child jails to be at least this level. */
1666 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1667 			if (tpr->pr_securelevel < slevel)
1668 				tpr->pr_securelevel = slevel;
1669 	}
1670 	if (gotchildmax) {
1671 		pr->pr_childmax = childmax;
1672 		/* Set all child jails to under this limit. */
1673 		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1674 			if (tpr->pr_childmax > childmax - level)
1675 				tpr->pr_childmax = childmax > level
1676 				    ? childmax - level : 0;
1677 	}
1678 	if (gotenforce) {
1679 		pr->pr_enforce_statfs = enforce;
1680 		/* Pass this restriction on to the children. */
1681 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1682 			if (tpr->pr_enforce_statfs < enforce)
1683 				tpr->pr_enforce_statfs = enforce;
1684 	}
1685 	if (gotrsnum) {
1686 		pr->pr_devfs_rsnum = rsnum;
1687 		/* Pass this restriction on to the children. */
1688 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1689 			tpr->pr_devfs_rsnum = rsnum;
1690 	}
1691 	if (namelc != NULL) {
1692 		if (ppr == &prison0)
1693 			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1694 		else
1695 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1696 			    ppr->pr_name, namelc);
1697 		/* Change this component of child names. */
1698 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1699 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1700 			    strlen(tpr->pr_name + onamelen) + 1);
1701 			bcopy(pr->pr_name, tpr->pr_name, namelen);
1702 		}
1703 	}
1704 	if (path != NULL) {
1705 		/* Try to keep a real-rooted full pathname. */
1706 		strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1707 		pr->pr_root = root;
1708 		root = NULL;
1709 	}
1710 	if (PR_HOST & ch_flags & ~pr_flags) {
1711 		if (pr->pr_flags & PR_HOST) {
1712 			/*
1713 			 * Copy the parent's host info.  As with pr_ip4 above,
1714 			 * the lack of a lock on the parent is not a problem;
1715 			 * it is always set with allprison_lock at least
1716 			 * shared, and is held exclusively here.
1717 			 */
1718 			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1719 			    sizeof(pr->pr_hostname));
1720 			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1721 			    sizeof(pr->pr_domainname));
1722 			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1723 			    sizeof(pr->pr_hostuuid));
1724 			pr->pr_hostid = pr->pr_parent->pr_hostid;
1725 		}
1726 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1727 		/* Set this prison, and any descendants without PR_HOST. */
1728 		if (host != NULL)
1729 			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1730 		if (domain != NULL)
1731 			strlcpy(pr->pr_domainname, domain,
1732 			    sizeof(pr->pr_domainname));
1733 		if (uuid != NULL)
1734 			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1735 		if (gothid)
1736 			pr->pr_hostid = hid;
1737 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1738 			if (tpr->pr_flags & PR_HOST)
1739 				descend = 0;
1740 			else {
1741 				if (host != NULL)
1742 					strlcpy(tpr->pr_hostname,
1743 					    pr->pr_hostname,
1744 					    sizeof(tpr->pr_hostname));
1745 				if (domain != NULL)
1746 					strlcpy(tpr->pr_domainname,
1747 					    pr->pr_domainname,
1748 					    sizeof(tpr->pr_domainname));
1749 				if (uuid != NULL)
1750 					strlcpy(tpr->pr_hostuuid,
1751 					    pr->pr_hostuuid,
1752 					    sizeof(tpr->pr_hostuuid));
1753 				if (gothid)
1754 					tpr->pr_hostid = hid;
1755 			}
1756 		}
1757 	}
1758 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1759 	if ((tallow = ch_allow & ~pr_allow))
1760 		prison_set_allow_locked(pr, tallow, 0);
1761 	/*
1762 	 * Persistent prisons get an extra reference, and prisons losing their
1763 	 * persist flag lose that reference.
1764 	 */
1765 	born = !prison_isalive(pr);
1766 	if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
1767 		if (pr_flags & PR_PERSIST) {
1768 			prison_hold(pr);
1769 			/*
1770 			 * This may make a dead prison alive again, but wait
1771 			 * to label it as such until after OSD calls have had
1772 			 * a chance to run (and perhaps to fail).
1773 			 */
1774 			refcount_acquire(&pr->pr_uref);
1775 		} else {
1776 			drflags |= PD_DEUREF;
1777 			prison_free_not_last(pr);
1778 		}
1779 	}
1780 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1781 	mtx_unlock(&pr->pr_mtx);
1782 	drflags &= ~PD_LOCKED;
1783 	/*
1784 	 * Any errors past this point will need to de-persist newly created
1785 	 * prisons, as well as call remove methods.
1786 	 */
1787 	if (born)
1788 		drflags |= PD_KILL;
1789 
1790 #ifdef RACCT
1791 	if (racct_enable && created)
1792 		prison_racct_attach(pr);
1793 #endif
1794 
1795 	/* Locks may have prevented a complete restriction of child IP
1796 	 * addresses.  If so, allocate some more memory and try again.
1797 	 */
1798 #ifdef INET
1799 	while (redo_ip4) {
1800 		ip4s = pr->pr_ip4s;
1801 		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1802 		mtx_lock(&pr->pr_mtx);
1803 		redo_ip4 = 0;
1804 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1805 #ifdef VIMAGE
1806 			if (tpr->pr_flags & PR_VNET) {
1807 				descend = 0;
1808 				continue;
1809 			}
1810 #endif
1811 			if (prison_restrict_ip4(tpr, ip4)) {
1812 				if (ip4 != NULL)
1813 					ip4 = NULL;
1814 				else
1815 					redo_ip4 = 1;
1816 			}
1817 		}
1818 		mtx_unlock(&pr->pr_mtx);
1819 	}
1820 #endif
1821 #ifdef INET6
1822 	while (redo_ip6) {
1823 		ip6s = pr->pr_ip6s;
1824 		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1825 		mtx_lock(&pr->pr_mtx);
1826 		redo_ip6 = 0;
1827 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1828 #ifdef VIMAGE
1829 			if (tpr->pr_flags & PR_VNET) {
1830 				descend = 0;
1831 				continue;
1832 			}
1833 #endif
1834 			if (prison_restrict_ip6(tpr, ip6)) {
1835 				if (ip6 != NULL)
1836 					ip6 = NULL;
1837 				else
1838 					redo_ip6 = 1;
1839 			}
1840 		}
1841 		mtx_unlock(&pr->pr_mtx);
1842 	}
1843 #endif
1844 
1845 	/* Let the modules do their work. */
1846 	if (born) {
1847 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1848 		if (error)
1849 			goto done_deref;
1850 	}
1851 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1852 	if (error)
1853 		goto done_deref;
1854 
1855 	/*
1856 	 * A new prison is now ready to be seen; either it has gained a user
1857 	 * reference via persistence, or is about to gain one via attachment.
1858 	 */
1859 	if (born) {
1860 		drflags = prison_lock_xlock(pr, drflags);
1861 		pr->pr_state = PRISON_STATE_ALIVE;
1862 	}
1863 
1864 	/* Attach this process to the prison if requested. */
1865 	if (flags & JAIL_ATTACH) {
1866 		error = do_jail_attach(td, pr,
1867 		    prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
1868 		drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
1869 		if (error) {
1870 			vfs_opterror(opts, "attach failed");
1871 			goto done_deref;
1872 		}
1873 	}
1874 
1875 #ifdef RACCT
1876 	if (racct_enable && !created) {
1877 		if (drflags & PD_LOCKED) {
1878 			mtx_unlock(&pr->pr_mtx);
1879 			drflags &= ~PD_LOCKED;
1880 		}
1881 		if (drflags & PD_LIST_XLOCKED) {
1882 			sx_xunlock(&allprison_lock);
1883 			drflags &= ~PD_LIST_XLOCKED;
1884 		}
1885 		prison_racct_modify(pr);
1886 	}
1887 #endif
1888 
1889 	if (born && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 &&
1890 	    (pr->pr_root->v_vflag & VV_ROOT) == 0)
1891 		printf("Warning jail jid=%d: mountd/nfsd requires a separate"
1892 		   " file system\n", pr->pr_id);
1893 
1894 	drflags &= ~PD_KILL;
1895 	td->td_retval[0] = pr->pr_id;
1896 
1897  done_deref:
1898 	/* Release any temporary prison holds and/or locks. */
1899 	if (pr != NULL)
1900 		prison_deref(pr, drflags);
1901 	else if (drflags & PD_LIST_SLOCKED)
1902 		sx_sunlock(&allprison_lock);
1903 	else if (drflags & PD_LIST_XLOCKED)
1904 		sx_xunlock(&allprison_lock);
1905 	if (root != NULL)
1906 		vrele(root);
1907  done_errmsg:
1908 	if (error) {
1909 		/* Write the error message back to userspace. */
1910 		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
1911 		    &errmsg_len) == 0 && errmsg_len > 0) {
1912 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1913 			if (optuio->uio_segflg == UIO_SYSSPACE)
1914 				bcopy(errmsg,
1915 				    optuio->uio_iov[errmsg_pos].iov_base,
1916 				    errmsg_len);
1917 			else
1918 				(void)copyout(errmsg,
1919 				    optuio->uio_iov[errmsg_pos].iov_base,
1920 				    errmsg_len);
1921 		}
1922 	}
1923  done_free:
1924 #ifdef INET
1925 	free(ip4, M_PRISON);
1926 #endif
1927 #ifdef INET6
1928 	free(ip6, M_PRISON);
1929 #endif
1930 	if (g_path != NULL)
1931 		free(g_path, M_TEMP);
1932 	vfs_freeopts(opts);
1933 	return (error);
1934 }
1935 
1936 /*
1937  * Find the next available prison ID.  Return the ID on success, or zero
1938  * on failure.  Also set a pointer to the allprison list entry the prison
1939  * should be inserted before.
1940  */
1941 static int
get_next_prid(struct prison ** insprp)1942 get_next_prid(struct prison **insprp)
1943 {
1944 	struct prison *inspr;
1945 	int jid, maxid;
1946 
1947 	jid = lastprid % JAIL_MAX + 1;
1948 	if (TAILQ_EMPTY(&allprison) ||
1949 	    TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
1950 		/*
1951 		 * A common case is for all jails to be implicitly numbered,
1952 		 * which means they'll go on the end of the list, at least
1953 		 * for the first JAIL_MAX times.
1954 		 */
1955 		inspr = NULL;
1956 	} else {
1957 		/*
1958 		 * Take two passes through the allprison list: first starting
1959 		 * with the proposed jid, then ending with it.
1960 		 */
1961 		for (maxid = JAIL_MAX; maxid != 0; ) {
1962 			TAILQ_FOREACH(inspr, &allprison, pr_list) {
1963 				if (inspr->pr_id < jid)
1964 					continue;
1965 				if (inspr->pr_id > jid) {
1966 					/* Found an opening. */
1967 					maxid = 0;
1968 					break;
1969 				}
1970 				if (++jid > maxid) {
1971 					if (lastprid == maxid || lastprid == 0)
1972 					{
1973 						/*
1974 						 * The entire legal range
1975 						 * has been traversed
1976 						 */
1977 						return 0;
1978 					}
1979 					/* Try again from the start. */
1980 					jid = 1;
1981 					maxid = lastprid;
1982 					break;
1983 				}
1984 			}
1985 			if (inspr == NULL) {
1986 				/* Found room at the end of the list. */
1987 				break;
1988 			}
1989 		}
1990 	}
1991 	*insprp = inspr;
1992 	lastprid = jid;
1993 	return (jid);
1994 }
1995 
1996 /*
1997  * struct jail_get_args {
1998  *	struct iovec *iovp;
1999  *	unsigned int iovcnt;
2000  *	int flags;
2001  * };
2002  */
2003 int
sys_jail_get(struct thread * td,struct jail_get_args * uap)2004 sys_jail_get(struct thread *td, struct jail_get_args *uap)
2005 {
2006 	struct uio *auio;
2007 	int error;
2008 
2009 	/* Check that we have an even number of iovecs. */
2010 	if (uap->iovcnt & 1)
2011 		return (EINVAL);
2012 
2013 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
2014 	if (error)
2015 		return (error);
2016 	error = kern_jail_get(td, auio, uap->flags);
2017 	if (error == 0)
2018 		error = copyout(auio->uio_iov, uap->iovp,
2019 		    uap->iovcnt * sizeof (struct iovec));
2020 	free(auio, M_IOV);
2021 	return (error);
2022 }
2023 
2024 int
kern_jail_get(struct thread * td,struct uio * optuio,int flags)2025 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2026 {
2027 	struct bool_flags *bf;
2028 	struct jailsys_flags *jsf;
2029 	struct prison *pr, *mypr;
2030 	struct vfsopt *opt;
2031 	struct vfsoptlist *opts;
2032 	char *errmsg, *name;
2033 	int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
2034 	unsigned f;
2035 
2036 	if (flags & ~JAIL_GET_MASK)
2037 		return (EINVAL);
2038 
2039 	/* Get the parameter list. */
2040 	error = vfs_buildopts(optuio, &opts);
2041 	if (error)
2042 		return (error);
2043 	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2044 	mypr = td->td_ucred->cr_prison;
2045 	pr = NULL;
2046 
2047 	/*
2048 	 * Find the prison specified by one of: lastjid, jid, name.
2049 	 */
2050 	sx_slock(&allprison_lock);
2051 	drflags = PD_LIST_SLOCKED;
2052 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2053 	if (error == 0) {
2054 		TAILQ_FOREACH(pr, &allprison, pr_list) {
2055 			if (pr->pr_id > jid &&
2056 			    ((flags & JAIL_DYING) || prison_isalive(pr)) &&
2057 			    prison_ischild(mypr, pr)) {
2058 				mtx_lock(&pr->pr_mtx);
2059 				drflags |= PD_LOCKED;
2060 				goto found_prison;
2061 			}
2062 		}
2063 		error = ENOENT;
2064 		vfs_opterror(opts, "no jail after %d", jid);
2065 		goto done;
2066 	} else if (error != ENOENT)
2067 		goto done;
2068 
2069 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2070 	if (error == 0) {
2071 		if (jid != 0) {
2072 			pr = prison_find_child(mypr, jid);
2073 			if (pr != NULL) {
2074 				drflags |= PD_LOCKED;
2075 				if (!(prison_isalive(pr) ||
2076 				    (flags & JAIL_DYING))) {
2077 					error = ENOENT;
2078 					vfs_opterror(opts, "jail %d is dying",
2079 					    jid);
2080 					goto done;
2081 				}
2082 				goto found_prison;
2083 			}
2084 			error = ENOENT;
2085 			vfs_opterror(opts, "jail %d not found", jid);
2086 			goto done;
2087 		}
2088 	} else if (error != ENOENT)
2089 		goto done;
2090 
2091 	error = vfs_getopt(opts, "name", (void **)&name, &len);
2092 	if (error == 0) {
2093 		if (len == 0 || name[len - 1] != '\0') {
2094 			error = EINVAL;
2095 			goto done;
2096 		}
2097 		pr = prison_find_name(mypr, name);
2098 		if (pr != NULL) {
2099 			drflags |= PD_LOCKED;
2100 			if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
2101 				error = ENOENT;
2102 				vfs_opterror(opts, "jail \"%s\" is dying",
2103 				    name);
2104 				goto done;
2105 			}
2106 			goto found_prison;
2107 		}
2108 		error = ENOENT;
2109 		vfs_opterror(opts, "jail \"%s\" not found", name);
2110 		goto done;
2111 	} else if (error != ENOENT)
2112 		goto done;
2113 
2114 	vfs_opterror(opts, "no jail specified");
2115 	error = ENOENT;
2116 	goto done;
2117 
2118  found_prison:
2119 	/* Get the parameters of the prison. */
2120 	prison_hold(pr);
2121 	drflags |= PD_DEREF;
2122 	td->td_retval[0] = pr->pr_id;
2123 	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2124 	if (error != 0 && error != ENOENT)
2125 		goto done;
2126 	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2127 	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2128 	if (error != 0 && error != ENOENT)
2129 		goto done;
2130 	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2131 	if (error != 0 && error != ENOENT)
2132 		goto done;
2133 	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2134 	    sizeof(pr->pr_cpuset->cs_id));
2135 	if (error != 0 && error != ENOENT)
2136 		goto done;
2137 	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2138 	if (error != 0 && error != ENOENT)
2139 		goto done;
2140 #ifdef INET
2141 	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
2142 	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
2143 	if (error != 0 && error != ENOENT)
2144 		goto done;
2145 #endif
2146 #ifdef INET6
2147 	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
2148 	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
2149 	if (error != 0 && error != ENOENT)
2150 		goto done;
2151 #endif
2152 	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2153 	    sizeof(pr->pr_securelevel));
2154 	if (error != 0 && error != ENOENT)
2155 		goto done;
2156 	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2157 	    sizeof(pr->pr_childcount));
2158 	if (error != 0 && error != ENOENT)
2159 		goto done;
2160 	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2161 	    sizeof(pr->pr_childmax));
2162 	if (error != 0 && error != ENOENT)
2163 		goto done;
2164 	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2165 	if (error != 0 && error != ENOENT)
2166 		goto done;
2167 	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2168 	if (error != 0 && error != ENOENT)
2169 		goto done;
2170 	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2171 	if (error != 0 && error != ENOENT)
2172 		goto done;
2173 #ifdef COMPAT_FREEBSD32
2174 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2175 		uint32_t hid32 = pr->pr_hostid;
2176 
2177 		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2178 	} else
2179 #endif
2180 	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2181 	    sizeof(pr->pr_hostid));
2182 	if (error != 0 && error != ENOENT)
2183 		goto done;
2184 	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2185 	    sizeof(pr->pr_enforce_statfs));
2186 	if (error != 0 && error != ENOENT)
2187 		goto done;
2188 	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2189 	    sizeof(pr->pr_devfs_rsnum));
2190 	if (error != 0 && error != ENOENT)
2191 		goto done;
2192 	for (bf = pr_flag_bool;
2193 	     bf < pr_flag_bool + nitems(pr_flag_bool);
2194 	     bf++) {
2195 		i = (pr->pr_flags & bf->flag) ? 1 : 0;
2196 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2197 		if (error != 0 && error != ENOENT)
2198 			goto done;
2199 		i = !i;
2200 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2201 		if (error != 0 && error != ENOENT)
2202 			goto done;
2203 	}
2204 	for (jsf = pr_flag_jailsys;
2205 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
2206 	     jsf++) {
2207 		f = pr->pr_flags & (jsf->disable | jsf->new);
2208 		i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
2209 		    : (f == jsf->new) ? JAIL_SYS_NEW
2210 		    : JAIL_SYS_INHERIT;
2211 		error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
2212 		if (error != 0 && error != ENOENT)
2213 			goto done;
2214 	}
2215 	for (bf = pr_flag_allow;
2216 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
2217 		atomic_load_int(&bf->flag) != 0;
2218 	     bf++) {
2219 		i = (pr->pr_allow & bf->flag) ? 1 : 0;
2220 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2221 		if (error != 0 && error != ENOENT)
2222 			goto done;
2223 		i = !i;
2224 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2225 		if (error != 0 && error != ENOENT)
2226 			goto done;
2227 	}
2228 	i = !prison_isalive(pr);
2229 	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2230 	if (error != 0 && error != ENOENT)
2231 		goto done;
2232 	i = !i;
2233 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2234 	if (error != 0 && error != ENOENT)
2235 		goto done;
2236 	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2237 	    sizeof(pr->pr_osreldate));
2238 	if (error != 0 && error != ENOENT)
2239 		goto done;
2240 	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2241 	if (error != 0 && error != ENOENT)
2242 		goto done;
2243 
2244 	/* Get the module parameters. */
2245 	mtx_unlock(&pr->pr_mtx);
2246 	drflags &= ~PD_LOCKED;
2247 	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2248 	if (error)
2249 		goto done;
2250 	prison_deref(pr, drflags);
2251 	pr = NULL;
2252 	drflags = 0;
2253 
2254 	/* By now, all parameters should have been noted. */
2255 	TAILQ_FOREACH(opt, opts, link) {
2256 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2257 			error = EINVAL;
2258 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2259 			goto done;
2260 		}
2261 	}
2262 
2263 	/* Write the fetched parameters back to userspace. */
2264 	error = 0;
2265 	TAILQ_FOREACH(opt, opts, link) {
2266 		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2267 			pos = 2 * opt->pos + 1;
2268 			optuio->uio_iov[pos].iov_len = opt->len;
2269 			if (opt->value != NULL) {
2270 				if (optuio->uio_segflg == UIO_SYSSPACE) {
2271 					bcopy(opt->value,
2272 					    optuio->uio_iov[pos].iov_base,
2273 					    opt->len);
2274 				} else {
2275 					error = copyout(opt->value,
2276 					    optuio->uio_iov[pos].iov_base,
2277 					    opt->len);
2278 					if (error)
2279 						break;
2280 				}
2281 			}
2282 		}
2283 	}
2284 
2285  done:
2286 	/* Release any temporary prison holds and/or locks. */
2287 	if (pr != NULL)
2288 		prison_deref(pr, drflags);
2289 	else if (drflags & PD_LIST_SLOCKED)
2290 		sx_sunlock(&allprison_lock);
2291 	if (error && errmsg_pos >= 0) {
2292 		/* Write the error message back to userspace. */
2293 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2294 		errmsg_pos = 2 * errmsg_pos + 1;
2295 		if (errmsg_len > 0) {
2296 			if (optuio->uio_segflg == UIO_SYSSPACE)
2297 				bcopy(errmsg,
2298 				    optuio->uio_iov[errmsg_pos].iov_base,
2299 				    errmsg_len);
2300 			else
2301 				(void)copyout(errmsg,
2302 				    optuio->uio_iov[errmsg_pos].iov_base,
2303 				    errmsg_len);
2304 		}
2305 	}
2306 	vfs_freeopts(opts);
2307 	return (error);
2308 }
2309 
2310 /*
2311  * struct jail_remove_args {
2312  *	int jid;
2313  * };
2314  */
2315 int
sys_jail_remove(struct thread * td,struct jail_remove_args * uap)2316 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2317 {
2318 	struct prison *pr;
2319 	int error;
2320 
2321 	error = priv_check(td, PRIV_JAIL_REMOVE);
2322 	if (error)
2323 		return (error);
2324 
2325 	sx_xlock(&allprison_lock);
2326 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2327 	if (pr == NULL) {
2328 		sx_xunlock(&allprison_lock);
2329 		return (EINVAL);
2330 	}
2331 	if (!prison_isalive(pr)) {
2332 		/* Silently ignore already-dying prisons. */
2333 		mtx_unlock(&pr->pr_mtx);
2334 		sx_xunlock(&allprison_lock);
2335 		return (0);
2336 	}
2337 	prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
2338 	return (0);
2339 }
2340 
2341 /*
2342  * struct jail_attach_args {
2343  *	int jid;
2344  * };
2345  */
2346 int
sys_jail_attach(struct thread * td,struct jail_attach_args * uap)2347 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2348 {
2349 	struct prison *pr;
2350 	int error;
2351 
2352 	error = priv_check(td, PRIV_JAIL_ATTACH);
2353 	if (error)
2354 		return (error);
2355 
2356 	sx_slock(&allprison_lock);
2357 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2358 	if (pr == NULL) {
2359 		sx_sunlock(&allprison_lock);
2360 		return (EINVAL);
2361 	}
2362 
2363 	/* Do not allow a process to attach to a prison that is not alive. */
2364 	if (!prison_isalive(pr)) {
2365 		mtx_unlock(&pr->pr_mtx);
2366 		sx_sunlock(&allprison_lock);
2367 		return (EINVAL);
2368 	}
2369 
2370 	return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
2371 }
2372 
/*
 * Attach the calling thread's process to the prison pr.
 *
 * The caller must own pr->pr_mtx and hold allprison_lock (both are
 * asserted below).  drflags describes how those locks are held; only its
 * PD_LOCK_FLAGS bits are used.  Both locks have been released on every
 * return path, either directly or through prison_deref().
 *
 * Returns 0 on success, or the error from whichever step failed.
 */
2373 static int
do_jail_attach(struct thread * td,struct prison * pr,int drflags)2374 do_jail_attach(struct thread *td, struct prison *pr, int drflags)
2375 {
2376 	struct proc *p;
2377 	struct ucred *newcred, *oldcred;
2378 	int error;
2379 
2380 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2381 	sx_assert(&allprison_lock, SX_LOCKED);
2382 	drflags &= PD_LOCK_FLAGS;
2383 	/*
2384 	 * XXX: Note that there is a slight race here if two threads
2385 	 * in the same privileged process attempt to attach to two
2386 	 * different jails at the same time.  It is important for
2387 	 * user processes not to do this, or they might end up with
2388 	 * a process root from one prison, but attached to the jail
2389 	 * of another.
2390 	 */
2391 	prison_hold(pr);
2392 	refcount_acquire(&pr->pr_uref);
	/* Both references are dropped again if the attach fails. */
2393 	drflags |= PD_DEREF | PD_DEUREF;
2394 	mtx_unlock(&pr->pr_mtx);
2395 	drflags &= ~PD_LOCKED;
2396 
2397 	/* Let modules do whatever they need to prepare for attaching. */
2398 	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2399 	if (error) {
2400 		prison_deref(pr, drflags);
2401 		return (error);
2402 	}
2403 	sx_unlock(&allprison_lock);
2404 	drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);
2405 
2406 	/*
2407 	 * Reparent the newly attached process to this jail.
2408 	 */
2409 	p = td->td_proc;
2410 	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2411 	if (error)
2412 		goto e_revert_osd;
2413 
2414 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2415 	if ((error = change_dir(pr->pr_root, td)) != 0)
2416 		goto e_unlock;
2417 #ifdef MAC
2418 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2419 		goto e_unlock;
2420 #endif
2421 	VOP_UNLOCK(pr->pr_root);
2422 	if ((error = pwd_chroot_chdir(td, pr->pr_root)))
2423 		goto e_revert_osd;
2424 
	/* Install a copy of the process credential that points at pr. */
2425 	newcred = crget();
2426 	PROC_LOCK(p);
2427 	oldcred = crcopysafe(p, newcred);
2428 	newcred->cr_prison = pr;
2429 	proc_set_cred(p, newcred);
2430 	setsugid(p);
	/*
	 * NOTE(review): the crhold() under RACCT appears to be balanced only
	 * by the crfree() under RCTL below -- presumably RCTL implies RACCT;
	 * confirm.
	 */
2431 #ifdef RACCT
2432 	racct_proc_ucred_changed(p, oldcred, newcred);
2433 	crhold(newcred);
2434 #endif
2435 	PROC_UNLOCK(p);
2436 #ifdef RCTL
2437 	rctl_proc_ucred_changed(p, newcred);
2438 	crfree(newcred);
2439 #endif
	/*
	 * The references taken on pr above are kept; release a reference
	 * and a user reference on the old credential's prison instead.
	 */
2440 	prison_deref(oldcred->cr_prison, drflags);
2441 	crfree(oldcred);
2442 
2443 	/*
2444 	 * If the prison was killed while changing credentials, die along
2445 	 * with it.
2446 	 */
2447 	if (!prison_isalive(pr)) {
2448 		PROC_LOCK(p);
2449 		kern_psignal(p, SIGKILL);
2450 		PROC_UNLOCK(p);
2451 	}
2452 
2453 	return (0);
2454 
2455  e_unlock:
2456 	VOP_UNLOCK(pr->pr_root);
2457  e_revert_osd:
2458 	/* Tell modules this thread is still in its old jail after all. */
2459 	sx_slock(&allprison_lock);
2460 	drflags |= PD_LIST_SLOCKED;
2461 	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
	/* Drops the references taken on pr and the list lock taken above. */
2462 	prison_deref(pr, drflags);
2463 	return (error);
2464 }
2465 
2466 /*
2467  * Returns a locked prison instance, or NULL on failure.
2468  */
2469 struct prison *
prison_find(int prid)2470 prison_find(int prid)
2471 {
2472 	struct prison *pr;
2473 
2474 	sx_assert(&allprison_lock, SX_LOCKED);
2475 	TAILQ_FOREACH(pr, &allprison, pr_list) {
2476 		if (pr->pr_id < prid)
2477 			continue;
2478 		if (pr->pr_id > prid)
2479 			break;
2480 		KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
2481 		mtx_lock(&pr->pr_mtx);
2482 		return (pr);
2483 	}
2484 	return (NULL);
2485 }
2486 
2487 /*
2488  * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2489  */
2490 struct prison *
prison_find_child(struct prison * mypr,int prid)2491 prison_find_child(struct prison *mypr, int prid)
2492 {
2493 	struct prison *pr;
2494 	int descend;
2495 
2496 	sx_assert(&allprison_lock, SX_LOCKED);
2497 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2498 		if (pr->pr_id == prid) {
2499 			KASSERT(prison_isvalid(pr),
2500 			    ("Found invalid prison %p", pr));
2501 			mtx_lock(&pr->pr_mtx);
2502 			return (pr);
2503 		}
2504 	}
2505 	return (NULL);
2506 }
2507 
2508 /*
2509  * Look for the name relative to mypr.  Returns a locked prison or NULL.
2510  */
2511 struct prison *
prison_find_name(struct prison * mypr,const char * name)2512 prison_find_name(struct prison *mypr, const char *name)
2513 {
2514 	struct prison *pr, *deadpr;
2515 	size_t mylen;
2516 	int descend;
2517 
2518 	sx_assert(&allprison_lock, SX_LOCKED);
2519 	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2520 	deadpr = NULL;
2521 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2522 		if (!strcmp(pr->pr_name + mylen, name)) {
2523 			KASSERT(prison_isvalid(pr),
2524 			    ("Found invalid prison %p", pr));
2525 			if (prison_isalive(pr)) {
2526 				mtx_lock(&pr->pr_mtx);
2527 				return (pr);
2528 			}
2529 			deadpr = pr;
2530 		}
2531 	}
2532 	/* There was no valid prison - perhaps there was a dying one. */
2533 	if (deadpr != NULL)
2534 		mtx_lock(&deadpr->pr_mtx);
2535 	return (deadpr);
2536 }
2537 
2538 /*
2539  * See if a prison has the specific flag set.  The prison should be locked,
2540  * unless checking for flags that are only set at jail creation (such as
2541  * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
2542  * to any other prison data.
2543  */
2544 int
prison_flag(struct ucred * cred,unsigned flag)2545 prison_flag(struct ucred *cred, unsigned flag)
2546 {
2547 
2548 	return (cred->cr_prison->pr_flags & flag);
2549 }
2550 
2551 int
prison_allow(struct ucred * cred,unsigned flag)2552 prison_allow(struct ucred *cred, unsigned flag)
2553 {
2554 
2555 	return ((cred->cr_prison->pr_allow & flag) != 0);
2556 }
2557 
2558 /*
2559  * Hold a prison reference, by incrementing pr_ref.  It is generally
2560  * an error to hold a prison that does not already have a reference.
2561  * A prison record will remain valid as long as it has at least one
2562  * reference, and will not be removed as long as either the prison
2563  * mutex or the allprison lock is held (allprison_lock may be shared).
 *
 * prison_hold_locked() is a thin wrapper that simply calls prison_hold();
 * it takes no locks and makes no lock assertions of its own.
2564  */
2565 void
prison_hold_locked(struct prison * pr)2566 prison_hold_locked(struct prison *pr)
2567 {
2568 
2569 	/* Locking is no longer required. */
2570 	prison_hold(pr);
2571 }
2572 
/*
 * Take a reference on pr by incrementing pr_ref.
 *
 * With INVARIANTS the increment is done with
 * refcount_acquire_if_not_zero() and a KASSERT fires if that conditional
 * acquire did not succeed; otherwise the count is incremented
 * unconditionally.
 */
2573 void
prison_hold(struct prison * pr)2574 prison_hold(struct prison *pr)
2575 {
2576 #ifdef INVARIANTS
2577 	int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
2578 
2579 	KASSERT(was_valid,
2580 	    ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
2581 #else
2582 	refcount_acquire(&pr->pr_ref);
2583 #endif
2584 }
2585 
2586 /*
2587  * Remove a prison reference.  If that was the last reference, the
2588  * prison will be removed (at a later time).
 *
 * prison_free_locked() must be called with pr->pr_mtx owned (asserted).
 * It unlocks the mutex and then calls prison_free(), so the prison is
 * unlocked on return.
2589  */
2590 void
prison_free_locked(struct prison * pr)2591 prison_free_locked(struct prison *pr)
2592 {
2593 
2594 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2595 	/*
2596 	 * Locking is no longer required, but unlock because the caller
2597 	 * expects it.
2598 	 */
2599 	mtx_unlock(&pr->pr_mtx);
2600 	prison_free(pr);
2601 }
2602 
/*
 * Drop one reference on pr.
 *
 * A reference that is not the last one is released right here.  The last
 * reference is not released in this context; instead pr->pr_task is
 * queued on taskqueue_thread so the release happens later (presumably in
 * prison_complete() -- confirm where pr_task is initialized).
 */
2603 void
prison_free(struct prison * pr)2604 prison_free(struct prison *pr)
2605 {
2606 
2607 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2608 	    ("Trying to free dead prison %p (jid=%d).",
2609 	     pr, pr->pr_id));
2610 	if (!refcount_release_if_not_last(&pr->pr_ref)) {
2611 		/*
2612 		 * Don't remove the last reference in this context,
2613 		 * in case there are locks held.
2614 		 */
2615 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2616 	}
2617 }
2618 
/*
 * Drop a reference on pr that the caller knows is not the last one.
 *
 * With INVARIANTS this asserts that pr_ref was non-zero beforehand and
 * that the release did not drop the final reference.  Without INVARIANTS
 * the result of refcount_release() is ignored.
 */
2619 static void
prison_free_not_last(struct prison * pr)2620 prison_free_not_last(struct prison *pr)
2621 {
2622 #ifdef INVARIANTS
2623 	int lastref;
2624 
2625 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2626 	    ("Trying to free dead prison %p (jid=%d).",
2627 	     pr, pr->pr_id));
2628 	lastref = refcount_release(&pr->pr_ref);
2629 	KASSERT(!lastref,
2630 	    ("prison_free_not_last freed last ref on prison %p (jid=%d).",
2631 	     pr, pr->pr_id));
2632 #else
2633 	refcount_release(&pr->pr_ref);
2634 #endif
2635 }
2636 
2637 /*
2638  * Hold a prison for user visibility, by incrementing pr_uref.
2639  * It is generally an error to hold a prison that isn't already
2640  * user-visible, except through the jail system calls.  It is also
2641  * an error to hold an invalid prison.  A prison record will remain
2642  * alive as long as it has at least one user reference, and will not
2643  * be set to the dying state until the prison mutex and allprison_lock
2644  * are both freed.
 *
 * With INVARIANTS the increment is a conditional acquire and a KASSERT
 * fires if it did not succeed; otherwise pr_uref is incremented
 * unconditionally.
2645  */
2646 void
prison_proc_hold(struct prison * pr)2647 prison_proc_hold(struct prison *pr)
2648 {
2649 #ifdef INVARIANTS
2650 	int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
2651 
2652 	KASSERT(was_alive,
2653 	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2654 #else
2655 	refcount_acquire(&pr->pr_uref);
2656 #endif
2657 }
2658 
2659 /*
2660  * Remove a prison user reference.  If it was the last reference, the
2661  * prison will be considered "dying", and may be removed once all of
2662  * its references are dropped.
 *
 * A user reference that is not the last one is released right here.
 * The last one is not released in this context: PR_COMPLETE_PROC is set
 * on the prison, an extra reference is taken, and pr->pr_task is queued
 * on taskqueue_thread.  prison_complete() tests PR_COMPLETE_PROC to
 * decide whether it must also drop the user reference.
2663  */
2664 void
prison_proc_free(struct prison * pr)2665 prison_proc_free(struct prison *pr)
2666 {
2667 
2668 	/*
2669 	 * Locking is only required when releasing the last reference.
2670 	 * This allows assurance that a locked prison will remain alive
2671 	 * until it is unlocked.
2672 	 */
2673 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2674 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2675 	if (!refcount_release_if_not_last(&pr->pr_uref)) {
2676 		/*
2677 		 * Don't remove the last user reference in this context,
2678 		 * which is expected to be a process that is not only locked,
2679 		 * but also half dead.  Add a reference so any calls to
2680 		 * prison_free() won't re-submit the task.
2681 		 */
2682 		prison_hold(pr);
2683 		mtx_lock(&pr->pr_mtx);
2684 		KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
2685 		    ("Redundant last reference in prison_proc_free (jid=%d)",
2686 		     pr->pr_id));
		/* Mark the deferred user-reference drop as pending. */
2687 		pr->pr_flags |= PR_COMPLETE_PROC;
2688 		mtx_unlock(&pr->pr_mtx);
2689 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2690 	}
2691 }
2692 
/*
 * Drop a user reference on pr that the caller knows is not the last one.
 *
 * With INVARIANTS this asserts that pr_uref was non-zero beforehand and
 * that the release did not drop the final user reference.  Without
 * INVARIANTS the result of refcount_release() is ignored.
 */
2693 static void
prison_proc_free_not_last(struct prison * pr)2694 prison_proc_free_not_last(struct prison *pr)
2695 {
2696 #ifdef INVARIANTS
2697 	int lastref;
2698 
2699 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2700 	    ("Trying to free dead prison %p (jid=%d).",
2701 	     pr, pr->pr_id));
2702 	lastref = refcount_release(&pr->pr_uref);
2703 	KASSERT(!lastref,
2704 	    ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
2705 	     pr, pr->pr_id));
2706 #else
2707 	refcount_release(&pr->pr_uref);
2708 #endif
2709 }
2710 
2711 /*
2712  * Complete a call to either prison_free or prison_proc_free.
 *
 * context is the struct prison whose release was deferred; pending is
 * not used.  The prison and allprison_lock are locked exclusively here
 * and released again by prison_deref().
2713  */
2714 static void
prison_complete(void * context,int pending)2715 prison_complete(void *context, int pending)
2716 {
2717 	struct prison *pr = context;
2718 	int drflags;
2719 
2720 	/*
2721 	 * This could be called to release the last reference, or the last
2722 	 * user reference (plus the reference held in prison_proc_free).
2723 	 */
2724 	drflags = prison_lock_xlock(pr, PD_DEREF);
	/* Set by prison_proc_free(): a user reference must be dropped too. */
2725 	if (pr->pr_flags & PR_COMPLETE_PROC) {
2726 		pr->pr_flags &= ~PR_COMPLETE_PROC;
2727 		drflags |= PD_DEUREF;
2728 	}
2729 	prison_deref(pr, drflags);
2730 }
2731 
2732 /*
2733  * Remove a prison reference and/or user reference (usually).
2734  * This assumes context that allows sleeping (for allprison_lock),
2735  * with no non-sleeping locks held, except perhaps the prison itself.
2736  * If there are no more references, release and delist the prison.
2737  * On completion, the prison lock and the allprison lock are both
2738  * unlocked.
 *
 * flags is a combination of PD_* values: PD_DEREF and PD_DEUREF request
 * dropping a reference and a user reference, PD_KILL requests killing
 * the prison and its descendants, and PD_LOCKED, PD_LIST_SLOCKED and
 * PD_LIST_XLOCKED tell which locks the caller already holds.
2739  */
2740 static void
prison_deref(struct prison * pr,int flags)2741 prison_deref(struct prison *pr, int flags)
2742 {
2743 	struct prisonlist freeprison;
2744 	struct prison *killpr, *rpr, *ppr, *tpr;
2745 	struct proc *p;
2746 
2747 	killpr = NULL;
2748 	TAILQ_INIT(&freeprison);
2749 	/*
2750 	 * Release this prison as requested, which may cause its parent
2751 	 * to be released, and then maybe its grandparent, etc.
2752 	 */
2753 	for (;;) {
2754 		if (flags & PD_KILL) {
2755 			/* Kill the prison and its descendants. */
2756 			KASSERT(pr != &prison0,
2757 			    ("prison_deref trying to kill prison0"));
			/* Make sure a reference is held while killing. */
2758 			if (!(flags & PD_DEREF)) {
2759 				prison_hold(pr);
2760 				flags |= PD_DEREF;
2761 			}
2762 			flags = prison_lock_xlock(pr, flags);
2763 			prison_deref_kill(pr, &freeprison);
2764 		}
2765 		if (flags & PD_DEUREF) {
2766 			/* Drop a user reference. */
2767 			KASSERT(refcount_load(&pr->pr_uref) > 0,
2768 			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
2769 			     pr->pr_id));
			/* Only the last user reference needs the locks. */
2770 			if (!refcount_release_if_not_last(&pr->pr_uref)) {
2771 				if (!(flags & PD_DEREF)) {
2772 					prison_hold(pr);
2773 					flags |= PD_DEREF;
2774 				}
2775 				flags = prison_lock_xlock(pr, flags);
2776 				if (refcount_release(&pr->pr_uref) &&
2777 				    pr->pr_state == PRISON_STATE_ALIVE) {
2778 					/*
2779 					 * When the last user reference goes,
2780 					 * this becomes a dying prison.
2781 					 */
2782 					KASSERT(
2783 					    refcount_load(&prison0.pr_uref) > 0,
2784 					    ("prison0 pr_uref=0"));
2785 					pr->pr_state = PRISON_STATE_DYING;
2786 					mtx_unlock(&pr->pr_mtx);
2787 					flags &= ~PD_LOCKED;
2788 					prison_cleanup(pr);
2789 				}
2790 			}
2791 		}
2792 		if (flags & PD_KILL) {
2793 			/*
2794 			 * Any remaining user references are probably processes
2795 			 * that need to be killed, either in this prison or its
2796 			 * descendants.
2797 			 */
2798 			if (refcount_load(&pr->pr_uref) > 0)
2799 				killpr = pr;
2800 			/* Make sure the parent prison doesn't get killed. */
2801 			flags &= ~PD_KILL;
2802 		}
2803 		if (flags & PD_DEREF) {
2804 			/* Drop a reference. */
2805 			KASSERT(refcount_load(&pr->pr_ref) > 0,
2806 			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
2807 			     pr->pr_id));
			/* Only the last reference needs the locks. */
2808 			if (!refcount_release_if_not_last(&pr->pr_ref)) {
2809 				flags = prison_lock_xlock(pr, flags);
2810 				if (refcount_release(&pr->pr_ref)) {
2811 					/*
2812 					 * When the last reference goes,
2813 					 * unlink the prison and set it aside.
2814 					 */
2815 					KASSERT(
2816 					    refcount_load(&pr->pr_uref) == 0,
2817 					    ("prison_deref: last ref, "
2818 					     "but still has %d urefs (jid=%d)",
2819 					     pr->pr_uref, pr->pr_id));
2820 					KASSERT(
2821 					    refcount_load(&prison0.pr_ref) != 0,
2822 					    ("prison0 pr_ref=0"));
2823 					pr->pr_state = PRISON_STATE_INVALID;
2824 					TAILQ_REMOVE(&allprison, pr, pr_list);
2825 					LIST_REMOVE(pr, pr_sibling);
2826 					TAILQ_INSERT_TAIL(&freeprison, pr,
2827 					    pr_list);
2828 					for (ppr = pr->pr_parent;
2829 					     ppr != NULL;
2830 					     ppr = ppr->pr_parent)
2831 						ppr->pr_childcount--;
2832 					/*
2833 					 * Removing a prison frees references
2834 					 * from its parent.
2835 					 */
2836 					mtx_unlock(&pr->pr_mtx);
2837 					flags &= ~PD_LOCKED;
2838 					pr = pr->pr_parent;
2839 					flags |= PD_DEREF | PD_DEUREF;
2840 					continue;
2841 				}
2842 			}
2843 		}
2844 		break;
2845 	}
2846 
2847 	/* Release all the prison locks. */
2848 	if (flags & PD_LOCKED)
2849 		mtx_unlock(&pr->pr_mtx);
2850 	if (flags & PD_LIST_SLOCKED)
2851 		sx_sunlock(&allprison_lock);
2852 	else if (flags & PD_LIST_XLOCKED)
2853 		sx_xunlock(&allprison_lock);
2854 
2855 	/* Kill any processes attached to a killed prison. */
2856 	if (killpr != NULL) {
2857 		sx_slock(&allproc_lock);
2858 		FOREACH_PROC_IN_SYSTEM(p) {
2859 			PROC_LOCK(p);
2860 			if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
				/* Walk up from the process's prison looking for killpr. */
2861 				for (ppr = p->p_ucred->cr_prison;
2862 				     ppr != &prison0;
2863 				     ppr = ppr->pr_parent)
2864 					if (ppr == killpr) {
2865 						kern_psignal(p, SIGKILL);
2866 						break;
2867 					}
2868 			}
2869 			PROC_UNLOCK(p);
2870 		}
2871 		sx_sunlock(&allproc_lock);
2872 	}
2873 
2874 	/*
2875 	 * Finish removing any unreferenced prisons, which couldn't happen
2876 	 * while allprison_lock was held (to avoid a LOR on vrele).
2877 	 */
2878 	TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
2879 #ifdef VIMAGE
		/* Only destroy a vnet that is not shared with the parent. */
2880 		if (rpr->pr_vnet != rpr->pr_parent->pr_vnet)
2881 			vnet_destroy(rpr->pr_vnet);
2882 #endif
2883 		if (rpr->pr_root != NULL)
2884 			vrele(rpr->pr_root);
2885 		mtx_destroy(&rpr->pr_mtx);
2886 #ifdef INET
2887 		free(rpr->pr_ip4, M_PRISON);
2888 #endif
2889 #ifdef INET6
2890 		free(rpr->pr_ip6, M_PRISON);
2891 #endif
2892 		if (rpr->pr_cpuset != NULL)
2893 			cpuset_rel(rpr->pr_cpuset);
2894 		osd_jail_exit(rpr);
2895 #ifdef RACCT
2896 		if (racct_enable)
2897 			prison_racct_detach(rpr);
2898 #endif
2899 		TAILQ_REMOVE(&freeprison, rpr, pr_list);
2900 		free(rpr, M_PRISON);
2901 	}
2902 }
2903 
2904 /*
2905  * Kill the prison and its descendants.  Mark them as dying, clear the
2906  * persist flag, and call module remove methods.
 *
 * prison_deref() calls this right after prison_lock_xlock(), i.e. with
 * pr locked and allprison_lock held exclusively.  pr->pr_mtx is dropped
 * while the descendants are processed and is held again on return.
 * Descendants whose last reference goes away here are moved from
 * allprison onto freeprison for the caller to finish freeing.
2907  */
2908 static void
prison_deref_kill(struct prison * pr,struct prisonlist * freeprison)2909 prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
2910 {
2911 	struct prison *cpr, *ppr, *rpr;
2912 	bool descend;
2913 
2914 	/*
2915 	 * Unlike the descendants, the target prison can be killed
2916 	 * even if it is currently dying.  This is useful for failed
2917 	 * creation in jail_set(2).
2918 	 */
2919 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2920 	    ("Trying to kill dead prison %p (jid=%d).",
2921 	     pr, pr->pr_id));
	/* Temporary user reference, released at the end of this function. */
2922 	refcount_acquire(&pr->pr_uref);
2923 	pr->pr_state = PRISON_STATE_DYING;
2924 	mtx_unlock(&pr->pr_mtx);
2925 
2926 	rpr = NULL;
2927 	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
		/*
		 * NOTE(review): descend presumably distinguishes the visit on
		 * the way down (true) from the visit on the way back up
		 * (false) -- confirm against the macro definition.
		 */
2928 		if (descend) {
2929 			if (!prison_isalive(cpr)) {
				/* Already dying: leave it and do not go below it. */
2930 				descend = false;
2931 				continue;
2932 			}
			/* Pin the child and mark it for removal handling below. */
2933 			prison_hold(cpr);
2934 			prison_proc_hold(cpr);
2935 			mtx_lock(&cpr->pr_mtx);
2936 			cpr->pr_state = PRISON_STATE_DYING;
2937 			cpr->pr_flags |= PR_REMOVE;
2938 			mtx_unlock(&cpr->pr_mtx);
2939 			continue;
2940 		}
		/* Only finish prisons that were marked in the branch above. */
2941 		if (!(cpr->pr_flags & PR_REMOVE))
2942 			continue;
2943 		prison_cleanup(cpr);
2944 		mtx_lock(&cpr->pr_mtx);
2945 		cpr->pr_flags &= ~PR_REMOVE;
2946 		if (cpr->pr_flags & PR_PERSIST) {
2947 			cpr->pr_flags &= ~PR_PERSIST;
2948 			prison_proc_free_not_last(cpr);
2949 			prison_free_not_last(cpr);
2950 		}
		/* Release the references taken when the child was marked. */
2951 		(void)refcount_release(&cpr->pr_uref);
2952 		if (refcount_release(&cpr->pr_ref)) {
2953 			/*
2954 			 * When the last reference goes, unlink the prison
2955 			 * and set it aside for prison_deref() to handle.
2956 			 * Delay unlinking the sibling list to keep the loop
2957 			 * safe.
2958 			 */
2959 			if (rpr != NULL)
2960 				LIST_REMOVE(rpr, pr_sibling);
2961 			rpr = cpr;
2962 			rpr->pr_state = PRISON_STATE_INVALID;
2963 			TAILQ_REMOVE(&allprison, rpr, pr_list);
2964 			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
2965 			/*
2966 			 * Removing a prison frees references from its parent.
2967 			 */
2968 			ppr = rpr->pr_parent;
2969 			prison_proc_free_not_last(ppr);
2970 			prison_free_not_last(ppr);
2971 			for (; ppr != NULL; ppr = ppr->pr_parent)
2972 				ppr->pr_childcount--;
2973 		}
2974 		mtx_unlock(&cpr->pr_mtx);
2975 	}
	/* Unlink the last prison whose sibling removal was delayed. */
2976 	if (rpr != NULL)
2977 		LIST_REMOVE(rpr, pr_sibling);
2978 
2979 	prison_cleanup(pr);
	/* The mutex taken here is still held when the function returns. */
2980 	mtx_lock(&pr->pr_mtx);
2981 	if (pr->pr_flags & PR_PERSIST) {
2982 		pr->pr_flags &= ~PR_PERSIST;
2983 		prison_proc_free_not_last(pr);
2984 		prison_free_not_last(pr);
2985 	}
2986 	(void)refcount_release(&pr->pr_uref);
2987 }
2988 
2989 /*
2990  * Given the current locking state in the flags, make sure allprison_lock
2991  * is held exclusive, and the prison is locked.  Return flags indicating
2992  * the new state.
2993  */
2994 static int
prison_lock_xlock(struct prison * pr,int flags)2995 prison_lock_xlock(struct prison *pr, int flags)
2996 {
2997 
2998 	if (!(flags & PD_LIST_XLOCKED)) {
2999 		/*
3000 		 * Get allprison_lock, which may be an upgrade,
3001 		 * and may require unlocking the prison.
3002 		 */
3003 		if (flags & PD_LOCKED) {
3004 			mtx_unlock(&pr->pr_mtx);
3005 			flags &= ~PD_LOCKED;
3006 		}
3007 		if (flags & PD_LIST_SLOCKED) {
3008 			if (!sx_try_upgrade(&allprison_lock)) {
3009 				sx_sunlock(&allprison_lock);
3010 				sx_xlock(&allprison_lock);
3011 			}
3012 			flags &= ~PD_LIST_SLOCKED;
3013 		} else
3014 			sx_xlock(&allprison_lock);
3015 		flags |= PD_LIST_XLOCKED;
3016 	}
3017 	if (!(flags & PD_LOCKED)) {
3018 		/* Lock the prison mutex. */
3019 		mtx_lock(&pr->pr_mtx);
3020 		flags |= PD_LOCKED;
3021 	}
3022 	return flags;
3023 }
3024 
3025 /*
3026  * Release a prison's resources when it starts dying (when the last user
3027  * reference is dropped, or when it is killed).
 *
 * Must be called with allprison_lock held exclusively and without the
 * prison mutex owned (both asserted).  Calls vfs_exjail_delete(),
 * shm_remove_prison() and the PR_METHOD_REMOVE module methods, in that
 * order; the result of the module call is ignored.
3028  */
3029 static void
prison_cleanup(struct prison * pr)3030 prison_cleanup(struct prison *pr)
3031 {
3032 	sx_assert(&allprison_lock, SA_XLOCKED);
3033 	mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
3034 	vfs_exjail_delete(pr);
3035 	shm_remove_prison(pr);
3036 	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
3037 }
3038 
3039 /*
3040  * Set or clear a permission bit in the pr_allow field, passing restrictions
3041  * (cleared permission) down to child jails.
3042  */
3043 void
prison_set_allow(struct ucred * cred,unsigned flag,int enable)3044 prison_set_allow(struct ucred *cred, unsigned flag, int enable)
3045 {
3046 	struct prison *pr;
3047 
3048 	pr = cred->cr_prison;
3049 	sx_slock(&allprison_lock);
3050 	mtx_lock(&pr->pr_mtx);
3051 	prison_set_allow_locked(pr, flag, enable);
3052 	mtx_unlock(&pr->pr_mtx);
3053 	sx_sunlock(&allprison_lock);
3054 }
3055 
3056 static void
prison_set_allow_locked(struct prison * pr,unsigned flag,int enable)3057 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
3058 {
3059 	struct prison *cpr;
3060 	int descend;
3061 
3062 	if (enable != 0)
3063 		pr->pr_allow |= flag;
3064 	else {
3065 		pr->pr_allow &= ~flag;
3066 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
3067 			cpr->pr_allow &= ~flag;
3068 	}
3069 }
3070 
3071 /*
3072  * Check if a jail supports the given address family.
3073  *
3074  * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3075  * if not.
3076  */
3077 int
prison_check_af(struct ucred * cred,int af)3078 prison_check_af(struct ucred *cred, int af)
3079 {
3080 	struct prison *pr;
3081 	int error;
3082 
3083 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3084 
3085 	pr = cred->cr_prison;
3086 #ifdef VIMAGE
3087 	/* Prisons with their own network stack are not limited. */
3088 	if (prison_owns_vnet(cred))
3089 		return (0);
3090 #endif
3091 
3092 	error = 0;
3093 	switch (af)
3094 	{
3095 #ifdef INET
3096 	case AF_INET:
3097 		if (pr->pr_flags & PR_IP4)
3098 		{
3099 			mtx_lock(&pr->pr_mtx);
3100 			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3101 				error = EAFNOSUPPORT;
3102 			mtx_unlock(&pr->pr_mtx);
3103 		}
3104 		break;
3105 #endif
3106 #ifdef INET6
3107 	case AF_INET6:
3108 		if (pr->pr_flags & PR_IP6)
3109 		{
3110 			mtx_lock(&pr->pr_mtx);
3111 			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3112 				error = EAFNOSUPPORT;
3113 			mtx_unlock(&pr->pr_mtx);
3114 		}
3115 		break;
3116 #endif
3117 	case AF_LOCAL:
3118 	case AF_ROUTE:
3119 		break;
3120 	default:
3121 		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3122 			error = EAFNOSUPPORT;
3123 	}
3124 	return (error);
3125 }
3126 
3127 /*
3128  * Check if given address belongs to the jail referenced by cred (wrapper to
3129  * prison_check_ip[46]).
3130  *
3131  * Returns 0 if jail doesn't restrict the address family or if address belongs
3132  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3133  * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3134  */
3135 int
prison_if(struct ucred * cred,const struct sockaddr * sa)3136 prison_if(struct ucred *cred, const struct sockaddr *sa)
3137 {
3138 #ifdef INET
3139 	const struct sockaddr_in *sai;
3140 #endif
3141 #ifdef INET6
3142 	const struct sockaddr_in6 *sai6;
3143 #endif
3144 	int error;
3145 
3146 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3147 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3148 
3149 #ifdef VIMAGE
3150 	if (prison_owns_vnet(cred))
3151 		return (0);
3152 #endif
3153 
3154 	error = 0;
3155 	switch (sa->sa_family)
3156 	{
3157 #ifdef INET
3158 	case AF_INET:
3159 		sai = (const struct sockaddr_in *)sa;
3160 		error = prison_check_ip4(cred, &sai->sin_addr);
3161 		break;
3162 #endif
3163 #ifdef INET6
3164 	case AF_INET6:
3165 		sai6 = (const struct sockaddr_in6 *)sa;
3166 		error = prison_check_ip6(cred, &sai6->sin6_addr);
3167 		break;
3168 #endif
3169 	default:
3170 		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3171 			error = EAFNOSUPPORT;
3172 	}
3173 	return (error);
3174 }
3175 
3176 /*
3177  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3178  */
3179 int
prison_check(struct ucred * cred1,struct ucred * cred2)3180 prison_check(struct ucred *cred1, struct ucred *cred2)
3181 {
3182 
3183 	return ((cred1->cr_prison == cred2->cr_prison ||
3184 	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3185 }
3186 
3187 /*
3188  * For mountd/nfsd to run within a prison, it must be:
3189  * - A vnet prison.
3190  * - PR_ALLOW_NFSD must be set on it.
3191  * - The root directory (pr_root) of the prison must be
3192  *   a file system mount point, so the mountd can hang
3193  *   export information on it.
3194  * - The prison's enforce_statfs cannot be 0, so that
3195  *   mountd(8) can do exports.
3196  */
3197 bool
prison_check_nfsd(struct ucred * cred)3198 prison_check_nfsd(struct ucred *cred)
3199 {
3200 
3201 	if (jailed_without_vnet(cred))
3202 		return (false);
3203 	if (!prison_allow(cred, PR_ALLOW_NFSD))
3204 		return (false);
3205 	if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0)
3206 		return (false);
3207 	if (cred->cr_prison->pr_enforce_statfs == 0)
3208 		return (false);
3209 	return (true);
3210 }
3211 
3212 /*
3213  * Return 1 if p2 is a child of p1, otherwise 0.
3214  */
3215 int
prison_ischild(struct prison * pr1,struct prison * pr2)3216 prison_ischild(struct prison *pr1, struct prison *pr2)
3217 {
3218 
3219 	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3220 		if (pr1 == pr2)
3221 			return (1);
3222 	return (0);
3223 }
3224 
3225 /*
3226  * Return true if the prison is currently alive.  A prison is alive if it
3227  * holds user references and it isn't being removed.
3228  */
3229 bool
prison_isalive(struct prison * pr)3230 prison_isalive(struct prison *pr)
3231 {
3232 
3233 	if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE))
3234 		return (false);
3235 	return (true);
3236 }
3237 
3238 /*
3239  * Return true if the prison is currently valid.  A prison is valid if it has
3240  * been fully created, and is not being destroyed.  Note that dying prisons
3241  * are still considered valid.  Invalid prisons won't be found under normal
3242  * circumstances, as they're only put in that state by functions that have
3243  * an exclusive hold on allprison_lock.
3244  */
3245 bool
prison_isvalid(struct prison * pr)3246 prison_isvalid(struct prison *pr)
3247 {
3248 
3249 	if (__predict_false(pr->pr_state == PRISON_STATE_INVALID))
3250 		return (false);
3251 	if (__predict_false(refcount_load(&pr->pr_ref) == 0))
3252 		return (false);
3253 	return (true);
3254 }
3255 
/*
 * Return 1 if the passed credential is in a jail and that jail does not
 * have its own virtual network stack, otherwise 0.  Without VIMAGE no jail
 * owns a vnet, so the answer is simply whether the credential is jailed.
 */
int
jailed_without_vnet(struct ucred *cred)
{

#ifdef VIMAGE
	return ((jailed(cred) && !prison_owns_vnet(cred)) ? 1 : 0);
#else
	return (jailed(cred) ? 1 : 0);
#endif
}
3273 
3274 /*
3275  * Return the correct hostname (domainname, et al) for the passed credential.
3276  */
3277 void
getcredhostname(struct ucred * cred,char * buf,size_t size)3278 getcredhostname(struct ucred *cred, char *buf, size_t size)
3279 {
3280 	struct prison *pr;
3281 
3282 	/*
3283 	 * A NULL credential can be used to shortcut to the physical
3284 	 * system's hostname.
3285 	 */
3286 	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3287 	mtx_lock(&pr->pr_mtx);
3288 	strlcpy(buf, pr->pr_hostname, size);
3289 	mtx_unlock(&pr->pr_mtx);
3290 }
3291 
3292 void
getcreddomainname(struct ucred * cred,char * buf,size_t size)3293 getcreddomainname(struct ucred *cred, char *buf, size_t size)
3294 {
3295 
3296 	mtx_lock(&cred->cr_prison->pr_mtx);
3297 	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3298 	mtx_unlock(&cred->cr_prison->pr_mtx);
3299 }
3300 
3301 void
getcredhostuuid(struct ucred * cred,char * buf,size_t size)3302 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3303 {
3304 
3305 	mtx_lock(&cred->cr_prison->pr_mtx);
3306 	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3307 	mtx_unlock(&cred->cr_prison->pr_mtx);
3308 }
3309 
3310 void
getcredhostid(struct ucred * cred,unsigned long * hostid)3311 getcredhostid(struct ucred *cred, unsigned long *hostid)
3312 {
3313 
3314 	mtx_lock(&cred->cr_prison->pr_mtx);
3315 	*hostid = cred->cr_prison->pr_hostid;
3316 	mtx_unlock(&cred->cr_prison->pr_mtx);
3317 }
3318 
3319 void
getjailname(struct ucred * cred,char * name,size_t len)3320 getjailname(struct ucred *cred, char *name, size_t len)
3321 {
3322 
3323 	mtx_lock(&cred->cr_prison->pr_mtx);
3324 	strlcpy(name, cred->cr_prison->pr_name, len);
3325 	mtx_unlock(&cred->cr_prison->pr_mtx);
3326 }
3327 
#ifdef VIMAGE
/*
 * Determine whether the prison represented by cred owns its vnet rather
 * than having it inherited.
 *
 * Returns 1 if PR_VNET is set in the prison's flags, 0 otherwise.
 */
int
prison_owns_vnet(struct ucred *cred)
{

	/*
	 * vnets cannot be added/removed after jail creation,
	 * so no need to lock here.
	 */
	if ((cred->cr_prison->pr_flags & PR_VNET) != 0)
		return (1);
	return (0);
}
#endif
3346 
3347 /*
3348  * Determine whether the subject represented by cred can "see"
3349  * status of a mount point.
3350  * Returns: 0 for permitted, ENOENT otherwise.
3351  * XXX: This function should be called cr_canseemount() and should be
3352  *      placed in kern_prot.c.
3353  */
3354 int
prison_canseemount(struct ucred * cred,struct mount * mp)3355 prison_canseemount(struct ucred *cred, struct mount *mp)
3356 {
3357 	struct prison *pr;
3358 	struct statfs *sp;
3359 	size_t len;
3360 
3361 	pr = cred->cr_prison;
3362 	if (pr->pr_enforce_statfs == 0)
3363 		return (0);
3364 	if (pr->pr_root->v_mount == mp)
3365 		return (0);
3366 	if (pr->pr_enforce_statfs == 2)
3367 		return (ENOENT);
3368 	/*
3369 	 * If jail's chroot directory is set to "/" we should be able to see
3370 	 * all mount-points from inside a jail.
3371 	 * This is ugly check, but this is the only situation when jail's
3372 	 * directory ends with '/'.
3373 	 */
3374 	if (strcmp(pr->pr_path, "/") == 0)
3375 		return (0);
3376 	len = strlen(pr->pr_path);
3377 	sp = &mp->mnt_stat;
3378 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3379 		return (ENOENT);
3380 	/*
3381 	 * Be sure that we don't have situation where jail's root directory
3382 	 * is "/some/path" and mount point is "/some/pathpath".
3383 	 */
3384 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3385 		return (ENOENT);
3386 	return (0);
3387 }
3388 
3389 void
prison_enforce_statfs(struct ucred * cred,struct mount * mp,struct statfs * sp)3390 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3391 {
3392 	char jpath[MAXPATHLEN];
3393 	struct prison *pr;
3394 	size_t len;
3395 
3396 	pr = cred->cr_prison;
3397 	if (pr->pr_enforce_statfs == 0)
3398 		return;
3399 	if (prison_canseemount(cred, mp) != 0) {
3400 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3401 		strlcpy(sp->f_mntonname, "[restricted]",
3402 		    sizeof(sp->f_mntonname));
3403 		return;
3404 	}
3405 	if (pr->pr_root->v_mount == mp) {
3406 		/*
3407 		 * Clear current buffer data, so we are sure nothing from
3408 		 * the valid path left there.
3409 		 */
3410 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3411 		*sp->f_mntonname = '/';
3412 		return;
3413 	}
3414 	/*
3415 	 * If jail's chroot directory is set to "/" we should be able to see
3416 	 * all mount-points from inside a jail.
3417 	 */
3418 	if (strcmp(pr->pr_path, "/") == 0)
3419 		return;
3420 	len = strlen(pr->pr_path);
3421 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3422 	/*
3423 	 * Clear current buffer data, so we are sure nothing from
3424 	 * the valid path left there.
3425 	 */
3426 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3427 	if (*jpath == '\0') {
3428 		/* Should never happen. */
3429 		*sp->f_mntonname = '/';
3430 	} else {
3431 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3432 	}
3433 }
3434 
3435 /*
3436  * Check with permission for a specific privilege is granted within jail.  We
3437  * have a specific list of accepted privileges; the rest are denied.
3438  */
3439 int
prison_priv_check(struct ucred * cred,int priv)3440 prison_priv_check(struct ucred *cred, int priv)
3441 {
3442 	struct prison *pr;
3443 	int error;
3444 
3445 	/*
3446 	 * Some policies have custom handlers. This routine should not be
3447 	 * called for them. See priv_check_cred().
3448 	 */
3449 	switch (priv) {
3450 	case PRIV_VFS_LOOKUP:
3451 	case PRIV_VFS_GENERATION:
3452 		KASSERT(0, ("prison_priv_check instead of a custom handler "
3453 		    "called for %d\n", priv));
3454 	}
3455 
3456 	if (!jailed(cred))
3457 		return (0);
3458 
3459 #ifdef VIMAGE
3460 	/*
3461 	 * Privileges specific to prisons with a virtual network stack.
3462 	 * There might be a duplicate entry here in case the privilege
3463 	 * is only granted conditionally in the legacy jail case.
3464 	 */
3465 	switch (priv) {
3466 		/*
3467 		 * NFS-specific privileges.
3468 		 */
3469 	case PRIV_NFS_DAEMON:
3470 	case PRIV_VFS_GETFH:
3471 	case PRIV_VFS_MOUNT_EXPORTED:
3472 		if (!prison_check_nfsd(cred))
3473 			return (EPERM);
3474 #ifdef notyet
3475 	case PRIV_NFS_LOCKD:
3476 #endif
3477 		/*
3478 		 * Network stack privileges.
3479 		 */
3480 	case PRIV_NET_BRIDGE:
3481 	case PRIV_NET_GRE:
3482 	case PRIV_NET_BPF:
3483 	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
3484 	case PRIV_NET_ROUTE:
3485 	case PRIV_NET_TAP:
3486 	case PRIV_NET_SETIFMTU:
3487 	case PRIV_NET_SETIFFLAGS:
3488 	case PRIV_NET_SETIFCAP:
3489 	case PRIV_NET_SETIFDESCR:
3490 	case PRIV_NET_SETIFNAME	:
3491 	case PRIV_NET_SETIFMETRIC:
3492 	case PRIV_NET_SETIFPHYS:
3493 	case PRIV_NET_SETIFMAC:
3494 	case PRIV_NET_SETLANPCP:
3495 	case PRIV_NET_ADDMULTI:
3496 	case PRIV_NET_DELMULTI:
3497 	case PRIV_NET_HWIOCTL:
3498 	case PRIV_NET_SETLLADDR:
3499 	case PRIV_NET_ADDIFGROUP:
3500 	case PRIV_NET_DELIFGROUP:
3501 	case PRIV_NET_IFCREATE:
3502 	case PRIV_NET_IFDESTROY:
3503 	case PRIV_NET_ADDIFADDR:
3504 	case PRIV_NET_DELIFADDR:
3505 	case PRIV_NET_LAGG:
3506 	case PRIV_NET_GIF:
3507 	case PRIV_NET_SETIFVNET:
3508 	case PRIV_NET_SETIFFIB:
3509 	case PRIV_NET_ME:
3510 	case PRIV_NET_WG:
3511 
3512 		/*
3513 		 * 802.11-related privileges.
3514 		 */
3515 	case PRIV_NET80211_VAP_GETKEY:
3516 	case PRIV_NET80211_VAP_MANAGE:
3517 
3518 #ifdef notyet
3519 		/*
3520 		 * ATM privileges.
3521 		 */
3522 	case PRIV_NETATM_CFG:
3523 	case PRIV_NETATM_ADD:
3524 	case PRIV_NETATM_DEL:
3525 	case PRIV_NETATM_SET:
3526 
3527 		/*
3528 		 * Bluetooth privileges.
3529 		 */
3530 	case PRIV_NETBLUETOOTH_RAW:
3531 #endif
3532 
3533 		/*
3534 		 * Netgraph and netgraph module privileges.
3535 		 */
3536 	case PRIV_NETGRAPH_CONTROL:
3537 #ifdef notyet
3538 	case PRIV_NETGRAPH_TTY:
3539 #endif
3540 
3541 		/*
3542 		 * IPv4 and IPv6 privileges.
3543 		 */
3544 	case PRIV_NETINET_IPFW:
3545 	case PRIV_NETINET_DIVERT:
3546 	case PRIV_NETINET_PF:
3547 	case PRIV_NETINET_DUMMYNET:
3548 	case PRIV_NETINET_CARP:
3549 	case PRIV_NETINET_MROUTE:
3550 	case PRIV_NETINET_RAW:
3551 	case PRIV_NETINET_ADDRCTRL6:
3552 	case PRIV_NETINET_ND6:
3553 	case PRIV_NETINET_SCOPE6:
3554 	case PRIV_NETINET_ALIFETIME6:
3555 	case PRIV_NETINET_IPSEC:
3556 	case PRIV_NETINET_BINDANY:
3557 
3558 #ifdef notyet
3559 		/*
3560 		 * NCP privileges.
3561 		 */
3562 	case PRIV_NETNCP:
3563 
3564 		/*
3565 		 * SMB privileges.
3566 		 */
3567 	case PRIV_NETSMB:
3568 #endif
3569 
3570 	/*
3571 	 * No default: or deny here.
3572 	 * In case of no permit fall through to next switch().
3573 	 */
3574 		if (cred->cr_prison->pr_flags & PR_VNET)
3575 			return (0);
3576 	}
3577 #endif /* VIMAGE */
3578 
3579 	switch (priv) {
3580 		/*
3581 		 * Allow ktrace privileges for root in jail.
3582 		 */
3583 	case PRIV_KTRACE:
3584 
3585 #if 0
3586 		/*
3587 		 * Allow jailed processes to configure audit identity and
3588 		 * submit audit records (login, etc).  In the future we may
3589 		 * want to further refine the relationship between audit and
3590 		 * jail.
3591 		 */
3592 	case PRIV_AUDIT_GETAUDIT:
3593 	case PRIV_AUDIT_SETAUDIT:
3594 	case PRIV_AUDIT_SUBMIT:
3595 #endif
3596 
3597 		/*
3598 		 * Allow jailed processes to manipulate process UNIX
3599 		 * credentials in any way they see fit.
3600 		 */
3601 	case PRIV_CRED_SETUID:
3602 	case PRIV_CRED_SETEUID:
3603 	case PRIV_CRED_SETGID:
3604 	case PRIV_CRED_SETEGID:
3605 	case PRIV_CRED_SETGROUPS:
3606 	case PRIV_CRED_SETREUID:
3607 	case PRIV_CRED_SETREGID:
3608 	case PRIV_CRED_SETRESUID:
3609 	case PRIV_CRED_SETRESGID:
3610 
3611 		/*
3612 		 * Jail implements visibility constraints already, so allow
3613 		 * jailed root to override uid/gid-based constraints.
3614 		 */
3615 	case PRIV_SEEOTHERGIDS:
3616 	case PRIV_SEEOTHERUIDS:
3617 	case PRIV_SEEJAILPROC:
3618 
3619 		/*
3620 		 * Jail implements inter-process debugging limits already, so
3621 		 * allow jailed root various debugging privileges.
3622 		 */
3623 	case PRIV_DEBUG_DIFFCRED:
3624 	case PRIV_DEBUG_SUGID:
3625 	case PRIV_DEBUG_UNPRIV:
3626 
3627 		/*
3628 		 * Allow jail to set various resource limits and login
3629 		 * properties, and for now, exceed process resource limits.
3630 		 */
3631 	case PRIV_PROC_LIMIT:
3632 	case PRIV_PROC_SETLOGIN:
3633 	case PRIV_PROC_SETRLIMIT:
3634 
3635 		/*
3636 		 * System V and POSIX IPC privileges are granted in jail.
3637 		 */
3638 	case PRIV_IPC_READ:
3639 	case PRIV_IPC_WRITE:
3640 	case PRIV_IPC_ADMIN:
3641 	case PRIV_IPC_MSGSIZE:
3642 	case PRIV_MQ_ADMIN:
3643 
3644 		/*
3645 		 * Jail operations within a jail work on child jails.
3646 		 */
3647 	case PRIV_JAIL_ATTACH:
3648 	case PRIV_JAIL_SET:
3649 	case PRIV_JAIL_REMOVE:
3650 
3651 		/*
3652 		 * Jail implements its own inter-process limits, so allow
3653 		 * root processes in jail to change scheduling on other
3654 		 * processes in the same jail.  Likewise for signalling.
3655 		 */
3656 	case PRIV_SCHED_DIFFCRED:
3657 	case PRIV_SCHED_CPUSET:
3658 	case PRIV_SIGNAL_DIFFCRED:
3659 	case PRIV_SIGNAL_SUGID:
3660 
3661 		/*
3662 		 * Allow jailed processes to write to sysctls marked as jail
3663 		 * writable.
3664 		 */
3665 	case PRIV_SYSCTL_WRITEJAIL:
3666 
3667 		/*
3668 		 * Allow root in jail to manage a variety of quota
3669 		 * properties.  These should likely be conditional on a
3670 		 * configuration option.
3671 		 */
3672 	case PRIV_VFS_GETQUOTA:
3673 	case PRIV_VFS_SETQUOTA:
3674 
3675 		/*
3676 		 * Since Jail relies on chroot() to implement file system
3677 		 * protections, grant many VFS privileges to root in jail.
3678 		 * Be careful to exclude mount-related and NFS-related
3679 		 * privileges.
3680 		 */
3681 	case PRIV_VFS_READ:
3682 	case PRIV_VFS_WRITE:
3683 	case PRIV_VFS_ADMIN:
3684 	case PRIV_VFS_EXEC:
3685 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
3686 	case PRIV_VFS_CHFLAGS_DEV:
3687 	case PRIV_VFS_CHOWN:
3688 	case PRIV_VFS_CHROOT:
3689 	case PRIV_VFS_RETAINSUGID:
3690 	case PRIV_VFS_FCHROOT:
3691 	case PRIV_VFS_LINK:
3692 	case PRIV_VFS_SETGID:
3693 	case PRIV_VFS_STAT:
3694 	case PRIV_VFS_STICKYFILE:
3695 
3696 		/*
3697 		 * As in the non-jail case, non-root users are expected to be
3698 		 * able to read kernel/physical memory (provided /dev/[k]mem
3699 		 * exists in the jail and they have permission to access it).
3700 		 */
3701 	case PRIV_KMEM_READ:
3702 		return (0);
3703 
3704 		/*
3705 		 * Depending on the global setting, allow privilege of
3706 		 * setting system flags.
3707 		 */
3708 	case PRIV_VFS_SYSFLAGS:
3709 		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
3710 			return (0);
3711 		else
3712 			return (EPERM);
3713 
3714 		/*
3715 		 * Depending on the global setting, allow privilege of
3716 		 * mounting/unmounting file systems.
3717 		 */
3718 	case PRIV_VFS_MOUNT:
3719 	case PRIV_VFS_UNMOUNT:
3720 	case PRIV_VFS_MOUNT_NONUSER:
3721 	case PRIV_VFS_MOUNT_OWNER:
3722 		pr = cred->cr_prison;
3723 		prison_lock(pr);
3724 		if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
3725 			error = 0;
3726 		else
3727 			error = EPERM;
3728 		prison_unlock(pr);
3729 		return (error);
3730 
3731 		/*
3732 		 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
3733 		 * policy.  priv_check_cred will not specifically allow it, and
3734 		 * we may want a MAC policy to allow it.
3735 		 */
3736 	case PRIV_VFS_READ_DIR:
3737 		return (0);
3738 
3739 		/*
3740 		 * Conditionnaly allow locking (unlocking) physical pages
3741 		 * in memory.
3742 		 */
3743 	case PRIV_VM_MLOCK:
3744 	case PRIV_VM_MUNLOCK:
3745 		if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
3746 			return (0);
3747 		else
3748 			return (EPERM);
3749 
3750 		/*
3751 		 * Conditionally allow jailed root to bind reserved ports.
3752 		 */
3753 	case PRIV_NETINET_RESERVEDPORT:
3754 		if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
3755 			return (0);
3756 		else
3757 			return (EPERM);
3758 
3759 		/*
3760 		 * Allow jailed root to reuse in-use ports.
3761 		 */
3762 	case PRIV_NETINET_REUSEPORT:
3763 		return (0);
3764 
3765 		/*
3766 		 * Allow jailed root to set certain IPv4/6 (option) headers.
3767 		 */
3768 	case PRIV_NETINET_SETHDROPTS:
3769 		return (0);
3770 
3771 		/*
3772 		 * Conditionally allow creating raw sockets in jail.
3773 		 */
3774 	case PRIV_NETINET_RAW:
3775 		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
3776 			return (0);
3777 		else
3778 			return (EPERM);
3779 
3780 		/*
3781 		 * Since jail implements its own visibility limits on netstat
3782 		 * sysctls, allow getcred.  This allows identd to work in
3783 		 * jail.
3784 		 */
3785 	case PRIV_NETINET_GETCRED:
3786 		return (0);
3787 
3788 		/*
3789 		 * Allow jailed root to set loginclass.
3790 		 */
3791 	case PRIV_PROC_SETLOGINCLASS:
3792 		return (0);
3793 
3794 		/*
3795 		 * Do not allow a process inside a jail to read the kernel
3796 		 * message buffer unless explicitly permitted.
3797 		 */
3798 	case PRIV_MSGBUF:
3799 		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
3800 			return (0);
3801 		return (EPERM);
3802 
3803 	default:
3804 		/*
3805 		 * In all remaining cases, deny the privilege request.  This
3806 		 * includes almost all network privileges, many system
3807 		 * configuration privileges.
3808 		 */
3809 		return (EPERM);
3810 	}
3811 }
3812 
3813 /*
3814  * Return the part of pr2's name that is relative to pr1, or the whole name
3815  * if it does not directly follow.
3816  */
3817 
3818 char *
prison_name(struct prison * pr1,struct prison * pr2)3819 prison_name(struct prison *pr1, struct prison *pr2)
3820 {
3821 	char *name;
3822 
3823 	/* Jails see themselves as "0" (if they see themselves at all). */
3824 	if (pr1 == pr2)
3825 		return "0";
3826 	name = pr2->pr_name;
3827 	if (prison_ischild(pr1, pr2)) {
3828 		/*
3829 		 * pr1 isn't locked (and allprison_lock may not be either)
3830 		 * so its length can't be counted on.  But the number of dots
3831 		 * can be counted on - and counted.
3832 		 */
3833 		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3834 			name = strchr(name, '.') + 1;
3835 	}
3836 	return (name);
3837 }
3838 
3839 /*
3840  * Return the part of pr2's path that is relative to pr1, or the whole path
3841  * if it does not directly follow.
3842  */
3843 static char *
prison_path(struct prison * pr1,struct prison * pr2)3844 prison_path(struct prison *pr1, struct prison *pr2)
3845 {
3846 	char *path1, *path2;
3847 	int len1;
3848 
3849 	path1 = pr1->pr_path;
3850 	path2 = pr2->pr_path;
3851 	if (!strcmp(path1, "/"))
3852 		return (path2);
3853 	len1 = strlen(path1);
3854 	if (strncmp(path1, path2, len1))
3855 		return (path2);
3856 	if (path2[len1] == '\0')
3857 		return "/";
3858 	if (path2[len1] == '/')
3859 		return (path2 + len1);
3860 	return (path2);
3861 }
3862 
3863 /*
3864  * Jail-related sysctls.
3865  */
3866 static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
3867     "Jails");
3868 
3869 static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)3870 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
3871 {
3872 	struct xprison *xp;
3873 	struct prison *pr, *cpr;
3874 #ifdef INET
3875 	struct in_addr *ip4 = NULL;
3876 	int ip4s = 0;
3877 #endif
3878 #ifdef INET6
3879 	struct in6_addr *ip6 = NULL;
3880 	int ip6s = 0;
3881 #endif
3882 	int descend, error;
3883 
3884 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
3885 	pr = req->td->td_ucred->cr_prison;
3886 	error = 0;
3887 	sx_slock(&allprison_lock);
3888 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
3889 #if defined(INET) || defined(INET6)
3890  again:
3891 #endif
3892 		mtx_lock(&cpr->pr_mtx);
3893 #ifdef INET
3894 		if (cpr->pr_ip4s > 0) {
3895 			if (ip4s < cpr->pr_ip4s) {
3896 				ip4s = cpr->pr_ip4s;
3897 				mtx_unlock(&cpr->pr_mtx);
3898 				ip4 = realloc(ip4, ip4s *
3899 				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
3900 				goto again;
3901 			}
3902 			bcopy(cpr->pr_ip4, ip4,
3903 			    cpr->pr_ip4s * sizeof(struct in_addr));
3904 		}
3905 #endif
3906 #ifdef INET6
3907 		if (cpr->pr_ip6s > 0) {
3908 			if (ip6s < cpr->pr_ip6s) {
3909 				ip6s = cpr->pr_ip6s;
3910 				mtx_unlock(&cpr->pr_mtx);
3911 				ip6 = realloc(ip6, ip6s *
3912 				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
3913 				goto again;
3914 			}
3915 			bcopy(cpr->pr_ip6, ip6,
3916 			    cpr->pr_ip6s * sizeof(struct in6_addr));
3917 		}
3918 #endif
3919 		bzero(xp, sizeof(*xp));
3920 		xp->pr_version = XPRISON_VERSION;
3921 		xp->pr_id = cpr->pr_id;
3922 		xp->pr_state = cpr->pr_state;
3923 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
3924 		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
3925 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
3926 #ifdef INET
3927 		xp->pr_ip4s = cpr->pr_ip4s;
3928 #endif
3929 #ifdef INET6
3930 		xp->pr_ip6s = cpr->pr_ip6s;
3931 #endif
3932 		mtx_unlock(&cpr->pr_mtx);
3933 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
3934 		if (error)
3935 			break;
3936 #ifdef INET
3937 		if (xp->pr_ip4s > 0) {
3938 			error = SYSCTL_OUT(req, ip4,
3939 			    xp->pr_ip4s * sizeof(struct in_addr));
3940 			if (error)
3941 				break;
3942 		}
3943 #endif
3944 #ifdef INET6
3945 		if (xp->pr_ip6s > 0) {
3946 			error = SYSCTL_OUT(req, ip6,
3947 			    xp->pr_ip6s * sizeof(struct in6_addr));
3948 			if (error)
3949 				break;
3950 		}
3951 #endif
3952 	}
3953 	sx_sunlock(&allprison_lock);
3954 	free(xp, M_TEMP);
3955 #ifdef INET
3956 	free(ip4, M_TEMP);
3957 #endif
3958 #ifdef INET6
3959 	free(ip6, M_TEMP);
3960 #endif
3961 	return (error);
3962 }
3963 
3964 SYSCTL_OID(_security_jail, OID_AUTO, list,
3965     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3966     sysctl_jail_list, "S", "List of active jails");
3967 
3968 static int
sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)3969 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
3970 {
3971 	int error, injail;
3972 
3973 	injail = jailed(req->td->td_ucred);
3974 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
3975 
3976 	return (error);
3977 }
3978 
3979 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
3980     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3981     sysctl_jail_jailed, "I", "Process in jail?");
3982 
3983 static int
sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)3984 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
3985 {
3986 	int error, havevnet;
3987 #ifdef VIMAGE
3988 	struct ucred *cred = req->td->td_ucred;
3989 
3990 	havevnet = jailed(cred) && prison_owns_vnet(cred);
3991 #else
3992 	havevnet = 0;
3993 #endif
3994 	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
3995 
3996 	return (error);
3997 }
3998 
3999 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
4000     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4001     sysctl_jail_vnet, "I", "Jail owns vnet?");
4002 
#if defined(INET) || defined(INET6)
/* Read-write knob backed directly by the jail_max_af_ips variable. */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips,
    CTLFLAG_RW, &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
4008 
4009 /*
4010  * Default parameters for jail(2) compatibility.  For historical reasons,
4011  * the sysctl names have varying similarity to the parameter names.  Prisons
4012  * just see their own parameters, and can't change them.
4013  */
4014 static int
sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)4015 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4016 {
4017 	int error, i;
4018 
4019 	/* Get the current flag value, and convert it to a boolean. */
4020 	if (req->td->td_ucred->cr_prison == &prison0) {
4021 		mtx_lock(&prison0.pr_mtx);
4022 		i = (jail_default_allow & arg2) != 0;
4023 		mtx_unlock(&prison0.pr_mtx);
4024 	} else
4025 		i = prison_allow(req->td->td_ucred, arg2);
4026 
4027 	if (arg1 != NULL)
4028 		i = !i;
4029 	error = sysctl_handle_int(oidp, &i, 0, req);
4030 	if (error || !req->newptr)
4031 		return (error);
4032 	i = i ? arg2 : 0;
4033 	if (arg1 != NULL)
4034 		i ^= arg2;
4035 	/*
4036 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4037 	 * for writing.
4038 	 */
4039 	mtx_lock(&prison0.pr_mtx);
4040 	jail_default_allow = (jail_default_allow & ~arg2) | i;
4041 	mtx_unlock(&prison0.pr_mtx);
4042 	return (0);
4043 }
4044 
4045 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
4046     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4047     NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
4048     "Processes in jail can set their hostnames (deprecated)");
4049 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
4050     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4051     (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
4052     "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
4053 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
4054     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4055     NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
4056     "Processes in jail can use System V IPC primitives (deprecated)");
4057 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
4058     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4059     NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
4060     "Prison root can create raw sockets (deprecated)");
4061 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
4062     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4063     NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
4064     "Processes in jail can alter system file flags (deprecated)");
4065 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
4066     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4067     NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
4068     "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
4069 SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed,
4070     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4071     NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I",
4072     "Processes in jail can lock/unlock physical pages in memory");
4073 
4074 static int
sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)4075 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4076 {
4077 	struct prison *pr;
4078 	int level, error;
4079 
4080 	pr = req->td->td_ucred->cr_prison;
4081 	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4082 	error = sysctl_handle_int(oidp, &level, 0, req);
4083 	if (error || !req->newptr)
4084 		return (error);
4085 	*(int *)arg1 = level;
4086 	return (0);
4087 }
4088 
4089 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
4090     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4091     &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
4092     sysctl_jail_default_level, "I",
4093     "Processes in jail cannot see all mounted file systems (deprecated)");
4094 
4095 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
4096     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
4097     &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
4098     sysctl_jail_default_level, "I",
4099     "Ruleset for the devfs filesystem in jail (deprecated)");
4100 
4101 /*
4102  * Nodes to describe jail parameters.  Maximum length of string parameters
4103  * is returned in the string itself, and the other parameters exist merely
4104  * to make themselves and their types known.
4105  */
4106 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4107     "Jail parameters");
4108 
4109 int
sysctl_jail_param(SYSCTL_HANDLER_ARGS)4110 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4111 {
4112 	int i;
4113 	long l;
4114 	size_t s;
4115 	char numbuf[12];
4116 
4117 	switch (oidp->oid_kind & CTLTYPE)
4118 	{
4119 	case CTLTYPE_LONG:
4120 	case CTLTYPE_ULONG:
4121 		l = 0;
4122 #ifdef SCTL_MASK32
4123 		if (!(req->flags & SCTL_MASK32))
4124 #endif
4125 			return (SYSCTL_OUT(req, &l, sizeof(l)));
4126 	case CTLTYPE_INT:
4127 	case CTLTYPE_UINT:
4128 		i = 0;
4129 		return (SYSCTL_OUT(req, &i, sizeof(i)));
4130 	case CTLTYPE_STRING:
4131 		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4132 		return
4133 		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4134 	case CTLTYPE_STRUCT:
4135 		s = (size_t)arg2;
4136 		return (SYSCTL_OUT(req, &s, sizeof(s)));
4137 	}
4138 	return (0);
4139 }
4140 
4141 /*
4142  * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
4143  * jail creation time but cannot be changed in an existing jail.
4144  */
4145 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
4146 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
4147 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
4148 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
4149 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
4150     "I", "Jail secure level");
4151 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
4152     "Jail value for kern.osreldate and uname -K");
4153 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
4154     "Jail value for kern.osrelease and uname -r");
4155 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
4156     "I", "Jail cannot see all mounted file systems");
4157 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
4158     "I", "Ruleset for in-jail devfs mounts");
4159 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
4160     "B", "Jail persistence");
4161 #ifdef VIMAGE
4162 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
4163     "E,jailsys", "Virtual network stack");
4164 #endif
4165 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
4166     "B", "Jail is in the process of shutting down");
4167 
4168 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
4169 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
4170     "I", "Current number of child jails");
4171 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
4172     "I", "Maximum number of child jails");
4173 
4174 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
4175 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
4176     "Jail hostname");
4177 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
4178     "Jail NIS domainname");
4179 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
4180     "Jail host UUID");
4181 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
4182     "LU", "Jail host ID");
4183 
4184 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
4185 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4186 
#ifdef INET
/* ip4.*: only declared in kernels built with INET. */
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv4 source address selection rather than the "
    "primary jail IPv4 address.");
#endif
#ifdef INET6
/* ip6.*: only declared in kernels built with INET6. */
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv6 source address selection rather than the "
    "primary jail IPv6 address.");
#endif
4205 
4206 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
4207 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
4208     "B", "Jail may set hostname");
4209 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
4210     "B", "Jail may use SYSV IPC");
4211 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
4212     "B", "Jail may create raw sockets");
4213 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
4214     "B", "Jail may alter system file flags");
4215 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
4216     "B", "Jail may set file quotas");
4217 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
4218     "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4219 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
4220     "B", "Jail may lock (unlock) physical pages in memory");
4221 SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
4222     "B", "Jail may bind sockets to reserved ports");
4223 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
4224     "B", "Jail may read the kernel message buffer");
4225 SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
4226     "B", "Unprivileged processes may use process debugging facilities");
4227 SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
4228     "B", "Processes in jail with uid 0 have privilege");
4229 #ifdef VIMAGE
4230 SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW,
4231     "B", "Mountd/nfsd may run in the jail");
4232 #endif
4233 
4234 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
4235 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
4236     "B", "Jail may mount/unmount jail-friendly file systems in general");
4237 
4238 /*
4239  * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
4240  * its associated bit in the pr_allow bitmask, or zero if the parameter was
4241  * not created.
4242  */
4243 unsigned
prison_add_allow(const char * prefix,const char * name,const char * prefix_descr,const char * descr)4244 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
4245     const char *descr)
4246 {
4247 	struct bool_flags *bf;
4248 	struct sysctl_oid *parent;
4249 	char *allow_name, *allow_noname, *allowed;
4250 #ifndef NO_SYSCTL_DESCR
4251 	char *descr_deprecated;
4252 #endif
4253 	u_int allow_flag;
4254 
4255 	if (prefix
4256 	    ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
4257 		< 0 ||
4258 	      asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
4259 		< 0
4260 	    : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
4261 	      asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
4262 		free(allow_name, M_PRISON);
4263 		return 0;
4264 	}
4265 
4266 	/*
4267 	 * See if this parameter has already beed added, i.e. a module was
4268 	 * previously loaded/unloaded.
4269 	 */
4270 	mtx_lock(&prison0.pr_mtx);
4271 	for (bf = pr_flag_allow;
4272 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
4273 		atomic_load_int(&bf->flag) != 0;
4274 	     bf++) {
4275 		if (strcmp(bf->name, allow_name) == 0) {
4276 			allow_flag = bf->flag;
4277 			goto no_add;
4278 		}
4279 	}
4280 
4281 	/*
4282 	 * Find a free bit in pr_allow_all, failing if there are none
4283 	 * (which shouldn't happen as long as we keep track of how many
4284 	 * potential dynamic flags exist).
4285 	 */
4286 	for (allow_flag = 1;; allow_flag <<= 1) {
4287 		if (allow_flag == 0)
4288 			goto no_add;
4289 		if ((pr_allow_all & allow_flag) == 0)
4290 			break;
4291 	}
4292 
4293 	/* Note the parameter in the next open slot in pr_flag_allow. */
4294 	for (bf = pr_flag_allow; ; bf++) {
4295 		if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
4296 			/* This should never happen, but is not fatal. */
4297 			allow_flag = 0;
4298 			goto no_add;
4299 		}
4300 		if (atomic_load_int(&bf->flag) == 0)
4301 			break;
4302 	}
4303 	bf->name = allow_name;
4304 	bf->noname = allow_noname;
4305 	pr_allow_all |= allow_flag;
4306 	/*
4307 	 * prison0 always has permission for the new parameter.
4308 	 * Other jails must have it granted to them.
4309 	 */
4310 	prison0.pr_allow |= allow_flag;
4311 	/* The flag indicates a valid entry, so make sure it is set last. */
4312 	atomic_store_rel_int(&bf->flag, allow_flag);
4313 	mtx_unlock(&prison0.pr_mtx);
4314 
4315 	/*
4316 	 * Create sysctls for the parameter, and the back-compat global
4317 	 * permission.
4318 	 */
4319 	parent = prefix
4320 	    ? SYSCTL_ADD_NODE(NULL,
4321 		  SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
4322 		  OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
4323 	    : &sysctl___security_jail_param_allow;
4324 	(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
4325 	    name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4326 	    NULL, 0, sysctl_jail_param, "B", descr);
4327 	if ((prefix
4328 	     ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
4329 	     : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
4330 #ifndef NO_SYSCTL_DESCR
4331 		(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
4332 		    descr);
4333 #endif
4334 		(void)SYSCTL_ADD_PROC(NULL,
4335 		    SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
4336 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
4337 		    sysctl_jail_default_allow, "I", descr_deprecated);
4338 #ifndef NO_SYSCTL_DESCR
4339 		free(descr_deprecated, M_TEMP);
4340 #endif
4341 		free(allowed, M_TEMP);
4342 	}
4343 	return allow_flag;
4344 
4345  no_add:
4346 	mtx_unlock(&prison0.pr_mtx);
4347 	free(allow_name, M_PRISON);
4348 	free(allow_noname, M_PRISON);
4349 	return allow_flag;
4350 }
4351 
4352 /*
4353  * The VFS system will register jail-aware filesystems here.  They each get
4354  * a parameter allow.mount.xxxfs and a flag to check when a jailed user
4355  * attempts to mount.
4356  */
4357 void
prison_add_vfs(struct vfsconf * vfsp)4358 prison_add_vfs(struct vfsconf *vfsp)
4359 {
4360 #ifdef NO_SYSCTL_DESCR
4361 
4362 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4363 	    NULL, NULL);
4364 #else
4365 	char *descr;
4366 
4367 	(void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
4368 	    vfsp->vfc_name);
4369 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4370 	    NULL, descr);
4371 	free(descr, M_TEMP);
4372 #endif
4373 }
4374 
4375 #ifdef RACCT
4376 void
prison_racct_foreach(void (* callback)(struct racct * racct,void * arg2,void * arg3),void (* pre)(void),void (* post)(void),void * arg2,void * arg3)4377 prison_racct_foreach(void (*callback)(struct racct *racct,
4378     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
4379     void *arg2, void *arg3)
4380 {
4381 	struct prison_racct *prr;
4382 
4383 	ASSERT_RACCT_ENABLED();
4384 
4385 	sx_slock(&allprison_lock);
4386 	if (pre != NULL)
4387 		(pre)();
4388 	LIST_FOREACH(prr, &allprison_racct, prr_next)
4389 		(callback)(prr->prr_racct, arg2, arg3);
4390 	if (post != NULL)
4391 		(post)();
4392 	sx_sunlock(&allprison_lock);
4393 }
4394 
4395 static struct prison_racct *
prison_racct_find_locked(const char * name)4396 prison_racct_find_locked(const char *name)
4397 {
4398 	struct prison_racct *prr;
4399 
4400 	ASSERT_RACCT_ENABLED();
4401 	sx_assert(&allprison_lock, SA_XLOCKED);
4402 
4403 	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4404 		return (NULL);
4405 
4406 	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4407 		if (strcmp(name, prr->prr_name) != 0)
4408 			continue;
4409 
4410 		/* Found prison_racct with a matching name? */
4411 		prison_racct_hold(prr);
4412 		return (prr);
4413 	}
4414 
4415 	/* Add new prison_racct. */
4416 	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4417 	racct_create(&prr->prr_racct);
4418 
4419 	strcpy(prr->prr_name, name);
4420 	refcount_init(&prr->prr_refcount, 1);
4421 	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4422 
4423 	return (prr);
4424 }
4425 
4426 struct prison_racct *
prison_racct_find(const char * name)4427 prison_racct_find(const char *name)
4428 {
4429 	struct prison_racct *prr;
4430 
4431 	ASSERT_RACCT_ENABLED();
4432 
4433 	sx_xlock(&allprison_lock);
4434 	prr = prison_racct_find_locked(name);
4435 	sx_xunlock(&allprison_lock);
4436 	return (prr);
4437 }
4438 
4439 void
prison_racct_hold(struct prison_racct * prr)4440 prison_racct_hold(struct prison_racct *prr)
4441 {
4442 
4443 	ASSERT_RACCT_ENABLED();
4444 
4445 	refcount_acquire(&prr->prr_refcount);
4446 }
4447 
4448 static void
prison_racct_free_locked(struct prison_racct * prr)4449 prison_racct_free_locked(struct prison_racct *prr)
4450 {
4451 
4452 	ASSERT_RACCT_ENABLED();
4453 	sx_assert(&allprison_lock, SA_XLOCKED);
4454 
4455 	if (refcount_release(&prr->prr_refcount)) {
4456 		racct_destroy(&prr->prr_racct);
4457 		LIST_REMOVE(prr, prr_next);
4458 		free(prr, M_PRISON_RACCT);
4459 	}
4460 }
4461 
4462 void
prison_racct_free(struct prison_racct * prr)4463 prison_racct_free(struct prison_racct *prr)
4464 {
4465 
4466 	ASSERT_RACCT_ENABLED();
4467 	sx_assert(&allprison_lock, SA_UNLOCKED);
4468 
4469 	if (refcount_release_if_not_last(&prr->prr_refcount))
4470 		return;
4471 
4472 	sx_xlock(&allprison_lock);
4473 	prison_racct_free_locked(prr);
4474 	sx_xunlock(&allprison_lock);
4475 }
4476 
4477 static void
prison_racct_attach(struct prison * pr)4478 prison_racct_attach(struct prison *pr)
4479 {
4480 	struct prison_racct *prr;
4481 
4482 	ASSERT_RACCT_ENABLED();
4483 	sx_assert(&allprison_lock, SA_XLOCKED);
4484 
4485 	prr = prison_racct_find_locked(pr->pr_name);
4486 	KASSERT(prr != NULL, ("cannot find prison_racct"));
4487 
4488 	pr->pr_prison_racct = prr;
4489 }
4490 
4491 /*
4492  * Handle jail renaming.  From the racct point of view, renaming means
4493  * moving from one prison_racct to another.
4494  */
4495 static void
prison_racct_modify(struct prison * pr)4496 prison_racct_modify(struct prison *pr)
4497 {
4498 #ifdef RCTL
4499 	struct proc *p;
4500 	struct ucred *cred;
4501 #endif
4502 	struct prison_racct *oldprr;
4503 
4504 	ASSERT_RACCT_ENABLED();
4505 
4506 	sx_slock(&allproc_lock);
4507 	sx_xlock(&allprison_lock);
4508 
4509 	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4510 		sx_xunlock(&allprison_lock);
4511 		sx_sunlock(&allproc_lock);
4512 		return;
4513 	}
4514 
4515 	oldprr = pr->pr_prison_racct;
4516 	pr->pr_prison_racct = NULL;
4517 
4518 	prison_racct_attach(pr);
4519 
4520 	/*
4521 	 * Move resource utilisation records.
4522 	 */
4523 	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4524 
4525 #ifdef RCTL
4526 	/*
4527 	 * Force rctl to reattach rules to processes.
4528 	 */
4529 	FOREACH_PROC_IN_SYSTEM(p) {
4530 		PROC_LOCK(p);
4531 		cred = crhold(p->p_ucred);
4532 		PROC_UNLOCK(p);
4533 		rctl_proc_ucred_changed(p, cred);
4534 		crfree(cred);
4535 	}
4536 #endif
4537 
4538 	sx_sunlock(&allproc_lock);
4539 	prison_racct_free_locked(oldprr);
4540 	sx_xunlock(&allprison_lock);
4541 }
4542 
4543 static void
prison_racct_detach(struct prison * pr)4544 prison_racct_detach(struct prison *pr)
4545 {
4546 
4547 	ASSERT_RACCT_ENABLED();
4548 	sx_assert(&allprison_lock, SA_UNLOCKED);
4549 
4550 	if (pr->pr_prison_racct == NULL)
4551 		return;
4552 	prison_racct_free(pr->pr_prison_racct);
4553 	pr->pr_prison_racct = NULL;
4554 }
4555 #endif /* RACCT */
4556 
4557 #ifdef DDB
4558 
4559 static void
db_show_prison(struct prison * pr)4560 db_show_prison(struct prison *pr)
4561 {
4562 	struct bool_flags *bf;
4563 	struct jailsys_flags *jsf;
4564 #if defined(INET) || defined(INET6)
4565 	int ii;
4566 #endif
4567 	unsigned f;
4568 #ifdef INET
4569 	char ip4buf[INET_ADDRSTRLEN];
4570 #endif
4571 #ifdef INET6
4572 	char ip6buf[INET6_ADDRSTRLEN];
4573 #endif
4574 
4575 	db_printf("prison %p:\n", pr);
4576 	db_printf(" jid             = %d\n", pr->pr_id);
4577 	db_printf(" name            = %s\n", pr->pr_name);
4578 	db_printf(" parent          = %p\n", pr->pr_parent);
4579 	db_printf(" ref             = %d\n", pr->pr_ref);
4580 	db_printf(" uref            = %d\n", pr->pr_uref);
4581 	db_printf(" state           = %s\n",
4582 	    pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
4583 	    pr->pr_state == PRISON_STATE_DYING ? "dying" :
4584 	    "invalid");
4585 	db_printf(" path            = %s\n", pr->pr_path);
4586 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4587 	    ? pr->pr_cpuset->cs_id : -1);
4588 #ifdef VIMAGE
4589 	db_printf(" vnet            = %p\n", pr->pr_vnet);
4590 #endif
4591 	db_printf(" root            = %p\n", pr->pr_root);
4592 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4593 	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
4594 	db_printf(" children.max    = %d\n", pr->pr_childmax);
4595 	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4596 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4597 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4598 	db_printf(" flags           = 0x%x", pr->pr_flags);
4599 	for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
4600 		if (pr->pr_flags & bf->flag)
4601 			db_printf(" %s", bf->name);
4602 	for (jsf = pr_flag_jailsys;
4603 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
4604 	     jsf++) {
4605 		f = pr->pr_flags & (jsf->disable | jsf->new);
4606 		db_printf(" %-16s= %s\n", jsf->name,
4607 		    (f != 0 && f == jsf->disable) ? "disable"
4608 		    : (f == jsf->new) ? "new"
4609 		    : "inherit");
4610 	}
4611 	db_printf(" allow           = 0x%x", pr->pr_allow);
4612 	for (bf = pr_flag_allow;
4613 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
4614 		atomic_load_int(&bf->flag) != 0;
4615 	     bf++)
4616 		if (pr->pr_allow & bf->flag)
4617 			db_printf(" %s", bf->name);
4618 	db_printf("\n");
4619 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4620 	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4621 	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4622 	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4623 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4624 #ifdef INET
4625 	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4626 	for (ii = 0; ii < pr->pr_ip4s; ii++)
4627 		db_printf(" %s %s\n",
4628 		    ii == 0 ? "ip4.addr        =" : "                 ",
4629 		    inet_ntoa_r(pr->pr_ip4[ii], ip4buf));
4630 #endif
4631 #ifdef INET6
4632 	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4633 	for (ii = 0; ii < pr->pr_ip6s; ii++)
4634 		db_printf(" %s %s\n",
4635 		    ii == 0 ? "ip6.addr        =" : "                 ",
4636 		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4637 #endif
4638 }
4639 
DB_SHOW_COMMAND(prison,db_show_prison_command)4640 DB_SHOW_COMMAND(prison, db_show_prison_command)
4641 {
4642 	struct prison *pr;
4643 
4644 	if (!have_addr) {
4645 		/*
4646 		 * Show all prisons in the list, and prison0 which is not
4647 		 * listed.
4648 		 */
4649 		db_show_prison(&prison0);
4650 		if (!db_pager_quit) {
4651 			TAILQ_FOREACH(pr, &allprison, pr_list) {
4652 				db_show_prison(pr);
4653 				if (db_pager_quit)
4654 					break;
4655 			}
4656 		}
4657 		return;
4658 	}
4659 
4660 	if (addr == 0)
4661 		pr = &prison0;
4662 	else {
4663 		/* Look for a prison with the ID and with references. */
4664 		TAILQ_FOREACH(pr, &allprison, pr_list)
4665 			if (pr->pr_id == addr && pr->pr_ref > 0)
4666 				break;
4667 		if (pr == NULL)
4668 			/* Look again, without requiring a reference. */
4669 			TAILQ_FOREACH(pr, &allprison, pr_list)
4670 				if (pr->pr_id == addr)
4671 					break;
4672 		if (pr == NULL)
4673 			/* Assume address points to a valid prison. */
4674 			pr = (struct prison *)addr;
4675 	}
4676 	db_show_prison(pr);
4677 }
4678 
4679 #endif /* DDB */
4680