1 /*-
2  * Copyright (c) 1999 Poul-Henning Kamp.
3  * Copyright (c) 2008 Bjoern A. Zeeb.
4  * Copyright (c) 2009 James Gritton.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD: stable/10/sys/kern/kern_jail.c 339410 2018-10-17 16:17:56Z jamie $");
31 
32 #include "opt_compat.h"
33 #include "opt_ddb.h"
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/kernel.h>
40 #include <sys/systm.h>
41 #include <sys/errno.h>
42 #include <sys/sysproto.h>
43 #include <sys/malloc.h>
44 #include <sys/osd.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/taskqueue.h>
48 #include <sys/fcntl.h>
49 #include <sys/jail.h>
50 #include <sys/lock.h>
51 #include <sys/mutex.h>
52 #include <sys/racct.h>
53 #include <sys/refcount.h>
54 #include <sys/sx.h>
55 #include <sys/sysent.h>
56 #include <sys/namei.h>
57 #include <sys/mount.h>
58 #include <sys/queue.h>
59 #include <sys/socket.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/sysctl.h>
62 #include <sys/vnode.h>
63 
64 #include <net/if.h>
65 #include <net/vnet.h>
66 
67 #include <netinet/in.h>
68 
69 #ifdef DDB
70 #include <ddb/ddb.h>
71 #ifdef INET6
72 #include <netinet6/in6_var.h>
73 #endif /* INET6 */
74 #endif /* DDB */
75 
76 #include <security/mac/mac_framework.h>
77 
78 #define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
79 
80 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
81 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
82 
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the union of the source-address-selection flags of
 * every address family compiled into the kernel, or 0 when neither INET
 * nor INET6 is configured.  The replacement list is parenthesized so that
 * the macro behaves as a single operand next to operators that bind more
 * tightly than '|' (e.g. "flags & _PR_IP_SADDRSEL" or "~_PR_IP_SADDRSEL").
 */
#if defined(INET) && defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#elif defined(INET)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#elif defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
97 
/*
 * prison0 describes what is "real" about the system.
 *
 * Only the members that can be set from compile-time constants are
 * initialized here; pr_cpuset, pr_osreldate and pr_osrelease are filled in
 * at run time by prison0_init().
 */
struct prison prison0 = {
	.pr_id		= 0,
	.pr_name	= "0",
	.pr_ref		= 1,
	.pr_uref	= 1,
	.pr_path	= "/",
	.pr_securelevel	= -1,
	.pr_devfs_rsnum = 0,
	.pr_childmax	= JAIL_MAX,
	.pr_hostuuid	= DEFAULT_HOSTUUID,
	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
	/* PR_VNET is only part of the host's flags on VIMAGE kernels. */
#ifdef VIMAGE
	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
#else
	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
#endif
	.pr_allow	= PR_ALLOW_ALL,
};
/* Arrange for prison0's mutex to be initialized as an MTX_DEF mutex. */
MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
118 
/*
 * allprison, allprison_racct and lastprid are protected by allprison_lock.
 *
 * allprison is a tail queue of prisons and allprison_racct a list of
 * prison_racct structures.
 * NOTE(review): lastprid is presumably the most recently assigned jail ID;
 * its users are outside this part of the file -- confirm.
 */
struct	sx allprison_lock;
SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
LIST_HEAD(, prison_racct) allprison_racct;
int	lastprid = 0;
125 
126 static int do_jail_attach(struct thread *td, struct prison *pr);
127 static void prison_complete(void *context, int pending);
128 static void prison_deref(struct prison *pr, int flags);
129 static char *prison_path(struct prison *pr1, struct prison *pr2);
130 static void prison_remove_one(struct prison *pr);
131 #ifdef RACCT
132 static void prison_racct_attach(struct prison *pr);
133 static void prison_racct_modify(struct prison *pr);
134 static void prison_racct_detach(struct prison *pr);
135 #endif
136 #ifdef INET
137 static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
138 static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
139 #endif
140 #ifdef INET6
141 static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
142 static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
143 #endif
144 
145 /* Flags for prison_deref */
146 #define	PD_DEREF	0x01
147 #define	PD_DEUREF	0x02
148 #define	PD_LOCKED	0x04
149 #define	PD_LIST_SLOCKED	0x08
150 #define	PD_LIST_XLOCKED	0x10
151 
152 /*
153  * Parameter names corresponding to PR_* flag values.  Size values are for kvm
154  * as we cannot figure out the size of a sparse array, or an array without a
155  * terminating entry.
156  */
/*
 * Parameter names that turn a PR_* flag on.  The array index is the bit
 * number of the flag: kern_jail_set() maps entry fi to the mask (1 << fi).
 * Indices without an initializer are NULL and are skipped by that loop.
 */
static char *pr_flag_names[] = {
	[0] = "persist",
#ifdef INET
	[7] = "ip4.saddrsel",
#endif
#ifdef INET6
	[8] = "ip6.saddrsel",
#endif
};
/* Size of pr_flag_names in bytes (not its number of elements). */
const size_t pr_flag_names_size = sizeof(pr_flag_names);

/*
 * Negated ("no...") spellings of the names in pr_flag_names.  This table
 * must use exactly the same indices as pr_flag_names, because both are
 * walked with a single index in kern_jail_set().
 */
static char *pr_flag_nonames[] = {
	[0] = "nopersist",
#ifdef INET
	[7] = "ip4.nosaddrsel",
#endif
#ifdef INET6
	[8] = "ip6.nosaddrsel",
#endif
};
/* Size of pr_flag_nonames in bytes (not its number of elements). */
const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
178 
/*
 * Parameters that take a JAIL_SYS_* value (disable/new/inherit) rather than
 * a boolean.  kern_jail_set() looks each entry up by name and translates
 * the requested JAIL_SYS_* value into PR_* flag bits.
 */
struct jailsys_flags {
	const char	*name;		/* parameter name to look up */
	/*
	 * Flag bits set for JAIL_SYS_DISABLE; 0 means the parameter cannot
	 * be disabled and such a request is rejected with EINVAL.
	 */
	unsigned	 disable;
	unsigned	 new;		/* flag bits set for JAIL_SYS_NEW */
} pr_flag_jailsys[] = {
	{ "host", 0, PR_HOST },
#ifdef VIMAGE
	{ "vnet", 0, PR_VNET },
#endif
#ifdef INET
	{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
#endif
#ifdef INET6
	{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
#endif
};
/* Size of pr_flag_jailsys in bytes (not its number of elements). */
const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
196 
/*
 * Names of the "allow.*" permission parameters.  The position of a name in
 * this array is the bit number of the permission it controls: entry fi is
 * paired with the mask (1 << fi) by kern_jail() and kern_jail_set().  The
 * order is therefore significant and must not be changed casually.
 */
static char *pr_allow_names[] = {
	"allow.set_hostname",
	"allow.sysvipc",
	"allow.raw_sockets",
	"allow.chflags",
	"allow.mount",
	"allow.quotas",
	"allow.socket_af",
	"allow.mount.devfs",
	"allow.mount.nullfs",
	"allow.mount.zfs",
	"allow.mount.procfs",
	"allow.mount.tmpfs",
	"allow.mount.fdescfs",
	"allow.mount.linprocfs",
	"allow.mount.linsysfs",
};
/* Size of pr_allow_names in bytes (not its number of elements). */
const size_t pr_allow_names_size = sizeof(pr_allow_names);

/*
 * Negated ("no...") spellings of pr_allow_names.  Both tables are indexed
 * with the same value, so this one must have the same length and order.
 */
static char *pr_allow_nonames[] = {
	"allow.noset_hostname",
	"allow.nosysvipc",
	"allow.noraw_sockets",
	"allow.nochflags",
	"allow.nomount",
	"allow.noquotas",
	"allow.nosocket_af",
	"allow.mount.nodevfs",
	"allow.mount.nonullfs",
	"allow.mount.nozfs",
	"allow.mount.noprocfs",
	"allow.mount.notmpfs",
	"allow.mount.nofdescfs",
	"allow.mount.nolinprocfs",
	"allow.mount.nolinsysfs",
};
/* Size of pr_allow_nonames in bytes (not its number of elements). */
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
234 
/*
 * Defaults that kern_jail() applies to jails created through the legacy
 * jail(2) interface by a caller that is not itself jailed.
 * NOTE(review): a comment in kern_jail() says these come "from sysctls";
 * the sysctl declarations are not in this part of the file -- confirm.
 */
#define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
#define	JAIL_DEFAULT_ENFORCE_STATFS	2
#define	JAIL_DEFAULT_DEVFS_RSNUM	0
static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
#if defined(INET) || defined(INET6)
/* Upper bound on the number of addresses per address family for one jail. */
static unsigned jail_max_af_ips = 255;
#endif
244 
245 /*
246  * Initialize the parts of prison0 that can't be static-initialized with
247  * constants.  This is called from proc0_init() after creating thread0 cpuset.
248  */
249 void
prison0_init(void)250 prison0_init(void)
251 {
252 
253 	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
254 	prison0.pr_osreldate = osreldate;
255 	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
256 }
257 
258 #ifdef INET
/*
 * qsort() comparison callback for arrays of struct in_addr.
 *
 * The addresses are stored in network byte order but are ordered by their
 * host-byte-order value, so the resulting list is sorted numerically.
 * Returns -1, 0 or 1.
 */
static int
qcmp_v4(const void *ip1, const void *ip2)
{
	const struct in_addr *lhs = ip1;
	const struct in_addr *rhs = ip2;
	in_addr_t lval, rval;

	/* Convert to host byte order; comparing raw NBO values mis-sorts. */
	lval = ntohl(lhs->s_addr);
	rval = ntohl(rhs->s_addr);

	/*
	 * Compare explicitly instead of subtracting: the difference of two
	 * 32-bit unsigned values does not fit in an int.
	 */
	if (lval < rval)
		return (-1);
	if (lval > rval)
		return (1);
	return (0);
}
283 #endif
284 
285 #ifdef INET6
/*
 * qsort() comparison callback for arrays of struct in6_addr.
 *
 * Compares the two addresses byte by byte, most significant byte first,
 * and returns -1, 0 or 1 according to the first byte that differs.
 */
static int
qcmp_v6(const void *ip1, const void *ip2)
{
	const struct in6_addr *lhs = ip1;
	const struct in6_addr *rhs = ip2;
	size_t pos;

	for (pos = 0; pos < sizeof(struct in6_addr); pos++) {
		if (lhs->s6_addr[pos] == rhs->s6_addr[pos])
			continue;
		/* First differing byte decides the ordering. */
		return (lhs->s6_addr[pos] > rhs->s6_addr[pos] ? 1 : -1);
	}
	return (0);
}
304 #endif
305 
306 /*
307  * struct jail_args {
308  *	struct jail *jail;
309  * };
310  */
311 int
sys_jail(struct thread * td,struct jail_args * uap)312 sys_jail(struct thread *td, struct jail_args *uap)
313 {
314 	uint32_t version;
315 	int error;
316 	struct jail j;
317 
318 	error = copyin(uap->jail, &version, sizeof(uint32_t));
319 	if (error)
320 		return (error);
321 
322 	switch (version) {
323 	case 0:
324 	{
325 		struct jail_v0 j0;
326 
327 		/* FreeBSD single IPv4 jails. */
328 		bzero(&j, sizeof(struct jail));
329 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
330 		if (error)
331 			return (error);
332 		j.version = j0.version;
333 		j.path = j0.path;
334 		j.hostname = j0.hostname;
335 		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
336 		break;
337 	}
338 
339 	case 1:
340 		/*
341 		 * Version 1 was used by multi-IPv4 jail implementations
342 		 * that never made it into the official kernel.
343 		 */
344 		return (EINVAL);
345 
346 	case 2:	/* JAIL_API_VERSION */
347 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
348 		error = copyin(uap->jail, &j, sizeof(struct jail));
349 		if (error)
350 			return (error);
351 		break;
352 
353 	default:
354 		/* Sci-Fi jails are not supported, sorry. */
355 		return (EINVAL);
356 	}
357 	return (kern_jail(td, &j));
358 }
359 
360 int
kern_jail(struct thread * td,struct jail * j)361 kern_jail(struct thread *td, struct jail *j)
362 {
363 	struct iovec optiov[2 * (4
364 			    + sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
365 #ifdef INET
366 			    + 1
367 #endif
368 #ifdef INET6
369 			    + 1
370 #endif
371 			    )];
372 	struct uio opt;
373 	char *u_path, *u_hostname, *u_name;
374 #ifdef INET
375 	uint32_t ip4s;
376 	struct in_addr *u_ip4;
377 #endif
378 #ifdef INET6
379 	struct in6_addr *u_ip6;
380 #endif
381 	size_t tmplen;
382 	int error, enforce_statfs, fi;
383 
384 	bzero(&optiov, sizeof(optiov));
385 	opt.uio_iov = optiov;
386 	opt.uio_iovcnt = 0;
387 	opt.uio_offset = -1;
388 	opt.uio_resid = -1;
389 	opt.uio_segflg = UIO_SYSSPACE;
390 	opt.uio_rw = UIO_READ;
391 	opt.uio_td = td;
392 
393 	/* Set permissions for top-level jails from sysctls. */
394 	if (!jailed(td->td_ucred)) {
395 		for (fi = 0; fi < sizeof(pr_allow_names) /
396 		     sizeof(pr_allow_names[0]); fi++) {
397 			optiov[opt.uio_iovcnt].iov_base =
398 			    (jail_default_allow & (1 << fi))
399 			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
400 			optiov[opt.uio_iovcnt].iov_len =
401 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
402 			opt.uio_iovcnt += 2;
403 		}
404 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
405 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
406 		opt.uio_iovcnt++;
407 		enforce_statfs = jail_default_enforce_statfs;
408 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
409 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
410 		opt.uio_iovcnt++;
411 	}
412 
413 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
414 #ifdef INET
415 	ip4s = (j->version == 0) ? 1 : j->ip4s;
416 	if (ip4s > jail_max_af_ips)
417 		return (EINVAL);
418 	tmplen += ip4s * sizeof(struct in_addr);
419 #else
420 	if (j->ip4s > 0)
421 		return (EINVAL);
422 #endif
423 #ifdef INET6
424 	if (j->ip6s > jail_max_af_ips)
425 		return (EINVAL);
426 	tmplen += j->ip6s * sizeof(struct in6_addr);
427 #else
428 	if (j->ip6s > 0)
429 		return (EINVAL);
430 #endif
431 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
432 	u_hostname = u_path + MAXPATHLEN;
433 	u_name = u_hostname + MAXHOSTNAMELEN;
434 #ifdef INET
435 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
436 #endif
437 #ifdef INET6
438 #ifdef INET
439 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
440 #else
441 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
442 #endif
443 #endif
444 	optiov[opt.uio_iovcnt].iov_base = "path";
445 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
446 	opt.uio_iovcnt++;
447 	optiov[opt.uio_iovcnt].iov_base = u_path;
448 	error = copyinstr(j->path, u_path, MAXPATHLEN,
449 	    &optiov[opt.uio_iovcnt].iov_len);
450 	if (error) {
451 		free(u_path, M_TEMP);
452 		return (error);
453 	}
454 	opt.uio_iovcnt++;
455 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
456 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
457 	opt.uio_iovcnt++;
458 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
459 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
460 	    &optiov[opt.uio_iovcnt].iov_len);
461 	if (error) {
462 		free(u_path, M_TEMP);
463 		return (error);
464 	}
465 	opt.uio_iovcnt++;
466 	if (j->jailname != NULL) {
467 		optiov[opt.uio_iovcnt].iov_base = "name";
468 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
469 		opt.uio_iovcnt++;
470 		optiov[opt.uio_iovcnt].iov_base = u_name;
471 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
472 		    &optiov[opt.uio_iovcnt].iov_len);
473 		if (error) {
474 			free(u_path, M_TEMP);
475 			return (error);
476 		}
477 		opt.uio_iovcnt++;
478 	}
479 #ifdef INET
480 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
481 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
482 	opt.uio_iovcnt++;
483 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
484 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
485 	if (j->version == 0)
486 		u_ip4->s_addr = j->ip4s;
487 	else {
488 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
489 		if (error) {
490 			free(u_path, M_TEMP);
491 			return (error);
492 		}
493 	}
494 	opt.uio_iovcnt++;
495 #endif
496 #ifdef INET6
497 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
498 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
499 	opt.uio_iovcnt++;
500 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
501 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
502 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
503 	if (error) {
504 		free(u_path, M_TEMP);
505 		return (error);
506 	}
507 	opt.uio_iovcnt++;
508 #endif
509 	KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
510 	    ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
511 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
512 	free(u_path, M_TEMP);
513 	return (error);
514 }
515 
516 
517 /*
518  * struct jail_set_args {
519  *	struct iovec *iovp;
520  *	unsigned int iovcnt;
521  *	int flags;
522  * };
523  */
524 int
sys_jail_set(struct thread * td,struct jail_set_args * uap)525 sys_jail_set(struct thread *td, struct jail_set_args *uap)
526 {
527 	struct uio *auio;
528 	int error;
529 
530 	/* Check that we have an even number of iovecs. */
531 	if (uap->iovcnt & 1)
532 		return (EINVAL);
533 
534 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
535 	if (error)
536 		return (error);
537 	error = kern_jail_set(td, auio, uap->flags);
538 	free(auio, M_IOV);
539 	return (error);
540 }
541 
542 int
kern_jail_set(struct thread * td,struct uio * optuio,int flags)543 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
544 {
545 	struct nameidata nd;
546 #ifdef INET
547 	struct in_addr *ip4;
548 #endif
549 #ifdef INET6
550 	struct in6_addr *ip6;
551 #endif
552 	struct vfsopt *opt;
553 	struct vfsoptlist *opts;
554 	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
555 	struct vnode *root;
556 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
557 	char *g_path, *osrelstr;
558 #if defined(INET) || defined(INET6)
559 	struct prison *tppr;
560 	void *op;
561 #endif
562 	unsigned long hid;
563 	size_t namelen, onamelen, pnamelen;
564 	int born, created, cuflags, descend, enforce;
565 	int error, errmsg_len, errmsg_pos;
566 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
567 	int fi, jid, jsys, len, level;
568 	int childmax, osreldt, rsnum, slevel;
569 	int fullpath_disabled;
570 #if defined(INET) || defined(INET6)
571 	int ii, ij;
572 #endif
573 #ifdef INET
574 	int ip4s, redo_ip4;
575 #endif
576 #ifdef INET6
577 	int ip6s, redo_ip6;
578 #endif
579 	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
580 	unsigned tallow;
581 	char numbuf[12];
582 
583 	error = priv_check(td, PRIV_JAIL_SET);
584 	if (!error && (flags & JAIL_ATTACH))
585 		error = priv_check(td, PRIV_JAIL_ATTACH);
586 	if (error)
587 		return (error);
588 	mypr = td->td_ucred->cr_prison;
589 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
590 		return (EPERM);
591 	if (flags & ~JAIL_SET_MASK)
592 		return (EINVAL);
593 
594 	/*
595 	 * Check all the parameters before committing to anything.  Not all
596 	 * errors can be caught early, but we may as well try.  Also, this
597 	 * takes care of some expensive stuff (path lookup) before getting
598 	 * the allprison lock.
599 	 *
600 	 * XXX Jails are not filesystems, and jail parameters are not mount
601 	 *     options.  But it makes more sense to re-use the vfsopt code
602 	 *     than duplicate it under a different name.
603 	 */
604 	error = vfs_buildopts(optuio, &opts);
605 	if (error)
606 		return (error);
607 #ifdef INET
608 	ip4 = NULL;
609 #endif
610 #ifdef INET6
611 	ip6 = NULL;
612 #endif
613 	g_path = NULL;
614 
615 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
616 	if (!cuflags) {
617 		error = EINVAL;
618 		vfs_opterror(opts, "no valid operation (create or update)");
619 		goto done_errmsg;
620 	}
621 
622 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
623 	if (error == ENOENT)
624 		jid = 0;
625 	else if (error != 0)
626 		goto done_free;
627 
628 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
629 	if (error == ENOENT)
630 		gotslevel = 0;
631 	else if (error != 0)
632 		goto done_free;
633 	else
634 		gotslevel = 1;
635 
636 	error =
637 	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
638 	if (error == ENOENT)
639 		gotchildmax = 0;
640 	else if (error != 0)
641 		goto done_free;
642 	else
643 		gotchildmax = 1;
644 
645 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
646 	if (error == ENOENT)
647 		gotenforce = 0;
648 	else if (error != 0)
649 		goto done_free;
650 	else if (enforce < 0 || enforce > 2) {
651 		error = EINVAL;
652 		goto done_free;
653 	} else
654 		gotenforce = 1;
655 
656 	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
657 	if (error == ENOENT)
658 		gotrsnum = 0;
659 	else if (error != 0)
660 		goto done_free;
661 	else
662 		gotrsnum = 1;
663 
664 	pr_flags = ch_flags = 0;
665 	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
666 	    fi++) {
667 		if (pr_flag_names[fi] == NULL)
668 			continue;
669 		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
670 		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
671 	}
672 	ch_flags |= pr_flags;
673 	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
674 	    fi++) {
675 		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
676 		    sizeof(jsys));
677 		if (error == ENOENT)
678 			continue;
679 		if (error != 0)
680 			goto done_free;
681 		switch (jsys) {
682 		case JAIL_SYS_DISABLE:
683 			if (!pr_flag_jailsys[fi].disable) {
684 				error = EINVAL;
685 				goto done_free;
686 			}
687 			pr_flags |= pr_flag_jailsys[fi].disable;
688 			break;
689 		case JAIL_SYS_NEW:
690 			pr_flags |= pr_flag_jailsys[fi].new;
691 			break;
692 		case JAIL_SYS_INHERIT:
693 			break;
694 		default:
695 			error = EINVAL;
696 			goto done_free;
697 		}
698 		ch_flags |=
699 		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
700 	}
701 	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
702 	    && !(pr_flags & PR_PERSIST)) {
703 		error = EINVAL;
704 		vfs_opterror(opts, "new jail must persist or attach");
705 		goto done_errmsg;
706 	}
707 #ifdef VIMAGE
708 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
709 		error = EINVAL;
710 		vfs_opterror(opts, "vnet cannot be changed after creation");
711 		goto done_errmsg;
712 	}
713 #endif
714 #ifdef INET
715 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
716 		error = EINVAL;
717 		vfs_opterror(opts, "ip4 cannot be changed after creation");
718 		goto done_errmsg;
719 	}
720 #endif
721 #ifdef INET6
722 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
723 		error = EINVAL;
724 		vfs_opterror(opts, "ip6 cannot be changed after creation");
725 		goto done_errmsg;
726 	}
727 #endif
728 
729 	pr_allow = ch_allow = 0;
730 	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
731 	    fi++) {
732 		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
733 		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
734 	}
735 	ch_allow |= pr_allow;
736 
737 	error = vfs_getopt(opts, "name", (void **)&name, &len);
738 	if (error == ENOENT)
739 		name = NULL;
740 	else if (error != 0)
741 		goto done_free;
742 	else {
743 		if (len == 0 || name[len - 1] != '\0') {
744 			error = EINVAL;
745 			goto done_free;
746 		}
747 		if (len > MAXHOSTNAMELEN) {
748 			error = ENAMETOOLONG;
749 			goto done_free;
750 		}
751 	}
752 
753 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
754 	if (error == ENOENT)
755 		host = NULL;
756 	else if (error != 0)
757 		goto done_free;
758 	else {
759 		ch_flags |= PR_HOST;
760 		pr_flags |= PR_HOST;
761 		if (len == 0 || host[len - 1] != '\0') {
762 			error = EINVAL;
763 			goto done_free;
764 		}
765 		if (len > MAXHOSTNAMELEN) {
766 			error = ENAMETOOLONG;
767 			goto done_free;
768 		}
769 	}
770 
771 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
772 	if (error == ENOENT)
773 		domain = NULL;
774 	else if (error != 0)
775 		goto done_free;
776 	else {
777 		ch_flags |= PR_HOST;
778 		pr_flags |= PR_HOST;
779 		if (len == 0 || domain[len - 1] != '\0') {
780 			error = EINVAL;
781 			goto done_free;
782 		}
783 		if (len > MAXHOSTNAMELEN) {
784 			error = ENAMETOOLONG;
785 			goto done_free;
786 		}
787 	}
788 
789 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
790 	if (error == ENOENT)
791 		uuid = NULL;
792 	else if (error != 0)
793 		goto done_free;
794 	else {
795 		ch_flags |= PR_HOST;
796 		pr_flags |= PR_HOST;
797 		if (len == 0 || uuid[len - 1] != '\0') {
798 			error = EINVAL;
799 			goto done_free;
800 		}
801 		if (len > HOSTUUIDLEN) {
802 			error = ENAMETOOLONG;
803 			goto done_free;
804 		}
805 	}
806 
807 #ifdef COMPAT_FREEBSD32
808 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
809 		uint32_t hid32;
810 
811 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
812 		hid = hid32;
813 	} else
814 #endif
815 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
816 	if (error == ENOENT)
817 		gothid = 0;
818 	else if (error != 0)
819 		goto done_free;
820 	else {
821 		gothid = 1;
822 		ch_flags |= PR_HOST;
823 		pr_flags |= PR_HOST;
824 	}
825 
826 #ifdef INET
827 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
828 	if (error == ENOENT)
829 		ip4s = 0;
830 	else if (error != 0)
831 		goto done_free;
832 	else if (ip4s & (sizeof(*ip4) - 1)) {
833 		error = EINVAL;
834 		goto done_free;
835 	} else {
836 		ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
837 		if (ip4s == 0)
838 			pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
839 		else {
840 			pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
841 			ip4s /= sizeof(*ip4);
842 			if (ip4s > jail_max_af_ips) {
843 				error = EINVAL;
844 				vfs_opterror(opts, "too many IPv4 addresses");
845 				goto done_errmsg;
846 			}
847 			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
848 			bcopy(op, ip4, ip4s * sizeof(*ip4));
849 			/*
850 			 * IP addresses are all sorted but ip[0] to preserve
851 			 * the primary IP address as given from userland.
852 			 * This special IP is used for unbound outgoing
853 			 * connections as well for "loopback" traffic in case
854 			 * source address selection cannot find any more fitting
855 			 * address to connect from.
856 			 */
857 			if (ip4s > 1)
858 				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
859 			/*
860 			 * Check for duplicate addresses and do some simple
861 			 * zero and broadcast checks. If users give other bogus
862 			 * addresses it is their problem.
863 			 *
864 			 * We do not have to care about byte order for these
865 			 * checks so we will do them in NBO.
866 			 */
867 			for (ii = 0; ii < ip4s; ii++) {
868 				if (ip4[ii].s_addr == INADDR_ANY ||
869 				    ip4[ii].s_addr == INADDR_BROADCAST) {
870 					error = EINVAL;
871 					goto done_free;
872 				}
873 				if ((ii+1) < ip4s &&
874 				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
875 				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
876 					error = EINVAL;
877 					goto done_free;
878 				}
879 			}
880 		}
881 	}
882 #endif
883 
884 #ifdef INET6
885 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
886 	if (error == ENOENT)
887 		ip6s = 0;
888 	else if (error != 0)
889 		goto done_free;
890 	else if (ip6s & (sizeof(*ip6) - 1)) {
891 		error = EINVAL;
892 		goto done_free;
893 	} else {
894 		ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
895 		if (ip6s == 0)
896 			pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
897 		else {
898 			pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
899 			ip6s /= sizeof(*ip6);
900 			if (ip6s > jail_max_af_ips) {
901 				error = EINVAL;
902 				vfs_opterror(opts, "too many IPv6 addresses");
903 				goto done_errmsg;
904 			}
905 			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
906 			bcopy(op, ip6, ip6s * sizeof(*ip6));
907 			if (ip6s > 1)
908 				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
909 			for (ii = 0; ii < ip6s; ii++) {
910 				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
911 					error = EINVAL;
912 					goto done_free;
913 				}
914 				if ((ii+1) < ip6s &&
915 				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
916 				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
917 				{
918 					error = EINVAL;
919 					goto done_free;
920 				}
921 			}
922 		}
923 	}
924 #endif
925 
926 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
927 	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
928 		error = EINVAL;
929 		vfs_opterror(opts,
930 		    "vnet jails cannot have IP address restrictions");
931 		goto done_errmsg;
932 	}
933 #endif
934 
935 	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
936 	if (error == ENOENT)
937 		osrelstr = NULL;
938 	else if (error != 0)
939 		goto done_free;
940 	else {
941 		if (flags & JAIL_UPDATE) {
942 			error = EINVAL;
943 			vfs_opterror(opts,
944 			    "osrelease cannot be changed after creation");
945 			goto done_errmsg;
946 		}
947 		if (len == 0 || len >= OSRELEASELEN) {
948 			error = EINVAL;
949 			vfs_opterror(opts,
950 			    "osrelease string must be 1-%d bytes long",
951 			    OSRELEASELEN - 1);
952 			goto done_errmsg;
953 		}
954 	}
955 
956 	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
957 	if (error == ENOENT)
958 		osreldt = 0;
959 	else if (error != 0)
960 		goto done_free;
961 	else {
962 		if (flags & JAIL_UPDATE) {
963 			error = EINVAL;
964 			vfs_opterror(opts,
965 			    "osreldate cannot be changed after creation");
966 			goto done_errmsg;
967 		}
968 		if (osreldt == 0) {
969 			error = EINVAL;
970 			vfs_opterror(opts, "osreldate cannot be 0");
971 			goto done_errmsg;
972 		}
973 	}
974 
975 	fullpath_disabled = 0;
976 	root = NULL;
977 	error = vfs_getopt(opts, "path", (void **)&path, &len);
978 	if (error == ENOENT)
979 		path = NULL;
980 	else if (error != 0)
981 		goto done_free;
982 	else {
983 		if (flags & JAIL_UPDATE) {
984 			error = EINVAL;
985 			vfs_opterror(opts,
986 			    "path cannot be changed after creation");
987 			goto done_errmsg;
988 		}
989 		if (len == 0 || path[len - 1] != '\0') {
990 			error = EINVAL;
991 			goto done_free;
992 		}
993 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
994 		    path, td);
995 		error = namei(&nd);
996 		if (error)
997 			goto done_free;
998 		root = nd.ni_vp;
999 		NDFREE(&nd, NDF_ONLY_PNBUF);
1000 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1001 		strlcpy(g_path, path, MAXPATHLEN);
1002 		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
1003 		if (error == 0)
1004 			path = g_path;
1005 		else if (error == ENODEV) {
1006 			/* proceed if sysctl debug.disablefullpath == 1 */
1007 			fullpath_disabled = 1;
1008 			if (len < 2 || (len == 2 && path[0] == '/'))
1009 				path = NULL;
1010 		} else {
1011 			/* exit on other errors */
1012 			goto done_free;
1013 		}
1014 		if (root->v_type != VDIR) {
1015 			error = ENOTDIR;
1016 			vput(root);
1017 			goto done_free;
1018 		}
1019 		VOP_UNLOCK(root, 0);
1020 		if (fullpath_disabled) {
1021 			/* Leave room for a real-root full pathname. */
1022 			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
1023 			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
1024 				error = ENAMETOOLONG;
1025 				vrele(root);
1026 				goto done_free;
1027 			}
1028 		}
1029 	}
1030 
1031 	/*
1032 	 * Find the specified jail, or at least its parent.
1033 	 * This abuses the file error codes ENOENT and EEXIST.
1034 	 */
1035 	pr = NULL;
1036 	ppr = mypr;
1037 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1038 		namelc = strrchr(name, '.');
1039 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1040 		if (*p != '\0')
1041 			jid = 0;
1042 	}
1043 	sx_xlock(&allprison_lock);
1044 	if (jid != 0) {
1045 		/*
1046 		 * See if a requested jid already exists.  There is an
1047 		 * information leak here if the jid exists but is not within
1048 		 * the caller's jail hierarchy.  Jail creators will get EEXIST
1049 		 * even though they cannot see the jail, and CREATE | UPDATE
1050 		 * will return ENOENT which is not normally a valid error.
1051 		 */
1052 		if (jid < 0) {
1053 			error = EINVAL;
1054 			vfs_opterror(opts, "negative jid");
1055 			goto done_unlock_list;
1056 		}
1057 		pr = prison_find(jid);
1058 		if (pr != NULL) {
1059 			ppr = pr->pr_parent;
1060 			/* Create: jid must not exist. */
1061 			if (cuflags == JAIL_CREATE) {
1062 				mtx_unlock(&pr->pr_mtx);
1063 				error = EEXIST;
1064 				vfs_opterror(opts, "jail %d already exists",
1065 				    jid);
1066 				goto done_unlock_list;
1067 			}
1068 			if (!prison_ischild(mypr, pr)) {
1069 				mtx_unlock(&pr->pr_mtx);
1070 				pr = NULL;
1071 			} else if (pr->pr_uref == 0) {
1072 				if (!(flags & JAIL_DYING)) {
1073 					mtx_unlock(&pr->pr_mtx);
1074 					error = ENOENT;
1075 					vfs_opterror(opts, "jail %d is dying",
1076 					    jid);
1077 					goto done_unlock_list;
1078 				} else if ((flags & JAIL_ATTACH) ||
1079 				    (pr_flags & PR_PERSIST)) {
1080 					/*
1081 					 * A dying jail might be resurrected
1082 					 * (via attach or persist), but first
1083 					 * it must determine if another jail
1084 					 * has claimed its name.  Accomplish
1085 					 * this by implicitly re-setting the
1086 					 * name.
1087 					 */
1088 					if (name == NULL)
1089 						name = prison_name(mypr, pr);
1090 				}
1091 			}
1092 		}
1093 		if (pr == NULL) {
1094 			/* Update: jid must exist. */
1095 			if (cuflags == JAIL_UPDATE) {
1096 				error = ENOENT;
1097 				vfs_opterror(opts, "jail %d not found", jid);
1098 				goto done_unlock_list;
1099 			}
1100 		}
1101 	}
1102 	/*
1103 	 * If the caller provided a name, look for a jail by that name.
1104 	 * This has different semantics for creates and updates keyed by jid
1105 	 * (where the name must not already exist in a different jail),
1106 	 * and updates keyed by the name itself (where the name must exist
1107 	 * because that is the jail being updated).
1108 	 */
1109 	namelc = NULL;
1110 	if (name != NULL) {
1111 		namelc = strrchr(name, '.');
1112 		if (namelc == NULL)
1113 			namelc = name;
1114 		else {
1115 			/*
1116 			 * This is a hierarchical name.  Split it into the
1117 			 * parent and child names, and make sure the parent
1118 			 * exists or matches an already found jail.
1119 			 */
1120 			if (pr != NULL) {
1121 				if (strncmp(name, ppr->pr_name, namelc - name)
1122 				    || ppr->pr_name[namelc - name] != '\0') {
1123 					mtx_unlock(&pr->pr_mtx);
1124 					error = EINVAL;
1125 					vfs_opterror(opts,
1126 					    "cannot change jail's parent");
1127 					goto done_unlock_list;
1128 				}
1129 			} else {
1130 				*namelc = '\0';
1131 				ppr = prison_find_name(mypr, name);
1132 				if (ppr == NULL) {
1133 					error = ENOENT;
1134 					vfs_opterror(opts,
1135 					    "jail \"%s\" not found", name);
1136 					goto done_unlock_list;
1137 				}
1138 				mtx_unlock(&ppr->pr_mtx);
1139 				*namelc = '.';
1140 			}
1141 			namelc++;
1142 		}
1143 		if (namelc[0] != '\0') {
1144 			pnamelen =
1145 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1146  name_again:
1147 			deadpr = NULL;
1148 			FOREACH_PRISON_CHILD(ppr, tpr) {
1149 				if (tpr != pr && tpr->pr_ref > 0 &&
1150 				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
1151 					if (pr == NULL &&
1152 					    cuflags != JAIL_CREATE) {
1153 						mtx_lock(&tpr->pr_mtx);
1154 						if (tpr->pr_ref > 0) {
1155 							/*
1156 							 * Use this jail
1157 							 * for updates.
1158 							 */
1159 							if (tpr->pr_uref > 0) {
1160 								pr = tpr;
1161 								break;
1162 							}
1163 							deadpr = tpr;
1164 						}
1165 						mtx_unlock(&tpr->pr_mtx);
1166 					} else if (tpr->pr_uref > 0) {
1167 						/*
1168 						 * Create, or update(jid):
1169 						 * name must not exist in an
1170 						 * active sibling jail.
1171 						 */
1172 						error = EEXIST;
1173 						if (pr != NULL)
1174 							mtx_unlock(&pr->pr_mtx);
1175 						vfs_opterror(opts,
1176 						   "jail \"%s\" already exists",
1177 						   name);
1178 						goto done_unlock_list;
1179 					}
1180 				}
1181 			}
1182 			/* If no active jail is found, use a dying one. */
1183 			if (deadpr != NULL && pr == NULL) {
1184 				if (flags & JAIL_DYING) {
1185 					mtx_lock(&deadpr->pr_mtx);
1186 					if (deadpr->pr_ref == 0) {
1187 						mtx_unlock(&deadpr->pr_mtx);
1188 						goto name_again;
1189 					}
1190 					pr = deadpr;
1191 				} else if (cuflags == JAIL_UPDATE) {
1192 					error = ENOENT;
1193 					vfs_opterror(opts,
1194 					    "jail \"%s\" is dying", name);
1195 					goto done_unlock_list;
1196 				}
1197 			}
1198 			/* Update: name must exist if no jid. */
1199 			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1200 				error = ENOENT;
1201 				vfs_opterror(opts, "jail \"%s\" not found",
1202 				    name);
1203 				goto done_unlock_list;
1204 			}
1205 		}
1206 	}
1207 	/* Update: must provide a jid or name. */
1208 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1209 		error = ENOENT;
1210 		vfs_opterror(opts, "update specified no jail");
1211 		goto done_unlock_list;
1212 	}
1213 
1214 	/* If there's no prison to update, create a new one and link it in. */
1215 	if (pr == NULL) {
1216 		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1217 			if (tpr->pr_childcount >= tpr->pr_childmax) {
1218 				error = EPERM;
1219 				vfs_opterror(opts, "prison limit exceeded");
1220 				goto done_unlock_list;
1221 			}
1222 		created = 1;
1223 		mtx_lock(&ppr->pr_mtx);
1224 		if (ppr->pr_ref == 0) {
1225 			mtx_unlock(&ppr->pr_mtx);
1226 			error = ENOENT;
1227 			vfs_opterror(opts, "jail \"%s\" not found",
1228 			    prison_name(mypr, ppr));
1229 			goto done_unlock_list;
1230 		}
1231 		ppr->pr_ref++;
1232 		ppr->pr_uref++;
1233 		mtx_unlock(&ppr->pr_mtx);
1234 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1235 		if (jid == 0) {
1236 			/* Find the next free jid. */
1237 			jid = lastprid + 1;
1238  findnext:
1239 			if (jid == JAIL_MAX)
1240 				jid = 1;
1241 			TAILQ_FOREACH(tpr, &allprison, pr_list) {
1242 				if (tpr->pr_id < jid)
1243 					continue;
1244 				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1245 					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1246 					break;
1247 				}
1248 				if (jid == lastprid) {
1249 					error = EAGAIN;
1250 					vfs_opterror(opts,
1251 					    "no available jail IDs");
1252 					free(pr, M_PRISON);
1253 					prison_deref(ppr, PD_DEREF |
1254 					    PD_DEUREF | PD_LIST_XLOCKED);
1255 					goto done_releroot;
1256 				}
1257 				jid++;
1258 				goto findnext;
1259 			}
1260 			lastprid = jid;
1261 		} else {
1262 			/*
1263 			 * The jail already has a jid (that did not yet exist),
1264 			 * so just find where to insert it.
1265 			 */
1266 			TAILQ_FOREACH(tpr, &allprison, pr_list)
1267 				if (tpr->pr_id >= jid) {
1268 					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1269 					break;
1270 				}
1271 		}
1272 		if (tpr == NULL)
1273 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1274 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1275 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1276 			tpr->pr_childcount++;
1277 
1278 		pr->pr_parent = ppr;
1279 		pr->pr_id = jid;
1280 
1281 		/* Set some default values, and inherit some from the parent. */
1282 		if (namelc == NULL)
1283 			namelc = "";
1284 		if (path == NULL) {
1285 			path = "/";
1286 			root = mypr->pr_root;
1287 			vref(root);
1288 		}
1289 		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1290 		pr->pr_flags |= PR_HOST;
1291 #if defined(INET) || defined(INET6)
1292 #ifdef VIMAGE
1293 		if (!(pr_flags & PR_VNET))
1294 #endif
1295 		{
1296 #ifdef INET
1297 			if (!(ch_flags & PR_IP4_USER))
1298 				pr->pr_flags |=
1299 				    PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
1300 			else if (!(pr_flags & PR_IP4_USER)) {
1301 				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1302 				if (ppr->pr_ip4 != NULL) {
1303 					pr->pr_ip4s = ppr->pr_ip4s;
1304 					pr->pr_ip4 = malloc(pr->pr_ip4s *
1305 					    sizeof(struct in_addr), M_PRISON,
1306 					    M_WAITOK);
1307 					bcopy(ppr->pr_ip4, pr->pr_ip4,
1308 					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
1309 				}
1310 			}
1311 #endif
1312 #ifdef INET6
1313 			if (!(ch_flags & PR_IP6_USER))
1314 				pr->pr_flags |=
1315 				    PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
1316 			else if (!(pr_flags & PR_IP6_USER)) {
1317 				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1318 				if (ppr->pr_ip6 != NULL) {
1319 					pr->pr_ip6s = ppr->pr_ip6s;
1320 					pr->pr_ip6 = malloc(pr->pr_ip6s *
1321 					    sizeof(struct in6_addr), M_PRISON,
1322 					    M_WAITOK);
1323 					bcopy(ppr->pr_ip6, pr->pr_ip6,
1324 					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
1325 				}
1326 			}
1327 #endif
1328 		}
1329 #endif
1330 		/* Source address selection is always on by default. */
1331 		pr->pr_flags |= _PR_IP_SADDRSEL;
1332 
1333 		pr->pr_securelevel = ppr->pr_securelevel;
1334 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1335 		pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
1336 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1337 
1338 		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1339 		if (osrelstr == NULL)
1340 		    strcpy(pr->pr_osrelease, ppr->pr_osrelease);
1341 		else
1342 		    strcpy(pr->pr_osrelease, osrelstr);
1343 
1344 		LIST_INIT(&pr->pr_children);
1345 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1346 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1347 
1348 #ifdef VIMAGE
1349 		/* Allocate a new vnet if specified. */
1350 		pr->pr_vnet = (pr_flags & PR_VNET)
1351 		    ? vnet_alloc() : ppr->pr_vnet;
1352 #endif
1353 		/*
1354 		 * Allocate a dedicated cpuset for each jail.
1355 		 * Unlike other initial settings, this may return an error.
1356 		 */
1357 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1358 		if (error) {
1359 			prison_deref(pr, PD_LIST_XLOCKED);
1360 			goto done_releroot;
1361 		}
1362 
1363 		mtx_lock(&pr->pr_mtx);
1364 		/*
1365 		 * New prisons do not yet have a reference, because we do not
1366 		 * want others to see the incomplete prison once the
1367 		 * allprison_lock is downgraded.
1368 		 */
1369 	} else {
1370 		created = 0;
1371 		/*
1372 		 * Grab a reference for existing prisons, to ensure they
1373 		 * continue to exist for the duration of the call.
1374 		 */
1375 		pr->pr_ref++;
1376 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1377 		if ((pr->pr_flags & PR_VNET) &&
1378 		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1379 			error = EINVAL;
1380 			vfs_opterror(opts,
1381 			    "vnet jails cannot have IP address restrictions");
1382 			goto done_deref_locked;
1383 		}
1384 #endif
1385 #ifdef INET
1386 		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1387 			error = EINVAL;
1388 			vfs_opterror(opts,
1389 			    "ip4 cannot be changed after creation");
1390 			goto done_deref_locked;
1391 		}
1392 #endif
1393 #ifdef INET6
1394 		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1395 			error = EINVAL;
1396 			vfs_opterror(opts,
1397 			    "ip6 cannot be changed after creation");
1398 			goto done_deref_locked;
1399 		}
1400 #endif
1401 	}
1402 
1403 	/* Do final error checking before setting anything. */
1404 	if (gotslevel) {
1405 		if (slevel < ppr->pr_securelevel) {
1406 			error = EPERM;
1407 			goto done_deref_locked;
1408 		}
1409 	}
1410 	if (gotchildmax) {
1411 		if (childmax >= ppr->pr_childmax) {
1412 			error = EPERM;
1413 			goto done_deref_locked;
1414 		}
1415 	}
1416 	if (gotenforce) {
1417 		if (enforce < ppr->pr_enforce_statfs) {
1418 			error = EPERM;
1419 			goto done_deref_locked;
1420 		}
1421 	}
1422 	if (gotrsnum) {
1423 		/*
1424 		 * devfs_rsnum is a uint16_t
1425 		 */
1426 		if (rsnum < 0 || rsnum > 65535) {
1427 			error = EINVAL;
1428 			goto done_deref_locked;
1429 		}
1430 		/*
1431 		 * Nested jails always inherit parent's devfs ruleset
1432 		 */
1433 		if (jailed(td->td_ucred)) {
1434 			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1435 				error = EPERM;
1436 				goto done_deref_locked;
1437 			} else
1438 				rsnum = ppr->pr_devfs_rsnum;
1439 		}
1440 	}
1441 #ifdef INET
1442 	if (ip4s > 0) {
1443 		if (ppr->pr_flags & PR_IP4) {
1444 			/*
1445 			 * Make sure the new set of IP addresses is a
1446 			 * subset of the parent's list.  Don't worry
1447 			 * about the parent being unlocked, as any
1448 			 * setting is done with allprison_lock held.
1449 			 */
1450 			for (ij = 0; ij < ppr->pr_ip4s; ij++)
1451 				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1452 					break;
1453 			if (ij == ppr->pr_ip4s) {
1454 				error = EPERM;
1455 				goto done_deref_locked;
1456 			}
1457 			if (ip4s > 1) {
1458 				for (ii = ij = 1; ii < ip4s; ii++) {
1459 					if (ip4[ii].s_addr ==
1460 					    ppr->pr_ip4[0].s_addr)
1461 						continue;
1462 					for (; ij < ppr->pr_ip4s; ij++)
1463 						if (ip4[ii].s_addr ==
1464 						    ppr->pr_ip4[ij].s_addr)
1465 							break;
1466 					if (ij == ppr->pr_ip4s)
1467 						break;
1468 				}
1469 				if (ij == ppr->pr_ip4s) {
1470 					error = EPERM;
1471 					goto done_deref_locked;
1472 				}
1473 			}
1474 		}
1475 		/*
1476 		 * Check for conflicting IP addresses.  We permit them
1477 		 * if there is no more than one IP on each jail.  If
1478 		 * there is a duplicate on a jail with more than one
1479 		 * IP stop checking and return error.
1480 		 */
1481 #ifdef VIMAGE
1482 		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1483 			if (tppr->pr_flags & PR_VNET)
1484 				break;
1485 #else
1486 		tppr = &prison0;
1487 #endif
1488 		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1489 			if (tpr == pr ||
1490 #ifdef VIMAGE
1491 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1492 #endif
1493 			    tpr->pr_uref == 0) {
1494 				descend = 0;
1495 				continue;
1496 			}
1497 			if (!(tpr->pr_flags & PR_IP4_USER))
1498 				continue;
1499 			descend = 0;
1500 			if (tpr->pr_ip4 == NULL ||
1501 			    (ip4s == 1 && tpr->pr_ip4s == 1))
1502 				continue;
1503 			for (ii = 0; ii < ip4s; ii++) {
1504 				if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
1505 					error = EADDRINUSE;
1506 					vfs_opterror(opts,
1507 					    "IPv4 addresses clash");
1508 					goto done_deref_locked;
1509 				}
1510 			}
1511 		}
1512 	}
1513 #endif
1514 #ifdef INET6
1515 	if (ip6s > 0) {
1516 		if (ppr->pr_flags & PR_IP6) {
1517 			/*
1518 			 * Make sure the new set of IP addresses is a
1519 			 * subset of the parent's list.
1520 			 */
1521 			for (ij = 0; ij < ppr->pr_ip6s; ij++)
1522 				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1523 				    &ppr->pr_ip6[ij]))
1524 					break;
1525 			if (ij == ppr->pr_ip6s) {
1526 				error = EPERM;
1527 				goto done_deref_locked;
1528 			}
1529 			if (ip6s > 1) {
1530 				for (ii = ij = 1; ii < ip6s; ii++) {
1531 					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1532 					     &ppr->pr_ip6[0]))
1533 						continue;
1534 					for (; ij < ppr->pr_ip6s; ij++)
1535 						if (IN6_ARE_ADDR_EQUAL(
1536 						    &ip6[ii], &ppr->pr_ip6[ij]))
1537 							break;
1538 					if (ij == ppr->pr_ip6s)
1539 						break;
1540 				}
1541 				if (ij == ppr->pr_ip6s) {
1542 					error = EPERM;
1543 					goto done_deref_locked;
1544 				}
1545 			}
1546 		}
1547 		/* Check for conflicting IP addresses. */
1548 #ifdef VIMAGE
1549 		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1550 			if (tppr->pr_flags & PR_VNET)
1551 				break;
1552 #else
1553 		tppr = &prison0;
1554 #endif
1555 		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1556 			if (tpr == pr ||
1557 #ifdef VIMAGE
1558 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1559 #endif
1560 			    tpr->pr_uref == 0) {
1561 				descend = 0;
1562 				continue;
1563 			}
1564 			if (!(tpr->pr_flags & PR_IP6_USER))
1565 				continue;
1566 			descend = 0;
1567 			if (tpr->pr_ip6 == NULL ||
1568 			    (ip6s == 1 && tpr->pr_ip6s == 1))
1569 				continue;
1570 			for (ii = 0; ii < ip6s; ii++) {
1571 				if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
1572 					error = EADDRINUSE;
1573 					vfs_opterror(opts,
1574 					    "IPv6 addresses clash");
1575 					goto done_deref_locked;
1576 				}
1577 			}
1578 		}
1579 	}
1580 #endif
1581 	onamelen = namelen = 0;
1582 	if (namelc != NULL) {
1583 		/* Give a default name of the jid.  Also allow the name to be
1584 		 * explicitly the jid - but not any other number, and only in
1585 		 * normal form (no leading zero/etc).
1586 		 */
1587 		if (namelc[0] == '\0')
1588 			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1589 		else if ((strtoul(namelc, &p, 10) != jid ||
1590 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1591 			error = EINVAL;
1592 			vfs_opterror(opts,
1593 			    "name cannot be numeric (unless it is the jid)");
1594 			goto done_deref_locked;
1595 		}
1596 		/*
1597 		 * Make sure the name isn't too long for the prison or its
1598 		 * children.
1599 		 */
1600 		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1601 		onamelen = strlen(pr->pr_name + pnamelen);
1602 		namelen = strlen(namelc);
1603 		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1604 			error = ENAMETOOLONG;
1605 			goto done_deref_locked;
1606 		}
1607 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1608 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1609 			    sizeof(pr->pr_name)) {
1610 				error = ENAMETOOLONG;
1611 				goto done_deref_locked;
1612 			}
1613 		}
1614 	}
1615 	if (pr_allow & ~ppr->pr_allow) {
1616 		error = EPERM;
1617 		goto done_deref_locked;
1618 	}
1619 
1620 	/*
1621 	 * Let modules check their parameters.  This requires unlocking and
1622 	 * then re-locking the prison, but this is still a valid state as long
1623 	 * as allprison_lock remains xlocked.
1624 	 */
1625 	mtx_unlock(&pr->pr_mtx);
1626 	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1627 	if (error != 0) {
1628 		prison_deref(pr, created
1629 		    ? PD_LIST_XLOCKED
1630 		    : PD_DEREF | PD_LIST_XLOCKED);
1631 		goto done_releroot;
1632 	}
1633 	mtx_lock(&pr->pr_mtx);
1634 
1635 	/* At this point, all valid parameters should have been noted. */
1636 	TAILQ_FOREACH(opt, opts, link) {
1637 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1638 			error = EINVAL;
1639 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1640 			goto done_deref_locked;
1641 		}
1642 	}
1643 
1644 	/* Set the parameters of the prison. */
1645 #ifdef INET
1646 	redo_ip4 = 0;
1647 	if (pr_flags & PR_IP4_USER) {
1648 		pr->pr_flags |= PR_IP4;
1649 		free(pr->pr_ip4, M_PRISON);
1650 		pr->pr_ip4s = ip4s;
1651 		pr->pr_ip4 = ip4;
1652 		ip4 = NULL;
1653 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1654 #ifdef VIMAGE
1655 			if (tpr->pr_flags & PR_VNET) {
1656 				descend = 0;
1657 				continue;
1658 			}
1659 #endif
1660 			if (prison_restrict_ip4(tpr, NULL)) {
1661 				redo_ip4 = 1;
1662 				descend = 0;
1663 			}
1664 		}
1665 	}
1666 #endif
1667 #ifdef INET6
1668 	redo_ip6 = 0;
1669 	if (pr_flags & PR_IP6_USER) {
1670 		pr->pr_flags |= PR_IP6;
1671 		free(pr->pr_ip6, M_PRISON);
1672 		pr->pr_ip6s = ip6s;
1673 		pr->pr_ip6 = ip6;
1674 		ip6 = NULL;
1675 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1676 #ifdef VIMAGE
1677 			if (tpr->pr_flags & PR_VNET) {
1678 				descend = 0;
1679 				continue;
1680 			}
1681 #endif
1682 			if (prison_restrict_ip6(tpr, NULL)) {
1683 				redo_ip6 = 1;
1684 				descend = 0;
1685 			}
1686 		}
1687 	}
1688 #endif
1689 	if (gotslevel) {
1690 		pr->pr_securelevel = slevel;
1691 		/* Set all child jails to be at least this level. */
1692 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1693 			if (tpr->pr_securelevel < slevel)
1694 				tpr->pr_securelevel = slevel;
1695 	}
1696 	if (gotchildmax) {
1697 		pr->pr_childmax = childmax;
1698 		/* Set all child jails to under this limit. */
1699 		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1700 			if (tpr->pr_childmax > childmax - level)
1701 				tpr->pr_childmax = childmax > level
1702 				    ? childmax - level : 0;
1703 	}
1704 	if (gotenforce) {
1705 		pr->pr_enforce_statfs = enforce;
1706 		/* Pass this restriction on to the children. */
1707 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1708 			if (tpr->pr_enforce_statfs < enforce)
1709 				tpr->pr_enforce_statfs = enforce;
1710 	}
1711 	if (gotrsnum) {
1712 		pr->pr_devfs_rsnum = rsnum;
1713 		/* Pass this restriction on to the children. */
1714 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1715 			tpr->pr_devfs_rsnum = rsnum;
1716 	}
1717 	if (namelc != NULL) {
1718 		if (ppr == &prison0)
1719 			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1720 		else
1721 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1722 			    ppr->pr_name, namelc);
1723 		/* Change this component of child names. */
1724 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1725 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1726 			    strlen(tpr->pr_name + onamelen) + 1);
1727 			bcopy(pr->pr_name, tpr->pr_name, namelen);
1728 		}
1729 	}
1730 	if (path != NULL) {
1731 		/* Try to keep a real-rooted full pathname. */
1732 		if (fullpath_disabled && path[0] == '/' &&
1733 		    strcmp(mypr->pr_path, "/"))
1734 			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1735 			    mypr->pr_path, path);
1736 		else
1737 			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1738 		pr->pr_root = root;
1739 	}
1740 	if (PR_HOST & ch_flags & ~pr_flags) {
1741 		if (pr->pr_flags & PR_HOST) {
1742 			/*
1743 			 * Copy the parent's host info.  As with pr_ip4 above,
1744 			 * the lack of a lock on the parent is not a problem;
1745 			 * it is always set with allprison_lock at least
1746 			 * shared, and is held exclusively here.
1747 			 */
1748 			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1749 			    sizeof(pr->pr_hostname));
1750 			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1751 			    sizeof(pr->pr_domainname));
1752 			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1753 			    sizeof(pr->pr_hostuuid));
1754 			pr->pr_hostid = pr->pr_parent->pr_hostid;
1755 		}
1756 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1757 		/* Set this prison, and any descendants without PR_HOST. */
1758 		if (host != NULL)
1759 			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1760 		if (domain != NULL)
1761 			strlcpy(pr->pr_domainname, domain,
1762 			    sizeof(pr->pr_domainname));
1763 		if (uuid != NULL)
1764 			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1765 		if (gothid)
1766 			pr->pr_hostid = hid;
1767 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1768 			if (tpr->pr_flags & PR_HOST)
1769 				descend = 0;
1770 			else {
1771 				if (host != NULL)
1772 					strlcpy(tpr->pr_hostname,
1773 					    pr->pr_hostname,
1774 					    sizeof(tpr->pr_hostname));
1775 				if (domain != NULL)
1776 					strlcpy(tpr->pr_domainname,
1777 					    pr->pr_domainname,
1778 					    sizeof(tpr->pr_domainname));
1779 				if (uuid != NULL)
1780 					strlcpy(tpr->pr_hostuuid,
1781 					    pr->pr_hostuuid,
1782 					    sizeof(tpr->pr_hostuuid));
1783 				if (gothid)
1784 					tpr->pr_hostid = hid;
1785 			}
1786 		}
1787 	}
1788 	if ((tallow = ch_allow & ~pr_allow)) {
1789 		/* Clear allow bits in all children. */
1790 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1791 			tpr->pr_allow &= ~tallow;
1792 	}
1793 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1794 	/*
1795 	 * Persistent prisons get an extra reference, and prisons losing their
1796 	 * persist flag lose that reference.  Only do this for existing prisons
1797 	 * for now, so new ones will remain unseen until after the module
1798 	 * handlers have completed.
1799 	 */
1800 	born = pr->pr_uref == 0;
1801 	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1802 		if (pr_flags & PR_PERSIST) {
1803 			pr->pr_ref++;
1804 			pr->pr_uref++;
1805 		} else {
1806 			pr->pr_ref--;
1807 			pr->pr_uref--;
1808 		}
1809 	}
1810 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1811 	mtx_unlock(&pr->pr_mtx);
1812 
1813 #ifdef RACCT
1814 	if (racct_enable && created)
1815 		prison_racct_attach(pr);
1816 #endif
1817 
1818 	/* Locks may have prevented a complete restriction of child IP
1819 	 * addresses.  If so, allocate some more memory and try again.
1820 	 */
1821 #ifdef INET
1822 	while (redo_ip4) {
1823 		ip4s = pr->pr_ip4s;
1824 		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1825 		mtx_lock(&pr->pr_mtx);
1826 		redo_ip4 = 0;
1827 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1828 #ifdef VIMAGE
1829 			if (tpr->pr_flags & PR_VNET) {
1830 				descend = 0;
1831 				continue;
1832 			}
1833 #endif
1834 			if (prison_restrict_ip4(tpr, ip4)) {
1835 				if (ip4 != NULL)
1836 					ip4 = NULL;
1837 				else
1838 					redo_ip4 = 1;
1839 			}
1840 		}
1841 		mtx_unlock(&pr->pr_mtx);
1842 	}
1843 #endif
1844 #ifdef INET6
1845 	while (redo_ip6) {
1846 		ip6s = pr->pr_ip6s;
1847 		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1848 		mtx_lock(&pr->pr_mtx);
1849 		redo_ip6 = 0;
1850 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1851 #ifdef VIMAGE
1852 			if (tpr->pr_flags & PR_VNET) {
1853 				descend = 0;
1854 				continue;
1855 			}
1856 #endif
1857 			if (prison_restrict_ip6(tpr, ip6)) {
1858 				if (ip6 != NULL)
1859 					ip6 = NULL;
1860 				else
1861 					redo_ip6 = 1;
1862 			}
1863 		}
1864 		mtx_unlock(&pr->pr_mtx);
1865 	}
1866 #endif
1867 
1868 	/* Let the modules do their work. */
1869 	sx_downgrade(&allprison_lock);
1870 	if (born) {
1871 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1872 		if (error) {
1873 			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
1874 			prison_deref(pr, created
1875 			    ? PD_LIST_SLOCKED
1876 			    : PD_DEREF | PD_LIST_SLOCKED);
1877 			goto done_errmsg;
1878 		}
1879 	}
1880 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
1881 	if (error) {
1882 		if (born)
1883 			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
1884 		prison_deref(pr, created
1885 		    ? PD_LIST_SLOCKED
1886 		    : PD_DEREF | PD_LIST_SLOCKED);
1887 		goto done_errmsg;
1888 	}
1889 
1890 	/* Attach this process to the prison if requested. */
1891 	if (flags & JAIL_ATTACH) {
1892 		mtx_lock(&pr->pr_mtx);
1893 		error = do_jail_attach(td, pr);
1894 		if (error) {
1895 			vfs_opterror(opts, "attach failed");
1896 			if (!created)
1897 				prison_deref(pr, PD_DEREF);
1898 			goto done_errmsg;
1899 		}
1900 	}
1901 
1902 #ifdef RACCT
1903 	if (racct_enable && !created) {
1904 		if (!(flags & JAIL_ATTACH))
1905 			sx_sunlock(&allprison_lock);
1906 		prison_racct_modify(pr);
1907 		if (!(flags & JAIL_ATTACH))
1908 			sx_slock(&allprison_lock);
1909 	}
1910 #endif
1911 
1912 	td->td_retval[0] = pr->pr_id;
1913 
1914 	/*
1915 	 * Now that it is all there, drop the temporary reference from existing
1916 	 * prisons.  Or add a reference to newly created persistent prisons
1917 	 * (which was not done earlier so that the prison would not be publicly
1918 	 * visible).
1919 	 */
1920 	if (!created) {
1921 		prison_deref(pr, (flags & JAIL_ATTACH)
1922 		    ? PD_DEREF
1923 		    : PD_DEREF | PD_LIST_SLOCKED);
1924 	} else {
1925 		if (pr_flags & PR_PERSIST) {
1926 			mtx_lock(&pr->pr_mtx);
1927 			pr->pr_ref++;
1928 			pr->pr_uref++;
1929 			mtx_unlock(&pr->pr_mtx);
1930 		}
1931 		if (!(flags & JAIL_ATTACH))
1932 			sx_sunlock(&allprison_lock);
1933 	}
1934 
1935 	goto done_free;
1936 
1937  done_deref_locked:
1938 	prison_deref(pr, created
1939 	    ? PD_LOCKED | PD_LIST_XLOCKED
1940 	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1941 	goto done_releroot;
1942  done_unlock_list:
1943 	sx_xunlock(&allprison_lock);
1944  done_releroot:
1945 	if (root != NULL)
1946 		vrele(root);
1947  done_errmsg:
1948 	if (error) {
1949 		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
1950 		    &errmsg_len) == 0 && errmsg_len > 0) {
1951 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1952 			if (optuio->uio_segflg == UIO_SYSSPACE)
1953 				bcopy(errmsg,
1954 				    optuio->uio_iov[errmsg_pos].iov_base,
1955 				    errmsg_len);
1956 			else
1957 				copyout(errmsg,
1958 				    optuio->uio_iov[errmsg_pos].iov_base,
1959 				    errmsg_len);
1960 		}
1961 	}
1962  done_free:
1963 #ifdef INET
1964 	free(ip4, M_PRISON);
1965 #endif
1966 #ifdef INET6
1967 	free(ip6, M_PRISON);
1968 #endif
1969 	if (g_path != NULL)
1970 		free(g_path, M_TEMP);
1971 	vfs_freeopts(opts);
1972 	return (error);
1973 }
1974 
1975 
1976 /*
1977  * struct jail_get_args {
1978  *	struct iovec *iovp;
1979  *	unsigned int iovcnt;
1980  *	int flags;
1981  * };
1982  */
1983 int
sys_jail_get(struct thread * td,struct jail_get_args * uap)1984 sys_jail_get(struct thread *td, struct jail_get_args *uap)
1985 {
1986 	struct uio *auio;
1987 	int error;
1988 
1989 	/* Check that we have an even number of iovecs. */
1990 	if (uap->iovcnt & 1)
1991 		return (EINVAL);
1992 
1993 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1994 	if (error)
1995 		return (error);
1996 	error = kern_jail_get(td, auio, uap->flags);
1997 	if (error == 0)
1998 		error = copyout(auio->uio_iov, uap->iovp,
1999 		    uap->iovcnt * sizeof (struct iovec));
2000 	free(auio, M_IOV);
2001 	return (error);
2002 }
2003 
2004 int
kern_jail_get(struct thread * td,struct uio * optuio,int flags)2005 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
2006 {
2007 	struct prison *pr, *mypr;
2008 	struct vfsopt *opt;
2009 	struct vfsoptlist *opts;
2010 	char *errmsg, *name;
2011 	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
2012 
2013 	if (flags & ~JAIL_GET_MASK)
2014 		return (EINVAL);
2015 
2016 	/* Get the parameter list. */
2017 	error = vfs_buildopts(optuio, &opts);
2018 	if (error)
2019 		return (error);
2020 	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
2021 	mypr = td->td_ucred->cr_prison;
2022 
2023 	/*
2024 	 * Find the prison specified by one of: lastjid, jid, name.
2025 	 */
2026 	sx_slock(&allprison_lock);
2027 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
2028 	if (error == 0) {
2029 		TAILQ_FOREACH(pr, &allprison, pr_list) {
2030 			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
2031 				mtx_lock(&pr->pr_mtx);
2032 				if (pr->pr_ref > 0 &&
2033 				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
2034 					break;
2035 				mtx_unlock(&pr->pr_mtx);
2036 			}
2037 		}
2038 		if (pr != NULL)
2039 			goto found_prison;
2040 		error = ENOENT;
2041 		vfs_opterror(opts, "no jail after %d", jid);
2042 		goto done_unlock_list;
2043 	} else if (error != ENOENT)
2044 		goto done_unlock_list;
2045 
2046 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2047 	if (error == 0) {
2048 		if (jid != 0) {
2049 			pr = prison_find_child(mypr, jid);
2050 			if (pr != NULL) {
2051 				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2052 					mtx_unlock(&pr->pr_mtx);
2053 					error = ENOENT;
2054 					vfs_opterror(opts, "jail %d is dying",
2055 					    jid);
2056 					goto done_unlock_list;
2057 				}
2058 				goto found_prison;
2059 			}
2060 			error = ENOENT;
2061 			vfs_opterror(opts, "jail %d not found", jid);
2062 			goto done_unlock_list;
2063 		}
2064 	} else if (error != ENOENT)
2065 		goto done_unlock_list;
2066 
2067 	error = vfs_getopt(opts, "name", (void **)&name, &len);
2068 	if (error == 0) {
2069 		if (len == 0 || name[len - 1] != '\0') {
2070 			error = EINVAL;
2071 			goto done_unlock_list;
2072 		}
2073 		pr = prison_find_name(mypr, name);
2074 		if (pr != NULL) {
2075 			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2076 				mtx_unlock(&pr->pr_mtx);
2077 				error = ENOENT;
2078 				vfs_opterror(opts, "jail \"%s\" is dying",
2079 				    name);
2080 				goto done_unlock_list;
2081 			}
2082 			goto found_prison;
2083 		}
2084 		error = ENOENT;
2085 		vfs_opterror(opts, "jail \"%s\" not found", name);
2086 		goto done_unlock_list;
2087 	} else if (error != ENOENT)
2088 		goto done_unlock_list;
2089 
2090 	vfs_opterror(opts, "no jail specified");
2091 	error = ENOENT;
2092 	goto done_unlock_list;
2093 
2094  found_prison:
2095 	/* Get the parameters of the prison. */
2096 	pr->pr_ref++;
2097 	locked = PD_LOCKED;
2098 	td->td_retval[0] = pr->pr_id;
2099 	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2100 	if (error != 0 && error != ENOENT)
2101 		goto done_deref;
2102 	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2103 	error = vfs_setopt(opts, "parent", &i, sizeof(i));
2104 	if (error != 0 && error != ENOENT)
2105 		goto done_deref;
2106 	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2107 	if (error != 0 && error != ENOENT)
2108 		goto done_deref;
2109 	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2110 	    sizeof(pr->pr_cpuset->cs_id));
2111 	if (error != 0 && error != ENOENT)
2112 		goto done_deref;
2113 	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2114 	if (error != 0 && error != ENOENT)
2115 		goto done_deref;
2116 #ifdef INET
2117 	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
2118 	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
2119 	if (error != 0 && error != ENOENT)
2120 		goto done_deref;
2121 #endif
2122 #ifdef INET6
2123 	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
2124 	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
2125 	if (error != 0 && error != ENOENT)
2126 		goto done_deref;
2127 #endif
2128 	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2129 	    sizeof(pr->pr_securelevel));
2130 	if (error != 0 && error != ENOENT)
2131 		goto done_deref;
2132 	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2133 	    sizeof(pr->pr_childcount));
2134 	if (error != 0 && error != ENOENT)
2135 		goto done_deref;
2136 	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2137 	    sizeof(pr->pr_childmax));
2138 	if (error != 0 && error != ENOENT)
2139 		goto done_deref;
2140 	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2141 	if (error != 0 && error != ENOENT)
2142 		goto done_deref;
2143 	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2144 	if (error != 0 && error != ENOENT)
2145 		goto done_deref;
2146 	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2147 	if (error != 0 && error != ENOENT)
2148 		goto done_deref;
2149 #ifdef COMPAT_FREEBSD32
2150 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2151 		uint32_t hid32 = pr->pr_hostid;
2152 
2153 		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2154 	} else
2155 #endif
2156 	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2157 	    sizeof(pr->pr_hostid));
2158 	if (error != 0 && error != ENOENT)
2159 		goto done_deref;
2160 	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2161 	    sizeof(pr->pr_enforce_statfs));
2162 	if (error != 0 && error != ENOENT)
2163 		goto done_deref;
2164 	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2165 	    sizeof(pr->pr_devfs_rsnum));
2166 	if (error != 0 && error != ENOENT)
2167 		goto done_deref;
2168 	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
2169 	    fi++) {
2170 		if (pr_flag_names[fi] == NULL)
2171 			continue;
2172 		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
2173 		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
2174 		if (error != 0 && error != ENOENT)
2175 			goto done_deref;
2176 		i = !i;
2177 		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
2178 		if (error != 0 && error != ENOENT)
2179 			goto done_deref;
2180 	}
2181 	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
2182 	    fi++) {
2183 		i = pr->pr_flags &
2184 		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
2185 		i = pr_flag_jailsys[fi].disable &&
2186 		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
2187 		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
2188 		    : JAIL_SYS_INHERIT;
2189 		error =
2190 		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
2191 		if (error != 0 && error != ENOENT)
2192 			goto done_deref;
2193 	}
2194 	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
2195 	    fi++) {
2196 		if (pr_allow_names[fi] == NULL)
2197 			continue;
2198 		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
2199 		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
2200 		if (error != 0 && error != ENOENT)
2201 			goto done_deref;
2202 		i = !i;
2203 		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
2204 		if (error != 0 && error != ENOENT)
2205 			goto done_deref;
2206 	}
2207 	i = (pr->pr_uref == 0);
2208 	error = vfs_setopt(opts, "dying", &i, sizeof(i));
2209 	if (error != 0 && error != ENOENT)
2210 		goto done_deref;
2211 	i = !i;
2212 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2213 	if (error != 0 && error != ENOENT)
2214 		goto done_deref;
2215 	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2216 	    sizeof(pr->pr_osreldate));
2217 	if (error != 0 && error != ENOENT)
2218 		goto done_deref;
2219 	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2220 	if (error != 0 && error != ENOENT)
2221 		goto done_deref;
2222 
2223 	/* Get the module parameters. */
2224 	mtx_unlock(&pr->pr_mtx);
2225 	locked = 0;
2226 	error = osd_jail_call(pr, PR_METHOD_GET, opts);
2227 	if (error)
2228 		goto done_deref;
2229 	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
2230 
2231 	/* By now, all parameters should have been noted. */
2232 	TAILQ_FOREACH(opt, opts, link) {
2233 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
2234 			error = EINVAL;
2235 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
2236 			goto done_errmsg;
2237 		}
2238 	}
2239 
2240 	/* Write the fetched parameters back to userspace. */
2241 	error = 0;
2242 	TAILQ_FOREACH(opt, opts, link) {
2243 		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2244 			pos = 2 * opt->pos + 1;
2245 			optuio->uio_iov[pos].iov_len = opt->len;
2246 			if (opt->value != NULL) {
2247 				if (optuio->uio_segflg == UIO_SYSSPACE) {
2248 					bcopy(opt->value,
2249 					    optuio->uio_iov[pos].iov_base,
2250 					    opt->len);
2251 				} else {
2252 					error = copyout(opt->value,
2253 					    optuio->uio_iov[pos].iov_base,
2254 					    opt->len);
2255 					if (error)
2256 						break;
2257 				}
2258 			}
2259 		}
2260 	}
2261 	goto done_errmsg;
2262 
2263  done_deref:
2264 	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2265 	goto done_errmsg;
2266 
2267  done_unlock_list:
2268 	sx_sunlock(&allprison_lock);
2269  done_errmsg:
2270 	if (error && errmsg_pos >= 0) {
2271 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2272 		errmsg_pos = 2 * errmsg_pos + 1;
2273 		if (errmsg_len > 0) {
2274 			if (optuio->uio_segflg == UIO_SYSSPACE)
2275 				bcopy(errmsg,
2276 				    optuio->uio_iov[errmsg_pos].iov_base,
2277 				    errmsg_len);
2278 			else
2279 				copyout(errmsg,
2280 				    optuio->uio_iov[errmsg_pos].iov_base,
2281 				    errmsg_len);
2282 		}
2283 	}
2284 	vfs_freeopts(opts);
2285 	return (error);
2286 }
2287 
2288 
2289 /*
2290  * struct jail_remove_args {
2291  *	int jid;
2292  * };
2293  */
2294 int
sys_jail_remove(struct thread * td,struct jail_remove_args * uap)2295 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2296 {
2297 	struct prison *pr, *cpr, *lpr, *tpr;
2298 	int descend, error;
2299 
2300 	error = priv_check(td, PRIV_JAIL_REMOVE);
2301 	if (error)
2302 		return (error);
2303 
2304 	sx_xlock(&allprison_lock);
2305 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2306 	if (pr == NULL) {
2307 		sx_xunlock(&allprison_lock);
2308 		return (EINVAL);
2309 	}
2310 
2311 	/* Remove all descendants of this prison, then remove this prison. */
2312 	pr->pr_ref++;
2313 	if (!LIST_EMPTY(&pr->pr_children)) {
2314 		mtx_unlock(&pr->pr_mtx);
2315 		lpr = NULL;
2316 		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
2317 			mtx_lock(&cpr->pr_mtx);
2318 			if (cpr->pr_ref > 0) {
2319 				tpr = cpr;
2320 				cpr->pr_ref++;
2321 			} else {
2322 				/* Already removed - do not do it again. */
2323 				tpr = NULL;
2324 			}
2325 			mtx_unlock(&cpr->pr_mtx);
2326 			if (lpr != NULL) {
2327 				mtx_lock(&lpr->pr_mtx);
2328 				prison_remove_one(lpr);
2329 				sx_xlock(&allprison_lock);
2330 			}
2331 			lpr = tpr;
2332 		}
2333 		if (lpr != NULL) {
2334 			mtx_lock(&lpr->pr_mtx);
2335 			prison_remove_one(lpr);
2336 			sx_xlock(&allprison_lock);
2337 		}
2338 		mtx_lock(&pr->pr_mtx);
2339 	}
2340 	prison_remove_one(pr);
2341 	return (0);
2342 }
2343 
2344 static void
prison_remove_one(struct prison * pr)2345 prison_remove_one(struct prison *pr)
2346 {
2347 	struct proc *p;
2348 	int deuref;
2349 
2350 	/* If the prison was persistent, it is not anymore. */
2351 	deuref = 0;
2352 	if (pr->pr_flags & PR_PERSIST) {
2353 		pr->pr_ref--;
2354 		deuref = PD_DEUREF;
2355 		pr->pr_flags &= ~PR_PERSIST;
2356 	}
2357 
2358 	/*
2359 	 * jail_remove added a reference.  If that's the only one, remove
2360 	 * the prison now.
2361 	 */
2362 	KASSERT(pr->pr_ref > 0,
2363 	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2364 	if (pr->pr_ref == 1) {
2365 		prison_deref(pr,
2366 		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2367 		return;
2368 	}
2369 
2370 	mtx_unlock(&pr->pr_mtx);
2371 	sx_xunlock(&allprison_lock);
2372 	/*
2373 	 * Kill all processes unfortunate enough to be attached to this prison.
2374 	 */
2375 	sx_slock(&allproc_lock);
2376 	LIST_FOREACH(p, &allproc, p_list) {
2377 		PROC_LOCK(p);
2378 		if (p->p_state != PRS_NEW && p->p_ucred &&
2379 		    p->p_ucred->cr_prison == pr)
2380 			kern_psignal(p, SIGKILL);
2381 		PROC_UNLOCK(p);
2382 	}
2383 	sx_sunlock(&allproc_lock);
2384 	/* Remove the temporary reference added by jail_remove. */
2385 	prison_deref(pr, deuref | PD_DEREF);
2386 }
2387 
2388 
2389 /*
2390  * struct jail_attach_args {
2391  *	int jid;
2392  * };
2393  */
2394 int
sys_jail_attach(struct thread * td,struct jail_attach_args * uap)2395 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2396 {
2397 	struct prison *pr;
2398 	int error;
2399 
2400 	error = priv_check(td, PRIV_JAIL_ATTACH);
2401 	if (error)
2402 		return (error);
2403 
2404 	/*
2405 	 * Start with exclusive hold on allprison_lock to ensure that a possible
2406 	 * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove.
2407 	 * But then immediately downgrade it since we don't need to stop
2408 	 * readers.
2409 	 */
2410 	sx_xlock(&allprison_lock);
2411 	sx_downgrade(&allprison_lock);
2412 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2413 	if (pr == NULL) {
2414 		sx_sunlock(&allprison_lock);
2415 		return (EINVAL);
2416 	}
2417 
2418 	/*
2419 	 * Do not allow a process to attach to a prison that is not
2420 	 * considered to be "alive".
2421 	 */
2422 	if (pr->pr_uref == 0) {
2423 		mtx_unlock(&pr->pr_mtx);
2424 		sx_sunlock(&allprison_lock);
2425 		return (EINVAL);
2426 	}
2427 
2428 	return (do_jail_attach(td, pr));
2429 }
2430 
2431 static int
do_jail_attach(struct thread * td,struct prison * pr)2432 do_jail_attach(struct thread *td, struct prison *pr)
2433 {
2434 	struct proc *p;
2435 	struct ucred *newcred, *oldcred;
2436 	int error;
2437 
2438 	/*
2439 	 * XXX: Note that there is a slight race here if two threads
2440 	 * in the same privileged process attempt to attach to two
2441 	 * different jails at the same time.  It is important for
2442 	 * user processes not to do this, or they might end up with
2443 	 * a process root from one prison, but attached to the jail
2444 	 * of another.
2445 	 */
2446 	pr->pr_ref++;
2447 	pr->pr_uref++;
2448 	mtx_unlock(&pr->pr_mtx);
2449 
2450 	/* Let modules do whatever they need to prepare for attaching. */
2451 	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2452 	if (error) {
2453 		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
2454 		return (error);
2455 	}
2456 	sx_sunlock(&allprison_lock);
2457 
2458 	/*
2459 	 * Reparent the newly attached process to this jail.
2460 	 */
2461 	p = td->td_proc;
2462 	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2463 	if (error)
2464 		goto e_revert_osd;
2465 
2466 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2467 	if ((error = change_dir(pr->pr_root, td)) != 0)
2468 		goto e_unlock;
2469 #ifdef MAC
2470 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2471 		goto e_unlock;
2472 #endif
2473 	VOP_UNLOCK(pr->pr_root, 0);
2474 	if ((error = change_root(pr->pr_root, td)))
2475 		goto e_revert_osd;
2476 
2477 	newcred = crget();
2478 	PROC_LOCK(p);
2479 	oldcred = crcopysafe(p, newcred);
2480 	newcred->cr_prison = pr;
2481 	proc_set_cred(p, newcred);
2482 	setsugid(p);
2483 	PROC_UNLOCK(p);
2484 #ifdef RACCT
2485 	racct_proc_ucred_changed(p, oldcred, newcred);
2486 #endif
2487 	prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
2488 	crfree(oldcred);
2489 	return (0);
2490 
2491  e_unlock:
2492 	VOP_UNLOCK(pr->pr_root, 0);
2493  e_revert_osd:
2494 	/* Tell modules this thread is still in its old jail after all. */
2495 	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
2496 	prison_deref(pr, PD_DEREF | PD_DEUREF);
2497 	return (error);
2498 }
2499 
2500 
2501 /*
2502  * Returns a locked prison instance, or NULL on failure.
2503  */
2504 struct prison *
prison_find(int prid)2505 prison_find(int prid)
2506 {
2507 	struct prison *pr;
2508 
2509 	sx_assert(&allprison_lock, SX_LOCKED);
2510 	TAILQ_FOREACH(pr, &allprison, pr_list) {
2511 		if (pr->pr_id == prid) {
2512 			mtx_lock(&pr->pr_mtx);
2513 			if (pr->pr_ref > 0)
2514 				return (pr);
2515 			mtx_unlock(&pr->pr_mtx);
2516 		}
2517 	}
2518 	return (NULL);
2519 }
2520 
2521 /*
2522  * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2523  */
2524 struct prison *
prison_find_child(struct prison * mypr,int prid)2525 prison_find_child(struct prison *mypr, int prid)
2526 {
2527 	struct prison *pr;
2528 	int descend;
2529 
2530 	sx_assert(&allprison_lock, SX_LOCKED);
2531 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2532 		if (pr->pr_id == prid) {
2533 			mtx_lock(&pr->pr_mtx);
2534 			if (pr->pr_ref > 0)
2535 				return (pr);
2536 			mtx_unlock(&pr->pr_mtx);
2537 		}
2538 	}
2539 	return (NULL);
2540 }
2541 
2542 /*
2543  * Look for the name relative to mypr.  Returns a locked prison or NULL.
2544  */
2545 struct prison *
prison_find_name(struct prison * mypr,const char * name)2546 prison_find_name(struct prison *mypr, const char *name)
2547 {
2548 	struct prison *pr, *deadpr;
2549 	size_t mylen;
2550 	int descend;
2551 
2552 	sx_assert(&allprison_lock, SX_LOCKED);
2553 	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2554  again:
2555 	deadpr = NULL;
2556 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2557 		if (!strcmp(pr->pr_name + mylen, name)) {
2558 			mtx_lock(&pr->pr_mtx);
2559 			if (pr->pr_ref > 0) {
2560 				if (pr->pr_uref > 0)
2561 					return (pr);
2562 				deadpr = pr;
2563 			}
2564 			mtx_unlock(&pr->pr_mtx);
2565 		}
2566 	}
2567 	/* There was no valid prison - perhaps there was a dying one. */
2568 	if (deadpr != NULL) {
2569 		mtx_lock(&deadpr->pr_mtx);
2570 		if (deadpr->pr_ref == 0) {
2571 			mtx_unlock(&deadpr->pr_mtx);
2572 			goto again;
2573 		}
2574 	}
2575 	return (deadpr);
2576 }
2577 
2578 /*
2579  * See if a prison has the specific flag set.
2580  */
2581 int
prison_flag(struct ucred * cred,unsigned flag)2582 prison_flag(struct ucred *cred, unsigned flag)
2583 {
2584 
2585 	/* This is an atomic read, so no locking is necessary. */
2586 	return (cred->cr_prison->pr_flags & flag);
2587 }
2588 
2589 int
prison_allow(struct ucred * cred,unsigned flag)2590 prison_allow(struct ucred *cred, unsigned flag)
2591 {
2592 
2593 	/* This is an atomic read, so no locking is necessary. */
2594 	return (cred->cr_prison->pr_allow & flag);
2595 }
2596 
2597 /*
2598  * Remove a prison reference.  If that was the last reference, remove the
2599  * prison itself - but not in this context in case there are locks held.
2600  */
2601 void
prison_free_locked(struct prison * pr)2602 prison_free_locked(struct prison *pr)
2603 {
2604 	int ref;
2605 
2606 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2607 	ref = --pr->pr_ref;
2608 	mtx_unlock(&pr->pr_mtx);
2609 	if (ref == 0)
2610 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2611 }
2612 
2613 void
prison_free(struct prison * pr)2614 prison_free(struct prison *pr)
2615 {
2616 
2617 	mtx_lock(&pr->pr_mtx);
2618 	prison_free_locked(pr);
2619 }
2620 
2621 /*
2622  * Complete a call to either prison_free or prison_proc_free.
2623  */
2624 static void
prison_complete(void * context,int pending)2625 prison_complete(void *context, int pending)
2626 {
2627 	struct prison *pr = context;
2628 
2629 	sx_xlock(&allprison_lock);
2630 	mtx_lock(&pr->pr_mtx);
2631 	prison_deref(pr, pr->pr_uref
2632 	    ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
2633 	    : PD_LOCKED | PD_LIST_XLOCKED);
2634 }
2635 
2636 /*
2637  * Remove a prison reference (usually).  This internal version assumes no
2638  * mutexes are held, except perhaps the prison itself.  If there are no more
2639  * references, release and delist the prison.  On completion, the prison lock
2640  * and the allprison lock are both unlocked.
2641  */
2642 static void
prison_deref(struct prison * pr,int flags)2643 prison_deref(struct prison *pr, int flags)
2644 {
2645 	struct prison *ppr, *tpr;
2646 	int ref, lasturef;
2647 
2648 	if (!(flags & PD_LOCKED))
2649 		mtx_lock(&pr->pr_mtx);
2650 	for (;;) {
2651 		if (flags & PD_DEUREF) {
2652 			KASSERT(pr->pr_uref > 0,
2653 			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
2654 			     pr->pr_id));
2655 			pr->pr_uref--;
2656 			lasturef = pr->pr_uref == 0;
2657 			if (lasturef)
2658 				pr->pr_ref++;
2659 			KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
2660 		} else
2661 			lasturef = 0;
2662 		if (flags & PD_DEREF) {
2663 			KASSERT(pr->pr_ref > 0,
2664 			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
2665 			     pr->pr_id));
2666 			pr->pr_ref--;
2667 		}
2668 		ref = pr->pr_ref;
2669 		mtx_unlock(&pr->pr_mtx);
2670 
2671 		/*
2672 		 * Tell the modules if the last user reference was removed
2673 		 * (even it sticks around in dying state).
2674 		 */
2675 		if (lasturef) {
2676 			if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
2677 				sx_xlock(&allprison_lock);
2678 				flags |= PD_LIST_XLOCKED;
2679 			}
2680 			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
2681 			mtx_lock(&pr->pr_mtx);
2682 			ref = --pr->pr_ref;
2683 			mtx_unlock(&pr->pr_mtx);
2684 		}
2685 
2686 		/* If the prison still has references, nothing else to do. */
2687 		if (ref > 0) {
2688 			if (flags & PD_LIST_SLOCKED)
2689 				sx_sunlock(&allprison_lock);
2690 			else if (flags & PD_LIST_XLOCKED)
2691 				sx_xunlock(&allprison_lock);
2692 			return;
2693 		}
2694 
2695 		if (flags & PD_LIST_SLOCKED) {
2696 			if (!sx_try_upgrade(&allprison_lock)) {
2697 				sx_sunlock(&allprison_lock);
2698 				sx_xlock(&allprison_lock);
2699 			}
2700 		} else if (!(flags & PD_LIST_XLOCKED))
2701 			sx_xlock(&allprison_lock);
2702 
2703 		TAILQ_REMOVE(&allprison, pr, pr_list);
2704 		LIST_REMOVE(pr, pr_sibling);
2705 		ppr = pr->pr_parent;
2706 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
2707 			tpr->pr_childcount--;
2708 		sx_xunlock(&allprison_lock);
2709 
2710 #ifdef VIMAGE
2711 		if (pr->pr_vnet != ppr->pr_vnet)
2712 			vnet_destroy(pr->pr_vnet);
2713 #endif
2714 		if (pr->pr_root != NULL)
2715 			vrele(pr->pr_root);
2716 		mtx_destroy(&pr->pr_mtx);
2717 #ifdef INET
2718 		free(pr->pr_ip4, M_PRISON);
2719 #endif
2720 #ifdef INET6
2721 		free(pr->pr_ip6, M_PRISON);
2722 #endif
2723 		if (pr->pr_cpuset != NULL)
2724 			cpuset_rel(pr->pr_cpuset);
2725 		osd_jail_exit(pr);
2726 #ifdef RACCT
2727 		if (racct_enable)
2728 			prison_racct_detach(pr);
2729 #endif
2730 		free(pr, M_PRISON);
2731 
2732 		/* Removing a prison frees a reference on its parent. */
2733 		pr = ppr;
2734 		mtx_lock(&pr->pr_mtx);
2735 		flags = PD_DEREF | PD_DEUREF;
2736 	}
2737 }
2738 
2739 void
prison_hold_locked(struct prison * pr)2740 prison_hold_locked(struct prison *pr)
2741 {
2742 
2743 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2744 	KASSERT(pr->pr_ref > 0,
2745 	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
2746 	pr->pr_ref++;
2747 }
2748 
2749 void
prison_hold(struct prison * pr)2750 prison_hold(struct prison *pr)
2751 {
2752 
2753 	mtx_lock(&pr->pr_mtx);
2754 	prison_hold_locked(pr);
2755 	mtx_unlock(&pr->pr_mtx);
2756 }
2757 
2758 void
prison_proc_hold(struct prison * pr)2759 prison_proc_hold(struct prison *pr)
2760 {
2761 
2762 	mtx_lock(&pr->pr_mtx);
2763 	KASSERT(pr->pr_uref > 0,
2764 	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2765 	pr->pr_uref++;
2766 	mtx_unlock(&pr->pr_mtx);
2767 }
2768 
2769 void
prison_proc_free(struct prison * pr)2770 prison_proc_free(struct prison *pr)
2771 {
2772 
2773 	mtx_lock(&pr->pr_mtx);
2774 	KASSERT(pr->pr_uref > 0,
2775 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2776 	if (pr->pr_uref > 1)
2777 		pr->pr_uref--;
2778 	else {
2779 		/*
2780 		 * Don't remove the last user reference in this context, which
2781 		 * is expected to be a process that is not only locked, but
2782 		 * also half dead.
2783 		 */
2784 		pr->pr_ref++;
2785 		mtx_unlock(&pr->pr_mtx);
2786 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2787 		return;
2788 	}
2789 	mtx_unlock(&pr->pr_mtx);
2790 }
2791 
2792 
2793 #ifdef INET
2794 /*
2795  * Restrict a prison's IP address list with its parent's, possibly replacing
2796  * it.  Return true if the replacement buffer was used (or would have been).
2797  */
2798 static int
prison_restrict_ip4(struct prison * pr,struct in_addr * newip4)2799 prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
2800 {
2801 	int ii, ij, used;
2802 	struct prison *ppr;
2803 
2804 	ppr = pr->pr_parent;
2805 	if (!(pr->pr_flags & PR_IP4_USER)) {
2806 		/* This has no user settings, so just copy the parent's list. */
2807 		if (pr->pr_ip4s < ppr->pr_ip4s) {
2808 			/*
2809 			 * There's no room for the parent's list.  Use the
2810 			 * new list buffer, which is assumed to be big enough
2811 			 * (if it was passed).  If there's no buffer, try to
2812 			 * allocate one.
2813 			 */
2814 			used = 1;
2815 			if (newip4 == NULL) {
2816 				newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
2817 				    M_PRISON, M_NOWAIT);
2818 				if (newip4 != NULL)
2819 					used = 0;
2820 			}
2821 			if (newip4 != NULL) {
2822 				bcopy(ppr->pr_ip4, newip4,
2823 				    ppr->pr_ip4s * sizeof(*newip4));
2824 				free(pr->pr_ip4, M_PRISON);
2825 				pr->pr_ip4 = newip4;
2826 				pr->pr_ip4s = ppr->pr_ip4s;
2827 			}
2828 			return (used);
2829 		}
2830 		pr->pr_ip4s = ppr->pr_ip4s;
2831 		if (pr->pr_ip4s > 0)
2832 			bcopy(ppr->pr_ip4, pr->pr_ip4,
2833 			    pr->pr_ip4s * sizeof(*newip4));
2834 		else if (pr->pr_ip4 != NULL) {
2835 			free(pr->pr_ip4, M_PRISON);
2836 			pr->pr_ip4 = NULL;
2837 		}
2838 	} else if (pr->pr_ip4s > 0) {
2839 		/* Remove addresses that aren't in the parent. */
2840 		for (ij = 0; ij < ppr->pr_ip4s; ij++)
2841 			if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
2842 				break;
2843 		if (ij < ppr->pr_ip4s)
2844 			ii = 1;
2845 		else {
2846 			bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
2847 			    --pr->pr_ip4s * sizeof(*pr->pr_ip4));
2848 			ii = 0;
2849 		}
2850 		for (ij = 1; ii < pr->pr_ip4s; ) {
2851 			if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
2852 				ii++;
2853 				continue;
2854 			}
2855 			switch (ij >= ppr->pr_ip4s ? -1 :
2856 				qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
2857 			case -1:
2858 				bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
2859 				    (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
2860 				break;
2861 			case 0:
2862 				ii++;
2863 				ij++;
2864 				break;
2865 			case 1:
2866 				ij++;
2867 				break;
2868 			}
2869 		}
2870 		if (pr->pr_ip4s == 0) {
2871 			pr->pr_flags |= PR_IP4_DISABLE;
2872 			free(pr->pr_ip4, M_PRISON);
2873 			pr->pr_ip4 = NULL;
2874 		}
2875 	}
2876 	return (0);
2877 }
2878 
2879 /*
2880  * Pass back primary IPv4 address of this jail.
2881  *
2882  * If not restricted return success but do not alter the address.  Caller has
2883  * to make sure to initialize it correctly (e.g. INADDR_ANY).
2884  *
2885  * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
2886  * Address returned in NBO.
2887  */
2888 int
prison_get_ip4(struct ucred * cred,struct in_addr * ia)2889 prison_get_ip4(struct ucred *cred, struct in_addr *ia)
2890 {
2891 	struct prison *pr;
2892 
2893 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2894 	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2895 
2896 	pr = cred->cr_prison;
2897 	if (!(pr->pr_flags & PR_IP4))
2898 		return (0);
2899 	mtx_lock(&pr->pr_mtx);
2900 	if (!(pr->pr_flags & PR_IP4)) {
2901 		mtx_unlock(&pr->pr_mtx);
2902 		return (0);
2903 	}
2904 	if (pr->pr_ip4 == NULL) {
2905 		mtx_unlock(&pr->pr_mtx);
2906 		return (EAFNOSUPPORT);
2907 	}
2908 
2909 	ia->s_addr = pr->pr_ip4[0].s_addr;
2910 	mtx_unlock(&pr->pr_mtx);
2911 	return (0);
2912 }
2913 
2914 /*
2915  * Return 1 if we should do proper source address selection or are not jailed.
2916  * We will return 0 if we should bypass source address selection in favour
2917  * of the primary jail IPv4 address. Only in this case *ia will be updated and
2918  * returned in NBO.
2919  * Return EAFNOSUPPORT, in case this jail does not allow IPv4.
2920  */
2921 int
prison_saddrsel_ip4(struct ucred * cred,struct in_addr * ia)2922 prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
2923 {
2924 	struct prison *pr;
2925 	struct in_addr lia;
2926 	int error;
2927 
2928 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2929 	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2930 
2931 	if (!jailed(cred))
2932 		return (1);
2933 
2934 	pr = cred->cr_prison;
2935 	if (pr->pr_flags & PR_IP4_SADDRSEL)
2936 		return (1);
2937 
2938 	lia.s_addr = INADDR_ANY;
2939 	error = prison_get_ip4(cred, &lia);
2940 	if (error)
2941 		return (error);
2942 	if (lia.s_addr == INADDR_ANY)
2943 		return (1);
2944 
2945 	ia->s_addr = lia.s_addr;
2946 	return (0);
2947 }
2948 
2949 /*
2950  * Return true if pr1 and pr2 have the same IPv4 address restrictions.
2951  */
2952 int
prison_equal_ip4(struct prison * pr1,struct prison * pr2)2953 prison_equal_ip4(struct prison *pr1, struct prison *pr2)
2954 {
2955 
2956 	if (pr1 == pr2)
2957 		return (1);
2958 
2959 	/*
2960 	 * No need to lock since the PR_IP4_USER flag can't be altered for
2961 	 * existing prisons.
2962 	 */
2963 	while (pr1 != &prison0 &&
2964 #ifdef VIMAGE
2965 	       !(pr1->pr_flags & PR_VNET) &&
2966 #endif
2967 	       !(pr1->pr_flags & PR_IP4_USER))
2968 		pr1 = pr1->pr_parent;
2969 	while (pr2 != &prison0 &&
2970 #ifdef VIMAGE
2971 	       !(pr2->pr_flags & PR_VNET) &&
2972 #endif
2973 	       !(pr2->pr_flags & PR_IP4_USER))
2974 		pr2 = pr2->pr_parent;
2975 	return (pr1 == pr2);
2976 }
2977 
2978 /*
2979  * Make sure our (source) address is set to something meaningful to this
2980  * jail.
2981  *
2982  * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
2983  * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
2984  * doesn't allow IPv4.  Address passed in in NBO and returned in NBO.
2985  */
2986 int
prison_local_ip4(struct ucred * cred,struct in_addr * ia)2987 prison_local_ip4(struct ucred *cred, struct in_addr *ia)
2988 {
2989 	struct prison *pr;
2990 	struct in_addr ia0;
2991 	int error;
2992 
2993 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2994 	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
2995 
2996 	pr = cred->cr_prison;
2997 	if (!(pr->pr_flags & PR_IP4))
2998 		return (0);
2999 	mtx_lock(&pr->pr_mtx);
3000 	if (!(pr->pr_flags & PR_IP4)) {
3001 		mtx_unlock(&pr->pr_mtx);
3002 		return (0);
3003 	}
3004 	if (pr->pr_ip4 == NULL) {
3005 		mtx_unlock(&pr->pr_mtx);
3006 		return (EAFNOSUPPORT);
3007 	}
3008 
3009 	ia0.s_addr = ntohl(ia->s_addr);
3010 	if (ia0.s_addr == INADDR_LOOPBACK) {
3011 		ia->s_addr = pr->pr_ip4[0].s_addr;
3012 		mtx_unlock(&pr->pr_mtx);
3013 		return (0);
3014 	}
3015 
3016 	if (ia0.s_addr == INADDR_ANY) {
3017 		/*
3018 		 * In case there is only 1 IPv4 address, bind directly.
3019 		 */
3020 		if (pr->pr_ip4s == 1)
3021 			ia->s_addr = pr->pr_ip4[0].s_addr;
3022 		mtx_unlock(&pr->pr_mtx);
3023 		return (0);
3024 	}
3025 
3026 	error = _prison_check_ip4(pr, ia);
3027 	mtx_unlock(&pr->pr_mtx);
3028 	return (error);
3029 }
3030 
3031 /*
3032  * Rewrite destination address in case we will connect to loopback address.
3033  *
3034  * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
3035  * Address passed in in NBO and returned in NBO.
3036  */
3037 int
prison_remote_ip4(struct ucred * cred,struct in_addr * ia)3038 prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
3039 {
3040 	struct prison *pr;
3041 
3042 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3043 	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
3044 
3045 	pr = cred->cr_prison;
3046 	if (!(pr->pr_flags & PR_IP4))
3047 		return (0);
3048 	mtx_lock(&pr->pr_mtx);
3049 	if (!(pr->pr_flags & PR_IP4)) {
3050 		mtx_unlock(&pr->pr_mtx);
3051 		return (0);
3052 	}
3053 	if (pr->pr_ip4 == NULL) {
3054 		mtx_unlock(&pr->pr_mtx);
3055 		return (EAFNOSUPPORT);
3056 	}
3057 
3058 	if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
3059 		ia->s_addr = pr->pr_ip4[0].s_addr;
3060 		mtx_unlock(&pr->pr_mtx);
3061 		return (0);
3062 	}
3063 
3064 	/*
3065 	 * Return success because nothing had to be changed.
3066 	 */
3067 	mtx_unlock(&pr->pr_mtx);
3068 	return (0);
3069 }
3070 
3071 /*
3072  * Check if given address belongs to the jail referenced by cred/prison.
3073  *
3074  * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
3075  * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3076  * doesn't allow IPv4.  Address passed in in NBO.
3077  */
3078 static int
_prison_check_ip4(struct prison * pr,struct in_addr * ia)3079 _prison_check_ip4(struct prison *pr, struct in_addr *ia)
3080 {
3081 	int i, a, z, d;
3082 
3083 	/*
3084 	 * Check the primary IP.
3085 	 */
3086 	if (pr->pr_ip4[0].s_addr == ia->s_addr)
3087 		return (0);
3088 
3089 	/*
3090 	 * All the other IPs are sorted so we can do a binary search.
3091 	 */
3092 	a = 0;
3093 	z = pr->pr_ip4s - 2;
3094 	while (a <= z) {
3095 		i = (a + z) / 2;
3096 		d = qcmp_v4(&pr->pr_ip4[i+1], ia);
3097 		if (d > 0)
3098 			z = i - 1;
3099 		else if (d < 0)
3100 			a = i + 1;
3101 		else
3102 			return (0);
3103 	}
3104 
3105 	return (EADDRNOTAVAIL);
3106 }
3107 
3108 int
prison_check_ip4(struct ucred * cred,struct in_addr * ia)3109 prison_check_ip4(struct ucred *cred, struct in_addr *ia)
3110 {
3111 	struct prison *pr;
3112 	int error;
3113 
3114 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3115 	KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
3116 
3117 	pr = cred->cr_prison;
3118 	if (!(pr->pr_flags & PR_IP4))
3119 		return (0);
3120 	mtx_lock(&pr->pr_mtx);
3121 	if (!(pr->pr_flags & PR_IP4)) {
3122 		mtx_unlock(&pr->pr_mtx);
3123 		return (0);
3124 	}
3125 	if (pr->pr_ip4 == NULL) {
3126 		mtx_unlock(&pr->pr_mtx);
3127 		return (EAFNOSUPPORT);
3128 	}
3129 
3130 	error = _prison_check_ip4(pr, ia);
3131 	mtx_unlock(&pr->pr_mtx);
3132 	return (error);
3133 }
3134 #endif
3135 
3136 #ifdef INET6
3137 static int
prison_restrict_ip6(struct prison * pr,struct in6_addr * newip6)3138 prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
3139 {
3140 	int ii, ij, used;
3141 	struct prison *ppr;
3142 
3143 	ppr = pr->pr_parent;
3144 	if (!(pr->pr_flags & PR_IP6_USER)) {
3145 		/* This has no user settings, so just copy the parent's list. */
3146 		if (pr->pr_ip6s < ppr->pr_ip6s) {
3147 			/*
3148 			 * There's no room for the parent's list.  Use the
3149 			 * new list buffer, which is assumed to be big enough
3150 			 * (if it was passed).  If there's no buffer, try to
3151 			 * allocate one.
3152 			 */
3153 			used = 1;
3154 			if (newip6 == NULL) {
3155 				newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
3156 				    M_PRISON, M_NOWAIT);
3157 				if (newip6 != NULL)
3158 					used = 0;
3159 			}
3160 			if (newip6 != NULL) {
3161 				bcopy(ppr->pr_ip6, newip6,
3162 				    ppr->pr_ip6s * sizeof(*newip6));
3163 				free(pr->pr_ip6, M_PRISON);
3164 				pr->pr_ip6 = newip6;
3165 				pr->pr_ip6s = ppr->pr_ip6s;
3166 			}
3167 			return (used);
3168 		}
3169 		pr->pr_ip6s = ppr->pr_ip6s;
3170 		if (pr->pr_ip6s > 0)
3171 			bcopy(ppr->pr_ip6, pr->pr_ip6,
3172 			    pr->pr_ip6s * sizeof(*newip6));
3173 		else if (pr->pr_ip6 != NULL) {
3174 			free(pr->pr_ip6, M_PRISON);
3175 			pr->pr_ip6 = NULL;
3176 		}
3177 	} else if (pr->pr_ip6s > 0) {
3178 		/* Remove addresses that aren't in the parent. */
3179 		for (ij = 0; ij < ppr->pr_ip6s; ij++)
3180 			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
3181 			    &ppr->pr_ip6[ij]))
3182 				break;
3183 		if (ij < ppr->pr_ip6s)
3184 			ii = 1;
3185 		else {
3186 			bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
3187 			    --pr->pr_ip6s * sizeof(*pr->pr_ip6));
3188 			ii = 0;
3189 		}
3190 		for (ij = 1; ii < pr->pr_ip6s; ) {
3191 			if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
3192 			    &ppr->pr_ip6[0])) {
3193 				ii++;
3194 				continue;
3195 			}
3196 			switch (ij >= ppr->pr_ip6s ? -1 :
3197 				qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
3198 			case -1:
3199 				bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
3200 				    (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
3201 				break;
3202 			case 0:
3203 				ii++;
3204 				ij++;
3205 				break;
3206 			case 1:
3207 				ij++;
3208 				break;
3209 			}
3210 		}
3211 		if (pr->pr_ip6s == 0) {
3212 			pr->pr_flags |= PR_IP6_DISABLE;
3213 			free(pr->pr_ip6, M_PRISON);
3214 			pr->pr_ip6 = NULL;
3215 		}
3216 	}
3217 	return 0;
3218 }
3219 
3220 /*
3221  * Pass back primary IPv6 address for this jail.
3222  *
3223  * If not restricted return success but do not alter the address.  Caller has
3224  * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
3225  *
3226  * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3227  */
3228 int
prison_get_ip6(struct ucred * cred,struct in6_addr * ia6)3229 prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
3230 {
3231 	struct prison *pr;
3232 
3233 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3234 	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3235 
3236 	pr = cred->cr_prison;
3237 	if (!(pr->pr_flags & PR_IP6))
3238 		return (0);
3239 	mtx_lock(&pr->pr_mtx);
3240 	if (!(pr->pr_flags & PR_IP6)) {
3241 		mtx_unlock(&pr->pr_mtx);
3242 		return (0);
3243 	}
3244 	if (pr->pr_ip6 == NULL) {
3245 		mtx_unlock(&pr->pr_mtx);
3246 		return (EAFNOSUPPORT);
3247 	}
3248 
3249 	bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3250 	mtx_unlock(&pr->pr_mtx);
3251 	return (0);
3252 }
3253 
3254 /*
3255  * Return 1 if we should do proper source address selection or are not jailed.
3256  * We will return 0 if we should bypass source address selection in favour
3257  * of the primary jail IPv6 address. Only in this case *ia will be updated and
3258  * returned in NBO.
3259  * Return EAFNOSUPPORT, in case this jail does not allow IPv6.
3260  */
3261 int
prison_saddrsel_ip6(struct ucred * cred,struct in6_addr * ia6)3262 prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
3263 {
3264 	struct prison *pr;
3265 	struct in6_addr lia6;
3266 	int error;
3267 
3268 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3269 	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3270 
3271 	if (!jailed(cred))
3272 		return (1);
3273 
3274 	pr = cred->cr_prison;
3275 	if (pr->pr_flags & PR_IP6_SADDRSEL)
3276 		return (1);
3277 
3278 	lia6 = in6addr_any;
3279 	error = prison_get_ip6(cred, &lia6);
3280 	if (error)
3281 		return (error);
3282 	if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
3283 		return (1);
3284 
3285 	bcopy(&lia6, ia6, sizeof(struct in6_addr));
3286 	return (0);
3287 }
3288 
3289 /*
3290  * Return true if pr1 and pr2 have the same IPv6 address restrictions.
3291  */
3292 int
prison_equal_ip6(struct prison * pr1,struct prison * pr2)3293 prison_equal_ip6(struct prison *pr1, struct prison *pr2)
3294 {
3295 
3296 	if (pr1 == pr2)
3297 		return (1);
3298 
3299 	while (pr1 != &prison0 &&
3300 #ifdef VIMAGE
3301 	       !(pr1->pr_flags & PR_VNET) &&
3302 #endif
3303 	       !(pr1->pr_flags & PR_IP6_USER))
3304 		pr1 = pr1->pr_parent;
3305 	while (pr2 != &prison0 &&
3306 #ifdef VIMAGE
3307 	       !(pr2->pr_flags & PR_VNET) &&
3308 #endif
3309 	       !(pr2->pr_flags & PR_IP6_USER))
3310 		pr2 = pr2->pr_parent;
3311 	return (pr1 == pr2);
3312 }
3313 
3314 /*
3315  * Make sure our (source) address is set to something meaningful to this jail.
3316  *
3317  * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
3318  * when needed while binding.
3319  *
3320  * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3321  * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3322  * doesn't allow IPv6.
3323  */
3324 int
prison_local_ip6(struct ucred * cred,struct in6_addr * ia6,int v6only)3325 prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
3326 {
3327 	struct prison *pr;
3328 	int error;
3329 
3330 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3331 	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3332 
3333 	pr = cred->cr_prison;
3334 	if (!(pr->pr_flags & PR_IP6))
3335 		return (0);
3336 	mtx_lock(&pr->pr_mtx);
3337 	if (!(pr->pr_flags & PR_IP6)) {
3338 		mtx_unlock(&pr->pr_mtx);
3339 		return (0);
3340 	}
3341 	if (pr->pr_ip6 == NULL) {
3342 		mtx_unlock(&pr->pr_mtx);
3343 		return (EAFNOSUPPORT);
3344 	}
3345 
3346 	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3347 		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3348 		mtx_unlock(&pr->pr_mtx);
3349 		return (0);
3350 	}
3351 
3352 	if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
3353 		/*
3354 		 * In case there is only 1 IPv6 address, and v6only is true,
3355 		 * then bind directly.
3356 		 */
3357 		if (v6only != 0 && pr->pr_ip6s == 1)
3358 			bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3359 		mtx_unlock(&pr->pr_mtx);
3360 		return (0);
3361 	}
3362 
3363 	error = _prison_check_ip6(pr, ia6);
3364 	mtx_unlock(&pr->pr_mtx);
3365 	return (error);
3366 }
3367 
3368 /*
3369  * Rewrite destination address in case we will connect to loopback address.
3370  *
3371  * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
3372  */
3373 int
prison_remote_ip6(struct ucred * cred,struct in6_addr * ia6)3374 prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
3375 {
3376 	struct prison *pr;
3377 
3378 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3379 	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3380 
3381 	pr = cred->cr_prison;
3382 	if (!(pr->pr_flags & PR_IP6))
3383 		return (0);
3384 	mtx_lock(&pr->pr_mtx);
3385 	if (!(pr->pr_flags & PR_IP6)) {
3386 		mtx_unlock(&pr->pr_mtx);
3387 		return (0);
3388 	}
3389 	if (pr->pr_ip6 == NULL) {
3390 		mtx_unlock(&pr->pr_mtx);
3391 		return (EAFNOSUPPORT);
3392 	}
3393 
3394 	if (IN6_IS_ADDR_LOOPBACK(ia6)) {
3395 		bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
3396 		mtx_unlock(&pr->pr_mtx);
3397 		return (0);
3398 	}
3399 
3400 	/*
3401 	 * Return success because nothing had to be changed.
3402 	 */
3403 	mtx_unlock(&pr->pr_mtx);
3404 	return (0);
3405 }
3406 
3407 /*
3408  * Check if given address belongs to the jail referenced by cred/prison.
3409  *
3410  * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
3411  * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
3412  * doesn't allow IPv6.
3413  */
3414 static int
_prison_check_ip6(struct prison * pr,struct in6_addr * ia6)3415 _prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
3416 {
3417 	int i, a, z, d;
3418 
3419 	/*
3420 	 * Check the primary IP.
3421 	 */
3422 	if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
3423 		return (0);
3424 
3425 	/*
3426 	 * All the other IPs are sorted so we can do a binary search.
3427 	 */
3428 	a = 0;
3429 	z = pr->pr_ip6s - 2;
3430 	while (a <= z) {
3431 		i = (a + z) / 2;
3432 		d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
3433 		if (d > 0)
3434 			z = i - 1;
3435 		else if (d < 0)
3436 			a = i + 1;
3437 		else
3438 			return (0);
3439 	}
3440 
3441 	return (EADDRNOTAVAIL);
3442 }
3443 
3444 int
prison_check_ip6(struct ucred * cred,struct in6_addr * ia6)3445 prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
3446 {
3447 	struct prison *pr;
3448 	int error;
3449 
3450 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3451 	KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
3452 
3453 	pr = cred->cr_prison;
3454 	if (!(pr->pr_flags & PR_IP6))
3455 		return (0);
3456 	mtx_lock(&pr->pr_mtx);
3457 	if (!(pr->pr_flags & PR_IP6)) {
3458 		mtx_unlock(&pr->pr_mtx);
3459 		return (0);
3460 	}
3461 	if (pr->pr_ip6 == NULL) {
3462 		mtx_unlock(&pr->pr_mtx);
3463 		return (EAFNOSUPPORT);
3464 	}
3465 
3466 	error = _prison_check_ip6(pr, ia6);
3467 	mtx_unlock(&pr->pr_mtx);
3468 	return (error);
3469 }
3470 #endif
3471 
3472 /*
3473  * Check if a jail supports the given address family.
3474  *
3475  * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3476  * if not.
3477  */
3478 int
prison_check_af(struct ucred * cred,int af)3479 prison_check_af(struct ucred *cred, int af)
3480 {
3481 	struct prison *pr;
3482 	int error;
3483 
3484 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3485 
3486 	pr = cred->cr_prison;
3487 #ifdef VIMAGE
3488 	/* Prisons with their own network stack are not limited. */
3489 	if (prison_owns_vnet(cred))
3490 		return (0);
3491 #endif
3492 
3493 	error = 0;
3494 	switch (af)
3495 	{
3496 #ifdef INET
3497 	case AF_INET:
3498 		if (pr->pr_flags & PR_IP4)
3499 		{
3500 			mtx_lock(&pr->pr_mtx);
3501 			if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
3502 				error = EAFNOSUPPORT;
3503 			mtx_unlock(&pr->pr_mtx);
3504 		}
3505 		break;
3506 #endif
3507 #ifdef INET6
3508 	case AF_INET6:
3509 		if (pr->pr_flags & PR_IP6)
3510 		{
3511 			mtx_lock(&pr->pr_mtx);
3512 			if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
3513 				error = EAFNOSUPPORT;
3514 			mtx_unlock(&pr->pr_mtx);
3515 		}
3516 		break;
3517 #endif
3518 	case AF_LOCAL:
3519 	case AF_ROUTE:
3520 		break;
3521 	default:
3522 		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3523 			error = EAFNOSUPPORT;
3524 	}
3525 	return (error);
3526 }
3527 
3528 /*
3529  * Check if given address belongs to the jail referenced by cred (wrapper to
3530  * prison_check_ip[46]).
3531  *
3532  * Returns 0 if jail doesn't restrict the address family or if address belongs
3533  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3534  * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3535  */
3536 int
prison_if(struct ucred * cred,struct sockaddr * sa)3537 prison_if(struct ucred *cred, struct sockaddr *sa)
3538 {
3539 #ifdef INET
3540 	struct sockaddr_in *sai;
3541 #endif
3542 #ifdef INET6
3543 	struct sockaddr_in6 *sai6;
3544 #endif
3545 	int error;
3546 
3547 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3548 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3549 
3550 #ifdef VIMAGE
3551 	if (prison_owns_vnet(cred))
3552 		return (0);
3553 #endif
3554 
3555 	error = 0;
3556 	switch (sa->sa_family)
3557 	{
3558 #ifdef INET
3559 	case AF_INET:
3560 		sai = (struct sockaddr_in *)sa;
3561 		error = prison_check_ip4(cred, &sai->sin_addr);
3562 		break;
3563 #endif
3564 #ifdef INET6
3565 	case AF_INET6:
3566 		sai6 = (struct sockaddr_in6 *)sa;
3567 		error = prison_check_ip6(cred, &sai6->sin6_addr);
3568 		break;
3569 #endif
3570 	default:
3571 		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3572 			error = EAFNOSUPPORT;
3573 	}
3574 	return (error);
3575 }
3576 
3577 /*
3578  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3579  */
3580 int
prison_check(struct ucred * cred1,struct ucred * cred2)3581 prison_check(struct ucred *cred1, struct ucred *cred2)
3582 {
3583 
3584 	return ((cred1->cr_prison == cred2->cr_prison ||
3585 	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3586 }
3587 
3588 /*
3589  * Return 1 if p2 is a child of p1, otherwise 0.
3590  */
3591 int
prison_ischild(struct prison * pr1,struct prison * pr2)3592 prison_ischild(struct prison *pr1, struct prison *pr2)
3593 {
3594 
3595 	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3596 		if (pr1 == pr2)
3597 			return (1);
3598 	return (0);
3599 }
3600 
3601 /*
3602  * Return 1 if the passed credential is in a jail, otherwise 0.
3603  */
3604 int
jailed(struct ucred * cred)3605 jailed(struct ucred *cred)
3606 {
3607 
3608 	return (cred->cr_prison != &prison0);
3609 }
3610 
/*
 * Return 1 if the passed credential is in a jail that does not have its
 * own virtual network stack, otherwise 0.
 */
int
jailed_without_vnet(struct ucred *cred)
{
	int confined;

	confined = jailed(cred) ? 1 : 0;
#ifdef VIMAGE
	if (confined && prison_owns_vnet(cred))
		confined = 0;
#endif
	return (confined);
}
3628 
3629 /*
3630  * Return the correct hostname (domainname, et al) for the passed credential.
3631  */
3632 void
getcredhostname(struct ucred * cred,char * buf,size_t size)3633 getcredhostname(struct ucred *cred, char *buf, size_t size)
3634 {
3635 	struct prison *pr;
3636 
3637 	/*
3638 	 * A NULL credential can be used to shortcut to the physical
3639 	 * system's hostname.
3640 	 */
3641 	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3642 	mtx_lock(&pr->pr_mtx);
3643 	strlcpy(buf, pr->pr_hostname, size);
3644 	mtx_unlock(&pr->pr_mtx);
3645 }
3646 
3647 void
getcreddomainname(struct ucred * cred,char * buf,size_t size)3648 getcreddomainname(struct ucred *cred, char *buf, size_t size)
3649 {
3650 
3651 	mtx_lock(&cred->cr_prison->pr_mtx);
3652 	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3653 	mtx_unlock(&cred->cr_prison->pr_mtx);
3654 }
3655 
3656 void
getcredhostuuid(struct ucred * cred,char * buf,size_t size)3657 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3658 {
3659 
3660 	mtx_lock(&cred->cr_prison->pr_mtx);
3661 	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3662 	mtx_unlock(&cred->cr_prison->pr_mtx);
3663 }
3664 
3665 void
getcredhostid(struct ucred * cred,unsigned long * hostid)3666 getcredhostid(struct ucred *cred, unsigned long *hostid)
3667 {
3668 
3669 	mtx_lock(&cred->cr_prison->pr_mtx);
3670 	*hostid = cred->cr_prison->pr_hostid;
3671 	mtx_unlock(&cred->cr_prison->pr_mtx);
3672 }
3673 
3674 #ifdef VIMAGE
3675 /*
3676  * Determine whether the prison represented by cred owns
3677  * its vnet rather than having it inherited.
3678  *
3679  * Returns 1 in case the prison owns the vnet, 0 otherwise.
3680  */
3681 int
prison_owns_vnet(struct ucred * cred)3682 prison_owns_vnet(struct ucred *cred)
3683 {
3684 
3685 	/*
3686 	 * vnets cannot be added/removed after jail creation,
3687 	 * so no need to lock here.
3688 	 */
3689 	return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
3690 }
3691 #endif
3692 
3693 /*
3694  * Determine whether the subject represented by cred can "see"
3695  * status of a mount point.
3696  * Returns: 0 for permitted, ENOENT otherwise.
3697  * XXX: This function should be called cr_canseemount() and should be
3698  *      placed in kern_prot.c.
3699  */
3700 int
prison_canseemount(struct ucred * cred,struct mount * mp)3701 prison_canseemount(struct ucred *cred, struct mount *mp)
3702 {
3703 	struct prison *pr;
3704 	struct statfs *sp;
3705 	size_t len;
3706 
3707 	pr = cred->cr_prison;
3708 	if (pr->pr_enforce_statfs == 0)
3709 		return (0);
3710 	if (pr->pr_root->v_mount == mp)
3711 		return (0);
3712 	if (pr->pr_enforce_statfs == 2)
3713 		return (ENOENT);
3714 	/*
3715 	 * If jail's chroot directory is set to "/" we should be able to see
3716 	 * all mount-points from inside a jail.
3717 	 * This is ugly check, but this is the only situation when jail's
3718 	 * directory ends with '/'.
3719 	 */
3720 	if (strcmp(pr->pr_path, "/") == 0)
3721 		return (0);
3722 	len = strlen(pr->pr_path);
3723 	sp = &mp->mnt_stat;
3724 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3725 		return (ENOENT);
3726 	/*
3727 	 * Be sure that we don't have situation where jail's root directory
3728 	 * is "/some/path" and mount point is "/some/pathpath".
3729 	 */
3730 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3731 		return (ENOENT);
3732 	return (0);
3733 }
3734 
3735 void
prison_enforce_statfs(struct ucred * cred,struct mount * mp,struct statfs * sp)3736 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3737 {
3738 	char jpath[MAXPATHLEN];
3739 	struct prison *pr;
3740 	size_t len;
3741 
3742 	pr = cred->cr_prison;
3743 	if (pr->pr_enforce_statfs == 0)
3744 		return;
3745 	if (prison_canseemount(cred, mp) != 0) {
3746 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3747 		strlcpy(sp->f_mntonname, "[restricted]",
3748 		    sizeof(sp->f_mntonname));
3749 		return;
3750 	}
3751 	if (pr->pr_root->v_mount == mp) {
3752 		/*
3753 		 * Clear current buffer data, so we are sure nothing from
3754 		 * the valid path left there.
3755 		 */
3756 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3757 		*sp->f_mntonname = '/';
3758 		return;
3759 	}
3760 	/*
3761 	 * If jail's chroot directory is set to "/" we should be able to see
3762 	 * all mount-points from inside a jail.
3763 	 */
3764 	if (strcmp(pr->pr_path, "/") == 0)
3765 		return;
3766 	len = strlen(pr->pr_path);
3767 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3768 	/*
3769 	 * Clear current buffer data, so we are sure nothing from
3770 	 * the valid path left there.
3771 	 */
3772 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3773 	if (*jpath == '\0') {
3774 		/* Should never happen. */
3775 		*sp->f_mntonname = '/';
3776 	} else {
3777 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3778 	}
3779 }
3780 
3781 /*
3782  * Check with permission for a specific privilege is granted within jail.  We
3783  * have a specific list of accepted privileges; the rest are denied.
3784  */
3785 int
prison_priv_check(struct ucred * cred,int priv)3786 prison_priv_check(struct ucred *cred, int priv)
3787 {
3788 
3789 	if (!jailed(cred))
3790 		return (0);
3791 
3792 #ifdef VIMAGE
3793 	/*
3794 	 * Privileges specific to prisons with a virtual network stack.
3795 	 * There might be a duplicate entry here in case the privilege
3796 	 * is only granted conditionally in the legacy jail case.
3797 	 */
3798 	switch (priv) {
3799 #ifdef notyet
3800 		/*
3801 		 * NFS-specific privileges.
3802 		 */
3803 	case PRIV_NFS_DAEMON:
3804 	case PRIV_NFS_LOCKD:
3805 #endif
3806 		/*
3807 		 * Network stack privileges.
3808 		 */
3809 	case PRIV_NET_BRIDGE:
3810 	case PRIV_NET_GRE:
3811 	case PRIV_NET_BPF:
3812 	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
3813 	case PRIV_NET_ROUTE:
3814 	case PRIV_NET_TAP:
3815 	case PRIV_NET_SETIFMTU:
3816 	case PRIV_NET_SETIFFLAGS:
3817 	case PRIV_NET_SETIFCAP:
3818 	case PRIV_NET_SETIFDESCR:
3819 	case PRIV_NET_SETIFNAME	:
3820 	case PRIV_NET_SETIFMETRIC:
3821 	case PRIV_NET_SETIFPHYS:
3822 	case PRIV_NET_SETIFMAC:
3823 	case PRIV_NET_ADDMULTI:
3824 	case PRIV_NET_DELMULTI:
3825 	case PRIV_NET_HWIOCTL:
3826 	case PRIV_NET_SETLLADDR:
3827 	case PRIV_NET_ADDIFGROUP:
3828 	case PRIV_NET_DELIFGROUP:
3829 	case PRIV_NET_IFCREATE:
3830 	case PRIV_NET_IFDESTROY:
3831 	case PRIV_NET_ADDIFADDR:
3832 	case PRIV_NET_DELIFADDR:
3833 	case PRIV_NET_LAGG:
3834 	case PRIV_NET_GIF:
3835 	case PRIV_NET_SETIFVNET:
3836 	case PRIV_NET_SETIFFIB:
3837 
3838 		/*
3839 		 * 802.11-related privileges.
3840 		 */
3841 	case PRIV_NET80211_GETKEY:
3842 #ifdef notyet
3843 	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
3844 #endif
3845 
3846 #ifdef notyet
3847 		/*
3848 		 * AppleTalk privileges.
3849 		 */
3850 	case PRIV_NETATALK_RESERVEDPORT:
3851 
3852 		/*
3853 		 * ATM privileges.
3854 		 */
3855 	case PRIV_NETATM_CFG:
3856 	case PRIV_NETATM_ADD:
3857 	case PRIV_NETATM_DEL:
3858 	case PRIV_NETATM_SET:
3859 
3860 		/*
3861 		 * Bluetooth privileges.
3862 		 */
3863 	case PRIV_NETBLUETOOTH_RAW:
3864 #endif
3865 
3866 		/*
3867 		 * Netgraph and netgraph module privileges.
3868 		 */
3869 	case PRIV_NETGRAPH_CONTROL:
3870 #ifdef notyet
3871 	case PRIV_NETGRAPH_TTY:
3872 #endif
3873 
3874 		/*
3875 		 * IPv4 and IPv6 privileges.
3876 		 */
3877 	case PRIV_NETINET_IPFW:
3878 	case PRIV_NETINET_DIVERT:
3879 	case PRIV_NETINET_PF:
3880 	case PRIV_NETINET_DUMMYNET:
3881 	case PRIV_NETINET_CARP:
3882 	case PRIV_NETINET_MROUTE:
3883 	case PRIV_NETINET_RAW:
3884 	case PRIV_NETINET_ADDRCTRL6:
3885 	case PRIV_NETINET_ND6:
3886 	case PRIV_NETINET_SCOPE6:
3887 	case PRIV_NETINET_ALIFETIME6:
3888 	case PRIV_NETINET_IPSEC:
3889 	case PRIV_NETINET_BINDANY:
3890 
3891 #ifdef notyet
3892 		/*
3893 		 * IPX/SPX privileges.
3894 		 */
3895 	case PRIV_NETIPX_RESERVEDPORT:
3896 	case PRIV_NETIPX_RAW:
3897 
3898 		/*
3899 		 * NCP privileges.
3900 		 */
3901 	case PRIV_NETNCP:
3902 
3903 		/*
3904 		 * SMB privileges.
3905 		 */
3906 	case PRIV_NETSMB:
3907 #endif
3908 
3909 	/*
3910 	 * No default: or deny here.
3911 	 * In case of no permit fall through to next switch().
3912 	 */
3913 		if (cred->cr_prison->pr_flags & PR_VNET)
3914 			return (0);
3915 	}
3916 #endif /* VIMAGE */
3917 
3918 	switch (priv) {
3919 
3920 		/*
3921 		 * Allow ktrace privileges for root in jail.
3922 		 */
3923 	case PRIV_KTRACE:
3924 
3925 #if 0
3926 		/*
3927 		 * Allow jailed processes to configure audit identity and
3928 		 * submit audit records (login, etc).  In the future we may
3929 		 * want to further refine the relationship between audit and
3930 		 * jail.
3931 		 */
3932 	case PRIV_AUDIT_GETAUDIT:
3933 	case PRIV_AUDIT_SETAUDIT:
3934 	case PRIV_AUDIT_SUBMIT:
3935 #endif
3936 
3937 		/*
3938 		 * Allow jailed processes to manipulate process UNIX
3939 		 * credentials in any way they see fit.
3940 		 */
3941 	case PRIV_CRED_SETUID:
3942 	case PRIV_CRED_SETEUID:
3943 	case PRIV_CRED_SETGID:
3944 	case PRIV_CRED_SETEGID:
3945 	case PRIV_CRED_SETGROUPS:
3946 	case PRIV_CRED_SETREUID:
3947 	case PRIV_CRED_SETREGID:
3948 	case PRIV_CRED_SETRESUID:
3949 	case PRIV_CRED_SETRESGID:
3950 
3951 		/*
3952 		 * Jail implements visibility constraints already, so allow
3953 		 * jailed root to override uid/gid-based constraints.
3954 		 */
3955 	case PRIV_SEEOTHERGIDS:
3956 	case PRIV_SEEOTHERUIDS:
3957 
3958 		/*
3959 		 * Jail implements inter-process debugging limits already, so
3960 		 * allow jailed root various debugging privileges.
3961 		 */
3962 	case PRIV_DEBUG_DIFFCRED:
3963 	case PRIV_DEBUG_SUGID:
3964 	case PRIV_DEBUG_UNPRIV:
3965 
3966 		/*
3967 		 * Allow jail to set various resource limits and login
3968 		 * properties, and for now, exceed process resource limits.
3969 		 */
3970 	case PRIV_PROC_LIMIT:
3971 	case PRIV_PROC_SETLOGIN:
3972 	case PRIV_PROC_SETRLIMIT:
3973 
3974 		/*
3975 		 * System V and POSIX IPC privileges are granted in jail.
3976 		 */
3977 	case PRIV_IPC_READ:
3978 	case PRIV_IPC_WRITE:
3979 	case PRIV_IPC_ADMIN:
3980 	case PRIV_IPC_MSGSIZE:
3981 	case PRIV_MQ_ADMIN:
3982 
3983 		/*
3984 		 * Jail operations within a jail work on child jails.
3985 		 */
3986 	case PRIV_JAIL_ATTACH:
3987 	case PRIV_JAIL_SET:
3988 	case PRIV_JAIL_REMOVE:
3989 
3990 		/*
3991 		 * Jail implements its own inter-process limits, so allow
3992 		 * root processes in jail to change scheduling on other
3993 		 * processes in the same jail.  Likewise for signalling.
3994 		 */
3995 	case PRIV_SCHED_DIFFCRED:
3996 	case PRIV_SCHED_CPUSET:
3997 	case PRIV_SIGNAL_DIFFCRED:
3998 	case PRIV_SIGNAL_SUGID:
3999 
4000 		/*
4001 		 * Allow jailed processes to write to sysctls marked as jail
4002 		 * writable.
4003 		 */
4004 	case PRIV_SYSCTL_WRITEJAIL:
4005 
4006 		/*
4007 		 * Allow root in jail to manage a variety of quota
4008 		 * properties.  These should likely be conditional on a
4009 		 * configuration option.
4010 		 */
4011 	case PRIV_VFS_GETQUOTA:
4012 	case PRIV_VFS_SETQUOTA:
4013 
4014 		/*
4015 		 * Since Jail relies on chroot() to implement file system
4016 		 * protections, grant many VFS privileges to root in jail.
4017 		 * Be careful to exclude mount-related and NFS-related
4018 		 * privileges.
4019 		 */
4020 	case PRIV_VFS_READ:
4021 	case PRIV_VFS_WRITE:
4022 	case PRIV_VFS_ADMIN:
4023 	case PRIV_VFS_EXEC:
4024 	case PRIV_VFS_LOOKUP:
4025 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
4026 	case PRIV_VFS_CHFLAGS_DEV:
4027 	case PRIV_VFS_CHOWN:
4028 	case PRIV_VFS_CHROOT:
4029 	case PRIV_VFS_RETAINSUGID:
4030 	case PRIV_VFS_FCHROOT:
4031 	case PRIV_VFS_LINK:
4032 	case PRIV_VFS_SETGID:
4033 	case PRIV_VFS_STAT:
4034 	case PRIV_VFS_STICKYFILE:
4035 
4036 		/*
4037 		 * As in the non-jail case, non-root users are expected to be
4038 		 * able to read kernel/phyiscal memory (provided /dev/[k]mem
4039 		 * exists in the jail and they have permission to access it).
4040 		 */
4041 	case PRIV_KMEM_READ:
4042 		return (0);
4043 
4044 		/*
4045 		 * Depending on the global setting, allow privilege of
4046 		 * setting system flags.
4047 		 */
4048 	case PRIV_VFS_SYSFLAGS:
4049 		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
4050 			return (0);
4051 		else
4052 			return (EPERM);
4053 
4054 		/*
4055 		 * Depending on the global setting, allow privilege of
4056 		 * mounting/unmounting file systems.
4057 		 */
4058 	case PRIV_VFS_MOUNT:
4059 	case PRIV_VFS_UNMOUNT:
4060 	case PRIV_VFS_MOUNT_NONUSER:
4061 	case PRIV_VFS_MOUNT_OWNER:
4062 		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
4063 		    cred->cr_prison->pr_enforce_statfs < 2)
4064 			return (0);
4065 		else
4066 			return (EPERM);
4067 
4068 		/*
4069 		 * Allow jailed root to bind reserved ports and reuse in-use
4070 		 * ports.
4071 		 */
4072 	case PRIV_NETINET_RESERVEDPORT:
4073 	case PRIV_NETINET_REUSEPORT:
4074 		return (0);
4075 
4076 		/*
4077 		 * Allow jailed root to set certain IPv4/6 (option) headers.
4078 		 */
4079 	case PRIV_NETINET_SETHDROPTS:
4080 		return (0);
4081 
4082 		/*
4083 		 * Conditionally allow creating raw sockets in jail.
4084 		 */
4085 	case PRIV_NETINET_RAW:
4086 		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
4087 			return (0);
4088 		else
4089 			return (EPERM);
4090 
4091 		/*
4092 		 * Since jail implements its own visibility limits on netstat
4093 		 * sysctls, allow getcred.  This allows identd to work in
4094 		 * jail.
4095 		 */
4096 	case PRIV_NETINET_GETCRED:
4097 		return (0);
4098 
4099 		/*
4100 		 * Allow jailed root to set loginclass.
4101 		 */
4102 	case PRIV_PROC_SETLOGINCLASS:
4103 		return (0);
4104 
4105 	default:
4106 		/*
4107 		 * In all remaining cases, deny the privilege request.  This
4108 		 * includes almost all network privileges, many system
4109 		 * configuration privileges.
4110 		 */
4111 		return (EPERM);
4112 	}
4113 }
4114 
4115 /*
4116  * Return the part of pr2's name that is relative to pr1, or the whole name
4117  * if it does not directly follow.
4118  */
4119 
4120 char *
prison_name(struct prison * pr1,struct prison * pr2)4121 prison_name(struct prison *pr1, struct prison *pr2)
4122 {
4123 	char *name;
4124 
4125 	/* Jails see themselves as "0" (if they see themselves at all). */
4126 	if (pr1 == pr2)
4127 		return "0";
4128 	name = pr2->pr_name;
4129 	if (prison_ischild(pr1, pr2)) {
4130 		/*
4131 		 * pr1 isn't locked (and allprison_lock may not be either)
4132 		 * so its length can't be counted on.  But the number of dots
4133 		 * can be counted on - and counted.
4134 		 */
4135 		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4136 			name = strchr(name, '.') + 1;
4137 	}
4138 	return (name);
4139 }
4140 
4141 /*
4142  * Return the part of pr2's path that is relative to pr1, or the whole path
4143  * if it does not directly follow.
4144  */
4145 static char *
prison_path(struct prison * pr1,struct prison * pr2)4146 prison_path(struct prison *pr1, struct prison *pr2)
4147 {
4148 	char *path1, *path2;
4149 	int len1;
4150 
4151 	path1 = pr1->pr_path;
4152 	path2 = pr2->pr_path;
4153 	if (!strcmp(path1, "/"))
4154 		return (path2);
4155 	len1 = strlen(path1);
4156 	if (strncmp(path1, path2, len1))
4157 		return (path2);
4158 	if (path2[len1] == '\0')
4159 		return "/";
4160 	if (path2[len1] == '/')
4161 		return (path2 + len1);
4162 	return (path2);
4163 }
4164 
4165 
/*
 * Jail-related sysctls.
 *
 * This declares the "jail" node under _security; the entries registered
 * under _security_jail further down in this file hang off it.
 */
static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
    "Jails");
4171 
4172 static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)4173 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
4174 {
4175 	struct xprison *xp;
4176 	struct prison *pr, *cpr;
4177 #ifdef INET
4178 	struct in_addr *ip4 = NULL;
4179 	int ip4s = 0;
4180 #endif
4181 #ifdef INET6
4182 	struct in6_addr *ip6 = NULL;
4183 	int ip6s = 0;
4184 #endif
4185 	int descend, error;
4186 
4187 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
4188 	pr = req->td->td_ucred->cr_prison;
4189 	error = 0;
4190 	sx_slock(&allprison_lock);
4191 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
4192 #if defined(INET) || defined(INET6)
4193  again:
4194 #endif
4195 		mtx_lock(&cpr->pr_mtx);
4196 #ifdef INET
4197 		if (cpr->pr_ip4s > 0) {
4198 			if (ip4s < cpr->pr_ip4s) {
4199 				ip4s = cpr->pr_ip4s;
4200 				mtx_unlock(&cpr->pr_mtx);
4201 				ip4 = realloc(ip4, ip4s *
4202 				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
4203 				goto again;
4204 			}
4205 			bcopy(cpr->pr_ip4, ip4,
4206 			    cpr->pr_ip4s * sizeof(struct in_addr));
4207 		}
4208 #endif
4209 #ifdef INET6
4210 		if (cpr->pr_ip6s > 0) {
4211 			if (ip6s < cpr->pr_ip6s) {
4212 				ip6s = cpr->pr_ip6s;
4213 				mtx_unlock(&cpr->pr_mtx);
4214 				ip6 = realloc(ip6, ip6s *
4215 				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
4216 				goto again;
4217 			}
4218 			bcopy(cpr->pr_ip6, ip6,
4219 			    cpr->pr_ip6s * sizeof(struct in6_addr));
4220 		}
4221 #endif
4222 		if (cpr->pr_ref == 0) {
4223 			mtx_unlock(&cpr->pr_mtx);
4224 			continue;
4225 		}
4226 		bzero(xp, sizeof(*xp));
4227 		xp->pr_version = XPRISON_VERSION;
4228 		xp->pr_id = cpr->pr_id;
4229 		xp->pr_state = cpr->pr_uref > 0
4230 		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
4231 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
4232 		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
4233 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
4234 #ifdef INET
4235 		xp->pr_ip4s = cpr->pr_ip4s;
4236 #endif
4237 #ifdef INET6
4238 		xp->pr_ip6s = cpr->pr_ip6s;
4239 #endif
4240 		mtx_unlock(&cpr->pr_mtx);
4241 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
4242 		if (error)
4243 			break;
4244 #ifdef INET
4245 		if (xp->pr_ip4s > 0) {
4246 			error = SYSCTL_OUT(req, ip4,
4247 			    xp->pr_ip4s * sizeof(struct in_addr));
4248 			if (error)
4249 				break;
4250 		}
4251 #endif
4252 #ifdef INET6
4253 		if (xp->pr_ip6s > 0) {
4254 			error = SYSCTL_OUT(req, ip6,
4255 			    xp->pr_ip6s * sizeof(struct in6_addr));
4256 			if (error)
4257 				break;
4258 		}
4259 #endif
4260 	}
4261 	sx_sunlock(&allprison_lock);
4262 	free(xp, M_TEMP);
4263 #ifdef INET
4264 	free(ip4, M_TEMP);
4265 #endif
4266 #ifdef INET6
4267 	free(ip6, M_TEMP);
4268 #endif
4269 	return (error);
4270 }
4271 
/*
 * "list" under _security_jail: a read-only (CTLFLAG_RD), CTLFLAG_MPSAFE,
 * structure-typed entry whose contents are produced by sysctl_jail_list().
 */
SYSCTL_OID(_security_jail, OID_AUTO, list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_list, "S", "List of active jails");
4275 
4276 static int
sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)4277 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4278 {
4279 	int error, injail;
4280 
4281 	injail = jailed(req->td->td_ucred);
4282 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4283 
4284 	return (error);
4285 }
4286 
/*
 * "jailed" under _security_jail: a read-only (CTLFLAG_RD), CTLFLAG_MPSAFE
 * integer entry served by sysctl_jail_jailed().
 */
SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_jailed, "I", "Process in jail?");
4290 
4291 static int
sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)4292 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4293 {
4294 	int error, havevnet;
4295 #ifdef VIMAGE
4296 	struct ucred *cred = req->td->td_ucred;
4297 
4298 	havevnet = jailed(cred) && prison_owns_vnet(cred);
4299 #else
4300 	havevnet = 0;
4301 #endif
4302 	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4303 
4304 	return (error);
4305 }
4306 
/*
 * "vnet" under _security_jail: a read-only (CTLFLAG_RD), CTLFLAG_MPSAFE
 * integer entry served by sysctl_jail_vnet().
 */
SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_vnet, "I", "Jail owns VNET?");
4310 
#if defined(INET) || defined(INET6)
/*
 * "jail_max_af_ips" under _security_jail: a read-write (CTLFLAG_RW) unsigned
 * integer entry exposing the jail_max_af_ips variable directly.  It is only
 * present when INET or INET6 is compiled in, and its own description marks
 * it as deprecated.
 */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
4316 
4317 /*
4318  * Default parameters for jail(2) compatibility.  For historical reasons,
4319  * the sysctl names have varying similarity to the parameter names.  Prisons
4320  * just see their own parameters, and can't change them.
4321  */
4322 static int
sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)4323 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4324 {
4325 	struct prison *pr;
4326 	int allow, error, i;
4327 
4328 	pr = req->td->td_ucred->cr_prison;
4329 	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
4330 
4331 	/* Get the current flag value, and convert it to a boolean. */
4332 	i = (allow & arg2) ? 1 : 0;
4333 	if (arg1 != NULL)
4334 		i = !i;
4335 	error = sysctl_handle_int(oidp, &i, 0, req);
4336 	if (error || !req->newptr)
4337 		return (error);
4338 	i = i ? arg2 : 0;
4339 	if (arg1 != NULL)
4340 		i ^= arg2;
4341 	/*
4342 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4343 	 * for writing.
4344 	 */
4345 	mtx_lock(&prison0.pr_mtx);
4346 	jail_default_allow = (jail_default_allow & ~arg2) | i;
4347 	mtx_unlock(&prison0.pr_mtx);
4348 	return (0);
4349 }
4350 
/*
 * Legacy security.jail.* permission knobs.  Every entry is a read-write,
 * MP-safe int handled by sysctl_jail_default_allow(), with the PR_ALLOW_*
 * bit it controls passed as arg2.  arg1 is NULL for all of them except
 * socket_unixiproute_only, whose non-NULL arg1 makes the handler treat the
 * sysctl value as the inverse of the PR_ALLOW_SOCKET_AF bit.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames (deprecated)");
/* Inverted sense: non-NULL arg1. */
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the devfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the fdescfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the nullfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the procfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the linprocfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the linsysfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the tmpfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
    "Processes in jail can mount the zfs file system (deprecated)");
4407 
4408 static int
sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)4409 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4410 {
4411 	struct prison *pr;
4412 	int level, error;
4413 
4414 	pr = req->td->td_ucred->cr_prison;
4415 	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4416 	error = sysctl_handle_int(oidp, &level, 0, req);
4417 	if (error || !req->newptr)
4418 		return (error);
4419 	*(int *)arg1 = level;
4420 	return (0);
4421 }
4422 
/*
 * security.jail.enforce_statfs: read-write int handled by
 * sysctl_jail_default_level().  arg1 is the global default and arg2 the
 * offset of pr_enforce_statfs within struct prison.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
    sysctl_jail_default_level, "I",
    "Processes in jail cannot see all mounted file systems (deprecated)");

/*
 * security.jail.devfs_ruleset: same handler, but registered read-only.
 * arg1 is the global default and arg2 the offset of pr_devfs_rsnum within
 * struct prison.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
    &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
    sysctl_jail_default_level, "I",
    "Ruleset for the devfs filesystem in jail (deprecated)");
4434 
/*
 * Nodes to describe jail parameters.  These sysctls do not carry parameter
 * values: the maximum length of a string parameter is returned in the
 * string itself (as a decimal number), struct parameters report a size, and
 * the other parameters exist merely to make themselves and their types
 * known.  See sysctl_jail_param() for what each type returns.
 */
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
    "Jail parameters");
4442 
4443 int
sysctl_jail_param(SYSCTL_HANDLER_ARGS)4444 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4445 {
4446 	int i;
4447 	long l;
4448 	size_t s;
4449 	char numbuf[12];
4450 
4451 	switch (oidp->oid_kind & CTLTYPE)
4452 	{
4453 	case CTLTYPE_LONG:
4454 	case CTLTYPE_ULONG:
4455 		l = 0;
4456 #ifdef SCTL_MASK32
4457 		if (!(req->flags & SCTL_MASK32))
4458 #endif
4459 			return (SYSCTL_OUT(req, &l, sizeof(l)));
4460 	case CTLTYPE_INT:
4461 	case CTLTYPE_UINT:
4462 		i = 0;
4463 		return (SYSCTL_OUT(req, &i, sizeof(i)));
4464 	case CTLTYPE_STRING:
4465 		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4466 		return
4467 		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4468 	case CTLTYPE_STRUCT:
4469 		s = (size_t)arg2;
4470 		return (SYSCTL_OUT(req, &s, sizeof(s)));
4471 	}
4472 	return (0);
4473 }
4474 
/*
 * Top-level jail parameter declarations.
 *
 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
 * jail creation time but cannot be changed in an existing jail.
 * NOTE(review): by the same convention CTLFLAG_RD presumably marks
 * parameters that can only be read and CTLFLAG_RW ones that may also be
 * changed later; confirm against the SYSCTL_JAIL_PARAM definitions, which
 * are not part of this section.
 */
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail secure level");
SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
    "Jail value for kern.osreldate and uname -K");
SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
    "Jail value for kern.osrelease and uname -r");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Ruleset for in-jail devfs mounts");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail persistence");
#ifdef VIMAGE
/* Only declared when the kernel is built with VIMAGE. */
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
    "E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
    "B", "Jail is in the process of shutting down");
4501 
/* "children" parameter node: current count (RD) and configurable limit (RW). */
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
    "I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Maximum number of child jails");
4507 
/*
 * "host" parameter node: host identity strings (with their maximum lengths)
 * and the numeric host ID, all declared CTLFLAG_RW.
 */
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
    "Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
    "LU", "Jail host ID");
4517 
/* "cpuset" parameter node: only the cpuset ID, declared CTLFLAG_RD. */
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
4520 
/*
 * Per-address-family parameter nodes, each compiled in only when the
 * corresponding protocol option (INET / INET6) is configured.  The addr
 * entries are struct parameters sized as one address of that family.
 */
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv4 source address selection rather than the "
    "primary jail IPv4 address.");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv6 source address selection rather than the "
    "primary jail IPv6 address.");
#endif
4539 
/*
 * "allow" parameter node: one "B"-format, CTLFLAG_RW parameter per jail
 * permission.
 */
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
4553 
/*
 * "allow.mount" subnode.  The entry with an empty name stands for the
 * general mount permission; the named entries cover individual file system
 * types.  All are "B"-format, CTLFLAG_RW parameters.
 */
SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount/unmount jail-friendly file systems in general");
SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount the devfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount the fdescfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount the nullfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount the procfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount the linprocfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount the linsysfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount the tmpfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount the zfs file system");
4573 
4574 #ifdef RACCT
4575 void
prison_racct_foreach(void (* callback)(struct racct * racct,void * arg2,void * arg3),void * arg2,void * arg3)4576 prison_racct_foreach(void (*callback)(struct racct *racct,
4577     void *arg2, void *arg3), void *arg2, void *arg3)
4578 {
4579 	struct prison_racct *prr;
4580 
4581 	ASSERT_RACCT_ENABLED();
4582 
4583 	sx_slock(&allprison_lock);
4584 	LIST_FOREACH(prr, &allprison_racct, prr_next)
4585 		(callback)(prr->prr_racct, arg2, arg3);
4586 	sx_sunlock(&allprison_lock);
4587 }
4588 
4589 static struct prison_racct *
prison_racct_find_locked(const char * name)4590 prison_racct_find_locked(const char *name)
4591 {
4592 	struct prison_racct *prr;
4593 
4594 	ASSERT_RACCT_ENABLED();
4595 	sx_assert(&allprison_lock, SA_XLOCKED);
4596 
4597 	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4598 		return (NULL);
4599 
4600 	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4601 		if (strcmp(name, prr->prr_name) != 0)
4602 			continue;
4603 
4604 		/* Found prison_racct with a matching name? */
4605 		prison_racct_hold(prr);
4606 		return (prr);
4607 	}
4608 
4609 	/* Add new prison_racct. */
4610 	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4611 	racct_create(&prr->prr_racct);
4612 
4613 	strcpy(prr->prr_name, name);
4614 	refcount_init(&prr->prr_refcount, 1);
4615 	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4616 
4617 	return (prr);
4618 }
4619 
4620 struct prison_racct *
prison_racct_find(const char * name)4621 prison_racct_find(const char *name)
4622 {
4623 	struct prison_racct *prr;
4624 
4625 	ASSERT_RACCT_ENABLED();
4626 
4627 	sx_xlock(&allprison_lock);
4628 	prr = prison_racct_find_locked(name);
4629 	sx_xunlock(&allprison_lock);
4630 	return (prr);
4631 }
4632 
4633 void
prison_racct_hold(struct prison_racct * prr)4634 prison_racct_hold(struct prison_racct *prr)
4635 {
4636 
4637 	ASSERT_RACCT_ENABLED();
4638 
4639 	refcount_acquire(&prr->prr_refcount);
4640 }
4641 
4642 static void
prison_racct_free_locked(struct prison_racct * prr)4643 prison_racct_free_locked(struct prison_racct *prr)
4644 {
4645 
4646 	ASSERT_RACCT_ENABLED();
4647 	sx_assert(&allprison_lock, SA_XLOCKED);
4648 
4649 	if (refcount_release(&prr->prr_refcount)) {
4650 		racct_destroy(&prr->prr_racct);
4651 		LIST_REMOVE(prr, prr_next);
4652 		free(prr, M_PRISON_RACCT);
4653 	}
4654 }
4655 
4656 void
prison_racct_free(struct prison_racct * prr)4657 prison_racct_free(struct prison_racct *prr)
4658 {
4659 	int old;
4660 
4661 	ASSERT_RACCT_ENABLED();
4662 	sx_assert(&allprison_lock, SA_UNLOCKED);
4663 
4664 	old = prr->prr_refcount;
4665 	if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
4666 		return;
4667 
4668 	sx_xlock(&allprison_lock);
4669 	prison_racct_free_locked(prr);
4670 	sx_xunlock(&allprison_lock);
4671 }
4672 
4673 static void
prison_racct_attach(struct prison * pr)4674 prison_racct_attach(struct prison *pr)
4675 {
4676 	struct prison_racct *prr;
4677 
4678 	ASSERT_RACCT_ENABLED();
4679 	sx_assert(&allprison_lock, SA_XLOCKED);
4680 
4681 	prr = prison_racct_find_locked(pr->pr_name);
4682 	KASSERT(prr != NULL, ("cannot find prison_racct"));
4683 
4684 	pr->pr_prison_racct = prr;
4685 }
4686 
4687 /*
4688  * Handle jail renaming.  From the racct point of view, renaming means
4689  * moving from one prison_racct to another.
4690  */
4691 static void
prison_racct_modify(struct prison * pr)4692 prison_racct_modify(struct prison *pr)
4693 {
4694 	struct proc *p;
4695 	struct ucred *cred;
4696 	struct prison_racct *oldprr;
4697 
4698 	ASSERT_RACCT_ENABLED();
4699 
4700 	sx_slock(&allproc_lock);
4701 	sx_xlock(&allprison_lock);
4702 
4703 	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4704 		sx_xunlock(&allprison_lock);
4705 		sx_sunlock(&allproc_lock);
4706 		return;
4707 	}
4708 
4709 	oldprr = pr->pr_prison_racct;
4710 	pr->pr_prison_racct = NULL;
4711 
4712 	prison_racct_attach(pr);
4713 
4714 	/*
4715 	 * Move resource utilisation records.
4716 	 */
4717 	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4718 
4719 	/*
4720 	 * Force rctl to reattach rules to processes.
4721 	 */
4722 	FOREACH_PROC_IN_SYSTEM(p) {
4723 		PROC_LOCK(p);
4724 		cred = crhold(p->p_ucred);
4725 		PROC_UNLOCK(p);
4726 		racct_proc_ucred_changed(p, cred, cred);
4727 		crfree(cred);
4728 	}
4729 
4730 	sx_sunlock(&allproc_lock);
4731 	prison_racct_free_locked(oldprr);
4732 	sx_xunlock(&allprison_lock);
4733 }
4734 
4735 static void
prison_racct_detach(struct prison * pr)4736 prison_racct_detach(struct prison *pr)
4737 {
4738 
4739 	ASSERT_RACCT_ENABLED();
4740 	sx_assert(&allprison_lock, SA_UNLOCKED);
4741 
4742 	if (pr->pr_prison_racct == NULL)
4743 		return;
4744 	prison_racct_free(pr->pr_prison_racct);
4745 	pr->pr_prison_racct = NULL;
4746 }
4747 #endif /* RACCT */
4748 
4749 #ifdef DDB
4750 
4751 static void
db_show_prison(struct prison * pr)4752 db_show_prison(struct prison *pr)
4753 {
4754 	int fi;
4755 #if defined(INET) || defined(INET6)
4756 	int ii;
4757 #endif
4758 	unsigned jsf;
4759 #ifdef INET6
4760 	char ip6buf[INET6_ADDRSTRLEN];
4761 #endif
4762 
4763 	db_printf("prison %p:\n", pr);
4764 	db_printf(" jid             = %d\n", pr->pr_id);
4765 	db_printf(" name            = %s\n", pr->pr_name);
4766 	db_printf(" parent          = %p\n", pr->pr_parent);
4767 	db_printf(" ref             = %d\n", pr->pr_ref);
4768 	db_printf(" uref            = %d\n", pr->pr_uref);
4769 	db_printf(" path            = %s\n", pr->pr_path);
4770 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4771 	    ? pr->pr_cpuset->cs_id : -1);
4772 #ifdef VIMAGE
4773 	db_printf(" vnet            = %p\n", pr->pr_vnet);
4774 #endif
4775 	db_printf(" root            = %p\n", pr->pr_root);
4776 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4777 	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
4778 	db_printf(" children.max    = %d\n", pr->pr_childmax);
4779 	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4780 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4781 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4782 	db_printf(" flags           = 0x%x", pr->pr_flags);
4783 	for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
4784 	    fi++)
4785 		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
4786 			db_printf(" %s", pr_flag_names[fi]);
4787 	for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
4788 	    fi++) {
4789 		jsf = pr->pr_flags &
4790 		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
4791 		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
4792 		    pr_flag_jailsys[fi].disable &&
4793 		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
4794 		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
4795 		    : "inherit");
4796 	}
4797 	db_printf(" allow           = 0x%x", pr->pr_allow);
4798 	for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
4799 	    fi++)
4800 		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
4801 			db_printf(" %s", pr_allow_names[fi]);
4802 	db_printf("\n");
4803 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4804 	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4805 	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4806 	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4807 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4808 #ifdef INET
4809 	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
4810 	for (ii = 0; ii < pr->pr_ip4s; ii++)
4811 		db_printf(" %s %s\n",
4812 		    ii == 0 ? "ip4.addr        =" : "                 ",
4813 		    inet_ntoa(pr->pr_ip4[ii]));
4814 #endif
4815 #ifdef INET6
4816 	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
4817 	for (ii = 0; ii < pr->pr_ip6s; ii++)
4818 		db_printf(" %s %s\n",
4819 		    ii == 0 ? "ip6.addr        =" : "                 ",
4820 		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4821 #endif
4822 }
4823 
DB_SHOW_COMMAND(prison,db_show_prison_command)4824 DB_SHOW_COMMAND(prison, db_show_prison_command)
4825 {
4826 	struct prison *pr;
4827 
4828 	if (!have_addr) {
4829 		/*
4830 		 * Show all prisons in the list, and prison0 which is not
4831 		 * listed.
4832 		 */
4833 		db_show_prison(&prison0);
4834 		if (!db_pager_quit) {
4835 			TAILQ_FOREACH(pr, &allprison, pr_list) {
4836 				db_show_prison(pr);
4837 				if (db_pager_quit)
4838 					break;
4839 			}
4840 		}
4841 		return;
4842 	}
4843 
4844 	if (addr == 0)
4845 		pr = &prison0;
4846 	else {
4847 		/* Look for a prison with the ID and with references. */
4848 		TAILQ_FOREACH(pr, &allprison, pr_list)
4849 			if (pr->pr_id == addr && pr->pr_ref > 0)
4850 				break;
4851 		if (pr == NULL)
4852 			/* Look again, without requiring a reference. */
4853 			TAILQ_FOREACH(pr, &allprison, pr_list)
4854 				if (pr->pr_id == addr)
4855 					break;
4856 		if (pr == NULL)
4857 			/* Assume address points to a valid prison. */
4858 			pr = (struct prison *)addr;
4859 	}
4860 	db_show_prison(pr);
4861 }
4862 
4863 #endif /* DDB */
4864