xref: /NextBSD/sys/vm/vm_mmap.c (revision b18d8897b2bd56752e4cc571a6db8a2af9d31f43)
1 /*-
2  * Copyright (c) 1988 University of Utah.
3  * Copyright (c) 1991, 1993
4  *	The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the Systems Programming Group of the University of Utah Computer
8  * Science Department.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
35  *
36  *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
37  */
38 
39 /*
40  * Mapped file (mmap) interface to VM
41  */
42 
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45 
46 #include "opt_compat.h"
47 #include "opt_hwpmc_hooks.h"
48 #include "opt_vm.h"
49 
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/capsicum.h>
53 #include <sys/kernel.h>
54 #include <sys/lock.h>
55 #include <sys/mutex.h>
56 #include <sys/sysproto.h>
57 #include <sys/filedesc.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/procctl.h>
61 #include <sys/racct.h>
62 #include <sys/resource.h>
63 #include <sys/resourcevar.h>
64 #include <sys/rwlock.h>
65 #include <sys/sysctl.h>
66 #include <sys/vnode.h>
67 #include <sys/fcntl.h>
68 #include <sys/file.h>
69 #include <sys/mman.h>
70 #include <sys/mount.h>
71 #include <sys/conf.h>
72 #include <sys/stat.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysent.h>
75 #include <sys/vmmeter.h>
76 
77 #include <security/mac/mac_framework.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_extern.h>
88 #include <vm/vm_page.h>
89 #include <vm/vnode_pager.h>
90 
91 #ifdef HWPMC_HOOKS
92 #include <sys/pmckern.h>
93 #endif
94 
95 int old_mlock = 0;
96 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
97     "Do not apply RLIMIT_MEMLOCK on mlockall");
98 
99 #ifdef MAP_32BIT
100 #define	MAP_32BIT_MAX_ADDR	((vm_offset_t)1 << 31)
101 #endif
102 
103 #ifndef _SYS_SYSPROTO_H_
104 struct sbrk_args {
105 	int incr;
106 };
107 #endif
108 
109 /*
110  * MPSAFE
111  */
112 /* ARGSUSED */
113 int
sys_sbrk(td,uap)114 sys_sbrk(td, uap)
115 	struct thread *td;
116 	struct sbrk_args *uap;
117 {
118 	/* Not yet implemented */
119 	return (EOPNOTSUPP);
120 }
121 
122 #ifndef _SYS_SYSPROTO_H_
123 struct sstk_args {
124 	int incr;
125 };
126 #endif
127 
128 /*
129  * MPSAFE
130  */
131 /* ARGSUSED */
132 int
sys_sstk(td,uap)133 sys_sstk(td, uap)
134 	struct thread *td;
135 	struct sstk_args *uap;
136 {
137 	/* Not yet implemented */
138 	return (EOPNOTSUPP);
139 }
140 
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int dummy;
};
#endif

/*
 * Old (COMPAT_43) getpagesize system call.
 *
 * Stores PAGE_SIZE in td->td_retval[0] and always returns 0.  The uap
 * argument is not used.
 *
 * MPSAFE
 */
int
ogetpagesize(struct thread *td, struct getpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif				/* COMPAT_43 */
158 
159 
160 /*
161  * Memory Map (mmap) system call.  Note that the file offset
162  * and address are allowed to be NOT page aligned, though if
163  * the MAP_FIXED flag is set, both must have the same remainder
164  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
165  * page-aligned, the actual mapping starts at trunc_page(addr)
166  * and the return value is adjusted up by the page offset.
167  *
168  * Generally speaking, only character devices which are themselves
169  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
170  * there would be no cache coherency between a descriptor and a VM mapping
171  * both to the same character device.
172  */
173 #ifndef _SYS_SYSPROTO_H_
174 struct mmap_args {
175 	void *addr;
176 	size_t len;
177 	int prot;
178 	int flags;
179 	int fd;
180 	long pad;
181 	off_t pos;
182 };
183 #endif
184 
185 /*
186  * MPSAFE
187  */
188 int
sys_mmap(td,uap)189 sys_mmap(td, uap)
190 	struct thread *td;
191 	struct mmap_args *uap;
192 {
193 	struct file *fp;
194 	vm_offset_t addr;
195 	vm_size_t size, pageoff;
196 	vm_prot_t cap_maxprot;
197 	int align, error, flags, prot;
198 	off_t pos;
199 	struct vmspace *vms = td->td_proc->p_vmspace;
200 	cap_rights_t rights;
201 
202 	addr = (vm_offset_t) uap->addr;
203 	size = uap->len;
204 	prot = uap->prot;
205 	flags = uap->flags;
206 	pos = uap->pos;
207 
208 	fp = NULL;
209 
210 	/*
211 	 * Ignore old flags that used to be defined but did not do anything.
212 	 */
213 	flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
214 
215 	/*
216 	 * Enforce the constraints.
217 	 * Mapping of length 0 is only allowed for old binaries.
218 	 * Anonymous mapping shall specify -1 as filedescriptor and
219 	 * zero position for new code. Be nice to ancient a.out
220 	 * binaries and correct pos for anonymous mapping, since old
221 	 * ld.so sometimes issues anonymous map requests with non-zero
222 	 * pos.
223 	 */
224 	if (!SV_CURPROC_FLAG(SV_AOUT)) {
225 		if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
226 		    ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
227 			return (EINVAL);
228 	} else {
229 		if ((flags & MAP_ANON) != 0)
230 			pos = 0;
231 	}
232 
233 	if (flags & MAP_STACK) {
234 		if ((uap->fd != -1) ||
235 		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
236 			return (EINVAL);
237 		flags |= MAP_ANON;
238 		pos = 0;
239 	}
240 	if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
241 	    MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
242 	    MAP_PREFAULT_READ |
243 #ifdef MAP_32BIT
244 	    MAP_32BIT |
245 #endif
246 	    MAP_ALIGNMENT_MASK)) != 0)
247 		return (EINVAL);
248 	if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
249 		return (EINVAL);
250 	if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
251 		return (EINVAL);
252 	if (prot != PROT_NONE &&
253 	    (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
254 		return (EINVAL);
255 
256 	/*
257 	 * Align the file position to a page boundary,
258 	 * and save its page offset component.
259 	 */
260 	pageoff = (pos & PAGE_MASK);
261 	pos -= pageoff;
262 
263 	/* Adjust size for rounding (on both ends). */
264 	size += pageoff;			/* low end... */
265 	size = (vm_size_t) round_page(size);	/* hi end */
266 
267 	/* Ensure alignment is at least a page and fits in a pointer. */
268 	align = flags & MAP_ALIGNMENT_MASK;
269 	if (align != 0 && align != MAP_ALIGNED_SUPER &&
270 	    (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
271 	    align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
272 		return (EINVAL);
273 
274 	/*
275 	 * Check for illegal addresses.  Watch out for address wrap... Note
276 	 * that VM_*_ADDRESS are not constants due to casts (argh).
277 	 */
278 	if (flags & MAP_FIXED) {
279 		/*
280 		 * The specified address must have the same remainder
281 		 * as the file offset taken modulo PAGE_SIZE, so it
282 		 * should be aligned after adjustment by pageoff.
283 		 */
284 		addr -= pageoff;
285 		if (addr & PAGE_MASK)
286 			return (EINVAL);
287 
288 		/* Address range must be all in user VM space. */
289 		if (addr < vm_map_min(&vms->vm_map) ||
290 		    addr + size > vm_map_max(&vms->vm_map))
291 			return (EINVAL);
292 		if (addr + size < addr)
293 			return (EINVAL);
294 #ifdef MAP_32BIT
295 		if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
296 			return (EINVAL);
297 	} else if (flags & MAP_32BIT) {
298 		/*
299 		 * For MAP_32BIT, override the hint if it is too high and
300 		 * do not bother moving the mapping past the heap (since
301 		 * the heap is usually above 2GB).
302 		 */
303 		if (addr + size > MAP_32BIT_MAX_ADDR)
304 			addr = 0;
305 #endif
306 	} else {
307 		/*
308 		 * XXX for non-fixed mappings where no hint is provided or
309 		 * the hint would fall in the potential heap space,
310 		 * place it after the end of the largest possible heap.
311 		 *
312 		 * There should really be a pmap call to determine a reasonable
313 		 * location.
314 		 */
315 		if (addr == 0 ||
316 		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
317 		    addr < round_page((vm_offset_t)vms->vm_daddr +
318 		    lim_max(td, RLIMIT_DATA))))
319 			addr = round_page((vm_offset_t)vms->vm_daddr +
320 			    lim_max(td, RLIMIT_DATA));
321 	}
322 	if (size == 0) {
323 		/*
324 		 * Return success without mapping anything for old
325 		 * binaries that request a page-aligned mapping of
326 		 * length 0.  For modern binaries, this function
327 		 * returns an error earlier.
328 		 */
329 		error = 0;
330 	} else if (flags & MAP_ANON) {
331 		/*
332 		 * Mapping blank space is trivial.
333 		 *
334 		 * This relies on VM_PROT_* matching PROT_*.
335 		 */
336 		error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
337 		    VM_PROT_ALL, flags, NULL, pos, FALSE, td);
338 	} else {
339 		/*
340 		 * Mapping file, get fp for validation and don't let the
341 		 * descriptor disappear on us if we block. Check capability
342 		 * rights, but also return the maximum rights to be combined
343 		 * with maxprot later.
344 		 */
345 		cap_rights_init(&rights, CAP_MMAP);
346 		if (prot & PROT_READ)
347 			cap_rights_set(&rights, CAP_MMAP_R);
348 		if ((flags & MAP_SHARED) != 0) {
349 			if (prot & PROT_WRITE)
350 				cap_rights_set(&rights, CAP_MMAP_W);
351 		}
352 		if (prot & PROT_EXEC)
353 			cap_rights_set(&rights, CAP_MMAP_X);
354 		error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
355 		if (error != 0)
356 			goto done;
357 		if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
358 		    td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
359 			error = EINVAL;
360 			goto done;
361 		}
362 
363 		/* This relies on VM_PROT_* matching PROT_*. */
364 		error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
365 		    cap_maxprot, flags, pos, td);
366 	}
367 
368 	if (error == 0)
369 		td->td_retval[0] = (register_t) (addr + pageoff);
370 done:
371 	if (fp)
372 		fdrop(fp, td);
373 
374 	return (error);
375 }
376 
#if defined(COMPAT_FREEBSD6)
/*
 * FreeBSD 6 compatible mmap entry point.
 *
 * Copies the caller's arguments into a struct mmap_args and forwards the
 * request to sys_mmap(), returning its result.  The pad member is left
 * unset.
 */
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args margs;

	margs.fd = uap->fd;
	margs.pos = uap->pos;
	margs.flags = uap->flags;
	margs.prot = uap->prot;
	margs.len = uap->len;
	margs.addr = uap->addr;

	return (sys_mmap(td, &margs));
}
#endif
392 
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif

/*
 * Old (COMPAT_43) mmap entry point.
 *
 * Translates the old protection encoding and the OMAP_* flag bits into
 * their current PROT_* and MAP_* equivalents, then forwards the request
 * to sys_mmap() and returns its result.
 */
int
ommap(struct thread *td, struct ommap_args *uap)
{
	/*
	 * Indexed by the low three bits of the old protection value:
	 * bit 0 selects PROT_EXEC, bit 1 PROT_WRITE and bit 2 PROT_READ.
	 */
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	struct mmap_args nargs;
	int oflags, nflags;

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	/* Translate the flag bits; anything not shared becomes private. */
	oflags = uap->flags;
	nflags = (oflags & OMAP_SHARED) ? MAP_SHARED : MAP_PRIVATE;
	if (oflags & OMAP_ANON)
		nflags |= MAP_ANON;
	if (oflags & OMAP_COPY)
		nflags |= MAP_COPY;
	if (oflags & OMAP_FIXED)
		nflags |= MAP_FIXED;

	nargs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__)
	/*
	 * With i386_read_exec set, any non-empty protection requested by
	 * an ILP32 process also gets PROT_EXEC.
	 */
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    nargs.prot != 0)
		nargs.prot |= PROT_EXEC;
#endif
#endif
	nargs.flags = nflags;
	nargs.addr = uap->addr;
	nargs.len = uap->len;
	nargs.fd = uap->fd;
	nargs.pos = uap->pos;

	return (sys_mmap(td, &nargs));
}
#endif				/* COMPAT_43 */
452 
453 
454 #ifndef _SYS_SYSPROTO_H_
455 struct msync_args {
456 	void *addr;
457 	size_t len;
458 	int flags;
459 };
460 #endif
461 /*
462  * MPSAFE
463  */
464 int
sys_msync(td,uap)465 sys_msync(td, uap)
466 	struct thread *td;
467 	struct msync_args *uap;
468 {
469 	vm_offset_t addr;
470 	vm_size_t size, pageoff;
471 	int flags;
472 	vm_map_t map;
473 	int rv;
474 
475 	addr = (vm_offset_t) uap->addr;
476 	size = uap->len;
477 	flags = uap->flags;
478 
479 	pageoff = (addr & PAGE_MASK);
480 	addr -= pageoff;
481 	size += pageoff;
482 	size = (vm_size_t) round_page(size);
483 	if (addr + size < addr)
484 		return (EINVAL);
485 
486 	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
487 		return (EINVAL);
488 
489 	map = &td->td_proc->p_vmspace->vm_map;
490 
491 	/*
492 	 * Clean the pages and interpret the return value.
493 	 */
494 	rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
495 	    (flags & MS_INVALIDATE) != 0);
496 	switch (rv) {
497 	case KERN_SUCCESS:
498 		return (0);
499 	case KERN_INVALID_ADDRESS:
500 		return (ENOMEM);
501 	case KERN_INVALID_ARGUMENT:
502 		return (EBUSY);
503 	case KERN_FAILURE:
504 		return (EIO);
505 	default:
506 		return (EINVAL);
507 	}
508 }
509 
510 #ifndef _SYS_SYSPROTO_H_
511 struct munmap_args {
512 	void *addr;
513 	size_t len;
514 };
515 #endif
516 /*
517  * MPSAFE
518  */
519 int
sys_munmap(td,uap)520 sys_munmap(td, uap)
521 	struct thread *td;
522 	struct munmap_args *uap;
523 {
524 #ifdef HWPMC_HOOKS
525 	struct pmckern_map_out pkm;
526 	vm_map_entry_t entry;
527 #endif
528 	vm_offset_t addr;
529 	vm_size_t size, pageoff;
530 	vm_map_t map;
531 
532 	addr = (vm_offset_t) uap->addr;
533 	size = uap->len;
534 	if (size == 0)
535 		return (EINVAL);
536 
537 	pageoff = (addr & PAGE_MASK);
538 	addr -= pageoff;
539 	size += pageoff;
540 	size = (vm_size_t) round_page(size);
541 	if (addr + size < addr)
542 		return (EINVAL);
543 
544 	/*
545 	 * Check for illegal addresses.  Watch out for address wrap...
546 	 */
547 	map = &td->td_proc->p_vmspace->vm_map;
548 	if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
549 		return (EINVAL);
550 	vm_map_lock(map);
551 #ifdef HWPMC_HOOKS
552 	/*
553 	 * Inform hwpmc if the address range being unmapped contains
554 	 * an executable region.
555 	 */
556 	pkm.pm_address = (uintptr_t) NULL;
557 	if (vm_map_lookup_entry(map, addr, &entry)) {
558 		for (;
559 		     entry != &map->header && entry->start < addr + size;
560 		     entry = entry->next) {
561 			if (vm_map_check_protection(map, entry->start,
562 				entry->end, VM_PROT_EXECUTE) == TRUE) {
563 				pkm.pm_address = (uintptr_t) addr;
564 				pkm.pm_size = (size_t) size;
565 				break;
566 			}
567 		}
568 	}
569 #endif
570 	vm_map_delete(map, addr, addr + size);
571 
572 #ifdef HWPMC_HOOKS
573 	/* downgrade the lock to prevent a LOR with the pmc-sx lock */
574 	vm_map_lock_downgrade(map);
575 	if (pkm.pm_address != (uintptr_t) NULL)
576 		PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
577 	vm_map_unlock_read(map);
578 #else
579 	vm_map_unlock(map);
580 #endif
581 	/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
582 	return (0);
583 }
584 
585 #ifndef _SYS_SYSPROTO_H_
586 struct mprotect_args {
587 	const void *addr;
588 	size_t len;
589 	int prot;
590 };
591 #endif
592 /*
593  * MPSAFE
594  */
595 int
sys_mprotect(td,uap)596 sys_mprotect(td, uap)
597 	struct thread *td;
598 	struct mprotect_args *uap;
599 {
600 	vm_offset_t addr;
601 	vm_size_t size, pageoff;
602 	vm_prot_t prot;
603 
604 	addr = (vm_offset_t) uap->addr;
605 	size = uap->len;
606 	prot = uap->prot & VM_PROT_ALL;
607 
608 	pageoff = (addr & PAGE_MASK);
609 	addr -= pageoff;
610 	size += pageoff;
611 	size = (vm_size_t) round_page(size);
612 	if (addr + size < addr)
613 		return (EINVAL);
614 
615 	switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
616 	    addr + size, prot, FALSE)) {
617 	case KERN_SUCCESS:
618 		return (0);
619 	case KERN_PROTECTION_FAILURE:
620 		return (EACCES);
621 	case KERN_RESOURCE_SHORTAGE:
622 		return (ENOMEM);
623 	}
624 	return (EINVAL);
625 }
626 
627 #ifndef _SYS_SYSPROTO_H_
628 struct minherit_args {
629 	void *addr;
630 	size_t len;
631 	int inherit;
632 };
633 #endif
634 /*
635  * MPSAFE
636  */
637 int
sys_minherit(td,uap)638 sys_minherit(td, uap)
639 	struct thread *td;
640 	struct minherit_args *uap;
641 {
642 	vm_offset_t addr;
643 	vm_size_t size, pageoff;
644 	vm_inherit_t inherit;
645 
646 	addr = (vm_offset_t)uap->addr;
647 	size = uap->len;
648 	inherit = uap->inherit;
649 
650 	pageoff = (addr & PAGE_MASK);
651 	addr -= pageoff;
652 	size += pageoff;
653 	size = (vm_size_t) round_page(size);
654 	if (addr + size < addr)
655 		return (EINVAL);
656 
657 	switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
658 	    addr + size, inherit)) {
659 	case KERN_SUCCESS:
660 		return (0);
661 	case KERN_PROTECTION_FAILURE:
662 		return (EACCES);
663 	}
664 	return (EINVAL);
665 }
666 
667 #ifndef _SYS_SYSPROTO_H_
668 struct madvise_args {
669 	void *addr;
670 	size_t len;
671 	int behav;
672 };
673 #endif
674 
675 /*
676  * MPSAFE
677  */
678 int
sys_madvise(td,uap)679 sys_madvise(td, uap)
680 	struct thread *td;
681 	struct madvise_args *uap;
682 {
683 	vm_offset_t start, end;
684 	vm_map_t map;
685 	int flags;
686 
687 	/*
688 	 * Check for our special case, advising the swap pager we are
689 	 * "immortal."
690 	 */
691 	if (uap->behav == MADV_PROTECT) {
692 		flags = PPROT_SET;
693 		return (kern_procctl(td, P_PID, td->td_proc->p_pid,
694 		    PROC_SPROTECT, &flags));
695 	}
696 
697 	/*
698 	 * Check for illegal behavior
699 	 */
700 	if (uap->behav < 0 || uap->behav > MADV_CORE)
701 		return (EINVAL);
702 	/*
703 	 * Check for illegal addresses.  Watch out for address wrap... Note
704 	 * that VM_*_ADDRESS are not constants due to casts (argh).
705 	 */
706 	map = &td->td_proc->p_vmspace->vm_map;
707 	if ((vm_offset_t)uap->addr < vm_map_min(map) ||
708 	    (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
709 		return (EINVAL);
710 	if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
711 		return (EINVAL);
712 
713 	/*
714 	 * Since this routine is only advisory, we default to conservative
715 	 * behavior.
716 	 */
717 	start = trunc_page((vm_offset_t) uap->addr);
718 	end = round_page((vm_offset_t) uap->addr + uap->len);
719 
720 	if (vm_map_madvise(map, start, end, uap->behav))
721 		return (EINVAL);
722 	return (0);
723 }
724 
725 #ifndef _SYS_SYSPROTO_H_
726 struct mincore_args {
727 	const void *addr;
728 	size_t len;
729 	char *vec;
730 };
731 #endif
732 
733 /*
734  * MPSAFE
735  */
736 int
sys_mincore(td,uap)737 sys_mincore(td, uap)
738 	struct thread *td;
739 	struct mincore_args *uap;
740 {
741 	vm_offset_t addr, first_addr;
742 	vm_offset_t end, cend;
743 	pmap_t pmap;
744 	vm_map_t map;
745 	char *vec;
746 	int error = 0;
747 	int vecindex, lastvecindex;
748 	vm_map_entry_t current;
749 	vm_map_entry_t entry;
750 	vm_object_t object;
751 	vm_paddr_t locked_pa;
752 	vm_page_t m;
753 	vm_pindex_t pindex;
754 	int mincoreinfo;
755 	unsigned int timestamp;
756 	boolean_t locked;
757 
758 	/*
759 	 * Make sure that the addresses presented are valid for user
760 	 * mode.
761 	 */
762 	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
763 	end = addr + (vm_size_t)round_page(uap->len);
764 	map = &td->td_proc->p_vmspace->vm_map;
765 	if (end > vm_map_max(map) || end < addr)
766 		return (ENOMEM);
767 
768 	/*
769 	 * Address of byte vector
770 	 */
771 	vec = uap->vec;
772 
773 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
774 
775 	vm_map_lock_read(map);
776 RestartScan:
777 	timestamp = map->timestamp;
778 
779 	if (!vm_map_lookup_entry(map, addr, &entry)) {
780 		vm_map_unlock_read(map);
781 		return (ENOMEM);
782 	}
783 
784 	/*
785 	 * Do this on a map entry basis so that if the pages are not
786 	 * in the current processes address space, we can easily look
787 	 * up the pages elsewhere.
788 	 */
789 	lastvecindex = -1;
790 	for (current = entry;
791 	    (current != &map->header) && (current->start < end);
792 	    current = current->next) {
793 
794 		/*
795 		 * check for contiguity
796 		 */
797 		if (current->end < end &&
798 		    (entry->next == &map->header ||
799 		     current->next->start > current->end)) {
800 			vm_map_unlock_read(map);
801 			return (ENOMEM);
802 		}
803 
804 		/*
805 		 * ignore submaps (for now) or null objects
806 		 */
807 		if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
808 			current->object.vm_object == NULL)
809 			continue;
810 
811 		/*
812 		 * limit this scan to the current map entry and the
813 		 * limits for the mincore call
814 		 */
815 		if (addr < current->start)
816 			addr = current->start;
817 		cend = current->end;
818 		if (cend > end)
819 			cend = end;
820 
821 		/*
822 		 * scan this entry one page at a time
823 		 */
824 		while (addr < cend) {
825 			/*
826 			 * Check pmap first, it is likely faster, also
827 			 * it can provide info as to whether we are the
828 			 * one referencing or modifying the page.
829 			 */
830 			object = NULL;
831 			locked_pa = 0;
832 		retry:
833 			m = NULL;
834 			mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
835 			if (locked_pa != 0) {
836 				/*
837 				 * The page is mapped by this process but not
838 				 * both accessed and modified.  It is also
839 				 * managed.  Acquire the object lock so that
840 				 * other mappings might be examined.
841 				 */
842 				m = PHYS_TO_VM_PAGE(locked_pa);
843 				if (m->object != object) {
844 					if (object != NULL)
845 						VM_OBJECT_WUNLOCK(object);
846 					object = m->object;
847 					locked = VM_OBJECT_TRYWLOCK(object);
848 					vm_page_unlock(m);
849 					if (!locked) {
850 						VM_OBJECT_WLOCK(object);
851 						vm_page_lock(m);
852 						goto retry;
853 					}
854 				} else
855 					vm_page_unlock(m);
856 				KASSERT(m->valid == VM_PAGE_BITS_ALL,
857 				    ("mincore: page %p is mapped but invalid",
858 				    m));
859 			} else if (mincoreinfo == 0) {
860 				/*
861 				 * The page is not mapped by this process.  If
862 				 * the object implements managed pages, then
863 				 * determine if the page is resident so that
864 				 * the mappings might be examined.
865 				 */
866 				if (current->object.vm_object != object) {
867 					if (object != NULL)
868 						VM_OBJECT_WUNLOCK(object);
869 					object = current->object.vm_object;
870 					VM_OBJECT_WLOCK(object);
871 				}
872 				if (object->type == OBJT_DEFAULT ||
873 				    object->type == OBJT_SWAP ||
874 				    object->type == OBJT_VNODE) {
875 					pindex = OFF_TO_IDX(current->offset +
876 					    (addr - current->start));
877 					m = vm_page_lookup(object, pindex);
878 					if (m != NULL && m->valid == 0)
879 						m = NULL;
880 					if (m != NULL)
881 						mincoreinfo = MINCORE_INCORE;
882 				}
883 			}
884 			if (m != NULL) {
885 				/* Examine other mappings to the page. */
886 				if (m->dirty == 0 && pmap_is_modified(m))
887 					vm_page_dirty(m);
888 				if (m->dirty != 0)
889 					mincoreinfo |= MINCORE_MODIFIED_OTHER;
890 				/*
891 				 * The first test for PGA_REFERENCED is an
892 				 * optimization.  The second test is
893 				 * required because a concurrent pmap
894 				 * operation could clear the last reference
895 				 * and set PGA_REFERENCED before the call to
896 				 * pmap_is_referenced().
897 				 */
898 				if ((m->aflags & PGA_REFERENCED) != 0 ||
899 				    pmap_is_referenced(m) ||
900 				    (m->aflags & PGA_REFERENCED) != 0)
901 					mincoreinfo |= MINCORE_REFERENCED_OTHER;
902 			}
903 			if (object != NULL)
904 				VM_OBJECT_WUNLOCK(object);
905 
906 			/*
907 			 * subyte may page fault.  In case it needs to modify
908 			 * the map, we release the lock.
909 			 */
910 			vm_map_unlock_read(map);
911 
912 			/*
913 			 * calculate index into user supplied byte vector
914 			 */
915 			vecindex = OFF_TO_IDX(addr - first_addr);
916 
917 			/*
918 			 * If we have skipped map entries, we need to make sure that
919 			 * the byte vector is zeroed for those skipped entries.
920 			 */
921 			while ((lastvecindex + 1) < vecindex) {
922 				++lastvecindex;
923 				error = subyte(vec + lastvecindex, 0);
924 				if (error) {
925 					error = EFAULT;
926 					goto done2;
927 				}
928 			}
929 
930 			/*
931 			 * Pass the page information to the user
932 			 */
933 			error = subyte(vec + vecindex, mincoreinfo);
934 			if (error) {
935 				error = EFAULT;
936 				goto done2;
937 			}
938 
939 			/*
940 			 * If the map has changed, due to the subyte, the previous
941 			 * output may be invalid.
942 			 */
943 			vm_map_lock_read(map);
944 			if (timestamp != map->timestamp)
945 				goto RestartScan;
946 
947 			lastvecindex = vecindex;
948 			addr += PAGE_SIZE;
949 		}
950 	}
951 
952 	/*
953 	 * subyte may page fault.  In case it needs to modify
954 	 * the map, we release the lock.
955 	 */
956 	vm_map_unlock_read(map);
957 
958 	/*
959 	 * Zero the last entries in the byte vector.
960 	 */
961 	vecindex = OFF_TO_IDX(end - first_addr);
962 	while ((lastvecindex + 1) < vecindex) {
963 		++lastvecindex;
964 		error = subyte(vec + lastvecindex, 0);
965 		if (error) {
966 			error = EFAULT;
967 			goto done2;
968 		}
969 	}
970 
971 	/*
972 	 * If the map has changed, due to the subyte, the previous
973 	 * output may be invalid.
974 	 */
975 	vm_map_lock_read(map);
976 	if (timestamp != map->timestamp)
977 		goto RestartScan;
978 	vm_map_unlock_read(map);
979 done2:
980 	return (error);
981 }
982 
983 #ifndef _SYS_SYSPROTO_H_
984 struct mlock_args {
985 	const void *addr;
986 	size_t len;
987 };
988 #endif
989 /*
990  * MPSAFE
991  */
992 int
sys_mlock(td,uap)993 sys_mlock(td, uap)
994 	struct thread *td;
995 	struct mlock_args *uap;
996 {
997 
998 	return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
999 }
1000 
1001 int
vm_mlock(struct proc * proc,struct ucred * cred,const void * addr0,size_t len)1002 vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
1003 {
1004 	vm_offset_t addr, end, last, start;
1005 	vm_size_t npages, size;
1006 	vm_map_t map;
1007 	unsigned long nsize;
1008 	int error;
1009 
1010 	error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
1011 	if (error)
1012 		return (error);
1013 	addr = (vm_offset_t)addr0;
1014 	size = len;
1015 	last = addr + size;
1016 	start = trunc_page(addr);
1017 	end = round_page(last);
1018 	if (last < addr || end < addr)
1019 		return (EINVAL);
1020 	npages = atop(end - start);
1021 	if (npages > vm_page_max_wired)
1022 		return (ENOMEM);
1023 	map = &proc->p_vmspace->vm_map;
1024 	PROC_LOCK(proc);
1025 	nsize = ptoa(npages + pmap_wired_count(map->pmap));
1026 	if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
1027 		PROC_UNLOCK(proc);
1028 		return (ENOMEM);
1029 	}
1030 	PROC_UNLOCK(proc);
1031 	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
1032 		return (EAGAIN);
1033 #ifdef RACCT
1034 	if (racct_enable) {
1035 		PROC_LOCK(proc);
1036 		error = racct_set(proc, RACCT_MEMLOCK, nsize);
1037 		PROC_UNLOCK(proc);
1038 		if (error != 0)
1039 			return (ENOMEM);
1040 	}
1041 #endif
1042 	error = vm_map_wire(map, start, end,
1043 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1044 #ifdef RACCT
1045 	if (racct_enable && error != KERN_SUCCESS) {
1046 		PROC_LOCK(proc);
1047 		racct_set(proc, RACCT_MEMLOCK,
1048 		    ptoa(pmap_wired_count(map->pmap)));
1049 		PROC_UNLOCK(proc);
1050 	}
1051 #endif
1052 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1053 }
1054 
1055 #ifndef _SYS_SYSPROTO_H_
1056 struct mlockall_args {
1057 	int	how;
1058 };
1059 #endif
1060 
1061 /*
1062  * MPSAFE
1063  */
1064 int
sys_mlockall(td,uap)1065 sys_mlockall(td, uap)
1066 	struct thread *td;
1067 	struct mlockall_args *uap;
1068 {
1069 	vm_map_t map;
1070 	int error;
1071 
1072 	map = &td->td_proc->p_vmspace->vm_map;
1073 	error = priv_check(td, PRIV_VM_MLOCK);
1074 	if (error)
1075 		return (error);
1076 
1077 	if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1078 		return (EINVAL);
1079 
1080 	/*
1081 	 * If wiring all pages in the process would cause it to exceed
1082 	 * a hard resource limit, return ENOMEM.
1083 	 */
1084 	if (!old_mlock && uap->how & MCL_CURRENT) {
1085 		PROC_LOCK(td->td_proc);
1086 		if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) {
1087 			PROC_UNLOCK(td->td_proc);
1088 			return (ENOMEM);
1089 		}
1090 		PROC_UNLOCK(td->td_proc);
1091 	}
1092 #ifdef RACCT
1093 	if (racct_enable) {
1094 		PROC_LOCK(td->td_proc);
1095 		error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
1096 		PROC_UNLOCK(td->td_proc);
1097 		if (error != 0)
1098 			return (ENOMEM);
1099 	}
1100 #endif
1101 
1102 	if (uap->how & MCL_FUTURE) {
1103 		vm_map_lock(map);
1104 		vm_map_modflags(map, MAP_WIREFUTURE, 0);
1105 		vm_map_unlock(map);
1106 		error = 0;
1107 	}
1108 
1109 	if (uap->how & MCL_CURRENT) {
1110 		/*
1111 		 * P1003.1-2001 mandates that all currently mapped pages
1112 		 * will be memory resident and locked (wired) upon return
1113 		 * from mlockall(). vm_map_wire() will wire pages, by
1114 		 * calling vm_fault_wire() for each page in the region.
1115 		 */
1116 		error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1117 		    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1118 		error = (error == KERN_SUCCESS ? 0 : EAGAIN);
1119 	}
1120 #ifdef RACCT
1121 	if (racct_enable && error != KERN_SUCCESS) {
1122 		PROC_LOCK(td->td_proc);
1123 		racct_set(td->td_proc, RACCT_MEMLOCK,
1124 		    ptoa(pmap_wired_count(map->pmap)));
1125 		PROC_UNLOCK(td->td_proc);
1126 	}
1127 #endif
1128 
1129 	return (error);
1130 }
1131 
1132 #ifndef _SYS_SYSPROTO_H_
1133 struct munlockall_args {
1134 	register_t dummy;
1135 };
1136 #endif
1137 
1138 /*
1139  * MPSAFE
1140  */
1141 int
sys_munlockall(td,uap)1142 sys_munlockall(td, uap)
1143 	struct thread *td;
1144 	struct munlockall_args *uap;
1145 {
1146 	vm_map_t map;
1147 	int error;
1148 
1149 	map = &td->td_proc->p_vmspace->vm_map;
1150 	error = priv_check(td, PRIV_VM_MUNLOCK);
1151 	if (error)
1152 		return (error);
1153 
1154 	/* Clear the MAP_WIREFUTURE flag from this vm_map. */
1155 	vm_map_lock(map);
1156 	vm_map_modflags(map, 0, MAP_WIREFUTURE);
1157 	vm_map_unlock(map);
1158 
1159 	/* Forcibly unwire all pages. */
1160 	error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1161 	    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1162 #ifdef RACCT
1163 	if (racct_enable && error == KERN_SUCCESS) {
1164 		PROC_LOCK(td->td_proc);
1165 		racct_set(td->td_proc, RACCT_MEMLOCK, 0);
1166 		PROC_UNLOCK(td->td_proc);
1167 	}
1168 #endif
1169 
1170 	return (error);
1171 }
1172 
1173 #ifndef _SYS_SYSPROTO_H_
1174 struct munlock_args {
1175 	const void *addr;
1176 	size_t len;
1177 };
1178 #endif
1179 /*
1180  * MPSAFE
1181  */
1182 int
sys_munlock(td,uap)1183 sys_munlock(td, uap)
1184 	struct thread *td;
1185 	struct munlock_args *uap;
1186 {
1187 	vm_offset_t addr, end, last, start;
1188 	vm_size_t size;
1189 #ifdef RACCT
1190 	vm_map_t map;
1191 #endif
1192 	int error;
1193 
1194 	error = priv_check(td, PRIV_VM_MUNLOCK);
1195 	if (error)
1196 		return (error);
1197 	addr = (vm_offset_t)uap->addr;
1198 	size = uap->len;
1199 	last = addr + size;
1200 	start = trunc_page(addr);
1201 	end = round_page(last);
1202 	if (last < addr || end < addr)
1203 		return (EINVAL);
1204 	error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1205 	    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1206 #ifdef RACCT
1207 	if (racct_enable && error == KERN_SUCCESS) {
1208 		PROC_LOCK(td->td_proc);
1209 		map = &td->td_proc->p_vmspace->vm_map;
1210 		racct_set(td->td_proc, RACCT_MEMLOCK,
1211 		    ptoa(pmap_wired_count(map->pmap)));
1212 		PROC_UNLOCK(td->td_proc);
1213 	}
1214 #endif
1215 	return (error == KERN_SUCCESS ? 0 : ENOMEM);
1216 }
1217 
1218 /*
1219  * vm_mmap_vnode()
1220  *
1221  * Helper function for vm_mmap.  Perform sanity check specific for mmap
1222  * operations on vnodes.
1223  */
1224 int
vm_mmap_vnode(struct thread * td,vm_size_t objsize,vm_prot_t prot,vm_prot_t * maxprotp,int * flagsp,struct vnode * vp,vm_ooffset_t * foffp,vm_object_t * objp,boolean_t * writecounted)1225 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1226     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1227     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
1228     boolean_t *writecounted)
1229 {
1230 	struct vattr va;
1231 	vm_object_t obj;
1232 	vm_offset_t foff;
1233 	struct ucred *cred;
1234 	int error, flags, locktype;
1235 
1236 	cred = td->td_ucred;
1237 	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
1238 		locktype = LK_EXCLUSIVE;
1239 	else
1240 		locktype = LK_SHARED;
1241 	if ((error = vget(vp, locktype, td)) != 0)
1242 		return (error);
1243 	foff = *foffp;
1244 	flags = *flagsp;
1245 	obj = vp->v_object;
1246 	if (vp->v_type == VREG) {
1247 		/*
1248 		 * Get the proper underlying object
1249 		 */
1250 		if (obj == NULL) {
1251 			error = EINVAL;
1252 			goto done;
1253 		}
1254 		if (obj->type == OBJT_VNODE && obj->handle != vp) {
1255 			vput(vp);
1256 			vp = (struct vnode *)obj->handle;
1257 			/*
1258 			 * Bypass filesystems obey the mpsafety of the
1259 			 * underlying fs.  Tmpfs never bypasses.
1260 			 */
1261 			error = vget(vp, locktype, td);
1262 			if (error != 0)
1263 				return (error);
1264 		}
1265 		if (locktype == LK_EXCLUSIVE) {
1266 			*writecounted = TRUE;
1267 			vnode_pager_update_writecount(obj, 0, objsize);
1268 		}
1269 	} else {
1270 		error = EINVAL;
1271 		goto done;
1272 	}
1273 	if ((error = VOP_GETATTR(vp, &va, cred)))
1274 		goto done;
1275 #ifdef MAC
1276 	/* This relies on VM_PROT_* matching PROT_*. */
1277 	error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
1278 	if (error != 0)
1279 		goto done;
1280 #endif
1281 	if ((flags & MAP_SHARED) != 0) {
1282 		if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1283 			if (prot & VM_PROT_WRITE) {
1284 				error = EPERM;
1285 				goto done;
1286 			}
1287 			*maxprotp &= ~VM_PROT_WRITE;
1288 		}
1289 	}
1290 	/*
1291 	 * If it is a regular file without any references
1292 	 * we do not need to sync it.
1293 	 * Adjust object size to be the size of actual file.
1294 	 */
1295 	objsize = round_page(va.va_size);
1296 	if (va.va_nlink == 0)
1297 		flags |= MAP_NOSYNC;
1298 	if (obj->type == OBJT_VNODE) {
1299 		obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
1300 		    cred);
1301 		if (obj == NULL) {
1302 			error = ENOMEM;
1303 			goto done;
1304 		}
1305 	} else {
1306 		KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
1307 		    ("wrong object type"));
1308 		VM_OBJECT_WLOCK(obj);
1309 		vm_object_reference_locked(obj);
1310 #if VM_NRESERVLEVEL > 0
1311 		vm_object_color(obj, 0);
1312 #endif
1313 		VM_OBJECT_WUNLOCK(obj);
1314 	}
1315 	*objp = obj;
1316 	*flagsp = flags;
1317 
1318 	vfs_mark_atime(vp, cred);
1319 
1320 done:
1321 	if (error != 0 && *writecounted) {
1322 		*writecounted = FALSE;
1323 		vnode_pager_update_writecount(obj, objsize, 0);
1324 	}
1325 	vput(vp);
1326 	return (error);
1327 }
1328 
1329 /*
1330  * vm_mmap_cdev()
1331  *
1332  * MPSAFE
1333  *
1334  * Helper function for vm_mmap.  Perform sanity check specific for mmap
1335  * operations on cdevs.
1336  */
1337 int
vm_mmap_cdev(struct thread * td,vm_size_t objsize,vm_prot_t prot,vm_prot_t * maxprotp,int * flagsp,struct cdev * cdev,struct cdevsw * dsw,vm_ooffset_t * foff,vm_object_t * objp)1338 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
1339     vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
1340     vm_ooffset_t *foff, vm_object_t *objp)
1341 {
1342 	vm_object_t obj;
1343 	int error, flags;
1344 
1345 	flags = *flagsp;
1346 
1347 	if (dsw->d_flags & D_MMAP_ANON) {
1348 		*objp = NULL;
1349 		*foff = 0;
1350 		*maxprotp = VM_PROT_ALL;
1351 		*flagsp |= MAP_ANON;
1352 		return (0);
1353 	}
1354 	/*
1355 	 * cdevs do not provide private mappings of any kind.
1356 	 */
1357 	if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1358 	    (prot & VM_PROT_WRITE) != 0)
1359 		return (EACCES);
1360 	if (flags & (MAP_PRIVATE|MAP_COPY))
1361 		return (EINVAL);
1362 	/*
1363 	 * Force device mappings to be shared.
1364 	 */
1365 	flags |= MAP_SHARED;
1366 #ifdef MAC_XXX
1367 	error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
1368 	if (error != 0)
1369 		return (error);
1370 #endif
1371 	/*
1372 	 * First, try d_mmap_single().  If that is not implemented
1373 	 * (returns ENODEV), fall back to using the device pager.
1374 	 * Note that d_mmap_single() must return a reference to the
1375 	 * object (it needs to bump the reference count of the object
1376 	 * it returns somehow).
1377 	 *
1378 	 * XXX assumes VM_PROT_* == PROT_*
1379 	 */
1380 	error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
1381 	if (error != ENODEV)
1382 		return (error);
1383 	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
1384 	    td->td_ucred);
1385 	if (obj == NULL)
1386 		return (EINVAL);
1387 	*objp = obj;
1388 	*flagsp = flags;
1389 	return (0);
1390 }
1391 
1392 /*
1393  * vm_mmap()
1394  *
1395  * Internal version of mmap used by exec, sys5 shared memory, and
1396  * various device drivers.  Handle is either a vnode pointer, a
1397  * character device, or NULL for MAP_ANON.
1398  */
1399 int
vm_mmap(vm_map_t map,vm_offset_t * addr,vm_size_t size,vm_prot_t prot,vm_prot_t maxprot,int flags,objtype_t handle_type,void * handle,vm_ooffset_t foff)1400 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1401 	vm_prot_t maxprot, int flags,
1402 	objtype_t handle_type, void *handle,
1403 	vm_ooffset_t foff)
1404 {
1405 	vm_object_t object;
1406 	struct thread *td = curthread;
1407 	int error;
1408 	boolean_t writecounted;
1409 
1410 	if (size == 0)
1411 		return (EINVAL);
1412 
1413 	size = round_page(size);
1414 	object = NULL;
1415 	writecounted = FALSE;
1416 
1417 	/*
1418 	 * Lookup/allocate object.
1419 	 */
1420 	switch (handle_type) {
1421 	case OBJT_DEVICE: {
1422 		struct cdevsw *dsw;
1423 		struct cdev *cdev;
1424 		int ref;
1425 
1426 		cdev = handle;
1427 		dsw = dev_refthread(cdev, &ref);
1428 		if (dsw == NULL)
1429 			return (ENXIO);
1430 		error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
1431 		    dsw, &foff, &object);
1432 		dev_relthread(cdev, ref);
1433 		break;
1434 	}
1435 	case OBJT_VNODE:
1436 		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1437 		    handle, &foff, &object, &writecounted);
1438 		break;
1439 	case OBJT_DEFAULT:
1440 		if (handle == NULL) {
1441 			error = 0;
1442 			break;
1443 		}
1444 		/* FALLTHROUGH */
1445 	default:
1446 		error = EINVAL;
1447 		break;
1448 	}
1449 	if (error)
1450 		return (error);
1451 
1452 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
1453 	    foff, writecounted, td);
1454 	if (error != 0 && object != NULL) {
1455 		/*
1456 		 * If this mapping was accounted for in the vnode's
1457 		 * writecount, then undo that now.
1458 		 */
1459 		if (writecounted)
1460 			vnode_pager_release_writecount(object, 0, size);
1461 		vm_object_deallocate(object);
1462 	}
1463 	return (error);
1464 }
1465 
1466 /*
1467  * Internal version of mmap that maps a specific VM object into an
1468  * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
1469  */
1470 int
vm_mmap_object(vm_map_t map,vm_offset_t * addr,vm_size_t size,vm_prot_t prot,vm_prot_t maxprot,int flags,vm_object_t object,vm_ooffset_t foff,boolean_t writecounted,struct thread * td)1471 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1472     vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
1473     boolean_t writecounted, struct thread *td)
1474 {
1475 	boolean_t fitit;
1476 	int docow, error, findspace, rv;
1477 
1478 	if (map == &td->td_proc->p_vmspace->vm_map) {
1479 		PROC_LOCK(td->td_proc);
1480 		if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) {
1481 			PROC_UNLOCK(td->td_proc);
1482 			return (ENOMEM);
1483 		}
1484 		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
1485 			PROC_UNLOCK(td->td_proc);
1486 			return (ENOMEM);
1487 		}
1488 		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
1489 			if (ptoa(pmap_wired_count(map->pmap)) + size >
1490 			    lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) {
1491 				racct_set_force(td->td_proc, RACCT_VMEM,
1492 				    map->size);
1493 				PROC_UNLOCK(td->td_proc);
1494 				return (ENOMEM);
1495 			}
1496 			error = racct_set(td->td_proc, RACCT_MEMLOCK,
1497 			    ptoa(pmap_wired_count(map->pmap)) + size);
1498 			if (error != 0) {
1499 				racct_set_force(td->td_proc, RACCT_VMEM,
1500 				    map->size);
1501 				PROC_UNLOCK(td->td_proc);
1502 				return (error);
1503 			}
1504 		}
1505 		PROC_UNLOCK(td->td_proc);
1506 	}
1507 
1508 	/*
1509 	 * We currently can only deal with page aligned file offsets.
1510 	 * The mmap() system call already enforces this by subtracting
1511 	 * the page offset from the file offset, but checking here
1512 	 * catches errors in device drivers (e.g. d_single_mmap()
1513 	 * callbacks) and other internal mapping requests (such as in
1514 	 * exec).
1515 	 */
1516 	if (foff & PAGE_MASK)
1517 		return (EINVAL);
1518 
1519 	if ((flags & MAP_FIXED) == 0) {
1520 		fitit = TRUE;
1521 		*addr = round_page(*addr);
1522 	} else {
1523 		if (*addr != trunc_page(*addr))
1524 			return (EINVAL);
1525 		fitit = FALSE;
1526 	}
1527 
1528 	if (flags & MAP_ANON) {
1529 		if (object != NULL || foff != 0)
1530 			return (EINVAL);
1531 		docow = 0;
1532 	} else if (flags & MAP_PREFAULT_READ)
1533 		docow = MAP_PREFAULT;
1534 	else
1535 		docow = MAP_PREFAULT_PARTIAL;
1536 
1537 	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1538 		docow |= MAP_COPY_ON_WRITE;
1539 	if (flags & MAP_NOSYNC)
1540 		docow |= MAP_DISABLE_SYNCER;
1541 	if (flags & MAP_NOCORE)
1542 		docow |= MAP_DISABLE_COREDUMP;
1543 	/* Shared memory is also shared with children. */
1544 	if (flags & MAP_SHARED)
1545 		docow |= MAP_INHERIT_SHARE;
1546 	if (writecounted)
1547 		docow |= MAP_VN_WRITECOUNT;
1548 	if (flags & MAP_STACK) {
1549 		if (object != NULL)
1550 			return (EINVAL);
1551 		docow |= MAP_STACK_GROWS_DOWN;
1552 	}
1553 	if ((flags & MAP_EXCL) != 0)
1554 		docow |= MAP_CHECK_EXCL;
1555 
1556 	if (fitit) {
1557 		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
1558 			findspace = VMFS_SUPER_SPACE;
1559 		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
1560 			findspace = VMFS_ALIGNED_SPACE(flags >>
1561 			    MAP_ALIGNMENT_SHIFT);
1562 		else
1563 			findspace = VMFS_OPTIMAL_SPACE;
1564 		rv = vm_map_find(map, object, foff, addr, size,
1565 #ifdef MAP_32BIT
1566 		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
1567 #endif
1568 		    0, findspace, prot, maxprot, docow);
1569 	} else {
1570 		rv = vm_map_fixed(map, object, foff, *addr, size,
1571 		    prot, maxprot, docow);
1572 	}
1573 
1574 	if (rv == KERN_SUCCESS) {
1575 		/*
1576 		 * If the process has requested that all future mappings
1577 		 * be wired, then heed this.
1578 		 */
1579 		if (map->flags & MAP_WIREFUTURE) {
1580 			vm_map_wire(map, *addr, *addr + size,
1581 			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
1582 			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
1583 		}
1584 	}
1585 	return (vm_mmap_to_errno(rv));
1586 }
1587 
1588 /*
1589  * Translate a Mach VM return code to zero on success or the appropriate errno
1590  * on failure.
1591  */
1592 int
vm_mmap_to_errno(int rv)1593 vm_mmap_to_errno(int rv)
1594 {
1595 
1596 	switch (rv) {
1597 	case KERN_SUCCESS:
1598 		return (0);
1599 	case KERN_INVALID_ADDRESS:
1600 	case KERN_NO_SPACE:
1601 		return (ENOMEM);
1602 	case KERN_PROTECTION_FAILURE:
1603 		return (EACCES);
1604 	default:
1605 		return (EINVAL);
1606 	}
1607 }
1608