1 /*-
2 * Copyright (c) 1988 University of Utah.
3 * Copyright (c) 1991, 1993
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the Systems Programming Group of the University of Utah Computer
8 * Science Department.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
35 *
36 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94
37 */
38
39 /*
40 * Mapped file (mmap) interface to VM
41 */
42
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD$");
45
46 #include "opt_compat.h"
47 #include "opt_hwpmc_hooks.h"
48 #include "opt_vm.h"
49
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/capsicum.h>
53 #include <sys/kernel.h>
54 #include <sys/lock.h>
55 #include <sys/mutex.h>
56 #include <sys/sysproto.h>
57 #include <sys/filedesc.h>
58 #include <sys/priv.h>
59 #include <sys/proc.h>
60 #include <sys/procctl.h>
61 #include <sys/racct.h>
62 #include <sys/resource.h>
63 #include <sys/resourcevar.h>
64 #include <sys/rwlock.h>
65 #include <sys/sysctl.h>
66 #include <sys/vnode.h>
67 #include <sys/fcntl.h>
68 #include <sys/file.h>
69 #include <sys/mman.h>
70 #include <sys/mount.h>
71 #include <sys/conf.h>
72 #include <sys/stat.h>
73 #include <sys/syscallsubr.h>
74 #include <sys/sysent.h>
75 #include <sys/vmmeter.h>
76
77 #include <security/mac/mac_framework.h>
78
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_object.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_extern.h>
88 #include <vm/vm_page.h>
89 #include <vm/vnode_pager.h>
90
91 #ifdef HWPMC_HOOKS
92 #include <sys/pmckern.h>
93 #endif
94
95 int old_mlock = 0;
96 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
97 "Do not apply RLIMIT_MEMLOCK on mlockall");
98
99 #ifdef MAP_32BIT
100 #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31)
101 #endif
102
103 #ifndef _SYS_SYSPROTO_H_
104 struct sbrk_args {
105 int incr;
106 };
107 #endif
108
109 /*
110 * MPSAFE
111 */
112 /* ARGSUSED */
113 int
sys_sbrk(td,uap)114 sys_sbrk(td, uap)
115 struct thread *td;
116 struct sbrk_args *uap;
117 {
118 /* Not yet implemented */
119 return (EOPNOTSUPP);
120 }
121
122 #ifndef _SYS_SYSPROTO_H_
123 struct sstk_args {
124 int incr;
125 };
126 #endif
127
128 /*
129 * MPSAFE
130 */
131 /* ARGSUSED */
132 int
sys_sstk(td,uap)133 sys_sstk(td, uap)
134 struct thread *td;
135 struct sstk_args *uap;
136 {
137 /* Not yet implemented */
138 return (EOPNOTSUPP);
139 }
140
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
	int	dummy;
};
#endif

/*
 * Old getpagesize compatibility system call.
 *
 * Stores PAGE_SIZE in the thread's first return-value slot and
 * always succeeds; uap is not examined.
 *
 * MP SAFE
 */
int
ogetpagesize(struct thread *td, struct getpagesize_args *uap)
{

	td->td_retval[0] = PAGE_SIZE;
	return (0);
}
#endif /* COMPAT_43 */
158
159
160 /*
161 * Memory Map (mmap) system call. Note that the file offset
162 * and address are allowed to be NOT page aligned, though if
163 * the MAP_FIXED flag it set, both must have the same remainder
164 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
165 * page-aligned, the actual mapping starts at trunc_page(addr)
166 * and the return value is adjusted up by the page offset.
167 *
168 * Generally speaking, only character devices which are themselves
169 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise
170 * there would be no cache coherency between a descriptor and a VM mapping
171 * both to the same character device.
172 */
173 #ifndef _SYS_SYSPROTO_H_
174 struct mmap_args {
175 void *addr;
176 size_t len;
177 int prot;
178 int flags;
179 int fd;
180 long pad;
181 off_t pos;
182 };
183 #endif
184
185 /*
186 * MPSAFE
187 */
188 int
sys_mmap(td,uap)189 sys_mmap(td, uap)
190 struct thread *td;
191 struct mmap_args *uap;
192 {
193 struct file *fp;
194 vm_offset_t addr;
195 vm_size_t size, pageoff;
196 vm_prot_t cap_maxprot;
197 int align, error, flags, prot;
198 off_t pos;
199 struct vmspace *vms = td->td_proc->p_vmspace;
200 cap_rights_t rights;
201
202 addr = (vm_offset_t) uap->addr;
203 size = uap->len;
204 prot = uap->prot;
205 flags = uap->flags;
206 pos = uap->pos;
207
208 fp = NULL;
209
210 /*
211 * Ignore old flags that used to be defined but did not do anything.
212 */
213 flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
214
215 /*
216 * Enforce the constraints.
217 * Mapping of length 0 is only allowed for old binaries.
218 * Anonymous mapping shall specify -1 as filedescriptor and
219 * zero position for new code. Be nice to ancient a.out
220 * binaries and correct pos for anonymous mapping, since old
221 * ld.so sometimes issues anonymous map requests with non-zero
222 * pos.
223 */
224 if (!SV_CURPROC_FLAG(SV_AOUT)) {
225 if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
226 ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
227 return (EINVAL);
228 } else {
229 if ((flags & MAP_ANON) != 0)
230 pos = 0;
231 }
232
233 if (flags & MAP_STACK) {
234 if ((uap->fd != -1) ||
235 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
236 return (EINVAL);
237 flags |= MAP_ANON;
238 pos = 0;
239 }
240 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
241 MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
242 MAP_PREFAULT_READ |
243 #ifdef MAP_32BIT
244 MAP_32BIT |
245 #endif
246 MAP_ALIGNMENT_MASK)) != 0)
247 return (EINVAL);
248 if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
249 return (EINVAL);
250 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
251 return (EINVAL);
252 if (prot != PROT_NONE &&
253 (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
254 return (EINVAL);
255
256 /*
257 * Align the file position to a page boundary,
258 * and save its page offset component.
259 */
260 pageoff = (pos & PAGE_MASK);
261 pos -= pageoff;
262
263 /* Adjust size for rounding (on both ends). */
264 size += pageoff; /* low end... */
265 size = (vm_size_t) round_page(size); /* hi end */
266
267 /* Ensure alignment is at least a page and fits in a pointer. */
268 align = flags & MAP_ALIGNMENT_MASK;
269 if (align != 0 && align != MAP_ALIGNED_SUPER &&
270 (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
271 align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
272 return (EINVAL);
273
274 /*
275 * Check for illegal addresses. Watch out for address wrap... Note
276 * that VM_*_ADDRESS are not constants due to casts (argh).
277 */
278 if (flags & MAP_FIXED) {
279 /*
280 * The specified address must have the same remainder
281 * as the file offset taken modulo PAGE_SIZE, so it
282 * should be aligned after adjustment by pageoff.
283 */
284 addr -= pageoff;
285 if (addr & PAGE_MASK)
286 return (EINVAL);
287
288 /* Address range must be all in user VM space. */
289 if (addr < vm_map_min(&vms->vm_map) ||
290 addr + size > vm_map_max(&vms->vm_map))
291 return (EINVAL);
292 if (addr + size < addr)
293 return (EINVAL);
294 #ifdef MAP_32BIT
295 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
296 return (EINVAL);
297 } else if (flags & MAP_32BIT) {
298 /*
299 * For MAP_32BIT, override the hint if it is too high and
300 * do not bother moving the mapping past the heap (since
301 * the heap is usually above 2GB).
302 */
303 if (addr + size > MAP_32BIT_MAX_ADDR)
304 addr = 0;
305 #endif
306 } else {
307 /*
308 * XXX for non-fixed mappings where no hint is provided or
309 * the hint would fall in the potential heap space,
310 * place it after the end of the largest possible heap.
311 *
312 * There should really be a pmap call to determine a reasonable
313 * location.
314 */
315 if (addr == 0 ||
316 (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
317 addr < round_page((vm_offset_t)vms->vm_daddr +
318 lim_max(td, RLIMIT_DATA))))
319 addr = round_page((vm_offset_t)vms->vm_daddr +
320 lim_max(td, RLIMIT_DATA));
321 }
322 if (size == 0) {
323 /*
324 * Return success without mapping anything for old
325 * binaries that request a page-aligned mapping of
326 * length 0. For modern binaries, this function
327 * returns an error earlier.
328 */
329 error = 0;
330 } else if (flags & MAP_ANON) {
331 /*
332 * Mapping blank space is trivial.
333 *
334 * This relies on VM_PROT_* matching PROT_*.
335 */
336 error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
337 VM_PROT_ALL, flags, NULL, pos, FALSE, td);
338 } else {
339 /*
340 * Mapping file, get fp for validation and don't let the
341 * descriptor disappear on us if we block. Check capability
342 * rights, but also return the maximum rights to be combined
343 * with maxprot later.
344 */
345 cap_rights_init(&rights, CAP_MMAP);
346 if (prot & PROT_READ)
347 cap_rights_set(&rights, CAP_MMAP_R);
348 if ((flags & MAP_SHARED) != 0) {
349 if (prot & PROT_WRITE)
350 cap_rights_set(&rights, CAP_MMAP_W);
351 }
352 if (prot & PROT_EXEC)
353 cap_rights_set(&rights, CAP_MMAP_X);
354 error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
355 if (error != 0)
356 goto done;
357 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
358 td->td_proc->p_osrel >= P_OSREL_MAP_FSTRICT) {
359 error = EINVAL;
360 goto done;
361 }
362
363 /* This relies on VM_PROT_* matching PROT_*. */
364 error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
365 cap_maxprot, flags, pos, td);
366 }
367
368 if (error == 0)
369 td->td_retval[0] = (register_t) (addr + pageoff);
370 done:
371 if (fp)
372 fdrop(fp, td);
373
374 return (error);
375 }
376
#if defined(COMPAT_FREEBSD6)
/*
 * FreeBSD 6 compatibility entry point for mmap.
 *
 * Copies the old argument structure field by field into a struct
 * mmap_args and hands it to sys_mmap(), whose result is returned
 * unchanged.  The pad member of the new structure is left unset.
 */
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
	struct mmap_args margs;

	margs.fd = uap->fd;
	margs.pos = uap->pos;
	margs.flags = uap->flags;
	margs.prot = uap->prot;
	margs.len = uap->len;
	margs.addr = uap->addr;

	return (sys_mmap(td, &margs));
}
#endif
392
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
	caddr_t addr;
	int len;
	int prot;
	int flags;
	int fd;
	long pos;
};
#endif
/*
 * 4.3BSD compatibility entry point for mmap.
 *
 * Translates the old protection encoding and the old OMAP_* flag
 * bits into their current equivalents, then forwards the request
 * to sys_mmap() and returns its result.
 */
int
ommap(struct thread *td, struct ommap_args *uap)
{
	/* Old 3-bit protection value -> PROT_* mask. */
	static const char cvtbsdprot[8] = {
		0,
		PROT_EXEC,
		PROT_WRITE,
		PROT_EXEC | PROT_WRITE,
		PROT_READ,
		PROT_EXEC | PROT_READ,
		PROT_WRITE | PROT_READ,
		PROT_EXEC | PROT_WRITE | PROT_READ,
	};
	struct mmap_args margs;
	int nflags, oflags;

#define	OMAP_ANON	0x0002
#define	OMAP_COPY	0x0020
#define	OMAP_SHARED	0x0010
#define	OMAP_FIXED	0x0100

	margs.addr = uap->addr;
	margs.len = uap->len;
	margs.prot = cvtbsdprot[uap->prot & 0x7];
#ifdef COMPAT_FREEBSD32
#if defined(__amd64__)
	/*
	 * With i386_read_exec set, any non-empty protection requested
	 * by an ILP32 process also gets PROT_EXEC.
	 */
	if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
	    margs.prot != 0)
		margs.prot |= PROT_EXEC;
#endif
#endif

	/* A request that is not shared is treated as private. */
	oflags = uap->flags;
	nflags = (oflags & OMAP_SHARED) ? MAP_SHARED : MAP_PRIVATE;
	if (oflags & OMAP_ANON)
		nflags |= MAP_ANON;
	if (oflags & OMAP_COPY)
		nflags |= MAP_COPY;
	if (oflags & OMAP_FIXED)
		nflags |= MAP_FIXED;
	margs.flags = nflags;

	margs.fd = uap->fd;
	margs.pos = uap->pos;
	return (sys_mmap(td, &margs));
}
#endif /* COMPAT_43 */
452
453
454 #ifndef _SYS_SYSPROTO_H_
455 struct msync_args {
456 void *addr;
457 size_t len;
458 int flags;
459 };
460 #endif
461 /*
462 * MPSAFE
463 */
464 int
sys_msync(td,uap)465 sys_msync(td, uap)
466 struct thread *td;
467 struct msync_args *uap;
468 {
469 vm_offset_t addr;
470 vm_size_t size, pageoff;
471 int flags;
472 vm_map_t map;
473 int rv;
474
475 addr = (vm_offset_t) uap->addr;
476 size = uap->len;
477 flags = uap->flags;
478
479 pageoff = (addr & PAGE_MASK);
480 addr -= pageoff;
481 size += pageoff;
482 size = (vm_size_t) round_page(size);
483 if (addr + size < addr)
484 return (EINVAL);
485
486 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
487 return (EINVAL);
488
489 map = &td->td_proc->p_vmspace->vm_map;
490
491 /*
492 * Clean the pages and interpret the return value.
493 */
494 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
495 (flags & MS_INVALIDATE) != 0);
496 switch (rv) {
497 case KERN_SUCCESS:
498 return (0);
499 case KERN_INVALID_ADDRESS:
500 return (ENOMEM);
501 case KERN_INVALID_ARGUMENT:
502 return (EBUSY);
503 case KERN_FAILURE:
504 return (EIO);
505 default:
506 return (EINVAL);
507 }
508 }
509
510 #ifndef _SYS_SYSPROTO_H_
511 struct munmap_args {
512 void *addr;
513 size_t len;
514 };
515 #endif
516 /*
517 * MPSAFE
518 */
519 int
sys_munmap(td,uap)520 sys_munmap(td, uap)
521 struct thread *td;
522 struct munmap_args *uap;
523 {
524 #ifdef HWPMC_HOOKS
525 struct pmckern_map_out pkm;
526 vm_map_entry_t entry;
527 #endif
528 vm_offset_t addr;
529 vm_size_t size, pageoff;
530 vm_map_t map;
531
532 addr = (vm_offset_t) uap->addr;
533 size = uap->len;
534 if (size == 0)
535 return (EINVAL);
536
537 pageoff = (addr & PAGE_MASK);
538 addr -= pageoff;
539 size += pageoff;
540 size = (vm_size_t) round_page(size);
541 if (addr + size < addr)
542 return (EINVAL);
543
544 /*
545 * Check for illegal addresses. Watch out for address wrap...
546 */
547 map = &td->td_proc->p_vmspace->vm_map;
548 if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
549 return (EINVAL);
550 vm_map_lock(map);
551 #ifdef HWPMC_HOOKS
552 /*
553 * Inform hwpmc if the address range being unmapped contains
554 * an executable region.
555 */
556 pkm.pm_address = (uintptr_t) NULL;
557 if (vm_map_lookup_entry(map, addr, &entry)) {
558 for (;
559 entry != &map->header && entry->start < addr + size;
560 entry = entry->next) {
561 if (vm_map_check_protection(map, entry->start,
562 entry->end, VM_PROT_EXECUTE) == TRUE) {
563 pkm.pm_address = (uintptr_t) addr;
564 pkm.pm_size = (size_t) size;
565 break;
566 }
567 }
568 }
569 #endif
570 vm_map_delete(map, addr, addr + size);
571
572 #ifdef HWPMC_HOOKS
573 /* downgrade the lock to prevent a LOR with the pmc-sx lock */
574 vm_map_lock_downgrade(map);
575 if (pkm.pm_address != (uintptr_t) NULL)
576 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
577 vm_map_unlock_read(map);
578 #else
579 vm_map_unlock(map);
580 #endif
581 /* vm_map_delete returns nothing but KERN_SUCCESS anyway */
582 return (0);
583 }
584
585 #ifndef _SYS_SYSPROTO_H_
586 struct mprotect_args {
587 const void *addr;
588 size_t len;
589 int prot;
590 };
591 #endif
592 /*
593 * MPSAFE
594 */
595 int
sys_mprotect(td,uap)596 sys_mprotect(td, uap)
597 struct thread *td;
598 struct mprotect_args *uap;
599 {
600 vm_offset_t addr;
601 vm_size_t size, pageoff;
602 vm_prot_t prot;
603
604 addr = (vm_offset_t) uap->addr;
605 size = uap->len;
606 prot = uap->prot & VM_PROT_ALL;
607
608 pageoff = (addr & PAGE_MASK);
609 addr -= pageoff;
610 size += pageoff;
611 size = (vm_size_t) round_page(size);
612 if (addr + size < addr)
613 return (EINVAL);
614
615 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
616 addr + size, prot, FALSE)) {
617 case KERN_SUCCESS:
618 return (0);
619 case KERN_PROTECTION_FAILURE:
620 return (EACCES);
621 case KERN_RESOURCE_SHORTAGE:
622 return (ENOMEM);
623 }
624 return (EINVAL);
625 }
626
627 #ifndef _SYS_SYSPROTO_H_
628 struct minherit_args {
629 void *addr;
630 size_t len;
631 int inherit;
632 };
633 #endif
634 /*
635 * MPSAFE
636 */
637 int
sys_minherit(td,uap)638 sys_minherit(td, uap)
639 struct thread *td;
640 struct minherit_args *uap;
641 {
642 vm_offset_t addr;
643 vm_size_t size, pageoff;
644 vm_inherit_t inherit;
645
646 addr = (vm_offset_t)uap->addr;
647 size = uap->len;
648 inherit = uap->inherit;
649
650 pageoff = (addr & PAGE_MASK);
651 addr -= pageoff;
652 size += pageoff;
653 size = (vm_size_t) round_page(size);
654 if (addr + size < addr)
655 return (EINVAL);
656
657 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
658 addr + size, inherit)) {
659 case KERN_SUCCESS:
660 return (0);
661 case KERN_PROTECTION_FAILURE:
662 return (EACCES);
663 }
664 return (EINVAL);
665 }
666
667 #ifndef _SYS_SYSPROTO_H_
668 struct madvise_args {
669 void *addr;
670 size_t len;
671 int behav;
672 };
673 #endif
674
675 /*
676 * MPSAFE
677 */
678 int
sys_madvise(td,uap)679 sys_madvise(td, uap)
680 struct thread *td;
681 struct madvise_args *uap;
682 {
683 vm_offset_t start, end;
684 vm_map_t map;
685 int flags;
686
687 /*
688 * Check for our special case, advising the swap pager we are
689 * "immortal."
690 */
691 if (uap->behav == MADV_PROTECT) {
692 flags = PPROT_SET;
693 return (kern_procctl(td, P_PID, td->td_proc->p_pid,
694 PROC_SPROTECT, &flags));
695 }
696
697 /*
698 * Check for illegal behavior
699 */
700 if (uap->behav < 0 || uap->behav > MADV_CORE)
701 return (EINVAL);
702 /*
703 * Check for illegal addresses. Watch out for address wrap... Note
704 * that VM_*_ADDRESS are not constants due to casts (argh).
705 */
706 map = &td->td_proc->p_vmspace->vm_map;
707 if ((vm_offset_t)uap->addr < vm_map_min(map) ||
708 (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
709 return (EINVAL);
710 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
711 return (EINVAL);
712
713 /*
714 * Since this routine is only advisory, we default to conservative
715 * behavior.
716 */
717 start = trunc_page((vm_offset_t) uap->addr);
718 end = round_page((vm_offset_t) uap->addr + uap->len);
719
720 if (vm_map_madvise(map, start, end, uap->behav))
721 return (EINVAL);
722 return (0);
723 }
724
725 #ifndef _SYS_SYSPROTO_H_
726 struct mincore_args {
727 const void *addr;
728 size_t len;
729 char *vec;
730 };
731 #endif
732
733 /*
734 * MPSAFE
735 */
736 int
sys_mincore(td,uap)737 sys_mincore(td, uap)
738 struct thread *td;
739 struct mincore_args *uap;
740 {
741 vm_offset_t addr, first_addr;
742 vm_offset_t end, cend;
743 pmap_t pmap;
744 vm_map_t map;
745 char *vec;
746 int error = 0;
747 int vecindex, lastvecindex;
748 vm_map_entry_t current;
749 vm_map_entry_t entry;
750 vm_object_t object;
751 vm_paddr_t locked_pa;
752 vm_page_t m;
753 vm_pindex_t pindex;
754 int mincoreinfo;
755 unsigned int timestamp;
756 boolean_t locked;
757
758 /*
759 * Make sure that the addresses presented are valid for user
760 * mode.
761 */
762 first_addr = addr = trunc_page((vm_offset_t) uap->addr);
763 end = addr + (vm_size_t)round_page(uap->len);
764 map = &td->td_proc->p_vmspace->vm_map;
765 if (end > vm_map_max(map) || end < addr)
766 return (ENOMEM);
767
768 /*
769 * Address of byte vector
770 */
771 vec = uap->vec;
772
773 pmap = vmspace_pmap(td->td_proc->p_vmspace);
774
775 vm_map_lock_read(map);
776 RestartScan:
777 timestamp = map->timestamp;
778
779 if (!vm_map_lookup_entry(map, addr, &entry)) {
780 vm_map_unlock_read(map);
781 return (ENOMEM);
782 }
783
784 /*
785 * Do this on a map entry basis so that if the pages are not
786 * in the current processes address space, we can easily look
787 * up the pages elsewhere.
788 */
789 lastvecindex = -1;
790 for (current = entry;
791 (current != &map->header) && (current->start < end);
792 current = current->next) {
793
794 /*
795 * check for contiguity
796 */
797 if (current->end < end &&
798 (entry->next == &map->header ||
799 current->next->start > current->end)) {
800 vm_map_unlock_read(map);
801 return (ENOMEM);
802 }
803
804 /*
805 * ignore submaps (for now) or null objects
806 */
807 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
808 current->object.vm_object == NULL)
809 continue;
810
811 /*
812 * limit this scan to the current map entry and the
813 * limits for the mincore call
814 */
815 if (addr < current->start)
816 addr = current->start;
817 cend = current->end;
818 if (cend > end)
819 cend = end;
820
821 /*
822 * scan this entry one page at a time
823 */
824 while (addr < cend) {
825 /*
826 * Check pmap first, it is likely faster, also
827 * it can provide info as to whether we are the
828 * one referencing or modifying the page.
829 */
830 object = NULL;
831 locked_pa = 0;
832 retry:
833 m = NULL;
834 mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
835 if (locked_pa != 0) {
836 /*
837 * The page is mapped by this process but not
838 * both accessed and modified. It is also
839 * managed. Acquire the object lock so that
840 * other mappings might be examined.
841 */
842 m = PHYS_TO_VM_PAGE(locked_pa);
843 if (m->object != object) {
844 if (object != NULL)
845 VM_OBJECT_WUNLOCK(object);
846 object = m->object;
847 locked = VM_OBJECT_TRYWLOCK(object);
848 vm_page_unlock(m);
849 if (!locked) {
850 VM_OBJECT_WLOCK(object);
851 vm_page_lock(m);
852 goto retry;
853 }
854 } else
855 vm_page_unlock(m);
856 KASSERT(m->valid == VM_PAGE_BITS_ALL,
857 ("mincore: page %p is mapped but invalid",
858 m));
859 } else if (mincoreinfo == 0) {
860 /*
861 * The page is not mapped by this process. If
862 * the object implements managed pages, then
863 * determine if the page is resident so that
864 * the mappings might be examined.
865 */
866 if (current->object.vm_object != object) {
867 if (object != NULL)
868 VM_OBJECT_WUNLOCK(object);
869 object = current->object.vm_object;
870 VM_OBJECT_WLOCK(object);
871 }
872 if (object->type == OBJT_DEFAULT ||
873 object->type == OBJT_SWAP ||
874 object->type == OBJT_VNODE) {
875 pindex = OFF_TO_IDX(current->offset +
876 (addr - current->start));
877 m = vm_page_lookup(object, pindex);
878 if (m != NULL && m->valid == 0)
879 m = NULL;
880 if (m != NULL)
881 mincoreinfo = MINCORE_INCORE;
882 }
883 }
884 if (m != NULL) {
885 /* Examine other mappings to the page. */
886 if (m->dirty == 0 && pmap_is_modified(m))
887 vm_page_dirty(m);
888 if (m->dirty != 0)
889 mincoreinfo |= MINCORE_MODIFIED_OTHER;
890 /*
891 * The first test for PGA_REFERENCED is an
892 * optimization. The second test is
893 * required because a concurrent pmap
894 * operation could clear the last reference
895 * and set PGA_REFERENCED before the call to
896 * pmap_is_referenced().
897 */
898 if ((m->aflags & PGA_REFERENCED) != 0 ||
899 pmap_is_referenced(m) ||
900 (m->aflags & PGA_REFERENCED) != 0)
901 mincoreinfo |= MINCORE_REFERENCED_OTHER;
902 }
903 if (object != NULL)
904 VM_OBJECT_WUNLOCK(object);
905
906 /*
907 * subyte may page fault. In case it needs to modify
908 * the map, we release the lock.
909 */
910 vm_map_unlock_read(map);
911
912 /*
913 * calculate index into user supplied byte vector
914 */
915 vecindex = OFF_TO_IDX(addr - first_addr);
916
917 /*
918 * If we have skipped map entries, we need to make sure that
919 * the byte vector is zeroed for those skipped entries.
920 */
921 while ((lastvecindex + 1) < vecindex) {
922 ++lastvecindex;
923 error = subyte(vec + lastvecindex, 0);
924 if (error) {
925 error = EFAULT;
926 goto done2;
927 }
928 }
929
930 /*
931 * Pass the page information to the user
932 */
933 error = subyte(vec + vecindex, mincoreinfo);
934 if (error) {
935 error = EFAULT;
936 goto done2;
937 }
938
939 /*
940 * If the map has changed, due to the subyte, the previous
941 * output may be invalid.
942 */
943 vm_map_lock_read(map);
944 if (timestamp != map->timestamp)
945 goto RestartScan;
946
947 lastvecindex = vecindex;
948 addr += PAGE_SIZE;
949 }
950 }
951
952 /*
953 * subyte may page fault. In case it needs to modify
954 * the map, we release the lock.
955 */
956 vm_map_unlock_read(map);
957
958 /*
959 * Zero the last entries in the byte vector.
960 */
961 vecindex = OFF_TO_IDX(end - first_addr);
962 while ((lastvecindex + 1) < vecindex) {
963 ++lastvecindex;
964 error = subyte(vec + lastvecindex, 0);
965 if (error) {
966 error = EFAULT;
967 goto done2;
968 }
969 }
970
971 /*
972 * If the map has changed, due to the subyte, the previous
973 * output may be invalid.
974 */
975 vm_map_lock_read(map);
976 if (timestamp != map->timestamp)
977 goto RestartScan;
978 vm_map_unlock_read(map);
979 done2:
980 return (error);
981 }
982
983 #ifndef _SYS_SYSPROTO_H_
984 struct mlock_args {
985 const void *addr;
986 size_t len;
987 };
988 #endif
989 /*
990 * MPSAFE
991 */
992 int
sys_mlock(td,uap)993 sys_mlock(td, uap)
994 struct thread *td;
995 struct mlock_args *uap;
996 {
997
998 return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
999 }
1000
1001 int
vm_mlock(struct proc * proc,struct ucred * cred,const void * addr0,size_t len)1002 vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
1003 {
1004 vm_offset_t addr, end, last, start;
1005 vm_size_t npages, size;
1006 vm_map_t map;
1007 unsigned long nsize;
1008 int error;
1009
1010 error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
1011 if (error)
1012 return (error);
1013 addr = (vm_offset_t)addr0;
1014 size = len;
1015 last = addr + size;
1016 start = trunc_page(addr);
1017 end = round_page(last);
1018 if (last < addr || end < addr)
1019 return (EINVAL);
1020 npages = atop(end - start);
1021 if (npages > vm_page_max_wired)
1022 return (ENOMEM);
1023 map = &proc->p_vmspace->vm_map;
1024 PROC_LOCK(proc);
1025 nsize = ptoa(npages + pmap_wired_count(map->pmap));
1026 if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
1027 PROC_UNLOCK(proc);
1028 return (ENOMEM);
1029 }
1030 PROC_UNLOCK(proc);
1031 if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
1032 return (EAGAIN);
1033 #ifdef RACCT
1034 if (racct_enable) {
1035 PROC_LOCK(proc);
1036 error = racct_set(proc, RACCT_MEMLOCK, nsize);
1037 PROC_UNLOCK(proc);
1038 if (error != 0)
1039 return (ENOMEM);
1040 }
1041 #endif
1042 error = vm_map_wire(map, start, end,
1043 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1044 #ifdef RACCT
1045 if (racct_enable && error != KERN_SUCCESS) {
1046 PROC_LOCK(proc);
1047 racct_set(proc, RACCT_MEMLOCK,
1048 ptoa(pmap_wired_count(map->pmap)));
1049 PROC_UNLOCK(proc);
1050 }
1051 #endif
1052 return (error == KERN_SUCCESS ? 0 : ENOMEM);
1053 }
1054
1055 #ifndef _SYS_SYSPROTO_H_
1056 struct mlockall_args {
1057 int how;
1058 };
1059 #endif
1060
1061 /*
1062 * MPSAFE
1063 */
1064 int
sys_mlockall(td,uap)1065 sys_mlockall(td, uap)
1066 struct thread *td;
1067 struct mlockall_args *uap;
1068 {
1069 vm_map_t map;
1070 int error;
1071
1072 map = &td->td_proc->p_vmspace->vm_map;
1073 error = priv_check(td, PRIV_VM_MLOCK);
1074 if (error)
1075 return (error);
1076
1077 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1078 return (EINVAL);
1079
1080 /*
1081 * If wiring all pages in the process would cause it to exceed
1082 * a hard resource limit, return ENOMEM.
1083 */
1084 if (!old_mlock && uap->how & MCL_CURRENT) {
1085 PROC_LOCK(td->td_proc);
1086 if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) {
1087 PROC_UNLOCK(td->td_proc);
1088 return (ENOMEM);
1089 }
1090 PROC_UNLOCK(td->td_proc);
1091 }
1092 #ifdef RACCT
1093 if (racct_enable) {
1094 PROC_LOCK(td->td_proc);
1095 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
1096 PROC_UNLOCK(td->td_proc);
1097 if (error != 0)
1098 return (ENOMEM);
1099 }
1100 #endif
1101
1102 if (uap->how & MCL_FUTURE) {
1103 vm_map_lock(map);
1104 vm_map_modflags(map, MAP_WIREFUTURE, 0);
1105 vm_map_unlock(map);
1106 error = 0;
1107 }
1108
1109 if (uap->how & MCL_CURRENT) {
1110 /*
1111 * P1003.1-2001 mandates that all currently mapped pages
1112 * will be memory resident and locked (wired) upon return
1113 * from mlockall(). vm_map_wire() will wire pages, by
1114 * calling vm_fault_wire() for each page in the region.
1115 */
1116 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1117 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1118 error = (error == KERN_SUCCESS ? 0 : EAGAIN);
1119 }
1120 #ifdef RACCT
1121 if (racct_enable && error != KERN_SUCCESS) {
1122 PROC_LOCK(td->td_proc);
1123 racct_set(td->td_proc, RACCT_MEMLOCK,
1124 ptoa(pmap_wired_count(map->pmap)));
1125 PROC_UNLOCK(td->td_proc);
1126 }
1127 #endif
1128
1129 return (error);
1130 }
1131
1132 #ifndef _SYS_SYSPROTO_H_
1133 struct munlockall_args {
1134 register_t dummy;
1135 };
1136 #endif
1137
1138 /*
1139 * MPSAFE
1140 */
1141 int
sys_munlockall(td,uap)1142 sys_munlockall(td, uap)
1143 struct thread *td;
1144 struct munlockall_args *uap;
1145 {
1146 vm_map_t map;
1147 int error;
1148
1149 map = &td->td_proc->p_vmspace->vm_map;
1150 error = priv_check(td, PRIV_VM_MUNLOCK);
1151 if (error)
1152 return (error);
1153
1154 /* Clear the MAP_WIREFUTURE flag from this vm_map. */
1155 vm_map_lock(map);
1156 vm_map_modflags(map, 0, MAP_WIREFUTURE);
1157 vm_map_unlock(map);
1158
1159 /* Forcibly unwire all pages. */
1160 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1161 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1162 #ifdef RACCT
1163 if (racct_enable && error == KERN_SUCCESS) {
1164 PROC_LOCK(td->td_proc);
1165 racct_set(td->td_proc, RACCT_MEMLOCK, 0);
1166 PROC_UNLOCK(td->td_proc);
1167 }
1168 #endif
1169
1170 return (error);
1171 }
1172
1173 #ifndef _SYS_SYSPROTO_H_
1174 struct munlock_args {
1175 const void *addr;
1176 size_t len;
1177 };
1178 #endif
1179 /*
1180 * MPSAFE
1181 */
1182 int
sys_munlock(td,uap)1183 sys_munlock(td, uap)
1184 struct thread *td;
1185 struct munlock_args *uap;
1186 {
1187 vm_offset_t addr, end, last, start;
1188 vm_size_t size;
1189 #ifdef RACCT
1190 vm_map_t map;
1191 #endif
1192 int error;
1193
1194 error = priv_check(td, PRIV_VM_MUNLOCK);
1195 if (error)
1196 return (error);
1197 addr = (vm_offset_t)uap->addr;
1198 size = uap->len;
1199 last = addr + size;
1200 start = trunc_page(addr);
1201 end = round_page(last);
1202 if (last < addr || end < addr)
1203 return (EINVAL);
1204 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1205 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1206 #ifdef RACCT
1207 if (racct_enable && error == KERN_SUCCESS) {
1208 PROC_LOCK(td->td_proc);
1209 map = &td->td_proc->p_vmspace->vm_map;
1210 racct_set(td->td_proc, RACCT_MEMLOCK,
1211 ptoa(pmap_wired_count(map->pmap)));
1212 PROC_UNLOCK(td->td_proc);
1213 }
1214 #endif
1215 return (error == KERN_SUCCESS ? 0 : ENOMEM);
1216 }
1217
1218 /*
1219 * vm_mmap_vnode()
1220 *
1221 * Helper function for vm_mmap. Perform sanity check specific for mmap
1222 * operations on vnodes.
1223 */
1224 int
vm_mmap_vnode(struct thread * td,vm_size_t objsize,vm_prot_t prot,vm_prot_t * maxprotp,int * flagsp,struct vnode * vp,vm_ooffset_t * foffp,vm_object_t * objp,boolean_t * writecounted)1225 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1226 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1227 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
1228 boolean_t *writecounted)
1229 {
1230 struct vattr va;
1231 vm_object_t obj;
1232 vm_offset_t foff;
1233 struct ucred *cred;
1234 int error, flags, locktype;
1235
1236 cred = td->td_ucred;
1237 if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
1238 locktype = LK_EXCLUSIVE;
1239 else
1240 locktype = LK_SHARED;
1241 if ((error = vget(vp, locktype, td)) != 0)
1242 return (error);
1243 foff = *foffp;
1244 flags = *flagsp;
1245 obj = vp->v_object;
1246 if (vp->v_type == VREG) {
1247 /*
1248 * Get the proper underlying object
1249 */
1250 if (obj == NULL) {
1251 error = EINVAL;
1252 goto done;
1253 }
1254 if (obj->type == OBJT_VNODE && obj->handle != vp) {
1255 vput(vp);
1256 vp = (struct vnode *)obj->handle;
1257 /*
1258 * Bypass filesystems obey the mpsafety of the
1259 * underlying fs. Tmpfs never bypasses.
1260 */
1261 error = vget(vp, locktype, td);
1262 if (error != 0)
1263 return (error);
1264 }
1265 if (locktype == LK_EXCLUSIVE) {
1266 *writecounted = TRUE;
1267 vnode_pager_update_writecount(obj, 0, objsize);
1268 }
1269 } else {
1270 error = EINVAL;
1271 goto done;
1272 }
1273 if ((error = VOP_GETATTR(vp, &va, cred)))
1274 goto done;
1275 #ifdef MAC
1276 /* This relies on VM_PROT_* matching PROT_*. */
1277 error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
1278 if (error != 0)
1279 goto done;
1280 #endif
1281 if ((flags & MAP_SHARED) != 0) {
1282 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1283 if (prot & VM_PROT_WRITE) {
1284 error = EPERM;
1285 goto done;
1286 }
1287 *maxprotp &= ~VM_PROT_WRITE;
1288 }
1289 }
1290 /*
1291 * If it is a regular file without any references
1292 * we do not need to sync it.
1293 * Adjust object size to be the size of actual file.
1294 */
1295 objsize = round_page(va.va_size);
1296 if (va.va_nlink == 0)
1297 flags |= MAP_NOSYNC;
1298 if (obj->type == OBJT_VNODE) {
1299 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
1300 cred);
1301 if (obj == NULL) {
1302 error = ENOMEM;
1303 goto done;
1304 }
1305 } else {
1306 KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
1307 ("wrong object type"));
1308 VM_OBJECT_WLOCK(obj);
1309 vm_object_reference_locked(obj);
1310 #if VM_NRESERVLEVEL > 0
1311 vm_object_color(obj, 0);
1312 #endif
1313 VM_OBJECT_WUNLOCK(obj);
1314 }
1315 *objp = obj;
1316 *flagsp = flags;
1317
1318 vfs_mark_atime(vp, cred);
1319
1320 done:
1321 if (error != 0 && *writecounted) {
1322 *writecounted = FALSE;
1323 vnode_pager_update_writecount(obj, objsize, 0);
1324 }
1325 vput(vp);
1326 return (error);
1327 }
1328
1329 /*
1330 * vm_mmap_cdev()
1331 *
1332 * MPSAFE
1333 *
1334 * Helper function for vm_mmap. Perform sanity check specific for mmap
1335 * operations on cdevs.
1336 */
1337 int
vm_mmap_cdev(struct thread * td,vm_size_t objsize,vm_prot_t prot,vm_prot_t * maxprotp,int * flagsp,struct cdev * cdev,struct cdevsw * dsw,vm_ooffset_t * foff,vm_object_t * objp)1338 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
1339 vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
1340 vm_ooffset_t *foff, vm_object_t *objp)
1341 {
1342 vm_object_t obj;
1343 int error, flags;
1344
1345 flags = *flagsp;
1346
1347 if (dsw->d_flags & D_MMAP_ANON) {
1348 *objp = NULL;
1349 *foff = 0;
1350 *maxprotp = VM_PROT_ALL;
1351 *flagsp |= MAP_ANON;
1352 return (0);
1353 }
1354 /*
1355 * cdevs do not provide private mappings of any kind.
1356 */
1357 if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1358 (prot & VM_PROT_WRITE) != 0)
1359 return (EACCES);
1360 if (flags & (MAP_PRIVATE|MAP_COPY))
1361 return (EINVAL);
1362 /*
1363 * Force device mappings to be shared.
1364 */
1365 flags |= MAP_SHARED;
1366 #ifdef MAC_XXX
1367 error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
1368 if (error != 0)
1369 return (error);
1370 #endif
1371 /*
1372 * First, try d_mmap_single(). If that is not implemented
1373 * (returns ENODEV), fall back to using the device pager.
1374 * Note that d_mmap_single() must return a reference to the
1375 * object (it needs to bump the reference count of the object
1376 * it returns somehow).
1377 *
1378 * XXX assumes VM_PROT_* == PROT_*
1379 */
1380 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
1381 if (error != ENODEV)
1382 return (error);
1383 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
1384 td->td_ucred);
1385 if (obj == NULL)
1386 return (EINVAL);
1387 *objp = obj;
1388 *flagsp = flags;
1389 return (0);
1390 }
1391
1392 /*
1393 * vm_mmap()
1394 *
1395 * Internal version of mmap used by exec, sys5 shared memory, and
1396 * various device drivers. Handle is either a vnode pointer, a
1397 * character device, or NULL for MAP_ANON.
1398 */
1399 int
vm_mmap(vm_map_t map,vm_offset_t * addr,vm_size_t size,vm_prot_t prot,vm_prot_t maxprot,int flags,objtype_t handle_type,void * handle,vm_ooffset_t foff)1400 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1401 vm_prot_t maxprot, int flags,
1402 objtype_t handle_type, void *handle,
1403 vm_ooffset_t foff)
1404 {
1405 vm_object_t object;
1406 struct thread *td = curthread;
1407 int error;
1408 boolean_t writecounted;
1409
1410 if (size == 0)
1411 return (EINVAL);
1412
1413 size = round_page(size);
1414 object = NULL;
1415 writecounted = FALSE;
1416
1417 /*
1418 * Lookup/allocate object.
1419 */
1420 switch (handle_type) {
1421 case OBJT_DEVICE: {
1422 struct cdevsw *dsw;
1423 struct cdev *cdev;
1424 int ref;
1425
1426 cdev = handle;
1427 dsw = dev_refthread(cdev, &ref);
1428 if (dsw == NULL)
1429 return (ENXIO);
1430 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
1431 dsw, &foff, &object);
1432 dev_relthread(cdev, ref);
1433 break;
1434 }
1435 case OBJT_VNODE:
1436 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1437 handle, &foff, &object, &writecounted);
1438 break;
1439 case OBJT_DEFAULT:
1440 if (handle == NULL) {
1441 error = 0;
1442 break;
1443 }
1444 /* FALLTHROUGH */
1445 default:
1446 error = EINVAL;
1447 break;
1448 }
1449 if (error)
1450 return (error);
1451
1452 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
1453 foff, writecounted, td);
1454 if (error != 0 && object != NULL) {
1455 /*
1456 * If this mapping was accounted for in the vnode's
1457 * writecount, then undo that now.
1458 */
1459 if (writecounted)
1460 vnode_pager_release_writecount(object, 0, size);
1461 vm_object_deallocate(object);
1462 }
1463 return (error);
1464 }
1465
/*
 * Internal version of mmap that maps a specific VM object into a
 * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 *
 *	map		target address map
 *	addr		in/out: hint or fixed address; passed by reference
 *			to vm_map_find() when MAP_FIXED is not set
 *	size		length of the mapping
 *	prot/maxprot	initial and maximum protection
 *	flags		MAP_* flags from the request
 *	object		object to map; must be NULL for MAP_ANON and
 *			MAP_STACK requests
 *	foff		offset into the object; must be page aligned
 *	writecounted	TRUE adds MAP_VN_WRITECOUNT to the entry flags
 *	td		thread on whose behalf the mapping is made
 *
 * Returns 0 on success or an errno value.
 */
int
vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
    boolean_t writecounted, struct thread *td)
{
	boolean_t fitit;
	int docow, error, findspace, rv;

	/*
	 * Resource limits and racct accounting are applied only when the
	 * target is the calling process's own address map.
	 */
	if (map == &td->td_proc->p_vmspace->vm_map) {
		PROC_LOCK(td->td_proc);
		if (map->size + size > lim_cur_proc(td->td_proc, RLIMIT_VMEM)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
			PROC_UNLOCK(td->td_proc);
			return (ENOMEM);
		}
		/*
		 * With MAP_WIREFUTURE set the new mapping is wired below,
		 * so it is also checked against the locked-memory limit.
		 */
		if (!old_mlock && map->flags & MAP_WIREFUTURE) {
			if (ptoa(pmap_wired_count(map->pmap)) + size >
			    lim_cur_proc(td->td_proc, RLIMIT_MEMLOCK)) {
				/* Put RACCT_VMEM back to the current size. */
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (ENOMEM);
			}
			error = racct_set(td->td_proc, RACCT_MEMLOCK,
			    ptoa(pmap_wired_count(map->pmap)) + size);
			if (error != 0) {
				/* Put RACCT_VMEM back to the current size. */
				racct_set_force(td->td_proc, RACCT_VMEM,
				    map->size);
				PROC_UNLOCK(td->td_proc);
				return (error);
			}
		}
		PROC_UNLOCK(td->td_proc);
	}

	/*
	 * NOTE(review): the error returns below do not reset the racct
	 * values set above -- confirm whether that is intended.
	 */

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The mmap() system call already enforces this by subtracting
	 * the page offset from the file offset, but checking here
	 * catches errors in device drivers (e.g. d_mmap_single()
	 * callbacks) and other internal mapping requests (such as in
	 * exec).
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	/*
	 * Without MAP_FIXED the address is only a hint and is rounded up
	 * to a page boundary; with MAP_FIXED it must already be aligned.
	 */
	if ((flags & MAP_FIXED) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		fitit = FALSE;
	}

	/* Translate the MAP_* request flags into map entry (docow) flags. */
	if (flags & MAP_ANON) {
		if (object != NULL || foff != 0)
			return (EINVAL);
		docow = 0;
	} else if (flags & MAP_PREFAULT_READ)
		docow = MAP_PREFAULT;
	else
		docow = MAP_PREFAULT_PARTIAL;

	/* Neither anonymous nor shared: the mapping is copy-on-write. */
	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;
	/* Shared memory is also shared with children. */
	if (flags & MAP_SHARED)
		docow |= MAP_INHERIT_SHARE;
	if (writecounted)
		docow |= MAP_VN_WRITECOUNT;
	if (flags & MAP_STACK) {
		/* MAP_STACK requests may not carry a backing object. */
		if (object != NULL)
			return (EINVAL);
		docow |= MAP_STACK_GROWS_DOWN;
	}
	if ((flags & MAP_EXCL) != 0)
		docow |= MAP_CHECK_EXCL;

	if (fitit) {
		/* Choose the placement policy from the alignment bits. */
		if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
			findspace = VMFS_SUPER_SPACE;
		else if ((flags & MAP_ALIGNMENT_MASK) != 0)
			findspace = VMFS_ALIGNED_SPACE(flags >>
			    MAP_ALIGNMENT_SHIFT);
		else
			findspace = VMFS_OPTIMAL_SPACE;
		/*
		 * The argument after size is MAP_32BIT_MAX_ADDR when
		 * MAP_32BIT is defined and requested, and 0 otherwise.
		 */
		rv = vm_map_find(map, object, foff, addr, size,
#ifdef MAP_32BIT
		    flags & MAP_32BIT ? MAP_32BIT_MAX_ADDR :
#endif
		    0, findspace, prot, maxprot, docow);
	} else {
		rv = vm_map_fixed(map, object, foff, *addr, size,
		    prot, maxprot, docow);
	}

	if (rv == KERN_SUCCESS) {
		/*
		 * If the process has requested that all future mappings
		 * be wired, then heed this.  The result of vm_map_wire()
		 * is not checked.
		 */
		if (map->flags & MAP_WIREFUTURE) {
			vm_map_wire(map, *addr, *addr + size,
			    VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
			    VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
		}
	}
	return (vm_mmap_to_errno(rv));
}
1587
1588 /*
1589 * Translate a Mach VM return code to zero on success or the appropriate errno
1590 * on failure.
1591 */
1592 int
vm_mmap_to_errno(int rv)1593 vm_mmap_to_errno(int rv)
1594 {
1595
1596 switch (rv) {
1597 case KERN_SUCCESS:
1598 return (0);
1599 case KERN_INVALID_ADDRESS:
1600 case KERN_NO_SPACE:
1601 return (ENOMEM);
1602 case KERN_PROTECTION_FAILURE:
1603 return (EACCES);
1604 default:
1605 return (EINVAL);
1606 }
1607 }
1608