1 /*-
2 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3 *
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * The Mach Operating System project at Carnegie-Mellon University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *
35 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36 * All rights reserved.
37 *
38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39 *
40 * Permission to use, copy, modify and distribute this software and
41 * its documentation is hereby granted, provided that both the copyright
42 * notice and this permission notice appear in all copies of the
43 * software, derivative works or modified versions, and any portions
44 * thereof, and that both notices appear in supporting documentation.
45 *
46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49 *
50 * Carnegie Mellon requests users of this software to return to
51 *
52 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
53 * School of Computer Science
54 * Carnegie Mellon University
55 * Pittsburgh PA 15213-3890
56 *
57 * any improvements or extensions that they make and grant Carnegie the
58 * rights to redistribute these changes.
59 */
60
61 /*
62 * Virtual memory mapping module.
63 */
64
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/elf.h>
68 #include <sys/kernel.h>
69 #include <sys/ktr.h>
70 #include <sys/lock.h>
71 #include <sys/mutex.h>
72 #include <sys/proc.h>
73 #include <sys/vmmeter.h>
74 #include <sys/mman.h>
75 #include <sys/vnode.h>
76 #include <sys/racct.h>
77 #include <sys/resourcevar.h>
78 #include <sys/rwlock.h>
79 #include <sys/file.h>
80 #include <sys/sysctl.h>
81 #include <sys/sysent.h>
82 #include <sys/shm.h>
83
84 #include <vm/vm.h>
85 #include <vm/vm_param.h>
86 #include <vm/pmap.h>
87 #include <vm/vm_map.h>
88 #include <vm/vm_page.h>
89 #include <vm/vm_pageout.h>
90 #include <vm/vm_object.h>
91 #include <vm/vm_pager.h>
92 #include <vm/vm_radix.h>
93 #include <vm/vm_kern.h>
94 #include <vm/vm_extern.h>
95 #include <vm/vnode_pager.h>
96 #include <vm/swap_pager.h>
97 #include <vm/uma.h>
98
99 /*
100 * Virtual memory maps provide for the mapping, protection,
101 * and sharing of virtual memory objects. In addition,
102 * this module provides for an efficient virtual copy of
103 * memory from one map to another.
104 *
105 * Synchronization is required prior to most operations.
106 *
107 * Maps consist of an ordered doubly-linked list of simple
108 * entries; a self-adjusting binary search tree of these
109 * entries is used to speed up lookups.
110 *
111 * Since portions of maps are specified by start/end addresses,
112 * which may not align with existing map entries, all
113 * routines merely "clip" entries to these start/end values.
114 * [That is, an entry is split into two, bordering at a
115 * start or end value.] Note that these clippings may not
116 * always be necessary (as the two resulting entries are then
117 * not changed); however, the clipping is done for convenience.
118 *
119 * As mentioned above, virtual copy operations are performed
120 * by copying VM object references from one map to
121 * another, and then marking both regions as copy-on-write.
122 */
123
/* Interlock used by _vm_map_unlock_and_wait() and vm_map_wakeup(). */
static struct mtx map_sleep_mtx;
/* Zone from which map entries of non-system maps are allocated. */
static uma_zone_t mapentzone;
/* Zone from which map entries of system maps are allocated. */
static uma_zone_t kmapentzone;
/* Zone from which vmspace structures are allocated. */
static uma_zone_t vmspace_zone;

/* Forward declarations of file-local routines. */
static int vmspace_zinit(void *mem, int size, int flags);
static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
    vm_offset_t max);
static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
    vm_map_entry_t gap_entry);
static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
#ifdef INVARIANTS
/* Only registered as the vmspace zone destructor under INVARIANTS. */
static void vmspace_zdtor(void *mem, int size, void *arg);
#endif
static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
    vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
    int cow);
static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
    vm_offset_t failed_addr);
146
/* Evaluates to true when every bit set in "bits" is also set in "set". */
#define	CONTAINS_BITS(set, bits)	(((bits) & ~(set)) == 0)
148
/*
 * True if the entry carries its own credential, or if it references a VM
 * object that carries a credential and the entry is not flagged
 * MAP_ENTRY_NEEDS_COPY.
 */
#define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
    ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
     !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))

/*
 * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
 * stable.  Both macros expand to an empty statement and do not evaluate
 * their argument.
 */
#define PROC_VMSPACE_LOCK(p) do { } while (0)
#define PROC_VMSPACE_UNLOCK(p) do { } while (0)
159
/*
 *	VM_MAP_RANGE_CHECK:	[ internal use only ]
 *
 *	Clamps the starting and ending region addresses so that they
 *	fall within the valid range of the map: "start" is raised to
 *	vm_map_min(map), "end" is lowered to vm_map_max(map), and if
 *	"start" still exceeds "end" it is set to "end".  Despite the
 *	name, nothing is asserted; the arguments are silently adjusted.
 *
 *	"start" and "end" are assigned to and are evaluated more than
 *	once, so they must be lvalues without side effects.
 */
#define	VM_MAP_RANGE_CHECK(map, start, end)		\
		{					\
		if (start < vm_map_min(map))		\
			start = vm_map_min(map);	\
		if (end > vm_map_max(map))		\
			end = vm_map_max(map);		\
		if (start > end)			\
			start = end;			\
		}
175
176 #ifndef UMA_USE_DMAP
177
178 /*
179 * Allocate a new slab for kernel map entries. The kernel map may be locked or
180 * unlocked, depending on whether the request is coming from the kernel map or a
181 * submap. This function allocates a virtual address range directly from the
182 * kernel map instead of the kmem_* layer to avoid recursion on the kernel map
183 * lock and also to avoid triggering allocator recursion in the vmem boundary
184 * tag allocator.
185 */
186 static void *
kmapent_alloc(uma_zone_t zone,vm_size_t bytes,int domain,uint8_t * pflag,int wait)187 kmapent_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
188 int wait)
189 {
190 vm_offset_t addr;
191 int error, locked;
192
193 *pflag = UMA_SLAB_PRIV;
194
195 if (!(locked = vm_map_locked(kernel_map)))
196 vm_map_lock(kernel_map);
197 addr = vm_map_findspace(kernel_map, vm_map_min(kernel_map), bytes);
198 if (addr + bytes < addr || addr + bytes > vm_map_max(kernel_map))
199 panic("%s: kernel map is exhausted", __func__);
200 error = vm_map_insert(kernel_map, NULL, 0, addr, addr + bytes,
201 VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
202 if (error != KERN_SUCCESS)
203 panic("%s: vm_map_insert() failed: %d", __func__, error);
204 if (!locked)
205 vm_map_unlock(kernel_map);
206 error = kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT |
207 M_USE_RESERVE | (wait & M_ZERO));
208 if (error == KERN_SUCCESS) {
209 return ((void *)addr);
210 } else {
211 if (!locked)
212 vm_map_lock(kernel_map);
213 vm_map_delete(kernel_map, addr, bytes);
214 if (!locked)
215 vm_map_unlock(kernel_map);
216 return (NULL);
217 }
218 }
219
220 static void
kmapent_free(void * item,vm_size_t size,uint8_t pflag)221 kmapent_free(void *item, vm_size_t size, uint8_t pflag)
222 {
223 vm_offset_t addr;
224 int error __diagused;
225
226 if ((pflag & UMA_SLAB_PRIV) == 0)
227 /* XXX leaked */
228 return;
229
230 addr = (vm_offset_t)item;
231 kmem_unback(kernel_object, addr, size);
232 error = vm_map_remove(kernel_map, addr, addr + size);
233 KASSERT(error == KERN_SUCCESS,
234 ("%s: vm_map_remove failed: %d", __func__, error));
235 }
236
/*
 * The worst-case upper bound on the number of kernel map entries that may be
 * created before the zone must be replenished in _vm_map_unlock().
 * vm_map_startup() reserves and preallocates one entry more than this.
 */
#define	KMAPENT_RESERVE		1
242
#endif /* !UMA_USE_DMAP */
244
245 /*
246 * vm_map_startup:
247 *
248 * Initialize the vm_map module. Must be called before any other vm_map
249 * routines.
250 *
251 * User map and entry structures are allocated from the general purpose
252 * memory pool. Kernel maps are statically defined. Kernel map entries
253 * require special handling to avoid recursion; see the comments above
254 * kmapent_alloc() and in vm_map_entry_create().
255 */
256 void
vm_map_startup(void)257 vm_map_startup(void)
258 {
259 mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
260
261 /*
262 * Disable the use of per-CPU buckets: map entry allocation is
263 * serialized by the kernel map lock.
264 */
265 kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
266 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
267 UMA_ZONE_VM | UMA_ZONE_NOBUCKET);
268 #ifndef UMA_USE_DMAP
269 /* Reserve an extra map entry for use when replenishing the reserve. */
270 uma_zone_reserve(kmapentzone, KMAPENT_RESERVE + 1);
271 uma_prealloc(kmapentzone, KMAPENT_RESERVE + 1);
272 uma_zone_set_allocf(kmapentzone, kmapent_alloc);
273 uma_zone_set_freef(kmapentzone, kmapent_free);
274 #endif
275
276 mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
277 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
278 vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
279 #ifdef INVARIANTS
280 vmspace_zdtor,
281 #else
282 NULL,
283 #endif
284 vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
285 }
286
287 static int
vmspace_zinit(void * mem,int size,int flags)288 vmspace_zinit(void *mem, int size, int flags)
289 {
290 struct vmspace *vm;
291 vm_map_t map;
292
293 vm = (struct vmspace *)mem;
294 map = &vm->vm_map;
295
296 memset(map, 0, sizeof(*map)); /* set MAP_SYSTEM_MAP to false */
297 sx_init(&map->lock, "vm map (user)");
298 PMAP_LOCK_INIT(vmspace_pmap(vm));
299 return (0);
300 }
301
#ifdef INVARIANTS
/*
 * Zone destructor for vmspace_zone (INVARIANTS only): verify that the map
 * of a vmspace being freed has no entries and no mapped size left.
 */
static void
vmspace_zdtor(void *mem, int size, void *arg)
{
	struct vmspace *vm = mem;
	vm_map_t map = &vm->vm_map;

	KASSERT(map->nentries == 0,
	    ("vmspace %p nentries == %d on free", vm, map->nentries));
	KASSERT(map->size == 0,
	    ("vmspace %p size == %ju on free", vm, (uintmax_t)map->size));
}
#endif	/* INVARIANTS */
315
316 /*
317 * Allocate a vmspace structure, including a vm_map and pmap,
318 * and initialize those structures. The refcnt is set to 1.
319 */
320 struct vmspace *
vmspace_alloc(vm_offset_t min,vm_offset_t max,pmap_pinit_t pinit)321 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
322 {
323 struct vmspace *vm;
324
325 vm = uma_zalloc(vmspace_zone, M_WAITOK);
326 KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
327 if (!pinit(vmspace_pmap(vm))) {
328 uma_zfree(vmspace_zone, vm);
329 return (NULL);
330 }
331 CTR1(KTR_VM, "vmspace_alloc: %p", vm);
332 _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
333 refcount_init(&vm->vm_refcnt, 1);
334 vm->vm_shm = NULL;
335 vm->vm_swrss = 0;
336 vm->vm_tsize = 0;
337 vm->vm_dsize = 0;
338 vm->vm_ssize = 0;
339 vm->vm_taddr = 0;
340 vm->vm_daddr = 0;
341 vm->vm_maxsaddr = 0;
342 return (vm);
343 }
344
#ifdef RACCT
/*
 * Reset the process's memory-related racct resources (data, stack, RSS,
 * locked memory and virtual memory) to zero, under the process lock.
 */
static void
vmspace_container_reset(struct proc *p)
{
	static const int resources[] = {
		RACCT_DATA,
		RACCT_STACK,
		RACCT_RSS,
		RACCT_MEMLOCK,
		RACCT_VMEM,
	};
	size_t i;

	PROC_LOCK(p);
	for (i = 0; i < sizeof(resources) / sizeof(resources[0]); i++)
		racct_set(p, resources[i], 0);
	PROC_UNLOCK(p);
}
#endif
359
360 static inline void
vmspace_dofree(struct vmspace * vm)361 vmspace_dofree(struct vmspace *vm)
362 {
363
364 CTR1(KTR_VM, "vmspace_free: %p", vm);
365
366 /*
367 * Make sure any SysV shm is freed, it might not have been in
368 * exit1().
369 */
370 shmexit(vm);
371
372 /*
373 * Lock the map, to wait out all other references to it.
374 * Delete all of the mappings and pages they hold, then call
375 * the pmap module to reclaim anything left.
376 */
377 (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
378 vm_map_max(&vm->vm_map));
379
380 pmap_release(vmspace_pmap(vm));
381 vm->vm_map.pmap = NULL;
382 uma_zfree(vmspace_zone, vm);
383 }
384
385 void
vmspace_free(struct vmspace * vm)386 vmspace_free(struct vmspace *vm)
387 {
388
389 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
390 "vmspace_free() called");
391
392 if (refcount_release(&vm->vm_refcnt))
393 vmspace_dofree(vm);
394 }
395
396 void
vmspace_exitfree(struct proc * p)397 vmspace_exitfree(struct proc *p)
398 {
399 struct vmspace *vm;
400
401 PROC_VMSPACE_LOCK(p);
402 vm = p->p_vmspace;
403 p->p_vmspace = NULL;
404 PROC_VMSPACE_UNLOCK(p);
405 KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
406 vmspace_free(vm);
407 }
408
409 void
vmspace_exit(struct thread * td)410 vmspace_exit(struct thread *td)
411 {
412 struct vmspace *vm;
413 struct proc *p;
414 bool released;
415
416 p = td->td_proc;
417 vm = p->p_vmspace;
418
419 /*
420 * Prepare to release the vmspace reference. The thread that releases
421 * the last reference is responsible for tearing down the vmspace.
422 * However, threads not releasing the final reference must switch to the
423 * kernel's vmspace0 before the decrement so that the subsequent pmap
424 * deactivation does not modify a freed vmspace.
425 */
426 refcount_acquire(&vmspace0.vm_refcnt);
427 if (!(released = refcount_release_if_last(&vm->vm_refcnt))) {
428 if (p->p_vmspace != &vmspace0) {
429 PROC_VMSPACE_LOCK(p);
430 p->p_vmspace = &vmspace0;
431 PROC_VMSPACE_UNLOCK(p);
432 pmap_activate(td);
433 }
434 released = refcount_release(&vm->vm_refcnt);
435 }
436 if (released) {
437 /*
438 * pmap_remove_pages() expects the pmap to be active, so switch
439 * back first if necessary.
440 */
441 if (p->p_vmspace != vm) {
442 PROC_VMSPACE_LOCK(p);
443 p->p_vmspace = vm;
444 PROC_VMSPACE_UNLOCK(p);
445 pmap_activate(td);
446 }
447 pmap_remove_pages(vmspace_pmap(vm));
448 PROC_VMSPACE_LOCK(p);
449 p->p_vmspace = &vmspace0;
450 PROC_VMSPACE_UNLOCK(p);
451 pmap_activate(td);
452 vmspace_dofree(vm);
453 }
454 #ifdef RACCT
455 if (racct_enable)
456 vmspace_container_reset(p);
457 #endif
458 }
459
460 /* Acquire reference to vmspace owned by another process. */
461
462 struct vmspace *
vmspace_acquire_ref(struct proc * p)463 vmspace_acquire_ref(struct proc *p)
464 {
465 struct vmspace *vm;
466
467 PROC_VMSPACE_LOCK(p);
468 vm = p->p_vmspace;
469 if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) {
470 PROC_VMSPACE_UNLOCK(p);
471 return (NULL);
472 }
473 if (vm != p->p_vmspace) {
474 PROC_VMSPACE_UNLOCK(p);
475 vmspace_free(vm);
476 return (NULL);
477 }
478 PROC_VMSPACE_UNLOCK(p);
479 return (vm);
480 }
481
482 /*
483 * Switch between vmspaces in an AIO kernel process.
484 *
485 * The new vmspace is either the vmspace of a user process obtained
486 * from an active AIO request or the initial vmspace of the AIO kernel
487 * process (when it is idling). Because user processes will block to
488 * drain any active AIO requests before proceeding in exit() or
489 * execve(), the reference count for vmspaces from AIO requests can
490 * never be 0. Similarly, AIO kernel processes hold an extra
491 * reference on their initial vmspace for the life of the process. As
492 * a result, the 'newvm' vmspace always has a non-zero reference
493 * count. This permits an additional reference on 'newvm' to be
494 * acquired via a simple atomic increment rather than the loop in
495 * vmspace_acquire_ref() above.
496 */
497 void
vmspace_switch_aio(struct vmspace * newvm)498 vmspace_switch_aio(struct vmspace *newvm)
499 {
500 struct vmspace *oldvm;
501
502 /* XXX: Need some way to assert that this is an aio daemon. */
503
504 KASSERT(refcount_load(&newvm->vm_refcnt) > 0,
505 ("vmspace_switch_aio: newvm unreferenced"));
506
507 oldvm = curproc->p_vmspace;
508 if (oldvm == newvm)
509 return;
510
511 /*
512 * Point to the new address space and refer to it.
513 */
514 curproc->p_vmspace = newvm;
515 refcount_acquire(&newvm->vm_refcnt);
516
517 /* Activate the new mapping. */
518 pmap_activate(curthread);
519
520 vmspace_free(oldvm);
521 }
522
523 void
_vm_map_lock(vm_map_t map,const char * file,int line)524 _vm_map_lock(vm_map_t map, const char *file, int line)
525 {
526
527 if (vm_map_is_system(map))
528 mtx_lock_flags_(&map->system_mtx, 0, file, line);
529 else
530 sx_xlock_(&map->lock, file, line);
531 map->timestamp++;
532 }
533
534 void
vm_map_entry_set_vnode_text(vm_map_entry_t entry,bool add)535 vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
536 {
537 vm_object_t object;
538 struct vnode *vp;
539 bool vp_held;
540
541 if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
542 return;
543 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
544 ("Submap with execs"));
545 object = entry->object.vm_object;
546 KASSERT(object != NULL, ("No object for text, entry %p", entry));
547 if ((object->flags & OBJ_ANON) != 0)
548 object = object->handle;
549 else
550 KASSERT(object->backing_object == NULL,
551 ("non-anon object %p shadows", object));
552 KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
553 entry, entry->object.vm_object));
554
555 /*
556 * Mostly, we do not lock the backing object. It is
557 * referenced by the entry we are processing, so it cannot go
558 * away.
559 */
560 vm_pager_getvp(object, &vp, &vp_held);
561 if (vp != NULL) {
562 if (add) {
563 VOP_SET_TEXT_CHECKED(vp);
564 } else {
565 vn_lock(vp, LK_SHARED | LK_RETRY);
566 VOP_UNSET_TEXT_CHECKED(vp);
567 VOP_UNLOCK(vp);
568 }
569 if (vp_held)
570 vdrop(vp);
571 }
572 }
573
/*
 * Use a different name for this vm_map_entry field when its use
 * is not consistent with its use as part of an ordered search tree.
 * vm_map_process_deferred() follows it to walk the list of deferred
 * entries.
 */
#define	defer_next right
579
580 static void
vm_map_process_deferred(void)581 vm_map_process_deferred(void)
582 {
583 struct thread *td;
584 vm_map_entry_t entry, next;
585 vm_object_t object;
586
587 td = curthread;
588 entry = td->td_map_def_user;
589 td->td_map_def_user = NULL;
590 while (entry != NULL) {
591 next = entry->defer_next;
592 MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
593 MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
594 MAP_ENTRY_VN_EXEC));
595 if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
596 /*
597 * Decrement the object's writemappings and
598 * possibly the vnode's v_writecount.
599 */
600 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
601 ("Submap with writecount"));
602 object = entry->object.vm_object;
603 KASSERT(object != NULL, ("No object for writecount"));
604 vm_pager_release_writecount(object, entry->start,
605 entry->end);
606 }
607 vm_map_entry_set_vnode_text(entry, false);
608 vm_map_entry_deallocate(entry, FALSE);
609 entry = next;
610 }
611 }
612
613 #ifdef INVARIANTS
614 static void
_vm_map_assert_locked(vm_map_t map,const char * file,int line)615 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
616 {
617
618 if (vm_map_is_system(map))
619 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
620 else
621 sx_assert_(&map->lock, SA_XLOCKED, file, line);
622 }
623
/* Assert that the caller holds the map's lock; see _vm_map_assert_locked(). */
#define	VM_MAP_ASSERT_LOCKED(map) \
    _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)

/*
 * Levels of map consistency checking.  The debug.vmmap_check tunable
 * defaults to VMMAP_CHECK_UNLOCK under DIAGNOSTIC and to VMMAP_CHECK_NONE
 * otherwise.
 */
enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
#ifdef DIAGNOSTIC
static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
#else
static int enable_vmmap_check = VMMAP_CHECK_NONE;
#endif
SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
    &enable_vmmap_check, 0, "Enable vm map consistency checking");

static void _vm_map_assert_consistent(vm_map_t map, int check);

/* Request a consistency check at the VMMAP_CHECK_ALL level. */
#define VM_MAP_ASSERT_CONSISTENT(map) \
    _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
#ifdef DIAGNOSTIC
/*
 * Used on unlock paths: request a VMMAP_CHECK_UNLOCK-level check, and reset
 * the update counter, only once the number of updates exceeds the number
 * of entries in the map.
 */
#define VM_MAP_UNLOCK_CONSISTENT(map) do {				\
	if (map->nupdates > map->nentries) {				\
		_vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK);	\
		map->nupdates = 0;					\
	}								\
} while (0)
#else
#define VM_MAP_UNLOCK_CONSISTENT(map)
#endif
#else
/* Without INVARIANTS all of the checks expand to nothing. */
#define	VM_MAP_ASSERT_LOCKED(map)
#define VM_MAP_ASSERT_CONSISTENT(map)
#define VM_MAP_UNLOCK_CONSISTENT(map)
#endif /* INVARIANTS */
655
656 void
_vm_map_unlock(vm_map_t map,const char * file,int line)657 _vm_map_unlock(vm_map_t map, const char *file, int line)
658 {
659
660 VM_MAP_UNLOCK_CONSISTENT(map);
661 if (vm_map_is_system(map)) {
662 #ifndef UMA_USE_DMAP
663 if (map == kernel_map && (map->flags & MAP_REPLENISH) != 0) {
664 uma_prealloc(kmapentzone, 1);
665 map->flags &= ~MAP_REPLENISH;
666 }
667 #endif
668 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
669 } else {
670 sx_xunlock_(&map->lock, file, line);
671 vm_map_process_deferred();
672 }
673 }
674
675 void
_vm_map_lock_read(vm_map_t map,const char * file,int line)676 _vm_map_lock_read(vm_map_t map, const char *file, int line)
677 {
678
679 if (vm_map_is_system(map))
680 mtx_lock_flags_(&map->system_mtx, 0, file, line);
681 else
682 sx_slock_(&map->lock, file, line);
683 }
684
685 void
_vm_map_unlock_read(vm_map_t map,const char * file,int line)686 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
687 {
688
689 if (vm_map_is_system(map)) {
690 KASSERT((map->flags & MAP_REPLENISH) == 0,
691 ("%s: MAP_REPLENISH leaked", __func__));
692 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
693 } else {
694 sx_sunlock_(&map->lock, file, line);
695 vm_map_process_deferred();
696 }
697 }
698
699 int
_vm_map_trylock(vm_map_t map,const char * file,int line)700 _vm_map_trylock(vm_map_t map, const char *file, int line)
701 {
702 int error;
703
704 error = vm_map_is_system(map) ?
705 !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
706 !sx_try_xlock_(&map->lock, file, line);
707 if (error == 0)
708 map->timestamp++;
709 return (error == 0);
710 }
711
712 int
_vm_map_trylock_read(vm_map_t map,const char * file,int line)713 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
714 {
715 int error;
716
717 error = vm_map_is_system(map) ?
718 !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
719 !sx_try_slock_(&map->lock, file, line);
720 return (error == 0);
721 }
722
723 /*
724 * _vm_map_lock_upgrade: [ internal use only ]
725 *
726 * Tries to upgrade a read (shared) lock on the specified map to a write
727 * (exclusive) lock. Returns the value "0" if the upgrade succeeds and a
728 * non-zero value if the upgrade fails. If the upgrade fails, the map is
729 * returned without a read or write lock held.
730 *
731 * Requires that the map be read locked.
732 */
733 int
_vm_map_lock_upgrade(vm_map_t map,const char * file,int line)734 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
735 {
736 unsigned int last_timestamp;
737
738 if (vm_map_is_system(map)) {
739 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
740 } else {
741 if (!sx_try_upgrade_(&map->lock, file, line)) {
742 last_timestamp = map->timestamp;
743 sx_sunlock_(&map->lock, file, line);
744 vm_map_process_deferred();
745 /*
746 * If the map's timestamp does not change while the
747 * map is unlocked, then the upgrade succeeds.
748 */
749 sx_xlock_(&map->lock, file, line);
750 if (last_timestamp != map->timestamp) {
751 sx_xunlock_(&map->lock, file, line);
752 return (1);
753 }
754 }
755 }
756 map->timestamp++;
757 return (0);
758 }
759
760 void
_vm_map_lock_downgrade(vm_map_t map,const char * file,int line)761 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
762 {
763
764 if (vm_map_is_system(map)) {
765 KASSERT((map->flags & MAP_REPLENISH) == 0,
766 ("%s: MAP_REPLENISH leaked", __func__));
767 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
768 } else {
769 VM_MAP_UNLOCK_CONSISTENT(map);
770 sx_downgrade_(&map->lock, file, line);
771 }
772 }
773
774 /*
775 * vm_map_locked:
776 *
777 * Returns a non-zero value if the caller holds a write (exclusive) lock
778 * on the specified map and the value "0" otherwise.
779 */
780 int
vm_map_locked(vm_map_t map)781 vm_map_locked(vm_map_t map)
782 {
783
784 if (vm_map_is_system(map))
785 return (mtx_owned(&map->system_mtx));
786 return (sx_xlocked(&map->lock));
787 }
788
789 /*
790 * _vm_map_unlock_and_wait:
791 *
792 * Atomically releases the lock on the specified map and puts the calling
793 * thread to sleep. The calling thread will remain asleep until either
794 * vm_map_wakeup() is performed on the map or the specified timeout is
795 * exceeded.
796 *
797 * WARNING! This function does not perform deferred deallocations of
798 * objects and map entries. Therefore, the calling thread is expected to
799 * reacquire the map lock after reawakening and later perform an ordinary
800 * unlock operation, such as vm_map_unlock(), before completing its
801 * operation on the map.
802 */
803 int
_vm_map_unlock_and_wait(vm_map_t map,int timo,const char * file,int line)804 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
805 {
806
807 VM_MAP_UNLOCK_CONSISTENT(map);
808 mtx_lock(&map_sleep_mtx);
809 if (vm_map_is_system(map)) {
810 KASSERT((map->flags & MAP_REPLENISH) == 0,
811 ("%s: MAP_REPLENISH leaked", __func__));
812 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
813 } else {
814 sx_xunlock_(&map->lock, file, line);
815 }
816 return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
817 timo));
818 }
819
/*
 *	vm_map_wakeup:
 *
 *	Awaken any threads that have slept on the map using
 *	vm_map_unlock_and_wait().  The wakeup channel is the address of
 *	the map's root pointer, matching the channel slept on there.
 */
void
vm_map_wakeup(vm_map_t map)
{

	/*
	 * Acquire and release map_sleep_mtx to prevent a wakeup()
	 * from being performed (and lost) between the map unlock
	 * and the msleep() in _vm_map_unlock_and_wait().
	 */
	mtx_lock(&map_sleep_mtx);
	mtx_unlock(&map_sleep_mtx);
	wakeup(&map->root);
}
839
/*
 * Mark the map busy by incrementing its busy count.  The map must be
 * locked by the caller.  Each call is undone by vm_map_unbusy().
 */
void
vm_map_busy(vm_map_t map)
{

	VM_MAP_ASSERT_LOCKED(map);
	map->busy++;
}
847
848 void
vm_map_unbusy(vm_map_t map)849 vm_map_unbusy(vm_map_t map)
850 {
851
852 VM_MAP_ASSERT_LOCKED(map);
853 KASSERT(map->busy, ("vm_map_unbusy: not busy"));
854 if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
855 vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
856 wakeup(&map->busy);
857 }
858 }
859
860 void
vm_map_wait_busy(vm_map_t map)861 vm_map_wait_busy(vm_map_t map)
862 {
863
864 VM_MAP_ASSERT_LOCKED(map);
865 while (map->busy) {
866 vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
867 if (vm_map_is_system(map))
868 msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
869 else
870 sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
871 }
872 map->timestamp++;
873 }
874
875 long
vmspace_resident_count(struct vmspace * vmspace)876 vmspace_resident_count(struct vmspace *vmspace)
877 {
878 return pmap_resident_count(vmspace_pmap(vmspace));
879 }
880
881 /*
882 * Initialize an existing vm_map structure
883 * such as that in the vmspace structure.
884 */
885 static void
_vm_map_init(vm_map_t map,pmap_t pmap,vm_offset_t min,vm_offset_t max)886 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
887 {
888
889 map->header.eflags = MAP_ENTRY_HEADER;
890 map->pmap = pmap;
891 map->header.end = min;
892 map->header.start = max;
893 map->flags = 0;
894 map->header.left = map->header.right = &map->header;
895 map->root = NULL;
896 map->timestamp = 0;
897 map->busy = 0;
898 map->anon_loc = 0;
899 #ifdef DIAGNOSTIC
900 map->nupdates = 0;
901 #endif
902 }
903
/*
 * Initialize a non-system map: set up the map fields via _vm_map_init()
 * and initialize its sx lock.
 */
void
vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
{
	_vm_map_init(map, pmap, min, max);
	sx_init(&map->lock, "vm map (user)");
}
910
/*
 * Initialize a system map: set up the map fields via _vm_map_init(), flag
 * the map MAP_SYSTEM_MAP and initialize its mutex (instead of the sx lock
 * used by non-system maps).
 */
void
vm_map_init_system(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
{
	_vm_map_init(map, pmap, min, max);
	vm_map_modflags(map, MAP_SYSTEM_MAP, 0);
	mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF |
	    MTX_DUPOK);
}
919
920 /*
921 * vm_map_entry_dispose: [ internal use only ]
922 *
923 * Inverse of vm_map_entry_create.
924 */
925 static void
vm_map_entry_dispose(vm_map_t map,vm_map_entry_t entry)926 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
927 {
928 uma_zfree(vm_map_is_system(map) ? kmapentzone : mapentzone, entry);
929 }
930
931 /*
932 * vm_map_entry_create: [ internal use only ]
933 *
934 * Allocates a VM map entry for insertion.
935 * No entry fields are filled in.
936 */
937 static vm_map_entry_t
vm_map_entry_create(vm_map_t map)938 vm_map_entry_create(vm_map_t map)
939 {
940 vm_map_entry_t new_entry;
941
942 #ifndef UMA_USE_DMAP
943 if (map == kernel_map) {
944 VM_MAP_ASSERT_LOCKED(map);
945
946 /*
947 * A new slab of kernel map entries cannot be allocated at this
948 * point because the kernel map has not yet been updated to
949 * reflect the caller's request. Therefore, we allocate a new
950 * map entry, dipping into the reserve if necessary, and set a
951 * flag indicating that the reserve must be replenished before
952 * the map is unlocked.
953 */
954 new_entry = uma_zalloc(kmapentzone, M_NOWAIT | M_NOVM);
955 if (new_entry == NULL) {
956 new_entry = uma_zalloc(kmapentzone,
957 M_NOWAIT | M_NOVM | M_USE_RESERVE);
958 kernel_map->flags |= MAP_REPLENISH;
959 }
960 } else
961 #endif
962 if (vm_map_is_system(map)) {
963 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
964 } else {
965 new_entry = uma_zalloc(mapentzone, M_WAITOK);
966 }
967 KASSERT(new_entry != NULL,
968 ("vm_map_entry_create: kernel resources exhausted"));
969 return (new_entry);
970 }
971
972 /*
973 * vm_map_entry_set_behavior:
974 *
975 * Set the expected access behavior, either normal, random, or
976 * sequential.
977 */
978 static inline void
vm_map_entry_set_behavior(vm_map_entry_t entry,u_char behavior)979 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
980 {
981 entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
982 (behavior & MAP_ENTRY_BEHAV_MASK);
983 }
984
985 /*
986 * vm_map_entry_max_free_{left,right}:
987 *
988 * Compute the size of the largest free gap between two entries,
989 * one the root of a tree and the other the ancestor of that root
990 * that is the least or greatest ancestor found on the search path.
991 */
992 static inline vm_size_t
vm_map_entry_max_free_left(vm_map_entry_t root,vm_map_entry_t left_ancestor)993 vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
994 {
995
996 return (root->left != left_ancestor ?
997 root->left->max_free : root->start - left_ancestor->end);
998 }
999
1000 static inline vm_size_t
vm_map_entry_max_free_right(vm_map_entry_t root,vm_map_entry_t right_ancestor)1001 vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
1002 {
1003
1004 return (root->right != right_ancestor ?
1005 root->right->max_free : right_ancestor->start - root->end);
1006 }
1007
1008 /*
1009 * vm_map_entry_{pred,succ}:
1010 *
1011 * Find the {predecessor, successor} of the entry by taking one step
1012 * in the appropriate direction and backtracking as much as necessary.
1013 * vm_map_entry_succ is defined in vm_map.h.
1014 */
1015 static inline vm_map_entry_t
vm_map_entry_pred(vm_map_entry_t entry)1016 vm_map_entry_pred(vm_map_entry_t entry)
1017 {
1018 vm_map_entry_t prior;
1019
1020 prior = entry->left;
1021 if (prior->right->start < entry->start) {
1022 do
1023 prior = prior->right;
1024 while (prior->right != entry);
1025 }
1026 return (prior);
1027 }
1028
1029 static inline vm_size_t
vm_size_max(vm_size_t a,vm_size_t b)1030 vm_size_max(vm_size_t a, vm_size_t b)
1031 {
1032
1033 return (a > b ? a : b);
1034 }
1035
/*
 * SPLAY_LEFT_STEP:
 *
 *	One step of a downward splay pass in which the search moves to the
 *	left of "root".  Let y be root's left pointer.  If y is not "llist"
 *	and "test" is true, y is first rotated above root and becomes the
 *	new root.  The (possibly new) root is then pushed onto "rlist",
 *	linked through its left pointer, with its max_free field overwritten
 *	by the max_free value of its right side.  On exit "root" is the next
 *	node to examine, or NULL when the left pointer led to "llist".
 *
 *	"root", "y" and "rlist" are assigned and so must be modifiable
 *	lvalues; "llist" is only read.  "test" may refer to "y" and is
 *	evaluated only when y != llist.  Arguments are expanded more than
 *	once, so they must be free of side effects.
 */
#define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do { \
	vm_map_entry_t z; \
	vm_size_t max_free; \
 \
	/* \
	 * Infer root->right->max_free == root->max_free when \
	 * y->max_free < root->max_free || root->max_free == 0. \
	 * Otherwise, look right to find it. \
	 */ \
	y = root->left; \
	max_free = root->max_free; \
	KASSERT(max_free == vm_size_max( \
	    vm_map_entry_max_free_left(root, llist), \
	    vm_map_entry_max_free_right(root, rlist)), \
	    ("%s: max_free invariant fails", __func__)); \
	/* \
	 * The "- 1" makes the test fail for max_free == 0, assuming \
	 * vm_size_t is unsigned so that the subtraction wraps. \
	 */ \
	if (max_free - 1 < vm_map_entry_max_free_left(root, llist)) \
		max_free = vm_map_entry_max_free_right(root, rlist); \
	if (y != llist && (test)) { \
		/* Rotate right and make y root. */ \
		z = y->right; \
		if (z != root) { \
			/* y has a right subtree; it becomes root's left. */ \
			root->left = z; \
			y->right = root; \
			if (max_free < y->max_free) \
				root->max_free = max_free = \
				    vm_size_max(max_free, z->max_free); \
		} else if (max_free < y->max_free) \
			root->max_free = max_free = \
			    vm_size_max(max_free, root->start - y->end); \
		root = y; \
		y = root->left; \
	} \
	/* Copy right->max_free.  Put root on rlist. */ \
	root->max_free = max_free; \
	KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \
	    ("%s: max_free not copied from right", __func__)); \
	root->left = rlist; \
	rlist = root; \
	root = y != llist ? y : NULL; \
} while (0)
1076
/*
 * SPLAY_RIGHT_STEP:
 *
 *	One step of a downward splay pass in which the search moves to the
 *	right of "root".  Let y be root's right pointer.  If y is not "rlist"
 *	and "test" is true, y is first rotated above root and becomes the
 *	new root.  The (possibly new) root is then pushed onto "llist",
 *	linked through its right pointer, with its max_free field overwritten
 *	by the max_free value of its left side.  On exit "root" is the next
 *	node to examine, or NULL when the right pointer led to "rlist".
 *
 *	"root", "y" and "llist" are assigned and so must be modifiable
 *	lvalues; "rlist" is only read.  "test" may refer to "y" and is
 *	evaluated only when y != rlist.  Arguments are expanded more than
 *	once, so they must be free of side effects.
 */
#define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do { \
	vm_map_entry_t z; \
	vm_size_t max_free; \
 \
	/* \
	 * Infer root->left->max_free == root->max_free when \
	 * y->max_free < root->max_free || root->max_free == 0. \
	 * Otherwise, look left to find it. \
	 */ \
	y = root->right; \
	max_free = root->max_free; \
	KASSERT(max_free == vm_size_max( \
	    vm_map_entry_max_free_left(root, llist), \
	    vm_map_entry_max_free_right(root, rlist)), \
	    ("%s: max_free invariant fails", __func__)); \
	/* \
	 * The "- 1" makes the test fail for max_free == 0, assuming \
	 * vm_size_t is unsigned so that the subtraction wraps. \
	 */ \
	if (max_free - 1 < vm_map_entry_max_free_right(root, rlist)) \
		max_free = vm_map_entry_max_free_left(root, llist); \
	if (y != rlist && (test)) { \
		/* Rotate left and make y root. */ \
		z = y->left; \
		if (z != root) { \
			/* y has a left subtree; it becomes root's right. */ \
			root->right = z; \
			y->left = root; \
			if (max_free < y->max_free) \
				root->max_free = max_free = \
				    vm_size_max(max_free, z->max_free); \
		} else if (max_free < y->max_free) \
			root->max_free = max_free = \
			    vm_size_max(max_free, y->start - root->end); \
		root = y; \
		y = root->right; \
	} \
	/* Copy left->max_free.  Put root on llist. */ \
	root->max_free = max_free; \
	KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \
	    ("%s: max_free not copied from left", __func__)); \
	root->right = llist; \
	llist = root; \
	root = y != rlist ? y : NULL; \
} while (0)
1117
1118 /*
1119 * Walk down the tree until we find addr or a gap where addr would go, breaking
1120 * off left and right subtrees of nodes less than, or greater than addr. Treat
1121 * subtrees with root->max_free < length as empty trees. llist and rlist are
1122 * the two sides in reverse order (bottom-up), with llist linked by the right
1123 * pointer and rlist linked by the left pointer in the vm_map_entry, and both
1124 * lists terminated by &map->header. This function, and the subsequent call to
1125 * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
1126 * values in &map->header.
1127 */
1128 static __always_inline vm_map_entry_t
vm_map_splay_split(vm_map_t map,vm_offset_t addr,vm_size_t length,vm_map_entry_t * llist,vm_map_entry_t * rlist)1129 vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
1130 vm_map_entry_t *llist, vm_map_entry_t *rlist)
1131 {
1132 vm_map_entry_t left, right, root, y;
1133
1134 left = right = &map->header;
1135 root = map->root;
1136 while (root != NULL && root->max_free >= length) {
1137 KASSERT(left->end <= root->start &&
1138 root->end <= right->start,
1139 ("%s: root not within tree bounds", __func__));
1140 if (addr < root->start) {
1141 SPLAY_LEFT_STEP(root, y, left, right,
1142 y->max_free >= length && addr < y->start);
1143 } else if (addr >= root->end) {
1144 SPLAY_RIGHT_STEP(root, y, left, right,
1145 y->max_free >= length && addr >= y->end);
1146 } else
1147 break;
1148 }
1149 *llist = left;
1150 *rlist = right;
1151 return (root);
1152 }
1153
1154 static __always_inline void
vm_map_splay_findnext(vm_map_entry_t root,vm_map_entry_t * rlist)1155 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
1156 {
1157 vm_map_entry_t hi, right, y;
1158
1159 right = *rlist;
1160 hi = root->right == right ? NULL : root->right;
1161 if (hi == NULL)
1162 return;
1163 do
1164 SPLAY_LEFT_STEP(hi, y, root, right, true);
1165 while (hi != NULL);
1166 *rlist = right;
1167 }
1168
1169 static __always_inline void
vm_map_splay_findprev(vm_map_entry_t root,vm_map_entry_t * llist)1170 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
1171 {
1172 vm_map_entry_t left, lo, y;
1173
1174 left = *llist;
1175 lo = root->left == left ? NULL : root->left;
1176 if (lo == NULL)
1177 return;
1178 do
1179 SPLAY_RIGHT_STEP(lo, y, left, root, true);
1180 while (lo != NULL);
1181 *llist = left;
1182 }
1183
1184 static inline void
vm_map_entry_swap(vm_map_entry_t * a,vm_map_entry_t * b)1185 vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
1186 {
1187 vm_map_entry_t tmp;
1188
1189 tmp = *b;
1190 *b = *a;
1191 *a = tmp;
1192 }
1193
1194 /*
1195 * Walk back up the two spines, flip the pointers and set max_free. The
1196 * subtrees of the root go at the bottom of llist and rlist.
1197 */
1198 static vm_size_t
vm_map_splay_merge_left_walk(vm_map_entry_t header,vm_map_entry_t root,vm_map_entry_t tail,vm_size_t max_free,vm_map_entry_t llist)1199 vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
1200 vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
1201 {
1202 do {
1203 /*
1204 * The max_free values of the children of llist are in
1205 * llist->max_free and max_free. Update with the
1206 * max value.
1207 */
1208 llist->max_free = max_free =
1209 vm_size_max(llist->max_free, max_free);
1210 vm_map_entry_swap(&llist->right, &tail);
1211 vm_map_entry_swap(&tail, &llist);
1212 } while (llist != header);
1213 root->left = tail;
1214 return (max_free);
1215 }
1216
1217 /*
1218 * When llist is known to be the predecessor of root.
1219 */
1220 static inline vm_size_t
vm_map_splay_merge_pred(vm_map_entry_t header,vm_map_entry_t root,vm_map_entry_t llist)1221 vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
1222 vm_map_entry_t llist)
1223 {
1224 vm_size_t max_free;
1225
1226 max_free = root->start - llist->end;
1227 if (llist != header) {
1228 max_free = vm_map_splay_merge_left_walk(header, root,
1229 root, max_free, llist);
1230 } else {
1231 root->left = header;
1232 header->right = root;
1233 }
1234 return (max_free);
1235 }
1236
1237 /*
1238 * When llist may or may not be the predecessor of root.
1239 */
1240 static inline vm_size_t
vm_map_splay_merge_left(vm_map_entry_t header,vm_map_entry_t root,vm_map_entry_t llist)1241 vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
1242 vm_map_entry_t llist)
1243 {
1244 vm_size_t max_free;
1245
1246 max_free = vm_map_entry_max_free_left(root, llist);
1247 if (llist != header) {
1248 max_free = vm_map_splay_merge_left_walk(header, root,
1249 root->left == llist ? root : root->left,
1250 max_free, llist);
1251 }
1252 return (max_free);
1253 }
1254
1255 static vm_size_t
vm_map_splay_merge_right_walk(vm_map_entry_t header,vm_map_entry_t root,vm_map_entry_t tail,vm_size_t max_free,vm_map_entry_t rlist)1256 vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
1257 vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
1258 {
1259 do {
1260 /*
1261 * The max_free values of the children of rlist are in
1262 * rlist->max_free and max_free. Update with the
1263 * max value.
1264 */
1265 rlist->max_free = max_free =
1266 vm_size_max(rlist->max_free, max_free);
1267 vm_map_entry_swap(&rlist->left, &tail);
1268 vm_map_entry_swap(&tail, &rlist);
1269 } while (rlist != header);
1270 root->right = tail;
1271 return (max_free);
1272 }
1273
1274 /*
1275 * When rlist is known to be the succecessor of root.
1276 */
1277 static inline vm_size_t
vm_map_splay_merge_succ(vm_map_entry_t header,vm_map_entry_t root,vm_map_entry_t rlist)1278 vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
1279 vm_map_entry_t rlist)
1280 {
1281 vm_size_t max_free;
1282
1283 max_free = rlist->start - root->end;
1284 if (rlist != header) {
1285 max_free = vm_map_splay_merge_right_walk(header, root,
1286 root, max_free, rlist);
1287 } else {
1288 root->right = header;
1289 header->left = root;
1290 }
1291 return (max_free);
1292 }
1293
1294 /*
1295 * When rlist may or may not be the succecessor of root.
1296 */
1297 static inline vm_size_t
vm_map_splay_merge_right(vm_map_entry_t header,vm_map_entry_t root,vm_map_entry_t rlist)1298 vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
1299 vm_map_entry_t rlist)
1300 {
1301 vm_size_t max_free;
1302
1303 max_free = vm_map_entry_max_free_right(root, rlist);
1304 if (rlist != header) {
1305 max_free = vm_map_splay_merge_right_walk(header, root,
1306 root->right == rlist ? root : root->right,
1307 max_free, rlist);
1308 }
1309 return (max_free);
1310 }
1311
1312 /*
1313 * vm_map_splay:
1314 *
1315 * The Sleator and Tarjan top-down splay algorithm with the
1316 * following variation. Max_free must be computed bottom-up, so
1317 * on the downward pass, maintain the left and right spines in
1318 * reverse order. Then, make a second pass up each side to fix
1319 * the pointers and compute max_free. The time bound is O(log n)
1320 * amortized.
1321 *
1322 * The tree is threaded, which means that there are no null pointers.
1323 * When a node has no left child, its left pointer points to its
1324 * predecessor, which the last ancestor on the search path from the root
1325 * where the search branched right. Likewise, when a node has no right
1326 * child, its right pointer points to its successor. The map header node
1327 * is the predecessor of the first map entry, and the successor of the
1328 * last.
1329 *
1330 * The new root is the vm_map_entry containing "addr", or else an
1331 * adjacent entry (lower if possible) if addr is not in the tree.
1332 *
1333 * The map must be locked, and leaves it so.
1334 *
1335 * Returns: the new root.
1336 */
1337 static vm_map_entry_t
vm_map_splay(vm_map_t map,vm_offset_t addr)1338 vm_map_splay(vm_map_t map, vm_offset_t addr)
1339 {
1340 vm_map_entry_t header, llist, rlist, root;
1341 vm_size_t max_free_left, max_free_right;
1342
1343 header = &map->header;
1344 root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
1345 if (root != NULL) {
1346 max_free_left = vm_map_splay_merge_left(header, root, llist);
1347 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1348 } else if (llist != header) {
1349 /*
1350 * Recover the greatest node in the left
1351 * subtree and make it the root.
1352 */
1353 root = llist;
1354 llist = root->right;
1355 max_free_left = vm_map_splay_merge_left(header, root, llist);
1356 max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1357 } else if (rlist != header) {
1358 /*
1359 * Recover the least node in the right
1360 * subtree and make it the root.
1361 */
1362 root = rlist;
1363 rlist = root->left;
1364 max_free_left = vm_map_splay_merge_pred(header, root, llist);
1365 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1366 } else {
1367 /* There is no root. */
1368 return (NULL);
1369 }
1370 root->max_free = vm_size_max(max_free_left, max_free_right);
1371 map->root = root;
1372 VM_MAP_ASSERT_CONSISTENT(map);
1373 return (root);
1374 }
1375
/*
 *	vm_map_entry_{un,}link:
 *
 *	Insert/remove entries from maps.  On linking, if new entry clips
 *	existing entry, trim existing entry to avoid overlap, and manage
 *	offsets.  On unlinking, merge disappearing entry with neighbor, if
 *	called for, and manage offsets.  Callers should not modify fields in
 *	entries already mapped.
 *
 *	vm_map_entry_link() asserts that the map is locked, increments
 *	map->nentries, and leaves "entry" as the root of the map's tree.
 */
static void
vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
{
	vm_map_entry_t header, llist, rlist, root;
	vm_size_t max_free_left, max_free_right;

	CTR3(KTR_VM,
	    "vm_map_entry_link: map %p, nentries %d, entry %p", map,
	    map->nentries, entry);
	VM_MAP_ASSERT_LOCKED(map);
	map->nentries++;
	header = &map->header;
	root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
	if (root == NULL) {
		/*
		 * The new entry does not overlap any existing entry in the
		 * map, so it becomes the new root of the map tree.
		 */
		max_free_left = vm_map_splay_merge_pred(header, entry, llist);
		max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
	} else if (entry->start == root->start) {
		/*
		 * The new entry is a clone of root, with only the end field
		 * changed.  The root entry will be shrunk to abut the new
		 * entry, and will be the right child of the new root entry in
		 * the modified map.
		 */
		KASSERT(entry->end < root->end,
		    ("%s: clip_start not within entry", __func__));
		vm_map_splay_findprev(root, &llist);
		/* Advance root's offset by the size now covered by entry. */
		if ((root->eflags & MAP_ENTRY_STACK_GAP) == 0)
			root->offset += entry->end - root->start;
		root->start = entry->end;
		max_free_left = vm_map_splay_merge_pred(header, entry, llist);
		/*
		 * Passing "entry" as both header and list makes
		 * vm_map_splay_merge_pred() simply link root as entry's right
		 * neighbor and return the gap between them.
		 */
		max_free_right = root->max_free = vm_size_max(
		    vm_map_splay_merge_pred(entry, root, entry),
		    vm_map_splay_merge_right(header, root, rlist));
	} else {
		/*
		 * The new entry is a clone of root, with only the start field
		 * changed.  The root entry will be shrunk to abut the new
		 * entry, and will be the left child of the new root entry in
		 * the modified map.
		 */
		/*
		 * NOTE(review): this message repeats the "clip_start" text of
		 * the previous branch although the assertion is about the end
		 * address -- confirm whether that is intended.
		 */
		KASSERT(entry->end == root->end,
		    ("%s: clip_start not within entry", __func__));
		vm_map_splay_findnext(root, &rlist);
		/* Advance entry's offset by the size left to root. */
		if ((entry->eflags & MAP_ENTRY_STACK_GAP) == 0)
			entry->offset += entry->start - root->start;
		root->end = entry->start;
		/*
		 * Passing "entry" as both header and list makes
		 * vm_map_splay_merge_succ() simply link root as entry's left
		 * neighbor and return the gap between them.
		 */
		max_free_left = root->max_free = vm_size_max(
		    vm_map_splay_merge_left(header, root, llist),
		    vm_map_splay_merge_succ(entry, root, entry));
		max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
	}
	entry->max_free = vm_size_max(max_free_left, max_free_right);
	map->root = entry;
	VM_MAP_ASSERT_CONSISTENT(map);
}
1444
/*
 * Selects what vm_map_entry_unlink() does with the address range of the
 * entry being removed.
 */
enum unlink_merge_type {
	UNLINK_MERGE_NONE,	/* Remove the entry; neighbors are unchanged. */
	UNLINK_MERGE_NEXT	/* Successor takes over its start and offset. */
};
1449
1450 static void
vm_map_entry_unlink(vm_map_t map,vm_map_entry_t entry,enum unlink_merge_type op)1451 vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
1452 enum unlink_merge_type op)
1453 {
1454 vm_map_entry_t header, llist, rlist, root;
1455 vm_size_t max_free_left, max_free_right;
1456
1457 VM_MAP_ASSERT_LOCKED(map);
1458 header = &map->header;
1459 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1460 KASSERT(root != NULL,
1461 ("vm_map_entry_unlink: unlink object not mapped"));
1462
1463 vm_map_splay_findprev(root, &llist);
1464 vm_map_splay_findnext(root, &rlist);
1465 if (op == UNLINK_MERGE_NEXT) {
1466 rlist->start = root->start;
1467 MPASS((rlist->eflags & MAP_ENTRY_STACK_GAP) == 0);
1468 rlist->offset = root->offset;
1469 }
1470 if (llist != header) {
1471 root = llist;
1472 llist = root->right;
1473 max_free_left = vm_map_splay_merge_left(header, root, llist);
1474 max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1475 } else if (rlist != header) {
1476 root = rlist;
1477 rlist = root->left;
1478 max_free_left = vm_map_splay_merge_pred(header, root, llist);
1479 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1480 } else {
1481 header->left = header->right = header;
1482 root = NULL;
1483 }
1484 if (root != NULL)
1485 root->max_free = vm_size_max(max_free_left, max_free_right);
1486 map->root = root;
1487 VM_MAP_ASSERT_CONSISTENT(map);
1488 map->nentries--;
1489 CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1490 map->nentries, entry);
1491 }
1492
1493 /*
1494 * vm_map_entry_resize:
1495 *
1496 * Resize a vm_map_entry, recompute the amount of free space that
1497 * follows it and propagate that value up the tree.
1498 *
1499 * The map must be locked, and leaves it so.
1500 */
1501 static void
vm_map_entry_resize(vm_map_t map,vm_map_entry_t entry,vm_size_t grow_amount)1502 vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
1503 {
1504 vm_map_entry_t header, llist, rlist, root;
1505
1506 VM_MAP_ASSERT_LOCKED(map);
1507 header = &map->header;
1508 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1509 KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
1510 vm_map_splay_findnext(root, &rlist);
1511 entry->end += grow_amount;
1512 root->max_free = vm_size_max(
1513 vm_map_splay_merge_left(header, root, llist),
1514 vm_map_splay_merge_succ(header, root, rlist));
1515 map->root = root;
1516 VM_MAP_ASSERT_CONSISTENT(map);
1517 CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
1518 __func__, map, map->nentries, entry);
1519 }
1520
1521 /*
1522 * vm_map_lookup_entry: [ internal use only ]
1523 *
1524 * Finds the map entry containing (or
1525 * immediately preceding) the specified address
1526 * in the given map; the entry is returned
1527 * in the "entry" parameter. The boolean
1528 * result indicates whether the address is
1529 * actually contained in the map.
1530 */
1531 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_offset_t address,vm_map_entry_t * entry)1532 vm_map_lookup_entry(
1533 vm_map_t map,
1534 vm_offset_t address,
1535 vm_map_entry_t *entry) /* OUT */
1536 {
1537 vm_map_entry_t cur, header, lbound, ubound;
1538 boolean_t locked;
1539
1540 /*
1541 * If the map is empty, then the map entry immediately preceding
1542 * "address" is the map's header.
1543 */
1544 header = &map->header;
1545 cur = map->root;
1546 if (cur == NULL) {
1547 *entry = header;
1548 return (FALSE);
1549 }
1550 if (address >= cur->start && cur->end > address) {
1551 *entry = cur;
1552 return (TRUE);
1553 }
1554 if ((locked = vm_map_locked(map)) ||
1555 sx_try_upgrade(&map->lock)) {
1556 /*
1557 * Splay requires a write lock on the map. However, it only
1558 * restructures the binary search tree; it does not otherwise
1559 * change the map. Thus, the map's timestamp need not change
1560 * on a temporary upgrade.
1561 */
1562 cur = vm_map_splay(map, address);
1563 if (!locked) {
1564 VM_MAP_UNLOCK_CONSISTENT(map);
1565 sx_downgrade(&map->lock);
1566 }
1567
1568 /*
1569 * If "address" is contained within a map entry, the new root
1570 * is that map entry. Otherwise, the new root is a map entry
1571 * immediately before or after "address".
1572 */
1573 if (address < cur->start) {
1574 *entry = header;
1575 return (FALSE);
1576 }
1577 *entry = cur;
1578 return (address < cur->end);
1579 }
1580 /*
1581 * Since the map is only locked for read access, perform a
1582 * standard binary search tree lookup for "address".
1583 */
1584 lbound = ubound = header;
1585 for (;;) {
1586 if (address < cur->start) {
1587 ubound = cur;
1588 cur = cur->left;
1589 if (cur == lbound)
1590 break;
1591 } else if (cur->end <= address) {
1592 lbound = cur;
1593 cur = cur->right;
1594 if (cur == ubound)
1595 break;
1596 } else {
1597 *entry = cur;
1598 return (TRUE);
1599 }
1600 }
1601 *entry = lbound;
1602 return (FALSE);
1603 }
1604
/*
 * vm_map_insert1() is identical to vm_map_insert() except that it
 * returns the newly inserted map entry in '*res'.  In case the new
 * entry is coalesced with a neighbor or an existing entry was
 * resized, that entry is returned.  In any case, the returned entry
 * covers the specified address range.
 *
 * '*res' is written only when KERN_SUCCESS is returned.  The failure
 * returns are:
 *	KERN_INVALID_ADDRESS	empty or invalid [start, end) range
 *	KERN_PROTECTION_FAILURE	write+execute requested on a MAP_WXORX map
 *	KERN_NO_SPACE		the range overlaps an existing entry
 *	KERN_INVALID_ARGUMENT	bad guard request or bad split boundary
 *	KERN_RESOURCE_SHORTAGE	swap_reserve() refused the charge
 */
static int
vm_map_insert1(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow,
    vm_map_entry_t *res)
{
	vm_map_entry_t new_entry, next_entry, prev_entry;
	struct ucred *cred;
	vm_eflags_t protoeflags;
	vm_inherit_t inheritance;
	u_long bdry;
	u_int bidx;

	VM_MAP_ASSERT_LOCKED(map);
	KASSERT(object != kernel_object ||
	    (cow & MAP_COPY_ON_WRITE) == 0,
	    ("vm_map_insert: kernel object and COW"));
	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
	    (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
	    ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
	    object, cow));
	KASSERT((prot & ~max) == 0,
	    ("prot %#x is not subset of max_prot %#x", prot, max));

	/*
	 * Check that the start and end points are not bogus.
	 */
	if (start == end || !vm_map_range_valid(map, start, end))
		return (KERN_INVALID_ADDRESS);

	/* On a MAP_WXORX map, refuse mappings both writable and executable. */
	if ((map->flags & MAP_WXORX) != 0 && (prot & (VM_PROT_WRITE |
	    VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE))
		return (KERN_PROTECTION_FAILURE);

	/*
	 * Find the entry prior to the proposed starting address; if it's part
	 * of an existing entry, this range is bogus.
	 */
	if (vm_map_lookup_entry(map, start, &prev_entry))
		return (KERN_NO_SPACE);

	/*
	 * Assert that the next entry doesn't overlap the end point.
	 */
	next_entry = vm_map_entry_succ(prev_entry);
	if (next_entry->start < end)
		return (KERN_NO_SPACE);

	/* A guard entry must have no object and no permitted access. */
	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
	    max != VM_PROT_NONE))
		return (KERN_INVALID_ARGUMENT);

	/* Translate the caller's "cow" flags into map entry flags. */
	protoeflags = 0;
	if (cow & MAP_COPY_ON_WRITE)
		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
	if (cow & MAP_NOFAULT)
		protoeflags |= MAP_ENTRY_NOFAULT;
	if (cow & MAP_DISABLE_SYNCER)
		protoeflags |= MAP_ENTRY_NOSYNC;
	if (cow & MAP_DISABLE_COREDUMP)
		protoeflags |= MAP_ENTRY_NOCOREDUMP;
	if (cow & MAP_STACK_AREA)
		protoeflags |= MAP_ENTRY_GROWS_DOWN;
	if (cow & MAP_WRITECOUNT)
		protoeflags |= MAP_ENTRY_WRITECNT;
	if (cow & MAP_VN_EXEC)
		protoeflags |= MAP_ENTRY_VN_EXEC;
	if ((cow & MAP_CREATE_GUARD) != 0)
		protoeflags |= MAP_ENTRY_GUARD;
	if ((cow & MAP_CREATE_STACK_GAP) != 0)
		protoeflags |= MAP_ENTRY_STACK_GAP;
	if (cow & MAP_INHERIT_SHARE)
		inheritance = VM_INHERIT_SHARE;
	else
		inheritance = VM_INHERIT_DEFAULT;
	if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
		/* This magically ignores index 0, for usual page size. */
		bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
		    MAP_SPLIT_BOUNDARY_SHIFT;
		if (bidx >= MAXPAGESIZES)
			return (KERN_INVALID_ARGUMENT);
		/* Both ends must have no bits set below the boundary size. */
		bdry = pagesizes[bidx] - 1;
		if ((start & bdry) != 0 || (end & bdry) != 0)
			return (KERN_INVALID_ARGUMENT);
		protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
	}

	/*
	 * Decide whether the range is charged to the current thread's
	 * credential.  No charge is made for MAP_ACC_NO_CHARGE, MAP_NOFAULT
	 * or guard requests.  Otherwise swap is reserved unless the caller
	 * states, via MAP_ACC_CHARGED, that the charge was already made.
	 */
	cred = NULL;
	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
		goto charged;
	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
			return (KERN_RESOURCE_SHORTAGE);
		KASSERT(object == NULL ||
		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
		    object->cred == NULL,
		    ("overcommit: vm_map_insert o %p", object));
		cred = curthread->td_ucred;
	}

charged:
	/* Expand the kernel pmap, if necessary. */
	if (map == kernel_map && end > kernel_vm_end)
		pmap_growkernel(end);
	if (object != NULL) {
		/*
		 * OBJ_ONEMAPPING must be cleared unless this mapping
		 * is trivially proven to be the only mapping for any
		 * of the object's pages.  (Object granularity
		 * reference counting is insufficient to recognize
		 * aliases with precision.)
		 */
		if ((object->flags & OBJ_ANON) != 0) {
			VM_OBJECT_WLOCK(object);
			if (object->ref_count > 1 || object->shadow_count != 0)
				vm_object_clear_flag(object, OBJ_ONEMAPPING);
			VM_OBJECT_WUNLOCK(object);
		}
	} else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
	    protoeflags &&
	    (cow & (MAP_STACK_AREA | MAP_VN_EXEC)) == 0 &&
	    prev_entry->end == start && (prev_entry->cred == cred ||
	    (prev_entry->object.vm_object != NULL &&
	    prev_entry->object.vm_object->cred == cred)) &&
	    vm_object_coalesce(prev_entry->object.vm_object,
	    prev_entry->offset,
	    (vm_size_t)(prev_entry->end - prev_entry->start),
	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
		/*
		 * We were able to extend the object.  Determine if we
		 * can extend the previous map entry to include the
		 * new range as well.
		 */
		if (prev_entry->inheritance == inheritance &&
		    prev_entry->protection == prot &&
		    prev_entry->max_protection == max &&
		    prev_entry->wired_count == 0) {
			KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
			    0, ("prev_entry %p has incoherent wiring",
			    prev_entry));
			/* Guard entries do not count toward the map size. */
			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
				map->size += end - prev_entry->end;
			vm_map_entry_resize(map, prev_entry,
			    end - prev_entry->end);
			*res = vm_map_try_merge_entries(map, prev_entry,
			    next_entry);
			return (KERN_SUCCESS);
		}

		/*
		 * If we can extend the object but cannot extend the
		 * map entry, we have to create a new map entry.  We
		 * must bump the ref count on the extended object to
		 * account for it.  object may be NULL.
		 */
		object = prev_entry->object.vm_object;
		offset = prev_entry->offset +
		    (prev_entry->end - prev_entry->start);
		vm_object_reference(object);
		if (cred != NULL && object != NULL && object->cred != NULL &&
		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
			/* Object already accounts for this uid. */
			cred = NULL;
		}
	}
	/* The new entry keeps its own reference on the credential. */
	if (cred != NULL)
		crhold(cred);

	/*
	 * Create a new entry
	 */
	new_entry = vm_map_entry_create(map);
	new_entry->start = start;
	new_entry->end = end;
	new_entry->cred = NULL;

	new_entry->eflags = protoeflags;
	new_entry->object.vm_object = object;
	new_entry->offset = offset;

	new_entry->inheritance = inheritance;
	new_entry->protection = prot;
	new_entry->max_protection = max;
	new_entry->wired_count = 0;
	new_entry->wiring_thread = NULL;
	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
	new_entry->next_read = start;

	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
	new_entry->cred = cred;

	/*
	 * Insert the new entry into the list
	 */
	vm_map_entry_link(map, new_entry);
	/* Guard entries do not count toward the map size. */
	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
		map->size += new_entry->end - new_entry->start;

	/*
	 * Try to coalesce the new entry with both the previous and next
	 * entries in the list.  Previously, we only attempted to coalesce
	 * with the previous entry when object is NULL.  Here, we handle the
	 * other cases, which are less common.
	 */
	vm_map_try_merge_entries(map, prev_entry, new_entry);
	*res = vm_map_try_merge_entries(map, new_entry, next_entry);

	/* Enter mappings into the pmap up front when asked to prefault. */
	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
		    end - start, cow & MAP_PREFAULT_PARTIAL);
	}

	return (KERN_SUCCESS);
}
1828
1829 /*
1830 * vm_map_insert:
1831 *
1832 * Inserts the given VM object into the target map at the
1833 * specified address range.
1834 *
1835 * Requires that the map be locked, and leaves it so.
1836 *
1837 * If object is non-NULL, ref count must be bumped by caller
1838 * prior to making call to account for the new entry.
1839 */
1840 int
vm_map_insert(vm_map_t map,vm_object_t object,vm_ooffset_t offset,vm_offset_t start,vm_offset_t end,vm_prot_t prot,vm_prot_t max,int cow)1841 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1842 vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1843 {
1844 vm_map_entry_t res;
1845
1846 return (vm_map_insert1(map, object, offset, start, end, prot, max,
1847 cow, &res));
1848 }
1849
/*
 *	vm_map_findspace:
 *
 *	Find the first fit (lowest VM address) for "length" free bytes
 *	beginning at address >= start in the given map.
 *
 *	In a vm_map_entry, "max_free" is the maximum amount of
 *	contiguous free space between an entry in its subtree and a
 *	neighbor of that entry.  This allows finding a free region in
 *	one path down the tree, so O(log n) amortized with splay
 *	trees.
 *
 *	The map must be locked, and leaves it so.  The tree may be
 *	restructured, and map->root changed, as a side effect.
 *
 *	Returns: starting address if sufficient space,
 *		 vm_map_max(map)-length+1 if insufficient space.
 */
vm_offset_t
vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
{
	vm_map_entry_t header, llist, rlist, root, y;
	vm_size_t left_length, max_free_left, max_free_right;
	vm_offset_t gap_end;

	VM_MAP_ASSERT_LOCKED(map);

	/*
	 * Request must fit within min/max VM address and must avoid
	 * address wrap.
	 */
	start = MAX(start, vm_map_min(map));
	if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
		return (vm_map_max(map) - length + 1);

	/* Empty tree means wide open address space. */
	if (map->root == NULL)
		return (start);

	/*
	 * After splay_split, if start is within an entry, push it to the start
	 * of the following gap.  If rlist is at the end of the gap containing
	 * start, save the end of that gap in gap_end to see if the gap is big
	 * enough; otherwise set gap_end to start to skip gap-checking and move
	 * directly to a search of the right subtree.
	 */
	header = &map->header;
	root = vm_map_splay_split(map, start, length, &llist, &rlist);
	gap_end = rlist->start;
	if (root != NULL) {
		start = root->end;
		if (root->right != rlist)
			gap_end = start;
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else if (rlist != header) {
		/* Make the head of the right list the root. */
		root = rlist;
		rlist = root->left;
		max_free_left = vm_map_splay_merge_pred(header, root, llist);
		max_free_right = vm_map_splay_merge_right(header, root, rlist);
	} else {
		/* Nothing on the right; the head of the left list is root. */
		root = llist;
		llist = root->right;
		max_free_left = vm_map_splay_merge_left(header, root, llist);
		max_free_right = vm_map_splay_merge_succ(header, root, rlist);
	}
	root->max_free = vm_size_max(max_free_left, max_free_right);
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	/* The gap at start, if one was recorded, may already be big enough. */
	if (length <= gap_end - start)
		return (start);

	/* With max_free, can immediately tell if no solution. */
	if (root->right == header || length > root->right->max_free)
		return (vm_map_max(map) - length + 1);

	/*
	 * Splay for the least large-enough gap in the right subtree.
	 */
	llist = rlist = header;
	for (left_length = 0;;
	    left_length = vm_map_entry_max_free_left(root, llist)) {
		/* Go left while the left side still holds a big-enough gap. */
		if (length <= left_length)
			SPLAY_LEFT_STEP(root, y, llist, rlist,
			    length <= vm_map_entry_max_free_left(y, llist));
		else
			SPLAY_RIGHT_STEP(root, y, llist, rlist,
			    length > vm_map_entry_max_free_left(y, root));
		if (root == NULL)
			break;
	}
	/* The head of the left list becomes the root; the gap follows it. */
	root = llist;
	llist = root->right;
	max_free_left = vm_map_splay_merge_left(header, root, llist);
	if (rlist == header) {
		root->max_free = vm_size_max(max_free_left,
		    vm_map_splay_merge_succ(header, root, rlist));
	} else {
		/*
		 * Attach the head of the right list, y, directly to the right
		 * of root, with the rest of the right list beneath y.
		 */
		y = rlist;
		rlist = y->left;
		y->max_free = vm_size_max(
		    vm_map_splay_merge_pred(root, y, root),
		    vm_map_splay_merge_right(header, y, rlist));
		root->max_free = vm_size_max(max_free_left, y->max_free);
	}
	map->root = root;
	VM_MAP_ASSERT_CONSISTENT(map);
	return (root->end);
}
1958
1959 int
vm_map_fixed(vm_map_t map,vm_object_t object,vm_ooffset_t offset,vm_offset_t start,vm_size_t length,vm_prot_t prot,vm_prot_t max,int cow)1960 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1961 vm_offset_t start, vm_size_t length, vm_prot_t prot,
1962 vm_prot_t max, int cow)
1963 {
1964 vm_offset_t end;
1965 int result;
1966
1967 end = start + length;
1968 KASSERT((cow & MAP_STACK_AREA) == 0 || object == NULL,
1969 ("vm_map_fixed: non-NULL backing object for stack"));
1970 vm_map_lock(map);
1971 VM_MAP_RANGE_CHECK(map, start, end);
1972 if ((cow & MAP_CHECK_EXCL) == 0) {
1973 result = vm_map_delete(map, start, end);
1974 if (result != KERN_SUCCESS)
1975 goto out;
1976 }
1977 if ((cow & MAP_STACK_AREA) != 0) {
1978 result = vm_map_stack_locked(map, start, length, sgrowsiz,
1979 prot, max, cow);
1980 } else {
1981 result = vm_map_insert(map, object, offset, start, end,
1982 prot, max, cow);
1983 }
1984 out:
1985 vm_map_unlock(map);
1986 return (result);
1987 }
1988
/*
 * ASLR randomization gaps, indexed by page size index and expressed in
 * pages of size pagesizes[index].  vm_map_find_locked() asks
 * vm_map_findspace() for the mapping length plus this gap and then moves
 * the mapping up by a random number of such pages that is smaller than the
 * gap.  The _64 table is selected when vm_map_max(map) exceeds
 * MAP_32BIT_MAX_ADDR and the caller's max_addr is zero or also exceeds it;
 * the _32 table is used otherwise.  The tables have one slot per value of
 * the index that vm_map_find_locked() can produce (0 .. VM_NRESERVLEVEL).
 */
#if VM_NRESERVLEVEL <= 1
static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
#elif VM_NRESERVLEVEL == 2
static const int aslr_pages_rnd_64[3] = {0x1000, 0x1000, 0x10};
static const int aslr_pages_rnd_32[3] = {0x100, 0x100, 0x4};
#else
#error "Unsupported VM_NRESERVLEVEL"
#endif
1998
/*
 * vm.cluster_anon: run-time tunable policy for clustering anonymous
 * mappings, consulted by clustering_anon_allowed().  Defaults to 1.
 */
static int cluster_anon = 1;
SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
    &cluster_anon, 0,
    "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
2003
2004 static bool
clustering_anon_allowed(vm_offset_t addr,int cow)2005 clustering_anon_allowed(vm_offset_t addr, int cow)
2006 {
2007
2008 switch (cluster_anon) {
2009 case 0:
2010 return (false);
2011 case 1:
2012 return (addr == 0 || (cow & MAP_NO_HINT) != 0);
2013 case 2:
2014 default:
2015 return (true);
2016 }
2017 }
2018
/*
 * vm.aslr_restarts: read-only counter, bumped atomically by
 * vm_map_find_locked() each time it starts its second placement attempt.
 */
static long aslr_restarts;
SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
    &aslr_restarts, 0,
    "Number of aslr failures");
2023
2024 /*
2025 * Searches for the specified amount of free space in the given map with the
2026 * specified alignment. Performs an address-ordered, first-fit search from
2027 * the given address "*addr", with an optional upper bound "max_addr". If the
2028 * parameter "alignment" is zero, then the alignment is computed from the
2029 * given (object, offset) pair so as to enable the greatest possible use of
2030 * superpage mappings. Returns KERN_SUCCESS and the address of the free space
2031 * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE.
2032 *
2033 * The map must be locked. Initially, there must be at least "length" bytes
2034 * of free space at the given address.
2035 */
2036 static int
vm_map_alignspace(vm_map_t map,vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t length,vm_offset_t max_addr,vm_offset_t alignment)2037 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2038 vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
2039 vm_offset_t alignment)
2040 {
2041 vm_offset_t aligned_addr, free_addr;
2042
2043 VM_MAP_ASSERT_LOCKED(map);
2044 free_addr = *addr;
2045 KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
2046 ("caller failed to provide space %#jx at address %p",
2047 (uintmax_t)length, (void *)free_addr));
2048 for (;;) {
2049 /*
2050 * At the start of every iteration, the free space at address
2051 * "*addr" is at least "length" bytes.
2052 */
2053 if (alignment == 0)
2054 pmap_align_superpage(object, offset, addr, length);
2055 else
2056 *addr = roundup2(*addr, alignment);
2057 aligned_addr = *addr;
2058 if (aligned_addr == free_addr) {
2059 /*
2060 * Alignment did not change "*addr", so "*addr" must
2061 * still provide sufficient free space.
2062 */
2063 return (KERN_SUCCESS);
2064 }
2065
2066 /*
2067 * Test for address wrap on "*addr". A wrapped "*addr" could
2068 * be a valid address, in which case vm_map_findspace() cannot
2069 * be relied upon to fail.
2070 */
2071 if (aligned_addr < free_addr)
2072 return (KERN_NO_SPACE);
2073 *addr = vm_map_findspace(map, aligned_addr, length);
2074 if (*addr + length > vm_map_max(map) ||
2075 (max_addr != 0 && *addr + length > max_addr))
2076 return (KERN_NO_SPACE);
2077 free_addr = *addr;
2078 if (free_addr == aligned_addr) {
2079 /*
2080 * If a successful call to vm_map_findspace() did not
2081 * change "*addr", then "*addr" must still be aligned
2082 * and provide sufficient free space.
2083 */
2084 return (KERN_SUCCESS);
2085 }
2086 }
2087 }
2088
2089 int
vm_map_find_aligned(vm_map_t map,vm_offset_t * addr,vm_size_t length,vm_offset_t max_addr,vm_offset_t alignment)2090 vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
2091 vm_offset_t max_addr, vm_offset_t alignment)
2092 {
2093 /* XXXKIB ASLR eh ? */
2094 *addr = vm_map_findspace(map, *addr, length);
2095 if (*addr + length > vm_map_max(map) ||
2096 (max_addr != 0 && *addr + length > max_addr))
2097 return (KERN_NO_SPACE);
2098 return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
2099 alignment));
2100 }
2101
2102 /*
2103 * vm_map_find finds an unallocated region in the target address
2104 * map with the given length. The search is defined to be
2105 * first-fit from the specified address; the region found is
2106 * returned in the same parameter.
2107 *
2108 * If object is non-NULL, ref count must be bumped by caller
2109 * prior to making call to account for the new entry.
2110 */
2111 int
vm_map_find(vm_map_t map,vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t length,vm_offset_t max_addr,int find_space,vm_prot_t prot,vm_prot_t max,int cow)2112 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2113 vm_offset_t *addr, /* IN/OUT */
2114 vm_size_t length, vm_offset_t max_addr, int find_space,
2115 vm_prot_t prot, vm_prot_t max, int cow)
2116 {
2117 int rv;
2118
2119 vm_map_lock(map);
2120 rv = vm_map_find_locked(map, object, offset, addr, length, max_addr,
2121 find_space, prot, max, cow);
2122 vm_map_unlock(map);
2123 return (rv);
2124 }
2125
/*
 * vm_map_find_locked does the work of vm_map_find() for callers that
 * already hold the map lock.
 *
 * "*addr" supplies the starting (or, for VMFS_NO_SPACE, exact) address and
 * receives the address chosen.  "find_space" selects the placement policy:
 *
 *   VMFS_NO_SPACE	use "*addr" as given; with MAP_REMAP in "cow" the
 *			range is validated and deleted first;
 *   VMFS_ANY_SPACE	take the range found by vm_map_findspace() as is;
 *   anything else	additionally run the range through
 *			vm_map_alignspace().  A value with bits set above the
 *			low byte encodes log2 of an explicit alignment in
 *			those bits; VMFS_OPTIMAL_SPACE degrades to
 *			VMFS_ANY_SPACE when there is no colored object or when
 *			alignment fails.
 *
 * Returns KERN_NO_SPACE or the vm_map_alignspace() status when no range can
 * be found, KERN_INVALID_ADDRESS for an invalid MAP_REMAP range, and
 * otherwise the status of vm_map_delete(), vm_map_stack_locked() or
 * vm_map_insert().
 */
int
vm_map_find_locked(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, /* IN/OUT */
    vm_size_t length, vm_offset_t max_addr, int find_space,
    vm_prot_t prot, vm_prot_t max, int cow)
{
	vm_offset_t alignment, curr_min_addr, min_addr;
	int gap, pidx, rv, try;
	bool cluster, en_aslr, update_anon;

	KASSERT((cow & MAP_STACK_AREA) == 0 || object == NULL,
	    ("non-NULL backing object for stack"));
	MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
	    (cow & MAP_STACK_AREA) == 0));
	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
	    (object->flags & OBJ_COLORED) == 0))
		find_space = VMFS_ANY_SPACE;
	/* Bits above the low byte, if any, hold log2 of the alignment. */
	if (find_space >> 8 != 0) {
		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
		alignment = (vm_offset_t)1 << (find_space >> 8);
	} else
		alignment = 0;
	en_aslr = (map->flags & MAP_ASLR) != 0;
	/*
	 * "update_anon" remembers whether this request is eligible for
	 * anonymous clustering at all; "cluster" may be cleared later when a
	 * clustering attempt is abandoned.
	 */
	update_anon = cluster = clustering_anon_allowed(*addr, cow) &&
	    (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
	    find_space != VMFS_NO_SPACE && object == NULL &&
	    (cow & (MAP_INHERIT_SHARE | MAP_STACK_AREA)) == 0 &&
	    prot != PROT_NONE;
	curr_min_addr = min_addr = *addr;
	if (en_aslr && min_addr == 0 && !cluster &&
	    find_space != VMFS_NO_SPACE &&
	    (map->flags & MAP_ASLR_IGNSTART) != 0)
		curr_min_addr = min_addr = vm_map_min(map);
	try = 0;
	if (cluster) {
		/* Start from the recorded clustering address, if any. */
		curr_min_addr = map->anon_loc;
		if (curr_min_addr == 0)
			cluster = false;
	}
	if (find_space != VMFS_NO_SPACE) {
		KASSERT(find_space == VMFS_ANY_SPACE ||
		    find_space == VMFS_OPTIMAL_SPACE ||
		    find_space == VMFS_SUPER_SPACE ||
		    alignment != 0, ("unexpected VMFS flag"));
again:
		/*
		 * When creating an anonymous mapping, try clustering
		 * with an existing anonymous mapping first.
		 *
		 * We make up to two attempts to find address space
		 * for a given find_space value. The first attempt may
		 * apply randomization or may cluster with an existing
		 * anonymous mapping. If this first attempt fails,
		 * perform a first-fit search of the available address
		 * space.
		 *
		 * If all tries failed, and find_space is
		 * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE.
		 * Again enable clustering and randomization.
		 */
		try++;
		MPASS(try <= 2);

		if (try == 2) {
			/*
			 * Second try: we failed either to find a
			 * suitable region for randomizing the
			 * allocation, or to cluster with an existing
			 * mapping. Retry with free run.
			 */
			curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
			    vm_map_min(map) : min_addr;
			atomic_add_long(&aslr_restarts, 1);
		}

		if (try == 1 && en_aslr && !cluster) {
			/*
			 * Find space for allocation, including
			 * gap needed for later randomization.
			 */
			pidx = 0;
#if VM_NRESERVLEVEL > 0
			if ((find_space == VMFS_SUPER_SPACE ||
			    find_space == VMFS_OPTIMAL_SPACE) &&
			    pagesizes[VM_NRESERVLEVEL] != 0) {
				/*
				 * Do not pointlessly increase the space that
				 * is requested from vm_map_findspace().
				 * pmap_align_superpage() will only change a
				 * mapping's alignment if that mapping is at
				 * least a superpage in size.
				 */
				pidx = VM_NRESERVLEVEL;
				while (pidx > 0 && length < pagesizes[pidx])
					pidx--;
			}
#endif
			/* Pick the 64-bit or 32-bit randomization gap. */
			gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
			    (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
			    aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
			*addr = vm_map_findspace(map, curr_min_addr,
			    length + gap * pagesizes[pidx]);
			if (*addr + length + gap * pagesizes[pidx] >
			    vm_map_max(map))
				goto again;
			/* And randomize the start address. */
			*addr += (arc4random() % gap) * pagesizes[pidx];
			if (max_addr != 0 && *addr + length > max_addr)
				goto again;
		} else {
			*addr = vm_map_findspace(map, curr_min_addr, length);
			if (*addr + length > vm_map_max(map) ||
			    (max_addr != 0 && *addr + length > max_addr)) {
				/*
				 * A failed clustering attempt gets one
				 * retry without clustering.
				 */
				if (cluster) {
					cluster = false;
					MPASS(try == 1);
					goto again;
				}
				return (KERN_NO_SPACE);
			}
		}

		if (find_space != VMFS_ANY_SPACE &&
		    (rv = vm_map_alignspace(map, object, offset, addr, length,
		    max_addr, alignment)) != KERN_SUCCESS) {
			/*
			 * Alignment failed.  For VMFS_OPTIMAL_SPACE, restart
			 * from scratch without the alignment requirement.
			 */
			if (find_space == VMFS_OPTIMAL_SPACE) {
				find_space = VMFS_ANY_SPACE;
				curr_min_addr = min_addr;
				cluster = update_anon;
				try = 0;
				goto again;
			}
			return (rv);
		}
	} else if ((cow & MAP_REMAP) != 0) {
		/* Fixed address with MAP_REMAP: replace what is there. */
		if (!vm_map_range_valid(map, *addr, *addr + length))
			return (KERN_INVALID_ADDRESS);
		rv = vm_map_delete(map, *addr, *addr + length);
		if (rv != KERN_SUCCESS)
			return (rv);
	}
	if ((cow & MAP_STACK_AREA) != 0) {
		rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
		    max, cow);
	} else {
		rv = vm_map_insert(map, object, offset, *addr, *addr + length,
		    prot, max, cow);
	}

	/*
	 * Update the starting address for clustered anonymous memory mappings
	 * if a starting address was not previously defined or an ASLR restart
	 * placed an anonymous memory mapping at a lower address.
	 */
	if (update_anon && rv == KERN_SUCCESS && (map->anon_loc == 0 ||
	    *addr < map->anon_loc))
		map->anon_loc = *addr;
	return (rv);
}
2285
2286 /*
2287 * vm_map_find_min() is a variant of vm_map_find() that takes an
2288 * additional parameter ("default_addr") and treats the given address
2289 * ("*addr") differently. Specifically, it treats "*addr" as a hint
2290 * and not as the minimum address where the mapping is created.
2291 *
2292 * This function works in two phases. First, it tries to
2293 * allocate above the hint. If that fails and the hint is
2294 * greater than "default_addr", it performs a second pass, replacing
2295 * the hint with "default_addr" as the minimum address for the
2296 * allocation.
2297 */
2298 int
vm_map_find_min(vm_map_t map,vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t length,vm_offset_t default_addr,vm_offset_t max_addr,int find_space,vm_prot_t prot,vm_prot_t max,int cow)2299 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2300 vm_offset_t *addr, vm_size_t length, vm_offset_t default_addr,
2301 vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
2302 int cow)
2303 {
2304 vm_offset_t hint;
2305 int rv;
2306
2307 hint = *addr;
2308 if (hint == 0) {
2309 cow |= MAP_NO_HINT;
2310 *addr = hint = default_addr;
2311 }
2312 for (;;) {
2313 rv = vm_map_find(map, object, offset, addr, length, max_addr,
2314 find_space, prot, max, cow);
2315 if (rv == KERN_SUCCESS || default_addr >= hint)
2316 return (rv);
2317 *addr = hint = default_addr;
2318 }
2319 }
2320
/*
 * A map entry with any of the following flags set must not be merged with
 * another entry.  vm_map_try_merge_entries() tests this mask before
 * attempting a merge, and vm_map_mergeable_neighbors() asserts that at
 * least one of the two entries it is given is free of these flags.
 */
#define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | \
	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC | \
	    MAP_ENTRY_STACK_GAP)
2328
2329 static bool
vm_map_mergeable_neighbors(vm_map_entry_t prev,vm_map_entry_t entry)2330 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
2331 {
2332
2333 KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
2334 (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
2335 ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
2336 prev, entry));
2337 return (prev->end == entry->start &&
2338 prev->object.vm_object == entry->object.vm_object &&
2339 (prev->object.vm_object == NULL ||
2340 prev->offset + (prev->end - prev->start) == entry->offset) &&
2341 prev->eflags == entry->eflags &&
2342 prev->protection == entry->protection &&
2343 prev->max_protection == entry->max_protection &&
2344 prev->inheritance == entry->inheritance &&
2345 prev->wired_count == entry->wired_count &&
2346 prev->cred == entry->cred);
2347 }
2348
2349 static void
vm_map_merged_neighbor_dispose(vm_map_t map,vm_map_entry_t entry)2350 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
2351 {
2352
2353 /*
2354 * If the backing object is a vnode object, vm_object_deallocate()
2355 * calls vrele(). However, vrele() does not lock the vnode because
2356 * the vnode has additional references. Thus, the map lock can be
2357 * kept without causing a lock-order reversal with the vnode lock.
2358 *
2359 * Since we count the number of virtual page mappings in
2360 * object->un_pager.vnp.writemappings, the writemappings value
2361 * should not be adjusted when the entry is disposed of.
2362 */
2363 if (entry->object.vm_object != NULL)
2364 vm_object_deallocate(entry->object.vm_object);
2365 if (entry->cred != NULL)
2366 crfree(entry->cred);
2367 vm_map_entry_dispose(map, entry);
2368 }
2369
2370 /*
2371 * vm_map_try_merge_entries:
2372 *
2373 * Compare two map entries that represent consecutive ranges. If
2374 * the entries can be merged, expand the range of the second to
2375 * cover the range of the first and delete the first. Then return
2376 * the map entry that includes the first range.
2377 *
2378 * The map must be locked.
2379 */
2380 vm_map_entry_t
vm_map_try_merge_entries(vm_map_t map,vm_map_entry_t prev_entry,vm_map_entry_t entry)2381 vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
2382 vm_map_entry_t entry)
2383 {
2384
2385 VM_MAP_ASSERT_LOCKED(map);
2386 if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
2387 vm_map_mergeable_neighbors(prev_entry, entry)) {
2388 vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
2389 vm_map_merged_neighbor_dispose(map, prev_entry);
2390 return (entry);
2391 }
2392 return (prev_entry);
2393 }
2394
2395 /*
2396 * vm_map_entry_back:
2397 *
2398 * Allocate an object to back a map entry.
2399 */
2400 static inline void
vm_map_entry_back(vm_map_entry_t entry)2401 vm_map_entry_back(vm_map_entry_t entry)
2402 {
2403 vm_object_t object;
2404
2405 KASSERT(entry->object.vm_object == NULL,
2406 ("map entry %p has backing object", entry));
2407 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2408 ("map entry %p is a submap", entry));
2409 object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
2410 entry->cred, entry->end - entry->start);
2411 entry->object.vm_object = object;
2412 entry->offset = 0;
2413 entry->cred = NULL;
2414 }
2415
2416 /*
2417 * vm_map_entry_charge_object
2418 *
2419 * If there is no object backing this entry, create one. Otherwise, if
2420 * the entry has cred, give it to the backing object.
2421 */
2422 static inline void
vm_map_entry_charge_object(vm_map_t map,vm_map_entry_t entry)2423 vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
2424 {
2425
2426 VM_MAP_ASSERT_LOCKED(map);
2427 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2428 ("map entry %p is a submap", entry));
2429 if (entry->object.vm_object == NULL && !vm_map_is_system(map) &&
2430 (entry->eflags & MAP_ENTRY_GUARD) == 0)
2431 vm_map_entry_back(entry);
2432 else if (entry->object.vm_object != NULL &&
2433 ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2434 entry->cred != NULL) {
2435 VM_OBJECT_WLOCK(entry->object.vm_object);
2436 KASSERT(entry->object.vm_object->cred == NULL,
2437 ("OVERCOMMIT: %s: both cred e %p", __func__, entry));
2438 entry->object.vm_object->cred = entry->cred;
2439 entry->object.vm_object->charge = entry->end - entry->start;
2440 VM_OBJECT_WUNLOCK(entry->object.vm_object);
2441 entry->cred = NULL;
2442 }
2443 }
2444
2445 /*
2446 * vm_map_entry_clone
2447 *
2448 * Create a duplicate map entry for clipping.
2449 */
2450 static vm_map_entry_t
vm_map_entry_clone(vm_map_t map,vm_map_entry_t entry)2451 vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry)
2452 {
2453 vm_map_entry_t new_entry;
2454
2455 VM_MAP_ASSERT_LOCKED(map);
2456
2457 /*
2458 * Create a backing object now, if none exists, so that more individual
2459 * objects won't be created after the map entry is split.
2460 */
2461 vm_map_entry_charge_object(map, entry);
2462
2463 /* Clone the entry. */
2464 new_entry = vm_map_entry_create(map);
2465 *new_entry = *entry;
2466 if (new_entry->cred != NULL)
2467 crhold(entry->cred);
2468 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2469 vm_object_reference(new_entry->object.vm_object);
2470 vm_map_entry_set_vnode_text(new_entry, true);
2471 /*
2472 * The object->un_pager.vnp.writemappings for the object of
2473 * MAP_ENTRY_WRITECNT type entry shall be kept as is here. The
2474 * virtual pages are re-distributed among the clipped entries,
2475 * so the sum is left the same.
2476 */
2477 }
2478 return (new_entry);
2479 }
2480
2481 /*
2482 * vm_map_clip_start: [ internal use only ]
2483 *
2484 * Asserts that the given entry begins at or after
2485 * the specified address; if necessary,
2486 * it splits the entry into two.
2487 */
2488 static int
vm_map_clip_start(vm_map_t map,vm_map_entry_t entry,vm_offset_t startaddr)2489 vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
2490 {
2491 vm_map_entry_t new_entry;
2492 int bdry_idx;
2493
2494 if (!vm_map_is_system(map))
2495 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2496 "%s: map %p entry %p start 0x%jx", __func__, map, entry,
2497 (uintmax_t)startaddr);
2498
2499 if (startaddr <= entry->start)
2500 return (KERN_SUCCESS);
2501
2502 VM_MAP_ASSERT_LOCKED(map);
2503 KASSERT(entry->end > startaddr && entry->start < startaddr,
2504 ("%s: invalid clip of entry %p", __func__, entry));
2505
2506 bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
2507 if (bdry_idx != 0) {
2508 if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
2509 return (KERN_INVALID_ARGUMENT);
2510 }
2511
2512 new_entry = vm_map_entry_clone(map, entry);
2513
2514 /*
2515 * Split off the front portion. Insert the new entry BEFORE this one,
2516 * so that this entry has the specified starting address.
2517 */
2518 new_entry->end = startaddr;
2519 vm_map_entry_link(map, new_entry);
2520 return (KERN_SUCCESS);
2521 }
2522
2523 /*
2524 * vm_map_lookup_clip_start:
2525 *
2526 * Find the entry at or just after 'start', and clip it if 'start' is in
2527 * the interior of the entry. Return entry after 'start', and in
2528 * prev_entry set the entry before 'start'.
2529 */
2530 static int
vm_map_lookup_clip_start(vm_map_t map,vm_offset_t start,vm_map_entry_t * res_entry,vm_map_entry_t * prev_entry)2531 vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
2532 vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
2533 {
2534 vm_map_entry_t entry;
2535 int rv;
2536
2537 if (!vm_map_is_system(map))
2538 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2539 "%s: map %p start 0x%jx prev %p", __func__, map,
2540 (uintmax_t)start, prev_entry);
2541
2542 if (vm_map_lookup_entry(map, start, prev_entry)) {
2543 entry = *prev_entry;
2544 rv = vm_map_clip_start(map, entry, start);
2545 if (rv != KERN_SUCCESS)
2546 return (rv);
2547 *prev_entry = vm_map_entry_pred(entry);
2548 } else
2549 entry = vm_map_entry_succ(*prev_entry);
2550 *res_entry = entry;
2551 return (KERN_SUCCESS);
2552 }
2553
2554 /*
2555 * vm_map_clip_end: [ internal use only ]
2556 *
2557 * Asserts that the given entry ends at or before
2558 * the specified address; if necessary,
2559 * it splits the entry into two.
2560 */
2561 static int
vm_map_clip_end(vm_map_t map,vm_map_entry_t entry,vm_offset_t endaddr)2562 vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
2563 {
2564 vm_map_entry_t new_entry;
2565 int bdry_idx;
2566
2567 if (!vm_map_is_system(map))
2568 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2569 "%s: map %p entry %p end 0x%jx", __func__, map, entry,
2570 (uintmax_t)endaddr);
2571
2572 if (endaddr >= entry->end)
2573 return (KERN_SUCCESS);
2574
2575 VM_MAP_ASSERT_LOCKED(map);
2576 KASSERT(entry->start < endaddr && entry->end > endaddr,
2577 ("%s: invalid clip of entry %p", __func__, entry));
2578
2579 bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
2580 if (bdry_idx != 0) {
2581 if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
2582 return (KERN_INVALID_ARGUMENT);
2583 }
2584
2585 new_entry = vm_map_entry_clone(map, entry);
2586
2587 /*
2588 * Split off the back portion. Insert the new entry AFTER this one,
2589 * so that this entry has the specified ending address.
2590 */
2591 new_entry->start = endaddr;
2592 vm_map_entry_link(map, new_entry);
2593
2594 return (KERN_SUCCESS);
2595 }
2596
2597 /*
2598 * vm_map_submap: [ kernel use only ]
2599 *
2600 * Mark the given range as handled by a subordinate map.
2601 *
2602 * This range must have been created with vm_map_find,
2603 * and no other operations may have been performed on this
2604 * range prior to calling vm_map_submap.
2605 *
2606 * Only a limited number of operations can be performed
2607 * within this rage after calling vm_map_submap:
2608 * vm_fault
2609 * [Don't try vm_map_copy!]
2610 *
2611 * To remove a submapping, one must first remove the
2612 * range from the superior map, and then destroy the
2613 * submap (if desired). [Better yet, don't try it.]
2614 */
2615 int
vm_map_submap(vm_map_t map,vm_offset_t start,vm_offset_t end,vm_map_t submap)2616 vm_map_submap(
2617 vm_map_t map,
2618 vm_offset_t start,
2619 vm_offset_t end,
2620 vm_map_t submap)
2621 {
2622 vm_map_entry_t entry;
2623 int result;
2624
2625 result = KERN_INVALID_ARGUMENT;
2626
2627 vm_map_lock(submap);
2628 submap->flags |= MAP_IS_SUB_MAP;
2629 vm_map_unlock(submap);
2630
2631 vm_map_lock(map);
2632 VM_MAP_RANGE_CHECK(map, start, end);
2633 if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
2634 (entry->eflags & MAP_ENTRY_COW) == 0 &&
2635 entry->object.vm_object == NULL) {
2636 result = vm_map_clip_start(map, entry, start);
2637 if (result != KERN_SUCCESS)
2638 goto unlock;
2639 result = vm_map_clip_end(map, entry, end);
2640 if (result != KERN_SUCCESS)
2641 goto unlock;
2642 entry->object.sub_map = submap;
2643 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2644 result = KERN_SUCCESS;
2645 }
2646 unlock:
2647 vm_map_unlock(map);
2648
2649 if (result != KERN_SUCCESS) {
2650 vm_map_lock(submap);
2651 submap->flags &= ~MAP_IS_SUB_MAP;
2652 vm_map_unlock(submap);
2653 }
2654 return (result);
2655 }
2656
/*
 * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified.
 * vm_map_pmap_enter() uses this as its initial threshold and raises the
 * threshold whenever it skips ahead over a superpage.
 */
#define MAX_INIT_PT 96
2661
/*
 * vm_map_pmap_enter:
 *
 * Preload the specified map's pmap with mappings to the specified
 * object's memory-resident pages. No further physical pages are
 * allocated, and no further virtual pages are retrieved from secondary
 * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a
 * limited number of page mappings are created at the low-end of the
 * specified address range. (For this purpose, a superpage mapping
 * counts as one page mapping.) Otherwise, all resident pages within
 * the specified address range are mapped.
 *
 * "addr" is the virtual address that corresponds to page index "pindex"
 * of "object", and "size" is the length of the range in bytes.
 */
static void
vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
{
	struct pctrie_iter pages;
	vm_offset_t start;
	vm_page_t p, p_start;
	vm_pindex_t jump, mask, psize, threshold, tmpidx;
	int psind;

	/*
	 * Nothing to do unless the mapping allows read or execute access
	 * and there is an object.
	 */
	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
		return;
	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
		/*
		 * Device and SG objects are handed to pmap_object_init_pt()
		 * under the object write lock.  The type is rechecked once
		 * the lock is held; if it no longer matches, downgrade the
		 * lock and continue with the generic path below.
		 */
		VM_OBJECT_WLOCK(object);
		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
			pmap_object_init_pt(map->pmap, addr, object, pindex,
			    size);
			VM_OBJECT_WUNLOCK(object);
			return;
		}
		VM_OBJECT_LOCK_DOWNGRADE(object);
	} else
		VM_OBJECT_RLOCK(object);

	/* Clamp the page count so the range does not pass the object's end. */
	psize = atop(size);
	if (psize + pindex > object->size) {
		if (pindex >= object->size) {
			VM_OBJECT_RUNLOCK(object);
			return;
		}
		psize = object->size - pindex;
	}

	/*
	 * "p_start" and "start" describe the current run of fully valid
	 * pages: its first page and the virtual address of that page.
	 * "p_start" is NULL when no run is open.
	 */
	start = 0;
	p_start = NULL;
	threshold = MAX_INIT_PT;

	vm_page_iter_limit_init(&pages, object, pindex + psize);
	for (p = vm_radix_iter_lookup_ge(&pages, pindex); p != NULL;
	    p = vm_radix_iter_jump(&pages, jump)) {
		/*
		 * don't allow an madvise to blow away our really
		 * free pages allocating pv entries.
		 */
		tmpidx = p->pindex - pindex;
		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
		    vm_page_count_severe()) ||
		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
		    tmpidx >= threshold)) {
			/* Stop here; the final flush below ends at this page. */
			psize = tmpidx;
			break;
		}
		jump = 1;
		if (vm_page_all_valid(p)) {
			/* Open a new run if none is in progress. */
			if (p_start == NULL) {
				start = addr + ptoa(tmpidx);
				p_start = p;
			}
			/* Jump ahead if a superpage mapping is possible. */
			for (psind = p->psind; psind > 0; psind--) {
				if (((addr + ptoa(tmpidx)) &
				    (pagesizes[psind] - 1)) == 0) {
					mask = atop(pagesizes[psind]) - 1;
					if (tmpidx + mask < psize &&
					    vm_page_ps_test(p, psind,
					    PS_ALL_VALID, NULL)) {
						/*
						 * Skip the rest of the
						 * superpage and raise the
						 * threshold by the same
						 * amount, so it counts as a
						 * single page mapping.
						 */
						jump += mask;
						threshold += mask;
						break;
					}
				}
			}
		} else if (p_start != NULL) {
			/* A page that is not fully valid ends the run. */
			pmap_enter_object(map->pmap, start, addr +
			    ptoa(tmpidx), p_start, prot);
			p_start = NULL;
		}
	}
	/* Flush a run that was still open when the loop ended. */
	if (p_start != NULL)
		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
		    p_start, prot);
	VM_OBJECT_RUNLOCK(object);
}
2757
2758 static void
vm_map_protect_guard(vm_map_entry_t entry,vm_prot_t new_prot,vm_prot_t new_maxprot,int flags)2759 vm_map_protect_guard(vm_map_entry_t entry, vm_prot_t new_prot,
2760 vm_prot_t new_maxprot, int flags)
2761 {
2762 vm_prot_t old_prot;
2763
2764 MPASS((entry->eflags & MAP_ENTRY_GUARD) != 0);
2765 if ((entry->eflags & MAP_ENTRY_STACK_GAP) == 0)
2766 return;
2767
2768 old_prot = PROT_EXTRACT(entry->offset);
2769 if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
2770 entry->offset = PROT_MAX(new_maxprot) |
2771 (new_maxprot & old_prot);
2772 }
2773 if ((flags & VM_MAP_PROTECT_SET_PROT) != 0) {
2774 entry->offset = new_prot | PROT_MAX(
2775 PROT_MAX_EXTRACT(entry->offset));
2776 }
2777 }
2778
2779 /*
2780 * vm_map_protect:
2781 *
2782 * Sets the protection and/or the maximum protection of the
2783 * specified address region in the target map.
2784 */
2785 int
vm_map_protect(vm_map_t map,vm_offset_t start,vm_offset_t end,vm_prot_t new_prot,vm_prot_t new_maxprot,int flags)2786 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2787 vm_prot_t new_prot, vm_prot_t new_maxprot, int flags)
2788 {
2789 vm_map_entry_t entry, first_entry, in_tran, prev_entry;
2790 vm_object_t obj;
2791 struct ucred *cred;
2792 vm_offset_t orig_start;
2793 vm_prot_t check_prot, max_prot, old_prot;
2794 int rv;
2795
2796 if (start == end)
2797 return (KERN_SUCCESS);
2798
2799 if (CONTAINS_BITS(flags, VM_MAP_PROTECT_SET_PROT |
2800 VM_MAP_PROTECT_SET_MAXPROT) &&
2801 !CONTAINS_BITS(new_maxprot, new_prot))
2802 return (KERN_OUT_OF_BOUNDS);
2803
2804 orig_start = start;
2805 again:
2806 in_tran = NULL;
2807 start = orig_start;
2808 vm_map_lock(map);
2809
2810 if ((map->flags & MAP_WXORX) != 0 &&
2811 (flags & VM_MAP_PROTECT_SET_PROT) != 0 &&
2812 CONTAINS_BITS(new_prot, VM_PROT_WRITE | VM_PROT_EXECUTE)) {
2813 vm_map_unlock(map);
2814 return (KERN_PROTECTION_FAILURE);
2815 }
2816
2817 /*
2818 * Ensure that we are not concurrently wiring pages. vm_map_wire() may
2819 * need to fault pages into the map and will drop the map lock while
2820 * doing so, and the VM object may end up in an inconsistent state if we
2821 * update the protection on the map entry in between faults.
2822 */
2823 vm_map_wait_busy(map);
2824
2825 VM_MAP_RANGE_CHECK(map, start, end);
2826
2827 if (!vm_map_lookup_entry(map, start, &first_entry))
2828 first_entry = vm_map_entry_succ(first_entry);
2829
2830 if ((flags & VM_MAP_PROTECT_GROWSDOWN) != 0 &&
2831 (first_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0) {
2832 /*
2833 * Handle Linux's PROT_GROWSDOWN flag.
2834 * It means that protection is applied down to the
2835 * whole stack, including the specified range of the
2836 * mapped region, and the grow down region (AKA
2837 * guard).
2838 */
2839 while (!CONTAINS_BITS(first_entry->eflags,
2840 MAP_ENTRY_GUARD | MAP_ENTRY_STACK_GAP) &&
2841 first_entry != vm_map_entry_first(map))
2842 first_entry = vm_map_entry_pred(first_entry);
2843 start = first_entry->start;
2844 }
2845
2846 /*
2847 * Make a first pass to check for protection violations.
2848 */
2849 check_prot = 0;
2850 if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
2851 check_prot |= new_prot;
2852 if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0)
2853 check_prot |= new_maxprot;
2854 for (entry = first_entry; entry->start < end;
2855 entry = vm_map_entry_succ(entry)) {
2856 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
2857 vm_map_unlock(map);
2858 return (KERN_INVALID_ARGUMENT);
2859 }
2860 if ((entry->eflags & (MAP_ENTRY_GUARD |
2861 MAP_ENTRY_STACK_GAP)) == MAP_ENTRY_GUARD)
2862 continue;
2863 max_prot = (entry->eflags & MAP_ENTRY_STACK_GAP) != 0 ?
2864 PROT_MAX_EXTRACT(entry->offset) : entry->max_protection;
2865 if (!CONTAINS_BITS(max_prot, check_prot)) {
2866 vm_map_unlock(map);
2867 return (KERN_PROTECTION_FAILURE);
2868 }
2869 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
2870 in_tran = entry;
2871 }
2872
2873 /*
2874 * Postpone the operation until all in-transition map entries have
2875 * stabilized. An in-transition entry might already have its pages
2876 * wired and wired_count incremented, but not yet have its
2877 * MAP_ENTRY_USER_WIRED flag set. In which case, we would fail to call
2878 * vm_fault_copy_entry() in the final loop below.
2879 */
2880 if (in_tran != NULL) {
2881 in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2882 vm_map_unlock_and_wait(map, 0);
2883 goto again;
2884 }
2885
2886 /*
2887 * Before changing the protections, try to reserve swap space for any
2888 * private (i.e., copy-on-write) mappings that are transitioning from
2889 * read-only to read/write access. If a reservation fails, break out
2890 * of this loop early and let the next loop simplify the entries, since
2891 * some may now be mergeable.
2892 */
2893 rv = vm_map_clip_start(map, first_entry, start);
2894 if (rv != KERN_SUCCESS) {
2895 vm_map_unlock(map);
2896 return (rv);
2897 }
2898 for (entry = first_entry; entry->start < end;
2899 entry = vm_map_entry_succ(entry)) {
2900 rv = vm_map_clip_end(map, entry, end);
2901 if (rv != KERN_SUCCESS) {
2902 vm_map_unlock(map);
2903 return (rv);
2904 }
2905
2906 if ((flags & VM_MAP_PROTECT_SET_PROT) == 0 ||
2907 ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
2908 ENTRY_CHARGED(entry) ||
2909 (entry->eflags & MAP_ENTRY_GUARD) != 0)
2910 continue;
2911
2912 cred = curthread->td_ucred;
2913 obj = entry->object.vm_object;
2914
2915 if (obj == NULL ||
2916 (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
2917 if (!swap_reserve(entry->end - entry->start)) {
2918 rv = KERN_RESOURCE_SHORTAGE;
2919 end = entry->end;
2920 break;
2921 }
2922 crhold(cred);
2923 entry->cred = cred;
2924 continue;
2925 }
2926
2927 VM_OBJECT_WLOCK(obj);
2928 if ((obj->flags & OBJ_SWAP) == 0) {
2929 VM_OBJECT_WUNLOCK(obj);
2930 continue;
2931 }
2932
2933 /*
2934 * Charge for the whole object allocation now, since
2935 * we cannot distinguish between non-charged and
2936 * charged clipped mapping of the same object later.
2937 */
2938 KASSERT(obj->charge == 0,
2939 ("vm_map_protect: object %p overcharged (entry %p)",
2940 obj, entry));
2941 if (!swap_reserve(ptoa(obj->size))) {
2942 VM_OBJECT_WUNLOCK(obj);
2943 rv = KERN_RESOURCE_SHORTAGE;
2944 end = entry->end;
2945 break;
2946 }
2947
2948 crhold(cred);
2949 obj->cred = cred;
2950 obj->charge = ptoa(obj->size);
2951 VM_OBJECT_WUNLOCK(obj);
2952 }
2953
2954 /*
2955 * If enough swap space was available, go back and fix up protections.
2956 * Otherwise, just simplify entries, since some may have been modified.
2957 * [Note that clipping is not necessary the second time.]
2958 */
2959 for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
2960 entry->start < end;
2961 vm_map_try_merge_entries(map, prev_entry, entry),
2962 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2963 if (rv != KERN_SUCCESS)
2964 continue;
2965
2966 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
2967 vm_map_protect_guard(entry, new_prot, new_maxprot,
2968 flags);
2969 continue;
2970 }
2971
2972 old_prot = entry->protection;
2973
2974 if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
2975 entry->max_protection = new_maxprot;
2976 entry->protection = new_maxprot & old_prot;
2977 }
2978 if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
2979 entry->protection = new_prot;
2980
2981 /*
2982 * For user wired map entries, the normal lazy evaluation of
2983 * write access upgrades through soft page faults is
2984 * undesirable. Instead, immediately copy any pages that are
2985 * copy-on-write and enable write access in the physical map.
2986 */
2987 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2988 (entry->protection & VM_PROT_WRITE) != 0 &&
2989 (old_prot & VM_PROT_WRITE) == 0)
2990 vm_fault_copy_entry(map, map, entry, entry, NULL);
2991
2992 /*
2993 * When restricting access, update the physical map. Worry
2994 * about copy-on-write here.
2995 */
2996 if ((old_prot & ~entry->protection) != 0) {
2997 #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2998 VM_PROT_ALL)
2999 pmap_protect(map->pmap, entry->start,
3000 entry->end,
3001 entry->protection & MASK(entry));
3002 #undef MASK
3003 }
3004 }
3005 vm_map_try_merge_entries(map, prev_entry, entry);
3006 vm_map_unlock(map);
3007 return (rv);
3008 }
3009
3010 /*
3011 * vm_map_madvise:
3012 *
3013 * This routine traverses a processes map handling the madvise
3014 * system call. Advisories are classified as either those effecting
3015 * the vm_map_entry structure, or those effecting the underlying
3016 * objects.
3017 */
3018 int
vm_map_madvise(vm_map_t map,vm_offset_t start,vm_offset_t end,int behav)3019 vm_map_madvise(
3020 vm_map_t map,
3021 vm_offset_t start,
3022 vm_offset_t end,
3023 int behav)
3024 {
3025 vm_map_entry_t entry, prev_entry;
3026 int rv;
3027 bool modify_map;
3028
3029 /*
3030 * Some madvise calls directly modify the vm_map_entry, in which case
3031 * we need to use an exclusive lock on the map and we need to perform
3032 * various clipping operations. Otherwise we only need a read-lock
3033 * on the map.
3034 */
3035 switch(behav) {
3036 case MADV_NORMAL:
3037 case MADV_SEQUENTIAL:
3038 case MADV_RANDOM:
3039 case MADV_NOSYNC:
3040 case MADV_AUTOSYNC:
3041 case MADV_NOCORE:
3042 case MADV_CORE:
3043 if (start == end)
3044 return (0);
3045 modify_map = true;
3046 vm_map_lock(map);
3047 break;
3048 case MADV_WILLNEED:
3049 case MADV_DONTNEED:
3050 case MADV_FREE:
3051 if (start == end)
3052 return (0);
3053 modify_map = false;
3054 vm_map_lock_read(map);
3055 break;
3056 default:
3057 return (EINVAL);
3058 }
3059
3060 /*
3061 * Locate starting entry and clip if necessary.
3062 */
3063 VM_MAP_RANGE_CHECK(map, start, end);
3064
3065 if (modify_map) {
3066 /*
3067 * madvise behaviors that are implemented in the vm_map_entry.
3068 *
3069 * We clip the vm_map_entry so that behavioral changes are
3070 * limited to the specified address range.
3071 */
3072 rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
3073 if (rv != KERN_SUCCESS) {
3074 vm_map_unlock(map);
3075 return (vm_mmap_to_errno(rv));
3076 }
3077
3078 for (; entry->start < end; prev_entry = entry,
3079 entry = vm_map_entry_succ(entry)) {
3080 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3081 continue;
3082
3083 rv = vm_map_clip_end(map, entry, end);
3084 if (rv != KERN_SUCCESS) {
3085 vm_map_unlock(map);
3086 return (vm_mmap_to_errno(rv));
3087 }
3088
3089 switch (behav) {
3090 case MADV_NORMAL:
3091 vm_map_entry_set_behavior(entry,
3092 MAP_ENTRY_BEHAV_NORMAL);
3093 break;
3094 case MADV_SEQUENTIAL:
3095 vm_map_entry_set_behavior(entry,
3096 MAP_ENTRY_BEHAV_SEQUENTIAL);
3097 break;
3098 case MADV_RANDOM:
3099 vm_map_entry_set_behavior(entry,
3100 MAP_ENTRY_BEHAV_RANDOM);
3101 break;
3102 case MADV_NOSYNC:
3103 entry->eflags |= MAP_ENTRY_NOSYNC;
3104 break;
3105 case MADV_AUTOSYNC:
3106 entry->eflags &= ~MAP_ENTRY_NOSYNC;
3107 break;
3108 case MADV_NOCORE:
3109 entry->eflags |= MAP_ENTRY_NOCOREDUMP;
3110 break;
3111 case MADV_CORE:
3112 entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
3113 break;
3114 default:
3115 break;
3116 }
3117 vm_map_try_merge_entries(map, prev_entry, entry);
3118 }
3119 vm_map_try_merge_entries(map, prev_entry, entry);
3120 vm_map_unlock(map);
3121 } else {
3122 vm_pindex_t pstart, pend;
3123
3124 /*
3125 * madvise behaviors that are implemented in the underlying
3126 * vm_object.
3127 *
3128 * Since we don't clip the vm_map_entry, we have to clip
3129 * the vm_object pindex and count.
3130 */
3131 if (!vm_map_lookup_entry(map, start, &entry))
3132 entry = vm_map_entry_succ(entry);
3133 for (; entry->start < end;
3134 entry = vm_map_entry_succ(entry)) {
3135 vm_offset_t useEnd, useStart;
3136
3137 if ((entry->eflags & (MAP_ENTRY_IS_SUB_MAP |
3138 MAP_ENTRY_GUARD)) != 0)
3139 continue;
3140
3141 /*
3142 * MADV_FREE would otherwise rewind time to
3143 * the creation of the shadow object. Because
3144 * we hold the VM map read-locked, neither the
3145 * entry's object nor the presence of a
3146 * backing object can change.
3147 */
3148 if (behav == MADV_FREE &&
3149 entry->object.vm_object != NULL &&
3150 entry->object.vm_object->backing_object != NULL)
3151 continue;
3152
3153 pstart = OFF_TO_IDX(entry->offset);
3154 pend = pstart + atop(entry->end - entry->start);
3155 useStart = entry->start;
3156 useEnd = entry->end;
3157
3158 if (entry->start < start) {
3159 pstart += atop(start - entry->start);
3160 useStart = start;
3161 }
3162 if (entry->end > end) {
3163 pend -= atop(entry->end - end);
3164 useEnd = end;
3165 }
3166
3167 if (pstart >= pend)
3168 continue;
3169
3170 /*
3171 * Perform the pmap_advise() before clearing
3172 * PGA_REFERENCED in vm_page_advise(). Otherwise, a
3173 * concurrent pmap operation, such as pmap_remove(),
3174 * could clear a reference in the pmap and set
3175 * PGA_REFERENCED on the page before the pmap_advise()
3176 * had completed. Consequently, the page would appear
3177 * referenced based upon an old reference that
3178 * occurred before this pmap_advise() ran.
3179 */
3180 if (behav == MADV_DONTNEED || behav == MADV_FREE)
3181 pmap_advise(map->pmap, useStart, useEnd,
3182 behav);
3183
3184 vm_object_madvise(entry->object.vm_object, pstart,
3185 pend, behav);
3186
3187 /*
3188 * Pre-populate paging structures in the
3189 * WILLNEED case. For wired entries, the
3190 * paging structures are already populated.
3191 */
3192 if (behav == MADV_WILLNEED &&
3193 entry->wired_count == 0) {
3194 vm_map_pmap_enter(map,
3195 useStart,
3196 entry->protection,
3197 entry->object.vm_object,
3198 pstart,
3199 ptoa(pend - pstart),
3200 MAP_PREFAULT_MADVISE
3201 );
3202 }
3203 }
3204 vm_map_unlock_read(map);
3205 }
3206 return (0);
3207 }
3208
3209 /*
3210 * vm_map_inherit:
3211 *
3212 * Sets the inheritance of the specified address
3213 * range in the target map. Inheritance
3214 * affects how the map will be shared with
3215 * child maps at the time of vmspace_fork.
3216 */
3217 int
vm_map_inherit(vm_map_t map,vm_offset_t start,vm_offset_t end,vm_inherit_t new_inheritance)3218 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
3219 vm_inherit_t new_inheritance)
3220 {
3221 vm_map_entry_t entry, lentry, prev_entry, start_entry;
3222 int rv;
3223
3224 switch (new_inheritance) {
3225 case VM_INHERIT_NONE:
3226 case VM_INHERIT_COPY:
3227 case VM_INHERIT_SHARE:
3228 case VM_INHERIT_ZERO:
3229 break;
3230 default:
3231 return (KERN_INVALID_ARGUMENT);
3232 }
3233 if (start == end)
3234 return (KERN_SUCCESS);
3235 vm_map_lock(map);
3236 VM_MAP_RANGE_CHECK(map, start, end);
3237 rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry);
3238 if (rv != KERN_SUCCESS)
3239 goto unlock;
3240 if (vm_map_lookup_entry(map, end - 1, &lentry)) {
3241 rv = vm_map_clip_end(map, lentry, end);
3242 if (rv != KERN_SUCCESS)
3243 goto unlock;
3244 }
3245 if (new_inheritance == VM_INHERIT_COPY) {
3246 for (entry = start_entry; entry->start < end;
3247 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3248 if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
3249 != 0) {
3250 rv = KERN_INVALID_ARGUMENT;
3251 goto unlock;
3252 }
3253 }
3254 }
3255 for (entry = start_entry; entry->start < end; prev_entry = entry,
3256 entry = vm_map_entry_succ(entry)) {
3257 KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx",
3258 entry, (uintmax_t)entry->end, (uintmax_t)end));
3259 if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
3260 new_inheritance != VM_INHERIT_ZERO)
3261 entry->inheritance = new_inheritance;
3262 vm_map_try_merge_entries(map, prev_entry, entry);
3263 }
3264 vm_map_try_merge_entries(map, prev_entry, entry);
3265 unlock:
3266 vm_map_unlock(map);
3267 return (rv);
3268 }
3269
3270 /*
3271 * vm_map_entry_in_transition:
3272 *
3273 * Release the map lock, and sleep until the entry is no longer in
3274 * transition. Awake and acquire the map lock. If the map changed while
3275 * another held the lock, lookup a possibly-changed entry at or after the
3276 * 'start' position of the old entry.
3277 */
3278 static vm_map_entry_t
vm_map_entry_in_transition(vm_map_t map,vm_offset_t in_start,vm_offset_t * io_end,bool holes_ok,vm_map_entry_t in_entry)3279 vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
3280 vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
3281 {
3282 vm_map_entry_t entry;
3283 vm_offset_t start;
3284 u_int last_timestamp;
3285
3286 VM_MAP_ASSERT_LOCKED(map);
3287 KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3288 ("not in-tranition map entry %p", in_entry));
3289 /*
3290 * We have not yet clipped the entry.
3291 */
3292 start = MAX(in_start, in_entry->start);
3293 in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3294 last_timestamp = map->timestamp;
3295 if (vm_map_unlock_and_wait(map, 0)) {
3296 /*
3297 * Allow interruption of user wiring/unwiring?
3298 */
3299 }
3300 vm_map_lock(map);
3301 if (last_timestamp + 1 == map->timestamp)
3302 return (in_entry);
3303
3304 /*
3305 * Look again for the entry because the map was modified while it was
3306 * unlocked. Specifically, the entry may have been clipped, merged, or
3307 * deleted.
3308 */
3309 if (!vm_map_lookup_entry(map, start, &entry)) {
3310 if (!holes_ok) {
3311 *io_end = start;
3312 return (NULL);
3313 }
3314 entry = vm_map_entry_succ(entry);
3315 }
3316 return (entry);
3317 }
3318
3319 /*
3320 * vm_map_unwire:
3321 *
3322 * Implements both kernel and user unwiring.
3323 */
3324 int
vm_map_unwire(vm_map_t map,vm_offset_t start,vm_offset_t end,int flags)3325 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
3326 int flags)
3327 {
3328 vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3329 int rv;
3330 bool holes_ok, need_wakeup, user_unwire;
3331
3332 if (start == end)
3333 return (KERN_SUCCESS);
3334 holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3335 user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
3336 vm_map_lock(map);
3337 VM_MAP_RANGE_CHECK(map, start, end);
3338 if (!vm_map_lookup_entry(map, start, &first_entry)) {
3339 if (holes_ok)
3340 first_entry = vm_map_entry_succ(first_entry);
3341 else {
3342 vm_map_unlock(map);
3343 return (KERN_INVALID_ADDRESS);
3344 }
3345 }
3346 rv = KERN_SUCCESS;
3347 for (entry = first_entry; entry->start < end; entry = next_entry) {
3348 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3349 /*
3350 * We have not yet clipped the entry.
3351 */
3352 next_entry = vm_map_entry_in_transition(map, start,
3353 &end, holes_ok, entry);
3354 if (next_entry == NULL) {
3355 if (entry == first_entry) {
3356 vm_map_unlock(map);
3357 return (KERN_INVALID_ADDRESS);
3358 }
3359 rv = KERN_INVALID_ADDRESS;
3360 break;
3361 }
3362 first_entry = (entry == first_entry) ?
3363 next_entry : NULL;
3364 continue;
3365 }
3366 rv = vm_map_clip_start(map, entry, start);
3367 if (rv != KERN_SUCCESS)
3368 break;
3369 rv = vm_map_clip_end(map, entry, end);
3370 if (rv != KERN_SUCCESS)
3371 break;
3372
3373 /*
3374 * Mark the entry in case the map lock is released. (See
3375 * above.)
3376 */
3377 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3378 entry->wiring_thread == NULL,
3379 ("owned map entry %p", entry));
3380 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3381 entry->wiring_thread = curthread;
3382 next_entry = vm_map_entry_succ(entry);
3383 /*
3384 * Check the map for holes in the specified region.
3385 * If holes_ok, skip this check.
3386 */
3387 if (!holes_ok &&
3388 entry->end < end && next_entry->start > entry->end) {
3389 end = entry->end;
3390 rv = KERN_INVALID_ADDRESS;
3391 break;
3392 }
3393 /*
3394 * If system unwiring, require that the entry is system wired.
3395 */
3396 if (!user_unwire &&
3397 vm_map_entry_system_wired_count(entry) == 0) {
3398 end = entry->end;
3399 rv = KERN_INVALID_ARGUMENT;
3400 break;
3401 }
3402 }
3403 need_wakeup = false;
3404 if (first_entry == NULL &&
3405 !vm_map_lookup_entry(map, start, &first_entry)) {
3406 KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
3407 prev_entry = first_entry;
3408 entry = vm_map_entry_succ(first_entry);
3409 } else {
3410 prev_entry = vm_map_entry_pred(first_entry);
3411 entry = first_entry;
3412 }
3413 for (; entry->start < end;
3414 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3415 /*
3416 * If holes_ok was specified, an empty
3417 * space in the unwired region could have been mapped
3418 * while the map lock was dropped for draining
3419 * MAP_ENTRY_IN_TRANSITION. Moreover, another thread
3420 * could be simultaneously wiring this new mapping
3421 * entry. Detect these cases and skip any entries
3422 * marked as in transition by us.
3423 */
3424 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3425 entry->wiring_thread != curthread) {
3426 KASSERT(holes_ok,
3427 ("vm_map_unwire: !HOLESOK and new/changed entry"));
3428 continue;
3429 }
3430
3431 if (rv == KERN_SUCCESS && (!user_unwire ||
3432 (entry->eflags & MAP_ENTRY_USER_WIRED))) {
3433 if (entry->wired_count == 1)
3434 vm_map_entry_unwire(map, entry);
3435 else
3436 entry->wired_count--;
3437 if (user_unwire)
3438 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3439 }
3440 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3441 ("vm_map_unwire: in-transition flag missing %p", entry));
3442 KASSERT(entry->wiring_thread == curthread,
3443 ("vm_map_unwire: alien wire %p", entry));
3444 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
3445 entry->wiring_thread = NULL;
3446 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3447 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3448 need_wakeup = true;
3449 }
3450 vm_map_try_merge_entries(map, prev_entry, entry);
3451 }
3452 vm_map_try_merge_entries(map, prev_entry, entry);
3453 vm_map_unlock(map);
3454 if (need_wakeup)
3455 vm_map_wakeup(map);
3456 return (rv);
3457 }
3458
3459 static void
vm_map_wire_user_count_sub(u_long npages)3460 vm_map_wire_user_count_sub(u_long npages)
3461 {
3462
3463 atomic_subtract_long(&vm_user_wire_count, npages);
3464 }
3465
3466 static bool
vm_map_wire_user_count_add(u_long npages)3467 vm_map_wire_user_count_add(u_long npages)
3468 {
3469 u_long wired;
3470
3471 wired = vm_user_wire_count;
3472 do {
3473 if (npages + wired > vm_page_max_user_wired)
3474 return (false);
3475 } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
3476 npages + wired));
3477
3478 return (true);
3479 }
3480
3481 /*
3482 * vm_map_wire_entry_failure:
3483 *
3484 * Handle a wiring failure on the given entry.
3485 *
3486 * The map should be locked.
3487 */
3488 static void
vm_map_wire_entry_failure(vm_map_t map,vm_map_entry_t entry,vm_offset_t failed_addr)3489 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
3490 vm_offset_t failed_addr)
3491 {
3492
3493 VM_MAP_ASSERT_LOCKED(map);
3494 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
3495 entry->wired_count == 1,
3496 ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
3497 KASSERT(failed_addr < entry->end,
3498 ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
3499
3500 /*
3501 * If any pages at the start of this entry were successfully wired,
3502 * then unwire them.
3503 */
3504 if (failed_addr > entry->start) {
3505 pmap_unwire(map->pmap, entry->start, failed_addr);
3506 vm_object_unwire(entry->object.vm_object, entry->offset,
3507 failed_addr - entry->start, PQ_ACTIVE);
3508 }
3509
3510 /*
3511 * Assign an out-of-range value to represent the failure to wire this
3512 * entry.
3513 */
3514 entry->wired_count = -1;
3515 }
3516
3517 int
vm_map_wire(vm_map_t map,vm_offset_t start,vm_offset_t end,int flags)3518 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3519 {
3520 int rv;
3521
3522 vm_map_lock(map);
3523 rv = vm_map_wire_locked(map, start, end, flags);
3524 vm_map_unlock(map);
3525 return (rv);
3526 }
3527
3528 /*
3529 * vm_map_wire_locked:
3530 *
3531 * Implements both kernel and user wiring. Returns with the map locked,
3532 * the map lock may be dropped.
3533 */
3534 int
vm_map_wire_locked(vm_map_t map,vm_offset_t start,vm_offset_t end,int flags)3535 vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3536 {
3537 vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3538 vm_offset_t faddr, saved_end, saved_start;
3539 u_long incr, npages;
3540 u_int bidx, last_timestamp;
3541 int rv;
3542 bool holes_ok, need_wakeup, user_wire;
3543 vm_prot_t prot;
3544
3545 VM_MAP_ASSERT_LOCKED(map);
3546
3547 if (start == end)
3548 return (KERN_SUCCESS);
3549 prot = 0;
3550 if (flags & VM_MAP_WIRE_WRITE)
3551 prot |= VM_PROT_WRITE;
3552 holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3553 user_wire = (flags & VM_MAP_WIRE_USER) != 0;
3554 VM_MAP_RANGE_CHECK(map, start, end);
3555 if (!vm_map_lookup_entry(map, start, &first_entry)) {
3556 if (holes_ok)
3557 first_entry = vm_map_entry_succ(first_entry);
3558 else
3559 return (KERN_INVALID_ADDRESS);
3560 }
3561 for (entry = first_entry; entry->start < end; entry = next_entry) {
3562 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3563 /*
3564 * We have not yet clipped the entry.
3565 */
3566 next_entry = vm_map_entry_in_transition(map, start,
3567 &end, holes_ok, entry);
3568 if (next_entry == NULL) {
3569 if (entry == first_entry)
3570 return (KERN_INVALID_ADDRESS);
3571 rv = KERN_INVALID_ADDRESS;
3572 goto done;
3573 }
3574 first_entry = (entry == first_entry) ?
3575 next_entry : NULL;
3576 continue;
3577 }
3578 rv = vm_map_clip_start(map, entry, start);
3579 if (rv != KERN_SUCCESS)
3580 goto done;
3581 rv = vm_map_clip_end(map, entry, end);
3582 if (rv != KERN_SUCCESS)
3583 goto done;
3584
3585 /*
3586 * Mark the entry in case the map lock is released. (See
3587 * above.)
3588 */
3589 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3590 entry->wiring_thread == NULL,
3591 ("owned map entry %p", entry));
3592 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3593 entry->wiring_thread = curthread;
3594 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
3595 || (entry->protection & prot) != prot) {
3596 entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
3597 if (!holes_ok) {
3598 end = entry->end;
3599 rv = KERN_INVALID_ADDRESS;
3600 goto done;
3601 }
3602 } else if (entry->wired_count == 0) {
3603 entry->wired_count++;
3604
3605 npages = atop(entry->end - entry->start);
3606 if (user_wire && !vm_map_wire_user_count_add(npages)) {
3607 vm_map_wire_entry_failure(map, entry,
3608 entry->start);
3609 end = entry->end;
3610 rv = KERN_RESOURCE_SHORTAGE;
3611 goto done;
3612 }
3613
3614 /*
3615 * Release the map lock, relying on the in-transition
3616 * mark. Mark the map busy for fork.
3617 */
3618 saved_start = entry->start;
3619 saved_end = entry->end;
3620 last_timestamp = map->timestamp;
3621 bidx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
3622 incr = pagesizes[bidx];
3623 vm_map_busy(map);
3624 vm_map_unlock(map);
3625
3626 for (faddr = saved_start; faddr < saved_end;
3627 faddr += incr) {
3628 /*
3629 * Simulate a fault to get the page and enter
3630 * it into the physical map.
3631 */
3632 rv = vm_fault(map, faddr, VM_PROT_NONE,
3633 VM_FAULT_WIRE, NULL);
3634 if (rv != KERN_SUCCESS)
3635 break;
3636 }
3637 vm_map_lock(map);
3638 vm_map_unbusy(map);
3639 if (last_timestamp + 1 != map->timestamp) {
3640 /*
3641 * Look again for the entry because the map was
3642 * modified while it was unlocked. The entry
3643 * may have been clipped, but NOT merged or
3644 * deleted.
3645 */
3646 if (!vm_map_lookup_entry(map, saved_start,
3647 &next_entry))
3648 KASSERT(false,
3649 ("vm_map_wire: lookup failed"));
3650 first_entry = (entry == first_entry) ?
3651 next_entry : NULL;
3652 for (entry = next_entry; entry->end < saved_end;
3653 entry = vm_map_entry_succ(entry)) {
3654 /*
3655 * In case of failure, handle entries
3656 * that were not fully wired here;
3657 * fully wired entries are handled
3658 * later.
3659 */
3660 if (rv != KERN_SUCCESS &&
3661 faddr < entry->end)
3662 vm_map_wire_entry_failure(map,
3663 entry, faddr);
3664 }
3665 }
3666 if (rv != KERN_SUCCESS) {
3667 vm_map_wire_entry_failure(map, entry, faddr);
3668 if (user_wire)
3669 vm_map_wire_user_count_sub(npages);
3670 end = entry->end;
3671 goto done;
3672 }
3673 } else if (!user_wire ||
3674 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3675 entry->wired_count++;
3676 }
3677 /*
3678 * Check the map for holes in the specified region.
3679 * If holes_ok was specified, skip this check.
3680 */
3681 next_entry = vm_map_entry_succ(entry);
3682 if (!holes_ok &&
3683 entry->end < end && next_entry->start > entry->end) {
3684 end = entry->end;
3685 rv = KERN_INVALID_ADDRESS;
3686 goto done;
3687 }
3688 }
3689 rv = KERN_SUCCESS;
3690 done:
3691 need_wakeup = false;
3692 if (first_entry == NULL &&
3693 !vm_map_lookup_entry(map, start, &first_entry)) {
3694 KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
3695 prev_entry = first_entry;
3696 entry = vm_map_entry_succ(first_entry);
3697 } else {
3698 prev_entry = vm_map_entry_pred(first_entry);
3699 entry = first_entry;
3700 }
3701 for (; entry->start < end;
3702 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3703 /*
3704 * If holes_ok was specified, an empty
3705 * space in the unwired region could have been mapped
3706 * while the map lock was dropped for faulting in the
3707 * pages or draining MAP_ENTRY_IN_TRANSITION.
3708 * Moreover, another thread could be simultaneously
3709 * wiring this new mapping entry. Detect these cases
3710 * and skip any entries marked as in transition not by us.
3711 *
3712 * Another way to get an entry not marked with
3713 * MAP_ENTRY_IN_TRANSITION is after failed clipping,
3714 * which set rv to KERN_INVALID_ARGUMENT.
3715 */
3716 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3717 entry->wiring_thread != curthread) {
3718 KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT,
3719 ("vm_map_wire: !HOLESOK and new/changed entry"));
3720 continue;
3721 }
3722
3723 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
3724 /* do nothing */
3725 } else if (rv == KERN_SUCCESS) {
3726 if (user_wire)
3727 entry->eflags |= MAP_ENTRY_USER_WIRED;
3728 } else if (entry->wired_count == -1) {
3729 /*
3730 * Wiring failed on this entry. Thus, unwiring is
3731 * unnecessary.
3732 */
3733 entry->wired_count = 0;
3734 } else if (!user_wire ||
3735 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3736 /*
3737 * Undo the wiring. Wiring succeeded on this entry
3738 * but failed on a later entry.
3739 */
3740 if (entry->wired_count == 1) {
3741 vm_map_entry_unwire(map, entry);
3742 if (user_wire)
3743 vm_map_wire_user_count_sub(
3744 atop(entry->end - entry->start));
3745 } else
3746 entry->wired_count--;
3747 }
3748 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3749 ("vm_map_wire: in-transition flag missing %p", entry));
3750 KASSERT(entry->wiring_thread == curthread,
3751 ("vm_map_wire: alien wire %p", entry));
3752 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3753 MAP_ENTRY_WIRE_SKIPPED);
3754 entry->wiring_thread = NULL;
3755 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3756 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3757 need_wakeup = true;
3758 }
3759 vm_map_try_merge_entries(map, prev_entry, entry);
3760 }
3761 vm_map_try_merge_entries(map, prev_entry, entry);
3762 if (need_wakeup)
3763 vm_map_wakeup(map);
3764 return (rv);
3765 }
3766
3767 /*
3768 * vm_map_sync
3769 *
3770 * Push any dirty cached pages in the address range to their pager.
3771 * If syncio is TRUE, dirty pages are written synchronously.
3772 * If invalidate is TRUE, any cached pages are freed as well.
3773 *
3774 * If the size of the region from start to end is zero, we are
3775 * supposed to flush all modified pages within the region containing
3776 * start. Unfortunately, a region can be split or coalesced with
3777 * neighboring regions, making it difficult to determine what the
3778 * original region was. Therefore, we approximate this requirement by
3779 * flushing the current region containing start.
3780 *
3781 * Returns an error if any part of the specified range is not mapped.
3782 */
3783 int
vm_map_sync(vm_map_t map,vm_offset_t start,vm_offset_t end,boolean_t syncio,boolean_t invalidate)3784 vm_map_sync(
3785 vm_map_t map,
3786 vm_offset_t start,
3787 vm_offset_t end,
3788 boolean_t syncio,
3789 boolean_t invalidate)
3790 {
3791 vm_map_entry_t entry, first_entry, next_entry;
3792 vm_size_t size;
3793 vm_object_t object;
3794 vm_ooffset_t offset;
3795 unsigned int last_timestamp;
3796 int bdry_idx;
3797 boolean_t failed;
3798
3799 vm_map_lock_read(map);
3800 VM_MAP_RANGE_CHECK(map, start, end);
3801 if (!vm_map_lookup_entry(map, start, &first_entry)) {
3802 vm_map_unlock_read(map);
3803 return (KERN_INVALID_ADDRESS);
3804 } else if (start == end) {
3805 start = first_entry->start;
3806 end = first_entry->end;
3807 }
3808
3809 /*
3810 * Make a first pass to check for user-wired memory, holes,
3811 * and partial invalidation of largepage mappings.
3812 */
3813 for (entry = first_entry; entry->start < end; entry = next_entry) {
3814 if (invalidate) {
3815 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
3816 vm_map_unlock_read(map);
3817 return (KERN_INVALID_ARGUMENT);
3818 }
3819 bdry_idx = MAP_ENTRY_SPLIT_BOUNDARY_INDEX(entry);
3820 if (bdry_idx != 0 &&
3821 ((start & (pagesizes[bdry_idx] - 1)) != 0 ||
3822 (end & (pagesizes[bdry_idx] - 1)) != 0)) {
3823 vm_map_unlock_read(map);
3824 return (KERN_INVALID_ARGUMENT);
3825 }
3826 }
3827 next_entry = vm_map_entry_succ(entry);
3828 if (end > entry->end &&
3829 entry->end != next_entry->start) {
3830 vm_map_unlock_read(map);
3831 return (KERN_INVALID_ADDRESS);
3832 }
3833 }
3834
3835 if (invalidate)
3836 pmap_remove(map->pmap, start, end);
3837 failed = FALSE;
3838
3839 /*
3840 * Make a second pass, cleaning/uncaching pages from the indicated
3841 * objects as we go.
3842 */
3843 for (entry = first_entry; entry->start < end;) {
3844 offset = entry->offset + (start - entry->start);
3845 size = (end <= entry->end ? end : entry->end) - start;
3846 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
3847 vm_map_t smap;
3848 vm_map_entry_t tentry;
3849 vm_size_t tsize;
3850
3851 smap = entry->object.sub_map;
3852 vm_map_lock_read(smap);
3853 (void) vm_map_lookup_entry(smap, offset, &tentry);
3854 tsize = tentry->end - offset;
3855 if (tsize < size)
3856 size = tsize;
3857 object = tentry->object.vm_object;
3858 offset = tentry->offset + (offset - tentry->start);
3859 vm_map_unlock_read(smap);
3860 } else {
3861 object = entry->object.vm_object;
3862 }
3863 vm_object_reference(object);
3864 last_timestamp = map->timestamp;
3865 vm_map_unlock_read(map);
3866 if (!vm_object_sync(object, offset, size, syncio, invalidate))
3867 failed = TRUE;
3868 start += size;
3869 vm_object_deallocate(object);
3870 vm_map_lock_read(map);
3871 if (last_timestamp == map->timestamp ||
3872 !vm_map_lookup_entry(map, start, &entry))
3873 entry = vm_map_entry_succ(entry);
3874 }
3875
3876 vm_map_unlock_read(map);
3877 return (failed ? KERN_FAILURE : KERN_SUCCESS);
3878 }
3879
3880 /*
3881 * vm_map_entry_unwire: [ internal use only ]
3882 *
3883 * Make the region specified by this entry pageable.
3884 *
3885 * The map in question should be locked.
3886 * [This is the reason for this routine's existence.]
3887 */
3888 static void
vm_map_entry_unwire(vm_map_t map,vm_map_entry_t entry)3889 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3890 {
3891 vm_size_t size;
3892
3893 VM_MAP_ASSERT_LOCKED(map);
3894 KASSERT(entry->wired_count > 0,
3895 ("vm_map_entry_unwire: entry %p isn't wired", entry));
3896
3897 size = entry->end - entry->start;
3898 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
3899 vm_map_wire_user_count_sub(atop(size));
3900 pmap_unwire(map->pmap, entry->start, entry->end);
3901 vm_object_unwire(entry->object.vm_object, entry->offset, size,
3902 PQ_ACTIVE);
3903 entry->wired_count = 0;
3904 }
3905
3906 static void
vm_map_entry_deallocate(vm_map_entry_t entry,boolean_t system_map)3907 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
3908 {
3909
3910 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
3911 vm_object_deallocate(entry->object.vm_object);
3912 uma_zfree(system_map ? kmapentzone : mapentzone, entry);
3913 }
3914
3915 /*
3916 * vm_map_entry_delete: [ internal use only ]
3917 *
3918 * Deallocate the given entry from the target map.
3919 */
3920 static void
vm_map_entry_delete(vm_map_t map,vm_map_entry_t entry)3921 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
3922 {
3923 vm_object_t object;
3924 vm_pindex_t offidxstart, offidxend, size1;
3925 vm_size_t size;
3926
3927 vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
3928 object = entry->object.vm_object;
3929
3930 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
3931 MPASS(entry->cred == NULL);
3932 MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
3933 MPASS(object == NULL);
3934 vm_map_entry_deallocate(entry, vm_map_is_system(map));
3935 return;
3936 }
3937
3938 size = entry->end - entry->start;
3939 map->size -= size;
3940
3941 if (entry->cred != NULL) {
3942 swap_release_by_cred(size, entry->cred);
3943 crfree(entry->cred);
3944 }
3945
3946 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
3947 entry->object.vm_object = NULL;
3948 } else if ((object->flags & OBJ_ANON) != 0 ||
3949 object == kernel_object) {
3950 KASSERT(entry->cred == NULL || object->cred == NULL ||
3951 (entry->eflags & MAP_ENTRY_NEEDS_COPY),
3952 ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
3953 offidxstart = OFF_TO_IDX(entry->offset);
3954 offidxend = offidxstart + atop(size);
3955 VM_OBJECT_WLOCK(object);
3956 if (object->ref_count != 1 &&
3957 ((object->flags & OBJ_ONEMAPPING) != 0 ||
3958 object == kernel_object)) {
3959 vm_object_collapse(object);
3960
3961 /*
3962 * The option OBJPR_NOTMAPPED can be passed here
3963 * because vm_map_delete() already performed
3964 * pmap_remove() on the only mapping to this range
3965 * of pages.
3966 */
3967 vm_object_page_remove(object, offidxstart, offidxend,
3968 OBJPR_NOTMAPPED);
3969 if (offidxend >= object->size &&
3970 offidxstart < object->size) {
3971 size1 = object->size;
3972 object->size = offidxstart;
3973 if (object->cred != NULL) {
3974 size1 -= object->size;
3975 KASSERT(object->charge >= ptoa(size1),
3976 ("object %p charge < 0", object));
3977 swap_release_by_cred(ptoa(size1),
3978 object->cred);
3979 object->charge -= ptoa(size1);
3980 }
3981 }
3982 }
3983 VM_OBJECT_WUNLOCK(object);
3984 }
3985 if (vm_map_is_system(map))
3986 vm_map_entry_deallocate(entry, TRUE);
3987 else {
3988 entry->defer_next = curthread->td_map_def_user;
3989 curthread->td_map_def_user = entry;
3990 }
3991 }
3992
3993 /*
3994 * vm_map_delete: [ internal use only ]
3995 *
3996 * Deallocates the given address range from the target
3997 * map.
3998 */
3999 int
vm_map_delete(vm_map_t map,vm_offset_t start,vm_offset_t end)4000 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
4001 {
4002 vm_map_entry_t entry, next_entry, scratch_entry;
4003 int rv;
4004
4005 VM_MAP_ASSERT_LOCKED(map);
4006
4007 if (start == end)
4008 return (KERN_SUCCESS);
4009
4010 /*
4011 * Find the start of the region, and clip it.
4012 * Step through all entries in this region.
4013 */
4014 rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
4015 if (rv != KERN_SUCCESS)
4016 return (rv);
4017 for (; entry->start < end; entry = next_entry) {
4018 /*
4019 * Wait for wiring or unwiring of an entry to complete.
4020 * Also wait for any system wirings to disappear on
4021 * user maps.
4022 */
4023 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
4024 (vm_map_pmap(map) != kernel_pmap &&
4025 vm_map_entry_system_wired_count(entry) != 0)) {
4026 unsigned int last_timestamp;
4027 vm_offset_t saved_start;
4028
4029 saved_start = entry->start;
4030 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
4031 last_timestamp = map->timestamp;
4032 (void) vm_map_unlock_and_wait(map, 0);
4033 vm_map_lock(map);
4034 if (last_timestamp + 1 != map->timestamp) {
4035 /*
4036 * Look again for the entry because the map was
4037 * modified while it was unlocked.
4038 * Specifically, the entry may have been
4039 * clipped, merged, or deleted.
4040 */
4041 rv = vm_map_lookup_clip_start(map, saved_start,
4042 &next_entry, &scratch_entry);
4043 if (rv != KERN_SUCCESS)
4044 break;
4045 } else
4046 next_entry = entry;
4047 continue;
4048 }
4049
4050 /* XXXKIB or delete to the upper superpage boundary ? */
4051 rv = vm_map_clip_end(map, entry, end);
4052 if (rv != KERN_SUCCESS)
4053 break;
4054 next_entry = vm_map_entry_succ(entry);
4055
4056 /*
4057 * Unwire before removing addresses from the pmap; otherwise,
4058 * unwiring will put the entries back in the pmap.
4059 */
4060 if (entry->wired_count != 0)
4061 vm_map_entry_unwire(map, entry);
4062
4063 /*
4064 * Remove mappings for the pages, but only if the
4065 * mappings could exist. For instance, it does not
4066 * make sense to call pmap_remove() for guard entries.
4067 */
4068 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
4069 entry->object.vm_object != NULL)
4070 pmap_map_delete(map->pmap, entry->start, entry->end);
4071
4072 /*
4073 * Delete the entry only after removing all pmap
4074 * entries pointing to its pages. (Otherwise, its
4075 * page frames may be reallocated, and any modify bits
4076 * will be set in the wrong object!)
4077 */
4078 vm_map_entry_delete(map, entry);
4079 }
4080 return (rv);
4081 }
4082
4083 /*
4084 * vm_map_remove:
4085 *
4086 * Remove the given address range from the target map.
4087 * This is the exported form of vm_map_delete.
4088 */
4089 int
vm_map_remove(vm_map_t map,vm_offset_t start,vm_offset_t end)4090 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
4091 {
4092 int result;
4093
4094 vm_map_lock(map);
4095 VM_MAP_RANGE_CHECK(map, start, end);
4096 result = vm_map_delete(map, start, end);
4097 vm_map_unlock(map);
4098 return (result);
4099 }
4100
4101 /*
4102 * vm_map_check_protection:
4103 *
4104 * Assert that the target map allows the specified privilege on the
4105 * entire address region given. The entire region must be allocated.
4106 *
4107 * WARNING! This code does not and should not check whether the
4108 * contents of the region is accessible. For example a smaller file
4109 * might be mapped into a larger address space.
4110 *
4111 * NOTE! This code is also called by munmap().
4112 *
4113 * The map must be locked. A read lock is sufficient.
4114 */
4115 boolean_t
vm_map_check_protection(vm_map_t map,vm_offset_t start,vm_offset_t end,vm_prot_t protection)4116 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
4117 vm_prot_t protection)
4118 {
4119 vm_map_entry_t entry;
4120 vm_map_entry_t tmp_entry;
4121
4122 if (!vm_map_lookup_entry(map, start, &tmp_entry))
4123 return (FALSE);
4124 entry = tmp_entry;
4125
4126 while (start < end) {
4127 /*
4128 * No holes allowed!
4129 */
4130 if (start < entry->start)
4131 return (FALSE);
4132 /*
4133 * Check protection associated with entry.
4134 */
4135 if ((entry->protection & protection) != protection)
4136 return (FALSE);
4137 /* go to next entry */
4138 start = entry->end;
4139 entry = vm_map_entry_succ(entry);
4140 }
4141 return (TRUE);
4142 }
4143
4144 /*
4145 *
4146 * vm_map_copy_swap_object:
4147 *
4148 * Copies a swap-backed object from an existing map entry to a
4149 * new one. Carries forward the swap charge. May change the
4150 * src object on return.
4151 */
4152 static void
vm_map_copy_swap_object(vm_map_entry_t src_entry,vm_map_entry_t dst_entry,vm_offset_t size,vm_ooffset_t * fork_charge)4153 vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
4154 vm_offset_t size, vm_ooffset_t *fork_charge)
4155 {
4156 vm_object_t src_object;
4157 struct ucred *cred;
4158 int charged;
4159
4160 src_object = src_entry->object.vm_object;
4161 charged = ENTRY_CHARGED(src_entry);
4162 if ((src_object->flags & OBJ_ANON) != 0) {
4163 VM_OBJECT_WLOCK(src_object);
4164 vm_object_collapse(src_object);
4165 if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
4166 vm_object_split(src_entry);
4167 src_object = src_entry->object.vm_object;
4168 }
4169 vm_object_reference_locked(src_object);
4170 vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
4171 VM_OBJECT_WUNLOCK(src_object);
4172 } else
4173 vm_object_reference(src_object);
4174 if (src_entry->cred != NULL &&
4175 !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4176 KASSERT(src_object->cred == NULL,
4177 ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p",
4178 src_object));
4179 src_object->cred = src_entry->cred;
4180 src_object->charge = size;
4181 }
4182 dst_entry->object.vm_object = src_object;
4183 if (charged) {
4184 cred = curthread->td_ucred;
4185 crhold(cred);
4186 dst_entry->cred = cred;
4187 *fork_charge += size;
4188 if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4189 crhold(cred);
4190 src_entry->cred = cred;
4191 *fork_charge += size;
4192 }
4193 }
4194 }
4195
4196 /*
4197 * vm_map_copy_entry:
4198 *
4199 * Copies the contents of the source entry to the destination
4200 * entry. The entries *must* be aligned properly.
4201 */
4202 static void
vm_map_copy_entry(vm_map_t src_map,vm_map_t dst_map,vm_map_entry_t src_entry,vm_map_entry_t dst_entry,vm_ooffset_t * fork_charge)4203 vm_map_copy_entry(
4204 vm_map_t src_map,
4205 vm_map_t dst_map,
4206 vm_map_entry_t src_entry,
4207 vm_map_entry_t dst_entry,
4208 vm_ooffset_t *fork_charge)
4209 {
4210 vm_object_t src_object;
4211 vm_map_entry_t fake_entry;
4212 vm_offset_t size;
4213
4214 VM_MAP_ASSERT_LOCKED(dst_map);
4215
4216 if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
4217 return;
4218
4219 if (src_entry->wired_count == 0 ||
4220 (src_entry->protection & VM_PROT_WRITE) == 0) {
4221 /*
4222 * If the source entry is marked needs_copy, it is already
4223 * write-protected.
4224 */
4225 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
4226 (src_entry->protection & VM_PROT_WRITE) != 0) {
4227 pmap_protect(src_map->pmap,
4228 src_entry->start,
4229 src_entry->end,
4230 src_entry->protection & ~VM_PROT_WRITE);
4231 }
4232
4233 /*
4234 * Make a copy of the object.
4235 */
4236 size = src_entry->end - src_entry->start;
4237 if ((src_object = src_entry->object.vm_object) != NULL) {
4238 if ((src_object->flags & OBJ_SWAP) != 0) {
4239 vm_map_copy_swap_object(src_entry, dst_entry,
4240 size, fork_charge);
4241 /* May have split/collapsed, reload obj. */
4242 src_object = src_entry->object.vm_object;
4243 } else {
4244 vm_object_reference(src_object);
4245 dst_entry->object.vm_object = src_object;
4246 }
4247 src_entry->eflags |= MAP_ENTRY_COW |
4248 MAP_ENTRY_NEEDS_COPY;
4249 dst_entry->eflags |= MAP_ENTRY_COW |
4250 MAP_ENTRY_NEEDS_COPY;
4251 dst_entry->offset = src_entry->offset;
4252 if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
4253 /*
4254 * MAP_ENTRY_WRITECNT cannot
4255 * indicate write reference from
4256 * src_entry, since the entry is
4257 * marked as needs copy. Allocate a
4258 * fake entry that is used to
4259 * decrement object->un_pager writecount
4260 * at the appropriate time. Attach
4261 * fake_entry to the deferred list.
4262 */
4263 fake_entry = vm_map_entry_create(dst_map);
4264 fake_entry->eflags = MAP_ENTRY_WRITECNT;
4265 src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
4266 vm_object_reference(src_object);
4267 fake_entry->object.vm_object = src_object;
4268 fake_entry->start = src_entry->start;
4269 fake_entry->end = src_entry->end;
4270 fake_entry->defer_next =
4271 curthread->td_map_def_user;
4272 curthread->td_map_def_user = fake_entry;
4273 }
4274
4275 pmap_copy(dst_map->pmap, src_map->pmap,
4276 dst_entry->start, dst_entry->end - dst_entry->start,
4277 src_entry->start);
4278 } else {
4279 dst_entry->object.vm_object = NULL;
4280 if ((dst_entry->eflags & MAP_ENTRY_GUARD) == 0)
4281 dst_entry->offset = 0;
4282 if (src_entry->cred != NULL) {
4283 dst_entry->cred = curthread->td_ucred;
4284 crhold(dst_entry->cred);
4285 *fork_charge += size;
4286 }
4287 }
4288 } else {
4289 /*
4290 * We don't want to make writeable wired pages copy-on-write.
4291 * Immediately copy these pages into the new map by simulating
4292 * page faults. The new pages are pageable.
4293 */
4294 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
4295 fork_charge);
4296 }
4297 }
4298
4299 /*
4300 * vmspace_map_entry_forked:
4301 * Update the newly-forked vmspace each time a map entry is inherited
4302 * or copied. The values for vm_dsize and vm_tsize are approximate
4303 * (and mostly-obsolete ideas in the face of mmap(2) et al.)
4304 */
4305 static void
vmspace_map_entry_forked(const struct vmspace * vm1,struct vmspace * vm2,vm_map_entry_t entry)4306 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
4307 vm_map_entry_t entry)
4308 {
4309 vm_size_t entrysize;
4310 vm_offset_t newend;
4311
4312 if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
4313 return;
4314 entrysize = entry->end - entry->start;
4315 vm2->vm_map.size += entrysize;
4316 if ((entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0) {
4317 vm2->vm_ssize += btoc(entrysize);
4318 } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
4319 entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
4320 newend = MIN(entry->end,
4321 (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
4322 vm2->vm_dsize += btoc(newend - entry->start);
4323 } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
4324 entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
4325 newend = MIN(entry->end,
4326 (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
4327 vm2->vm_tsize += btoc(newend - entry->start);
4328 }
4329 }
4330
4331 /*
4332 * vmspace_fork:
4333 * Create a new process vmspace structure and vm_map
4334 * based on those of an existing process. The new map
4335 * is based on the old map, according to the inheritance
4336 * values on the regions in that map.
4337 *
4338 * XXX It might be worth coalescing the entries added to the new vmspace.
4339 *
4340 * The source map must not be locked.
4341 */
4342 struct vmspace *
vmspace_fork(struct vmspace * vm1,vm_ooffset_t * fork_charge)4343 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
4344 {
4345 struct vmspace *vm2;
4346 vm_map_t new_map, old_map;
4347 vm_map_entry_t new_entry, old_entry;
4348 vm_object_t object;
4349 int error, locked __diagused;
4350 vm_inherit_t inh;
4351
4352 old_map = &vm1->vm_map;
4353 /* Copy immutable fields of vm1 to vm2. */
4354 vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
4355 pmap_pinit);
4356 if (vm2 == NULL)
4357 return (NULL);
4358
4359 vm2->vm_taddr = vm1->vm_taddr;
4360 vm2->vm_daddr = vm1->vm_daddr;
4361 vm2->vm_maxsaddr = vm1->vm_maxsaddr;
4362 vm2->vm_stacktop = vm1->vm_stacktop;
4363 vm2->vm_shp_base = vm1->vm_shp_base;
4364 vm_map_lock(old_map);
4365 if (old_map->busy)
4366 vm_map_wait_busy(old_map);
4367 new_map = &vm2->vm_map;
4368 locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
4369 KASSERT(locked, ("vmspace_fork: lock failed"));
4370
4371 error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
4372 if (error != 0) {
4373 sx_xunlock(&old_map->lock);
4374 sx_xunlock(&new_map->lock);
4375 vm_map_process_deferred();
4376 vmspace_free(vm2);
4377 return (NULL);
4378 }
4379
4380 new_map->anon_loc = old_map->anon_loc;
4381 new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART |
4382 MAP_ASLR_STACK | MAP_WXORX);
4383
4384 VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
4385 if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
4386 panic("vm_map_fork: encountered a submap");
4387
4388 inh = old_entry->inheritance;
4389 if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4390 inh != VM_INHERIT_NONE)
4391 inh = VM_INHERIT_COPY;
4392
4393 switch (inh) {
4394 case VM_INHERIT_NONE:
4395 break;
4396
4397 case VM_INHERIT_SHARE:
4398 /*
4399 * Clone the entry, creating the shared object if
4400 * necessary.
4401 */
4402 object = old_entry->object.vm_object;
4403 if (object == NULL) {
4404 vm_map_entry_back(old_entry);
4405 object = old_entry->object.vm_object;
4406 }
4407
4408 /*
4409 * Add the reference before calling vm_object_shadow
4410 * to insure that a shadow object is created.
4411 */
4412 vm_object_reference(object);
4413 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4414 vm_object_shadow(&old_entry->object.vm_object,
4415 &old_entry->offset,
4416 old_entry->end - old_entry->start,
4417 old_entry->cred,
4418 /* Transfer the second reference too. */
4419 true);
4420 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4421 old_entry->cred = NULL;
4422
4423 /*
4424 * As in vm_map_merged_neighbor_dispose(),
4425 * the vnode lock will not be acquired in
4426 * this call to vm_object_deallocate().
4427 */
4428 vm_object_deallocate(object);
4429 object = old_entry->object.vm_object;
4430 } else {
4431 VM_OBJECT_WLOCK(object);
4432 vm_object_clear_flag(object, OBJ_ONEMAPPING);
4433 if (old_entry->cred != NULL) {
4434 KASSERT(object->cred == NULL,
4435 ("vmspace_fork both cred"));
4436 object->cred = old_entry->cred;
4437 object->charge = old_entry->end -
4438 old_entry->start;
4439 old_entry->cred = NULL;
4440 }
4441
4442 /*
4443 * Assert the correct state of the vnode
4444 * v_writecount while the object is locked, to
4445 * not relock it later for the assertion
4446 * correctness.
4447 */
4448 if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
4449 object->type == OBJT_VNODE) {
4450 KASSERT(((struct vnode *)object->
4451 handle)->v_writecount > 0,
4452 ("vmspace_fork: v_writecount %p",
4453 object));
4454 KASSERT(object->un_pager.vnp.
4455 writemappings > 0,
4456 ("vmspace_fork: vnp.writecount %p",
4457 object));
4458 }
4459 VM_OBJECT_WUNLOCK(object);
4460 }
4461
4462 /*
4463 * Clone the entry, referencing the shared object.
4464 */
4465 new_entry = vm_map_entry_create(new_map);
4466 *new_entry = *old_entry;
4467 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4468 MAP_ENTRY_IN_TRANSITION);
4469 new_entry->wiring_thread = NULL;
4470 new_entry->wired_count = 0;
4471 if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
4472 vm_pager_update_writecount(object,
4473 new_entry->start, new_entry->end);
4474 }
4475 vm_map_entry_set_vnode_text(new_entry, true);
4476
4477 /*
4478 * Insert the entry into the new map -- we know we're
4479 * inserting at the end of the new map.
4480 */
4481 vm_map_entry_link(new_map, new_entry);
4482 vmspace_map_entry_forked(vm1, vm2, new_entry);
4483
4484 /*
4485 * Update the physical map
4486 */
4487 pmap_copy(new_map->pmap, old_map->pmap,
4488 new_entry->start,
4489 (old_entry->end - old_entry->start),
4490 old_entry->start);
4491 break;
4492
4493 case VM_INHERIT_COPY:
4494 /*
4495 * Clone the entry and link into the map.
4496 */
4497 new_entry = vm_map_entry_create(new_map);
4498 *new_entry = *old_entry;
4499 /*
4500 * Copied entry is COW over the old object.
4501 */
4502 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4503 MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
4504 new_entry->wiring_thread = NULL;
4505 new_entry->wired_count = 0;
4506 new_entry->object.vm_object = NULL;
4507 new_entry->cred = NULL;
4508 vm_map_entry_link(new_map, new_entry);
4509 vmspace_map_entry_forked(vm1, vm2, new_entry);
4510 vm_map_copy_entry(old_map, new_map, old_entry,
4511 new_entry, fork_charge);
4512 vm_map_entry_set_vnode_text(new_entry, true);
4513 break;
4514
4515 case VM_INHERIT_ZERO:
4516 /*
4517 * Create a new anonymous mapping entry modelled from
4518 * the old one.
4519 */
4520 new_entry = vm_map_entry_create(new_map);
4521 memset(new_entry, 0, sizeof(*new_entry));
4522
4523 new_entry->start = old_entry->start;
4524 new_entry->end = old_entry->end;
4525 new_entry->eflags = old_entry->eflags &
4526 ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
4527 MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC |
4528 MAP_ENTRY_SPLIT_BOUNDARY_MASK);
4529 new_entry->protection = old_entry->protection;
4530 new_entry->max_protection = old_entry->max_protection;
4531 new_entry->inheritance = VM_INHERIT_ZERO;
4532
4533 vm_map_entry_link(new_map, new_entry);
4534 vmspace_map_entry_forked(vm1, vm2, new_entry);
4535
4536 new_entry->cred = curthread->td_ucred;
4537 crhold(new_entry->cred);
4538 *fork_charge += (new_entry->end - new_entry->start);
4539
4540 break;
4541 }
4542 }
4543 /*
4544 * Use inlined vm_map_unlock() to postpone handling the deferred
4545 * map entries, which cannot be done until both old_map and
4546 * new_map locks are released.
4547 */
4548 sx_xunlock(&old_map->lock);
4549 sx_xunlock(&new_map->lock);
4550 vm_map_process_deferred();
4551
4552 return (vm2);
4553 }
4554
4555 /*
4556 * Create a process's stack for exec_new_vmspace(). This function is never
4557 * asked to wire the newly created stack.
4558 */
4559 int
vm_map_stack(vm_map_t map,vm_offset_t addrbos,vm_size_t max_ssize,vm_prot_t prot,vm_prot_t max,int cow)4560 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4561 vm_prot_t prot, vm_prot_t max, int cow)
4562 {
4563 vm_size_t growsize, init_ssize;
4564 rlim_t vmemlim;
4565 int rv;
4566
4567 MPASS((map->flags & MAP_WIREFUTURE) == 0);
4568 growsize = sgrowsiz;
4569 init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
4570 vm_map_lock(map);
4571 vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4572 /* If we would blow our VMEM resource limit, no go */
4573 if (map->size + init_ssize > vmemlim) {
4574 rv = KERN_NO_SPACE;
4575 goto out;
4576 }
4577 rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
4578 max, cow);
4579 out:
4580 vm_map_unlock(map);
4581 return (rv);
4582 }
4583
4584 static int stack_guard_page = 1;
4585 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
4586 &stack_guard_page, 0,
4587 "Specifies the number of guard pages for a stack that grows");
4588
4589 static int
vm_map_stack_locked(vm_map_t map,vm_offset_t addrbos,vm_size_t max_ssize,vm_size_t growsize,vm_prot_t prot,vm_prot_t max,int cow)4590 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4591 vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
4592 {
4593 vm_map_entry_t gap_entry, new_entry, prev_entry;
4594 vm_offset_t bot, gap_bot, gap_top, top;
4595 vm_size_t init_ssize, sgp;
4596 int rv;
4597
4598 KASSERT((cow & MAP_STACK_AREA) != 0,
4599 ("New mapping is not a stack"));
4600
4601 if (max_ssize == 0 ||
4602 !vm_map_range_valid(map, addrbos, addrbos + max_ssize))
4603 return (KERN_INVALID_ADDRESS);
4604 sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4605 (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4606 (vm_size_t)stack_guard_page * PAGE_SIZE;
4607 if (sgp >= max_ssize)
4608 return (KERN_INVALID_ARGUMENT);
4609
4610 init_ssize = growsize;
4611 if (max_ssize < init_ssize + sgp)
4612 init_ssize = max_ssize - sgp;
4613
4614 /* If addr is already mapped, no go */
4615 if (vm_map_lookup_entry(map, addrbos, &prev_entry))
4616 return (KERN_NO_SPACE);
4617
4618 /*
4619 * If we can't accommodate max_ssize in the current mapping, no go.
4620 */
4621 if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
4622 return (KERN_NO_SPACE);
4623
4624 /*
4625 * We initially map a stack of only init_ssize, at the top of
4626 * the range. We will grow as needed later.
4627 *
4628 * Note: we would normally expect prot and max to be VM_PROT_ALL,
4629 * and cow to be 0. Possibly we should eliminate these as input
4630 * parameters, and just pass these values here in the insert call.
4631 */
4632 bot = addrbos + max_ssize - init_ssize;
4633 top = bot + init_ssize;
4634 gap_bot = addrbos;
4635 gap_top = bot;
4636 rv = vm_map_insert1(map, NULL, 0, bot, top, prot, max, cow,
4637 &new_entry);
4638 if (rv != KERN_SUCCESS)
4639 return (rv);
4640 KASSERT(new_entry->end == top || new_entry->start == bot,
4641 ("Bad entry start/end for new stack entry"));
4642 KASSERT((new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
4643 ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
4644 if (gap_bot == gap_top)
4645 return (KERN_SUCCESS);
4646 rv = vm_map_insert1(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
4647 VM_PROT_NONE, MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP,
4648 &gap_entry);
4649 if (rv == KERN_SUCCESS) {
4650 KASSERT((gap_entry->eflags & MAP_ENTRY_GUARD) != 0,
4651 ("entry %p not gap %#x", gap_entry, gap_entry->eflags));
4652 KASSERT((gap_entry->eflags & MAP_ENTRY_STACK_GAP) != 0,
4653 ("entry %p not stack gap %#x", gap_entry,
4654 gap_entry->eflags));
4655
4656 /*
4657 * Gap can never successfully handle a fault, so
4658 * read-ahead logic is never used for it. Re-use
4659 * next_read of the gap entry to store
4660 * stack_guard_page for vm_map_growstack().
4661 * Similarly, since a gap cannot have a backing object,
4662 * store the original stack protections in the
4663 * object offset.
4664 */
4665 gap_entry->next_read = sgp;
4666 gap_entry->offset = prot | PROT_MAX(max);
4667 } else {
4668 (void)vm_map_delete(map, bot, top);
4669 }
4670 return (rv);
4671 }
4672
4673 /*
4674 * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we
4675 * successfully grow the stack.
4676 */
4677 static int
vm_map_growstack(vm_map_t map,vm_offset_t addr,vm_map_entry_t gap_entry)4678 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
4679 {
4680 vm_map_entry_t stack_entry;
4681 struct proc *p;
4682 struct vmspace *vm;
4683 vm_offset_t gap_end, gap_start, grow_start;
4684 vm_size_t grow_amount, guard, max_grow, sgp;
4685 vm_prot_t prot, max;
4686 rlim_t lmemlim, stacklim, vmemlim;
4687 int rv, rv1 __diagused;
4688 bool gap_deleted, is_procstack;
4689 #ifdef notyet
4690 uint64_t limit;
4691 #endif
4692 #ifdef RACCT
4693 int error __diagused;
4694 #endif
4695
4696 p = curproc;
4697 vm = p->p_vmspace;
4698
4699 /*
4700 * Disallow stack growth when the access is performed by a
4701 * debugger or AIO daemon. The reason is that the wrong
4702 * resource limits are applied.
4703 */
4704 if (p != initproc && (map != &p->p_vmspace->vm_map ||
4705 p->p_textvp == NULL))
4706 return (KERN_FAILURE);
4707
4708 MPASS(!vm_map_is_system(map));
4709
4710 lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
4711 stacklim = lim_cur(curthread, RLIMIT_STACK);
4712 vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4713 retry:
4714 /* If addr is not in a hole for a stack grow area, no need to grow. */
4715 if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
4716 return (KERN_FAILURE);
4717 if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
4718 return (KERN_SUCCESS);
4719 if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP) != 0) {
4720 stack_entry = vm_map_entry_succ(gap_entry);
4721 if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
4722 stack_entry->start != gap_entry->end)
4723 return (KERN_FAILURE);
4724 grow_amount = round_page(stack_entry->start - addr);
4725 } else {
4726 return (KERN_FAILURE);
4727 }
4728 guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4729 (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4730 gap_entry->next_read;
4731 max_grow = gap_entry->end - gap_entry->start;
4732 if (guard > max_grow)
4733 return (KERN_NO_SPACE);
4734 max_grow -= guard;
4735 if (grow_amount > max_grow)
4736 return (KERN_NO_SPACE);
4737
4738 /*
4739 * If this is the main process stack, see if we're over the stack
4740 * limit.
4741 */
4742 is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
4743 addr < (vm_offset_t)vm->vm_stacktop;
4744 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
4745 return (KERN_NO_SPACE);
4746
4747 #ifdef RACCT
4748 if (racct_enable) {
4749 PROC_LOCK(p);
4750 if (is_procstack && racct_set(p, RACCT_STACK,
4751 ctob(vm->vm_ssize) + grow_amount)) {
4752 PROC_UNLOCK(p);
4753 return (KERN_NO_SPACE);
4754 }
4755 PROC_UNLOCK(p);
4756 }
4757 #endif
4758
4759 grow_amount = roundup(grow_amount, sgrowsiz);
4760 if (grow_amount > max_grow)
4761 grow_amount = max_grow;
4762 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
4763 grow_amount = trunc_page((vm_size_t)stacklim) -
4764 ctob(vm->vm_ssize);
4765 }
4766
4767 #ifdef notyet
4768 PROC_LOCK(p);
4769 limit = racct_get_available(p, RACCT_STACK);
4770 PROC_UNLOCK(p);
4771 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
4772 grow_amount = limit - ctob(vm->vm_ssize);
4773 #endif
4774
4775 if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
4776 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
4777 rv = KERN_NO_SPACE;
4778 goto out;
4779 }
4780 #ifdef RACCT
4781 if (racct_enable) {
4782 PROC_LOCK(p);
4783 if (racct_set(p, RACCT_MEMLOCK,
4784 ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
4785 PROC_UNLOCK(p);
4786 rv = KERN_NO_SPACE;
4787 goto out;
4788 }
4789 PROC_UNLOCK(p);
4790 }
4791 #endif
4792 }
4793
4794 /* If we would blow our VMEM resource limit, no go */
4795 if (map->size + grow_amount > vmemlim) {
4796 rv = KERN_NO_SPACE;
4797 goto out;
4798 }
4799 #ifdef RACCT
4800 if (racct_enable) {
4801 PROC_LOCK(p);
4802 if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
4803 PROC_UNLOCK(p);
4804 rv = KERN_NO_SPACE;
4805 goto out;
4806 }
4807 PROC_UNLOCK(p);
4808 }
4809 #endif
4810
4811 if (vm_map_lock_upgrade(map)) {
4812 gap_entry = NULL;
4813 vm_map_lock_read(map);
4814 goto retry;
4815 }
4816
4817 /*
4818 * The gap_entry "offset" field is overloaded. See
4819 * vm_map_stack_locked().
4820 */
4821 prot = PROT_EXTRACT(gap_entry->offset);
4822 max = PROT_MAX_EXTRACT(gap_entry->offset);
4823 sgp = gap_entry->next_read;
4824
4825 grow_start = gap_entry->end - grow_amount;
4826 if (gap_entry->start + grow_amount == gap_entry->end) {
4827 gap_start = gap_entry->start;
4828 gap_end = gap_entry->end;
4829 vm_map_entry_delete(map, gap_entry);
4830 gap_deleted = true;
4831 } else {
4832 MPASS(gap_entry->start < gap_entry->end - grow_amount);
4833 vm_map_entry_resize(map, gap_entry, -grow_amount);
4834 gap_deleted = false;
4835 }
4836 rv = vm_map_insert(map, NULL, 0, grow_start,
4837 grow_start + grow_amount, prot, max, MAP_STACK_AREA);
4838 if (rv != KERN_SUCCESS) {
4839 if (gap_deleted) {
4840 rv1 = vm_map_insert1(map, NULL, 0, gap_start,
4841 gap_end, VM_PROT_NONE, VM_PROT_NONE,
4842 MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP,
4843 &gap_entry);
4844 MPASS(rv1 == KERN_SUCCESS);
4845 gap_entry->next_read = sgp;
4846 gap_entry->offset = prot | PROT_MAX(max);
4847 } else {
4848 vm_map_entry_resize(map, gap_entry,
4849 grow_amount);
4850 }
4851 }
4852 if (rv == KERN_SUCCESS && is_procstack)
4853 vm->vm_ssize += btoc(grow_amount);
4854
4855 /*
4856 * Heed the MAP_WIREFUTURE flag if it was set for this process.
4857 */
4858 if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
4859 rv = vm_map_wire_locked(map, grow_start,
4860 grow_start + grow_amount,
4861 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
4862 }
4863 vm_map_lock_downgrade(map);
4864
4865 out:
4866 #ifdef RACCT
4867 if (racct_enable && rv != KERN_SUCCESS) {
4868 PROC_LOCK(p);
4869 error = racct_set(p, RACCT_VMEM, map->size);
4870 KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
4871 if (!old_mlock) {
4872 error = racct_set(p, RACCT_MEMLOCK,
4873 ptoa(pmap_wired_count(map->pmap)));
4874 KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
4875 }
4876 error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
4877 KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
4878 PROC_UNLOCK(p);
4879 }
4880 #endif
4881
4882 return (rv);
4883 }
4884
4885 /*
4886 * Unshare the specified VM space for exec. If other processes are
4887 * mapped to it, then create a new one. The new vmspace is null.
4888 */
4889 int
vmspace_exec(struct proc * p,vm_offset_t minuser,vm_offset_t maxuser)4890 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
4891 {
4892 struct vmspace *oldvmspace = p->p_vmspace;
4893 struct vmspace *newvmspace;
4894
4895 KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
4896 ("vmspace_exec recursed"));
4897 newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
4898 if (newvmspace == NULL)
4899 return (ENOMEM);
4900 newvmspace->vm_swrss = oldvmspace->vm_swrss;
4901 /*
4902 * This code is written like this for prototype purposes. The
4903 * goal is to avoid running down the vmspace here, but let the
4904 * other process's that are still using the vmspace to finally
4905 * run it down. Even though there is little or no chance of blocking
4906 * here, it is a good idea to keep this form for future mods.
4907 */
4908 PROC_VMSPACE_LOCK(p);
4909 p->p_vmspace = newvmspace;
4910 PROC_VMSPACE_UNLOCK(p);
4911 if (p == curthread->td_proc)
4912 pmap_activate(curthread);
4913 curthread->td_pflags |= TDP_EXECVMSPC;
4914 return (0);
4915 }
4916
4917 /*
4918 * Unshare the specified VM space for forcing COW. This
4919 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4920 */
4921 int
vmspace_unshare(struct proc * p)4922 vmspace_unshare(struct proc *p)
4923 {
4924 struct vmspace *oldvmspace = p->p_vmspace;
4925 struct vmspace *newvmspace;
4926 vm_ooffset_t fork_charge;
4927
4928 /*
4929 * The caller is responsible for ensuring that the reference count
4930 * cannot concurrently transition 1 -> 2.
4931 */
4932 if (refcount_load(&oldvmspace->vm_refcnt) == 1)
4933 return (0);
4934 fork_charge = 0;
4935 newvmspace = vmspace_fork(oldvmspace, &fork_charge);
4936 if (newvmspace == NULL)
4937 return (ENOMEM);
4938 if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
4939 vmspace_free(newvmspace);
4940 return (ENOMEM);
4941 }
4942 PROC_VMSPACE_LOCK(p);
4943 p->p_vmspace = newvmspace;
4944 PROC_VMSPACE_UNLOCK(p);
4945 if (p == curthread->td_proc)
4946 pmap_activate(curthread);
4947 vmspace_free(oldvmspace);
4948 return (0);
4949 }
4950
/*
 *	vm_map_lookup:
 *
 *	Finds the VM object, offset, and
 *	protection for a given virtual address in the
 *	specified map, assuming a page fault of the
 *	type specified.
 *
 *	Leaves the map in question locked for read; return
 *	values are guaranteed until a vm_map_lookup_done
 *	call is performed.  Note that the map argument
 *	is in/out; the returned map must be used in
 *	the call to vm_map_lookup_done.
 *
 *	A handle (out_entry) is returned for use in
 *	vm_map_lookup_done, to make that fast.
 *
 *	If a lookup is requested with "write protection"
 *	specified, the map may be changed to perform virtual
 *	copying operations, although the data referenced will
 *	remain the same.
 *
 *	Returns KERN_SUCCESS with the map read-locked, or one of the
 *	following with the map unlocked:
 *	  KERN_INVALID_ADDRESS    - no map entry contains vaddr;
 *	  KERN_PROTECTION_FAILURE - the entry does not permit the
 *	                            requested access;
 *	  KERN_RESOURCE_SHORTAGE  - swap could not be reserved for a
 *	                            copy-on-write shadow object.
 *
 *	Note that *out_entry is written whenever the entry lookup runs and
 *	*wired is written before the copy-on-write handling, so both may
 *	have been updated on some failure returns as well.
 */
int
vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
	      vm_offset_t vaddr,
	      vm_prot_t fault_typea,
	      vm_map_entry_t *out_entry,	/* OUT */
	      vm_object_t *object,		/* OUT */
	      vm_pindex_t *pindex,		/* OUT */
	      vm_prot_t *out_prot,		/* OUT */
	      boolean_t *wired)			/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type;
	vm_object_t eobject;
	vm_size_t size;
	struct ucred *cred;

	/*
	 * Full restart point: used whenever the map lock has been dropped
	 * (submap descent, failed lock upgrade), so the read lock is taken
	 * again and the entry is looked up from scratch.
	 */
RetryLookup:

	vm_map_lock_read(map);

	/*
	 * Restart point for callers that jump here with the read lock
	 * still held (see the stack-growth case below).
	 */
RetryLookupLocked:
	/*
	 * Lookup the faulting address.
	 */
	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
		vm_map_unlock_read(map);
		return (KERN_INVALID_ADDRESS);
	}

	entry = *out_entry;

	/*
	 * Handle submaps: redirect both the local map and the caller's
	 * *var_map to the submap, release the parent map, and restart the
	 * lookup inside the submap.
	 */
	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
		vm_map_t old_map = map;

		*var_map = map = entry->object.sub_map;
		vm_map_unlock_read(old_map);
		goto RetryLookup;
	}

	/*
	 * Check whether this task is allowed to have this page.
	 */
	prot = entry->protection;
	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
		/*
		 * The flag is cleared before anything else, so the stack
		 * growth below is attempted at most once per call, even
		 * when the lookup is restarted.
		 */
		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
		/*
		 * An inaccessible guard entry marked as a stack gap in a
		 * non-kernel map: try to grow the stack over the faulting
		 * address and, on success, look the address up again.  The
		 * jump targets the label past the lock acquisition, so the
		 * map is expected to still be read-locked at this point.
		 */
		if (prot == VM_PROT_NONE && map != kernel_map &&
		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
		    (entry->eflags & MAP_ENTRY_STACK_GAP) != 0 &&
		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
			goto RetryLookupLocked;
	}
	/* Only the basic access bits take part in the permission check. */
	fault_type = fault_typea & VM_PROT_ALL;
	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}
	/*
	 * A writable entry must never be both user-wired and still
	 * awaiting its copy-on-write copy.
	 */
	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
	    ("entry %p flags %x", entry, entry->eflags));
	/*
	 * A forced copy (VM_PROT_COPY) is refused when the entry can never
	 * become writable and is not a copy-on-write mapping.
	 */
	if ((fault_typea & VM_PROT_COPY) != 0 &&
	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
	    (entry->eflags & MAP_ENTRY_COW) == 0) {
		vm_map_unlock_read(map);
		return (KERN_PROTECTION_FAILURE);
	}

	/*
	 * If this page is not pageable, we have to get it for all possible
	 * accesses.
	 */
	*wired = (entry->wired_count != 0);
	if (*wired)
		fault_type = entry->protection;
	size = entry->end - entry->start;

	/*
	 * If the entry was copy-on-write, we either ...
	 */
	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * If we want to write the page, we may as well handle that
		 * now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just demote the
		 * permissions allowed.
		 */
		if ((fault_type & VM_PROT_WRITE) != 0 ||
		    (fault_typea & VM_PROT_COPY) != 0) {
			/*
			 * Make a new object, and place it in the object
			 * chain.  Note that no new references have appeared
			 * -- one just moved from the map to the new
			 * object.
			 *
			 * The entry is about to be modified, so the write
			 * lock is needed.  If the upgrade fails, start over
			 * from RetryLookup, which takes the read lock again.
			 */
			if (vm_map_lock_upgrade(map))
				goto RetryLookup;

			if (entry->cred == NULL) {
				/*
				 * The debugger owner is charged for
				 * the memory.
				 */
				cred = curthread->td_ucred;
				crhold(cred);
				if (!swap_reserve_by_cred(size, cred)) {
					crfree(cred);
					/* Write lock is held here. */
					vm_map_unlock(map);
					return (KERN_RESOURCE_SHORTAGE);
				}
				entry->cred = cred;
			}
			/*
			 * Remember the current object so that it can be
			 * detected whether vm_object_shadow() replaced it.
			 */
			eobject = entry->object.vm_object;
			vm_object_shadow(&entry->object.vm_object,
			    &entry->offset, size, entry->cred, false);
			if (eobject == entry->object.vm_object) {
				/*
				 * The object was not shadowed, so give back
				 * the swap reservation and the credential
				 * reference held by the entry.
				 */
				swap_release_by_cred(size, entry->cred);
				crfree(entry->cred);
			}
			/* In either case the entry no longer carries a cred. */
			entry->cred = NULL;
			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;

			vm_map_lock_downgrade(map);
		} else {
			/*
			 * We're attempting to read a copy-on-write page --
			 * don't allow writes.
			 */
			prot &= ~VM_PROT_WRITE;
		}
	}

	/*
	 * Create an object if necessary.  This also modifies the entry,
	 * so it is done under the write lock; a failed upgrade restarts
	 * the whole lookup.
	 */
	if (entry->object.vm_object == NULL && !vm_map_is_system(map)) {
		if (vm_map_lock_upgrade(map))
			goto RetryLookup;
		entry->object.vm_object = vm_object_allocate_anon(atop(size),
		    NULL, entry->cred, size);
		entry->offset = 0;
		entry->cred = NULL;
		vm_map_lock_downgrade(map);
	}

	/*
	 * Return the object/offset from this entry.  If the entry was
	 * copy-on-write or empty, it has been fixed up.  The map is left
	 * read-locked for the caller.
	 */
	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
	*object = entry->object.vm_object;

	*out_prot = prot;
	return (KERN_SUCCESS);
}
5136
5137 /*
5138 * vm_map_lookup_locked:
5139 *
5140 * Lookup the faulting address. A version of vm_map_lookup that returns
5141 * KERN_FAILURE instead of blocking on map lock or memory allocation.
5142 */
5143 int
vm_map_lookup_locked(vm_map_t * var_map,vm_offset_t vaddr,vm_prot_t fault_typea,vm_map_entry_t * out_entry,vm_object_t * object,vm_pindex_t * pindex,vm_prot_t * out_prot,boolean_t * wired)5144 vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */
5145 vm_offset_t vaddr,
5146 vm_prot_t fault_typea,
5147 vm_map_entry_t *out_entry, /* OUT */
5148 vm_object_t *object, /* OUT */
5149 vm_pindex_t *pindex, /* OUT */
5150 vm_prot_t *out_prot, /* OUT */
5151 boolean_t *wired) /* OUT */
5152 {
5153 vm_map_entry_t entry;
5154 vm_map_t map = *var_map;
5155 vm_prot_t prot;
5156 vm_prot_t fault_type = fault_typea;
5157
5158 /*
5159 * Lookup the faulting address.
5160 */
5161 if (!vm_map_lookup_entry(map, vaddr, out_entry))
5162 return (KERN_INVALID_ADDRESS);
5163
5164 entry = *out_entry;
5165
5166 /*
5167 * Fail if the entry refers to a submap.
5168 */
5169 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
5170 return (KERN_FAILURE);
5171
5172 /*
5173 * Check whether this task is allowed to have this page.
5174 */
5175 prot = entry->protection;
5176 fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
5177 if ((fault_type & prot) != fault_type)
5178 return (KERN_PROTECTION_FAILURE);
5179
5180 /*
5181 * If this page is not pageable, we have to get it for all possible
5182 * accesses.
5183 */
5184 *wired = (entry->wired_count != 0);
5185 if (*wired)
5186 fault_type = entry->protection;
5187
5188 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
5189 /*
5190 * Fail if the entry was copy-on-write for a write fault.
5191 */
5192 if (fault_type & VM_PROT_WRITE)
5193 return (KERN_FAILURE);
5194 /*
5195 * We're attempting to read a copy-on-write page --
5196 * don't allow writes.
5197 */
5198 prot &= ~VM_PROT_WRITE;
5199 }
5200
5201 /*
5202 * Fail if an object should be created.
5203 */
5204 if (entry->object.vm_object == NULL && !vm_map_is_system(map))
5205 return (KERN_FAILURE);
5206
5207 /*
5208 * Return the object/offset from this entry. If the entry was
5209 * copy-on-write or empty, it has been fixed up.
5210 */
5211 *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
5212 *object = entry->object.vm_object;
5213
5214 *out_prot = prot;
5215 return (KERN_SUCCESS);
5216 }
5217
5218 /*
5219 * vm_map_lookup_done:
5220 *
5221 * Releases locks acquired by a vm_map_lookup
5222 * (according to the handle returned by that lookup).
5223 */
5224 void
vm_map_lookup_done(vm_map_t map,vm_map_entry_t entry)5225 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
5226 {
5227 /*
5228 * Unlock the main-level map
5229 */
5230 vm_map_unlock_read(map);
5231 }
5232
5233 vm_offset_t
vm_map_max_KBI(const struct vm_map * map)5234 vm_map_max_KBI(const struct vm_map *map)
5235 {
5236
5237 return (vm_map_max(map));
5238 }
5239
5240 vm_offset_t
vm_map_min_KBI(const struct vm_map * map)5241 vm_map_min_KBI(const struct vm_map *map)
5242 {
5243
5244 return (vm_map_min(map));
5245 }
5246
5247 pmap_t
vm_map_pmap_KBI(vm_map_t map)5248 vm_map_pmap_KBI(vm_map_t map)
5249 {
5250
5251 return (map->pmap);
5252 }
5253
5254 bool
vm_map_range_valid_KBI(vm_map_t map,vm_offset_t start,vm_offset_t end)5255 vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end)
5256 {
5257
5258 return (vm_map_range_valid(map, start, end));
5259 }
5260
#ifdef INVARIANTS
/*
 * Sanity-check the structure of "map"; compiled only with INVARIANTS.
 *
 * With DIAGNOSTIC the map's update counter is bumped on every call.  The
 * checks themselves run only when "check" equals the enable_vmmap_check
 * setting.  For each entry, in list order, the following is asserted:
 *   - it does not start before the previous entry ends;
 *   - it covers a non-empty range;
 *   - its left and right links, unless they point at the header, refer
 *     to entries that start before and after it, respectively;
 *   - a search from the root by start address arrives at this entry;
 *   - its cached max_free equals the larger of the free-space values
 *     computed for its left and right sides.
 */
static void
_vm_map_assert_consistent(vm_map_t map, int check)
{
	vm_map_entry_t entry, header, hi, lo, node, prev;
	vm_size_t free_left, free_right;

#ifdef DIAGNOSTIC
	++map->nupdates;
#endif
	if (enable_vmmap_check != check)
		return;

	header = &map->header;
	prev = header;
	VM_MAP_ENTRY_FOREACH(entry, map) {
		/* Ordering and non-emptiness of the entry itself. */
		KASSERT(prev->end <= entry->start,
		    ("map %p prev->end = %jx, start = %jx", map,
		    (uintmax_t)prev->end, (uintmax_t)entry->start));
		KASSERT(entry->start < entry->end,
		    ("map %p start = %jx, end = %jx", map,
		    (uintmax_t)entry->start, (uintmax_t)entry->end));

		/* Ordering with respect to the entry's own links. */
		KASSERT(entry->left == header ||
		    entry->left->start < entry->start,
		    ("map %p left->start = %jx, start = %jx", map,
		    (uintmax_t)entry->left->start, (uintmax_t)entry->start));
		KASSERT(entry->right == header ||
		    entry->start < entry->right->start,
		    ("map %p start = %jx, right->start = %jx", map,
		    (uintmax_t)entry->start, (uintmax_t)entry->right->start));

		/*
		 * Walk down from the root towards entry->start, tracking
		 * the last node passed on each side ("lo" below, "hi"
		 * above).  Stepping onto the opposite bound means the
		 * search has run out of nodes without finding the entry.
		 */
		lo = hi = header;
		node = map->root;
		while (entry->start < node->start ||
		    node->end <= entry->start) {
			if (entry->start < node->start) {
				hi = node;
				node = node->left;
				KASSERT(node != lo,
				    ("map %p cannot find %jx",
				    map, (uintmax_t)entry->start));
			} else {
				lo = node;
				node = node->right;
				KASSERT(node != hi,
				    ("map %p cannot find %jx",
				    map, (uintmax_t)entry->start));
			}
		}
		/* The node containing entry->start must be the entry. */
		KASSERT(node == entry,
		    ("map %p cannot find %jx",
		    map, (uintmax_t)entry->start));

		/* The cached free-space summary must match a recomputation. */
		free_left = vm_map_entry_max_free_left(entry, lo);
		free_right = vm_map_entry_max_free_right(entry, hi);
		KASSERT(entry->max_free == vm_size_max(free_left, free_right),
		    ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
		    (uintmax_t)entry->max_free,
		    (uintmax_t)free_left, (uintmax_t)free_right));
		prev = entry;
	}

	/*
	 * Same ordering check for the last entry against wherever the
	 * iteration stopped.
	 * NOTE(review): presumably "entry" is the map header at this point
	 * -- confirm against VM_MAP_ENTRY_FOREACH.
	 */
	KASSERT(prev->end <= entry->start,
	    ("map %p prev->end = %jx, start = %jx", map,
	    (uintmax_t)prev->end, (uintmax_t)entry->start));
}
#endif
5326
5327 #include "opt_ddb.h"
5328 #ifdef DDB
5329 #include <sys/kernel.h>
5330
5331 #include <ddb/ddb.h>
5332
5333 static void
vm_map_print(vm_map_t map)5334 vm_map_print(vm_map_t map)
5335 {
5336 vm_map_entry_t entry, prev;
5337
5338 db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
5339 (void *)map,
5340 (void *)map->pmap, map->nentries, map->timestamp);
5341
5342 db_indent += 2;
5343 prev = &map->header;
5344 VM_MAP_ENTRY_FOREACH(entry, map) {
5345 db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
5346 (void *)entry, (void *)entry->start, (void *)entry->end,
5347 entry->eflags);
5348 {
5349 static const char * const inheritance_name[4] =
5350 {"share", "copy", "none", "donate_copy"};
5351
5352 db_iprintf(" prot=%x/%x/%s",
5353 entry->protection,
5354 entry->max_protection,
5355 inheritance_name[(int)(unsigned char)
5356 entry->inheritance]);
5357 if (entry->wired_count != 0)
5358 db_printf(", wired");
5359 }
5360 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
5361 db_printf(", share=%p, offset=0x%jx\n",
5362 (void *)entry->object.sub_map,
5363 (uintmax_t)entry->offset);
5364 if (prev == &map->header ||
5365 prev->object.sub_map !=
5366 entry->object.sub_map) {
5367 db_indent += 2;
5368 vm_map_print((vm_map_t)entry->object.sub_map);
5369 db_indent -= 2;
5370 }
5371 } else {
5372 if (entry->cred != NULL)
5373 db_printf(", ruid %d", entry->cred->cr_ruid);
5374 db_printf(", object=%p, offset=0x%jx",
5375 (void *)entry->object.vm_object,
5376 (uintmax_t)entry->offset);
5377 if (entry->object.vm_object && entry->object.vm_object->cred)
5378 db_printf(", obj ruid %d charge %jx",
5379 entry->object.vm_object->cred->cr_ruid,
5380 (uintmax_t)entry->object.vm_object->charge);
5381 if (entry->eflags & MAP_ENTRY_COW)
5382 db_printf(", copy (%s)",
5383 (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
5384 db_printf("\n");
5385
5386 if (prev == &map->header ||
5387 prev->object.vm_object !=
5388 entry->object.vm_object) {
5389 db_indent += 2;
5390 vm_object_print((db_expr_t)(intptr_t)
5391 entry->object.vm_object,
5392 0, 0, (char *)0);
5393 db_indent -= 2;
5394 }
5395 }
5396 prev = entry;
5397 }
5398 db_indent -= 2;
5399 }
5400
DB_SHOW_COMMAND(map,map)5401 DB_SHOW_COMMAND(map, map)
5402 {
5403
5404 if (!have_addr) {
5405 db_printf("usage: show map <addr>\n");
5406 return;
5407 }
5408 vm_map_print((vm_map_t)addr);
5409 }
5410
DB_SHOW_COMMAND(procvm,procvm)5411 DB_SHOW_COMMAND(procvm, procvm)
5412 {
5413 struct proc *p;
5414
5415 if (have_addr) {
5416 p = db_lookup_proc(addr);
5417 } else {
5418 p = curproc;
5419 }
5420
5421 db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
5422 (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
5423 (void *)vmspace_pmap(p->p_vmspace));
5424
5425 vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
5426 }
5427
5428 #endif /* DDB */
5429