xref: /dragonfly/sys/vm/vm_map.c (revision c936cb6fb98d3c40c4f9174ee19179e918a2b544)
1 /*
2  * Copyright (c) 1991, 1993
3  *        The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 2003-2022 The DragonFly Project.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * The Mach Operating System project at Carnegie-Mellon University.
8  *
9  * This code is derived from software contributed to The DragonFly Project
10  * by Matthew Dillon <dillon@backplane.com>
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *        from: @(#)vm_map.c  8.3 (Berkeley) 1/12/94
37  *
38  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
39  * All rights reserved.
40  *
41  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
42  *
43  * Permission to use, copy, modify and distribute this software and
44  * its documentation is hereby granted, provided that both the copyright
45  * notice and this permission notice appear in all copies of the
46  * software, derivative works or modified versions, and any portions
47  * thereof, and that both notices appear in supporting documentation.
48  *
49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52  *
53  * Carnegie Mellon requests users of this software to return to
54  *
55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
56  *  School of Computer Science
57  *  Carnegie Mellon University
58  *  Pittsburgh PA 15213-3890
59  *
60  * any improvements or extensions that they make and grant Carnegie the
61  * rights to redistribute these changes.
62  */
63 #include <sys/param.h>
64 #include <sys/systm.h>
65 #include <sys/kernel.h>
66 #include <sys/proc.h>
67 #include <sys/serialize.h>
68 #include <sys/lock.h>
69 #include <sys/vmmeter.h>
70 #include <sys/mman.h>
71 #include <sys/vnode.h>
72 #include <sys/resourcevar.h>
73 #include <sys/shm.h>
74 #include <sys/tree.h>
75 #include <sys/malloc.h>
76 #include <sys/objcache.h>
77 #include <sys/kern_syscall.h>
78 
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_map.h>
83 #include <vm/vm_page.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_pager.h>
86 #include <vm/vm_kern.h>
87 #include <vm/vm_extern.h>
88 #include <vm/swap_pager.h>
89 #include <vm/vm_zone.h>
90 
91 #include <sys/random.h>
92 #include <sys/sysctl.h>
93 #include <sys/spinlock.h>
94 
95 #include <sys/thread2.h>
96 #include <sys/spinlock2.h>
97 
98 /*
99  * Virtual memory maps provide for the mapping, protection, and sharing
100  * of virtual memory objects.  In addition, this module provides for an
101  * efficient virtual copy of memory from one map to another.
102  *
103  * Synchronization is required prior to most operations.
104  *
105  * Maps consist of an ordered doubly-linked list of simple entries.
106  * A hint and an RB tree are used to speed up lookups.
107  *
108  * Callers looking to modify maps specify start/end addresses which cause
109  * the related map entry to be clipped if necessary, and then later
110  * recombined if the pieces remained compatible.
111  *
112  * Virtual copy operations are performed by copying VM object references
113  * from one map to another, and then marking both regions as copy-on-write.
114  */
/* Forward declarations: vmspace objcache callbacks and two-stage teardown. */
115 static boolean_t vmspace_ctor(void *obj, void *privdata, int ocflags);
116 static void vmspace_dtor(void *obj, void *privdata);
117 static void vmspace_terminate(struct vmspace *vm, int final);
118 
/*
 * Malloc types.  M_VMSPACE is handed to objcache_create_mbacked() in
 * vm_init2(); M_MAP_BACKING is used for the vm_map_backing copy allocated
 * in vm_map_entry_shadow().  vmspace_cache is created in vm_init2().
 */
119 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
120 MALLOC_DEFINE(M_MAP_BACKING, "map_backing", "vm_map_backing to entry");
121 static struct objcache *vmspace_cache;
122 
123 /*
124  * per-cpu page table cross mappings are initialized in early boot
125  * and might require a considerable number of vm_map_entry structures.
 *
 * These two constants size the static cpu_map_entry_init_bsp[] and
 * cpu_map_entry_init_ap[][] arrays declared below.
126  */
127 #define MAPENTRYBSP_CACHE     (MAXCPU+1)
128 #define MAPENTRYAP_CACHE      8
129 
130 /*
131  * Partitioning threaded programs with large anonymous memory areas can
132  * improve concurrent fault performance.
 *
 * The partition size is 32MB.  VM_MAP_ENTRY_WITHIN_PARTITION() is true
 * when ba.start and ba.end agree in all bits above the partition mask.
133  */
134 #define MAP_ENTRY_PARTITION_SIZE        ((vm_offset_t)(32 * 1024 * 1024))
135 #define MAP_ENTRY_PARTITION_MASK        (MAP_ENTRY_PARTITION_SIZE - 1)
136 
137 #define VM_MAP_ENTRY_WITHIN_PARTITION(entry)      \
138           ((((entry)->ba.start ^ (entry)->ba.end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
139 
/*
 * Zone used for vm_map_entry structures.  mapentzone is pointed at
 * mapentzone_store and bootstrapped from map_entry_init[] by
 * vm_map_startup().
 */
140 static struct vm_zone mapentzone_store;
141 __read_mostly static vm_zone_t mapentzone;
142 
/* Static vm_map_entry storage (zone bootstrap pool and per-cpu pools). */
143 static struct vm_map_entry map_entry_init[MAX_MAPENT];
144 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
145 static struct vm_map_entry cpu_map_entry_init_ap[MAXCPU][MAPENTRYAP_CACHE];
146 
/*
 * Read/write integer tunables registered under the _vm sysctl node.
 * randomize_mmap has no initializer (static storage, so it starts at 0);
 * the remaining tunables carry the explicit defaults shown.
 */
147 __read_mostly static int randomize_mmap;
148 SYSCTL_INT(_vm, OID_AUTO, randomize_mmap, CTLFLAG_RW, &randomize_mmap, 0,
149     "Randomize mmap offsets");
150 __read_mostly static int vm_map_relock_enable = 1;
151 SYSCTL_INT(_vm, OID_AUTO, map_relock_enable, CTLFLAG_RW,
152              &vm_map_relock_enable, 0, "insert pop pgtable optimization");
153 __read_mostly static int vm_map_partition_enable = 1;
154 SYSCTL_INT(_vm, OID_AUTO, map_partition_enable, CTLFLAG_RW,
155              &vm_map_partition_enable, 0, "Break up larger vm_map_entry's");
156 __read_mostly static int vm_map_backing_limit = 5;
157 SYSCTL_INT(_vm, OID_AUTO, map_backing_limit, CTLFLAG_RW,
158              &vm_map_backing_limit, 0, "ba.backing_ba link depth");
159 __read_mostly static int vm_map_backing_shadow_test = 1;
160 SYSCTL_INT(_vm, OID_AUTO, map_backing_shadow_test, CTLFLAG_RW,
161              &vm_map_backing_shadow_test, 0, "ba.object shadow test");
162 
/* Forward declarations for file-local helpers defined later in this file. */
163 static void vmspace_drop_notoken(struct vmspace *vm);
164 static void vm_map_entry_shadow(vm_map_entry_t entry);
165 static vm_map_entry_t vm_map_entry_create(int *);
166 static void vm_map_entry_dispose (vm_map_t map, vm_map_entry_t entry, int *);
167 static void vm_map_entry_dispose_ba (vm_map_entry_t entry, vm_map_backing_t ba);
168 static void vm_map_backing_replicated(vm_map_t map,
169                     vm_map_entry_t entry, int flags);
170 static void vm_map_backing_adjust_start(vm_map_entry_t entry,
171                     vm_ooffset_t start);
172 static void vm_map_backing_adjust_end(vm_map_entry_t entry,
173                     vm_ooffset_t end);
174 static void vm_map_backing_attach (vm_map_entry_t entry, vm_map_backing_t ba);
175 static void vm_map_backing_detach (vm_map_entry_t entry, vm_map_backing_t ba);
176 static void _vm_map_clip_end (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
177 static void _vm_map_clip_start (vm_map_t, vm_map_entry_t, vm_offset_t, int *);
178 static void vm_map_entry_delete (vm_map_t, vm_map_entry_t, int *);
179 static void vm_map_entry_unwire (vm_map_t, vm_map_entry_t);
180 static void vm_map_copy_entry (vm_map_t, vm_map_t, vm_map_entry_t,
181                     vm_map_entry_t);
182 static void vm_map_unclip_range (vm_map_t map, vm_map_entry_t start_entry,
183                     vm_offset_t start, vm_offset_t end, int *countp, int flags);
184 static void vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
185                     vm_offset_t vaddr, int *countp);
186 
/*
 * Flag bits.  NOTE(review): presumably these are the values accepted by the
 * 'flags' argument of vm_map_backing_replicated() -- confirm at the callers.
 */
187 #define MAP_BACK_CLIPPED      0x0001
188 #define MAP_BACK_BASEOBJREFD  0x0002
189 
190 /*
191  * Initialize the vm_map module.  Must be called before any other vm_map
192  * routines.
193  *
194  * Map and entry structures are allocated from the general purpose
195  * memory pool with some exceptions:
196  *
197  *        - The kernel map is allocated statically.
198  *        - Initial kernel map entries are allocated out of a static pool.
199  *        - We must set ZONE_SPECIAL here or the early boot code can get
200  *          stuck if there are >63 cores.
201  *
202  *        These restrictions are necessary since malloc() uses the
203  *        maps and requires map entries.
204  *
205  * Called from the low level boot code only.
206  */
207 void
vm_map_startup(void)208 vm_map_startup(void)
209 {
210           mapentzone = &mapentzone_store;
211           zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry),
212                       map_entry_init, MAX_MAPENT);
213           mapentzone_store.zflags |= ZONE_SPECIAL;
214 }
215 
216 /*
217  * Called prior to any vmspace allocations.
218  *
219  * Called from the low level boot code only.
220  */
221 void
vm_init2(void)222 vm_init2(void)
223 {
224           vmspace_cache = objcache_create_mbacked(M_VMSPACE,
225                                                             sizeof(struct vmspace),
226                                                             0, ncpus * 4,
227                                                             vmspace_ctor, vmspace_dtor,
228                                                             NULL);
229           zinitna(mapentzone, NULL, 0, 0, ZONE_USE_RESERVE | ZONE_SPECIAL);
230           pmap_init2();
231           vm_object_init2();
232 }
233 
234 /*
235  * objcache support.  We leave the pmap root cached as long as possible
236  * for performance reasons.
237  */
238 static
239 boolean_t
vmspace_ctor(void * obj,void * privdata,int ocflags)240 vmspace_ctor(void *obj, void *privdata, int ocflags)
241 {
242           struct vmspace *vm = obj;
243 
244           bzero(vm, sizeof(*vm));
245           vm->vm_refcnt = VM_REF_DELETED;
246 
247           return 1;
248 }
249 
250 static
251 void
vmspace_dtor(void * obj,void * privdata)252 vmspace_dtor(void *obj, void *privdata)
253 {
254           struct vmspace *vm = obj;
255 
256           KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
257           pmap_puninit(vmspace_pmap(vm));
258 }
259 
260 /*
261  * Red black tree functions
262  *
263  * The caller must hold the related map lock.
264  */
265 static int rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b);
266 RB_GENERATE(vm_map_rb_tree, vm_map_entry, rb_entry, rb_vm_map_compare);
267 
268 /* a->ba.start is address, and the only field which must be initialized */
269 static int
rb_vm_map_compare(vm_map_entry_t a,vm_map_entry_t b)270 rb_vm_map_compare(vm_map_entry_t a, vm_map_entry_t b)
271 {
272           if (a->ba.start < b->ba.start)
273                     return(-1);
274           else if (a->ba.start > b->ba.start)
275                     return(1);
276           return(0);
277 }
278 
279 /*
280  * Initialize vmspace ref/hold counts vmspace0.  There is a holdcnt for
281  * every refcnt.
282  */
283 void
vmspace_initrefs(struct vmspace * vm)284 vmspace_initrefs(struct vmspace *vm)
285 {
286           vm->vm_refcnt = 1;
287           vm->vm_holdcnt = 1;
288 }
289 
290 /*
291  * Allocate a vmspace structure, including a vm_map and pmap.
292  * Initialize numerous fields.  While the initial allocation is zerod,
293  * subsequence reuse from the objcache leaves elements of the structure
294  * intact (particularly the pmap), so portions must be zerod.
295  *
296  * Returns a referenced vmspace.
297  *
298  * No requirements.
299  */
300 struct vmspace *
vmspace_alloc(vm_offset_t min,vm_offset_t max)301 vmspace_alloc(vm_offset_t min, vm_offset_t max)
302 {
303           struct vmspace *vm;
304 
305           vm = objcache_get(vmspace_cache, M_WAITOK);
306 
307           bzero(&vm->vm_startcopy,
308                 (char *)&vm->vm_endcopy - (char *)&vm->vm_startcopy);
309           vm_map_init(&vm->vm_map, min, max, NULL);         /* initializes token */
310 
311           /*
312            * NOTE: hold to acquires token for safety.
313            *
314            * On return vmspace is referenced (refs=1, hold=1).  That is,
315            * each refcnt also has a holdcnt.  There can be additional holds
316            * (holdcnt) above and beyond the refcnt.  Finalization is handled in
317            * two stages, one on refs 1->0, and the the second on hold 1->0.
318            */
319           KKASSERT(vm->vm_holdcnt == 0);
320           KKASSERT(vm->vm_refcnt == VM_REF_DELETED);
321           vmspace_initrefs(vm);
322           vmspace_hold(vm);
323           pmap_pinit(vmspace_pmap(vm));           /* (some fields reused) */
324           vm->vm_map.pmap = vmspace_pmap(vm);     /* XXX */
325           vm->vm_shm = NULL;
326           vm->vm_flags = 0;
327           cpu_vmspace_alloc(vm);
328           vmspace_drop(vm);
329 
330           return (vm);
331 }
332 
333 /*
334  * NOTE: Can return 0 if the vmspace is exiting.
335  */
336 int
vmspace_getrefs(struct vmspace * vm)337 vmspace_getrefs(struct vmspace *vm)
338 {
339           int32_t n;
340 
341           n = vm->vm_refcnt;
342           cpu_ccfence();
343           if (n & VM_REF_DELETED)
344                     n = -1;
345           return n;
346 }
347 
348 void
vmspace_hold(struct vmspace * vm)349 vmspace_hold(struct vmspace *vm)
350 {
351           atomic_add_int(&vm->vm_holdcnt, 1);
352           lwkt_gettoken(&vm->vm_map.token);
353 }
354 
355 /*
356  * Drop with final termination interlock.
357  */
358 void
vmspace_drop(struct vmspace * vm)359 vmspace_drop(struct vmspace *vm)
360 {
361           lwkt_reltoken(&vm->vm_map.token);
362           vmspace_drop_notoken(vm);
363 }
364 
365 static void
vmspace_drop_notoken(struct vmspace * vm)366 vmspace_drop_notoken(struct vmspace *vm)
367 {
368           if (atomic_fetchadd_int(&vm->vm_holdcnt, -1) == 1) {
369                     if (vm->vm_refcnt & VM_REF_DELETED)
370                               vmspace_terminate(vm, 1);
371           }
372 }
373 
374 /*
375  * A vmspace object must not be in a terminated state to be able to obtain
376  * additional refs on it.
377  *
378  * These are official references to the vmspace, the count is used to check
379  * for vmspace sharing.  Foreign accessors should use 'hold' and not 'ref'.
380  *
381  * XXX we need to combine hold & ref together into one 64-bit field to allow
382  * holds to prevent stage-1 termination.
383  */
384 void
vmspace_ref(struct vmspace * vm)385 vmspace_ref(struct vmspace *vm)
386 {
387           uint32_t n;
388 
389           atomic_add_int(&vm->vm_holdcnt, 1);
390           n = atomic_fetchadd_int(&vm->vm_refcnt, 1);
391           KKASSERT((n & VM_REF_DELETED) == 0);
392 }
393 
394 /*
395  * Release a ref on the vmspace.  On the 1->0 transition we do stage-1
396  * termination of the vmspace.  Then, on the final drop of the hold we
397  * will do stage-2 final termination.
398  */
399 void
vmspace_rel(struct vmspace * vm)400 vmspace_rel(struct vmspace *vm)
401 {
402           uint32_t n;
403 
404           /*
405            * Drop refs.  Each ref also has a hold which is also dropped.
406            *
407            * When refs hits 0 compete to get the VM_REF_DELETED flag (hold
408            * prevent finalization) to start termination processing.
409            * Finalization occurs when the last hold count drops to 0.
410            */
411           n = atomic_fetchadd_int(&vm->vm_refcnt, -1) - 1;
412           while (n == 0) {
413                     if (atomic_cmpset_int(&vm->vm_refcnt, 0, VM_REF_DELETED)) {
414                               vmspace_terminate(vm, 0);
415                               break;
416                     }
417                     n = vm->vm_refcnt;
418                     cpu_ccfence();
419           }
420           vmspace_drop_notoken(vm);
421 }
422 
423 /*
424  * This is called during exit indicating that the vmspace is no
425  * longer in used by an exiting process, but the process has not yet
426  * been reaped.
427  *
428  * We drop refs, allowing for stage-1 termination, but maintain a holdcnt
429  * to prevent stage-2 until the process is reaped.  Note hte order of
430  * operation, we must hold first.
431  *
432  * No requirements.
433  */
434 void
vmspace_relexit(struct vmspace * vm)435 vmspace_relexit(struct vmspace *vm)
436 {
437           atomic_add_int(&vm->vm_holdcnt, 1);
438           vmspace_rel(vm);
439 }
440 
441 /*
442  * Called during reap to disconnect the remainder of the vmspace from
443  * the process.  On the hold drop the vmspace termination is finalized.
444  *
445  * No requirements.
446  */
447 void
vmspace_exitfree(struct proc * p)448 vmspace_exitfree(struct proc *p)
449 {
450           struct vmspace *vm;
451 
452           vm = p->p_vmspace;
453           p->p_vmspace = NULL;
454           vmspace_drop_notoken(vm);
455 }
456 
457 /*
458  * Called in two cases:
459  *
460  * (1) When the last refcnt is dropped and the vmspace becomes inactive,
461  *     called with final == 0.  refcnt will be (u_int)-1 at this point,
462  *     and holdcnt will still be non-zero.
463  *
464  * (2) When holdcnt becomes 0, called with final == 1.  There should no
465  *     longer be anyone with access to the vmspace.
466  *
467  * VMSPACE_EXIT1 flags the primary deactivation
468  * VMSPACE_EXIT2 flags the last reap
469  */
470 static void
vmspace_terminate(struct vmspace * vm,int final)471 vmspace_terminate(struct vmspace *vm, int final)
472 {
473           int count;
474 
475           lwkt_gettoken(&vm->vm_map.token);
476           if (final == 0) {
477                     KKASSERT((vm->vm_flags & VMSPACE_EXIT1) == 0);
478                     vm->vm_flags |= VMSPACE_EXIT1;
479 
480                     /*
481                      * Get rid of most of the resources.  Leave the kernel pmap
482                      * intact.
483                      *
484                      * If the pmap does not contain wired pages we can bulk-delete
485                      * the pmap as a performance optimization before removing the
486                      * related mappings.
487                      *
488                      * If the pmap contains wired pages we cannot do this
489                      * pre-optimization because currently vm_fault_unwire()
490                      * expects the pmap pages to exist and will not decrement
491                      * p->wire_count if they do not.
492                      */
493                     shmexit(vm);
494                     if (vmspace_pmap(vm)->pm_stats.wired_count) {
495                               vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
496                                               VM_MAX_USER_ADDRESS);
497                               pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
498                                                     VM_MAX_USER_ADDRESS);
499                     } else {
500                               pmap_remove_pages(vmspace_pmap(vm), VM_MIN_USER_ADDRESS,
501                                                     VM_MAX_USER_ADDRESS);
502                               vm_map_remove(&vm->vm_map, VM_MIN_USER_ADDRESS,
503                                               VM_MAX_USER_ADDRESS);
504                     }
505                     lwkt_reltoken(&vm->vm_map.token);
506           } else {
507                     KKASSERT((vm->vm_flags & VMSPACE_EXIT1) != 0);
508                     KKASSERT((vm->vm_flags & VMSPACE_EXIT2) == 0);
509 
510                     /*
511                      * Get rid of remaining basic resources.
512                      */
513                     vm->vm_flags |= VMSPACE_EXIT2;
514                     shmexit(vm);
515 
516                     count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
517                     vm_map_lock(&vm->vm_map);
518                     cpu_vmspace_free(vm);
519 
520                     /*
521                      * Lock the map, to wait out all other references to it.
522                      * Delete all of the mappings and pages they hold, then call
523                      * the pmap module to reclaim anything left.
524                      */
525                     vm_map_delete(&vm->vm_map,
526                                     vm_map_min(&vm->vm_map),
527                                     vm_map_max(&vm->vm_map),
528                                     &count);
529                     vm_map_unlock(&vm->vm_map);
530                     vm_map_entry_release(count);
531 
532                     pmap_release(vmspace_pmap(vm));
533                     lwkt_reltoken(&vm->vm_map.token);
534                     objcache_put(vmspace_cache, vm);
535           }
536 }
537 
538 /*
539  * Swap useage is determined by taking the proportional swap used by
540  * VM objects backing the VM map.  To make up for fractional losses,
541  * if the VM object has any swap use at all the associated map entries
542  * count for at least 1 swap page.
543  *
544  * No requirements.
545  */
546 vm_offset_t
vmspace_swap_count(struct vmspace * vm)547 vmspace_swap_count(struct vmspace *vm)
548 {
549           vm_map_t map = &vm->vm_map;
550           vm_map_entry_t cur;
551           vm_object_t object;
552           vm_offset_t count = 0;
553           vm_offset_t n;
554 
555           vmspace_hold(vm);
556 
557           RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
558                     switch(cur->maptype) {
559                     case VM_MAPTYPE_NORMAL:
560                               if ((object = cur->ba.object) == NULL)
561                                         break;
562                               if (object->swblock_count) {
563                                         n = (cur->ba.end - cur->ba.start) / PAGE_SIZE;
564                                         count += object->swblock_count *
565                                             SWAP_META_PAGES * n / object->size + 1;
566                               }
567                               break;
568                     default:
569                               break;
570                     }
571           }
572           vmspace_drop(vm);
573 
574           return(count);
575 }
576 
577 /*
578  * Calculate the approximate number of anonymous pages in use by
579  * this vmspace.  To make up for fractional losses, we count each
580  * VM object as having at least 1 anonymous page.
581  *
582  * No requirements.
583  */
584 vm_offset_t
vmspace_anonymous_count(struct vmspace * vm)585 vmspace_anonymous_count(struct vmspace *vm)
586 {
587           vm_map_t map = &vm->vm_map;
588           vm_map_entry_t cur;
589           vm_object_t object;
590           vm_offset_t count = 0;
591 
592           vmspace_hold(vm);
593           RB_FOREACH(cur, vm_map_rb_tree, &map->rb_root) {
594                     switch(cur->maptype) {
595                     case VM_MAPTYPE_NORMAL:
596                               if ((object = cur->ba.object) == NULL)
597                                         break;
598                               if (object->type != OBJT_DEFAULT &&
599                                   object->type != OBJT_SWAP) {
600                                         break;
601                               }
602                               count += object->resident_page_count;
603                               break;
604                     default:
605                               break;
606                     }
607           }
608           vmspace_drop(vm);
609 
610           return(count);
611 }
612 
613 /*
614  * Initialize an existing vm_map structure such as that in the vmspace
615  * structure.  The pmap is initialized elsewhere.
616  *
617  * No requirements.
618  */
619 void
vm_map_init(struct vm_map * map,vm_offset_t min_addr,vm_offset_t max_addr,pmap_t pmap)620 vm_map_init(struct vm_map *map, vm_offset_t min_addr, vm_offset_t max_addr,
621               pmap_t pmap)
622 {
623           RB_INIT(&map->rb_root);
624           spin_init(&map->ilock_spin, "ilock");
625           map->ilock_base = NULL;
626           map->nentries = 0;
627           map->size = 0;
628           map->system_map = 0;
629           vm_map_min(map) = min_addr;
630           vm_map_max(map) = max_addr;
631           map->pmap = pmap;
632           map->timestamp = 0;
633           map->flags = 0;
634           bzero(&map->freehint, sizeof(map->freehint));
635           lwkt_token_init(&map->token, "vm_map");
636           lockinit(&map->lock, "vm_maplk", (hz + 9) / 10, 0);
637 }
638 
639 /*
640  * Find the first possible free address for the specified request length.
641  * Returns 0 if we don't have one cached.
642  */
643 static
644 vm_offset_t
vm_map_freehint_find(vm_map_t map,vm_size_t length,vm_size_t align)645 vm_map_freehint_find(vm_map_t map, vm_size_t length, vm_size_t align)
646 {
647           vm_map_freehint_t *scan;
648 
649           scan = &map->freehint[0];
650           while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
651                     if (scan->length == length && scan->align == align)
652                               return(scan->start);
653                     ++scan;
654           }
655           return 0;
656 }
657 
658 /*
659  * Unconditionally set the freehint.  Called by vm_map_findspace() after
660  * it finds an address.  This will help us iterate optimally on the next
661  * similar findspace.
662  */
663 static
664 void
vm_map_freehint_update(vm_map_t map,vm_offset_t start,vm_size_t length,vm_size_t align)665 vm_map_freehint_update(vm_map_t map, vm_offset_t start,
666                            vm_size_t length, vm_size_t align)
667 {
668           vm_map_freehint_t *scan;
669 
670           scan = &map->freehint[0];
671           while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
672                     if (scan->length == length && scan->align == align) {
673                               scan->start = start;
674                               return;
675                     }
676                     ++scan;
677           }
678           scan = &map->freehint[map->freehint_newindex & VM_MAP_FFMASK];
679           scan->start = start;
680           scan->align = align;
681           scan->length = length;
682           ++map->freehint_newindex;
683 }
684 
685 /*
686  * Update any existing freehints (for any alignment), for the hole we just
687  * added.
688  */
689 static
690 void
vm_map_freehint_hole(vm_map_t map,vm_offset_t start,vm_size_t length)691 vm_map_freehint_hole(vm_map_t map, vm_offset_t start, vm_size_t length)
692 {
693           vm_map_freehint_t *scan;
694 
695           scan = &map->freehint[0];
696           while (scan < &map->freehint[VM_MAP_FFCOUNT]) {
697                     if (scan->length <= length && scan->start > start)
698                               scan->start = start;
699                     ++scan;
700           }
701 }
702 
703 /*
704  * This function handles MAP_ENTRY_NEEDS_COPY by inserting a fronting
705  * object in the entry for COW faults.
706  *
707  * The entire chain including entry->ba (prior to inserting the fronting
708  * object) essentially becomes set in stone... elements of it can be paged
709  * in or out, but cannot be further modified.
710  *
711  * NOTE: If we do not optimize the backing chain then a unique copy is not
712  *         needed.  Note, however, that because portions of the chain are
713  *         shared across pmaps we cannot make any changes to the vm_map_backing
714  *         elements themselves.
715  *
716  * If the map segment is governed by a virtual page table then it is
717  * possible to address offsets beyond the mapped area.  Just allocate
718  * a maximally sized object for this case.
719  *
720  * If addref is non-zero an additional reference is added to the returned
721  * entry.  This mechanic exists because the additional reference might have
722  * to be added atomically and not after return to prevent a premature
723  * collapse.  XXX currently there is no collapse code.
724  *
725  * The vm_map must be exclusively locked.
726  * No other requirements.
727  */
728 static
729 void
vm_map_entry_shadow(vm_map_entry_t entry)730 vm_map_entry_shadow(vm_map_entry_t entry)
731 {
732           vm_map_backing_t ba;
733           vm_size_t length;
734           vm_object_t source;
735           vm_object_t result;
736 
737           /*
738            * Number of bytes we have to shadow
739            */
740           length = atop(entry->ba.end - entry->ba.start);
741 
742           /*
743            * Don't create the new object if the old object isn't shared.
744            * This case occurs quite often when programs fork/exec/wait.
745            *
746            * Caller ensures source exists (all backing_ba's must have objects),
747            * typically indirectly by virtue of the NEEDS_COPY flag being set.
748            * We have a ref on source by virtue of the entry and do not need
749            * to lock it to do this test.
750            */
751           source = entry->ba.object;
752           KKASSERT(source);
753 
754           if (source->type != OBJT_VNODE) {
755                     if (source->ref_count == 1 &&
756                         source->handle == NULL &&
757                         (source->type == OBJT_DEFAULT ||
758                          source->type == OBJT_SWAP)) {
759                               goto done;
760                     }
761           }
762           ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
763           vm_object_hold_shared(source);
764 
765           /*
766            * Once it becomes part of a backing_ba chain it can wind up anywhere,
767            * drop the ONEMAPPING flag now.
768            */
769           vm_object_clear_flag(source, OBJ_ONEMAPPING);
770 
771           /*
772            * Allocate a new object with the given length.  The new object
773            * is returned referenced but we may have to add another one.
774            * If we are adding a second reference we must clear OBJ_ONEMAPPING.
775            * (typically because the caller is about to clone a vm_map_entry).
776            *
777            * The source object currently has an extra reference to prevent
778            * collapses into it while we mess with its shadow list, which
779            * we will remove later in this routine.
780            *
781            * The target object may require a second reference if asked for one
782            * by the caller.
783            */
784           result = vm_object_allocate_hold(OBJT_DEFAULT, length);
785           if (result == NULL)
786                     panic("vm_object_shadow: no object for shadowing");
787 
788           /*
789            * The new object shadows the source object.
790            *
791            * Try to optimize the result object's page color when shadowing
792            * in order to maintain page coloring consistency in the combined
793            * shadowed object.
794            *
795            * The source object is moved to ba, retaining its existing ref-count.
796            * No additional ref is needed.
797            *
798            * SHADOWING IS NOT APPLICABLE TO OBJT_VNODE OBJECTS
799            */
800           vm_map_backing_detach(entry, &entry->ba);
801           *ba = entry->ba;              /* previous ba */
802           entry->ba.object = result;    /* new ba (at head of entry) */
803           entry->ba.backing_ba = ba;
804           entry->ba.backing_count = ba->backing_count + 1;
805           entry->ba.offset = 0;
806 
807           /* cpu localization twist */
808           result->pg_color = vm_quickcolor();
809 
810           vm_map_backing_attach(entry, &entry->ba);
811           vm_map_backing_attach(entry, ba);
812 
813           /*
814            * Adjust the return storage.  Drop the ref on source before
815            * returning.
816            */
817           vm_object_drop(result);
818           vm_object_drop(source);
819 done:
820           entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
821 }
822 
823 /*
824  * Allocate an object for a vm_map_entry.
825  *
826  * Object allocation for anonymous mappings is defered as long as possible.
827  * This function is called when we can defer no longer, generally when a map
828  * entry might be split or forked or takes a page fault.
829  *
830  * If the map segment is governed by a virtual page table then it is
831  * possible to address offsets beyond the mapped area.  Just allocate
832  * a maximally sized object for this case.
833  *
834  * The vm_map must be exclusively locked.
835  * No other requirements.
836  */
837 void
vm_map_entry_allocate_object(vm_map_entry_t entry)838 vm_map_entry_allocate_object(vm_map_entry_t entry)
839 {
840           vm_object_t obj;
841 
842           /*
843            * ba.offset is NOT cumulatively added in the backing_ba scan like
844            * it was in the old object chain, so we can assign whatever offset
845            * we like to the new object.
846            *
847            * For now assign a value of 0 to make debugging object sizes
848            * easier.
849            */
850           entry->ba.offset = 0;
851 
852           obj = vm_object_allocate(OBJT_DEFAULT,
853                                          atop(entry->ba.end - entry->ba.start) +
854                                          entry->ba.offset);
855           entry->ba.object = obj;
856           vm_map_backing_attach(entry, &entry->ba);
857 }
858 
859 /*
860  * Set an initial negative count so the first attempt to reserve
861  * space preloads a bunch of vm_map_entry's for this cpu.  Also
862  * pre-allocate 2 vm_map_entries which will be needed by zalloc() to
863  * map a new page for vm_map_entry structures.  SMP systems are
864  * particularly sensitive.
865  *
866  * This routine is called in early boot so we cannot just call
867  * vm_map_entry_reserve().
868  *
869  * Called from the low level boot code only (for each cpu)
870  *
871  * WARNING! Take care not to have too-big a static/BSS structure here
872  *            as MAXCPU can be 256+, otherwise the loader's 64MB heap
873  *            can get blown out by the kernel plus the initrd image.
874  */
875 void
vm_map_entry_reserve_cpu_init(globaldata_t gd)876 vm_map_entry_reserve_cpu_init(globaldata_t gd)
877 {
878           vm_map_entry_t entry;
879           int count;
880           int i;
881 
882           atomic_add_int(&gd->gd_vme_avail, -MAP_RESERVE_COUNT * 2);
883           if (gd->gd_cpuid == 0) {
884                     entry = &cpu_map_entry_init_bsp[0];
885                     count = MAPENTRYBSP_CACHE;
886           } else {
887                     entry = &cpu_map_entry_init_ap[gd->gd_cpuid][0];
888                     count = MAPENTRYAP_CACHE;
889           }
890           for (i = 0; i < count; ++i, ++entry) {
891                     MAPENT_FREELIST(entry) = gd->gd_vme_base;
892                     gd->gd_vme_base = entry;
893           }
894 }
895 
896 /*
897  * Reserves vm_map_entry structures so code later-on can manipulate
898  * map_entry structures within a locked map without blocking trying
899  * to allocate a new vm_map_entry.
900  *
901  * No requirements.
902  *
903  * WARNING!  We must not decrement gd_vme_avail until after we have
904  *             ensured that sufficient entries exist, otherwise we can
905  *             get into an endless call recursion in the zalloc code
906  *             itself.
907  */
908 int
vm_map_entry_reserve(int count)909 vm_map_entry_reserve(int count)
910 {
911           struct globaldata *gd = mycpu;
912           vm_map_entry_t entry;
913 
914           /*
915            * Make sure we have enough structures in gd_vme_base to handle
916            * the reservation request.
917            *
918            * Use a critical section to protect against VM faults.  It might
919            * not be needed, but we have to be careful here.
920            */
921           if (gd->gd_vme_avail < count) {
922                     crit_enter();
923                     while (gd->gd_vme_avail < count) {
924                               entry = zalloc(mapentzone);
925                               MAPENT_FREELIST(entry) = gd->gd_vme_base;
926                               gd->gd_vme_base = entry;
927                               atomic_add_int(&gd->gd_vme_avail, 1);
928                     }
929                     crit_exit();
930           }
931           atomic_add_int(&gd->gd_vme_avail, -count);
932 
933           return(count);
934 }
935 
936 /*
937  * Releases previously reserved vm_map_entry structures that were not
938  * used.  If we have too much junk in our per-cpu cache clean some of
939  * it out.
940  *
941  * No requirements.
942  */
943 void
vm_map_entry_release(int count)944 vm_map_entry_release(int count)
945 {
946           struct globaldata *gd = mycpu;
947           vm_map_entry_t entry;
948           vm_map_entry_t efree;
949 
950           count = atomic_fetchadd_int(&gd->gd_vme_avail, count) + count;
951           if (gd->gd_vme_avail > MAP_RESERVE_SLOP) {
952                     efree = NULL;
953                     crit_enter();
954                     while (gd->gd_vme_avail > MAP_RESERVE_HYST) {
955                               entry = gd->gd_vme_base;
956                               KKASSERT(entry != NULL);
957                               gd->gd_vme_base = MAPENT_FREELIST(entry);
958                               atomic_add_int(&gd->gd_vme_avail, -1);
959                               MAPENT_FREELIST(entry) = efree;
960                               efree = entry;
961                     }
962                     crit_exit();
963                     while ((entry = efree) != NULL) {
964                               efree = MAPENT_FREELIST(efree);
965                               zfree(mapentzone, entry);
966                     }
967           }
968 }
969 
970 /*
971  * Reserve map entry structures for use in kernel_map itself.  These
972  * entries have *ALREADY* been reserved on a per-cpu basis when the map
973  * was inited.  This function is used by zalloc() to avoid a recursion
974  * when zalloc() itself needs to allocate additional kernel memory.
975  *
976  * This function works like the normal reserve but does not load the
977  * vm_map_entry cache (because that would result in an infinite
978  * recursion).  Note that gd_vme_avail may go negative.  This is expected.
979  *
980  * Any caller of this function must be sure to renormalize after
981  * potentially eating entries to ensure that the reserve supply
982  * remains intact.
983  *
984  * No requirements.
985  */
986 int
vm_map_entry_kreserve(int count)987 vm_map_entry_kreserve(int count)
988 {
989           struct globaldata *gd = mycpu;
990 
991           atomic_add_int(&gd->gd_vme_avail, -count);
992           KASSERT(gd->gd_vme_base != NULL,
993                     ("no reserved entries left, gd_vme_avail = %d",
994                     gd->gd_vme_avail));
995           return(count);
996 }
997 
998 /*
999  * Release previously reserved map entries for kernel_map.  We do not
1000  * attempt to clean up like the normal release function as this would
1001  * cause an unnecessary (but probably not fatal) deep procedure call.
1002  *
1003  * No requirements.
1004  */
1005 void
vm_map_entry_krelease(int count)1006 vm_map_entry_krelease(int count)
1007 {
1008           struct globaldata *gd = mycpu;
1009 
1010           atomic_add_int(&gd->gd_vme_avail, count);
1011 }
1012 
1013 /*
1014  * Allocates a VM map entry for insertion.  No entry fields are filled in.
1015  *
1016  * The entries should have previously been reserved.  The reservation count
1017  * is tracked in (*countp).
1018  *
1019  * No requirements.
1020  */
1021 static vm_map_entry_t
vm_map_entry_create(int * countp)1022 vm_map_entry_create(int *countp)
1023 {
1024           struct globaldata *gd = mycpu;
1025           vm_map_entry_t entry;
1026 
1027           KKASSERT(*countp > 0);
1028           --*countp;
1029           crit_enter();
1030           entry = gd->gd_vme_base;
1031           KASSERT(entry != NULL, ("gd_vme_base NULL! count %d", *countp));
1032           gd->gd_vme_base = MAPENT_FREELIST(entry);
1033           crit_exit();
1034 
1035           return(entry);
1036 }
1037 
1038 /*
1039  * Attach and detach backing store elements
1040  */
1041 static void
vm_map_backing_attach(vm_map_entry_t entry,vm_map_backing_t ba)1042 vm_map_backing_attach(vm_map_entry_t entry, vm_map_backing_t ba)
1043 {
1044           vm_object_t obj;
1045 
1046           switch(entry->maptype) {
1047           case VM_MAPTYPE_NORMAL:
1048                     obj = ba->object;
1049                     lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1050                     TAILQ_INSERT_TAIL(&obj->backing_list, ba, entry);
1051                     lockmgr(&obj->backing_lk, LK_RELEASE);
1052                     break;
1053           case VM_MAPTYPE_UKSMAP:
1054                     ba->uksmap(ba, UKSMAPOP_ADD, entry->aux.dev, NULL);
1055                     break;
1056           }
1057 }
1058 
1059 static void
vm_map_backing_detach(vm_map_entry_t entry,vm_map_backing_t ba)1060 vm_map_backing_detach(vm_map_entry_t entry, vm_map_backing_t ba)
1061 {
1062           vm_object_t obj;
1063 
1064           switch(entry->maptype) {
1065           case VM_MAPTYPE_NORMAL:
1066                     obj = ba->object;
1067                     lockmgr(&obj->backing_lk, LK_EXCLUSIVE);
1068                     TAILQ_REMOVE(&obj->backing_list, ba, entry);
1069                     lockmgr(&obj->backing_lk, LK_RELEASE);
1070                     break;
1071           case VM_MAPTYPE_UKSMAP:
1072                     ba->uksmap(ba, UKSMAPOP_REM, entry->aux.dev, NULL);
1073                     break;
1074           }
1075 }
1076 
1077 /*
1078  * Dispose of the dynamically allocated backing_ba chain associated
1079  * with a vm_map_entry.
1080  *
1081  * We decrement the (possibly shared) element and kfree() on the
1082  * 1->0 transition.  We only iterate to the next backing_ba when
1083  * the previous one went through a 1->0 transition.
1084  *
1085  * These can only be normal vm_object based backings.
1086  */
1087 static void
vm_map_entry_dispose_ba(vm_map_entry_t entry,vm_map_backing_t ba)1088 vm_map_entry_dispose_ba(vm_map_entry_t entry, vm_map_backing_t ba)
1089 {
1090           vm_map_backing_t next;
1091 
1092           while (ba) {
1093                     if (ba->map_object) {
1094                               vm_map_backing_detach(entry, ba);
1095                               vm_object_deallocate(ba->object);
1096                     }
1097                     next = ba->backing_ba;
1098                     kfree(ba, M_MAP_BACKING);
1099                     ba = next;
1100           }
1101 }
1102 
1103 /*
1104  * Dispose of a vm_map_entry that is no longer being referenced.
1105  *
1106  * No requirements.
1107  */
1108 static void
vm_map_entry_dispose(vm_map_t map,vm_map_entry_t entry,int * countp)1109 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry, int *countp)
1110 {
1111           struct globaldata *gd = mycpu;
1112 
1113           /*
1114            * Dispose of the base object and the backing link.
1115            */
1116           switch(entry->maptype) {
1117           case VM_MAPTYPE_NORMAL:
1118                     if (entry->ba.map_object) {
1119                               vm_map_backing_detach(entry, &entry->ba);
1120                               vm_object_deallocate(entry->ba.object);
1121                     }
1122                     break;
1123           case VM_MAPTYPE_SUBMAP:
1124                     break;
1125           case VM_MAPTYPE_UKSMAP:
1126                     vm_map_backing_detach(entry, &entry->ba);
1127                     break;
1128           default:
1129                     break;
1130           }
1131           vm_map_entry_dispose_ba(entry, entry->ba.backing_ba);
1132 
1133           /*
1134            * Cleanup for safety.
1135            */
1136           entry->ba.backing_ba = NULL;
1137           entry->ba.object = NULL;
1138           entry->ba.offset = 0;
1139 
1140           ++*countp;
1141           crit_enter();
1142           MAPENT_FREELIST(entry) = gd->gd_vme_base;
1143           gd->gd_vme_base = entry;
1144           crit_exit();
1145 }
1146 
1147 
1148 /*
1149  * Insert/remove entries from maps.
1150  *
1151  * The related map must be exclusively locked.
1152  * The caller must hold map->token
1153  * No other requirements.
1154  */
1155 static __inline void
vm_map_entry_link(vm_map_t map,vm_map_entry_t entry)1156 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1157 {
1158           ASSERT_VM_MAP_LOCKED(map);
1159 
1160           map->nentries++;
1161           if (vm_map_rb_tree_RB_INSERT(&map->rb_root, entry))
1162                     panic("vm_map_entry_link: dup addr map %p ent %p", map, entry);
1163 }
1164 
1165 static __inline void
vm_map_entry_unlink(vm_map_t map,vm_map_entry_t entry)1166 vm_map_entry_unlink(vm_map_t map,
1167                         vm_map_entry_t entry)
1168 {
1169           ASSERT_VM_MAP_LOCKED(map);
1170 
1171           if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1172                     panic("vm_map_entry_unlink: attempt to mess with "
1173                           "locked entry! %p", entry);
1174           }
1175           vm_map_rb_tree_RB_REMOVE(&map->rb_root, entry);
1176           map->nentries--;
1177 }
1178 
1179 /*
1180  * Finds the map entry containing (or immediately preceding) the specified
1181  * address in the given map.  The entry is returned in (*entry).
1182  *
1183  * The boolean result indicates whether the address is actually contained
1184  * in the map.
1185  *
1186  * The related map must be locked.
1187  * No other requirements.
1188  */
1189 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_offset_t address,vm_map_entry_t * entry)1190 vm_map_lookup_entry(vm_map_t map, vm_offset_t address, vm_map_entry_t *entry)
1191 {
1192           vm_map_entry_t tmp;
1193           vm_map_entry_t last;
1194 
1195           ASSERT_VM_MAP_LOCKED(map);
1196 
1197           /*
1198            * Locate the record from the top of the tree.  'last' tracks the
1199            * closest prior record and is returned if no match is found, which
1200            * in binary tree terms means tracking the most recent right-branch
1201            * taken.  If there is no prior record, *entry is set to NULL.
1202            */
1203           last = NULL;
1204           tmp = RB_ROOT(&map->rb_root);
1205 
1206           while (tmp) {
1207                     if (address >= tmp->ba.start) {
1208                               if (address < tmp->ba.end) {
1209                                         *entry = tmp;
1210                                         return(TRUE);
1211                               }
1212                               last = tmp;
1213                               tmp = RB_RIGHT(tmp, rb_entry);
1214                     } else {
1215                               tmp = RB_LEFT(tmp, rb_entry);
1216                     }
1217           }
1218           *entry = last;
1219           return (FALSE);
1220 }
1221 
1222 /*
1223  * Inserts the given whole VM object into the target map at the specified
1224  * address range.  The object's size should match that of the address range.
1225  *
1226  * The map must be exclusively locked.
1227  * The object must be held.
1228  * The caller must have reserved sufficient vm_map_entry structures.
1229  *
1230  * If object is non-NULL, ref count must be bumped by caller prior to
1231  * making call to account for the new entry.  XXX API is a bit messy.
1232  */
1233 int
vm_map_insert(vm_map_t map,int * countp,void * map_object,void * map_aux,vm_ooffset_t offset,void * aux_info,vm_offset_t start,vm_offset_t end,vm_maptype_t maptype,vm_subsys_t id,vm_prot_t prot,vm_prot_t max,int cow)1234 vm_map_insert(vm_map_t map, int *countp,
1235                 void *map_object, void *map_aux,
1236                 vm_ooffset_t offset, void *aux_info,
1237                 vm_offset_t start, vm_offset_t end,
1238                 vm_maptype_t maptype, vm_subsys_t id,
1239                 vm_prot_t prot, vm_prot_t max, int cow)
1240 {
1241           vm_map_entry_t new_entry;
1242           vm_map_entry_t prev_entry;
1243           vm_map_entry_t next;
1244           vm_map_entry_t temp_entry;
1245           vm_eflags_t protoeflags;
1246           vm_object_t object;
1247           int must_drop = 0;
1248 
1249           if (maptype == VM_MAPTYPE_UKSMAP)
1250                     object = NULL;
1251           else
1252                     object = map_object;
1253 
1254           ASSERT_VM_MAP_LOCKED(map);
1255           if (object)
1256                     ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
1257 
1258           /*
1259            * Check that the start and end points are not bogus.
1260            */
1261           if ((start < vm_map_min(map)) || (end > vm_map_max(map)) ||
1262               (start >= end)) {
1263                     return (KERN_INVALID_ADDRESS);
1264           }
1265 
1266           /*
1267            * Find the entry prior to the proposed starting address; if it's part
1268            * of an existing entry, this range is bogus.
1269            */
1270           if (vm_map_lookup_entry(map, start, &temp_entry))
1271                     return (KERN_NO_SPACE);
1272           prev_entry = temp_entry;
1273 
1274           /*
1275            * Assert that the next entry doesn't overlap the end point.
1276            */
1277           if (prev_entry)
1278                     next = vm_map_rb_tree_RB_NEXT(prev_entry);
1279           else
1280                     next = RB_MIN(vm_map_rb_tree, &map->rb_root);
1281           if (next && next->ba.start < end)
1282                     return (KERN_NO_SPACE);
1283 
1284           protoeflags = 0;
1285 
1286           if (cow & MAP_COPY_ON_WRITE)
1287                     protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
1288 
1289           if (cow & MAP_NOFAULT) {
1290                     protoeflags |= MAP_ENTRY_NOFAULT;
1291 
1292                     KASSERT(object == NULL,
1293                               ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1294           }
1295           if (cow & MAP_DISABLE_SYNCER)
1296                     protoeflags |= MAP_ENTRY_NOSYNC;
1297           if (cow & MAP_DISABLE_COREDUMP)
1298                     protoeflags |= MAP_ENTRY_NOCOREDUMP;
1299           if (cow & MAP_IS_STACK)
1300                     protoeflags |= MAP_ENTRY_STACK;
1301           if (cow & MAP_IS_KSTACK)
1302                     protoeflags |= MAP_ENTRY_KSTACK;
1303 
1304           lwkt_gettoken(&map->token);
1305 
1306           if (object) {
1307                     ;
1308           } else if (prev_entry &&
1309                      (prev_entry->eflags == protoeflags) &&
1310                      (prev_entry->ba.end == start) &&
1311                      (prev_entry->wired_count == 0) &&
1312                      (prev_entry->id == id) &&
1313                      prev_entry->maptype == maptype &&
1314                      maptype == VM_MAPTYPE_NORMAL &&
1315                      prev_entry->ba.backing_ba == NULL &&   /* not backed */
1316                      ((prev_entry->ba.object == NULL) ||
1317                       vm_object_coalesce(prev_entry->ba.object,
1318                                              OFF_TO_IDX(prev_entry->ba.offset),
1319                                              (vm_size_t)(prev_entry->ba.end - prev_entry->ba.start),
1320                                              (vm_size_t)(end - prev_entry->ba.end)))) {
1321                     /*
1322                      * We were able to extend the object.  Determine if we
1323                      * can extend the previous map entry to include the
1324                      * new range as well.
1325                      */
1326                     if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
1327                         (prev_entry->protection == prot) &&
1328                         (prev_entry->max_protection == max)) {
1329                               map->size += (end - prev_entry->ba.end);
1330                               vm_map_backing_adjust_end(prev_entry, end);
1331                               vm_map_simplify_entry(map, prev_entry, countp);
1332                               lwkt_reltoken(&map->token);
1333                               return (KERN_SUCCESS);
1334                     }
1335 
1336                     /*
1337                      * If we can extend the object but cannot extend the
1338                      * map entry, we have to create a new map entry.  We
1339                      * must bump the ref count on the extended object to
1340                      * account for it.  object may be NULL.
1341                      */
1342                     object = prev_entry->ba.object;
1343                     offset = prev_entry->ba.offset +
1344                               (prev_entry->ba.end - prev_entry->ba.start);
1345                     if (object) {
1346                               vm_object_hold(object);
1347                               vm_object_lock_swap(); /* map->token order */
1348                               vm_object_reference_locked(object);
1349                               map_object = object;
1350                               must_drop = 1;
1351                     }
1352           }
1353 
1354           /*
1355            * NOTE: if conditionals fail, object can be NULL here.  This occurs
1356            * in things like the buffer map where we manage kva but do not manage
1357            * backing objects.
1358            */
1359 
1360           /*
1361            * Create a new entry
1362            */
1363           new_entry = vm_map_entry_create(countp);
1364           new_entry->ba.pmap = map->pmap;
1365           new_entry->ba.start = start;
1366           new_entry->ba.end = end;
1367           new_entry->id = id;
1368 
1369           new_entry->maptype = maptype;
1370           new_entry->eflags = protoeflags;
1371           new_entry->aux.master_pde = 0;                    /* in case size is different */
1372           new_entry->aux.map_aux = map_aux;
1373           new_entry->ba.map_object = map_object;
1374           new_entry->ba.backing_ba = NULL;
1375           new_entry->ba.backing_count = 0;
1376           new_entry->ba.offset = offset;
1377           new_entry->ba.aux_info = aux_info;
1378           new_entry->ba.flags = 0;
1379           new_entry->ba.pmap = map->pmap;
1380 
1381           new_entry->inheritance = VM_INHERIT_DEFAULT;
1382           new_entry->protection = prot;
1383           new_entry->max_protection = max;
1384           new_entry->wired_count = 0;
1385 
1386           /*
1387            * Insert the new entry into the list
1388            */
1389           vm_map_backing_replicated(map, new_entry, MAP_BACK_BASEOBJREFD);
1390           vm_map_entry_link(map, new_entry);
1391           map->size += new_entry->ba.end - new_entry->ba.start;
1392 
1393           /*
1394            * Don't worry about updating freehint[] when inserting, allow
1395            * addresses to be lower than the actual first free spot.
1396            */
1397 #if 0
1398           /*
1399            * Temporarily removed to avoid MAP_STACK panic, due to
1400            * MAP_STACK being a huge hack.  Will be added back in
1401            * when MAP_STACK (and the user stack mapping) is fixed.
1402            */
1403           /*
1404            * It may be possible to simplify the entry
1405            */
1406           vm_map_simplify_entry(map, new_entry, countp);
1407 #endif
1408 
1409           /*
1410            * Try to pre-populate the page table.  Mappings governed by virtual
1411            * page tables cannot be prepopulated without a lot of work, so
1412            * don't try.
1413            */
1414           if ((cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) &&
1415               maptype != VM_MAPTYPE_UKSMAP) {
1416                     int dorelock = 0;
1417                     if (vm_map_relock_enable && (cow & MAP_PREFAULT_RELOCK)) {
1418                               dorelock = 1;
1419                               vm_object_lock_swap();
1420                               vm_object_drop(object);
1421                     }
1422                     pmap_object_init_pt(map->pmap, new_entry,
1423                                             new_entry->ba.start,
1424                                             new_entry->ba.end - new_entry->ba.start,
1425                                             cow & MAP_PREFAULT_PARTIAL);
1426                     if (dorelock) {
1427                               vm_object_hold(object);
1428                               vm_object_lock_swap();
1429                     }
1430           }
1431           lwkt_reltoken(&map->token);
1432           if (must_drop)
1433                     vm_object_drop(object);
1434 
1435           return (KERN_SUCCESS);
1436 }
1437 
1438 /*
1439  * Find sufficient space for `length' bytes in the given map, starting at
1440  * `start'.  Returns 0 on success, 1 on no space.
1441  *
1442  * This function will returned an arbitrarily aligned pointer.  If no
1443  * particular alignment is required you should pass align as 1.  Note that
1444  * the map may return PAGE_SIZE aligned pointers if all the lengths used in
1445  * the map are a multiple of PAGE_SIZE, even if you pass a smaller align
1446  * argument.
1447  *
1448  * 'align' should be a power of 2 but is not required to be.
1449  *
1450  * The map must be exclusively locked.
1451  * No other requirements.
1452  */
1453 int
vm_map_findspace(vm_map_t map,vm_offset_t start,vm_size_t length,vm_size_t align,int flags,vm_offset_t * addr)1454 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1455                      vm_size_t align, int flags, vm_offset_t *addr)
1456 {
1457           vm_map_entry_t entry;
1458           vm_map_entry_t tmp;
1459           vm_offset_t hole_start;
1460           vm_offset_t end;
1461           vm_offset_t align_mask;
1462 
1463           if (start < vm_map_min(map))
1464                     start = vm_map_min(map);
1465           if (start > vm_map_max(map))
1466                     return (1);
1467 
1468           /*
1469            * If the alignment is not a power of 2 we will have to use
1470            * a mod/division, set align_mask to a special value.
1471            */
1472           if ((align | (align - 1)) + 1 != (align << 1))
1473                     align_mask = (vm_offset_t)-1;
1474           else
1475                     align_mask = align - 1;
1476 
1477           /*
1478            * Use freehint to adjust the start point, hopefully reducing
1479            * the iteration to O(1).
1480            */
1481           hole_start = vm_map_freehint_find(map, length, align);
1482           if (start < hole_start)
1483                     start = hole_start;
1484           if (vm_map_lookup_entry(map, start, &tmp))
1485                     start = tmp->ba.end;
1486           entry = tmp;        /* may be NULL */
1487 
1488           /*
1489            * Look through the rest of the map, trying to fit a new region in the
1490            * gap between existing regions, or after the very last region.
1491            */
1492           for (;;) {
1493                     /*
1494                      * Adjust the proposed start by the requested alignment,
1495                      * be sure that we didn't wrap the address.
1496                      */
1497                     if (align_mask == (vm_offset_t)-1)
1498                               end = roundup(start, align);
1499                     else
1500                               end = (start + align_mask) & ~align_mask;
1501                     if (end < start)
1502                               return (1);
1503                     start = end;
1504 
1505                     /*
1506                      * Find the end of the proposed new region.  Be sure we didn't
1507                      * go beyond the end of the map, or wrap around the address.
1508                      * Then check to see if this is the last entry or if the
1509                      * proposed end fits in the gap between this and the next
1510                      * entry.
1511                      */
1512                     end = start + length;
1513                     if (end > vm_map_max(map) || end < start)
1514                               return (1);
1515 
1516                     /*
1517                      * Locate the next entry, we can stop if this is the
1518                      * last entry (we know we are in-bounds so that would
1519                      * be a sucess).
1520                      */
1521                     if (entry)
1522                               entry = vm_map_rb_tree_RB_NEXT(entry);
1523                     else
1524                               entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
1525                     if (entry == NULL)
1526                               break;
1527 
1528                     /*
1529                      * Determine if the proposed area would overlap the
1530                      * next entry.
1531                      *
1532                      * When matching against a STACK entry, only allow the
1533                      * memory map to intrude on the ungrown portion of the
1534                      * STACK entry when MAP_TRYFIXED is set.
1535                      */
1536                     if (entry->ba.start >= end) {
1537                               if ((entry->eflags & MAP_ENTRY_STACK) == 0)
1538                                         break;
1539                               if (flags & MAP_TRYFIXED)
1540                                         break;
1541                               if (entry->ba.start - entry->aux.avail_ssize >= end)
1542                                         break;
1543                     }
1544                     start = entry->ba.end;
1545           }
1546 
1547           /*
1548            * Update the freehint
1549            */
1550           vm_map_freehint_update(map, start, length, align);
1551 
1552           /*
1553            * Grow the kernel_map if necessary.  pmap_growkernel() will panic
1554            * if it fails.  The kernel_map is locked and nothing can steal
1555            * our address space if pmap_growkernel() blocks.
1556            *
1557            * NOTE: This may be unconditionally called for kldload areas on
1558            *         x86_64 because these do not bump kernel_vm_end (which would
1559            *         fill 128G worth of page tables!).  Therefore we must not
1560            *         retry.
1561            */
1562           if (map == kernel_map) {
1563                     vm_offset_t kstop;
1564 
1565                     kstop = round_page(start + length);
1566                     if (kstop > kernel_vm_end)
1567                               pmap_growkernel(start, kstop);
1568           }
1569           *addr = start;
1570           return (0);
1571 }
1572 
1573 /*
1574  * vm_map_find finds an unallocated region in the target address map with
1575  * the given length and allocates it.  The search is defined to be first-fit
1576  * from the specified address; the region found is returned in the same
1577  * parameter.
1578  *
1579  * If object is non-NULL, ref count must be bumped by caller
1580  * prior to making call to account for the new entry.
1581  *
1582  * No requirements.  This function will lock the map temporarily.
1583  */
1584 int
vm_map_find(vm_map_t map,void * map_object,void * map_aux,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t length,vm_size_t align,boolean_t fitit,vm_maptype_t maptype,vm_subsys_t id,vm_prot_t prot,vm_prot_t max,int cow)1585 vm_map_find(vm_map_t map, void *map_object, void *map_aux,
1586               vm_ooffset_t offset, vm_offset_t *addr,
1587               vm_size_t length, vm_size_t align, boolean_t fitit,
1588               vm_maptype_t maptype, vm_subsys_t id,
1589               vm_prot_t prot, vm_prot_t max, int cow)
1590 {
1591           vm_offset_t start;
1592           vm_object_t object;
1593           void *aux_info;
1594           int result;
1595           int count;
1596 
1597           /*
1598            * Certain UKSMAPs may need aux_info.
1599            *
1600            * (map_object is the callback function, aux_info is the process
1601            *  or thread, if necessary).
1602            */
1603           aux_info = NULL;
1604           if (maptype == VM_MAPTYPE_UKSMAP) {
1605                     KKASSERT(map_aux != NULL && map_object != NULL);
1606 
1607                     switch(minor(((struct cdev *)map_aux))) {
1608                     case 5:
1609                               /*
1610                                * /dev/upmap
1611                                */
1612                               aux_info = curproc;
1613                               break;
1614                     case 6:
1615                               /*
1616                                * /dev/kpmap
1617                                */
1618                               break;
1619                     case 7:
1620                               /*
1621                                * /dev/lpmap
1622                                */
1623                               aux_info = curthread->td_lwp;
1624                               break;
1625                     }
1626                     object = NULL;
1627           } else {
1628                     object = map_object;
1629           }
1630 
1631           start = *addr;
1632 
1633           count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
1634           vm_map_lock(map);
1635           if (object)
1636                     vm_object_hold_shared(object);
1637           if (fitit) {
1638                     if (vm_map_findspace(map, start, length, align, 0, addr)) {
1639                               if (object)
1640                                         vm_object_drop(object);
1641                               vm_map_unlock(map);
1642                               vm_map_entry_release(count);
1643                               return (KERN_NO_SPACE);
1644                     }
1645                     start = *addr;
1646           }
1647           result = vm_map_insert(map, &count,
1648                                      map_object, map_aux,
1649                                      offset, aux_info,
1650                                      start, start + length,
1651                                      maptype, id, prot, max, cow);
1652           if (object)
1653                     vm_object_drop(object);
1654           vm_map_unlock(map);
1655           vm_map_entry_release(count);
1656 
1657           return (result);
1658 }
1659 
1660 /*
1661  * Simplify the given map entry by merging with either neighbor.  This
1662  * routine also has the ability to merge with both neighbors.
1663  *
1664  * This routine guarentees that the passed entry remains valid (though
1665  * possibly extended).  When merging, this routine may delete one or
1666  * both neighbors.  No action is taken on entries which have their
1667  * in-transition flag set.
1668  *
1669  * The map must be exclusively locked.
1670  */
1671 void
vm_map_simplify_entry(vm_map_t map,vm_map_entry_t entry,int * countp)1672 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry, int *countp)
1673 {
1674           vm_map_entry_t next, prev;
1675           vm_size_t prevsize, esize;
1676 
1677           if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1678                     ++mycpu->gd_cnt.v_intrans_coll;
1679                     return;
1680           }
1681 
1682           if (entry->maptype == VM_MAPTYPE_SUBMAP)
1683                     return;
1684           if (entry->maptype == VM_MAPTYPE_UKSMAP)
1685                     return;
1686 
1687           prev = vm_map_rb_tree_RB_PREV(entry);
1688           if (prev) {
1689                     prevsize = prev->ba.end - prev->ba.start;
1690                     if ( (prev->ba.end == entry->ba.start) &&
1691                          (prev->maptype == entry->maptype) &&
1692                          (prev->ba.object == entry->ba.object) &&
1693                          (prev->ba.backing_ba == entry->ba.backing_ba) &&
1694                          (!prev->ba.object ||
1695                               (prev->ba.offset + prevsize == entry->ba.offset)) &&
1696                          (prev->eflags == entry->eflags) &&
1697                          (prev->protection == entry->protection) &&
1698                          (prev->max_protection == entry->max_protection) &&
1699                          (prev->inheritance == entry->inheritance) &&
1700                          (prev->id == entry->id) &&
1701                          (prev->wired_count == entry->wired_count)) {
1702                               /*
1703                                * NOTE: order important.  Unlink before gumming up
1704                                *         the RBTREE w/adjust, adjust before disposal
1705                                *         of prior entry, to avoid pmap snafus.
1706                                */
1707                               vm_map_entry_unlink(map, prev);
1708                               vm_map_backing_adjust_start(entry, prev->ba.start);
1709                               if (entry->ba.object == NULL)
1710                                         entry->ba.offset = 0;
1711                               vm_map_entry_dispose(map, prev, countp);
1712                     }
1713           }
1714 
1715           next = vm_map_rb_tree_RB_NEXT(entry);
1716           if (next) {
1717                     esize = entry->ba.end - entry->ba.start;
1718                     if ((entry->ba.end == next->ba.start) &&
1719                         (next->maptype == entry->maptype) &&
1720                         (next->ba.object == entry->ba.object) &&
1721                          (prev->ba.backing_ba == entry->ba.backing_ba) &&
1722                          (!entry->ba.object ||
1723                               (entry->ba.offset + esize == next->ba.offset)) &&
1724                         (next->eflags == entry->eflags) &&
1725                         (next->protection == entry->protection) &&
1726                         (next->max_protection == entry->max_protection) &&
1727                         (next->inheritance == entry->inheritance) &&
1728                         (next->id == entry->id) &&
1729                         (next->wired_count == entry->wired_count)) {
1730                               /*
1731                                * NOTE: order important.  Unlink before gumming up
1732                                *         the RBTREE w/adjust, adjust before disposal
1733                                *         of prior entry, to avoid pmap snafus.
1734                                */
1735                               vm_map_entry_unlink(map, next);
1736                               vm_map_backing_adjust_end(entry, next->ba.end);
1737                               vm_map_entry_dispose(map, next, countp);
1738                   }
1739           }
1740 }
1741 
/*
 * Asserts that the given entry begins at or after the specified address.
 * If necessary, it splits the entry into two.
 *
 * The arguments are parenthesized in the comparison so that a compound
 * expression passed as startaddr or entry expands with the intended
 * precedence.  entry and startaddr are evaluated twice when a clip is
 * required, so they should not have side effects.
 */
#define vm_map_clip_start(map, entry, startaddr, countp)		\
{									\
	if ((startaddr) > (entry)->ba.start)				\
		_vm_map_clip_start(map, entry, startaddr, countp);	\
}
1751 
1752 /*
1753  * This routine is called only when it is known that the entry must be split.
1754  *
1755  * The map must be exclusively locked.
1756  */
1757 static void
_vm_map_clip_start(vm_map_t map,vm_map_entry_t entry,vm_offset_t start,int * countp)1758 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start,
1759                        int *countp)
1760 {
1761           vm_map_entry_t new_entry;
1762 
1763           /*
1764            * Split off the front portion -- note that we must insert the new
1765            * entry BEFORE this one, so that this entry has the specified
1766            * starting address.
1767            */
1768 
1769           vm_map_simplify_entry(map, entry, countp);
1770 
1771           /*
1772            * If there is no object backing this entry, we might as well create
1773            * one now.  If we defer it, an object can get created after the map
1774            * is clipped, and individual objects will be created for the split-up
1775            * map.  This is a bit of a hack, but is also about the best place to
1776            * put this improvement.
1777            */
1778           if (entry->ba.object == NULL && !map->system_map &&
1779               VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1780                     vm_map_entry_allocate_object(entry);
1781           }
1782 
1783           /*
1784            * NOTE: The replicated function will adjust start, end, and offset
1785            *         for the remainder of the backing_ba linkages.  We must fixup
1786            *         the embedded ba.
1787            */
1788           new_entry = vm_map_entry_create(countp);
1789           *new_entry = *entry;
1790           new_entry->ba.end = start;
1791 
1792           /*
1793            * Ordering is important, make sure the new entry is replicated
1794            * before we cut the exiting entry.
1795            */
1796           vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1797           vm_map_backing_adjust_start(entry, start);
1798           vm_map_entry_link(map, new_entry);
1799 }
1800 
/*
 * Asserts that the given entry ends at or before the specified address.
 * If necessary, it splits the entry into two.
 *
 * The arguments are parenthesized in the comparison so that a compound
 * expression passed as endaddr or entry expands with the intended
 * precedence.  entry and endaddr are evaluated twice when a clip is
 * required, so they should not have side effects.
 *
 * The map must be exclusively locked.
 */
#define vm_map_clip_end(map, entry, endaddr, countp)			\
{									\
	if ((endaddr) < (entry)->ba.end)				\
		_vm_map_clip_end(map, entry, endaddr, countp);		\
}
1812 
1813 /*
1814  * This routine is called only when it is known that the entry must be split.
1815  *
1816  * The map must be exclusively locked.
1817  */
1818 static void
_vm_map_clip_end(vm_map_t map,vm_map_entry_t entry,vm_offset_t end,int * countp)1819 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end,
1820                      int *countp)
1821 {
1822           vm_map_entry_t new_entry;
1823 
1824           /*
1825            * If there is no object backing this entry, we might as well create
1826            * one now.  If we defer it, an object can get created after the map
1827            * is clipped, and individual objects will be created for the split-up
1828            * map.  This is a bit of a hack, but is also about the best place to
1829            * put this improvement.
1830            */
1831 
1832           if (entry->ba.object == NULL && !map->system_map &&
1833               VM_MAP_ENTRY_WITHIN_PARTITION(entry)) {
1834                     vm_map_entry_allocate_object(entry);
1835           }
1836 
1837           /*
1838            * Create a new entry and insert it AFTER the specified entry
1839            *
1840            * NOTE: The replicated function will adjust start, end, and offset
1841            *         for the remainder of the backing_ba linkages.  We must fixup
1842            *         the embedded ba.
1843            */
1844           new_entry = vm_map_entry_create(countp);
1845           *new_entry = *entry;
1846           new_entry->ba.start = end;
1847           new_entry->ba.offset += (new_entry->ba.start - entry->ba.start);
1848 
1849           /*
1850            * Ordering is important, make sure the new entry is replicated
1851            * before we cut the exiting entry.
1852            */
1853           vm_map_backing_replicated(map, new_entry, MAP_BACK_CLIPPED);
1854           vm_map_backing_adjust_end(entry, end);
1855           vm_map_entry_link(map, new_entry);
1856 }
1857 
/*
 * Clamps the starting and ending region addresses to the valid range
 * for the map.  start is raised to vm_map_min(map), end is lowered to
 * vm_map_max(map), and if start still exceeds end it is set to end
 * (yielding an empty range).
 *
 * start and end are modified in place and must be lvalues.  The
 * arguments are parenthesized so the expansion does not depend on the
 * precedence of whatever expression is passed in.
 */
#define	VM_MAP_RANGE_CHECK(map, start, end)	\
{						\
	if ((start) < vm_map_min(map))		\
		(start) = vm_map_min(map);	\
	if ((end) > vm_map_max(map))		\
		(end) = vm_map_max(map);	\
	if ((start) > (end))			\
		(start) = (end);		\
}
1871 
1872 /*
1873  * Used to block when an in-transition collison occurs.  The map
1874  * is unlocked for the sleep and relocked before the return.
1875  */
1876 void
vm_map_transition_wait(vm_map_t map,int relock)1877 vm_map_transition_wait(vm_map_t map, int relock)
1878 {
1879           tsleep_interlock(map, 0);
1880           vm_map_unlock(map);
1881           tsleep(map, PINTERLOCKED, "vment", 0);
1882           if (relock)
1883                     vm_map_lock(map);
1884 }
1885 
/*
 * When we do blocking operations with the map lock held it is
 * possible that a clip might have occurred on our in-transit entry,
 * requiring an adjustment to the entry in our loop.  These macros
 * help the pageable and clip_range code deal with the case.  The
 * conditional costs virtually nothing if no clipping has occurred.
 *
 * CLIP_CHECK_BACK steps entry backwards through the map until it
 * reaches the entry whose start address is save_start.  entry must be
 * an lvalue, it is reassigned.  The arguments are parenthesized so that
 * a compound expression passed as save_start compares as intended.
 */

#define CLIP_CHECK_BACK(entry, save_start)				\
    do {								\
	    while ((entry)->ba.start != (save_start)) {			\
		    (entry) = vm_map_rb_tree_RB_PREV(entry);		\
		    KASSERT((entry), ("bad entry clip"));		\
	    }								\
    } while(0)
1901 
/*
 * Steps entry forwards through the map until it reaches the entry whose
 * end address is save_end.  entry must be an lvalue, it is reassigned.
 * The arguments are parenthesized so that a compound expression passed
 * as save_end compares as intended.
 */
#define CLIP_CHECK_FWD(entry, save_end)					\
    do {								\
	    while ((entry)->ba.end != (save_end)) {			\
		    (entry) = vm_map_rb_tree_RB_NEXT(entry);		\
		    KASSERT((entry), ("bad entry clip"));		\
	    }								\
    } while(0)
1909 
1910 
1911 /*
1912  * Clip the specified range and return the base entry.  The
1913  * range may cover several entries starting at the returned base
1914  * and the first and last entry in the covering sequence will be
1915  * properly clipped to the requested start and end address.
1916  *
1917  * If no holes are allowed you should pass the MAP_CLIP_NO_HOLES
1918  * flag.
1919  *
1920  * The MAP_ENTRY_IN_TRANSITION flag will be set for the entries
1921  * covered by the requested range.
1922  *
1923  * The map must be exclusively locked on entry and will remain locked
1924  * on return. If no range exists or the range contains holes and you
1925  * specified that no holes were allowed, NULL will be returned.  This
1926  * routine may temporarily unlock the map in order avoid a deadlock when
1927  * sleeping.
1928  */
1929 static
1930 vm_map_entry_t
vm_map_clip_range(vm_map_t map,vm_offset_t start,vm_offset_t end,int * countp,int flags)1931 vm_map_clip_range(vm_map_t map, vm_offset_t start, vm_offset_t end,
1932                       int *countp, int flags)
1933 {
1934           vm_map_entry_t start_entry;
1935           vm_map_entry_t entry;
1936           vm_map_entry_t next;
1937 
1938           /*
1939            * Locate the entry and effect initial clipping.  The in-transition
1940            * case does not occur very often so do not try to optimize it.
1941            */
1942 again:
1943           if (vm_map_lookup_entry(map, start, &start_entry) == FALSE)
1944                     return (NULL);
1945           entry = start_entry;
1946           if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
1947                     entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1948                     ++mycpu->gd_cnt.v_intrans_coll;
1949                     ++mycpu->gd_cnt.v_intrans_wait;
1950                     vm_map_transition_wait(map, 1);
1951                     /*
1952                      * entry and/or start_entry may have been clipped while
1953                      * we slept, or may have gone away entirely.  We have
1954                      * to restart from the lookup.
1955                      */
1956                     goto again;
1957           }
1958 
1959           /*
1960            * Since we hold an exclusive map lock we do not have to restart
1961            * after clipping, even though clipping may block in zalloc.
1962            */
1963           vm_map_clip_start(map, entry, start, countp);
1964           vm_map_clip_end(map, entry, end, countp);
1965           entry->eflags |= MAP_ENTRY_IN_TRANSITION;
1966 
1967           /*
1968            * Scan entries covered by the range.  When working on the next
1969            * entry a restart need only re-loop on the current entry which
1970            * we have already locked, since 'next' may have changed.  Also,
1971            * even though entry is safe, it may have been clipped so we
1972            * have to iterate forwards through the clip after sleeping.
1973            */
1974           for (;;) {
1975                     next = vm_map_rb_tree_RB_NEXT(entry);
1976                     if (next == NULL || next->ba.start >= end)
1977                               break;
1978                     if (flags & MAP_CLIP_NO_HOLES) {
1979                               if (next->ba.start > entry->ba.end) {
1980                                         vm_map_unclip_range(map, start_entry,
1981                                                   start, entry->ba.end, countp, flags);
1982                                         return(NULL);
1983                               }
1984                     }
1985 
1986                     if (next->eflags & MAP_ENTRY_IN_TRANSITION) {
1987                               vm_offset_t save_end = entry->ba.end;
1988                               next->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
1989                               ++mycpu->gd_cnt.v_intrans_coll;
1990                               ++mycpu->gd_cnt.v_intrans_wait;
1991                               vm_map_transition_wait(map, 1);
1992 
1993                               /*
1994                                * clips might have occured while we blocked.
1995                                */
1996                               CLIP_CHECK_FWD(entry, save_end);
1997                               CLIP_CHECK_BACK(start_entry, start);
1998                               continue;
1999                     }
2000 
2001                     /*
2002                      * No restart necessary even though clip_end may block, we
2003                      * are holding the map lock.
2004                      */
2005                     vm_map_clip_end(map, next, end, countp);
2006                     next->eflags |= MAP_ENTRY_IN_TRANSITION;
2007                     entry = next;
2008           }
2009           if (flags & MAP_CLIP_NO_HOLES) {
2010                     if (entry->ba.end != end) {
2011                               vm_map_unclip_range(map, start_entry,
2012                                         start, entry->ba.end, countp, flags);
2013                               return(NULL);
2014                     }
2015           }
2016           return(start_entry);
2017 }
2018 
2019 /*
2020  * Undo the effect of vm_map_clip_range().  You should pass the same
2021  * flags and the same range that you passed to vm_map_clip_range().
2022  * This code will clear the in-transition flag on the entries and
2023  * wake up anyone waiting.  This code will also simplify the sequence
2024  * and attempt to merge it with entries before and after the sequence.
2025  *
2026  * The map must be locked on entry and will remain locked on return.
2027  *
2028  * Note that you should also pass the start_entry returned by
2029  * vm_map_clip_range().  However, if you block between the two calls
2030  * with the map unlocked please be aware that the start_entry may
2031  * have been clipped and you may need to scan it backwards to find
2032  * the entry corresponding with the original start address.  You are
2033  * responsible for this, vm_map_unclip_range() expects the correct
2034  * start_entry to be passed to it and will KASSERT otherwise.
2035  */
2036 static
2037 void
vm_map_unclip_range(vm_map_t map,vm_map_entry_t start_entry,vm_offset_t start,vm_offset_t end,int * countp,int flags)2038 vm_map_unclip_range(vm_map_t map, vm_map_entry_t start_entry,
2039                         vm_offset_t start, vm_offset_t end,
2040                         int *countp, int flags)
2041 {
2042           vm_map_entry_t entry;
2043 
2044           entry = start_entry;
2045 
2046           KASSERT(entry->ba.start == start, ("unclip_range: illegal base entry"));
2047           while (entry && entry->ba.start < end) {
2048                     KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
2049                               ("in-transition flag not set during unclip on: %p",
2050                               entry));
2051                     KASSERT(entry->ba.end <= end,
2052                               ("unclip_range: tail wasn't clipped"));
2053                     entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2054                     if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2055                               entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2056                               wakeup(map);
2057                     }
2058                     entry = vm_map_rb_tree_RB_NEXT(entry);
2059           }
2060 
2061           /*
2062            * Simplification does not block so there is no restart case.
2063            */
2064           entry = start_entry;
2065           while (entry && entry->ba.start < end) {
2066                     vm_map_simplify_entry(map, entry, countp);
2067                     entry = vm_map_rb_tree_RB_NEXT(entry);
2068           }
2069 }
2070 
2071 /*
2072  * Mark the given range as handled by a subordinate map.
2073  *
2074  * This range must have been created with vm_map_find(), and no other
2075  * operations may have been performed on this range prior to calling
2076  * vm_map_submap().
2077  *
2078  * Submappings cannot be removed.
2079  *
2080  * No requirements.
2081  */
2082 int
vm_map_submap(vm_map_t map,vm_offset_t start,vm_offset_t end,vm_map_t submap)2083 vm_map_submap(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap)
2084 {
2085           vm_map_entry_t entry;
2086           int result = KERN_INVALID_ARGUMENT;
2087           int count;
2088 
2089           count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2090           vm_map_lock(map);
2091 
2092           VM_MAP_RANGE_CHECK(map, start, end);
2093 
2094           if (vm_map_lookup_entry(map, start, &entry)) {
2095                     vm_map_clip_start(map, entry, start, &count);
2096           } else if (entry) {
2097                     entry = vm_map_rb_tree_RB_NEXT(entry);
2098           } else {
2099                     entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2100           }
2101 
2102           vm_map_clip_end(map, entry, end, &count);
2103 
2104           if ((entry->ba.start == start) && (entry->ba.end == end) &&
2105               ((entry->eflags & MAP_ENTRY_COW) == 0) &&
2106               (entry->ba.object == NULL)) {
2107                     entry->ba.sub_map = submap;
2108                     entry->maptype = VM_MAPTYPE_SUBMAP;
2109                     result = KERN_SUCCESS;
2110           }
2111           vm_map_unlock(map);
2112           vm_map_entry_release(count);
2113 
2114           return (result);
2115 }
2116 
2117 /*
2118  * Sets the protection of the specified address region in the target map.
2119  * If "set_max" is specified, the maximum protection is to be set;
2120  * otherwise, only the current protection is affected.
2121  *
2122  * The protection is not applicable to submaps, but is applicable to normal
2123  * maps and maps governed by virtual page tables.  For example, when operating
2124  * on a virtual page table our protection basically controls how COW occurs
2125  * on the backing object, whereas the virtual page table abstraction itself
2126  * is an abstraction for userland.
2127  *
2128  * No requirements.
2129  */
2130 int
vm_map_protect(vm_map_t map,vm_offset_t start,vm_offset_t end,vm_prot_t new_prot,boolean_t set_max)2131 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2132                  vm_prot_t new_prot, boolean_t set_max)
2133 {
2134           vm_map_entry_t current;
2135           vm_map_entry_t entry;
2136           int count;
2137 
2138           count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2139           vm_map_lock(map);
2140 
2141           VM_MAP_RANGE_CHECK(map, start, end);
2142 
2143           if (vm_map_lookup_entry(map, start, &entry)) {
2144                     vm_map_clip_start(map, entry, start, &count);
2145           } else if (entry) {
2146                     entry = vm_map_rb_tree_RB_NEXT(entry);
2147           } else {
2148                     entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2149           }
2150 
2151           /*
2152            * Make a first pass to check for protection violations.
2153            */
2154           current = entry;
2155           while (current && current->ba.start < end) {
2156                     if (current->maptype == VM_MAPTYPE_SUBMAP) {
2157                               vm_map_unlock(map);
2158                               vm_map_entry_release(count);
2159                               return (KERN_INVALID_ARGUMENT);
2160                     }
2161                     if ((new_prot & current->max_protection) != new_prot) {
2162                               vm_map_unlock(map);
2163                               vm_map_entry_release(count);
2164                               return (KERN_PROTECTION_FAILURE);
2165                     }
2166 
2167                     /*
2168                      * When making a SHARED+RW file mmap writable, update
2169                      * v_lastwrite_ts.
2170                      */
2171                     if (new_prot & PROT_WRITE &&
2172                         (current->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
2173                         current->maptype == VM_MAPTYPE_NORMAL &&
2174                         current->ba.object &&
2175                         current->ba.object->type == OBJT_VNODE) {
2176                               struct vnode *vp;
2177 
2178                               vp = current->ba.object->handle;
2179                               if (vp && vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT) == 0) {
2180                                         vfs_timestamp(&vp->v_lastwrite_ts);
2181                                         vsetflags(vp, VLASTWRITETS);
2182                                         vn_unlock(vp);
2183                               }
2184                     }
2185                     current = vm_map_rb_tree_RB_NEXT(current);
2186           }
2187 
2188           /*
2189            * Go back and fix up protections. [Note that clipping is not
2190            * necessary the second time.]
2191            */
2192           current = entry;
2193 
2194           while (current && current->ba.start < end) {
2195                     vm_prot_t old_prot;
2196 
2197                     vm_map_clip_end(map, current, end, &count);
2198 
2199                     old_prot = current->protection;
2200                     if (set_max) {
2201                               current->max_protection = new_prot;
2202                               current->protection = new_prot & old_prot;
2203                     } else {
2204                               current->protection = new_prot;
2205                     }
2206 
2207                     /*
2208                      * Update physical map if necessary. Worry about copy-on-write
2209                      * here -- CHECK THIS XXX
2210                      */
2211                     if (current->protection != old_prot) {
2212 #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2213                                                                       VM_PROT_ALL)
2214 
2215                               pmap_protect(map->pmap, current->ba.start,
2216                                   current->ba.end,
2217                                   current->protection & MASK(current));
2218 #undef    MASK
2219                     }
2220 
2221                     vm_map_simplify_entry(map, current, &count);
2222 
2223                     current = vm_map_rb_tree_RB_NEXT(current);
2224           }
2225           vm_map_unlock(map);
2226           vm_map_entry_release(count);
2227           return (KERN_SUCCESS);
2228 }
2229 
2230 /*
2231  * This routine traverses a processes map handling the madvise
2232  * system call.  Advisories are classified as either those effecting
2233  * the vm_map_entry structure, or those effecting the underlying
2234  * objects.
2235  *
2236  * The <value> argument is used for extended madvise calls.
2237  *
2238  * No requirements.
2239  */
2240 int
vm_map_madvise(vm_map_t map,vm_offset_t start,vm_offset_t end,int behav,off_t value)2241 vm_map_madvise(vm_map_t map, vm_offset_t start, vm_offset_t end,
2242                  int behav, off_t value)
2243 {
2244           vm_map_entry_t current, entry;
2245           int modify_map = 0;
2246           int error = 0;
2247           int count;
2248 
2249           /*
2250            * Some madvise calls directly modify the vm_map_entry, in which case
2251            * we need to use an exclusive lock on the map and we need to perform
2252            * various clipping operations.  Otherwise we only need a read-lock
2253            * on the map.
2254            */
2255           count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2256 
2257           switch(behav) {
2258           case MADV_NORMAL:
2259           case MADV_SEQUENTIAL:
2260           case MADV_RANDOM:
2261           case MADV_NOSYNC:
2262           case MADV_AUTOSYNC:
2263           case MADV_NOCORE:
2264           case MADV_CORE:
2265           case MADV_SETMAP:
2266                     modify_map = 1;
2267                     vm_map_lock(map);
2268                     break;
2269           case MADV_INVAL:
2270           case MADV_WILLNEED:
2271           case MADV_DONTNEED:
2272           case MADV_FREE:
2273                     vm_map_lock_read(map);
2274                     break;
2275           default:
2276                     vm_map_entry_release(count);
2277                     return (EINVAL);
2278           }
2279 
2280           /*
2281            * Locate starting entry and clip if necessary.
2282            */
2283 
2284           VM_MAP_RANGE_CHECK(map, start, end);
2285 
2286           if (vm_map_lookup_entry(map, start, &entry)) {
2287                     if (modify_map)
2288                               vm_map_clip_start(map, entry, start, &count);
2289           } else if (entry) {
2290                     entry = vm_map_rb_tree_RB_NEXT(entry);
2291           } else {
2292                     entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2293           }
2294 
2295           if (modify_map) {
2296                     /*
2297                      * madvise behaviors that are implemented in the vm_map_entry.
2298                      *
2299                      * We clip the vm_map_entry so that behavioral changes are
2300                      * limited to the specified address range.
2301                      */
2302                     for (current = entry;
2303                          current && current->ba.start < end;
2304                          current = vm_map_rb_tree_RB_NEXT(current)) {
2305                               /*
2306                                * Ignore submaps
2307                                */
2308                               if (current->maptype == VM_MAPTYPE_SUBMAP)
2309                                         continue;
2310 
2311                               vm_map_clip_end(map, current, end, &count);
2312 
2313                               switch (behav) {
2314                               case MADV_NORMAL:
2315                                         vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2316                                         break;
2317                               case MADV_SEQUENTIAL:
2318                                         vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2319                                         break;
2320                               case MADV_RANDOM:
2321                                         vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2322                                         break;
2323                               case MADV_NOSYNC:
2324                                         current->eflags |= MAP_ENTRY_NOSYNC;
2325                                         break;
2326                               case MADV_AUTOSYNC:
2327                                         current->eflags &= ~MAP_ENTRY_NOSYNC;
2328                                         break;
2329                               case MADV_NOCORE:
2330                                         current->eflags |= MAP_ENTRY_NOCOREDUMP;
2331                                         break;
2332                               case MADV_CORE:
2333                                         current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2334                                         break;
2335                               case MADV_SETMAP:
2336                                         /*
2337                                          * Set the page directory page for a map
2338                                          * governed by a virtual page table.
2339                                          *
2340                                          * Software virtual page table support has
2341                                          * been removed, this MADV is no longer
2342                                          * supported.
2343                                          */
2344                                         error = EINVAL;
2345                                         break;
2346                               case MADV_INVAL:
2347                                         /*
2348                                          * Invalidate the related pmap entries, used
2349                                          * to flush portions of the real kernel's
2350                                          * pmap when the caller has removed or
2351                                          * modified existing mappings in a virtual
2352                                          * page table.
2353                                          *
2354                                          * (exclusive locked map version does not
2355                                          * need the range interlock).
2356                                          */
2357                                         pmap_remove(map->pmap,
2358                                                       current->ba.start, current->ba.end);
2359                                         break;
2360                               default:
2361                                         error = EINVAL;
2362                                         break;
2363                               }
2364                               vm_map_simplify_entry(map, current, &count);
2365                     }
2366                     vm_map_unlock(map);
2367           } else {
2368                     vm_pindex_t pindex;
2369                     vm_pindex_t delta;
2370 
2371                     /*
2372                      * madvise behaviors that are implemented in the underlying
2373                      * vm_object.
2374                      *
2375                      * Since we don't clip the vm_map_entry, we have to clip
2376                      * the vm_object pindex and count.
2377                      *
2378                      * NOTE!  These functions are only supported on normal maps.
2379                      *
2380                      * NOTE!  These functions only apply to the top-most object.
2381                      *          It is not applicable to backing objects.
2382                      */
2383                     for (current = entry;
2384                          current && current->ba.start < end;
2385                          current = vm_map_rb_tree_RB_NEXT(current)) {
2386                               vm_offset_t useStart;
2387 
2388                               if (current->maptype != VM_MAPTYPE_NORMAL)
2389                                         continue;
2390 
2391                               pindex = OFF_TO_IDX(current->ba.offset);
2392                               delta = atop(current->ba.end - current->ba.start);
2393                               useStart = current->ba.start;
2394 
2395                               if (current->ba.start < start) {
2396                                         pindex += atop(start - current->ba.start);
2397                                         delta -= atop(start - current->ba.start);
2398                                         useStart = start;
2399                               }
2400                               if (current->ba.end > end)
2401                                         delta -= atop(current->ba.end - end);
2402 
2403                               if ((vm_spindex_t)delta <= 0)
2404                                         continue;
2405 
2406                               if (behav == MADV_INVAL) {
2407                                         /*
2408                                          * Invalidate the related pmap entries, used
2409                                          * to flush portions of the real kernel's
2410                                          * pmap when the caller has removed or
2411                                          * modified existing mappings in a virtual
2412                                          * page table.
2413                                          *
2414                                          * (shared locked map version needs the
2415                                          * interlock, see vm_fault()).
2416                                          */
2417                                         struct vm_map_ilock ilock;
2418 
2419                                         KASSERT(useStart >= VM_MIN_USER_ADDRESS &&
2420                                                       useStart + ptoa(delta) <=
2421                                                       VM_MAX_USER_ADDRESS,
2422                                                    ("Bad range %016jx-%016jx (%016jx)",
2423                                                    useStart, useStart + ptoa(delta),
2424                                                    delta));
2425                                         vm_map_interlock(map, &ilock,
2426                                                              useStart,
2427                                                              useStart + ptoa(delta));
2428                                         pmap_remove(map->pmap,
2429                                                       useStart,
2430                                                       useStart + ptoa(delta));
2431                                         vm_map_deinterlock(map, &ilock);
2432                               } else {
2433                                         vm_object_madvise(current->ba.object,
2434                                                               pindex, delta, behav);
2435                               }
2436 
2437                               /*
2438                                * Try to pre-populate the page table.
2439                                */
2440                               if (behav == MADV_WILLNEED) {
2441                                         pmap_object_init_pt(
2442                                             map->pmap, current,
2443                                             useStart,
2444                                             (delta << PAGE_SHIFT),
2445                                             MAP_PREFAULT_MADVISE
2446                                         );
2447                               }
2448                     }
2449                     vm_map_unlock_read(map);
2450           }
2451           vm_map_entry_release(count);
2452           return(error);
2453 }
2454 
2455 
2456 /*
2457  * Sets the inheritance of the specified address range in the target map.
2458  * Inheritance affects how the map will be shared with child maps at the
2459  * time of vm_map_fork.
2460  */
2461 int
vm_map_inherit(vm_map_t map,vm_offset_t start,vm_offset_t end,vm_inherit_t new_inheritance)2462 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2463                  vm_inherit_t new_inheritance)
2464 {
2465           vm_map_entry_t entry;
2466           vm_map_entry_t temp_entry;
2467           int count;
2468 
2469           switch (new_inheritance) {
2470           case VM_INHERIT_NONE:
2471           case VM_INHERIT_COPY:
2472           case VM_INHERIT_SHARE:
2473                     break;
2474           default:
2475                     return (KERN_INVALID_ARGUMENT);
2476           }
2477 
2478           count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2479           vm_map_lock(map);
2480 
2481           VM_MAP_RANGE_CHECK(map, start, end);
2482 
2483           if (vm_map_lookup_entry(map, start, &temp_entry)) {
2484                     entry = temp_entry;
2485                     vm_map_clip_start(map, entry, start, &count);
2486           } else if (temp_entry) {
2487                     entry = vm_map_rb_tree_RB_NEXT(temp_entry);
2488           } else {
2489                     entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
2490           }
2491 
2492           while (entry && entry->ba.start < end) {
2493                     vm_map_clip_end(map, entry, end, &count);
2494 
2495                     entry->inheritance = new_inheritance;
2496 
2497                     vm_map_simplify_entry(map, entry, &count);
2498 
2499                     entry = vm_map_rb_tree_RB_NEXT(entry);
2500           }
2501           vm_map_unlock(map);
2502           vm_map_entry_release(count);
2503           return (KERN_SUCCESS);
2504 }
2505 
2506 /*
2507  * Wiring/Unwiring of memory for user-related operation.
2508  *
2509  * Implement the semantics of mlock
2510  */
2511 int
vm_map_user_wiring(vm_map_t map,vm_offset_t start,vm_offset_t real_end,boolean_t new_pageable)2512 vm_map_user_wiring(vm_map_t map, vm_offset_t start, vm_offset_t real_end,
2513                        boolean_t new_pageable)
2514 {
2515           vm_map_entry_t entry;
2516           vm_map_entry_t start_entry;
2517           vm_offset_t end;
2518           int rv = KERN_SUCCESS;
2519           int count;
2520 
2521           count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2522           vm_map_lock(map);
2523           VM_MAP_RANGE_CHECK(map, start, real_end);
2524           end = real_end;
2525 
2526           start_entry = vm_map_clip_range(map, start, end, &count,
2527                                                   MAP_CLIP_NO_HOLES);
2528           if (start_entry == NULL) {
2529                     vm_map_unlock(map);
2530                     vm_map_entry_release(count);
2531                     return (KERN_INVALID_ADDRESS);
2532           }
2533 
2534           if (new_pageable == 0) {
2535                     entry = start_entry;
2536                     while (entry && entry->ba.start < end) {
2537                               vm_offset_t save_start;
2538                               vm_offset_t save_end;
2539 
2540                               /*
2541                                * Already user wired or hard wired (trivial cases)
2542                                */
2543                               if (entry->eflags & MAP_ENTRY_USER_WIRED) {
2544                                         entry = vm_map_rb_tree_RB_NEXT(entry);
2545                                         continue;
2546                               }
2547                               if (entry->wired_count != 0) {
2548                                         entry->wired_count++;
2549                                         entry->eflags |= MAP_ENTRY_USER_WIRED;
2550                                         entry = vm_map_rb_tree_RB_NEXT(entry);
2551                                         continue;
2552                               }
2553 
2554                               /*
2555                                * A new wiring requires instantiation of appropriate
2556                                * management structures and the faulting in of the
2557                                * page.
2558                                */
2559                               if (entry->maptype == VM_MAPTYPE_NORMAL) {
2560                                         int copyflag = entry->eflags &
2561                                                          MAP_ENTRY_NEEDS_COPY;
2562                                         if (copyflag && ((entry->protection &
2563                                                               VM_PROT_WRITE) != 0)) {
2564                                                   vm_map_entry_shadow(entry);
2565                                         } else if (entry->ba.object == NULL &&
2566                                                      !map->system_map) {
2567                                                   vm_map_entry_allocate_object(entry);
2568                                         }
2569                               }
2570                               entry->wired_count++;
2571                               entry->eflags |= MAP_ENTRY_USER_WIRED;
2572 
2573                               /*
2574                                * Now fault in the area.  Note that vm_fault_wire()
2575                                * may release the map lock temporarily, it will be
2576                                * relocked on return.  The in-transition
2577                                * flag protects the entries.
2578                                */
2579                               save_start = entry->ba.start;
2580                               save_end = entry->ba.end;
2581                               rv = vm_fault_wire(map, entry, TRUE, 0);
2582                               if (rv) {
2583                                         CLIP_CHECK_BACK(entry, save_start);
2584                                         for (;;) {
2585                                                   KASSERT(entry->wired_count == 1, ("bad wired_count on entry"));
2586                                                   entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2587                                                   entry->wired_count = 0;
2588                                                   if (entry->ba.end == save_end)
2589                                                             break;
2590                                                   entry = vm_map_rb_tree_RB_NEXT(entry);
2591                                                   KASSERT(entry,
2592                                                        ("bad entry clip during backout"));
2593                                         }
2594                                         end = save_start;   /* unwire the rest */
2595                                         break;
2596                               }
2597                               /*
2598                                * note that even though the entry might have been
2599                                * clipped, the USER_WIRED flag we set prevents
2600                                * duplication so we do not have to do a
2601                                * clip check.
2602                                */
2603                               entry = vm_map_rb_tree_RB_NEXT(entry);
2604                     }
2605 
2606                     /*
2607                      * If we failed fall through to the unwiring section to
2608                      * unwire what we had wired so far.  'end' has already
2609                      * been adjusted.
2610                      */
2611                     if (rv)
2612                               new_pageable = 1;
2613 
2614                     /*
2615                      * start_entry might have been clipped if we unlocked the
2616                      * map and blocked.  No matter how clipped it has gotten
2617                      * there should be a fragment that is on our start boundary.
2618                      */
2619                     CLIP_CHECK_BACK(start_entry, start);
2620           }
2621 
2622           /*
2623            * Deal with the unwiring case.
2624            */
2625           if (new_pageable) {
2626                     /*
2627                      * This is the unwiring case.  We must first ensure that the
2628                      * range to be unwired is really wired down.  We know there
2629                      * are no holes.
2630                      */
2631                     entry = start_entry;
2632                     while (entry && entry->ba.start < end) {
2633                               if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2634                                         rv = KERN_INVALID_ARGUMENT;
2635                                         goto done;
2636                               }
2637                               KASSERT(entry->wired_count != 0,
2638                                         ("wired count was 0 with USER_WIRED set! %p",
2639                                          entry));
2640                               entry = vm_map_rb_tree_RB_NEXT(entry);
2641                     }
2642 
2643                     /*
2644                      * Now decrement the wiring count for each region. If a region
2645                      * becomes completely unwired, unwire its physical pages and
2646                      * mappings.
2647                      */
2648                     /*
2649                      * The map entries are processed in a loop, checking to
2650                      * make sure the entry is wired and asserting it has a wired
2651                      * count. However, another loop was inserted more-or-less in
2652                      * the middle of the unwiring path. This loop picks up the
2653                      * "entry" loop variable from the first loop without first
2654                      * setting it to start_entry. Naturally, the secound loop
2655                      * is never entered and the pages backing the entries are
2656                      * never unwired. This can lead to a leak of wired pages.
2657                      */
2658                     entry = start_entry;
2659                     while (entry && entry->ba.start < end) {
2660                               KASSERT(entry->eflags & MAP_ENTRY_USER_WIRED,
2661                                         ("expected USER_WIRED on entry %p", entry));
2662                               entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2663                               entry->wired_count--;
2664                               if (entry->wired_count == 0)
2665                                         vm_fault_unwire(map, entry);
2666                               entry = vm_map_rb_tree_RB_NEXT(entry);
2667                     }
2668           }
2669 done:
2670           vm_map_unclip_range(map, start_entry, start, real_end, &count,
2671                                   MAP_CLIP_NO_HOLES);
2672           vm_map_unlock(map);
2673           vm_map_entry_release(count);
2674 
2675           return (rv);
2676 }
2677 
2678 /*
2679  * Wiring/Unwiring of memory for kernel-related operation.
2680  *
2681  * Sets the pageability of the specified address range in the target map.
2682  * Regions specified as not pageable require locked-down physical
2683  * memory and physical page maps.
2684  *
2685  * The map must not be locked, but a reference must remain to the map
2686  * throughout the call.
2687  *
2688  * This function may be called via the zalloc path and must properly
2689  * reserve map entries for kernel_map.
2690  *
2691  * No requirements.
2692  */
2693 int
vm_map_kernel_wiring(vm_map_t map,vm_offset_t start,vm_offset_t real_end,int kmflags)2694 vm_map_kernel_wiring(vm_map_t map, vm_offset_t start,
2695                          vm_offset_t real_end, int kmflags)
2696 {
2697           vm_map_entry_t entry;
2698           vm_map_entry_t start_entry;
2699           vm_offset_t end;
2700           int rv = KERN_SUCCESS;
2701           int count;
2702 
2703           if (kmflags & KM_KRESERVE)
2704                     count = vm_map_entry_kreserve(MAP_RESERVE_COUNT);
2705           else
2706                     count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
2707           vm_map_lock(map);
2708           VM_MAP_RANGE_CHECK(map, start, real_end);
2709           end = real_end;
2710 
2711           start_entry = vm_map_clip_range(map, start, end, &count,
2712                                                   MAP_CLIP_NO_HOLES);
2713           if (start_entry == NULL) {
2714                     vm_map_unlock(map);
2715                     rv = KERN_INVALID_ADDRESS;
2716                     goto failure;
2717           }
2718           if ((kmflags & KM_PAGEABLE) == 0) {
2719                     /*
2720                      * Wiring.
2721                      *
2722                      * 1.  Holding the write lock, we create any shadow or zero-fill
2723                      * objects that need to be created. Then we clip each map
2724                      * entry to the region to be wired and increment its wiring
2725                      * count.  We create objects before clipping the map entries
2726                      * to avoid object proliferation.
2727                      *
2728                      * 2.  We downgrade to a read lock, and call vm_fault_wire to
2729                      * fault in the pages for any newly wired area (wired_count is
2730                      * 1).
2731                      *
2732                      * Downgrading to a read lock for vm_fault_wire avoids a
2733                      * possible deadlock with another process that may have faulted
2734                      * on one of the pages to be wired (it would mark the page busy,
2735                      * blocking us, then in turn block on the map lock that we
2736                      * hold).  Because of problems in the recursive lock package,
2737                      * we cannot upgrade to a write lock in vm_map_lookup.  Thus,
2738                      * any actions that require the write lock must be done
2739                      * beforehand.  Because we keep the read lock on the map, the
2740                      * copy-on-write status of the entries we modify here cannot
2741                      * change.
2742                      */
2743                     entry = start_entry;
2744                     while (entry && entry->ba.start < end) {
2745                               /*
2746                                * Trivial case if the entry is already wired
2747                                */
2748                               if (entry->wired_count) {
2749                                         entry->wired_count++;
2750                                         entry = vm_map_rb_tree_RB_NEXT(entry);
2751                                         continue;
2752                               }
2753 
2754                               /*
2755                                * The entry is being newly wired, we have to setup
2756                                * appropriate management structures.  A shadow
2757                                * object is required for a copy-on-write region,
2758                                * or a normal object for a zero-fill region.  We
2759                                * do not have to do this for entries that point to sub
2760                                * maps because we won't hold the lock on the sub map.
2761                                */
2762                               if (entry->maptype == VM_MAPTYPE_NORMAL) {
2763                                         int copyflag = entry->eflags &
2764                                                          MAP_ENTRY_NEEDS_COPY;
2765                                         if (copyflag && ((entry->protection &
2766                                                               VM_PROT_WRITE) != 0)) {
2767                                                   vm_map_entry_shadow(entry);
2768                                         } else if (entry->ba.object == NULL &&
2769                                                      !map->system_map) {
2770                                                   vm_map_entry_allocate_object(entry);
2771                                         }
2772                               }
2773                               entry->wired_count++;
2774                               entry = vm_map_rb_tree_RB_NEXT(entry);
2775                     }
2776 
2777                     /*
2778                      * Pass 2.
2779                      */
2780 
2781                     /*
2782                      * HACK HACK HACK HACK
2783                      *
2784                      * vm_fault_wire() temporarily unlocks the map to avoid
2785                      * deadlocks.  The in-transition flag from vm_map_clip_range
2786                      * call should protect us from changes while the map is
2787                      * unlocked.  T
2788                      *
2789                      * NOTE: Previously this comment stated that clipping might
2790                      *         still occur while the entry is unlocked, but from
2791                      *         what I can tell it actually cannot.
2792                      *
2793                      *         It is unclear whether the CLIP_CHECK_*() calls
2794                      *         are still needed but we keep them in anyway.
2795                      *
2796                      * HACK HACK HACK HACK
2797                      */
2798 
2799                     entry = start_entry;
2800                     while (entry && entry->ba.start < end) {
2801                               /*
2802                                * If vm_fault_wire fails for any page we need to undo
2803                                * what has been done.  We decrement the wiring count
2804                                * for those pages which have not yet been wired (now)
2805                                * and unwire those that have (later).
2806                                */
2807                               vm_offset_t save_start = entry->ba.start;
2808                               vm_offset_t save_end = entry->ba.end;
2809 
2810                               if (entry->wired_count == 1)
2811                                         rv = vm_fault_wire(map, entry, FALSE, kmflags);
2812                               if (rv) {
2813                                         CLIP_CHECK_BACK(entry, save_start);
2814                                         for (;;) {
2815                                                   KASSERT(entry->wired_count == 1,
2816                                                     ("wired_count changed unexpectedly"));
2817                                                   entry->wired_count = 0;
2818                                                   if (entry->ba.end == save_end)
2819                                                             break;
2820                                                   entry = vm_map_rb_tree_RB_NEXT(entry);
2821                                                   KASSERT(entry,
2822                                                     ("bad entry clip during backout"));
2823                                         }
2824                                         end = save_start;
2825                                         break;
2826                               }
2827                               CLIP_CHECK_FWD(entry, save_end);
2828                               entry = vm_map_rb_tree_RB_NEXT(entry);
2829                     }
2830 
2831                     /*
2832                      * If a failure occured undo everything by falling through
2833                      * to the unwiring code.  'end' has already been adjusted
2834                      * appropriately.
2835                      */
2836                     if (rv)
2837                               kmflags |= KM_PAGEABLE;
2838 
2839                     /*
2840                      * start_entry is still IN_TRANSITION but may have been
2841                      * clipped since vm_fault_wire() unlocks and relocks the
2842                      * map.  No matter how clipped it has gotten there should
2843                      * be a fragment that is on our start boundary.
2844                      */
2845                     CLIP_CHECK_BACK(start_entry, start);
2846           }
2847 
2848           if (kmflags & KM_PAGEABLE) {
2849                     /*
2850                      * This is the unwiring case.  We must first ensure that the
2851                      * range to be unwired is really wired down.  We know there
2852                      * are no holes.
2853                      */
2854                     entry = start_entry;
2855                     while (entry && entry->ba.start < end) {
2856                               if (entry->wired_count == 0) {
2857                                         rv = KERN_INVALID_ARGUMENT;
2858                                         goto done;
2859                               }
2860                               entry = vm_map_rb_tree_RB_NEXT(entry);
2861                     }
2862 
2863                     /*
2864                      * Now decrement the wiring count for each region. If a region
2865                      * becomes completely unwired, unwire its physical pages and
2866                      * mappings.
2867                      */
2868                     entry = start_entry;
2869                     while (entry && entry->ba.start < end) {
2870                               entry->wired_count--;
2871                               if (entry->wired_count == 0)
2872                                         vm_fault_unwire(map, entry);
2873                               entry = vm_map_rb_tree_RB_NEXT(entry);
2874                     }
2875           }
2876 done:
2877           vm_map_unclip_range(map, start_entry, start, real_end,
2878                                   &count, MAP_CLIP_NO_HOLES);
2879           vm_map_unlock(map);
2880 failure:
2881           if (kmflags & KM_KRESERVE)
2882                     vm_map_entry_krelease(count);
2883           else
2884                     vm_map_entry_release(count);
2885           return (rv);
2886 }
2887 
2888 /*
2889  * Mark a newly allocated address range as wired but do not fault in
2890  * the pages.  The caller is expected to load the pages into the object.
2891  *
2892  * The map must be locked on entry and will remain locked on return.
2893  * No other requirements.
2894  */
2895 void
vm_map_set_wired_quick(vm_map_t map,vm_offset_t addr,vm_size_t size,int * countp)2896 vm_map_set_wired_quick(vm_map_t map, vm_offset_t addr, vm_size_t size,
2897                            int *countp)
2898 {
2899           vm_map_entry_t scan;
2900           vm_map_entry_t entry;
2901 
2902           entry = vm_map_clip_range(map, addr, addr + size,
2903                                           countp, MAP_CLIP_NO_HOLES);
2904           scan = entry;
2905           while (scan && scan->ba.start < addr + size) {
2906                     KKASSERT(scan->wired_count == 0);
2907                     scan->wired_count = 1;
2908                     scan = vm_map_rb_tree_RB_NEXT(scan);
2909           }
2910           vm_map_unclip_range(map, entry, addr, addr + size,
2911                                   countp, MAP_CLIP_NO_HOLES);
2912 }
2913 
2914 /*
2915  * Push any dirty cached pages in the address range to their pager.
2916  * If syncio is TRUE, dirty pages are written synchronously.
2917  * If invalidate is TRUE, any cached pages are freed as well.
2918  *
2919  * This routine is called by sys_msync()
2920  *
2921  * Returns an error if any part of the specified range is not mapped.
2922  *
2923  * No requirements.
2924  */
2925 int
vm_map_clean(vm_map_t map,vm_offset_t start,vm_offset_t end,boolean_t syncio,boolean_t invalidate)2926 vm_map_clean(vm_map_t map, vm_offset_t start, vm_offset_t end,
2927                boolean_t syncio, boolean_t invalidate)
2928 {
2929           vm_map_entry_t current;
2930           vm_map_entry_t next;
2931           vm_map_entry_t entry;
2932           vm_map_backing_t ba;
2933           vm_size_t size;
2934           vm_object_t object;
2935           vm_ooffset_t offset;
2936 
2937           vm_map_lock_read(map);
2938           VM_MAP_RANGE_CHECK(map, start, end);
2939           if (!vm_map_lookup_entry(map, start, &entry)) {
2940                     vm_map_unlock_read(map);
2941                     return (KERN_INVALID_ADDRESS);
2942           }
2943           lwkt_gettoken(&map->token);
2944 
2945           /*
2946            * Make a first pass to check for holes.
2947            */
2948           current = entry;
2949           while (current && current->ba.start < end) {
2950                     if (current->maptype == VM_MAPTYPE_SUBMAP) {
2951                               lwkt_reltoken(&map->token);
2952                               vm_map_unlock_read(map);
2953                               return (KERN_INVALID_ARGUMENT);
2954                     }
2955                     next = vm_map_rb_tree_RB_NEXT(current);
2956                     if (end > current->ba.end &&
2957                         (next == NULL ||
2958                          current->ba.end != next->ba.start)) {
2959                               lwkt_reltoken(&map->token);
2960                               vm_map_unlock_read(map);
2961                               return (KERN_INVALID_ADDRESS);
2962                     }
2963                     current = next;
2964           }
2965 
2966           if (invalidate)
2967                     pmap_remove(vm_map_pmap(map), start, end);
2968 
2969           /*
2970            * Make a second pass, cleaning/uncaching pages from the indicated
2971            * objects as we go.
2972            */
2973           current = entry;
2974           while (current && current->ba.start < end) {
2975                     offset = current->ba.offset + (start - current->ba.start);
2976                     size = (end <= current->ba.end ? end : current->ba.end) - start;
2977 
2978                     switch(current->maptype) {
2979                     case VM_MAPTYPE_SUBMAP:
2980                     {
2981                               vm_map_t smap;
2982                               vm_map_entry_t tentry;
2983                               vm_size_t tsize;
2984 
2985                               smap = current->ba.sub_map;
2986                               vm_map_lock_read(smap);
2987                               vm_map_lookup_entry(smap, offset, &tentry);
2988                               if (tentry == NULL) {
2989                                         tsize = vm_map_max(smap) - offset;
2990                                         ba = NULL;
2991                                         offset = 0 + (offset - vm_map_min(smap));
2992                               } else {
2993                                         tsize = tentry->ba.end - offset;
2994                                         ba = &tentry->ba;
2995                                         offset = tentry->ba.offset +
2996                                                    (offset - tentry->ba.start);
2997                               }
2998                               vm_map_unlock_read(smap);
2999                               if (tsize < size)
3000                                         size = tsize;
3001                               break;
3002                     }
3003                     case VM_MAPTYPE_NORMAL:
3004                               ba = &current->ba;
3005                               break;
3006                     default:
3007                               ba = NULL;
3008                               break;
3009                     }
3010                     if (ba) {
3011                               object = ba->object;
3012                               if (object)
3013                                         vm_object_hold(object);
3014                     } else {
3015                               object = NULL;
3016                     }
3017 
3018                     /*
3019                      * Note that there is absolutely no sense in writing out
3020                      * anonymous objects, so we track down the vnode object
3021                      * to write out.
3022                      * We invalidate (remove) all pages from the address space
3023                      * anyway, for semantic correctness.
3024                      *
3025                      * note: certain anonymous maps, such as MAP_NOSYNC maps,
3026                      * may start out with a NULL object.
3027                      *
3028                      * XXX do we really want to stop at the first backing store
3029                      * here if there are more? XXX
3030                      */
3031                     if (ba) {
3032                               vm_object_t tobj;
3033 
3034                               tobj = object;
3035                               while (ba->backing_ba != NULL) {
3036                                         offset -= ba->offset;
3037                                         ba = ba->backing_ba;
3038                                         offset += ba->offset;
3039                                         tobj = ba->object;
3040                                         if (tobj->size < OFF_TO_IDX(offset + size))
3041                                                   size = IDX_TO_OFF(tobj->size) - offset;
3042                                         break; /* XXX this break is not correct */
3043                               }
3044                               if (object != tobj) {
3045                                         if (object)
3046                                                   vm_object_drop(object);
3047                                         object = tobj;
3048                                         vm_object_hold(object);
3049                               }
3050                     }
3051 
3052                     if (object && (object->type == OBJT_VNODE) &&
3053                         (current->protection & VM_PROT_WRITE) &&
3054                         (object->flags & OBJ_NOMSYNC) == 0) {
3055                               /*
3056                                * Flush pages if writing is allowed, invalidate them
3057                                * if invalidation requested.  Pages undergoing I/O
3058                                * will be ignored by vm_object_page_remove().
3059                                *
3060                                * We cannot lock the vnode and then wait for paging
3061                                * to complete without deadlocking against vm_fault.
3062                                * Instead we simply call vm_object_page_remove() and
3063                                * allow it to block internally on a page-by-page
3064                                * basis when it encounters pages undergoing async
3065                                * I/O.
3066                                */
3067                               int flags;
3068 
3069                               /* no chain wait needed for vnode objects */
3070                               vm_object_reference_locked(object);
3071                               vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY);
3072                               flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
3073                               flags |= invalidate ? OBJPC_INVAL : 0;
3074 
3075                               if (current->maptype == VM_MAPTYPE_NORMAL) {
3076                                         vm_object_page_clean(object,
3077                                             OFF_TO_IDX(offset),
3078                                             OFF_TO_IDX(offset + size + PAGE_MASK),
3079                                             flags);
3080                               }
3081                               vn_unlock(((struct vnode *)object->handle));
3082                               vm_object_deallocate_locked(object);
3083                     }
3084                     if (object && invalidate &&
3085                        ((object->type == OBJT_VNODE) ||
3086                         (object->type == OBJT_DEVICE) ||
3087                         (object->type == OBJT_MGTDEVICE))) {
3088                               int clean_only =
3089                                         ((object->type == OBJT_DEVICE) ||
3090                                         (object->type == OBJT_MGTDEVICE)) ? FALSE : TRUE;
3091                               /* no chain wait needed for vnode/device objects */
3092                               vm_object_reference_locked(object);
3093                               if (current->maptype == VM_MAPTYPE_NORMAL) {
3094                                         vm_object_page_remove(object,
3095                                             OFF_TO_IDX(offset),
3096                                             OFF_TO_IDX(offset + size + PAGE_MASK),
3097                                             clean_only);
3098                               }
3099                               vm_object_deallocate_locked(object);
3100                     }
3101                     start += size;
3102                     if (object)
3103                               vm_object_drop(object);
3104                     current = vm_map_rb_tree_RB_NEXT(current);
3105           }
3106 
3107           lwkt_reltoken(&map->token);
3108           vm_map_unlock_read(map);
3109 
3110           return (KERN_SUCCESS);
3111 }
3112 
3113 /*
3114  * Make the region specified by this entry pageable.
3115  *
3116  * The vm_map must be exclusively locked.
3117  */
3118 static void
vm_map_entry_unwire(vm_map_t map,vm_map_entry_t entry)3119 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3120 {
3121           entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3122           entry->wired_count = 0;
3123           vm_fault_unwire(map, entry);
3124 }
3125 
3126 /*
3127  * Deallocate the given entry from the target map.
3128  *
3129  * The vm_map must be exclusively locked.
3130  */
3131 static void
vm_map_entry_delete(vm_map_t map,vm_map_entry_t entry,int * countp)3132 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry, int *countp)
3133 {
3134           vm_map_entry_unlink(map, entry);
3135           map->size -= entry->ba.end - entry->ba.start;
3136           vm_map_entry_dispose(map, entry, countp);
3137 }
3138 
3139 /*
3140  * Deallocates the given address range from the target map.
3141  *
3142  * The vm_map must be exclusively locked.
3143  */
3144 int
vm_map_delete(vm_map_t map,vm_offset_t start,vm_offset_t end,int * countp)3145 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end, int *countp)
3146 {
3147           vm_object_t object;
3148           vm_map_entry_t entry;
3149           vm_map_entry_t first_entry;
3150           vm_offset_t hole_start;
3151 
3152           ASSERT_VM_MAP_LOCKED(map);
3153           lwkt_gettoken(&map->token);
3154 again:
3155           /*
3156            * Find the start of the region, and clip it.  Set entry to point
3157            * at the first record containing the requested address or, if no
3158            * such record exists, the next record with a greater address.  The
3159            * loop will run from this point until a record beyond the termination
3160            * address is encountered.
3161            *
3162            * Adjust freehint[] for either the clip case or the extension case.
3163            *
3164            * GGG see other GGG comment.
3165            */
3166           if (vm_map_lookup_entry(map, start, &first_entry)) {
3167                     entry = first_entry;
3168                     vm_map_clip_start(map, entry, start, countp);
3169                     hole_start = start;
3170           } else {
3171                     if (first_entry) {
3172                               entry = vm_map_rb_tree_RB_NEXT(first_entry);
3173                               if (entry == NULL)
3174                                         hole_start = first_entry->ba.start;
3175                               else
3176                                         hole_start = first_entry->ba.end;
3177                     } else {
3178                               entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
3179                               if (entry == NULL)
3180                                         hole_start = vm_map_min(map);
3181                               else
3182                                         hole_start = vm_map_max(map);
3183                     }
3184           }
3185 
3186           /*
3187            * Step through all entries in this region
3188            */
3189           while (entry && entry->ba.start < end) {
3190                     vm_map_entry_t next;
3191                     vm_offset_t s, e;
3192                     vm_pindex_t offidxstart, offidxend, count;
3193 
3194                     /*
3195                      * If we hit an in-transition entry we have to sleep and
3196                      * retry.  It's easier (and not really slower) to just retry
3197                      * since this case occurs so rarely and the hint is already
3198                      * pointing at the right place.  We have to reset the
3199                      * start offset so as not to accidently delete an entry
3200                      * another process just created in vacated space.
3201                      */
3202                     if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3203                               entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3204                               start = entry->ba.start;
3205                               ++mycpu->gd_cnt.v_intrans_coll;
3206                               ++mycpu->gd_cnt.v_intrans_wait;
3207                               vm_map_transition_wait(map, 1);
3208                               goto again;
3209                     }
3210                     vm_map_clip_end(map, entry, end, countp);
3211 
3212                     s = entry->ba.start;
3213                     e = entry->ba.end;
3214                     next = vm_map_rb_tree_RB_NEXT(entry);
3215 
3216                     offidxstart = OFF_TO_IDX(entry->ba.offset);
3217                     count = OFF_TO_IDX(e - s);
3218 
3219                     switch(entry->maptype) {
3220                     case VM_MAPTYPE_NORMAL:
3221                     case VM_MAPTYPE_SUBMAP:
3222                               object = entry->ba.object;
3223                               break;
3224                     default:
3225                               object = NULL;
3226                               break;
3227                     }
3228 
3229                     /*
3230                      * Unwire before removing addresses from the pmap; otherwise,
3231                      * unwiring will put the entries back in the pmap.
3232                      *
3233                      * Generally speaking, doing a bulk pmap_remove() before
3234                      * removing the pages from the VM object is better at
3235                      * reducing unnecessary IPIs.  The pmap code is now optimized
3236                      * to not blindly iterate the range when pt and pd pages
3237                      * are missing.
3238                      */
3239                     if (entry->wired_count != 0)
3240                               vm_map_entry_unwire(map, entry);
3241 
3242                     offidxend = offidxstart + count;
3243 
3244                     if (object == kernel_object) {
3245                               pmap_remove(map->pmap, s, e);
3246                               vm_object_hold(object);
3247                               vm_object_page_remove(object, offidxstart,
3248                                                         offidxend, FALSE);
3249                               vm_object_drop(object);
3250                     } else if (object && object->type != OBJT_DEFAULT &&
3251                                  object->type != OBJT_SWAP) {
3252                               /*
3253                                * vnode object routines cannot be chain-locked,
3254                                * but since we aren't removing pages from the
3255                                * object here we can use a shared hold.
3256                                */
3257                               vm_object_hold_shared(object);
3258                               pmap_remove(map->pmap, s, e);
3259                               vm_object_drop(object);
3260                     } else if (object) {
3261                               vm_object_hold(object);
3262                               pmap_remove(map->pmap, s, e);
3263 
3264                               if (object != NULL &&
3265                                   object->ref_count != 1 &&
3266                                   (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) ==
3267                                    OBJ_ONEMAPPING &&
3268                                   (object->type == OBJT_DEFAULT ||
3269                                    object->type == OBJT_SWAP)) {
3270                                         /*
3271                                          * When ONEMAPPING is set we can destroy the
3272                                          * pages underlying the entry's range.
3273                                          */
3274                                         vm_object_page_remove(object, offidxstart,
3275                                                                   offidxend, FALSE);
3276                                         if (object->type == OBJT_SWAP) {
3277                                                   swap_pager_freespace(object,
3278                                                                            offidxstart,
3279                                                                            count);
3280                                         }
3281                                         if (offidxend >= object->size &&
3282                                             offidxstart < object->size) {
3283                                                   object->size = offidxstart;
3284                                         }
3285                               }
3286                               vm_object_drop(object);
3287                     } else if (entry->maptype == VM_MAPTYPE_UKSMAP) {
3288                               pmap_remove(map->pmap, s, e);
3289                     }
3290 
3291                     /*
3292                      * Delete the entry (which may delete the object) only after
3293                      * removing all pmap entries pointing to its pages.
3294                      * (Otherwise, its page frames may be reallocated, and any
3295                      * modify bits will be set in the wrong object!)
3296                      */
3297                     vm_map_entry_delete(map, entry, countp);
3298                     entry = next;
3299           }
3300 
3301           /*
3302            * We either reached the end and use vm_map_max as the end
3303            * address, or we didn't and we use the next entry as the
3304            * end address.
3305            */
3306           if (entry == NULL) {
3307                     vm_map_freehint_hole(map, hole_start,
3308                                              vm_map_max(map) - hole_start);
3309           } else {
3310                     vm_map_freehint_hole(map, hole_start,
3311                                              entry->ba.start - hole_start);
3312           }
3313 
3314           lwkt_reltoken(&map->token);
3315 
3316           return (KERN_SUCCESS);
3317 }
3318 
3319 /*
3320  * Remove the given address range from the target map.
3321  * This is the exported form of vm_map_delete.
3322  *
3323  * No requirements.
3324  */
3325 int
vm_map_remove(vm_map_t map,vm_offset_t start,vm_offset_t end)3326 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3327 {
3328           int result;
3329           int count;
3330 
3331           count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3332           vm_map_lock(map);
3333           VM_MAP_RANGE_CHECK(map, start, end);
3334           result = vm_map_delete(map, start, end, &count);
3335           vm_map_unlock(map);
3336           vm_map_entry_release(count);
3337 
3338           return (result);
3339 }
3340 
3341 /*
3342  * Assert that the target map allows the specified privilege on the
3343  * entire address region given.  The entire region must be allocated.
3344  *
3345  * The caller must specify whether the vm_map is already locked or not.
3346  */
3347 boolean_t
vm_map_check_protection(vm_map_t map,vm_offset_t start,vm_offset_t end,vm_prot_t protection,boolean_t have_lock)3348 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3349                               vm_prot_t protection, boolean_t have_lock)
3350 {
3351           vm_map_entry_t entry;
3352           vm_map_entry_t tmp_entry;
3353           boolean_t result;
3354 
3355           if (have_lock == FALSE)
3356                     vm_map_lock_read(map);
3357 
3358           if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
3359                     if (have_lock == FALSE)
3360                               vm_map_unlock_read(map);
3361                     return (FALSE);
3362           }
3363           entry = tmp_entry;
3364 
3365           result = TRUE;
3366           while (start < end) {
3367                     if (entry == NULL) {
3368                               result = FALSE;
3369                               break;
3370                     }
3371 
3372                     /*
3373                      * No holes allowed!
3374                      */
3375 
3376                     if (start < entry->ba.start) {
3377                               result = FALSE;
3378                               break;
3379                     }
3380                     /*
3381                      * Check protection associated with entry.
3382                      */
3383 
3384                     if ((entry->protection & protection) != protection) {
3385                               result = FALSE;
3386                               break;
3387                     }
3388                     /* go to next entry */
3389                     start = entry->ba.end;
3390                     entry = vm_map_rb_tree_RB_NEXT(entry);
3391           }
3392           if (have_lock == FALSE)
3393                     vm_map_unlock_read(map);
3394           return (result);
3395 }
3396 
3397 /*
3398  * vm_map_backing structures are not shared across forks and must be
3399  * replicated.
3400  *
3401  * Generally speaking we must reallocate the backing_ba sequence and
3402  * also adjust it for any changes made to the base entry->ba.start and
3403  * entry->ba.end.  The first ba in the chain is of course &entry->ba,
3404  * so we only need to adjust subsequent ba's start, end, and offset.
3405  *
3406  * MAP_BACK_CLIPPED - Called as part of a clipping replication.
3407  *                              Do not clear OBJ_ONEMAPPING.
3408  *
3409  * MAP_BACK_BASEOBJREFD - Called from vm_map_insert().  The base object
3410  *                              has already been referenced.
3411  */
3412 static
3413 void
vm_map_backing_replicated(vm_map_t map,vm_map_entry_t entry,int flags)3414 vm_map_backing_replicated(vm_map_t map, vm_map_entry_t entry, int flags)
3415 {
3416           vm_map_backing_t ba;
3417           vm_map_backing_t nba;
3418           vm_object_t object;
3419 
3420           ba = &entry->ba;
3421           for (;;) {
3422                     ba->pmap = map->pmap;
3423 
3424                     if (ba->map_object) {
3425                               switch(entry->maptype) {
3426                               case VM_MAPTYPE_NORMAL:
3427                                         object = ba->object;
3428                                         if (ba != &entry->ba ||
3429                                             (flags & MAP_BACK_BASEOBJREFD) == 0) {
3430                                                   vm_object_reference_quick(object);
3431                                         }
3432                                         vm_map_backing_attach(entry, ba);
3433                                         if ((flags & MAP_BACK_CLIPPED) == 0 &&
3434                                             object->ref_count > 1) {
3435                                                   vm_object_clear_flag(object,
3436                                                                            OBJ_ONEMAPPING);
3437                                         }
3438                                         break;
3439                               case VM_MAPTYPE_UKSMAP:
3440                                         vm_map_backing_attach(entry, ba);
3441                                         break;
3442                               default:
3443                                         break;
3444                               }
3445                     }
3446                     if (ba->backing_ba == NULL)
3447                               break;
3448 
3449                     /*
3450                      * NOTE: The aux_info field is retained.
3451                      */
3452                     nba = kmalloc(sizeof(*nba), M_MAP_BACKING, M_INTWAIT);
3453                     *nba = *ba->backing_ba;
3454                     nba->offset += (ba->start - nba->start);  /* += (new - old) */
3455                     nba->start = ba->start;
3456                     nba->end = ba->end;
3457                     ba->backing_ba = nba;
3458                     ba = nba;
3459                     /* pmap is replaced at the top of the loop */
3460           }
3461 }
3462 
3463 static
3464 void
vm_map_backing_adjust_start(vm_map_entry_t entry,vm_ooffset_t start)3465 vm_map_backing_adjust_start(vm_map_entry_t entry, vm_ooffset_t start)
3466 {
3467           vm_map_backing_t ba;
3468 
3469           if (entry->maptype == VM_MAPTYPE_NORMAL) {
3470                     for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3471                               if (ba->object) {
3472                                         lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3473                                         ba->offset += (start - ba->start);
3474                                         ba->start = start;
3475                                         lockmgr(&ba->object->backing_lk, LK_RELEASE);
3476                               } else {
3477                                         ba->offset += (start - ba->start);
3478                                         ba->start = start;
3479                               }
3480                     }
3481           } else {
3482                     /* not an object and can't be shadowed */
3483           }
3484 }
3485 
3486 static
3487 void
vm_map_backing_adjust_end(vm_map_entry_t entry,vm_ooffset_t end)3488 vm_map_backing_adjust_end(vm_map_entry_t entry, vm_ooffset_t end)
3489 {
3490           vm_map_backing_t ba;
3491 
3492           if (entry->maptype == VM_MAPTYPE_NORMAL) {
3493                     for (ba = &entry->ba; ba; ba = ba->backing_ba) {
3494                               if (ba->object) {
3495                                         lockmgr(&ba->object->backing_lk, LK_EXCLUSIVE);
3496                                         ba->end = end;
3497                                         lockmgr(&ba->object->backing_lk, LK_RELEASE);
3498                               } else {
3499                                         ba->end = end;
3500                               }
3501                     }
3502           } /* else not an object and/or can't be shadowed */
3503 }
3504 
3505 /*
3506  * Handles the dirty work of making src_entry and dst_entry copy-on-write
3507  * after src_entry has been cloned to dst_entry.  For normal entries only.
3508  *
3509  * The vm_maps must be exclusively locked.
3510  * The vm_map's token must be held.
3511  *
3512  * Because the maps are locked no faults can be in progress during the
3513  * operation.
3514  */
3515 static void
vm_map_copy_entry(vm_map_t src_map,vm_map_t dst_map,vm_map_entry_t src_entry,vm_map_entry_t dst_entry)3516 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
3517                       vm_map_entry_t src_entry, vm_map_entry_t dst_entry)
3518 {
3519           vm_object_t obj;
3520 
3521           KKASSERT(dst_entry->maptype == VM_MAPTYPE_NORMAL);
3522 
3523           if (src_entry->wired_count) {
3524                     /*
3525                      * Of course, wired down pages can't be set copy-on-write.
3526                      * Cause wired pages to be copied into the new map by
3527                      * simulating faults (the new pages are pageable)
3528                      *
3529                      * Scrap ba.object (its ref-count has not yet been adjusted
3530                      * so we can just NULL out the field).  Remove the backing
3531                      * store.
3532                      *
3533                      * Then call vm_fault_copy_entry() to create a new object
3534                      * in dst_entry and copy the wired pages from src to dst.
3535                      *
3536                      * The fault-copy code doesn't work with virtual page
3537                      * tables.
3538                      *
3539                      * NOTE: obj is not actually an object for all MAPTYPEs,
3540                      *         just test against NULL.
3541                      */
3542                     if (dst_entry->ba.map_object != NULL) {
3543                               vm_map_backing_detach(dst_entry, &dst_entry->ba);
3544                               dst_entry->ba.map_object = NULL;
3545                               vm_map_entry_dispose_ba(dst_entry,
3546                                                             dst_entry->ba.backing_ba);
3547                               dst_entry->ba.backing_ba = NULL;
3548                               dst_entry->ba.backing_count = 0;
3549                     }
3550                     vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
3551           } else {
3552                     if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
3553                               /*
3554                                * If the source entry is not already marked NEEDS_COPY
3555                                * we need to write-protect the PTEs.
3556                                */
3557                               pmap_protect(src_map->pmap,
3558                                              src_entry->ba.start,
3559                                              src_entry->ba.end,
3560                                              src_entry->protection & ~VM_PROT_WRITE);
3561                     }
3562 
3563                     /*
3564                      * dst_entry.ba_object might be stale.  Update it (its
3565                      * ref-count has not yet been updated so just overwrite
3566                      * the field).
3567                      *
3568                      * If there is no object then we are golden.  Also, in
3569                      * this situation if there are no backing_ba linkages then
3570                      * we can set ba.offset to whatever we want.  For now we
3571                      * set the offset for 0 for make debugging object sizes
3572                      * easier.
3573                      */
3574                     obj = src_entry->ba.object;
3575 
3576                     if (obj) {
3577                               src_entry->eflags |= (MAP_ENTRY_COW |
3578                                                         MAP_ENTRY_NEEDS_COPY);
3579                               dst_entry->eflags |= (MAP_ENTRY_COW |
3580                                                         MAP_ENTRY_NEEDS_COPY);
3581                               KKASSERT(dst_entry->ba.offset == src_entry->ba.offset);
3582                     } else {
3583                               dst_entry->ba.offset = 0;
3584                     }
3585 
3586                     /*
3587                      * Normal, allow the backing_ba link depth to
3588                      * increase.
3589                      */
3590                     pmap_copy(dst_map->pmap, src_map->pmap,
3591                                 dst_entry->ba.start,
3592                                 dst_entry->ba.end - dst_entry->ba.start,
3593                                 src_entry->ba.start);
3594           }
3595 }
3596 
3597 /*
3598  * Create a vmspace for a new process and its related vm_map based on an
3599  * existing vmspace.  The new map inherits information from the old map
3600  * according to inheritance settings.
3601  *
3602  * The source map must not be locked.
3603  * No requirements.
3604  */
3605 static void vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3606                                 vm_map_entry_t old_entry, int *countp);
3607 static void vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3608                                 vm_map_t old_map, vm_map_t new_map,
3609                                 vm_map_entry_t old_entry, int *countp);
3610 
3611 struct vmspace *
vmspace_fork(struct vmspace * vm1,struct proc * p2,struct lwp * lp2)3612 vmspace_fork(struct vmspace *vm1, struct proc *p2, struct lwp *lp2)
3613 {
3614           struct vmspace *vm2;
3615           vm_map_t old_map = &vm1->vm_map;
3616           vm_map_t new_map;
3617           vm_map_entry_t old_entry;
3618           int count;
3619 
3620           lwkt_gettoken(&vm1->vm_map.token);
3621           vm_map_lock(old_map);
3622 
3623           vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map));
3624           lwkt_gettoken(&vm2->vm_map.token);
3625 
3626           /*
3627            * We must bump the timestamp to force any concurrent fault
3628            * to retry.
3629            */
3630           bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy,
3631                 (caddr_t)&vm1->vm_endcopy - (caddr_t)&vm1->vm_startcopy);
3632           new_map = &vm2->vm_map;       /* XXX */
3633           new_map->timestamp = 1;
3634 
3635           vm_map_lock(new_map);
3636 
3637           count = old_map->nentries;
3638           count = vm_map_entry_reserve(count + MAP_RESERVE_COUNT);
3639 
3640           RB_FOREACH(old_entry, vm_map_rb_tree, &old_map->rb_root) {
3641                     switch(old_entry->maptype) {
3642                     case VM_MAPTYPE_SUBMAP:
3643                               panic("vm_map_fork: encountered a submap");
3644                               break;
3645                     case VM_MAPTYPE_UKSMAP:
3646                               vmspace_fork_uksmap_entry(p2, lp2,
3647                                                               old_map, new_map,
3648                                                               old_entry, &count);
3649                               break;
3650                     case VM_MAPTYPE_NORMAL:
3651                               vmspace_fork_normal_entry(old_map, new_map,
3652                                                               old_entry, &count);
3653                               break;
3654                     default:
3655                               /* nothing to do */
3656                               break;
3657                     }
3658           }
3659 
3660           new_map->size = old_map->size;
3661           vm_map_unlock(new_map);
3662           vm_map_unlock(old_map);
3663           vm_map_entry_release(count);
3664 
3665           lwkt_reltoken(&vm2->vm_map.token);
3666           lwkt_reltoken(&vm1->vm_map.token);
3667 
3668           return (vm2);
3669 }
3670 
3671 static
3672 void
vmspace_fork_normal_entry(vm_map_t old_map,vm_map_t new_map,vm_map_entry_t old_entry,int * countp)3673 vmspace_fork_normal_entry(vm_map_t old_map, vm_map_t new_map,
3674                                 vm_map_entry_t old_entry, int *countp)
3675 {
3676           vm_map_entry_t new_entry;
3677           vm_map_backing_t ba;
3678           vm_object_t object;
3679 
3680           /*
3681            * If the backing_ba link list gets too long then fault it
3682            * all into the head object and dispose of the list.  We do
3683            * this in old_entry prior to cloning in order to benefit both
3684            * parent and child.
3685            *
3686            * We can test our fronting object's size against its
3687            * resident_page_count for a really cheap (but probably not perfect)
3688            * all-shadowed test, allowing us to disconnect the backing_ba
3689            * link list early.
3690            */
3691           object = old_entry->ba.object;
3692           if (old_entry->ba.backing_ba &&
3693               (old_entry->ba.backing_count >= vm_map_backing_limit ||
3694                (vm_map_backing_shadow_test && object &&
3695                 object->size == object->resident_page_count))) {
3696                     /*
3697                      * If there are too many backing_ba linkages we
3698                      * collapse everything into the head
3699                      *
3700                      * This will also remove all the pte's.
3701                      */
3702                     if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY)
3703                               vm_map_entry_shadow(old_entry);
3704                     if (object == NULL)
3705                               vm_map_entry_allocate_object(old_entry);
3706                     if (vm_fault_collapse(old_map, old_entry) == KERN_SUCCESS) {
3707                               ba = old_entry->ba.backing_ba;
3708                               old_entry->ba.backing_ba = NULL;
3709                               old_entry->ba.backing_count = 0;
3710                               vm_map_entry_dispose_ba(old_entry, ba);
3711                     }
3712           }
3713           object = NULL;      /* object variable is now invalid */
3714 
3715           /*
3716            * Fork the entry
3717            */
3718           switch (old_entry->inheritance) {
3719           case VM_INHERIT_NONE:
3720                     break;
3721           case VM_INHERIT_SHARE:
3722                     /*
3723                      * Clone the entry as a shared entry.  This will look like
3724                      * shared memory across the old and the new process.  We must
3725                      * ensure that the object is allocated.
3726                      */
3727                     if (old_entry->ba.object == NULL)
3728                               vm_map_entry_allocate_object(old_entry);
3729 
3730                     if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3731                               /*
3732                                * Create the fronting vm_map_backing for
3733                                * an entry which needs a copy, plus an extra
3734                                * ref because we are going to duplicate it
3735                                * in the fork.
3736                                *
3737                                * The call to vm_map_entry_shadow() will also clear
3738                                * OBJ_ONEMAPPING.
3739                                *
3740                                * XXX no more collapse.  Still need extra ref
3741                                * for the fork.
3742                                */
3743                               vm_map_entry_shadow(old_entry);
3744                     } else if (old_entry->ba.object) {
3745                               object = old_entry->ba.object;
3746                     }
3747 
3748                     /*
3749                      * Clone the entry.  We've already bumped the ref on
3750                      * the vm_object for our new entry.
3751                      */
3752                     new_entry = vm_map_entry_create(countp);
3753                     *new_entry = *old_entry;
3754 
3755                     new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3756                     new_entry->wired_count = 0;
3757 
3758                     /*
3759                      * Replicate and index the vm_map_backing.  Don't share
3760                      * the vm_map_backing across vm_map's (only across clips).
3761                      *
3762                      * Insert the entry into the new map -- we know we're
3763                      * inserting at the end of the new map.
3764                      */
3765                     vm_map_backing_replicated(new_map, new_entry, 0);
3766                     vm_map_entry_link(new_map, new_entry);
3767 
3768                     /*
3769                      * Update the physical map
3770                      */
3771                     pmap_copy(new_map->pmap, old_map->pmap,
3772                                 new_entry->ba.start,
3773                                 (old_entry->ba.end - old_entry->ba.start),
3774                                 old_entry->ba.start);
3775                     break;
3776           case VM_INHERIT_COPY:
3777                     /*
3778                      * Clone the entry and link the copy into the new map.
3779                      *
3780                      * Note that ref-counting adjustment for old_entry->ba.object
3781                      * (if it isn't a special map that is) is handled by
3782                      * vm_map_copy_entry().
3783                      */
3784                     new_entry = vm_map_entry_create(countp);
3785                     *new_entry = *old_entry;
3786 
3787                     new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3788                     new_entry->wired_count = 0;
3789 
3790                     vm_map_backing_replicated(new_map, new_entry, 0);
3791                     vm_map_entry_link(new_map, new_entry);
3792 
3793                     /*
3794                      * This does the actual dirty work of making both entries
3795                      * copy-on-write, and will also handle the fronting object.
3796                      */
3797                     vm_map_copy_entry(old_map, new_map, old_entry, new_entry);
3798                     break;
3799           }
3800 }
3801 
3802 /*
3803  * When forking user-kernel shared maps, the map might change in the
3804  * child so do not try to copy the underlying pmap entries.
3805  */
3806 static
3807 void
vmspace_fork_uksmap_entry(struct proc * p2,struct lwp * lp2,vm_map_t old_map,vm_map_t new_map,vm_map_entry_t old_entry,int * countp)3808 vmspace_fork_uksmap_entry(struct proc *p2, struct lwp *lp2,
3809                                 vm_map_t old_map, vm_map_t new_map,
3810                                 vm_map_entry_t old_entry, int *countp)
3811 {
3812           vm_map_entry_t new_entry;
3813 
3814           /*
3815            * Do not fork lpmap entries whos TIDs do not match lp2's tid.
3816            *
3817            * XXX if p2 is NULL and lp2 is non-NULL, we retain the lpmap entry
3818            * (this is for e.g. resident'ing vmspace's) but set the field
3819            * to NULL.  Upon restore it should be restored. XXX NOT IMPL YET
3820            */
3821           if (old_entry->aux.dev) {
3822                     switch(minor(old_entry->aux.dev)) {
3823                     case 5:
3824                               break;
3825                     case 6:
3826                               break;
3827                     case 7:
3828                               if (lp2 == NULL)
3829                                         return;
3830                               if (old_entry->ba.aux_info == NULL)
3831                                         return;
3832                               if (((struct lwp *)old_entry->ba.aux_info)->lwp_tid !=
3833                                   lp2->lwp_tid)
3834                                         return;
3835                               break;
3836                     }
3837           }
3838 
3839           new_entry = vm_map_entry_create(countp);
3840           *new_entry = *old_entry;
3841 
3842           new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3843           new_entry->wired_count = 0;
3844           KKASSERT(new_entry->ba.backing_ba == NULL);
3845 
3846           if (new_entry->aux.dev) {
3847                     switch(minor(new_entry->aux.dev)) {
3848                     case 5:
3849                               /*
3850                                * upmap
3851                                */
3852                               new_entry->ba.aux_info = p2;
3853                               break;
3854                     case 6:
3855                               /*
3856                                * kpmap
3857                                */
3858                               new_entry->ba.aux_info = NULL;
3859                               break;
3860                     case 7:
3861                               /*
3862                                * lpmap
3863                                */
3864                               new_entry->ba.aux_info = lp2;
3865                               break;
3866                     }
3867           } else {
3868                     new_entry->ba.aux_info = NULL;
3869           }
3870 
3871           vm_map_backing_replicated(new_map, new_entry, 0);
3872 
3873           vm_map_entry_link(new_map, new_entry);
3874 }
3875 
3876 /*
3877  * Create an auto-grow stack entry
3878  *
3879  * No requirements.
3880  */
3881 int
vm_map_stack(vm_map_t map,vm_offset_t * addrbos,vm_size_t max_ssize,int flags,vm_prot_t prot,vm_prot_t max,int cow)3882 vm_map_stack (vm_map_t map, vm_offset_t *addrbos, vm_size_t max_ssize,
3883                 int flags, vm_prot_t prot, vm_prot_t max, int cow)
3884 {
3885           vm_map_entry_t      prev_entry;
3886           vm_map_entry_t      next;
3887           vm_size_t init_ssize;
3888           int                 rv;
3889           int                 count;
3890           vm_offset_t         tmpaddr;
3891 
3892           cow |= MAP_IS_STACK;
3893 
3894           if (max_ssize < sgrowsiz)
3895                     init_ssize = max_ssize;
3896           else
3897                     init_ssize = sgrowsiz;
3898 
3899           count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
3900           vm_map_lock(map);
3901 
3902           /*
3903            * Find space for the mapping
3904            */
3905           if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
3906                     if (vm_map_findspace(map, *addrbos, max_ssize, 1,
3907                                              flags, &tmpaddr)) {
3908                               vm_map_unlock(map);
3909                               vm_map_entry_release(count);
3910                               return (KERN_NO_SPACE);
3911                     }
3912                     *addrbos = tmpaddr;
3913           }
3914 
3915           /* If addr is already mapped, no go */
3916           if (vm_map_lookup_entry(map, *addrbos, &prev_entry)) {
3917                     vm_map_unlock(map);
3918                     vm_map_entry_release(count);
3919                     return (KERN_NO_SPACE);
3920           }
3921 
3922 #if 0
3923           /* XXX already handled by kern_mmap() */
3924           /* If we would blow our VMEM resource limit, no go */
3925           if (map->size + init_ssize >
3926               curproc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
3927                     vm_map_unlock(map);
3928                     vm_map_entry_release(count);
3929                     return (KERN_NO_SPACE);
3930           }
3931 #endif
3932 
3933           /*
3934            * If we can't accomodate max_ssize in the current mapping,
3935            * no go.  However, we need to be aware that subsequent user
3936            * mappings might map into the space we have reserved for
3937            * stack, and currently this space is not protected.
3938            *
3939            * Hopefully we will at least detect this condition
3940            * when we try to grow the stack.
3941            */
3942           if (prev_entry)
3943                     next = vm_map_rb_tree_RB_NEXT(prev_entry);
3944           else
3945                     next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3946 
3947           if (next && next->ba.start < *addrbos + max_ssize) {
3948                     vm_map_unlock(map);
3949                     vm_map_entry_release(count);
3950                     return (KERN_NO_SPACE);
3951           }
3952 
3953           /*
3954            * We initially map a stack of only init_ssize.  We will
3955            * grow as needed later.  Since this is to be a grow
3956            * down stack, we map at the top of the range.
3957            *
3958            * Note: we would normally expect prot and max to be
3959            * VM_PROT_ALL, and cow to be 0.  Possibly we should
3960            * eliminate these as input parameters, and just
3961            * pass these values here in the insert call.
3962            */
3963           rv = vm_map_insert(map, &count,
3964                                  NULL, NULL,
3965                                  0, NULL,
3966                                  *addrbos + max_ssize - init_ssize,
3967                              *addrbos + max_ssize,
3968                                  VM_MAPTYPE_NORMAL,
3969                                  VM_SUBSYS_STACK, prot, max, cow);
3970 
3971           /* Now set the avail_ssize amount */
3972           if (rv == KERN_SUCCESS) {
3973                     if (prev_entry)
3974                               next = vm_map_rb_tree_RB_NEXT(prev_entry);
3975                     else
3976                               next = RB_MIN(vm_map_rb_tree, &map->rb_root);
3977                     if (prev_entry != NULL) {
3978                               vm_map_clip_end(map,
3979                                                   prev_entry,
3980                                                   *addrbos + max_ssize - init_ssize,
3981                                                   &count);
3982                     }
3983                     if (next->ba.end   != *addrbos + max_ssize ||
3984                         next->ba.start != *addrbos + max_ssize - init_ssize){
3985                               panic ("Bad entry start/end for new stack entry");
3986                     } else {
3987                               next->aux.avail_ssize = max_ssize - init_ssize;
3988                     }
3989           }
3990 
3991           vm_map_unlock(map);
3992           vm_map_entry_release(count);
3993           return (rv);
3994 }
3995 
3996 /*
3997  * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
3998  * desired address is already mapped, or if we successfully grow
3999  * the stack.  Also returns KERN_SUCCESS if addr is outside the
4000  * stack range (this is strange, but preserves compatibility with
4001  * the grow function in vm_machdep.c).
4002  *
4003  * No requirements.
4004  */
4005 int
vm_map_growstack(vm_map_t map,vm_offset_t addr)4006 vm_map_growstack (vm_map_t map, vm_offset_t addr)
4007 {
4008           vm_map_entry_t prev_entry;
4009           vm_map_entry_t stack_entry;
4010           vm_map_entry_t next;
4011           struct vmspace *vm;
4012           struct lwp *lp;
4013           struct proc *p;
4014           vm_offset_t    end;
4015           int grow_amount;
4016           int rv = KERN_SUCCESS;
4017           int is_procstack;
4018           int use_read_lock = 1;
4019           int count;
4020 
4021           /*
4022            * Find the vm
4023            */
4024           lp = curthread->td_lwp;
4025           p = curthread->td_proc;
4026           KKASSERT(lp != NULL);
4027           vm = lp->lwp_vmspace;
4028 
4029           /*
4030            * Growstack is only allowed on the current process.  We disallow
4031            * other use cases, e.g. trying to access memory via procfs that
4032            * the stack hasn't grown into.
4033            */
4034           if (map != &vm->vm_map) {
4035                     return KERN_FAILURE;
4036           }
4037 
4038           count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
4039 Retry:
4040           if (use_read_lock)
4041                     vm_map_lock_read(map);
4042           else
4043                     vm_map_lock(map);
4044 
4045           /*
4046            * If addr is already in the entry range, no need to grow.
4047            * prev_entry returns NULL if addr is at the head.
4048            */
4049           if (vm_map_lookup_entry(map, addr, &prev_entry))
4050                     goto done;
4051           if (prev_entry)
4052                     stack_entry = vm_map_rb_tree_RB_NEXT(prev_entry);
4053           else
4054                     stack_entry = RB_MIN(vm_map_rb_tree, &map->rb_root);
4055 
4056           if (stack_entry == NULL)
4057                     goto done;
4058           if (prev_entry == NULL)
4059                     end = stack_entry->ba.start - stack_entry->aux.avail_ssize;
4060           else
4061                     end = prev_entry->ba.end;
4062 
4063           /*
4064            * This next test mimics the old grow function in vm_machdep.c.
4065            * It really doesn't quite make sense, but we do it anyway
4066            * for compatibility.
4067            *
4068            * If not growable stack, return success.  This signals the
4069            * caller to proceed as he would normally with normal vm.
4070            */
4071           if (stack_entry->aux.avail_ssize < 1 ||
4072               addr >= stack_entry->ba.start ||
4073               addr <  stack_entry->ba.start - stack_entry->aux.avail_ssize) {
4074                     goto done;
4075           }
4076 
4077           /* Find the minimum grow amount */
4078           grow_amount = roundup (stack_entry->ba.start - addr, PAGE_SIZE);
4079           if (grow_amount > stack_entry->aux.avail_ssize) {
4080                     rv = KERN_NO_SPACE;
4081                     goto done;
4082           }
4083 
4084           /*
4085            * If there is no longer enough space between the entries
4086            * nogo, and adjust the available space.  Note: this
4087            * should only happen if the user has mapped into the
4088            * stack area after the stack was created, and is
4089            * probably an error.
4090            *
4091            * This also effectively destroys any guard page the user
4092            * might have intended by limiting the stack size.
4093            */
4094           if (grow_amount > stack_entry->ba.start - end) {
4095                     if (use_read_lock && vm_map_lock_upgrade(map)) {
4096                               /* lost lock */
4097                               use_read_lock = 0;
4098                               goto Retry;
4099                     }
4100                     use_read_lock = 0;
4101                     stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4102                     rv = KERN_NO_SPACE;
4103                     goto done;
4104           }
4105 
4106           is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr;
4107 
4108           /* If this is the main process stack, see if we're over the
4109            * stack limit.
4110            */
4111           if (is_procstack && (vm->vm_ssize + grow_amount >
4112                                    p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4113                     rv = KERN_NO_SPACE;
4114                     goto done;
4115           }
4116 
4117           /* Round up the grow amount modulo SGROWSIZ */
4118           grow_amount = roundup (grow_amount, sgrowsiz);
4119           if (grow_amount > stack_entry->aux.avail_ssize) {
4120                     grow_amount = stack_entry->aux.avail_ssize;
4121           }
4122           if (is_procstack && (vm->vm_ssize + grow_amount >
4123                                p->p_rlimit[RLIMIT_STACK].rlim_cur)) {
4124                     grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - vm->vm_ssize;
4125           }
4126 
4127           /* If we would blow our VMEM resource limit, no go */
4128           if (map->size + grow_amount > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
4129                     rv = KERN_NO_SPACE;
4130                     goto done;
4131           }
4132 
4133           if (use_read_lock && vm_map_lock_upgrade(map)) {
4134                     /* lost lock */
4135                     use_read_lock = 0;
4136                     goto Retry;
4137           }
4138           use_read_lock = 0;
4139 
4140           /* Get the preliminary new entry start value */
4141           addr = stack_entry->ba.start - grow_amount;
4142 
4143           /* If this puts us into the previous entry, cut back our growth
4144            * to the available space.  Also, see the note above.
4145            */
4146           if (addr < end) {
4147                     stack_entry->aux.avail_ssize = stack_entry->ba.start - end;
4148                     addr = end;
4149           }
4150 
4151           rv = vm_map_insert(map, &count,
4152                                  NULL, NULL,
4153                                  0, NULL,
4154                                  addr, stack_entry->ba.start,
4155                                  VM_MAPTYPE_NORMAL,
4156                                  VM_SUBSYS_STACK, VM_PROT_ALL, VM_PROT_ALL, 0);
4157 
4158           /* Adjust the available stack space by the amount we grew. */
4159           if (rv == KERN_SUCCESS) {
4160                     if (prev_entry) {
4161                               vm_map_clip_end(map, prev_entry, addr, &count);
4162                               next = vm_map_rb_tree_RB_NEXT(prev_entry);
4163                     } else {
4164                               next = RB_MIN(vm_map_rb_tree, &map->rb_root);
4165                     }
4166                     if (next->ba.end != stack_entry->ba.start  ||
4167                         next->ba.start != addr) {
4168                               panic ("Bad stack grow start/end in new stack entry");
4169                     } else {
4170                               next->aux.avail_ssize =
4171                                         stack_entry->aux.avail_ssize -
4172                                         (next->ba.end - next->ba.start);
4173                               if (is_procstack) {
4174                                         vm->vm_ssize += next->ba.end -
4175                                                             next->ba.start;
4176                               }
4177                     }
4178 
4179                     if (map->flags & MAP_WIREFUTURE) {
4180                               vm_map_user_wiring(map,
4181                                                      next->ba.start,
4182                                                      next->ba.end,
4183                                                      FALSE);
4184                     }
4185           }
4186 
4187 done:
4188           if (use_read_lock)
4189                     vm_map_unlock_read(map);
4190           else
4191                     vm_map_unlock(map);
4192           vm_map_entry_release(count);
4193           return (rv);
4194 }
4195 
4196 /*
4197  * Unshare the specified VM space for exec.  If other processes are
4198  * mapped to it, then create a new one.  The new vmspace is null.
4199  *
4200  * No requirements.
4201  */
4202 void
vmspace_exec(struct proc * p,struct vmspace * vmcopy)4203 vmspace_exec(struct proc *p, struct vmspace *vmcopy)
4204 {
4205           struct vmspace *oldvmspace = p->p_vmspace;
4206           struct vmspace *newvmspace;
4207           vm_map_t map = &p->p_vmspace->vm_map;
4208 
4209           /*
4210            * If we are execing a resident vmspace we fork it, otherwise
4211            * we create a new vmspace.  Note that exitingcnt is not
4212            * copied to the new vmspace.
4213            */
4214           lwkt_gettoken(&oldvmspace->vm_map.token);
4215           if (vmcopy)  {
4216                     newvmspace = vmspace_fork(vmcopy, NULL, NULL);
4217                     lwkt_gettoken(&newvmspace->vm_map.token);
4218           } else {
4219                     newvmspace = vmspace_alloc(vm_map_min(map), vm_map_max(map));
4220                     lwkt_gettoken(&newvmspace->vm_map.token);
4221                     bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy,
4222                           (caddr_t)&oldvmspace->vm_endcopy -
4223                            (caddr_t)&oldvmspace->vm_startcopy);
4224           }
4225 
4226           /*
4227            * Finish initializing the vmspace before assigning it
4228            * to the process.  The vmspace will become the current vmspace
4229            * if p == curproc.
4230            */
4231           pmap_pinit2(vmspace_pmap(newvmspace));
4232           pmap_replacevm(p, newvmspace, 0);
4233           lwkt_reltoken(&newvmspace->vm_map.token);
4234           lwkt_reltoken(&oldvmspace->vm_map.token);
4235           vmspace_rel(oldvmspace);
4236 }
4237 
4238 /*
4239  * Unshare the specified VM space for forcing COW.  This
4240  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4241  */
4242 void
vmspace_unshare(struct proc * p)4243 vmspace_unshare(struct proc *p)
4244 {
4245           struct vmspace *oldvmspace = p->p_vmspace;
4246           struct vmspace *newvmspace;
4247 
4248           lwkt_gettoken(&oldvmspace->vm_map.token);
4249           if (vmspace_getrefs(oldvmspace) == 1) {
4250                     lwkt_reltoken(&oldvmspace->vm_map.token);
4251                     return;
4252           }
4253           newvmspace = vmspace_fork(oldvmspace, NULL, NULL);
4254           lwkt_gettoken(&newvmspace->vm_map.token);
4255           pmap_pinit2(vmspace_pmap(newvmspace));
4256           pmap_replacevm(p, newvmspace, 0);
4257           lwkt_reltoken(&newvmspace->vm_map.token);
4258           lwkt_reltoken(&oldvmspace->vm_map.token);
4259           vmspace_rel(oldvmspace);
4260 }
4261 
4262 /*
4263  * vm_map_hint: return the beginning of the best area suitable for
4264  * creating a new mapping with "prot" protection.
4265  *
4266  * No requirements.
4267  */
4268 vm_offset_t
vm_map_hint(struct proc * p,vm_offset_t addr,vm_prot_t prot)4269 vm_map_hint(struct proc *p, vm_offset_t addr, vm_prot_t prot)
4270 {
4271           struct vmspace *vms = p->p_vmspace;
4272           struct rlimit limit;
4273           rlim_t dsiz;
4274 
4275           /*
4276            * Acquire datasize limit for mmap() operation,
4277            * calculate nearest power of 2.
4278            */
4279           if (kern_getrlimit(RLIMIT_DATA, &limit))
4280                     limit.rlim_cur = maxdsiz;
4281           dsiz = limit.rlim_cur;
4282 
4283           if (!randomize_mmap || addr != 0) {
4284                     /*
4285                      * Set a reasonable start point for the hint if it was
4286                      * not specified or if it falls within the heap space.
4287                      * Hinted mmap()s do not allocate out of the heap space.
4288                      */
4289                     if (addr == 0 ||
4290                         (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
4291                          addr < round_page((vm_offset_t)vms->vm_daddr + dsiz))) {
4292                               addr = round_page((vm_offset_t)vms->vm_daddr + dsiz);
4293                     }
4294 
4295                     return addr;
4296           }
4297 
4298           /*
4299            * randomize_mmap && addr == 0.  For now randomize the
4300            * address within a dsiz range beyond the data limit.
4301            */
4302           addr = (vm_offset_t)vms->vm_daddr + dsiz;
4303           if (dsiz)
4304                     addr += (karc4random64() & 0x7FFFFFFFFFFFFFFFLU) % dsiz;
4305           return (round_page(addr));
4306 }
4307 
/*
 * Finds the VM object, offset, and protection for a given virtual address
 * in the specified map, assuming a page fault of the type specified.
 *
 * Leaves the map in question locked for read; return values are guaranteed
 * until a vm_map_lookup_done call is performed.  Note that the map argument
 * is in/out; the returned map must be used in the call to vm_map_lookup_done.
 *
 * A handle (out_entry) is returned for use in vm_map_lookup_done, to make
 * that fast.
 *
 * If a lookup is requested with "write protection" specified, the map may
 * be changed to perform virtual copying operations, although the data
 * referenced will remain the same.
 *
 * Arguments:
 *	var_map		map to search; replaced by the submap when the
 *			lookup descends through a VM_MAPTYPE_SUBMAP entry
 *	vaddr		faulting virtual address
 *	fault_typea	VM_PROT_* bits of the access, optionally combined
 *			with VM_PROT_OVERRIDE_WRITE (check against the
 *			entry's max_protection instead of protection)
 *	out_entry	map entry containing vaddr (NULL if none was found)
 *	bap		&entry->ba for VM_MAPTYPE_NORMAL entries, else NULL
 *	pindex		page index of vaddr relative to the backing offset
 *	pcount		pages from trunc_page(vaddr) to the end of the entry
 *	out_prot	protection the caller may use; VM_PROT_WRITE is
 *			stripped for a read fault on a NEEDS_COPY entry
 *	wflags		FW_WIRED if the entry is wired, FW_DIDCOW if the
 *			entry was shadowed by this call
 *
 * out_entry and bap are reset on every lookup pass; the remaining OUT
 * arguments are only written on the paths that reach them, so callers
 * must not rely on them after a failure.
 *
 * Returns KERN_SUCCESS with the (possibly substituted) map read-locked,
 * or KERN_INVALID_ADDRESS, KERN_PROTECTION_FAILURE or
 * KERN_FAILURE_NOFAULT with the map unlocked.
 *
 * No requirements.
 */
int
vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
	      vm_offset_t vaddr,
	      vm_prot_t fault_typea,
	      vm_map_entry_t *out_entry,	/* OUT */
	      struct vm_map_backing **bap,	/* OUT */
	      vm_pindex_t *pindex,		/* OUT */
	      vm_pindex_t *pcount,		/* OUT */
	      vm_prot_t *out_prot,		/* OUT */
	      int *wflags)			/* OUT */
{
	vm_map_entry_t entry;
	vm_map_t map = *var_map;
	vm_prot_t prot;
	vm_prot_t fault_type = fault_typea;
	int use_read_lock = 1;		/* 0 once we hold/need the exclusive lock */
	int rv = KERN_SUCCESS;
	int count;
	thread_t td = curthread;

	/*
	 * vm_map_entry_reserve() implements an important mitigation
	 * against runaway mmap() use running the kernel out of vm_map_entry
	 * structures, but it can also cause an infinite call recursion.
	 * Use td_nest_count to prevent an infinite recursion (allows
	 * the vm_map code to dig into the pcpu vm_map_entry reserve).
	 *
	 * count stays 0 when we are already nested, in which case nothing
	 * is released at the end either.
	 */
	count = 0;
	if (td->td_nest_count == 0) {
		++td->td_nest_count;
		count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
		--td->td_nest_count;
	}
RetryLookup:
	/*
	 * Every retry arrives here with the map unlocked; use_read_lock
	 * selects which kind of lock is (re)acquired.
	 */
	if (use_read_lock)
		vm_map_lock_read(map);
	else
		vm_map_lock(map);

	/*
	 * Always do a full lookup.  The hint doesn't get us much anymore
	 * now that the map is RB'd.
	 */
	cpu_ccfence();
	*out_entry = NULL;
	*bap = NULL;

	{
		vm_map_entry_t tmp_entry;

		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
			rv = KERN_INVALID_ADDRESS;
			goto done;
		}
		entry = tmp_entry;
		*out_entry = entry;
	}

	/*
	 * Handle submaps.  Switch to the submap (also reporting it back
	 * through var_map), drop the lock on the parent and restart the
	 * lookup with a read lock on the submap.
	 */
	if (entry->maptype == VM_MAPTYPE_SUBMAP) {
		vm_map_t old_map = map;

		*var_map = map = entry->ba.sub_map;
		if (use_read_lock)
			vm_map_unlock_read(old_map);
		else
			vm_map_unlock(old_map);
		use_read_lock = 1;
		goto RetryLookup;
	}

	/*
	 * Check whether this task is allowed to have this page.
	 * Note the special case for MAP_ENTRY_COW pages with an override.
	 * This is to implement a forced COW for debuggers.
	 */
	if (fault_type & VM_PROT_OVERRIDE_WRITE)
		prot = entry->max_protection;
	else
		prot = entry->protection;

	/*
	 * Strip modifier bits; every remaining requested access bit must
	 * be granted by prot.
	 */
	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
	if ((fault_type & prot) != fault_type) {
		rv = KERN_PROTECTION_FAILURE;
		goto done;
	}

	/*
	 * A write to a user-wired copy-on-write entry is refused unless
	 * the caller explicitly requested the override.
	 */
	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
	    (entry->eflags & MAP_ENTRY_COW) &&
	    (fault_type & VM_PROT_WRITE) &&
	    (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
		rv = KERN_PROTECTION_FAILURE;
		goto done;
	}

	/*
	 * Flag regular pages that are supposed to be wired.  Remove prior
	 * semantics that disallowed protection changes for such pages.
	 *
	 * The prior semantics are not used by modern systems.  Applications
	 * do not assume an inability to change protection modes and may
	 * operate incorrectly if we try to prevent protection changes.
	 *
	 * Modern applications are aware that even for locked memory,
	 * changing protection modes, modifying MAP_PRIVATE mappings,
	 * or fork() may still cause page faults on the locked memory.
	 */
	*wflags = 0;
	if (entry->wired_count) {
		*wflags |= FW_WIRED;
#if 0
		prot = fault_type = entry->protection;
#endif
	}

	/*
	 * When the current lwp's pmap reports A/D bit emulation and the
	 * entry is not writable, the fault is treated as a write fault.
	 * NOTE(review): rationale is not visible in this function --
	 * confirm against pmap_emulate_ad_bits().
	 */
	if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
	    pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
		if ((prot & VM_PROT_WRITE) == 0)
			fault_type |= VM_PROT_WRITE;
	}

	/*
	 * Only NORMAL maps are object-based.  UKSMAPs are not.
	 */
	if (entry->maptype != VM_MAPTYPE_NORMAL) {
		*bap = NULL;
		goto skip;
	}

	/*
	 * If the entry was copy-on-write, we either ...
	 */
	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
		/*
		 * If we want to write the page, we may as well handle that
		 * now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just demote the
		 * permissions allowed.
		 */
		if (fault_type & VM_PROT_WRITE) {
			/*
			 * Not allowed if TDF_NOFAULT is set as the shadowing
			 * operation can deadlock against the faulting
			 * function due to the copy-on-write.
			 */
			if (curthread->td_flags & TDF_NOFAULT) {
				rv = KERN_FAILURE_NOFAULT;
				goto done;
			}

			/*
			 * Make a new vm_map_backing + object, and place it
			 * in the object chain.  Note that no new references
			 * have appeared -- one just moved from the map to
			 * the new object.
			 *
			 * A non-zero return from the upgrade is treated as
			 * having lost the lock; the lookup is then redone
			 * under an exclusive lock.
			 */
			if (use_read_lock && vm_map_lock_upgrade(map)) {
				/* lost lock */
				use_read_lock = 0;
				goto RetryLookup;
			}
			use_read_lock = 0;
			vm_map_entry_shadow(entry);
			*wflags |= FW_DIDCOW;
		} else {
			/*
			 * We're attempting to read a copy-on-write page --
			 * don't allow writes.
			 */
			prot &= ~VM_PROT_WRITE;
		}
	}

	/*
	 * Create an object if necessary.  This code also handles
	 * partitioning large entries to improve vm_fault performance.
	 */
	if (entry->ba.object == NULL && !map->system_map) {
		if (use_read_lock && vm_map_lock_upgrade(map))  {
			/* lost lock */
			use_read_lock = 0;
			goto RetryLookup;
		}
		use_read_lock = 0;

		/*
		 * Partition large entries, giving each its own VM object,
		 * to improve concurrent fault performance.  This is only
		 * applicable to userspace.
		 *
		 * The XOR test is true when the entry's start and end
		 * differ in bits above the partition mask, i.e. they do
		 * not fall in the same partition-sized window.
		 */
		if (map != kernel_map &&
		    entry->maptype == VM_MAPTYPE_NORMAL &&
		    ((entry->ba.start ^ entry->ba.end) &
		     ~MAP_ENTRY_PARTITION_MASK) &&
		    vm_map_partition_enable) {
			/*
			 * An entry in transition cannot be clipped now.
			 * Request a wakeup, wait, and redo the lookup.
			 * NOTE(review): the retry re-locks the map, so
			 * vm_map_transition_wait(map, 0) presumably returns
			 * with the map unlocked -- confirm.
			 */
			if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
				entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
				++mycpu->gd_cnt.v_intrans_coll;
				++mycpu->gd_cnt.v_intrans_wait;
				vm_map_transition_wait(map, 0);
				goto RetryLookup;
			}
			vm_map_entry_partition(map, entry, vaddr, &count);
		}
		vm_map_entry_allocate_object(entry);
	}

	/*
	 * Return the object/offset from this entry.  If the entry was
	 * copy-on-write or empty, it has been fixed up.
	 */
	*bap = &entry->ba;

skip:
	*pindex = OFF_TO_IDX((vaddr - entry->ba.start) + entry->ba.offset);
	*pcount = OFF_TO_IDX(entry->ba.end - trunc_page(vaddr));

	/*
	 * Report the protection the caller may use.  On success we
	 * return with a read lock held on the map (an exclusive lock
	 * taken above is downgraded).  On failure we return with the
	 * map unlocked.
	 */
	*out_prot = prot;
done:
	if (rv == KERN_SUCCESS) {
		if (use_read_lock == 0)
			vm_map_lock_downgrade(map);
	} else if (use_read_lock) {
		vm_map_unlock_read(map);
	} else {
		vm_map_unlock(map);
	}
	/*
	 * Give back whatever is left of the entry reservation made at
	 * the top (count is passed by reference to the partition code).
	 */
	if (count > 0)
		vm_map_entry_release(count);

	return (rv);
}
4565 
4566 /*
4567  * Releases locks acquired by a vm_map_lookup()
4568  * (according to the handle returned by that lookup).
4569  *
4570  * No other requirements.
4571  */
4572 void
vm_map_lookup_done(vm_map_t map,vm_map_entry_t entry,int count)4573 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry, int count)
4574 {
4575           /*
4576            * Unlock the main-level map
4577            */
4578           vm_map_unlock_read(map);
4579           if (count)
4580                     vm_map_entry_release(count);
4581 }
4582 
4583 static void
vm_map_entry_partition(vm_map_t map,vm_map_entry_t entry,vm_offset_t vaddr,int * countp)4584 vm_map_entry_partition(vm_map_t map, vm_map_entry_t entry,
4585                            vm_offset_t vaddr, int *countp)
4586 {
4587           vaddr &= ~MAP_ENTRY_PARTITION_MASK;
4588           vm_map_clip_start(map, entry, vaddr, countp);
4589           vaddr += MAP_ENTRY_PARTITION_SIZE;
4590           vm_map_clip_end(map, entry, vaddr, countp);
4591 }
4592 
4593 /*
4594  * Quick hack, needs some help to make it more SMP friendly.
4595  */
4596 void
vm_map_interlock(vm_map_t map,struct vm_map_ilock * ilock,vm_offset_t ran_beg,vm_offset_t ran_end)4597 vm_map_interlock(vm_map_t map, struct vm_map_ilock *ilock,
4598                      vm_offset_t ran_beg, vm_offset_t ran_end)
4599 {
4600           struct vm_map_ilock *scan;
4601 
4602           ilock->ran_beg = ran_beg;
4603           ilock->ran_end = ran_end;
4604           ilock->flags = 0;
4605 
4606           spin_lock(&map->ilock_spin);
4607 restart:
4608           for (scan = map->ilock_base; scan; scan = scan->next) {
4609                     if (ran_end > scan->ran_beg && ran_beg < scan->ran_end) {
4610                               scan->flags |= ILOCK_WAITING;
4611                               ssleep(scan, &map->ilock_spin, 0, "ilock", 0);
4612                               goto restart;
4613                     }
4614           }
4615           ilock->next = map->ilock_base;
4616           map->ilock_base = ilock;
4617           spin_unlock(&map->ilock_spin);
4618 }
4619 
4620 void
vm_map_deinterlock(vm_map_t map,struct vm_map_ilock * ilock)4621 vm_map_deinterlock(vm_map_t map, struct  vm_map_ilock *ilock)
4622 {
4623           struct vm_map_ilock *scan;
4624           struct vm_map_ilock **scanp;
4625 
4626           spin_lock(&map->ilock_spin);
4627           scanp = &map->ilock_base;
4628           while ((scan = *scanp) != NULL) {
4629                     if (scan == ilock) {
4630                               *scanp = ilock->next;
4631                               spin_unlock(&map->ilock_spin);
4632                               if (ilock->flags & ILOCK_WAITING)
4633                                         wakeup(ilock);
4634                               return;
4635                     }
4636                     scanp = &scan->next;
4637           }
4638           spin_unlock(&map->ilock_spin);
4639           panic("vm_map_deinterlock: missing ilock!");
4640 }
4641 
4642 #include "opt_ddb.h"
4643 #ifdef DDB
4644 #include <ddb/ddb.h>
4645 
4646 /*
4647  * Debugging only
4648  */
DB_SHOW_COMMAND(map,vm_map_print)4649 DB_SHOW_COMMAND(map, vm_map_print)
4650 {
4651           static int nlines;
4652           /* XXX convert args. */
4653           vm_map_t map = (vm_map_t)addr;
4654           boolean_t full = have_addr;
4655 
4656           vm_map_entry_t entry;
4657 
4658           db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4659               (void *)map,
4660               (void *)map->pmap, map->nentries, map->timestamp);
4661           nlines++;
4662 
4663           if (!full && db_indent)
4664                     return;
4665 
4666           db_indent += 2;
4667           RB_FOREACH(entry, vm_map_rb_tree, &map->rb_root) {
4668                     db_iprintf("map entry %p: start=%p, end=%p\n",
4669                         (void *)entry,
4670                         (void *)entry->ba.start, (void *)entry->ba.end);
4671                     nlines++;
4672                     {
4673                               static char *inheritance_name[4] =
4674                               {"share", "copy", "none", "donate_copy"};
4675 
4676                               db_iprintf(" prot=%x/%x/%s",
4677                                   entry->protection,
4678                                   entry->max_protection,
4679                                   inheritance_name[(int)(unsigned char)
4680                                                             entry->inheritance]);
4681                               if (entry->wired_count != 0)
4682                                         db_printf(", wired");
4683                     }
4684                     switch(entry->maptype) {
4685                     case VM_MAPTYPE_SUBMAP:
4686                               /* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4687                               db_printf(", share=%p, offset=0x%lx\n",
4688                                   (void *)entry->ba.sub_map,
4689                                   (long)entry->ba.offset);
4690                               nlines++;
4691 
4692                               db_indent += 2;
4693                               vm_map_print((db_expr_t)(intptr_t)entry->ba.sub_map,
4694                                              full, 0, NULL);
4695                               db_indent -= 2;
4696                               break;
4697                     case VM_MAPTYPE_NORMAL:
4698                               /* XXX no %qd in kernel.  Truncate entry->ba.offset. */
4699                               db_printf(", object=%p, offset=0x%lx",
4700                                   (void *)entry->ba.object,
4701                                   (long)entry->ba.offset);
4702                               if (entry->eflags & MAP_ENTRY_COW)
4703                                         db_printf(", copy (%s)",
4704                                             ((entry->eflags & MAP_ENTRY_NEEDS_COPY) ?
4705                                              "needed" : "done"));
4706                               db_printf("\n");
4707                               nlines++;
4708 
4709                               if (entry->ba.object) {
4710                                         db_indent += 2;
4711                                         vm_object_print((db_expr_t)(intptr_t)
4712                                                             entry->ba.object,
4713                                                             full, 0, NULL);
4714                                         nlines += 4;
4715                                         db_indent -= 2;
4716                               }
4717                               break;
4718                     case VM_MAPTYPE_UKSMAP:
4719                               db_printf(", uksmap=%p, offset=0x%lx",
4720                                   (void *)entry->ba.uksmap,
4721                                   (long)entry->ba.offset);
4722                               if (entry->eflags & MAP_ENTRY_COW)
4723                                         db_printf(", copy (%s)",
4724                                             (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4725                               db_printf("\n");
4726                               nlines++;
4727                               break;
4728                     default:
4729                               break;
4730                     }
4731           }
4732           db_indent -= 2;
4733           if (db_indent == 0)
4734                     nlines = 0;
4735 }
4736 
4737 /*
4738  * Debugging only
4739  */
DB_SHOW_COMMAND(procvm,procvm)4740 DB_SHOW_COMMAND(procvm, procvm)
4741 {
4742           struct proc *p;
4743 
4744           if (have_addr) {
4745                     p = (struct proc *) addr;
4746           } else {
4747                     p = curproc;
4748           }
4749 
4750           db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4751               (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4752               (void *)vmspace_pmap(p->p_vmspace));
4753 
4754           vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
4755 }
4756 
4757 #endif /* DDB */
4758