1 /*        $NetBSD: uvm_map.c,v 1.427 2025/04/27 17:40:55 riastradh Exp $        */
2 
3 /*
4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
5  * Copyright (c) 1991, 1993, The Regents of the University of California.
6  *
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * The Mach Operating System project at Carnegie-Mellon University.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *        @(#)vm_map.c    8.3 (Berkeley) 1/12/94
37  * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
38  *
39  *
40  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
41  * All rights reserved.
42  *
43  * Permission to use, copy, modify and distribute this software and
44  * its documentation is hereby granted, provided that both the copyright
45  * notice and this permission notice appear in all copies of the
46  * software, derivative works or modified versions, and any portions
47  * thereof, and that both notices appear in supporting documentation.
48  *
49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52  *
53  * Carnegie Mellon requests users of this software to return to
54  *
55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
56  *  School of Computer Science
57  *  Carnegie Mellon University
58  *  Pittsburgh PA 15213-3890
59  *
60  * any improvements or extensions that they make and grant Carnegie the
61  * rights to redistribute these changes.
62  */
63 
64 /*
65  * uvm_map.c: uvm map operations
66  */
67 
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.427 2025/04/27 17:40:55 riastradh Exp $");
70 
71 #include "opt_ddb.h"
72 #include "opt_pax.h"
73 #include "opt_uvmhist.h"
74 #include "opt_uvm.h"
75 #include "opt_sysv.h"
76 
77 #include <sys/param.h>
78 #include <sys/systm.h>
79 #include <sys/mman.h>
80 #include <sys/proc.h>
81 #include <sys/pool.h>
82 #include <sys/kernel.h>
83 #include <sys/mount.h>
84 #include <sys/pax.h>
85 #include <sys/vnode.h>
86 #include <sys/filedesc.h>
87 #include <sys/lockdebug.h>
88 #include <sys/atomic.h>
89 #include <sys/sysctl.h>
90 #ifndef __USER_VA0_IS_SAFE
91 #include <sys/kauth.h>
92 #include "opt_user_va0_disable_default.h"
93 #endif
94 
95 #include <sys/shm.h>
96 
97 #include <uvm/uvm.h>
98 #include <uvm/uvm_readahead.h>
99 
100 #if defined(DDB) || defined(DEBUGPRINT)
101 #include <uvm/uvm_ddb.h>
102 #endif
103 
/*
 * History buffer for map operations; present only when UVMHIST is
 * defined.  The buffer holds UVMHIST_MAPHIST_SIZE entries, which is 100
 * unless a value was already supplied.
 */
#if defined(UVMHIST)
#if !defined(UVMHIST_MAPHIST_SIZE)
#define UVMHIST_MAPHIST_SIZE 100
#endif /* !defined(UVMHIST_MAPHIST_SIZE) */
static struct kern_history_ent maphistbuf[UVMHIST_MAPHIST_SIZE];
UVMHIST_DEFINE(maphist) = UVMHIST_INITIALIZER(maphist, maphistbuf);
#endif /* defined(UVMHIST) */
111 
/*
 * Event counters for map operations.
 *
 * => compiled in only when UVMMAP_COUNTERS is defined; otherwise all
 *    three macros expand to nothing
 * => UVMMAP_EVCNT_DEFINE(name) defines and statically attaches the
 *    counter uvmmap_evcnt_<name>; it is used without a trailing
 *    semicolon
 * => UVMMAP_EVCNT_INCR/DECR(name) bump that counter's ev_count
 */
#if !defined(UVMMAP_COUNTERS)

#define	UVMMAP_EVCNT_DEFINE(name)	/* nothing */
#define	UVMMAP_EVCNT_INCR(ev)		/* nothing */
#define	UVMMAP_EVCNT_DECR(ev)		/* nothing */

#else /* defined(UVMMAP_COUNTERS) */

#include <sys/evcnt.h>
#define	UVMMAP_EVCNT_DEFINE(name) \
struct evcnt uvmmap_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \
    "uvmmap", #name); \
EVCNT_ATTACH_STATIC(uvmmap_evcnt_##name);
/* Parenthesized so the expansion stays a single expression. */
#define	UVMMAP_EVCNT_INCR(ev)		(uvmmap_evcnt_##ev.ev_count++)
#define	UVMMAP_EVCNT_DECR(ev)		(uvmmap_evcnt_##ev.ev_count--)

#endif /* defined(UVMMAP_COUNTERS) */
129 
/*
 * Counter definitions.  Each line goes through UVMMAP_EVCNT_DEFINE and
 * so produces nothing unless UVMMAP_COUNTERS is defined.
 *
 * NOTE(review): the "u"/"k" prefixes presumably distinguish user and
 * kernel maps and the "mlk_" ones presumably belong to map lookup --
 * confirm against the code that increments them.
 */
UVMMAP_EVCNT_DEFINE(ubackmerge)
UVMMAP_EVCNT_DEFINE(uforwmerge)
UVMMAP_EVCNT_DEFINE(ubimerge)
UVMMAP_EVCNT_DEFINE(unomerge)
UVMMAP_EVCNT_DEFINE(kbackmerge)
UVMMAP_EVCNT_DEFINE(kforwmerge)
UVMMAP_EVCNT_DEFINE(kbimerge)
UVMMAP_EVCNT_DEFINE(knomerge)
UVMMAP_EVCNT_DEFINE(map_call)
UVMMAP_EVCNT_DEFINE(mlk_call)
UVMMAP_EVCNT_DEFINE(mlk_hint)
UVMMAP_EVCNT_DEFINE(mlk_tree)
UVMMAP_EVCNT_DEFINE(mlk_treeloop)
143 
/*
 * Globally visible constant string "vmmapbsy".
 * NOTE(review): presumably a wait message used while a map is busy --
 * confirm against its users, none of which are in this part of the file.
 */
const char vmmapbsy[] = "vmmapbsy";

/*
 * cache for dynamically-allocated map entries.
 *
 * uvm_mapent_alloc() takes entries from it with pool_cache_get() and
 * uvm_mapent_free() returns them with pool_cache_put().
 */

static struct pool_cache uvm_map_entry_cache;
151 
#ifdef PMAP_GROWKERNEL
/*
 * This global represents the end of the kernel virtual address
 * space.  If we want to exceed this, we must grow the kernel
 * virtual address space dynamically.
 *
 * It exists only when PMAP_GROWKERNEL is defined.
 *
 * Note, this variable is locked by kernel_map's lock.
 */
vaddr_t uvm_maxkaddr;
#endif
162 
/*
 * user_va0_disable: defined only when __USER_VA0_IS_SAFE is not.
 *
 * Its initial value is __USER_VA0_DISABLE_DEFAULT, which falls back to 1
 * when nothing defined it earlier and is replaced by the kernel config
 * option USER_VA0_DISABLE_DEFAULT when that option is set.
 *
 * NOTE(review): presumably a nonzero value forbids user mappings at
 * virtual address 0 -- confirm against the code that reads it.
 */
#ifndef __USER_VA0_IS_SAFE
#ifndef __USER_VA0_DISABLE_DEFAULT
#define __USER_VA0_DISABLE_DEFAULT 1
#endif
#ifdef USER_VA0_DISABLE_DEFAULT /* kernel config option overrides */
#undef __USER_VA0_DISABLE_DEFAULT
#define __USER_VA0_DISABLE_DEFAULT USER_VA0_DISABLE_DEFAULT
#endif
int user_va0_disable = __USER_VA0_DISABLE_DEFAULT;
#endif
173 
174 /*
175  * macros
176  */
177 
178 /*
179  * uvm_map_align_va: round down or up virtual address
180  */
181 static __inline void
uvm_map_align_va(vaddr_t * vap,vsize_t align,int topdown)182 uvm_map_align_va(vaddr_t *vap, vsize_t align, int topdown)
183 {
184 
185           KASSERT(powerof2(align));
186 
187           if (align != 0 && (*vap & (align - 1)) != 0) {
188                     if (topdown)
189                               *vap = rounddown2(*vap, align);
190                     else
191                               *vap = roundup2(*vap, align);
192           }
193 }
194 
/*
 * pager_map is only declared here; its definition is not in this part
 * of the file.
 */
extern struct vm_map *pager_map;

/*
 * UVM_ET_ISCOMPATIBLE: check some requirements for map entry merging
 *
 * => true when the entry's type, backing object, protection, maximum
 *    protection, inheritance, advice and wired count all equal the
 *    supplied values, and the entry's flags agree with meflags in the
 *    UVM_MAP_NOMERGE bit
 * => ent is evaluated several times
 */
#define   UVM_ET_ISCOMPATIBLE(ent, type, uobj, meflags, \
    prot, maxprot, inh, adv, wire) \
          ((ent)->etype == (type) && \
          (((ent)->flags ^ (meflags)) & (UVM_MAP_NOMERGE)) == 0 && \
          (ent)->object.uvm_obj == (uobj) && \
          (ent)->protection == (prot) && \
          (ent)->max_protection == (maxprot) && \
          (ent)->inheritance == (inh) && \
          (ent)->advice == (adv) && \
          (ent)->wired_count == (wire))
210 
/*
 * uvm_map_entry_link: insert entry into a map
 *
 * => map must be locked
 * => the entry is placed on the doubly-linked list directly after
 *    after_where, the map's entry count is incremented, and the entry
 *    is then added to the tree with uvm_rb_insert()
 * => the list links are set up before uvm_rb_insert() runs because that
 *    function reads entry->next and entry->prev
 * => map and entry are evaluated more than once
 */
#define uvm_map_entry_link(map, after_where, entry) do { \
          uvm_mapent_check(entry); \
          (map)->nentries++; \
          (entry)->prev = (after_where); \
          (entry)->next = (after_where)->next; \
          (entry)->prev->next = (entry); \
          (entry)->next->prev = (entry); \
          uvm_rb_insert((map), (entry)); \
} while (/*CONSTCOND*/ 0)
225 
/*
 * uvm_map_entry_unlink: remove entry from a map
 *
 * => map must be locked
 * => the entry must no longer be the map's first_free or hint (both
 *    asserted); clear_hints() can be used to arrange that
 * => the neighbours are spliced together on the list and the map's
 *    entry count is decremented before uvm_rb_remove() takes the entry
 *    out of the tree; the entry's own prev/next pointers are left as
 *    they were
 * => map and entry are evaluated more than once
 */
#define uvm_map_entry_unlink(map, entry) do { \
          KASSERT((entry) != (map)->first_free); \
          KASSERT((entry) != (map)->hint); \
          uvm_mapent_check(entry); \
          (map)->nentries--; \
          (entry)->next->prev = (entry)->prev; \
          (entry)->prev->next = (entry)->next; \
          uvm_rb_remove((map), (entry)); \
} while (/*CONSTCOND*/ 0)
240 
/*
 * SAVE_HINT: saves the specified entry as the hint for future lookups.
 *
 * => map need not be locked.
 * => the hint is replaced by value only if it currently equals check;
 *    otherwise it is left alone
 * => map is evaluated twice
 */
#define SAVE_HINT(map, check, value) do { \
          if ((map)->hint == (check)) \
                    (map)->hint = (value); \
} while (/*CONSTCOND*/ 0)
250 
251 /*
252  * clear_hints: ensure that hints don't point to the entry.
253  *
254  * => map must be write-locked.
255  */
256 static void
clear_hints(struct vm_map * map,struct vm_map_entry * ent)257 clear_hints(struct vm_map *map, struct vm_map_entry *ent)
258 {
259 
260           SAVE_HINT(map, ent, ent->prev);
261           if (map->first_free == ent) {
262                     map->first_free = ent->prev;
263           }
264 }
265 
/*
 * VM_MAP_RANGE_CHECK: check and correct range
 *
 * => map must at least be read locked
 * => start and end are assigned to, so both must be modifiable lvalues
 * => start is raised to vm_map_min(map) and end lowered to
 *    vm_map_max(map) when they lie outside the map; if start still
 *    exceeds end afterwards it is set equal to end, giving an empty
 *    range
 */

#define VM_MAP_RANGE_CHECK(map, start, end) do { \
          if (start < vm_map_min(map))            \
                    start = vm_map_min(map);      \
          if (end > vm_map_max(map))              \
                    end = vm_map_max(map);                  \
          if (start > end)                        \
                    start = end;                            \
} while (/*CONSTCOND*/ 0)
280 
/*
 * local prototypes
 */

/* map entry allocation, copying and release */
static struct vm_map_entry *
                    uvm_mapent_alloc(struct vm_map *, int);
static void         uvm_mapent_copy(struct vm_map_entry *, struct vm_map_entry *);
static void         uvm_mapent_free(struct vm_map_entry *);
/*
 * uvm_mapent_check(e) validates an entry and records the calling line;
 * it expands to nothing unless DEBUG is defined.
 */
#if defined(DEBUG)
static void         _uvm_mapent_check(const struct vm_map_entry *, int);
#define   uvm_mapent_check(map)         _uvm_mapent_check(map, __LINE__)
#else /* defined(DEBUG) */
#define   uvm_mapent_check(e) /* nothing */
#endif /* defined(DEBUG) */

static void         uvm_map_entry_unwire(struct vm_map *, struct vm_map_entry *);
static void         uvm_map_reference_amap(struct vm_map_entry *, int);
static int          uvm_map_space_avail(vaddr_t *, vsize_t, voff_t, vsize_t, int,
                        int, struct vm_map_entry *);
static void         uvm_map_unreference_amap(struct vm_map_entry *, int);

/* consistency checkers; both return 0 when nothing is wrong, -1 otherwise */
int _uvm_map_sanity(struct vm_map *);
int _uvm_tree_sanity(struct vm_map *);
static vsize_t uvm_rb_maxgap(const struct vm_map_entry *);
305 
/*
 * Tree iteration.  We violate the rbtree(9) abstraction for various
 * things here.  Entries are ascending left to right, so, provided the
 * child entry in question exists:
 *
 *        LEFT_ENTRY(entry)->end <= entry->start
 *        entry->end <= RIGHT_ENTRY(entry)->start
 *
 * The casts between tree nodes and map entries depend on rb_node being
 * the first member of struct vm_map_entry, which the compile-time
 * assertion below enforces.  PARENT_ENTRY yields NULL for the root.
 */
__CTASSERT(offsetof(struct vm_map_entry, rb_node) == 0);
#define   ROOT_ENTRY(map)                                                                       \
          ((struct vm_map_entry *)(map)->rb_tree.rbt_root)
#define   LEFT_ENTRY(entry)                                                           \
          ((struct vm_map_entry *)(entry)->rb_node.rb_left)
#define   RIGHT_ENTRY(entry)                                                          \
          ((struct vm_map_entry *)(entry)->rb_node.rb_right)
#define   PARENT_ENTRY(map, entry)                                                    \
          (ROOT_ENTRY(map) == (entry)                                                 \
              ? NULL : (struct vm_map_entry *)RB_FATHER(&(entry)->rb_node))
324 
/*
 * These get filled in if/when the SYSVSHM shared memory code is loaded.
 *
 * We do this with function pointers rather than #ifdef SYSVSHM so that
 * the SYSVSHM code can be loaded and unloaded.  Both pointers start out
 * NULL.
 */
void (*uvm_shmexit)(struct vmspace *) = NULL;
void (*uvm_shmfork)(struct vmspace *, struct vmspace *) = NULL;
333 
334 static int
uvm_map_compare_nodes(void * ctx,const void * nparent,const void * nkey)335 uvm_map_compare_nodes(void *ctx, const void *nparent, const void *nkey)
336 {
337           const struct vm_map_entry *eparent = nparent;
338           const struct vm_map_entry *ekey = nkey;
339 
340           KASSERT(eparent->start < ekey->start || eparent->start >= ekey->end);
341           KASSERT(ekey->start < eparent->start || ekey->start >= eparent->end);
342 
343           if (eparent->start < ekey->start)
344                     return -1;
345           if (eparent->end >= ekey->start)
346                     return 1;
347           return 0;
348 }
349 
350 static int
uvm_map_compare_key(void * ctx,const void * nparent,const void * vkey)351 uvm_map_compare_key(void *ctx, const void *nparent, const void *vkey)
352 {
353           const struct vm_map_entry *eparent = nparent;
354           const vaddr_t va = *(const vaddr_t *) vkey;
355 
356           if (eparent->start < va)
357                     return -1;
358           if (eparent->end >= va)
359                     return 1;
360           return 0;
361 }
362 
/*
 * Operations vector for the per-map tree: the two comparators above, the
 * position of the tree node inside struct vm_map_entry, and no context
 * pointer.
 */
static const rb_tree_ops_t uvm_map_tree_ops = {
          .rbto_compare_nodes = uvm_map_compare_nodes,
          .rbto_compare_key = uvm_map_compare_key,
          .rbto_node_offset = offsetof(struct vm_map_entry, rb_node),
          .rbto_context = NULL
};
369 
370 /*
371  * uvm_rb_gap: return the gap size between our entry and next entry.
372  */
373 static inline vsize_t
uvm_rb_gap(const struct vm_map_entry * entry)374 uvm_rb_gap(const struct vm_map_entry *entry)
375 {
376 
377           KASSERT(entry->next != NULL);
378           return entry->next->start - entry->end;
379 }
380 
381 static vsize_t
uvm_rb_maxgap(const struct vm_map_entry * entry)382 uvm_rb_maxgap(const struct vm_map_entry *entry)
383 {
384           struct vm_map_entry *child;
385           vsize_t maxgap = entry->gap;
386 
387           /*
388            * We need maxgap to be the largest gap of us or any of our
389            * descendents.  Since each of our children's maxgap is the
390            * cached value of their largest gap of themselves or their
391            * descendents, we can just use that value and avoid recursing
392            * down the tree to calculate it.
393            */
394           if ((child = LEFT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
395                     maxgap = child->maxgap;
396 
397           if ((child = RIGHT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
398                     maxgap = child->maxgap;
399 
400           return maxgap;
401 }
402 
403 static void
uvm_rb_fixup(struct vm_map * map,struct vm_map_entry * entry)404 uvm_rb_fixup(struct vm_map *map, struct vm_map_entry *entry)
405 {
406           struct vm_map_entry *parent;
407 
408           KASSERT(entry->gap == uvm_rb_gap(entry));
409           entry->maxgap = uvm_rb_maxgap(entry);
410 
411           while ((parent = PARENT_ENTRY(map, entry)) != NULL) {
412                     struct vm_map_entry *brother;
413                     vsize_t maxgap = parent->gap;
414                     unsigned int which;
415 
416                     KDASSERT(parent->gap == uvm_rb_gap(parent));
417                     if (maxgap < entry->maxgap)
418                               maxgap = entry->maxgap;
419                     /*
420                      * Since we work towards the root, we know entry's maxgap
421                      * value is OK, but its brothers may now be out-of-date due
422                      * to rebalancing.  So refresh it.
423                      */
424                     which = RB_POSITION(&entry->rb_node) ^ RB_DIR_OTHER;
425                     brother = (struct vm_map_entry *)parent->rb_node.rb_nodes[which];
426                     if (brother != NULL) {
427                               KDASSERT(brother->gap == uvm_rb_gap(brother));
428                               brother->maxgap = uvm_rb_maxgap(brother);
429                               if (maxgap < brother->maxgap)
430                                         maxgap = brother->maxgap;
431                     }
432 
433                     parent->maxgap = maxgap;
434                     entry = parent;
435           }
436 }
437 
438 static void
uvm_rb_insert(struct vm_map * map,struct vm_map_entry * entry)439 uvm_rb_insert(struct vm_map *map, struct vm_map_entry *entry)
440 {
441           struct vm_map_entry *ret __diagused;
442 
443           entry->gap = entry->maxgap = uvm_rb_gap(entry);
444           if (entry->prev != &map->header)
445                     entry->prev->gap = uvm_rb_gap(entry->prev);
446 
447           ret = rb_tree_insert_node(&map->rb_tree, entry);
448           KASSERTMSG(ret == entry,
449               "uvm_rb_insert: map %p: duplicate entry %p", map, ret);
450 
451           /*
452            * If the previous entry is not our immediate left child, then it's an
453            * ancestor and will be fixed up on the way to the root.  We don't
454            * have to check entry->prev against &map->header since &map->header
455            * will never be in the tree.
456            */
457           uvm_rb_fixup(map,
458               LEFT_ENTRY(entry) == entry->prev ? entry->prev : entry);
459 }
460 
/*
 * uvm_rb_remove: take an entry out of the map's tree and refresh the
 * gap data of the nodes affected by its removal
 *
 * => entry->prev and entry->next are still consulted after the removal
 *    to find the former neighbours; &map->header stands for "no
 *    neighbour" on either side
 * => the previous entry's gap is recomputed with uvm_rb_gap(), which
 *    follows its next pointer
 */
static void
uvm_rb_remove(struct vm_map *map, struct vm_map_entry *entry)
{
          struct vm_map_entry *prev_parent = NULL, *next_parent = NULL;

          /*
           * If we are removing an interior node, then an adjacent node will
           * be used to replace its position in the tree.  Therefore we will
           * need to fixup the tree starting at the parent of the replacement
           * node.  So record their parents for later use.
           */
          if (entry->prev != &map->header)
                    prev_parent = PARENT_ENTRY(map, entry->prev);
          if (entry->next != &map->header)
                    next_parent = PARENT_ENTRY(map, entry->next);

          rb_tree_remove_node(&map->rb_tree, entry);

          /*
           * If the previous node has a new parent, fixup the tree starting
           * at the previous node's old parent.
           */
          if (entry->prev != &map->header) {
                    /*
                     * Update the previous entry's gap due to our absence.
                     */
                    entry->prev->gap = uvm_rb_gap(entry->prev);
                    uvm_rb_fixup(map, entry->prev);
                    /*
                     * The old parent is skipped when there was none, when
                     * it was the removed entry itself, or when it is still
                     * the parent.
                     */
                    if (prev_parent != NULL
                        && prev_parent != entry
                        && prev_parent != PARENT_ENTRY(map, entry->prev))
                              uvm_rb_fixup(map, prev_parent);
          }

          /*
           * If the next node has a new parent, fixup the tree starting
           * at the next node's old parent.
           */
          if (entry->next != &map->header) {
                    uvm_rb_fixup(map, entry->next);
                    if (next_parent != NULL
                        && next_parent != entry
                        && next_parent != PARENT_ENTRY(map, entry->next))
                              uvm_rb_fixup(map, next_parent);
          }
}
507 
#if defined(DEBUG)
/*
 * Run-time switches for the consistency checks below; both start at 0,
 * which turns the corresponding check off.
 */
int uvm_debug_check_map = 0;
int uvm_debug_check_rbtree = 0;

/*
 * uvm_map_check: panic if an enabled consistency check fails
 *
 * => name identifies the call site in the panic message, together with
 *    the file and line supplied by the macro
 * => the tree check is attempted only if the list check is disabled or
 *    passes
 */
#define uvm_map_check(map, name) \
	_uvm_map_check((map), (name), __FILE__, __LINE__)
static void
_uvm_map_check(struct vm_map *map, const char *name,
    const char *file, int line)
{
	bool broken = false;

	if (uvm_debug_check_map && _uvm_map_sanity(map))
		broken = true;
	else if (uvm_debug_check_rbtree && _uvm_tree_sanity(map))
		broken = true;

	if (broken) {
		panic("uvm_map_check failed: \"%s\" map=%p (%s:%d)",
		    name, map, file, line);
	}
}
#else /* defined(DEBUG) */
#define uvm_map_check(map, name)	/* nothing */
#endif /* defined(DEBUG) */
527 
528 #if defined(DEBUG) || defined(DDB)
529 int
_uvm_map_sanity(struct vm_map * map)530 _uvm_map_sanity(struct vm_map *map)
531 {
532           bool first_free_found = false;
533           bool hint_found = false;
534           const struct vm_map_entry *e;
535           struct vm_map_entry *hint = map->hint;
536 
537           e = &map->header;
538           for (;;) {
539                     if (map->first_free == e) {
540                               first_free_found = true;
541                     } else if (!first_free_found && e->next->start > e->end) {
542                               printf("first_free %p should be %p\n",
543                                   map->first_free, e);
544                               return -1;
545                     }
546                     if (hint == e) {
547                               hint_found = true;
548                     }
549 
550                     e = e->next;
551                     if (e == &map->header) {
552                               break;
553                     }
554           }
555           if (!first_free_found) {
556                     printf("stale first_free\n");
557                     return -1;
558           }
559           if (!hint_found) {
560                     printf("stale hint\n");
561                     return -1;
562           }
563           return 0;
564 }
565 
566 int
_uvm_tree_sanity(struct vm_map * map)567 _uvm_tree_sanity(struct vm_map *map)
568 {
569           struct vm_map_entry *tmp, *trtmp;
570           int n = 0, i = 1;
571 
572           for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
573                     if (tmp->gap != uvm_rb_gap(tmp)) {
574                               printf("%d/%d gap %#lx != %#lx %s\n",
575                                   n + 1, map->nentries,
576                                   (ulong)tmp->gap, (ulong)uvm_rb_gap(tmp),
577                                   tmp->next == &map->header ? "(last)" : "");
578                               goto error;
579                     }
580                     /*
581                      * If any entries are out of order, tmp->gap will be unsigned
582                      * and will likely exceed the size of the map.
583                      */
584                     if (tmp->gap >= vm_map_max(map) - vm_map_min(map)) {
585                               printf("too large gap %zu\n", (size_t)tmp->gap);
586                               goto error;
587                     }
588                     n++;
589           }
590 
591           if (n != map->nentries) {
592                     printf("nentries: %d vs %d\n", n, map->nentries);
593                     goto error;
594           }
595 
596           trtmp = NULL;
597           for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
598                     if (tmp->maxgap != uvm_rb_maxgap(tmp)) {
599                               printf("maxgap %#lx != %#lx\n",
600                                   (ulong)tmp->maxgap,
601                                   (ulong)uvm_rb_maxgap(tmp));
602                               goto error;
603                     }
604                     if (trtmp != NULL && trtmp->start >= tmp->start) {
605                               printf("corrupt: 0x%"PRIxVADDR"x >= 0x%"PRIxVADDR"x\n",
606                                   trtmp->start, tmp->start);
607                               goto error;
608                     }
609 
610                     trtmp = tmp;
611           }
612 
613           for (tmp = map->header.next; tmp != &map->header;
614               tmp = tmp->next, i++) {
615                     trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_LEFT);
616                     if (trtmp == NULL)
617                               trtmp = &map->header;
618                     if (tmp->prev != trtmp) {
619                               printf("lookup: %d: %p->prev=%p: %p\n",
620                                   i, tmp, tmp->prev, trtmp);
621                               goto error;
622                     }
623                     trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_RIGHT);
624                     if (trtmp == NULL)
625                               trtmp = &map->header;
626                     if (tmp->next != trtmp) {
627                               printf("lookup: %d: %p->next=%p: %p\n",
628                                   i, tmp, tmp->next, trtmp);
629                               goto error;
630                     }
631                     trtmp = rb_tree_find_node(&map->rb_tree, &tmp->start);
632                     if (trtmp != tmp) {
633                               printf("lookup: %d: %p - %p: %p\n", i, tmp, trtmp,
634                                   PARENT_ENTRY(map, tmp));
635                               goto error;
636                     }
637           }
638 
639           return (0);
640  error:
641           return (-1);
642 }
643 #endif /* defined(DEBUG) || defined(DDB) */
644 
645 /*
646  * vm_map_lock: acquire an exclusive (write) lock on a map.
647  *
648  * => The locking protocol provides for guaranteed upgrade from shared ->
649  *    exclusive by whichever thread currently has the map marked busy.
650  *    See "LOCKING PROTOCOL NOTES" in uvm_map.h.  This is horrible; among
651  *    other problems, it defeats any fairness guarantees provided by RW
652  *    locks.
653  */
654 
655 void
vm_map_lock(struct vm_map * map)656 vm_map_lock(struct vm_map *map)
657 {
658 
659           for (;;) {
660                     rw_enter(&map->lock, RW_WRITER);
661                     if (map->busy == NULL || map->busy == curlwp) {
662                               break;
663                     }
664                     mutex_enter(&map->misc_lock);
665                     rw_exit(&map->lock);
666                     if (map->busy != NULL) {
667                               cv_wait(&map->cv, &map->misc_lock);
668                     }
669                     mutex_exit(&map->misc_lock);
670           }
671           map->timestamp++;
672 }
673 
674 /*
675  * vm_map_lock_try: try to lock a map, failing if it is already locked.
676  */
677 
678 bool
vm_map_lock_try(struct vm_map * map)679 vm_map_lock_try(struct vm_map *map)
680 {
681 
682           if (!rw_tryenter(&map->lock, RW_WRITER)) {
683                     return false;
684           }
685           if (map->busy != NULL) {
686                     rw_exit(&map->lock);
687                     return false;
688           }
689           map->timestamp++;
690           return true;
691 }
692 
693 /*
694  * vm_map_unlock: release an exclusive lock on a map.
695  */
696 
697 void
vm_map_unlock(struct vm_map * map)698 vm_map_unlock(struct vm_map *map)
699 {
700 
701           KASSERT(rw_write_held(&map->lock));
702           KASSERT(map->busy == NULL || map->busy == curlwp);
703           rw_exit(&map->lock);
704 }
705 
706 /*
707  * vm_map_unbusy: mark the map as unbusy, and wake any waiters that
708  *     want an exclusive lock.
709  */
710 
711 void
vm_map_unbusy(struct vm_map * map)712 vm_map_unbusy(struct vm_map *map)
713 {
714 
715           KASSERT(map->busy == curlwp);
716 
717           /*
718            * Safe to clear 'busy' and 'waiters' with only a read lock held:
719            *
720            * o they can only be set with a write lock held
721            * o writers are blocked out with a read or write hold
722            * o at any time, only one thread owns the set of values
723            */
724           mutex_enter(&map->misc_lock);
725           map->busy = NULL;
726           cv_broadcast(&map->cv);
727           mutex_exit(&map->misc_lock);
728 }
729 
730 /*
731  * vm_map_lock_read: acquire a shared (read) lock on a map.
732  */
733 
734 void
vm_map_lock_read(struct vm_map * map)735 vm_map_lock_read(struct vm_map *map)
736 {
737 
738           rw_enter(&map->lock, RW_READER);
739 }
740 
741 /*
742  * vm_map_unlock_read: release a shared lock on a map.
743  */
744 
745 void
vm_map_unlock_read(struct vm_map * map)746 vm_map_unlock_read(struct vm_map *map)
747 {
748 
749           rw_exit(&map->lock);
750 }
751 
752 /*
753  * vm_map_busy: mark a map as busy.
754  *
755  * => the caller must hold the map write locked
756  */
757 
758 void
vm_map_busy(struct vm_map * map)759 vm_map_busy(struct vm_map *map)
760 {
761 
762           KASSERT(rw_write_held(&map->lock));
763           KASSERT(map->busy == NULL);
764 
765           map->busy = curlwp;
766 }
767 
768 /*
769  * vm_map_locked_p: return true if the map is write locked.
770  *
771  * => only for debug purposes like KASSERTs.
772  * => should not be used to verify that a map is not locked.
773  */
774 
775 bool
vm_map_locked_p(struct vm_map * map)776 vm_map_locked_p(struct vm_map *map)
777 {
778 
779           return rw_write_held(&map->lock);
780 }
781 
782 /*
783  * uvm_mapent_alloc: allocate a map entry
784  */
785 
786 static struct vm_map_entry *
uvm_mapent_alloc(struct vm_map * map,int flags)787 uvm_mapent_alloc(struct vm_map *map, int flags)
788 {
789           struct vm_map_entry *me;
790           int pflags = (flags & UVM_FLAG_NOWAIT) ? PR_NOWAIT : PR_WAITOK;
791           UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
792 
793           me = pool_cache_get(&uvm_map_entry_cache, pflags);
794           if (__predict_false(me == NULL)) {
795                     return NULL;
796           }
797           me->flags = 0;
798 
799           UVMHIST_LOG(maphist, "<- new entry=%#jx [kentry=%jd]", (uintptr_t)me,
800               (map == kernel_map), 0, 0);
801           return me;
802 }
803 
804 /*
805  * uvm_mapent_free: free map entry
806  */
807 
808 static void
uvm_mapent_free(struct vm_map_entry * me)809 uvm_mapent_free(struct vm_map_entry *me)
810 {
811           UVMHIST_FUNC(__func__);
812           UVMHIST_CALLARGS(maphist,"<- freeing map entry=%#jx [flags=%#jx]",
813                     (uintptr_t)me, me->flags, 0, 0);
814           pool_cache_put(&uvm_map_entry_cache, me);
815 }
816 
817 /*
818  * uvm_mapent_copy: copy a map entry, preserving flags
819  */
820 
821 static inline void
uvm_mapent_copy(struct vm_map_entry * src,struct vm_map_entry * dst)822 uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
823 {
824 
825           memcpy(dst, src, sizeof(*dst));
826           dst->flags = 0;
827 }
828 
#if defined(DEBUG)
/*
 * _uvm_mapent_check: panic if a map entry is malformed
 *
 * => an entry is bad when:
 *    - its start is not below its end, or
 *    - it is an object mapping without an object, or
 *    - it is a submap mapping without a submap, or
 *    - it is neither yet has an object/submap pointer, or
 *    - it is not an object mapping yet has a nonzero offset
 * => line is the caller's line number, reported in the panic message
 */
static void
_uvm_mapent_check(const struct vm_map_entry *entry, int line)
{
	bool bad;

	if (entry->start >= entry->end) {
		bad = true;
	} else if (UVM_ET_ISOBJ(entry)) {
		/* The offset is not checked for object mappings. */
		bad = (entry->object.uvm_obj == NULL);
	} else if (UVM_ET_ISSUBMAP(entry)) {
		bad = (entry->object.sub_map == NULL) ||
		    (entry->offset != 0);
	} else {
		bad = (entry->object.uvm_obj != NULL ||
		    entry->object.sub_map != NULL) ||
		    (entry->offset != 0);
	}

	if (bad)
		panic("%s: bad entry %p, line %d", __func__, entry, line);
}
#endif /* defined(DEBUG) */
863 
864 /*
865  * uvm_map_entry_unwire: unwire a map entry
866  *
867  * => map should be locked by caller
868  */
869 
870 static inline void
uvm_map_entry_unwire(struct vm_map * map,struct vm_map_entry * entry)871 uvm_map_entry_unwire(struct vm_map *map, struct vm_map_entry *entry)
872 {
873 
874           entry->wired_count = 0;
875           uvm_fault_unwire_locked(map, entry->start, entry->end);
876 }
877 
878 
879 /*
880  * wrapper for calling amap_ref()
881  */
882 static inline void
uvm_map_reference_amap(struct vm_map_entry * entry,int flags)883 uvm_map_reference_amap(struct vm_map_entry *entry, int flags)
884 {
885 
886           amap_ref(entry->aref.ar_amap, entry->aref.ar_pageoff,
887               (entry->end - entry->start) >> PAGE_SHIFT, flags);
888 }
889 
890 
891 /*
892  * wrapper for calling amap_unref()
893  */
894 static inline void
uvm_map_unreference_amap(struct vm_map_entry * entry,int flags)895 uvm_map_unreference_amap(struct vm_map_entry *entry, int flags)
896 {
897 
898           amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff,
899               (entry->end - entry->start) >> PAGE_SHIFT, flags);
900 }
901 
902 
903 /*
904  * uvm_map_init: init mapping system at boot time.
905  */
906 
907 void
uvm_map_init(void)908 uvm_map_init(void)
909 {
910           /*
911            * first, init logging system.
912            */
913 
914           UVMHIST_FUNC(__func__);
915           UVMHIST_LINK_STATIC(maphist);
916           UVMHIST_LINK_STATIC(pdhist);
917           UVMHIST_CALLED(maphist);
918           UVMHIST_LOG(maphist,"<starting uvm map system>", 0, 0, 0, 0);
919 
920           /*
921            * initialize the global lock for kernel map entry.
922            */
923 
924           mutex_init(&uvm_kentry_lock, MUTEX_DRIVER, IPL_VM);
925 }
926 
927 /*
928  * uvm_map_init_caches: init mapping system caches.
929  */
930 void
uvm_map_init_caches(void)931 uvm_map_init_caches(void)
932 {
933           /*
934            * initialize caches.
935            */
936 
937           pool_cache_bootstrap(&uvm_map_entry_cache, sizeof(struct vm_map_entry),
938               coherency_unit, 0, PR_LARGECACHE, "vmmpepl", NULL, IPL_NONE, NULL,
939               NULL, NULL);
940 }
941 
942 /*
943  * clippers
944  */
945 
946 /*
947  * uvm_mapent_splitadj: adjust map entries for splitting, after uvm_mapent_copy.
948  */
949 
950 static void
uvm_mapent_splitadj(struct vm_map_entry * entry1,struct vm_map_entry * entry2,vaddr_t splitat)951 uvm_mapent_splitadj(struct vm_map_entry *entry1, struct vm_map_entry *entry2,
952     vaddr_t splitat)
953 {
954           vaddr_t adj;
955 
956           KASSERT(entry1->start < splitat);
957           KASSERT(splitat < entry1->end);
958 
959           adj = splitat - entry1->start;
960           entry1->end = entry2->start = splitat;
961 
962           if (entry1->aref.ar_amap) {
963                     amap_splitref(&entry1->aref, &entry2->aref, adj);
964           }
965           if (UVM_ET_ISSUBMAP(entry1)) {
966                     /* ... unlikely to happen, but play it safe */
967                      uvm_map_reference(entry1->object.sub_map);
968           } else if (UVM_ET_ISOBJ(entry1)) {
969                     KASSERT(entry1->object.uvm_obj != NULL); /* suppress coverity */
970                     entry2->offset += adj;
971                     if (entry1->object.uvm_obj->pgops &&
972                         entry1->object.uvm_obj->pgops->pgo_reference)
973                               entry1->object.uvm_obj->pgops->pgo_reference(
974                                   entry1->object.uvm_obj);
975           }
976 }
977 
978 /*
979  * uvm_map_clip_start: ensure that the entry begins at or after
980  *        the starting address, if it doesn't we split the entry.
981  *
982  * => caller should use UVM_MAP_CLIP_START macro rather than calling
983  *    this directly
984  * => map must be locked by caller
985  */
986 
987 void
uvm_map_clip_start(struct vm_map * map,struct vm_map_entry * entry,vaddr_t start)988 uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry,
989     vaddr_t start)
990 {
991           struct vm_map_entry *new_entry;
992 
993           /* uvm_map_simplify_entry(map, entry); */ /* XXX */
994 
995           uvm_map_check(map, "clip_start entry");
996           uvm_mapent_check(entry);
997 
998           /*
999            * Split off the front portion.  note that we must insert the new
1000            * entry BEFORE this one, so that this entry has the specified
1001            * starting address.
1002            */
1003           new_entry = uvm_mapent_alloc(map, 0);
1004           uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
1005           uvm_mapent_splitadj(new_entry, entry, start);
1006           uvm_map_entry_link(map, entry->prev, new_entry);
1007 
1008           uvm_map_check(map, "clip_start leave");
1009 }
1010 
1011 /*
1012  * uvm_map_clip_end: ensure that the entry ends at or before
1013  *        the ending address, if it does't we split the reference
1014  *
1015  * => caller should use UVM_MAP_CLIP_END macro rather than calling
1016  *    this directly
1017  * => map must be locked by caller
1018  */
1019 
1020 void
uvm_map_clip_end(struct vm_map * map,struct vm_map_entry * entry,vaddr_t end)1021 uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t end)
1022 {
1023           struct vm_map_entry *new_entry;
1024 
1025           uvm_map_check(map, "clip_end entry");
1026           uvm_mapent_check(entry);
1027 
1028           /*
1029            *        Create a new entry and insert it
1030            *        AFTER the specified entry
1031            */
1032           new_entry = uvm_mapent_alloc(map, 0);
1033           uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
1034           uvm_mapent_splitadj(entry, new_entry, end);
1035           uvm_map_entry_link(map, entry, new_entry);
1036 
1037           uvm_map_check(map, "clip_end leave");
1038 }
1039 
1040 /*
1041  *   M A P   -   m a i n   e n t r y   p o i n t
1042  */
1043 /*
1044  * uvm_map: establish a valid mapping in a map
1045  *
1046  * => assume startp is page aligned.
1047  * => assume size is a multiple of PAGE_SIZE.
1048  * => assume sys_mmap provides enough of a "hint" to have us skip
1049  *        over text/data/bss area.
1050  * => map must be unlocked (we will lock it)
1051  * => <uobj,uoffset> value meanings (4 cases):
1052  *         [1] <NULL,uoffset>           == uoffset is a hint for PMAP_PREFER
1053  *         [2] <NULL,UVM_UNKNOWN_OFFSET>          == don't PMAP_PREFER
1054  *         [3] <uobj,uoffset>           == normal mapping
1055  *         [4] <uobj,UVM_UNKNOWN_OFFSET>          == uvm_map finds offset based on VA
1056  *
1057  *    case [4] is for kernel mappings where we don't know the offset until
1058  *    we've found a virtual address.   note that kernel object offsets are
1059  *    always relative to vm_map_min(kernel_map).
1060  *
1061  * => if `align' is non-zero, we align the virtual address to the specified
1062  *        alignment.
1063  *        this is provided as a mechanism for large pages.
1064  *
1065  * => XXXCDC: need way to map in external amap?
1066  */
1067 
1068 int
uvm_map(struct vm_map * map,vaddr_t * startp,vsize_t size,struct uvm_object * uobj,voff_t uoffset,vsize_t align,uvm_flag_t flags)1069 uvm_map(struct vm_map *map, vaddr_t *startp /* IN/OUT */, vsize_t size,
1070     struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags)
1071 {
1072           struct uvm_map_args args;
1073           struct vm_map_entry *new_entry;
1074           int error;
1075 
1076           KASSERT((size & PAGE_MASK) == 0);
1077           KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
1078 
1079           /*
1080            * for pager_map, allocate the new entry first to avoid sleeping
1081            * for memory while we have the map locked.
1082            */
1083 
1084           new_entry = NULL;
1085           if (map == pager_map) {
1086                     new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT));
1087                     if (__predict_false(new_entry == NULL))
1088                               return ENOMEM;
1089           }
1090           if (map == pager_map)
1091                     flags |= UVM_FLAG_NOMERGE;
1092 
1093           error = uvm_map_prepare(map, *startp, size, uobj, uoffset, align,
1094               flags, &args);
1095           if (!error) {
1096                     error = uvm_map_enter(map, &args, new_entry);
1097                     *startp = args.uma_start;
1098           } else if (new_entry) {
1099                     uvm_mapent_free(new_entry);
1100           }
1101 
1102 #if defined(DEBUG)
1103           if (!error && VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
1104                     uvm_km_check_empty(map, *startp, *startp + size);
1105           }
1106 #endif /* defined(DEBUG) */
1107 
1108           return error;
1109 }
1110 
1111 /*
1112  * uvm_map_prepare:
1113  *
1114  * called with map unlocked.
1115  * on success, returns the map locked.
1116  */
1117 
1118 int
uvm_map_prepare(struct vm_map * map,vaddr_t start,vsize_t size,struct uvm_object * uobj,voff_t uoffset,vsize_t align,uvm_flag_t flags,struct uvm_map_args * args)1119 uvm_map_prepare(struct vm_map *map, vaddr_t start, vsize_t size,
1120     struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags,
1121     struct uvm_map_args *args)
1122 {
1123           struct vm_map_entry *prev_entry;
1124           vm_prot_t prot = UVM_PROTECTION(flags);
1125           vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
1126 
1127           UVMHIST_FUNC(__func__);
1128           UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%jx, flags=%#jx)",
1129               (uintptr_t)map, start, size, flags);
1130           UVMHIST_LOG(maphist, "  uobj/offset %#jx/%jd", (uintptr_t)uobj,
1131               uoffset,0,0);
1132 
1133           /*
1134            * detect a popular device driver bug.
1135            */
1136 
1137           KASSERT(doing_shutdown || curlwp != NULL);
1138 
1139           /*
1140            * zero-sized mapping doesn't make any sense.
1141            */
1142           KASSERT(size > 0);
1143 
1144           KASSERT((~flags & (UVM_FLAG_NOWAIT | UVM_FLAG_WAITVA)) != 0);
1145 
1146           uvm_map_check(map, "map entry");
1147 
1148           /*
1149            * check sanity of protection code
1150            */
1151 
1152           if ((prot & maxprot) != prot) {
1153                     UVMHIST_LOG(maphist, "<- prot. failure:  prot=%#jx, max=%#jx",
1154                     prot, maxprot,0,0);
1155                     return EACCES;
1156           }
1157 
1158           /*
1159            * figure out where to put new VM range
1160            */
1161 retry:
1162           if (vm_map_lock_try(map) == false) {
1163                     if ((flags & UVM_FLAG_TRYLOCK) != 0) {
1164                               return EAGAIN;
1165                     }
1166                     vm_map_lock(map); /* could sleep here */
1167           }
1168           if (flags & UVM_FLAG_UNMAP) {
1169                     KASSERT(flags & UVM_FLAG_FIXED);
1170                     KASSERT((flags & UVM_FLAG_NOWAIT) == 0);
1171 
1172                     /*
1173                      * Set prev_entry to what it will need to be after any existing
1174                      * entries are removed later in uvm_map_enter().
1175                      */
1176 
1177                     if (uvm_map_lookup_entry(map, start, &prev_entry)) {
1178                               if (start == prev_entry->start)
1179                                         prev_entry = prev_entry->prev;
1180                               else
1181                                         UVM_MAP_CLIP_END(map, prev_entry, start);
1182                               SAVE_HINT(map, map->hint, prev_entry);
1183                     }
1184           } else {
1185                     prev_entry = uvm_map_findspace(map, start, size, &start,
1186                         uobj, uoffset, align, flags);
1187           }
1188           if (prev_entry == NULL) {
1189                     unsigned int timestamp;
1190 
1191                     timestamp = map->timestamp;
1192                     UVMHIST_LOG(maphist,"waiting va timestamp=%#jx",
1193                                   timestamp,0,0,0);
1194                     map->flags |= VM_MAP_WANTVA;
1195                     vm_map_unlock(map);
1196 
1197                     /*
1198                      * try to reclaim kva and wait until someone does unmap.
1199                      * fragile locking here, so we awaken every second to
1200                      * recheck the condition.
1201                      */
1202 
1203                     mutex_enter(&map->misc_lock);
1204                     while ((map->flags & VM_MAP_WANTVA) != 0 &&
1205                        map->timestamp == timestamp) {
1206                               if ((flags & UVM_FLAG_WAITVA) == 0) {
1207                                         mutex_exit(&map->misc_lock);
1208                                         UVMHIST_LOG(maphist,
1209                                             "<- uvm_map_findspace failed!", 0,0,0,0);
1210                                         return ENOMEM;
1211                               } else {
1212                                         cv_timedwait(&map->cv, &map->misc_lock, hz);
1213                               }
1214                     }
1215                     mutex_exit(&map->misc_lock);
1216                     goto retry;
1217           }
1218 
1219 #ifdef PMAP_GROWKERNEL
1220           /*
1221            * If the kernel pmap can't map the requested space,
1222            * then allocate more resources for it.
1223            */
1224           if (map == kernel_map && uvm_maxkaddr < (start + size))
1225                     uvm_maxkaddr = pmap_growkernel(start + size);
1226 #endif
1227 
1228           UVMMAP_EVCNT_INCR(map_call);
1229 
1230           /*
1231            * if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER
1232            * [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET.   in
1233            * either case we want to zero it  before storing it in the map entry
1234            * (because it looks strange and confusing when debugging...)
1235            *
1236            * if uobj is not null
1237            *   if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping
1238            *      and we do not need to change uoffset.
1239            *   if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset
1240            *      now (based on the starting address of the map).   this case is
1241            *      for kernel object mappings where we don't know the offset until
1242            *      the virtual address is found (with uvm_map_findspace).   the
1243            *      offset is the distance we are from the start of the map.
1244            */
1245 
1246           if (uobj == NULL) {
1247                     uoffset = 0;
1248           } else {
1249                     if (uoffset == UVM_UNKNOWN_OFFSET) {
1250                               KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
1251                               uoffset = start - vm_map_min(kernel_map);
1252                     }
1253           }
1254 
1255           args->uma_flags = flags;
1256           args->uma_prev = prev_entry;
1257           args->uma_start = start;
1258           args->uma_size = size;
1259           args->uma_uobj = uobj;
1260           args->uma_uoffset = uoffset;
1261 
1262           UVMHIST_LOG(maphist, "<- done!", 0,0,0,0);
1263           return 0;
1264 }
1265 
1266 /*
1267  * uvm_map_enter:
1268  *
1269  * called with map locked.
1270  * unlock the map before returning.
1271  */
1272 
1273 int
uvm_map_enter(struct vm_map * map,const struct uvm_map_args * args,struct vm_map_entry * new_entry)1274 uvm_map_enter(struct vm_map *map, const struct uvm_map_args *args,
1275     struct vm_map_entry *new_entry)
1276 {
1277           struct vm_map_entry *prev_entry = args->uma_prev;
1278           struct vm_map_entry *dead = NULL, *dead_entries = NULL;
1279 
1280           const uvm_flag_t flags = args->uma_flags;
1281           const vm_prot_t prot = UVM_PROTECTION(flags);
1282           const vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
1283           const vm_inherit_t inherit = UVM_INHERIT(flags);
1284           const int amapwaitflag = (flags & UVM_FLAG_NOWAIT) ?
1285               AMAP_EXTEND_NOWAIT : 0;
1286           const int advice = UVM_ADVICE(flags);
1287 
1288           vaddr_t start = args->uma_start;
1289           vsize_t size = args->uma_size;
1290           struct uvm_object *uobj = args->uma_uobj;
1291           voff_t uoffset = args->uma_uoffset;
1292 
1293           const int kmap = (vm_map_pmap(map) == pmap_kernel());
1294           int merged = 0;
1295           int error;
1296           int newetype;
1297 
1298           UVMHIST_FUNC(__func__);
1299           UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%ju, flags=%#jx)",
1300               (uintptr_t)map, start, size, flags);
1301           UVMHIST_LOG(maphist, "  uobj/offset %#jx/%jd", (uintptr_t)uobj,
1302               uoffset,0,0);
1303 
1304           KASSERT(map->hint == prev_entry); /* bimerge case assumes this */
1305           KASSERT(vm_map_locked_p(map));
1306           KASSERT((flags & (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP)) !=
1307                     (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP));
1308 
1309           if (uobj)
1310                     newetype = UVM_ET_OBJ;
1311           else
1312                     newetype = 0;
1313 
1314           if (flags & UVM_FLAG_COPYONW) {
1315                     newetype |= UVM_ET_COPYONWRITE;
1316                     if ((flags & UVM_FLAG_OVERLAY) == 0)
1317                               newetype |= UVM_ET_NEEDSCOPY;
1318           }
1319 
1320           /*
1321            * For mappings with unmap, remove any old entries now.  Adding the new
1322            * entry cannot fail because that can only happen if UVM_FLAG_NOWAIT
1323            * is set, and we do not support nowait and unmap together.
1324            */
1325 
1326           if (flags & UVM_FLAG_UNMAP) {
1327                     KASSERT(flags & UVM_FLAG_FIXED);
1328                     uvm_unmap_remove(map, start, start + size, &dead_entries, 0);
1329 #ifdef DEBUG
1330                     struct vm_map_entry *tmp_entry __diagused;
1331                     bool rv __diagused;
1332 
1333                     rv = uvm_map_lookup_entry(map, start, &tmp_entry);
1334                     KASSERT(!rv);
1335                     KASSERTMSG(prev_entry == tmp_entry,
1336                                  "args %p prev_entry %p tmp_entry %p",
1337                                  args, prev_entry, tmp_entry);
1338 #endif
1339                     SAVE_HINT(map, map->hint, prev_entry);
1340           }
1341 
1342           /*
1343            * try and insert in map by extending previous entry, if possible.
1344            * XXX: we don't try and pull back the next entry.   might be useful
1345            * for a stack, but we are currently allocating our stack in advance.
1346            */
1347 
1348           if (flags & UVM_FLAG_NOMERGE)
1349                     goto nomerge;
1350 
1351           if (prev_entry->end == start &&
1352               prev_entry != &map->header &&
1353               UVM_ET_ISCOMPATIBLE(prev_entry, newetype, uobj, 0,
1354               prot, maxprot, inherit, advice, 0)) {
1355 
1356                     if (uobj && prev_entry->offset +
1357                         (prev_entry->end - prev_entry->start) != uoffset)
1358                               goto forwardmerge;
1359 
1360                     /*
1361                      * can't extend a shared amap.  note: no need to lock amap to
1362                      * look at refs since we don't care about its exact value.
1363                      * if it is one (i.e. we have only reference) it will stay there
1364                      */
1365 
1366                     if (prev_entry->aref.ar_amap &&
1367                         amap_refs(prev_entry->aref.ar_amap) != 1) {
1368                               goto forwardmerge;
1369                     }
1370 
1371                     if (prev_entry->aref.ar_amap) {
1372                               error = amap_extend(prev_entry, size,
1373                                   amapwaitflag | AMAP_EXTEND_FORWARDS);
1374                               if (error)
1375                                         goto nomerge;
1376                     }
1377 
1378                     if (kmap) {
1379                               UVMMAP_EVCNT_INCR(kbackmerge);
1380                     } else {
1381                               UVMMAP_EVCNT_INCR(ubackmerge);
1382                     }
1383                     UVMHIST_LOG(maphist,"  starting back merge", 0, 0, 0, 0);
1384 
1385                     /*
1386                      * drop our reference to uobj since we are extending a reference
1387                      * that we already have (the ref count can not drop to zero).
1388                      */
1389 
1390                     if (uobj && uobj->pgops->pgo_detach)
1391                               uobj->pgops->pgo_detach(uobj);
1392 
1393                     /*
1394                      * Now that we've merged the entries, note that we've grown
1395                      * and our gap has shrunk.  Then fix the tree.
1396                      */
1397                     prev_entry->end += size;
1398                     prev_entry->gap -= size;
1399                     uvm_rb_fixup(map, prev_entry);
1400 
1401                     uvm_map_check(map, "map backmerged");
1402 
1403                     UVMHIST_LOG(maphist,"<- done (via backmerge)!", 0, 0, 0, 0);
1404                     merged++;
1405           }
1406 
1407 forwardmerge:
1408           if (prev_entry->next->start == (start + size) &&
1409               prev_entry->next != &map->header &&
1410               UVM_ET_ISCOMPATIBLE(prev_entry->next, newetype, uobj, 0,
1411               prot, maxprot, inherit, advice, 0)) {
1412 
1413                     if (uobj && prev_entry->next->offset != uoffset + size)
1414                               goto nomerge;
1415 
1416                     /*
1417                      * can't extend a shared amap.  note: no need to lock amap to
1418                      * look at refs since we don't care about its exact value.
1419                      * if it is one (i.e. we have only reference) it will stay there.
1420                      *
1421                      * note that we also can't merge two amaps, so if we
1422                      * merged with the previous entry which has an amap,
1423                      * and the next entry also has an amap, we give up.
1424                      *
1425                      * Interesting cases:
1426                      * amap, new, amap -> give up second merge (single fwd extend)
1427                      * amap, new, none -> double forward extend (extend again here)
1428                      * none, new, amap -> double backward extend (done here)
1429                      * uobj, new, amap -> single backward extend (done here)
1430                      *
1431                      * XXX should we attempt to deal with someone refilling
1432                      * the deallocated region between two entries that are
1433                      * backed by the same amap (ie, arefs is 2, "prev" and
1434                      * "next" refer to it, and adding this allocation will
1435                      * close the hole, thus restoring arefs to 1 and
1436                      * deallocating the "next" vm_map_entry)?  -- @@@
1437                      */
1438 
1439                     if (prev_entry->next->aref.ar_amap &&
1440                         (amap_refs(prev_entry->next->aref.ar_amap) != 1 ||
1441                          (merged && prev_entry->aref.ar_amap))) {
1442                               goto nomerge;
1443                     }
1444 
1445                     if (merged) {
1446                               /*
1447                                * Try to extend the amap of the previous entry to
1448                                * cover the next entry as well.  If it doesn't work
1449                                * just skip on, don't actually give up, since we've
1450                                * already completed the back merge.
1451                                */
1452                               if (prev_entry->aref.ar_amap) {
1453                                         if (amap_extend(prev_entry,
1454                                             prev_entry->next->end -
1455                                             prev_entry->next->start,
1456                                             amapwaitflag | AMAP_EXTEND_FORWARDS))
1457                                                   goto nomerge;
1458                               }
1459 
1460                               /*
1461                                * Try to extend the amap of the *next* entry
1462                                * back to cover the new allocation *and* the
1463                                * previous entry as well (the previous merge
1464                                * didn't have an amap already otherwise we
1465                                * wouldn't be checking here for an amap).  If
1466                                * it doesn't work just skip on, again, don't
1467                                * actually give up, since we've already
1468                                * completed the back merge.
1469                                */
1470                               else if (prev_entry->next->aref.ar_amap) {
1471                                         if (amap_extend(prev_entry->next,
1472                                             prev_entry->end -
1473                                             prev_entry->start,
1474                                             amapwaitflag | AMAP_EXTEND_BACKWARDS))
1475                                                   goto nomerge;
1476                               }
1477                     } else {
1478                               /*
1479                                * Pull the next entry's amap backwards to cover this
1480                                * new allocation.
1481                                */
1482                               if (prev_entry->next->aref.ar_amap) {
1483                                         error = amap_extend(prev_entry->next, size,
1484                                             amapwaitflag | AMAP_EXTEND_BACKWARDS);
1485                                         if (error)
1486                                                   goto nomerge;
1487                               }
1488                     }
1489 
1490                     if (merged) {
1491                               if (kmap) {
1492                                         UVMMAP_EVCNT_DECR(kbackmerge);
1493                                         UVMMAP_EVCNT_INCR(kbimerge);
1494                               } else {
1495                                         UVMMAP_EVCNT_DECR(ubackmerge);
1496                                         UVMMAP_EVCNT_INCR(ubimerge);
1497                               }
1498                     } else {
1499                               if (kmap) {
1500                                         UVMMAP_EVCNT_INCR(kforwmerge);
1501                               } else {
1502                                         UVMMAP_EVCNT_INCR(uforwmerge);
1503                               }
1504                     }
1505                     UVMHIST_LOG(maphist,"  starting forward merge", 0, 0, 0, 0);
1506 
1507                     /*
1508                      * drop our reference to uobj since we are extending a reference
1509                      * that we already have (the ref count can not drop to zero).
1510                      */
1511                     if (uobj && uobj->pgops->pgo_detach)
1512                               uobj->pgops->pgo_detach(uobj);
1513 
1514                     if (merged) {
1515                               dead = prev_entry->next;
1516                               prev_entry->end = dead->end;
1517                               uvm_map_entry_unlink(map, dead);
1518                               if (dead->aref.ar_amap != NULL) {
1519                                         prev_entry->aref = dead->aref;
1520                                         dead->aref.ar_amap = NULL;
1521                               }
1522                     } else {
1523                               prev_entry->next->start -= size;
1524                               if (prev_entry != &map->header) {
1525                                         prev_entry->gap -= size;
1526                                         KASSERT(prev_entry->gap == uvm_rb_gap(prev_entry));
1527                                         uvm_rb_fixup(map, prev_entry);
1528                               }
1529                               if (uobj)
1530                                         prev_entry->next->offset = uoffset;
1531                     }
1532 
1533                     uvm_map_check(map, "map forwardmerged");
1534 
1535                     UVMHIST_LOG(maphist,"<- done forwardmerge", 0, 0, 0, 0);
1536                     merged++;
1537           }
1538 
1539 nomerge:
1540           if (!merged) {
1541                     UVMHIST_LOG(maphist,"  allocating new map entry", 0, 0, 0, 0);
1542                     if (kmap) {
1543                               UVMMAP_EVCNT_INCR(knomerge);
1544                     } else {
1545                               UVMMAP_EVCNT_INCR(unomerge);
1546                     }
1547 
1548                     /*
1549                      * allocate new entry and link it in.
1550                      */
1551 
1552                     if (new_entry == NULL) {
1553                               new_entry = uvm_mapent_alloc(map,
1554                                         (flags & UVM_FLAG_NOWAIT));
1555                               if (__predict_false(new_entry == NULL)) {
1556                                         error = ENOMEM;
1557                                         goto done;
1558                               }
1559                     }
1560                     new_entry->start = start;
1561                     new_entry->end = new_entry->start + size;
1562                     new_entry->object.uvm_obj = uobj;
1563                     new_entry->offset = uoffset;
1564 
1565                     new_entry->etype = newetype;
1566 
1567                     if (flags & UVM_FLAG_NOMERGE) {
1568                               new_entry->flags |= UVM_MAP_NOMERGE;
1569                     }
1570 
1571                     new_entry->protection = prot;
1572                     new_entry->max_protection = maxprot;
1573                     new_entry->inheritance = inherit;
1574                     new_entry->wired_count = 0;
1575                     new_entry->advice = advice;
1576                     if (flags & UVM_FLAG_OVERLAY) {
1577 
1578                               /*
1579                                * to_add: for BSS we overallocate a little since we
1580                                * are likely to extend
1581                                */
1582 
1583                               vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ?
1584                                         UVM_AMAP_CHUNK << PAGE_SHIFT : 0;
1585                               struct vm_amap *amap = amap_alloc(size, to_add,
1586                                   (flags & UVM_FLAG_NOWAIT));
1587                               if (__predict_false(amap == NULL)) {
1588                                         error = ENOMEM;
1589                                         goto done;
1590                               }
1591                               new_entry->aref.ar_pageoff = 0;
1592                               new_entry->aref.ar_amap = amap;
1593                     } else {
1594                               new_entry->aref.ar_pageoff = 0;
1595                               new_entry->aref.ar_amap = NULL;
1596                     }
1597                     uvm_map_entry_link(map, prev_entry, new_entry);
1598 
1599                     /*
1600                      * Update the free space hint
1601                      */
1602 
1603                     if ((map->first_free == prev_entry) &&
1604                         (prev_entry->end >= new_entry->start))
1605                               map->first_free = new_entry;
1606 
1607                     new_entry = NULL;
1608           }
1609 
1610           map->size += size;
1611 
1612           UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
1613 
1614           error = 0;
1615 
1616 done:
1617           vm_map_unlock(map);
1618 
1619           if (new_entry) {
1620                     uvm_mapent_free(new_entry);
1621           }
1622           if (dead) {
1623                     KDASSERT(merged);
1624                     uvm_mapent_free(dead);
1625           }
1626           if (dead_entries)
1627                     uvm_unmap_detach(dead_entries, 0);
1628 
1629           return error;
1630 }
1631 
1632 /*
1633  * uvm_map_lookup_entry_bytree: lookup an entry in tree
1634  *
1635  * => map must at least be read-locked by caller.
1636  *
1637  * => If address lies in an entry, set *entry to it and return true;
1638  *    then (*entry)->start <= address < (*entry)->end.
1639 
1640  * => If address is below all entries in map, return false and set
1641  *    *entry to &map->header.
1642  *
1643  * => Otherwise, return false and set *entry to the highest entry below
1644  *    address, so (*entry)->end <= address, and if (*entry)->next is
1645  *    not &map->header, address < (*entry)->next->start.
1646  */
1647 
1648 static inline bool
uvm_map_lookup_entry_bytree(struct vm_map * map,vaddr_t address,struct vm_map_entry ** entry)1649 uvm_map_lookup_entry_bytree(struct vm_map *map, vaddr_t address,
1650     struct vm_map_entry **entry         /* OUT */)
1651 {
1652           struct vm_map_entry *prev = &map->header;
1653           struct vm_map_entry *cur = ROOT_ENTRY(map);
1654 
1655           KASSERT(rw_lock_held(&map->lock));
1656 
1657           while (cur) {
1658                     KASSERT(prev == &map->header || prev->end <= address);
1659                     KASSERT(prev == &map->header || prev->end <= cur->start);
1660                     UVMMAP_EVCNT_INCR(mlk_treeloop);
1661                     if (address >= cur->start) {
1662                               if (address < cur->end) {
1663                                         *entry = cur;
1664                                         return true;
1665                               }
1666                               prev = cur;
1667                               KASSERT(prev->end <= address);
1668                               cur = RIGHT_ENTRY(cur);
1669                               KASSERT(cur == NULL || prev->end <= cur->start);
1670                     } else
1671                               cur = LEFT_ENTRY(cur);
1672           }
1673           KASSERT(prev == &map->header || prev->end <= address);
1674           KASSERT(prev->next == &map->header || address < prev->next->start);
1675           *entry = prev;
1676           return false;
1677 }
1678 
1679 /*
1680  * uvm_map_lookup_entry: find map entry at or before an address
1681  *
1682  * => map must at least be read-locked by caller.
1683  *
1684  * => If address lies in an entry, set *entry to it and return true;
1685  *    then (*entry)->start <= address < (*entry)->end.
1686 
1687  * => If address is below all entries in map, return false and set
1688  *    *entry to &map->header.
1689  *
1690  * => Otherwise, return false and set *entry to the highest entry below
1691  *    address, so (*entry)->end <= address, and if (*entry)->next is
1692  *    not &map->header, address < (*entry)->next->start.
1693  */
1694 
1695 bool
uvm_map_lookup_entry(struct vm_map * map,vaddr_t address,struct vm_map_entry ** entry)1696 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
1697     struct vm_map_entry **entry         /* OUT */)
1698 {
1699           struct vm_map_entry *cur;
1700           UVMHIST_FUNC(__func__);
1701           UVMHIST_CALLARGS(maphist,"(map=%#jx,addr=%#jx,ent=%#jx)",
1702               (uintptr_t)map, address, (uintptr_t)entry, 0);
1703 
1704           KASSERT(rw_lock_held(&map->lock));
1705 
1706           /*
1707            * make a quick check to see if we are already looking at
1708            * the entry we want (which is usually the case).  note also
1709            * that we don't need to save the hint here...  it is the
1710            * same hint (unless we are at the header, in which case the
1711            * hint didn't buy us anything anyway).
1712            */
1713 
1714           cur = map->hint;
1715           UVMMAP_EVCNT_INCR(mlk_call);
1716           if (cur != &map->header &&
1717               address >= cur->start && cur->end > address) {
1718                     UVMMAP_EVCNT_INCR(mlk_hint);
1719                     *entry = cur;
1720                     UVMHIST_LOG(maphist,"<- got it via hint (%#jx)",
1721                         (uintptr_t)cur, 0, 0, 0);
1722                     uvm_mapent_check(*entry);
1723                     return (true);
1724           }
1725           uvm_map_check(map, __func__);
1726 
1727           /*
1728            * lookup in the tree.
1729            */
1730 
1731           UVMMAP_EVCNT_INCR(mlk_tree);
1732           if (__predict_true(uvm_map_lookup_entry_bytree(map, address, entry))) {
1733                     SAVE_HINT(map, map->hint, *entry);
1734                     UVMHIST_LOG(maphist,"<- search got it (%#jx)",
1735                         (uintptr_t)cur, 0, 0, 0);
1736                     KDASSERT((*entry)->start <= address);
1737                     KDASSERT(address < (*entry)->end);
1738                     uvm_mapent_check(*entry);
1739                     return (true);
1740           }
1741 
1742           SAVE_HINT(map, map->hint, *entry);
1743           UVMHIST_LOG(maphist,"<- failed!",0,0,0,0);
1744           KDASSERT((*entry) == &map->header || (*entry)->end <= address);
1745           KDASSERT((*entry)->next == &map->header ||
1746               address < (*entry)->next->start);
1747           return (false);
1748 }
1749 
1750 /*
1751  * See if the range between start and start + length fits in the gap
1752  * entry->next->start and entry->end.  Returns 1 if fits, 0 if doesn't
1753  * fit, and -1 address wraps around.
1754  */
1755 static int
uvm_map_space_avail(vaddr_t * start,vsize_t length,voff_t uoffset,vsize_t align,int flags,int topdown,struct vm_map_entry * entry)1756 uvm_map_space_avail(vaddr_t *start, vsize_t length, voff_t uoffset,
1757     vsize_t align, int flags, int topdown, struct vm_map_entry *entry)
1758 {
1759           vaddr_t orig_start = *start;
1760           vaddr_t end;
1761 
1762 #define   INVARIANTS()                                                                          \
1763           KASSERTMSG((topdown                                                         \
1764                     ? *start <= orig_start                                                      \
1765                     : *start >= orig_start),                                          \
1766               "[%s] *start=%"PRIxVADDR" orig_start=%"PRIxVADDR                        \
1767               " length=%"PRIxVSIZE" uoffset=%#llx align=%"PRIxVSIZE         \
1768               " flags=%x entry@%p=[%"PRIxVADDR",%"PRIxVADDR")"                        \
1769               " ncolors=%d colormask=%x",                                                       \
1770               topdown ? "topdown" : "bottomup", *start, orig_start,         \
1771               length, (unsigned long long)uoffset, align,                             \
1772               flags, entry, entry->start, entry->end,                                 \
1773               uvmexp.ncolors, uvmexp.colormask)
1774 
1775           INVARIANTS();
1776 
1777 #ifdef PMAP_PREFER
1778           /*
1779            * push start address forward as needed to avoid VAC alias problems.
1780            * we only do this if a valid offset is specified.
1781            */
1782 
1783           if (uoffset != UVM_UNKNOWN_OFFSET) {
1784                     PMAP_PREFER(uoffset, start, length, topdown);
1785                     INVARIANTS();
1786           }
1787 #endif
1788           if ((flags & UVM_FLAG_COLORMATCH) != 0) {
1789                     KASSERT(align < uvmexp.ncolors);
1790                     if (uvmexp.ncolors > 1) {
1791                               const u_int colormask = uvmexp.colormask;
1792                               const u_int colorsize = colormask + 1;
1793                               vaddr_t hint = atop(*start);
1794                               const u_int color = hint & colormask;
1795                               if (color != align) {
1796                                         hint -= color;      /* adjust to color boundary */
1797                                         KASSERT((hint & colormask) == 0);
1798                                         if (topdown) {
1799                                                   if (align > color)
1800                                                             hint -= colorsize;
1801                                         } else {
1802                                                   if (align < color)
1803                                                             hint += colorsize;
1804                                         }
1805                                         *start = ptoa(hint + align); /* adjust to color */
1806                                         INVARIANTS();
1807                               }
1808                     }
1809           } else {
1810                     KASSERT(powerof2(align));
1811                     uvm_map_align_va(start, align, topdown);
1812                     INVARIANTS();
1813                     /*
1814                      * XXX Should we PMAP_PREFER() here again?
1815                      * eh...i think we're okay
1816                      */
1817           }
1818 
1819           /*
1820            * Find the end of the proposed new region.  Be sure we didn't
1821            * wrap around the address; if so, we lose.  Otherwise, if the
1822            * proposed new region fits before the next entry, we win.
1823            *
1824            * XXX Should this use vm_map_max(map) as the max?
1825            */
1826 
1827           if (length > __type_max(vaddr_t) - *start)
1828                     return (-1);
1829           end = *start + length;
1830 
1831           if (entry->next->start >= end && *start >= entry->end)
1832                     return (1);
1833 
1834           return (0);
1835 
1836 #undef INVARIANTS
1837 }
1838 
1839 static void
uvm_findspace_invariants(struct vm_map * map,vaddr_t orig_hint,vaddr_t length,struct uvm_object * uobj,voff_t uoffset,vsize_t align,int flags,vaddr_t hint,struct vm_map_entry * entry,int line)1840 uvm_findspace_invariants(struct vm_map *map, vaddr_t orig_hint, vaddr_t length,
1841     struct uvm_object *uobj, voff_t uoffset, vsize_t align, int flags,
1842     vaddr_t hint, struct vm_map_entry *entry, int line)
1843 {
1844           const int topdown = map->flags & VM_MAP_TOPDOWN;
1845           const int hint_location_ok =
1846                     topdown ? hint <= orig_hint
1847                               : hint >= orig_hint;
1848 
1849           KASSERTMSG(hint_location_ok,
1850               "%s map=%p hint=%#" PRIxVADDR " %s orig_hint=%#" PRIxVADDR
1851               " length=%#" PRIxVSIZE " uobj=%p uoffset=%#llx align=%" PRIxVSIZE
1852               " flags=%#x entry@%p=[%" PRIxVADDR ",%" PRIxVADDR ")"
1853               " entry->next@%p=[%" PRIxVADDR ",%" PRIxVADDR ")"
1854               " (uvm_map_findspace line %d)",
1855               topdown ? "topdown" : "bottomup",
1856               map, hint, topdown ? ">" : "<", orig_hint,
1857               length, uobj, (unsigned long long)uoffset, align,
1858               flags, entry, entry ? entry->start : 0, entry ? entry->end : 0,
1859               entry ? entry->next : NULL,
1860               entry && entry->next ? entry->next->start : 0,
1861               entry && entry->next ? entry->next->end : 0,
1862               line);
1863 }
1864 
1865 /*
1866  * uvm_map_findspace: find "length" sized space in "map".
1867  *
1868  * => "hint" is a hint about where we want it, unless UVM_FLAG_FIXED is
1869  *        set in "flags" (in which case we insist on using "hint").
1870  * => "result" is VA returned
1871  * => uobj/uoffset are to be used to handle VAC alignment, if required
1872  * => if "align" is non-zero, we attempt to align to that value.
1873  * => caller must at least have read-locked map
1874  * => returns NULL on failure, or pointer to prev. map entry if success
1875  * => note this is a cross between the old vm_map_findspace and vm_map_find
1876  */
1877 
1878 struct vm_map_entry *
uvm_map_findspace(struct vm_map * map,vaddr_t hint,vsize_t length,vaddr_t * result,struct uvm_object * uobj,voff_t uoffset,vsize_t align,int flags)1879 uvm_map_findspace(struct vm_map *map, vaddr_t hint, vsize_t length,
1880     vaddr_t *result /* OUT */, struct uvm_object *uobj, voff_t uoffset,
1881     vsize_t align, int flags)
1882 {
1883 #define   INVARIANTS()                                                                          \
1884           uvm_findspace_invariants(map, orig_hint, length, uobj, uoffset, align,\
1885               flags, hint, entry, __LINE__)
1886           struct vm_map_entry *entry = NULL;
1887           struct vm_map_entry *child, *prev, *tmp;
1888           vaddr_t orig_hint __diagused;
1889           const int topdown = map->flags & VM_MAP_TOPDOWN;
1890           int avail;
1891           UVMHIST_FUNC(__func__);
1892           UVMHIST_CALLARGS(maphist, "(map=%#jx, hint=%#jx, len=%ju, flags=%#jx...",
1893               (uintptr_t)map, hint, length, flags);
1894           UVMHIST_LOG(maphist, " uobj=%#jx, uoffset=%#jx, align=%#jx)",
1895               (uintptr_t)uobj, uoffset, align, 0);
1896 
1897           KASSERT((flags & UVM_FLAG_COLORMATCH) != 0 || powerof2(align));
1898           KASSERT((flags & UVM_FLAG_COLORMATCH) == 0 || align < uvmexp.ncolors);
1899           KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
1900 
1901           uvm_map_check(map, "map_findspace entry");
1902 
1903           /*
1904            * Clamp the hint to the VM map's min/max address, and remember
1905            * the clamped value as the original hint, i.e. the original hint
1906            * clamped to the min/max address.  If we are aligning, then we
1907            * may have to try again with no alignment constraint if we
1908            * fail the first time.
1909            *
1910            * We use the original hint to verify later that the search has
1911            * been monotonic -- that is, nonincreasing or nondecreasing,
1912            * according to topdown or !topdown respectively.  But the
1913            * clamping is not monotonic.
1914            */
1915           if (hint < vm_map_min(map)) { /* check ranges ... */
1916                     if (flags & UVM_FLAG_FIXED) {
1917                               UVMHIST_LOG(maphist,"<- VA below map range",0,0,0,0);
1918                               return (NULL);
1919                     }
1920                     hint = vm_map_min(map);
1921           }
1922           if (hint > vm_map_max(map)) {
1923                     UVMHIST_LOG(maphist,"<- VA %#jx > range [%#jx->%#jx]",
1924                         hint, vm_map_min(map), vm_map_max(map), 0);
1925                     return (NULL);
1926           }
1927           orig_hint = hint;
1928           INVARIANTS();
1929 
1930           UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
1931               hint, vm_map_min(map), vm_map_max(map), 0);
1932 
1933           /*
1934            * hint may not be aligned properly; we need to round it up or
1935            * down before proceeding further.
1936            */
1937           if ((flags & UVM_FLAG_COLORMATCH) == 0) {
1938                     uvm_map_align_va(&hint, align, topdown);
1939                     INVARIANTS();
1940           }
1941 
1942           UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
1943               hint, vm_map_min(map), vm_map_max(map), 0);
1944           /*
1945            * Look for the first possible address; if there's already
1946            * something at this address, we have to start after it.
1947            */
1948 
1949           /*
1950            * @@@: there are four, no, eight cases to consider.
1951            *
1952            * 0: found,     fixed,     bottom up -> fail
1953            * 1: found,     fixed,     top down  -> fail
1954            * 2: found,     not fixed, bottom up -> start after entry->end,
1955            *                                       loop up
1956            * 3: found,     not fixed, top down  -> start before entry->start,
1957            *                                       loop down
1958            * 4: not found, fixed,     bottom up -> check entry->next->start, fail
1959            * 5: not found, fixed,     top down  -> check entry->next->start, fail
1960            * 6: not found, not fixed, bottom up -> check entry->next->start,
1961            *                                       loop up
1962            * 7: not found, not fixed, top down  -> check entry->next->start,
1963            *                                       loop down
1964            *
1965            * as you can see, it reduces to roughly five cases, and that
1966            * adding top down mapping only adds one unique case (without
1967            * it, there would be four cases).
1968            */
1969 
1970           if ((flags & UVM_FLAG_FIXED) == 0 &&
1971               hint == (topdown ? vm_map_max(map) : vm_map_min(map))) {
1972                     /*
1973                      * The uvm_map_findspace algorithm is monotonic -- for
1974                      * topdown VM it starts with a high hint and returns a
1975                      * lower free address; for !topdown VM it starts with a
1976                      * low hint and returns a higher free address.  As an
1977                      * optimization, start with the first (highest for
1978                      * topdown, lowest for !topdown) free address.
1979                      *
1980                      * XXX This `optimization' probably doesn't actually do
1981                      * much in practice unless userland explicitly passes
1982                      * the VM map's minimum or maximum address, which
1983                      * varies from machine to machine (VM_MAX/MIN_ADDRESS,
1984                      * e.g. 0x7fbfdfeff000 on amd64 but 0xfffffffff000 on
1985                      * aarch64) and may vary according to other factors
1986                      * like sysctl vm.user_va0_disable.  In particular, if
1987                      * the user specifies 0 as a hint to mmap, then mmap
1988                      * will choose a default address which is usually _not_
1989                      * VM_MAX/MIN_ADDRESS but something else instead like
1990                      * VM_MAX_ADDRESS - stack size - guard page overhead,
1991                      * in which case this branch is never hit.
1992                      *
1993                      * In fact, this branch appears to have been broken for
1994                      * two decades between when topdown was introduced in
1995                      * ~2003 and when it was adapted to handle the topdown
1996                      * case without violating the monotonicity assertion in
1997                      * 2022.  Maybe Someone^TM should either ditch the
1998                      * optimization or find a better way to do it.
1999                      */
2000                     entry = map->first_free;
2001           } else if (uvm_map_lookup_entry(map, hint, &entry)) {
2002                     KASSERT(entry->start <= hint);
2003                     KASSERT(hint < entry->end);
2004                     /* "hint" address already in use ... */
2005                     if (flags & UVM_FLAG_FIXED) {
2006                               UVMHIST_LOG(maphist, "<- fixed & VA in use",
2007                                   0, 0, 0, 0);
2008                               return (NULL);
2009                     }
2010                     if (topdown)
2011                               /* Start from lower gap. */
2012                               entry = entry->prev;
2013           } else {
2014                     KASSERT(entry == &map->header || entry->end <= hint);
2015                     KASSERT(entry->next == &map->header ||
2016                         hint < entry->next->start);
2017                     if (flags & UVM_FLAG_FIXED) {
2018                               if (entry->next->start >= hint &&
2019                                   length <= entry->next->start - hint)
2020                                         goto found;
2021 
2022                               /* "hint" address is gap but too small */
2023                               UVMHIST_LOG(maphist, "<- fixed mapping failed",
2024                                   0, 0, 0, 0);
2025                               return (NULL); /* only one shot at it ... */
2026                     } else {
2027                               /*
2028                                * See if given hint fits in this gap.
2029                                */
2030                               avail = uvm_map_space_avail(&hint, length,
2031                                   uoffset, align, flags, topdown, entry);
2032                               INVARIANTS();
2033                               switch (avail) {
2034                               case 1:
2035                                         goto found;
2036                               case -1:
2037                                         goto wraparound;
2038                               }
2039 
2040                               if (topdown) {
2041                                         /*
2042                                          * Still there is a chance to fit
2043                                          * if hint > entry->end.
2044                                          */
2045                               } else {
2046                                         /* Start from higher gap. */
2047                                         entry = entry->next;
2048                                         if (entry == &map->header)
2049                                                   goto notfound;
2050                                         goto nextgap;
2051                               }
2052                     }
2053           }
2054 
2055           /*
2056            * Note that every UVM_FLAG_FIXED case is already handled.
2057            */
2058           KDASSERT((flags & UVM_FLAG_FIXED) == 0);
2059 
2060           /* Try to find the space in the red-black tree */
2061 
2062           /* Check slot before any entry */
2063           if (topdown) {
2064                     KASSERTMSG(entry->next->start >= vm_map_min(map),
2065                         "map=%p entry=%p entry->next=%p"
2066                         " entry->next->start=0x%"PRIxVADDR" min=0x%"PRIxVADDR,
2067                         map, entry, entry->next,
2068                         entry->next->start, vm_map_min(map));
2069                     if (length > entry->next->start - vm_map_min(map))
2070                               hint = vm_map_min(map); /* XXX goto wraparound? */
2071                     else
2072                               hint = MIN(orig_hint, entry->next->start - length);
2073                     KASSERT(hint >= vm_map_min(map));
2074           } else {
2075                     hint = entry->end;
2076           }
2077           INVARIANTS();
2078           avail = uvm_map_space_avail(&hint, length, uoffset, align, flags,
2079               topdown, entry);
2080           INVARIANTS();
2081           switch (avail) {
2082           case 1:
2083                     goto found;
2084           case -1:
2085                     goto wraparound;
2086           }
2087 
2088 nextgap:
2089           KDASSERT((flags & UVM_FLAG_FIXED) == 0);
2090           /* If there is not enough space in the whole tree, we fail */
2091           tmp = ROOT_ENTRY(map);
2092           if (tmp == NULL || tmp->maxgap < length)
2093                     goto notfound;
2094 
2095           prev = NULL; /* previous candidate */
2096 
2097           /* Find an entry close to hint that has enough space */
2098           for (; tmp;) {
2099                     KASSERT(tmp->next->start == tmp->end + tmp->gap);
2100                     if (topdown) {
2101                               if (tmp->next->start < hint + length &&
2102                                   (prev == NULL || tmp->end > prev->end)) {
2103                                         if (tmp->gap >= length)
2104                                                   prev = tmp;
2105                                         else if ((child = LEFT_ENTRY(tmp)) != NULL
2106                                             && child->maxgap >= length)
2107                                                   prev = tmp;
2108                               }
2109                     } else {
2110                               if (tmp->end >= hint &&
2111                                   (prev == NULL || tmp->end < prev->end)) {
2112                                         if (tmp->gap >= length)
2113                                                   prev = tmp;
2114                                         else if ((child = RIGHT_ENTRY(tmp)) != NULL
2115                                             && child->maxgap >= length)
2116                                                   prev = tmp;
2117                               }
2118                     }
2119                     if (tmp->next->start < hint + length)
2120                               child = RIGHT_ENTRY(tmp);
2121                     else if (tmp->end > hint)
2122                               child = LEFT_ENTRY(tmp);
2123                     else {
2124                               if (tmp->gap >= length)
2125                                         break;
2126                               if (topdown)
2127                                         child = LEFT_ENTRY(tmp);
2128                               else
2129                                         child = RIGHT_ENTRY(tmp);
2130                     }
2131                     if (child == NULL || child->maxgap < length)
2132                               break;
2133                     tmp = child;
2134           }
2135 
2136           if (tmp != NULL && tmp->start < hint && hint < tmp->next->start) {
2137                     /*
2138                      * Check if the entry that we found satisfies the
2139                      * space requirement
2140                      */
2141                     if (topdown) {
2142                               if (hint > tmp->next->start - length)
2143                                         hint = tmp->next->start - length;
2144                     } else {
2145                               if (hint < tmp->end)
2146                                         hint = tmp->end;
2147                     }
2148                     INVARIANTS();
2149                     avail = uvm_map_space_avail(&hint, length, uoffset, align,
2150                         flags, topdown, tmp);
2151                     INVARIANTS();
2152                     switch (avail) {
2153                     case 1:
2154                               entry = tmp;
2155                               goto found;
2156                     case -1:
2157                               goto wraparound;
2158                     }
2159                     if (tmp->gap >= length)
2160                               goto listsearch;
2161           }
2162           if (prev == NULL)
2163                     goto notfound;
2164 
2165           if (topdown) {
2166                     KASSERT(orig_hint >= prev->next->start - length ||
2167                         prev->next->start - length > prev->next->start);
2168                     hint = prev->next->start - length;
2169           } else {
2170                     KASSERT(orig_hint <= prev->end);
2171                     hint = prev->end;
2172           }
2173           INVARIANTS();
2174           avail = uvm_map_space_avail(&hint, length, uoffset, align,
2175               flags, topdown, prev);
2176           INVARIANTS();
2177           switch (avail) {
2178           case 1:
2179                     entry = prev;
2180                     goto found;
2181           case -1:
2182                     goto wraparound;
2183           }
2184           if (prev->gap >= length)
2185                     goto listsearch;
2186 
2187           if (topdown)
2188                     tmp = LEFT_ENTRY(prev);
2189           else
2190                     tmp = RIGHT_ENTRY(prev);
2191           for (;;) {
2192                     KASSERT(tmp);
2193                     KASSERTMSG(tmp->maxgap >= length,
2194                         "tmp->maxgap=0x%"PRIxVSIZE" length=0x%"PRIxVSIZE,
2195                         tmp->maxgap, length);
2196                     if (topdown)
2197                               child = RIGHT_ENTRY(tmp);
2198                     else
2199                               child = LEFT_ENTRY(tmp);
2200                     if (child && child->maxgap >= length) {
2201                               tmp = child;
2202                               continue;
2203                     }
2204                     if (tmp->gap >= length)
2205                               break;
2206                     if (topdown)
2207                               tmp = LEFT_ENTRY(tmp);
2208                     else
2209                               tmp = RIGHT_ENTRY(tmp);
2210           }
2211 
2212           if (topdown) {
2213                     KASSERT(orig_hint >= tmp->next->start - length ||
2214                         tmp->next->start - length > tmp->next->start);
2215                     hint = tmp->next->start - length;
2216           } else {
2217                     KASSERT(orig_hint <= tmp->end);
2218                     hint = tmp->end;
2219           }
2220           INVARIANTS();
2221           avail = uvm_map_space_avail(&hint, length, uoffset, align,
2222               flags, topdown, tmp);
2223           INVARIANTS();
2224           switch (avail) {
2225           case 1:
2226                     entry = tmp;
2227                     goto found;
2228           case -1:
2229                     goto wraparound;
2230           }
2231 
2232           /*
2233            * The tree fails to find an entry because of offset or alignment
2234            * restrictions.  Search the list instead.
2235            */
2236  listsearch:
2237           /*
2238            * Look through the rest of the map, trying to fit a new region in
2239            * the gap between existing regions, or after the very last region.
2240            * note: entry->end = base VA of current gap,
2241            *         entry->next->start = VA of end of current gap
2242            */
2243 
2244           INVARIANTS();
2245           for (;;) {
2246                     /* Update hint for current gap. */
2247                     hint = topdown ? MIN(orig_hint, entry->next->start - length)
2248                         : entry->end;
2249                     INVARIANTS();
2250 
2251                     /* See if it fits. */
2252                     avail = uvm_map_space_avail(&hint, length, uoffset, align,
2253                         flags, topdown, entry);
2254                     INVARIANTS();
2255                     switch (avail) {
2256                     case 1:
2257                               goto found;
2258                     case -1:
2259                               goto wraparound;
2260                     }
2261 
2262                     /* Advance to next/previous gap */
2263                     if (topdown) {
2264                               if (entry == &map->header) {
2265                                         UVMHIST_LOG(maphist, "<- failed (off start)",
2266                                             0,0,0,0);
2267                                         goto notfound;
2268                               }
2269                               entry = entry->prev;
2270                     } else {
2271                               entry = entry->next;
2272                               if (entry == &map->header) {
2273                                         UVMHIST_LOG(maphist, "<- failed (off end)",
2274                                             0,0,0,0);
2275                                         goto notfound;
2276                               }
2277                     }
2278           }
2279 
2280  found:
2281           SAVE_HINT(map, map->hint, entry);
2282           *result = hint;
2283           UVMHIST_LOG(maphist,"<- got it!  (result=%#jx)", hint, 0,0,0);
2284           INVARIANTS();
2285           KASSERT(entry->end <= hint);
2286           KASSERT(hint <= entry->next->start);
2287           KASSERT(length <= entry->next->start - hint);
2288           return (entry);
2289 
2290  wraparound:
2291           UVMHIST_LOG(maphist, "<- failed (wrap around)", 0,0,0,0);
2292 
2293           return (NULL);
2294 
2295  notfound:
2296           UVMHIST_LOG(maphist, "<- failed (notfound)", 0,0,0,0);
2297 
2298           return (NULL);
2299 #undef INVARIANTS
2300 }
2301 
2302 /*
2303  *   U N M A P   -   m a i n   h e l p e r   f u n c t i o n s
2304  */
2305 
2306 /*
2307  * uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "end")
2308  *
2309  * => caller must check alignment and size
2310  * => map must be locked by caller
2311  * => we return a list of map entries that we've removed from the map
2312  *    in "entry_list"
      * => the returned list is linked through "next" only ("prev" is cleared)
      *    and each removed entry is pushed at its head, so the list is in
      *    reverse order of removal; it is NULL if nothing was removed
      * => wired entries are unwired and mappings are torn down here, but
      *    references to amaps/objects are left for the caller to drop
      *    [c.f. uvm_unmap_detach]
      * => flags: with UVM_FLAG_VAONLY no pmap/page teardown is done;
      *    UVM_FLAG_NOWAIT is only consulted by the UVMDEBUG checks
      * => if VM_MAP_WANTVA is set on the map, it is cleared and map->cv is
      *    broadcast before returning
2313  */
2314 
2315 void
uvm_unmap_remove(struct vm_map * map,vaddr_t start,vaddr_t end,struct vm_map_entry ** entry_list,int flags)2316 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
2317     struct vm_map_entry **entry_list /* OUT */, int flags)
2318 {
2319           struct vm_map_entry *entry, *first_entry, *next;
2320           vaddr_t len;
2321           UVMHIST_FUNC(__func__);
2322           UVMHIST_CALLARGS(maphist,"(map=%#jx, start=%#jx, end=%#jx)",
2323               (uintptr_t)map, start, end, 0);
2324           VM_MAP_RANGE_CHECK(map, start, end);
2325 
2326           KASSERT(vm_map_locked_p(map));
2327 
2328           uvm_map_check(map, "unmap_remove entry");
2329 
2330           /*
2331            * find first entry
2332            */
2333 
2334           if (uvm_map_lookup_entry(map, start, &first_entry) == true) {
2335                     /* clip and go... */
2336                     entry = first_entry;
2337                     UVM_MAP_CLIP_START(map, entry, start);
2338                     /* critical!  prevents stale hint */
2339                     SAVE_HINT(map, entry, entry->prev);
2340           } else {
                         /*
                          * lookup failed: begin with the entry that follows
                          * the one the lookup handed back.
                          */
2341                     entry = first_entry->next;
2342           }
2343 
2344           /*
2345            * save the free space hint
                * (if first_free is not the header and starts at or after "start",
                * move it back to the entry just before the first candidate for
                * removal)
2346            */
2347 
2348           if (map->first_free != &map->header && map->first_free->start >= start)
2349                     map->first_free = entry->prev;
2350 
2351           /*
2352            * note: we now re-use first_entry for a different task.  we remove
2353            * a number of map entries from the map and save them in a linked
2354            * list headed by "first_entry".  once we remove them from the map
2355            * the caller should unlock the map and drop the references to the
2356            * backing objects [c.f. uvm_unmap_detach].  the object is to
2357            * separate unmapping from reference dropping.  why?
2358            *   [1] the map has to be locked for unmapping
2359            *   [2] the map need not be locked for reference dropping
2360            *   [3] dropping references may trigger pager I/O, and if we hit
2361            *       a pager that does synchronous I/O we may have to wait for it.
2362            *   [4] we would like all waiting for I/O to occur with maps unlocked
2363            *       so that we don't block other threads.
2364            */
2365 
2366           first_entry = NULL;
2367           *entry_list = NULL;
2368 
2369           /*
2370            * break up the area into map entry sized regions and unmap.  note
2371            * that all mappings have to be removed before we can even consider
2372            * dropping references to amaps or VM objects (otherwise we could end
2373            * up with a mapping to a page on the free list which would be very bad)
2374            */
2375 
2376           while ((entry != &map->header) && (entry->start < end)) {
2377                     KASSERT((entry->flags & UVM_MAP_STATIC) == 0);
2378 
2379                     UVM_MAP_CLIP_END(map, entry, end);
                         /*
                          * remember the successor and the size now: entry->next
                          * is overwritten below when the entry is moved onto our
                          * private list, and len is needed after the unlink.
                          */
2380                     next = entry->next;
2381                     len = entry->end - entry->start;
2382 
2383                     /*
2384                      * unwire before removing addresses from the pmap; otherwise
2385                      * unwiring will put the entries back into the pmap (XXX).
2386                      */
2387 
2388                     if (VM_MAPENT_ISWIRED(entry)) {
2389                               uvm_map_entry_unwire(map, entry);
2390                     }
                         /* with UVM_FLAG_VAONLY no pmap or page work is done */
2391                     if (flags & UVM_FLAG_VAONLY) {
2392 
2393                               /* nothing */
2394 
2395                     } else if ((map->flags & VM_MAP_PAGEABLE) == 0) {
2396 
2397                               /*
2398                                * if the map is non-pageable, any pages mapped there
2399                                * must be wired and entered with pmap_kenter_pa(),
2400                                * and we should free any such pages immediately.
2401                                * this is mostly used for kmem_map.
2402                                */
2403                               KASSERT(vm_map_pmap(map) == pmap_kernel());
2404 
2405                               uvm_km_pgremove_intrsafe(map, entry->start, entry->end);
2406                     } else if (UVM_ET_ISOBJ(entry) &&
2407                                  UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
2408                               panic("%s: kernel object %p %p\n",
2409                                   __func__, map, entry);
2410                     } else if (UVM_ET_ISOBJ(entry) || entry->aref.ar_amap) {
2411                               /*
2412                                * remove mappings the standard way.  lock object
2413                                * and/or amap to ensure vm_page state does not
2414                                * change while in pmap_remove().
2415                                */
2416 
2417 #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
2418                               uvm_map_lock_entry(entry, RW_WRITER);
2419 #else
2420                               uvm_map_lock_entry(entry, RW_READER);
2421 #endif
2422                               pmap_remove(map->pmap, entry->start, entry->end);
2423 
2424                               /*
2425                                * note: if map is dying, leave pmap_update() for
2426                                * later.  if the map is to be reused (exec) then
2427                                * pmap_update() will be called.  if the map is
2428                                * being disposed of (exit) then pmap_destroy()
2429                                * will be called.
2430                                */
2431 
2432                               if ((map->flags & VM_MAP_DYING) == 0) {
2433                                         pmap_update(vm_map_pmap(map));
2434                               } else {
2435                                         KASSERT(vm_map_pmap(map) != pmap_kernel());
2436                               }
2437 
2438                               uvm_map_unlock_entry(entry);
2439                     }
2440 
2441 #if defined(UVMDEBUG)
2442                     /*
2443                      * check if there's remaining mapping,
2444                      * which is a bug in caller.
2445                      */
2446 
2447                     vaddr_t va;
2448                     for (va = entry->start; va < entry->end;
2449                         va += PAGE_SIZE) {
2450                               if (pmap_extract(vm_map_pmap(map), va, NULL)) {
2451                                         panic("%s: %#"PRIxVADDR" has mapping",
2452                                             __func__, va);
2453                               }
2454                     }
2455 
                         /* the emptiness check is skipped for UVM_FLAG_NOWAIT callers */
2456                     if (VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
2457                               uvm_km_check_empty(map, entry->start, entry->end);
2458                     }
2459 #endif /* defined(UVMDEBUG) */
2460 
2461                     /*
2462                      * remove entry from map and put it on our list of entries
2463                      * that we've nuked.  then go to next entry.
2464                      */
2465 
2466                     UVMHIST_LOG(maphist, "  removed map entry %#jx",
2467                         (uintptr_t)entry, 0, 0, 0);
2468 
2469                     /* critical!  prevents stale hint */
2470                     SAVE_HINT(map, entry, entry->prev);
2471 
2472                     uvm_map_entry_unlink(map, entry);
2473                     KASSERT(map->size >= len);
2474                     map->size -= len;
                         /* push onto the head of the private list; prev is not used there */
2475                     entry->prev = NULL;
2476                     entry->next = first_entry;
2477                     first_entry = entry;
2478                     entry = next;
2479           }
2480 
2481           uvm_map_check(map, "unmap_remove leave");
2482 
2483           /*
2484            * now we've cleaned up the map and are ready for the caller to drop
2485            * references to the mapped objects.
2486            */
2487 
2488           *entry_list = first_entry;
2489           UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
2490 
               /*
                * if VM_MAP_WANTVA is set, clear it under misc_lock and wake
                * every thread sleeping on map->cv.
                */
2491           if (map->flags & VM_MAP_WANTVA) {
2492                     mutex_enter(&map->misc_lock);
2493                     map->flags &= ~VM_MAP_WANTVA;
2494                     cv_broadcast(&map->cv);
2495                     mutex_exit(&map->misc_lock);
2496           }
2497 }
2498 
2499 /*
2500  * uvm_unmap_detach: drop references in a chain of map entries
2501  *
2502  * => we will free the map entries as we traverse the list.
2503  */
2504 
2505 void
uvm_unmap_detach(struct vm_map_entry * first_entry,int flags)2506 uvm_unmap_detach(struct vm_map_entry *first_entry, int flags)
2507 {
2508           struct vm_map_entry *next_entry;
2509           UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
2510 
2511           while (first_entry) {
2512                     KASSERT(!VM_MAPENT_ISWIRED(first_entry));
2513                     UVMHIST_LOG(maphist,
2514                         "  detach %#jx: amap=%#jx, obj=%#jx, submap?=%jd",
2515                         (uintptr_t)first_entry,
2516                         (uintptr_t)first_entry->aref.ar_amap,
2517                         (uintptr_t)first_entry->object.uvm_obj,
2518                         UVM_ET_ISSUBMAP(first_entry));
2519 
2520                     /*
2521                      * drop reference to amap, if we've got one
2522                      */
2523 
2524                     if (first_entry->aref.ar_amap)
2525                               uvm_map_unreference_amap(first_entry, flags);
2526 
2527                     /*
2528                      * drop reference to our backing object, if we've got one
2529                      */
2530 
2531                     KASSERT(!UVM_ET_ISSUBMAP(first_entry));
2532                     if (UVM_ET_ISOBJ(first_entry) &&
2533                         first_entry->object.uvm_obj->pgops->pgo_detach) {
2534                               (*first_entry->object.uvm_obj->pgops->pgo_detach)
2535                                         (first_entry->object.uvm_obj);
2536                     }
2537                     next_entry = first_entry->next;
2538                     uvm_mapent_free(first_entry);
2539                     first_entry = next_entry;
2540           }
2541           UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
2542 }
2543 
2544 /*
2545  *   E X T R A C T I O N   F U N C T I O N S
2546  */
2547 
2548 /*
2549  * uvm_map_reserve: reserve space in a vm_map for future use.
2550  *
2551  * => we reserve space in a map by putting a dummy map entry in the
2552  *    map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE)
2553  * => map should be unlocked (we will write lock it)
2554  * => we return true if we were able to reserve space
2555  * => XXXCDC: should be inline?
2556  */
2557 
2558 int
uvm_map_reserve(struct vm_map * map,vsize_t size,vaddr_t offset,vsize_t align,vaddr_t * raddr,uvm_flag_t flags)2559 uvm_map_reserve(struct vm_map *map, vsize_t size,
2560     vaddr_t offset  /* hint for pmap_prefer */,
2561     vsize_t align   /* alignment */,
2562     vaddr_t *raddr  /* IN:hint, OUT: reserved VA */,
2563     uvm_flag_t flags          /* UVM_FLAG_FIXED or UVM_FLAG_COLORMATCH or 0 */)
2564 {
2565           UVMHIST_FUNC(__func__);
2566           UVMHIST_CALLARGS(maphist, "(map=%#jx, size=%#jx, offset=%#jx, addr=%#jx)",
2567               (uintptr_t)map, size, offset, (uintptr_t)raddr);
2568 
2569           size = round_page(size);
2570 
2571           /*
2572            * reserve some virtual space.
2573            */
2574 
2575           if (uvm_map(map, raddr, size, NULL, offset, align,
2576               UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
2577               UVM_ADV_RANDOM, UVM_FLAG_NOMERGE|flags)) != 0) {
2578               UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0);
2579                     return (false);
2580           }
2581 
2582           UVMHIST_LOG(maphist, "<- done (*raddr=%#jx)", *raddr,0,0,0);
2583           return (true);
2584 }
2585 
2586 /*
2587  * uvm_map_replace: replace a reserved (blank) area of memory with
2588  * real mappings.
2589  *
2590  * => caller must WRITE-LOCK the map
2591  * => we return true if replacement was a success
2592  * => we expect the newents chain to have nnewents entries on it and
2593  *    we expect newents->prev to point to the last entry on the list
2594  * => note newents is allowed to be NULL
      * => nsize must equal the summed size of the newents chain (verified
      *    only under DIAGNOSTIC); map->size shrinks by (end - start) - nsize
      * => on success the old blank entry is handed back in *oldentryp and
      *    is not freed here; on failure *oldentryp is left untouched
      * => if the blank entry extends past "end" it is passed to
      *    UVM_MAP_CLIP_END before the validity check; nothing here undoes
      *    that if the check then fails
2595  */
2596 
2597 static int
uvm_map_replace(struct vm_map * map,vaddr_t start,vaddr_t end,struct vm_map_entry * newents,int nnewents,vsize_t nsize,struct vm_map_entry ** oldentryp)2598 uvm_map_replace(struct vm_map *map, vaddr_t start, vaddr_t end,
2599     struct vm_map_entry *newents, int nnewents, vsize_t nsize,
2600     struct vm_map_entry **oldentryp)
2601 {
2602           struct vm_map_entry *oldent, *last;
2603 
2604           uvm_map_check(map, "map_replace entry");
2605 
2606           /*
2607            * first find the blank map entry at the specified address
2608            */
2609 
2610           if (!uvm_map_lookup_entry(map, start, &oldent)) {
2611                     return (false);
2612           }
2613 
2614           /*
2615            * check to make sure we have a proper blank entry
                * (it must span exactly start..end and have neither an object
                * nor an amap attached)
2616            */
2617 
2618           if (end < oldent->end) {
2619                     UVM_MAP_CLIP_END(map, oldent, end);
2620           }
2621           if (oldent->start != start || oldent->end != end ||
2622               oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) {
2623                     return (false);
2624           }
2625 
2626 #ifdef DIAGNOSTIC
2627 
2628           /*
2629            * sanity check the newents chain
                *
                * each entry must start at or after the end of the previous one
                * (and at or after "start"), be non-empty, end at or before
                * "end", and have consistent next/prev links; newents->prev must
                * be the tail.  the entry count and total size must match
                * nnewents and nsize.
2630            */
2631 
2632           {
2633                     struct vm_map_entry *tmpent = newents;
2634                     int nent = 0;
2635                     vsize_t sz = 0;
2636                     vaddr_t cur = start;
2637 
2638                     while (tmpent) {
2639                               nent++;
2640                               sz += tmpent->end - tmpent->start;
2641                               if (tmpent->start < cur)
2642                                         panic("uvm_map_replace1");
2643                               if (tmpent->start >= tmpent->end || tmpent->end > end) {
2644                                         panic("uvm_map_replace2: "
2645                                             "tmpent->start=%#"PRIxVADDR
2646                                             ", tmpent->end=%#"PRIxVADDR
2647                                             ", end=%#"PRIxVADDR,
2648                                             tmpent->start, tmpent->end, end);
2649                               }
2650                               cur = tmpent->end;
2651                               if (tmpent->next) {
2652                                         if (tmpent->next->prev != tmpent)
2653                                                   panic("uvm_map_replace3");
2654                               } else {
2655                                         if (newents->prev != tmpent)
2656                                                   panic("uvm_map_replace4");
2657                               }
2658                               tmpent = tmpent->next;
2659                     }
2660                     if (nent != nnewents)
2661                               panic("uvm_map_replace5");
2662                     if (sz != nsize)
2663                               panic("uvm_map_replace6");
2664           }
2665 #endif
2666 
2667           /*
2668            * map entry is a valid blank!   replace it.   (this does all the
2669            * work of map entry link/unlink...).
2670            */
2671 
2672           if (newents) {
                         /* tail of the chain, per the newents->prev convention */
2673                     last = newents->prev;
2674 
2675                     /* critical: flush stale hints out of map */
2676                     SAVE_HINT(map, map->hint, newents);
                         /* first_free must not keep pointing at the entry we drop */
2677                     if (map->first_free == oldent)
2678                               map->first_free = last;
2679 
                         /* hook the tail up to oldent's successor */
2680                     last->next = oldent->next;
2681                     last->next->prev = last;
2682 
2683                     /* Fix RB tree */
2684                     uvm_rb_remove(map, oldent);
2685 
                         /* hook the head up to oldent's predecessor */
2686                     newents->prev = oldent->prev;
2687                     newents->prev->next = newents;
                         /* one blank entry leaves the list, nnewents entries join it */
2688                     map->nentries = map->nentries + (nnewents - 1);
2689 
2690                     /* Fixup the RB tree */
2691                     {
2692                               int i;
2693                               struct vm_map_entry *tmp;
2694 
                                   /*
                                    * the chain's tail now links on into the
                                    * rest of the map list, so the walk is
                                    * stopped by the nnewents count rather
                                    * than by a NULL next pointer.
                                    */
2695                               tmp = newents;
2696                               for (i = 0; i < nnewents && tmp; i++) {
2697                                         uvm_rb_insert(map, tmp);
2698                                         tmp = tmp->next;
2699                               }
2700                     }
2701           } else {
2702                     /* NULL list of new entries: just remove the old one */
2703                     clear_hints(map, oldent);
2704                     uvm_map_entry_unlink(map, oldent);
2705           }
               /* the blank covered end - start bytes; the new entries cover nsize */
2706           map->size -= end - start - nsize;
2707 
2708           uvm_map_check(map, "map_replace leave");
2709 
2710           /*
2711            * hand the old blank entry back to the caller and return.
                * (it is not freed here.)
2712            */
2713 
2714           *oldentryp = oldent;
2715           return (true);
2716 }
2717 
2718 /*
2719  * uvm_map_extract: extract a mapping from a map and put it somewhere
2720  *        (maybe removing the old mapping)
2721  *
2722  * => maps should be unlocked (we will write lock them)
2723  * => returns 0 on success, error code otherwise
2724  * => start must be page aligned
2725  * => len must be page sized
2726  * => flags:
2727  *      UVM_EXTRACT_REMOVE: remove mappings from srcmap
2728  *      UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only)
2729  *      UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs
2730  *      UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
2731  *      UVM_EXTRACT_PROT_ALL: set prot to UVM_PROT_ALL as we go
2732  *    >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<<
2733  *    >>>NOTE: QREF's must be unmapped via the QREF path, thus should only
2734  *             be used from within the kernel in a kernel level map <<<
2735  */
2736 
2737 int
uvm_map_extract(struct vm_map * srcmap,vaddr_t start,vsize_t len,struct vm_map * dstmap,vaddr_t * dstaddrp,int flags)2738 uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
2739     struct vm_map *dstmap, vaddr_t *dstaddrp, int flags)
2740 {
2741           vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge;
2742           struct vm_map_entry *chain, *endchain, *entry, *orig_entry, *newentry,
2743               *deadentry, *oldentry;
2744           struct vm_map_entry *resentry = NULL; /* a dummy reservation entry */
2745           vsize_t elen __unused;
2746           int nchain, error, copy_ok;
2747           vsize_t nsize;
2748           UVMHIST_FUNC(__func__);
2749           UVMHIST_CALLARGS(maphist,"(srcmap=%#jx,start=%#jx, len=%#jx",
2750               (uintptr_t)srcmap, start, len, 0);
2751           UVMHIST_LOG(maphist," ...,dstmap=%#jx, flags=%#jx)",
2752               (uintptr_t)dstmap, flags, 0, 0);
2753 
2754           /*
2755            * step 0: sanity check: start must be on a page boundary, length
2756            * must be page sized.  can't ask for CONTIG/QREF if you asked for
2757            * REMOVE.
2758            */
2759 
2760           KASSERTMSG((start & PAGE_MASK) == 0, "start=0x%"PRIxVADDR, start);
2761           KASSERTMSG((len & PAGE_MASK) == 0, "len=0x%"PRIxVADDR, len);
2762           KASSERT((flags & UVM_EXTRACT_REMOVE) == 0 ||
2763                     (flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) == 0);
2764 
2765           /*
2766            * step 1: reserve space in the target map for the extracted area
2767            */
2768 
2769           if ((flags & UVM_EXTRACT_RESERVED) == 0) {
2770                     dstaddr = vm_map_min(dstmap);
2771                     if (!uvm_map_reserve(dstmap, len, start,
2772                         atop(start) & uvmexp.colormask, &dstaddr,
2773                         UVM_FLAG_COLORMATCH))
2774                               return (ENOMEM);
2775                     KASSERT((atop(start ^ dstaddr) & uvmexp.colormask) == 0);
2776                     *dstaddrp = dstaddr;          /* pass address back to caller */
2777                     UVMHIST_LOG(maphist, "  dstaddr=%#jx", dstaddr,0,0,0);
2778           } else {
2779                     dstaddr = *dstaddrp;
2780           }
2781 
2782           /*
2783            * step 2: setup for the extraction process loop by init'ing the
2784            * map entry chain, locking src map, and looking up the first useful
2785            * entry in the map.
2786            */
2787 
2788           end = start + len;
2789           newend = dstaddr + len;
2790           chain = endchain = NULL;
2791           nchain = 0;
2792           nsize = 0;
2793           vm_map_lock(srcmap);
2794 
2795           if (uvm_map_lookup_entry(srcmap, start, &entry)) {
2796 
2797                     /* "start" is within an entry */
2798                     if (flags & UVM_EXTRACT_QREF) {
2799 
2800                               /*
2801                                * for quick references we don't clip the entry, so
2802                                * the entry may map space "before" the starting
2803                                * virtual address... this is the "fudge" factor
2804                                * (which can be non-zero only the first time
2805                                * through the "while" loop in step 3).
2806                                */
2807 
2808                               fudge = start - entry->start;
2809                     } else {
2810 
2811                               /*
2812                                * normal reference: we clip the map to fit (thus
2813                                * fudge is zero)
2814                                */
2815 
2816                               UVM_MAP_CLIP_START(srcmap, entry, start);
2817                               SAVE_HINT(srcmap, srcmap->hint, entry->prev);
2818                               fudge = 0;
2819                     }
2820           } else {
2821 
2822                     /* "start" is not within an entry ... skip to next entry */
2823                     if (flags & UVM_EXTRACT_CONTIG) {
2824                               error = EINVAL;
2825                               goto bad;    /* definite hole here ... */
2826                     }
2827 
2828                     entry = entry->next;
2829                     fudge = 0;
2830           }
2831 
2832           /* save values from srcmap for step 6 */
2833           orig_entry = entry;
2834           orig_fudge = fudge;
2835 
2836           /*
2837            * step 3: now start looping through the map entries, extracting
2838            * as we go.
2839            */
2840 
2841           while (entry->start < end && entry != &srcmap->header) {
2842 
2843                     /* if we are not doing a quick reference, clip it */
2844                     if ((flags & UVM_EXTRACT_QREF) == 0)
2845                               UVM_MAP_CLIP_END(srcmap, entry, end);
2846 
2847                     /* clear needs_copy (allow chunking) */
2848                     if (UVM_ET_ISNEEDSCOPY(entry)) {
2849                               amap_copy(srcmap, entry,
2850                                   AMAP_COPY_NOWAIT|AMAP_COPY_NOMERGE, start, end);
2851                               if (UVM_ET_ISNEEDSCOPY(entry)) {  /* failed? */
2852                                         error = ENOMEM;
2853                                         goto bad;
2854                               }
2855 
2856                               /* amap_copy could clip (during chunk)!  update fudge */
2857                               if (fudge) {
2858                                         fudge = start - entry->start;
2859                                         orig_fudge = fudge;
2860                               }
2861                     }
2862 
2863                     /* calculate the offset of this from "start" */
2864                     oldoffset = (entry->start + fudge) - start;
2865 
2866                     /* allocate a new map entry */
2867                     newentry = uvm_mapent_alloc(dstmap, 0);
2868                     if (newentry == NULL) {
2869                               error = ENOMEM;
2870                               goto bad;
2871                     }
2872 
2873                     /* set up new map entry */
2874                     newentry->next = NULL;
2875                     newentry->prev = endchain;
2876                     newentry->start = dstaddr + oldoffset;
2877                     newentry->end =
2878                         newentry->start + (entry->end - (entry->start + fudge));
2879                     if (newentry->end > newend || newentry->end < newentry->start)
2880                               newentry->end = newend;
2881                     newentry->object.uvm_obj = entry->object.uvm_obj;
2882                     if (newentry->object.uvm_obj) {
2883                               if (newentry->object.uvm_obj->pgops->pgo_reference)
2884                                         newentry->object.uvm_obj->pgops->
2885                                             pgo_reference(newentry->object.uvm_obj);
2886                               newentry->offset = entry->offset + fudge;
2887                     } else {
2888                               newentry->offset = 0;
2889                     }
2890                     newentry->etype = entry->etype;
2891                     if (flags & UVM_EXTRACT_PROT_ALL) {
2892                               newentry->protection = newentry->max_protection =
2893                                   UVM_PROT_ALL;
2894                     } else {
2895                               newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ?
2896                                   entry->max_protection : entry->protection;
2897                               newentry->max_protection = entry->max_protection;
2898                     }
2899                     newentry->inheritance = entry->inheritance;
2900                     newentry->wired_count = 0;
2901                     newentry->aref.ar_amap = entry->aref.ar_amap;
2902                     if (newentry->aref.ar_amap) {
2903                               newentry->aref.ar_pageoff =
2904                                   entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT);
2905                               uvm_map_reference_amap(newentry, AMAP_SHARED |
2906                                   ((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0));
2907                     } else {
2908                               newentry->aref.ar_pageoff = 0;
2909                     }
2910                     newentry->advice = entry->advice;
2911                     if ((flags & UVM_EXTRACT_QREF) != 0) {
2912                               newentry->flags |= UVM_MAP_NOMERGE;
2913                     }
2914 
2915                     /* now link it on the chain */
2916                     nchain++;
2917                     nsize += newentry->end - newentry->start;
2918                     if (endchain == NULL) {
2919                               chain = endchain = newentry;
2920                     } else {
2921                               endchain->next = newentry;
2922                               endchain = newentry;
2923                     }
2924 
2925                     /* end of 'while' loop! */
2926                     if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end &&
2927                         (entry->next == &srcmap->header ||
2928                         entry->next->start != entry->end)) {
2929                               error = EINVAL;
2930                               goto bad;
2931                     }
2932                     entry = entry->next;
2933                     fudge = 0;
2934           }
2935 
2936           /*
2937            * step 4: close off chain (in format expected by uvm_map_replace)
2938            */
2939 
2940           if (chain)
2941                     chain->prev = endchain;
2942 
2943           /*
2944            * step 5: attempt to lock the dest map so we can pmap_copy.
2945            * note usage of copy_ok:
2946            *   1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5)
2947            *   0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7
2948            */
2949 
2950           if (srcmap == dstmap || vm_map_lock_try(dstmap) == true) {
2951                     copy_ok = 1;
2952                     if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
2953                         nchain, nsize, &resentry)) {
2954                               if (srcmap != dstmap)
2955                                         vm_map_unlock(dstmap);
2956                               error = EIO;
2957                               goto bad;
2958                     }
2959           } else {
2960                     copy_ok = 0;
2961                     /* replace deferred until step 7 */
2962           }
2963 
2964           /*
2965            * step 6: traverse the srcmap a second time to do the following:
2966            *  - if we got a lock on the dstmap do pmap_copy
2967            *  - if UVM_EXTRACT_REMOVE remove the entries
2968            * we make use of orig_entry and orig_fudge (saved in step 2)
2969            */
2970 
2971           if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) {
2972 
2973                     /* purge possible stale hints from srcmap */
2974                     if (flags & UVM_EXTRACT_REMOVE) {
2975                               SAVE_HINT(srcmap, srcmap->hint, orig_entry->prev);
2976                               if (srcmap->first_free != &srcmap->header &&
2977                                   srcmap->first_free->start >= start)
2978                                         srcmap->first_free = orig_entry->prev;
2979                     }
2980 
2981                     entry = orig_entry;
2982                     fudge = orig_fudge;
2983                     deadentry = NULL;   /* for UVM_EXTRACT_REMOVE */
2984 
2985                     while (entry->start < end && entry != &srcmap->header) {
2986                               if (copy_ok) {
2987                                         oldoffset = (entry->start + fudge) - start;
2988                                         elen = MIN(end, entry->end) -
2989                                             (entry->start + fudge);
2990                                         pmap_copy(dstmap->pmap, srcmap->pmap,
2991                                             dstaddr + oldoffset, elen,
2992                                             entry->start + fudge);
2993                               }
2994 
2995                               /* we advance "entry" in the following if statement */
2996                               if (flags & UVM_EXTRACT_REMOVE) {
2997 #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
2998                                         uvm_map_lock_entry(entry, RW_WRITER);
2999 #else
3000                                         uvm_map_lock_entry(entry, RW_READER);
3001 #endif
3002                                         pmap_remove(srcmap->pmap, entry->start,
3003                                                             entry->end);
3004                                         uvm_map_unlock_entry(entry);
3005                                         oldentry = entry;   /* save entry */
3006                                         entry = entry->next;          /* advance */
3007                                         uvm_map_entry_unlink(srcmap, oldentry);
3008                                                                       /* add to dead list */
3009                                         oldentry->next = deadentry;
3010                                         deadentry = oldentry;
3011                               } else {
3012                                         entry = entry->next;                    /* advance */
3013                               }
3014 
3015                               /* end of 'while' loop */
3016                               fudge = 0;
3017                     }
3018                     pmap_update(srcmap->pmap);
3019 
3020                     /*
3021                      * unlock dstmap.  we will dispose of deadentry in
3022                      * step 7 if needed
3023                      */
3024 
3025                     if (copy_ok && srcmap != dstmap)
3026                               vm_map_unlock(dstmap);
3027 
3028           } else {
3029                     deadentry = NULL;
3030           }
3031 
3032           /*
3033            * step 7: we are done with the source map, unlock.   if copy_ok
3034            * is 0 then we have not replaced the dummy mapping in dstmap yet
3035            * and we need to do so now.
3036            */
3037 
3038           vm_map_unlock(srcmap);
3039           if ((flags & UVM_EXTRACT_REMOVE) && deadentry)
3040                     uvm_unmap_detach(deadentry, 0);   /* dispose of old entries */
3041 
3042           /* now do the replacement if we didn't do it in step 5 */
3043           if (copy_ok == 0) {
3044                     vm_map_lock(dstmap);
3045                     error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
3046                         nchain, nsize, &resentry);
3047                     vm_map_unlock(dstmap);
3048 
3049                     if (error == false) {
3050                               error = EIO;
3051                               goto bad2;
3052                     }
3053           }
3054 
3055           if (resentry != NULL)
3056                     uvm_mapent_free(resentry);
3057 
3058           return (0);
3059 
3060           /*
3061            * bad: failure recovery
3062            */
3063 bad:
3064           vm_map_unlock(srcmap);
3065 bad2:                         /* src already unlocked */
3066           if (chain)
3067                     uvm_unmap_detach(chain,
3068                         (flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0);
3069 
3070           if (resentry != NULL)
3071                     uvm_mapent_free(resentry);
3072 
3073           if ((flags & UVM_EXTRACT_RESERVED) == 0) {
3074                     uvm_unmap(dstmap, dstaddr, dstaddr+len);   /* ??? */
3075           }
3076           return (error);
3077 }
3078 
3079 /* end of extraction functions */
3080 
3081 /*
3082  * uvm_map_submap: punch down part of a map into a submap
3083  *
3084  * => only the kernel_map is allowed to be submapped
3085  * => the purpose of submapping is to break up the locking granularity
3086  *        of a larger map
3087  * => the range specified must have been mapped previously with a uvm_map()
3088  *        call [with uobj==NULL] to create a blank map entry in the main map.
3089  *        [And it had better still be blank!]
3090  * => maps which contain submaps should never be copied or forked.
3091  * => to remove a submap, use uvm_unmap() on the main map
3092  *        and then uvm_map_deallocate() the submap.
3093  * => main map must be unlocked.
3094  * => submap must have been init'd and have a zero reference count.
3095  *        [need not be locked as we don't actually reference it]
3096  */
3097 
3098 int
uvm_map_submap(struct vm_map * map,vaddr_t start,vaddr_t end,struct vm_map * submap)3099 uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
3100     struct vm_map *submap)
3101 {
3102           struct vm_map_entry *entry;
3103           int error;
3104 
3105           vm_map_lock(map);
3106           VM_MAP_RANGE_CHECK(map, start, end);
3107 
3108           if (uvm_map_lookup_entry(map, start, &entry)) {
3109                     UVM_MAP_CLIP_START(map, entry, start);
3110                     UVM_MAP_CLIP_END(map, entry, end);      /* to be safe */
3111           } else {
3112                     entry = NULL;
3113           }
3114 
3115           if (entry != NULL &&
3116               entry->start == start && entry->end == end &&
3117               entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
3118               !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
3119                     entry->etype |= UVM_ET_SUBMAP;
3120                     entry->object.sub_map = submap;
3121                     entry->offset = 0;
3122                     uvm_map_reference(submap);
3123                     error = 0;
3124           } else {
3125                     error = EINVAL;
3126           }
3127           vm_map_unlock(map);
3128 
3129           return error;
3130 }
3131 
3132 /*
3133  * uvm_map_protect_user: change map protection on behalf of the user.
3134  * Enforces PAX settings as necessary.
3135  */
3136 int
uvm_map_protect_user(struct lwp * l,vaddr_t start,vaddr_t end,vm_prot_t new_prot)3137 uvm_map_protect_user(struct lwp *l, vaddr_t start, vaddr_t end,
3138     vm_prot_t new_prot)
3139 {
3140           int error;
3141 
3142           if ((error = PAX_MPROTECT_VALIDATE(l, new_prot)))
3143                     return error;
3144 
3145           return uvm_map_protect(&l->l_proc->p_vmspace->vm_map, start, end,
3146               new_prot, false);
3147 }
3148 
3149 
3150 /*
3151  * uvm_map_protect: change map protection
3152  *
3153  * => set_max means set max_protection.
3154  * => map must be unlocked.
3155  */
3156 
3157 #define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
3158                                ~VM_PROT_WRITE : VM_PROT_ALL)
3159 
3160 int
uvm_map_protect(struct vm_map * map,vaddr_t start,vaddr_t end,vm_prot_t new_prot,bool set_max)3161 uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
3162     vm_prot_t new_prot, bool set_max)
3163 {
3164           struct vm_map_entry *current, *entry;
3165           int error = 0;
3166           UVMHIST_FUNC(__func__);
3167           UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_prot=%#jx)",
3168               (uintptr_t)map, start, end, new_prot);
3169 
3170           vm_map_lock(map);
3171           VM_MAP_RANGE_CHECK(map, start, end);
3172           if (uvm_map_lookup_entry(map, start, &entry)) {
3173                     UVM_MAP_CLIP_START(map, entry, start);
3174           } else {
3175                     entry = entry->next;
3176           }
3177 
3178           /*
3179            * make a first pass to check for protection violations.
3180            */
3181 
3182           current = entry;
3183           while ((current != &map->header) && (current->start < end)) {
3184                     if (UVM_ET_ISSUBMAP(current)) {
3185                               error = EINVAL;
3186                               goto out;
3187                     }
3188                     if ((new_prot & current->max_protection) != new_prot) {
3189                               error = EACCES;
3190                               goto out;
3191                     }
3192                     /*
3193                      * Don't allow VM_PROT_EXECUTE to be set on entries that
3194                      * point to vnodes that are associated with a NOEXEC file
3195                      * system.
3196                      */
3197                     if (UVM_ET_ISOBJ(current) &&
3198                         UVM_OBJ_IS_VNODE(current->object.uvm_obj)) {
3199                               struct vnode *vp =
3200                                   (struct vnode *) current->object.uvm_obj;
3201 
3202                               if ((new_prot & VM_PROT_EXECUTE) != 0 &&
3203                                   (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
3204                                         error = EACCES;
3205                                         goto out;
3206                               }
3207                     }
3208 
3209                     current = current->next;
3210           }
3211 
3212           /* go back and fix up protections (no need to clip this time). */
3213 
3214           current = entry;
3215           while ((current != &map->header) && (current->start < end)) {
3216                     vm_prot_t old_prot;
3217 
3218                     UVM_MAP_CLIP_END(map, current, end);
3219                     old_prot = current->protection;
3220                     if (set_max)
3221                               current->protection =
3222                                   (current->max_protection = new_prot) & old_prot;
3223                     else
3224                               current->protection = new_prot;
3225 
3226                     /*
3227                      * update physical map if necessary.  worry about copy-on-write
3228                      * here -- CHECK THIS XXX
3229                      */
3230 
3231                     if (current->protection != old_prot) {
3232                               /* update pmap! */
3233 #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
3234                               uvm_map_lock_entry(current, RW_WRITER);
3235 #else
3236                               uvm_map_lock_entry(current, RW_READER);
3237 #endif
3238                               pmap_protect(map->pmap, current->start, current->end,
3239                                   current->protection & MASK(current));
3240                               uvm_map_unlock_entry(current);
3241 
3242                               /*
3243                                * If this entry points at a vnode, and the
3244                                * protection includes VM_PROT_EXECUTE, mark
3245                                * the vnode as VEXECMAP.
3246                                */
3247                               if (UVM_ET_ISOBJ(current)) {
3248                                         struct uvm_object *uobj =
3249                                             current->object.uvm_obj;
3250 
3251                                         if (UVM_OBJ_IS_VNODE(uobj) &&
3252                                             (current->protection & VM_PROT_EXECUTE)) {
3253                                                   vn_markexec((struct vnode *) uobj);
3254                                         }
3255                               }
3256                     }
3257 
3258                     /*
3259                      * If the map is configured to lock any future mappings,
3260                      * wire this entry now if the old protection was VM_PROT_NONE
3261                      * and the new protection is not VM_PROT_NONE.
3262                      */
3263 
3264                     if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
3265                         VM_MAPENT_ISWIRED(current) == 0 &&
3266                         old_prot == VM_PROT_NONE &&
3267                         new_prot != VM_PROT_NONE) {
3268 
3269                               /*
3270                                * We must call pmap_update() here because the
3271                                * pmap_protect() call above might have removed some
3272                                * pmap entries and uvm_map_pageable() might create
3273                                * some new pmap entries that rely on the prior
3274                                * removals being completely finished.
3275                                */
3276 
3277                               pmap_update(map->pmap);
3278 
3279                               if (uvm_map_pageable(map, current->start,
3280                                   current->end, false,
3281                                   UVM_LK_ENTER|UVM_LK_EXIT) != 0) {
3282 
3283                                         /*
3284                                          * If locking the entry fails, remember the
3285                                          * error if it's the first one.  Note we
3286                                          * still continue setting the protection in
3287                                          * the map, but will return the error
3288                                          * condition regardless.
3289                                          *
3290                                          * XXX Ignore what the actual error is,
3291                                          * XXX just call it a resource shortage
3292                                          * XXX so that it doesn't get confused
3293                                          * XXX what uvm_map_protect() itself would
3294                                          * XXX normally return.
3295                                          */
3296 
3297                                         error = ENOMEM;
3298                               }
3299                     }
3300                     current = current->next;
3301           }
3302           pmap_update(map->pmap);
3303 
3304  out:
3305           vm_map_unlock(map);
3306 
3307           UVMHIST_LOG(maphist, "<- done, error=%jd",error,0,0,0);
3308           return error;
3309 }
3310 
3311 #undef  MASK
3312 
3313 /*
3314  * uvm_map_inherit: set inheritance code for range of addrs in map.
3315  *
3316  * => map must be unlocked
3317  * => note that the inherit code is used during a "fork".  see fork
3318  *        code for details.
3319  */
3320 
3321 int
uvm_map_inherit(struct vm_map * map,vaddr_t start,vaddr_t end,vm_inherit_t new_inheritance)3322 uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
3323     vm_inherit_t new_inheritance)
3324 {
3325           struct vm_map_entry *entry, *temp_entry;
3326           UVMHIST_FUNC(__func__);
3327           UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_inh=%#jx)",
3328               (uintptr_t)map, start, end, new_inheritance);
3329 
3330           switch (new_inheritance) {
3331           case MAP_INHERIT_NONE:
3332           case MAP_INHERIT_COPY:
3333           case MAP_INHERIT_SHARE:
3334           case MAP_INHERIT_ZERO:
3335                     break;
3336           default:
3337                     UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
3338                     return EINVAL;
3339           }
3340 
3341           vm_map_lock(map);
3342           VM_MAP_RANGE_CHECK(map, start, end);
3343           if (uvm_map_lookup_entry(map, start, &temp_entry)) {
3344                     entry = temp_entry;
3345                     UVM_MAP_CLIP_START(map, entry, start);
3346           }  else {
3347                     entry = temp_entry->next;
3348           }
3349           while ((entry != &map->header) && (entry->start < end)) {
3350                     UVM_MAP_CLIP_END(map, entry, end);
3351                     entry->inheritance = new_inheritance;
3352                     entry = entry->next;
3353           }
3354           vm_map_unlock(map);
3355           UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
3356           return 0;
3357 }
3358 
3359 /*
3360  * uvm_map_advice: set advice code for range of addrs in map.
3361  *
3362  * => map must be unlocked
3363  */
3364 
3365 int
uvm_map_advice(struct vm_map * map,vaddr_t start,vaddr_t end,int new_advice)3366 uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
3367 {
3368           struct vm_map_entry *entry, *temp_entry;
3369           UVMHIST_FUNC(__func__);
3370           UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_adv=%#jx)",
3371               (uintptr_t)map, start, end, new_advice);
3372 
3373           vm_map_lock(map);
3374           VM_MAP_RANGE_CHECK(map, start, end);
3375           if (uvm_map_lookup_entry(map, start, &temp_entry)) {
3376                     entry = temp_entry;
3377                     UVM_MAP_CLIP_START(map, entry, start);
3378           } else {
3379                     entry = temp_entry->next;
3380           }
3381 
3382           /*
3383            * XXXJRT: disallow holes?
3384            */
3385 
3386           while ((entry != &map->header) && (entry->start < end)) {
3387                     UVM_MAP_CLIP_END(map, entry, end);
3388 
3389                     switch (new_advice) {
3390                     case MADV_NORMAL:
3391                     case MADV_RANDOM:
3392                     case MADV_SEQUENTIAL:
3393                               /* nothing special here */
3394                               break;
3395 
3396                     default:
3397                               vm_map_unlock(map);
3398                               UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
3399                               return EINVAL;
3400                     }
3401                     entry->advice = new_advice;
3402                     entry = entry->next;
3403           }
3404 
3405           vm_map_unlock(map);
3406           UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
3407           return 0;
3408 }
3409 
3410 /*
3411  * uvm_map_willneed: apply MADV_WILLNEED
3412  */
3413 
3414 int
uvm_map_willneed(struct vm_map * map,vaddr_t start,vaddr_t end)3415 uvm_map_willneed(struct vm_map *map, vaddr_t start, vaddr_t end)
3416 {
3417           struct vm_map_entry *entry;
3418           UVMHIST_FUNC(__func__);
3419           UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx)",
3420               (uintptr_t)map, start, end, 0);
3421 
3422           vm_map_lock_read(map);
3423           VM_MAP_RANGE_CHECK(map, start, end);
3424           if (!uvm_map_lookup_entry(map, start, &entry)) {
3425                     entry = entry->next;
3426           }
3427           while (entry->start < end) {
3428                     struct vm_amap * const amap = entry->aref.ar_amap;
3429                     struct uvm_object * const uobj = entry->object.uvm_obj;
3430 
3431                     KASSERT(entry != &map->header);
3432                     KASSERT(start < entry->end);
3433                     /*
3434                      * For now, we handle only the easy but commonly-requested case.
3435                      * ie. start prefetching of backing uobj pages.
3436                      *
3437                      * XXX It might be useful to pmap_enter() the already-in-core
3438                      * pages by inventing a "weak" mode for uvm_fault() which would
3439                      * only do the PGO_LOCKED pgo_get().
3440                      */
3441                     if (UVM_ET_ISOBJ(entry) && amap == NULL && uobj != NULL) {
3442                               off_t offset;
3443                               off_t size;
3444 
3445                               offset = entry->offset;
3446                               if (start < entry->start) {
3447                                         offset += entry->start - start;
3448                               }
3449                               size = entry->offset + (entry->end - entry->start);
3450                               if (entry->end < end) {
3451                                         size -= end - entry->end;
3452                               }
3453                               uvm_readahead(uobj, offset, size);
3454                     }
3455                     entry = entry->next;
3456           }
3457           vm_map_unlock_read(map);
3458           UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
3459           return 0;
3460 }
3461 
3462 /*
3463  * uvm_map_pageable: sets the pageability of a range in a map.
3464  *
3465  * => wires map entries.  should not be used for transient page locking.
3466  *        for that, use uvm_fault_wire()/uvm_fault_unwire() (see uvm_vslock()).
3467  * => regions specified as not pageable require lock-down (wired) memory
3468  *        and page tables.
3469  * => map must never be read-locked
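 * => if lockflags has UVM_LK_ENTER set, map is already write-locked on
 *	entry; otherwise we lock it ourselves
 * => if lockflags has UVM_LK_EXIT set, map is handed back to the caller
 *	still locked; otherwise we unlock it before returning.  either
 *	way the wire path marks the map busy and drops the lock around
 *	its calls to uvm_fault_wire()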
3473  * => XXXCDC: check this and try and clean it up.
3474  */
3475 
3476 int
uvm_map_pageable(struct vm_map * map,vaddr_t start,vaddr_t end,bool new_pageable,int lockflags)3477 uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
3478     bool new_pageable, int lockflags)
3479 {
3480           struct vm_map_entry *entry, *start_entry, *failed_entry;
3481           int rv;
3482 #ifdef DIAGNOSTIC
3483           u_int timestamp_save;
3484 #endif
3485           UVMHIST_FUNC(__func__);
3486           UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_pageable=%ju)",
3487               (uintptr_t)map, start, end, new_pageable);
3488           KASSERT(map->flags & VM_MAP_PAGEABLE);
3489 
3490           if ((lockflags & UVM_LK_ENTER) == 0)
3491                     vm_map_lock(map);
3492           VM_MAP_RANGE_CHECK(map, start, end);
3493 
3494           /*
3495            * only one pageability change may take place at one time, since
3496            * uvm_fault_wire assumes it will be called only once for each
3497            * wiring/unwiring.  therefore, we have to make sure we're actually
3498            * changing the pageability for the entire region.  we do so before
3499            * making any changes.
3500            */
3501 
3502           if (uvm_map_lookup_entry(map, start, &start_entry) == false) {
3503                     if ((lockflags & UVM_LK_EXIT) == 0)
3504                               vm_map_unlock(map);
3505 
3506                     UVMHIST_LOG(maphist,"<- done (fault)",0,0,0,0);
3507                     return EFAULT;
3508           }
3509           entry = start_entry;
3510 
3511           if (start == end) {           /* nothing required */
3512                     if ((lockflags & UVM_LK_EXIT) == 0)
3513                               vm_map_unlock(map);
3514 
3515                     UVMHIST_LOG(maphist,"<- done (nothing)",0,0,0,0);
3516                     return 0;
3517           }
3518 
3519           /*
3520            * handle wiring and unwiring separately.
3521            */
3522 
3523           if (new_pageable) {           /* unwire */
3524                     UVM_MAP_CLIP_START(map, entry, start);
3525 
3526                     /*
3527                      * unwiring.  first ensure that the range to be unwired is
3528                      * really wired down and that there are no holes.
3529                      */
3530 
3531                     while ((entry != &map->header) && (entry->start < end)) {
3532                               if (entry->wired_count == 0 ||
3533                                   (entry->end < end &&
3534                                    (entry->next == &map->header ||
3535                                     entry->next->start > entry->end))) {
3536                                         if ((lockflags & UVM_LK_EXIT) == 0)
3537                                                   vm_map_unlock(map);
3538                                         UVMHIST_LOG(maphist, "<- done (INVAL)",0,0,0,0);
3539                                         return EINVAL;
3540                               }
3541                               entry = entry->next;
3542                     }
3543 
3544                     /*
3545                      * POSIX 1003.1b - a single munlock call unlocks a region,
3546                      * regardless of the number of mlock calls made on that
3547                      * region.
3548                      */
3549 
3550                     entry = start_entry;
3551                     while ((entry != &map->header) && (entry->start < end)) {
3552                               UVM_MAP_CLIP_END(map, entry, end);
3553                               if (VM_MAPENT_ISWIRED(entry))
3554                                         uvm_map_entry_unwire(map, entry);
3555                               entry = entry->next;
3556                     }
3557                     if ((lockflags & UVM_LK_EXIT) == 0)
3558                               vm_map_unlock(map);
3559                     UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
3560                     return 0;
3561           }
3562 
3563           /*
3564            * wire case: in two passes [XXXCDC: ugly block of code here]
3565            *
3566            * 1: holding the write lock, we create any anonymous maps that need
3567            *    to be created.  then we clip each map entry to the region to
3568            *    be wired and increment its wiring count.
3569            *
3570            * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
3571            *    in the pages for any newly wired area (wired_count == 1).
3572            *
3573            *    downgrading to a read lock for uvm_fault_wire avoids a possible
3574            *    deadlock with another thread that may have faulted on one of
3575            *    the pages to be wired (it would mark the page busy, blocking
3576            *    us, then in turn block on the map lock that we hold).  because
3577            *    of problems in the recursive lock package, we cannot upgrade
3578            *    to a write lock in vm_map_lookup.  thus, any actions that
3579            *    require the write lock must be done beforehand.  because we
3580            *    keep the read lock on the map, the copy-on-write status of the
3581            *    entries we modify here cannot change.
3582            */
3583 
3584           while ((entry != &map->header) && (entry->start < end)) {
3585                     if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
3586 
3587                               /*
3588                                * perform actions of vm_map_lookup that need the
3589                                * write lock on the map: create an anonymous map
3590                                * for a copy-on-write region, or an anonymous map
3591                                * for a zero-fill region.  (XXXCDC: submap case
3592                                * ok?)
3593                                */
3594 
3595                               if (!UVM_ET_ISSUBMAP(entry)) {  /* not submap */
3596                                         if (UVM_ET_ISNEEDSCOPY(entry) &&
3597                                             ((entry->max_protection & VM_PROT_WRITE) ||
3598                                              (entry->object.uvm_obj == NULL))) {
3599                                                   amap_copy(map, entry, 0, start, end);
3600                                                   /* XXXCDC: wait OK? */
3601                                         }
3602                               }
3603                     }
3604                     UVM_MAP_CLIP_START(map, entry, start);
3605                     UVM_MAP_CLIP_END(map, entry, end);
3606                     entry->wired_count++;
3607 
3608                     /*
3609                      * Check for holes
3610                      */
3611 
3612                     if (entry->protection == VM_PROT_NONE ||
3613                         (entry->end < end &&
3614                          (entry->next == &map->header ||
3615                           entry->next->start > entry->end))) {
3616 
3617                               /*
3618                                * found one.  amap creation actions do not need to
3619                                * be undone, but the wired counts need to be restored.
3620                                */
3621 
3622                               while (entry != &map->header && entry->end > start) {
3623                                         entry->wired_count--;
3624                                         entry = entry->prev;
3625                               }
3626                               if ((lockflags & UVM_LK_EXIT) == 0)
3627                                         vm_map_unlock(map);
3628                               UVMHIST_LOG(maphist,"<- done (INVALID WIRE)",0,0,0,0);
3629                               return EINVAL;
3630                     }
3631                     entry = entry->next;
3632           }
3633 
3634           /*
3635            * Pass 2.
3636            */
3637 
3638 #ifdef DIAGNOSTIC
3639           timestamp_save = map->timestamp;
3640 #endif
3641           vm_map_busy(map);
3642           vm_map_unlock(map);
3643 
3644           rv = 0;
3645           entry = start_entry;
3646           while (entry != &map->header && entry->start < end) {
3647                     if (entry->wired_count == 1) {
3648                               rv = uvm_fault_wire(map, entry->start, entry->end,
3649                                   entry->max_protection, 1);
3650                               if (rv) {
3651 
3652                                         /*
3653                                          * wiring failed.  break out of the loop.
3654                                          * we'll clean up the map below, once we
3655                                          * have a write lock again.
3656                                          */
3657 
3658                                         break;
3659                               }
3660                     }
3661                     entry = entry->next;
3662           }
3663 
3664           if (rv) { /* failed? */
3665 
3666                     /*
3667                      * Get back to an exclusive (write) lock.
3668                      */
3669 
3670                     vm_map_lock(map);
3671                     vm_map_unbusy(map);
3672 
3673 #ifdef DIAGNOSTIC
3674                     if (timestamp_save + 1 != map->timestamp)
3675                               panic("uvm_map_pageable: stale map");
3676 #endif
3677 
3678                     /*
3679                      * first drop the wiring count on all the entries
3680                      * which haven't actually been wired yet.
3681                      */
3682 
3683                     failed_entry = entry;
3684                     while (entry != &map->header && entry->start < end) {
3685                               entry->wired_count--;
3686                               entry = entry->next;
3687                     }
3688 
3689                     /*
3690                      * now, unwire all the entries that were successfully
3691                      * wired above.
3692                      */
3693 
3694                     entry = start_entry;
3695                     while (entry != failed_entry) {
3696                               entry->wired_count--;
3697                               if (VM_MAPENT_ISWIRED(entry) == 0)
3698                                         uvm_map_entry_unwire(map, entry);
3699                               entry = entry->next;
3700                     }
3701                     if ((lockflags & UVM_LK_EXIT) == 0)
3702                               vm_map_unlock(map);
3703                     UVMHIST_LOG(maphist, "<- done (RV=%jd)", rv,0,0,0);
3704                     return (rv);
3705           }
3706 
3707           if ((lockflags & UVM_LK_EXIT) == 0) {
3708                     vm_map_unbusy(map);
3709           } else {
3710 
3711                     /*
3712                      * Get back to an exclusive (write) lock.
3713                      */
3714 
3715                     vm_map_lock(map);
3716                     vm_map_unbusy(map);
3717           }
3718 
3719           UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
3720           return 0;
3721 }
3722 
3723 /*
3724  * uvm_map_pageable_all: special case of uvm_map_pageable - affects
3725  * all mapped regions.
3726  *
3727  * => map must not be locked.
3728  * => if no flags are specified, all regions are unwired.
3729  * => XXXJRT: has some of the same problems as uvm_map_pageable() above.
3730  */
3731 
3732 int
uvm_map_pageable_all(struct vm_map * map,int flags,vsize_t limit)3733 uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
3734 {
3735           struct vm_map_entry *entry, *failed_entry;
3736           vsize_t size;
3737           int rv;
3738 #ifdef DIAGNOSTIC
3739           u_int timestamp_save;
3740 #endif
3741           UVMHIST_FUNC(__func__);
3742           UVMHIST_CALLARGS(maphist,"(map=%#jx,flags=%#jx)", (uintptr_t)map, flags,
3743               0, 0);
3744 
3745           KASSERT(map->flags & VM_MAP_PAGEABLE);
3746 
3747           vm_map_lock(map);
3748 
3749           /*
3750            * handle wiring and unwiring separately.
3751            */
3752 
3753           if (flags == 0) {                       /* unwire */
3754 
3755                     /*
3756                      * POSIX 1003.1b -- munlockall unlocks all regions,
3757                      * regardless of how many times mlockall has been called.
3758                      */
3759 
3760                     for (entry = map->header.next; entry != &map->header;
3761                          entry = entry->next) {
3762                               if (VM_MAPENT_ISWIRED(entry))
3763                                         uvm_map_entry_unwire(map, entry);
3764                     }
3765                     map->flags &= ~VM_MAP_WIREFUTURE;
3766                     vm_map_unlock(map);
3767                     UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
3768                     return 0;
3769           }
3770 
3771           if (flags & MCL_FUTURE) {
3772 
3773                     /*
3774                      * must wire all future mappings; remember this.
3775                      */
3776 
3777                     map->flags |= VM_MAP_WIREFUTURE;
3778           }
3779 
3780           if ((flags & MCL_CURRENT) == 0) {
3781 
3782                     /*
3783                      * no more work to do!
3784                      */
3785 
3786                     UVMHIST_LOG(maphist,"<- done (OK no wire)",0,0,0,0);
3787                     vm_map_unlock(map);
3788                     return 0;
3789           }
3790 
3791           /*
3792            * wire case: in three passes [XXXCDC: ugly block of code here]
3793            *
3794            * 1: holding the write lock, count all pages mapped by non-wired
3795            *    entries.  if this would cause us to go over our limit, we fail.
3796            *
3797            * 2: still holding the write lock, we create any anonymous maps that
3798            *    need to be created.  then we increment its wiring count.
3799            *
3800            * 3: we downgrade to a read lock, and call uvm_fault_wire to fault
3801            *    in the pages for any newly wired area (wired_count == 1).
3802            *
3803            *    downgrading to a read lock for uvm_fault_wire avoids a possible
3804            *    deadlock with another thread that may have faulted on one of
3805            *    the pages to be wired (it would mark the page busy, blocking
3806            *    us, then in turn block on the map lock that we hold).  because
3807            *    of problems in the recursive lock package, we cannot upgrade
3808            *    to a write lock in vm_map_lookup.  thus, any actions that
3809            *    require the write lock must be done beforehand.  because we
3810            *    keep the read lock on the map, the copy-on-write status of the
3811            *    entries we modify here cannot change.
3812            */
3813 
3814           for (size = 0, entry = map->header.next; entry != &map->header;
3815                entry = entry->next) {
3816                     if (entry->protection != VM_PROT_NONE &&
3817                         VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
3818                               size += entry->end - entry->start;
3819                     }
3820           }
3821 
3822           if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
3823                     vm_map_unlock(map);
3824                     return ENOMEM;
3825           }
3826 
3827           if (limit != 0 &&
3828               (size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) {
3829                     vm_map_unlock(map);
3830                     return ENOMEM;
3831           }
3832 
3833           /*
3834            * Pass 2.
3835            */
3836 
3837           for (entry = map->header.next; entry != &map->header;
3838                entry = entry->next) {
3839                     if (entry->protection == VM_PROT_NONE)
3840                               continue;
3841                     if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
3842 
3843                               /*
3844                                * perform actions of vm_map_lookup that need the
3845                                * write lock on the map: create an anonymous map
3846                                * for a copy-on-write region, or an anonymous map
3847                                * for a zero-fill region.  (XXXCDC: submap case
3848                                * ok?)
3849                                */
3850 
3851                               if (!UVM_ET_ISSUBMAP(entry)) {          /* not submap */
3852                                         if (UVM_ET_ISNEEDSCOPY(entry) &&
3853                                             ((entry->max_protection & VM_PROT_WRITE) ||
3854                                              (entry->object.uvm_obj == NULL))) {
3855                                                   amap_copy(map, entry, 0, entry->start,
3856                                                       entry->end);
3857                                                   /* XXXCDC: wait OK? */
3858                                         }
3859                               }
3860                     }
3861                     entry->wired_count++;
3862           }
3863 
3864           /*
3865            * Pass 3.
3866            */
3867 
3868 #ifdef DIAGNOSTIC
3869           timestamp_save = map->timestamp;
3870 #endif
3871           vm_map_busy(map);
3872           vm_map_unlock(map);
3873 
3874           rv = 0;
3875           for (entry = map->header.next; entry != &map->header;
3876                entry = entry->next) {
3877                     if (entry->wired_count == 1) {
3878                               rv = uvm_fault_wire(map, entry->start, entry->end,
3879                                   entry->max_protection, 1);
3880                               if (rv) {
3881 
3882                                         /*
3883                                          * wiring failed.  break out of the loop.
3884                                          * we'll clean up the map below, once we
3885                                          * have a write lock again.
3886                                          */
3887 
3888                                         break;
3889                               }
3890                     }
3891           }
3892 
3893           if (rv) {
3894 
3895                     /*
3896                      * Get back an exclusive (write) lock.
3897                      */
3898 
3899                     vm_map_lock(map);
3900                     vm_map_unbusy(map);
3901 
3902 #ifdef DIAGNOSTIC
3903                     if (timestamp_save + 1 != map->timestamp)
3904                               panic("uvm_map_pageable_all: stale map");
3905 #endif
3906 
3907                     /*
3908                      * first drop the wiring count on all the entries
3909                      * which haven't actually been wired yet.
3910                      *
3911                      * Skip VM_PROT_NONE entries like we did above.
3912                      */
3913 
3914                     failed_entry = entry;
3915                     for (/* nothing */; entry != &map->header;
3916                          entry = entry->next) {
3917                               if (entry->protection == VM_PROT_NONE)
3918                                         continue;
3919                               entry->wired_count--;
3920                     }
3921 
3922                     /*
3923                      * now, unwire all the entries that were successfully
3924                      * wired above.
3925                      *
3926                      * Skip VM_PROT_NONE entries like we did above.
3927                      */
3928 
3929                     for (entry = map->header.next; entry != failed_entry;
3930                          entry = entry->next) {
3931                               if (entry->protection == VM_PROT_NONE)
3932                                         continue;
3933                               entry->wired_count--;
3934                               if (VM_MAPENT_ISWIRED(entry))
3935                                         uvm_map_entry_unwire(map, entry);
3936                     }
3937                     vm_map_unlock(map);
3938                     UVMHIST_LOG(maphist,"<- done (RV=%jd)", rv,0,0,0);
3939                     return (rv);
3940           }
3941 
3942           vm_map_unbusy(map);
3943 
3944           UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
3945           return 0;
3946 }
3947 
3948 /*
3949  * uvm_map_clean: clean out a map range
3950  *
3951  * => valid flags:
3952  *   if (flags & PGO_CLEANIT): dirty pages are cleaned first
3953  *   if (flags & PGO_SYNCIO): dirty pages are written synchronously
3954  *   if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
3955  *   if (flags & PGO_FREE): any cached pages are freed after clean
3956  * => returns an error if any part of the specified range isn't mapped
3957  * => never a need to flush amap layer since the anonymous memory has
3958  *        no permanent home, but may deactivate pages there
3959  * => called from sys_msync() and sys_madvise()
3960  * => caller must not have map locked
3961  */
3962 
3963 int
uvm_map_clean(struct vm_map * map,vaddr_t start,vaddr_t end,int flags)3964 uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
3965 {
3966           struct vm_map_entry *current, *entry;
3967           struct uvm_object *uobj;
3968           struct vm_amap *amap;
3969           struct vm_anon *anon;
3970           struct vm_page *pg;
3971           vaddr_t offset;
3972           vsize_t size;
3973           voff_t uoff;
3974           int error, refs;
3975           UVMHIST_FUNC(__func__);
3976           UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,flags=%#jx)",
3977               (uintptr_t)map, start, end, flags);
3978 
3979           KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
3980                     (PGO_FREE|PGO_DEACTIVATE));
3981 
3982           vm_map_lock(map);
3983           VM_MAP_RANGE_CHECK(map, start, end);
3984           if (!uvm_map_lookup_entry(map, start, &entry)) {
3985                     vm_map_unlock(map);
3986                     return EFAULT;
3987           }
3988 
3989           /*
3990            * Make a first pass to check for holes and wiring problems.
3991            */
3992 
3993           for (current = entry; current->start < end; current = current->next) {
3994                     if (UVM_ET_ISSUBMAP(current)) {
3995                               vm_map_unlock(map);
3996                               return EINVAL;
3997                     }
3998                     if ((flags & PGO_FREE) != 0 && VM_MAPENT_ISWIRED(entry)) {
3999                               vm_map_unlock(map);
4000                               return EBUSY;
4001                     }
4002                     if (end <= current->end) {
4003                               break;
4004                     }
4005                     if (current->end != current->next->start) {
4006                               vm_map_unlock(map);
4007                               return EFAULT;
4008                     }
4009           }
4010 
4011           vm_map_busy(map);
4012           vm_map_unlock(map);
4013           error = 0;
4014           for (current = entry; start < end; current = current->next) {
4015                     amap = current->aref.ar_amap; /* upper layer */
4016                     uobj = current->object.uvm_obj;         /* lower layer */
4017                     KASSERT(start >= current->start);
4018 
4019                     /*
4020                      * No amap cleaning necessary if:
4021                      *
4022                      *        (1) There's no amap.
4023                      *
4024                      *        (2) We're not deactivating or freeing pages.
4025                      */
4026 
4027                     if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
4028                               goto flush_object;
4029 
4030                     offset = start - current->start;
4031                     size = MIN(end, current->end) - start;
4032 
4033                     amap_lock(amap, RW_WRITER);
4034                     for ( ; size != 0; size -= PAGE_SIZE, offset += PAGE_SIZE) {
4035                               anon = amap_lookup(&current->aref, offset);
4036                               if (anon == NULL)
4037                                         continue;
4038 
4039                               KASSERT(anon->an_lock == amap->am_lock);
4040                               pg = anon->an_page;
4041                               if (pg == NULL) {
4042                                         continue;
4043                               }
4044                               if (pg->flags & PG_BUSY) {
4045                                         continue;
4046                               }
4047 
4048                               switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
4049 
4050                               /*
4051                                * In these first 3 cases, we just deactivate the page.
4052                                */
4053 
4054                               case PGO_CLEANIT|PGO_FREE:
4055                               case PGO_CLEANIT|PGO_DEACTIVATE:
4056                               case PGO_DEACTIVATE:
4057  deactivate_it:
4058                                         /*
4059                                          * skip the page if it's loaned or wired,
4060                                          * since it shouldn't be on a paging queue
4061                                          * at all in these cases.
4062                                          */
4063 
4064                                         if (pg->loan_count != 0 ||
4065                                             pg->wire_count != 0) {
4066                                                   continue;
4067                                         }
4068                                         KASSERT(pg->uanon == anon);
4069                                         uvm_pagelock(pg);
4070                                         uvm_pagedeactivate(pg);
4071                                         uvm_pageunlock(pg);
4072                                         continue;
4073 
4074                               case PGO_FREE:
4075 
4076                                         /*
4077                                          * If there are multiple references to
4078                                          * the amap, just deactivate the page.
4079                                          */
4080 
4081                                         if (amap_refs(amap) > 1)
4082                                                   goto deactivate_it;
4083 
4084                                         /* skip the page if it's wired */
4085                                         if (pg->wire_count != 0) {
4086                                                   continue;
4087                                         }
4088                                         amap_unadd(&current->aref, offset);
4089                                         refs = --anon->an_ref;
4090                                         if (refs == 0) {
4091                                                   uvm_anfree(anon);
4092                                         }
4093                                         continue;
4094                               }
4095                     }
4096                     amap_unlock(amap);
4097 
4098  flush_object:
4099                     /*
4100                      * flush pages if we've got a valid backing object.
4101                      * note that we must always clean object pages before
4102                      * freeing them since otherwise we could reveal stale
4103                      * data from files.
4104                      */
4105 
4106                     uoff = current->offset + (start - current->start);
4107                     size = MIN(end, current->end) - start;
4108                     if (uobj != NULL) {
4109                               rw_enter(uobj->vmobjlock, RW_WRITER);
4110                               if (uobj->pgops->pgo_put != NULL)
4111                                         error = (uobj->pgops->pgo_put)(uobj, uoff,
4112                                             uoff + size, flags | PGO_CLEANIT);
4113                               else
4114                                         error = 0;
4115                     }
4116                     start += size;
4117           }
4118           vm_map_unbusy(map);
4119           return error;
4120 }
4121 
4122 
4123 /*
4124  * uvm_map_checkprot: check protection in map
4125  *
4126  * => must allow specified protection in a fully allocated region.
4127  * => map must be read or write locked by caller.
4128  */
4129 
4130 bool
uvm_map_checkprot(struct vm_map * map,vaddr_t start,vaddr_t end,vm_prot_t protection)4131 uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
4132     vm_prot_t protection)
4133 {
4134           struct vm_map_entry *entry;
4135           struct vm_map_entry *tmp_entry;
4136 
4137           if (!uvm_map_lookup_entry(map, start, &tmp_entry)) {
4138                     return (false);
4139           }
4140           entry = tmp_entry;
4141           while (start < end) {
4142                     if (entry == &map->header) {
4143                               return (false);
4144                     }
4145 
4146                     /*
4147                      * no holes allowed
4148                      */
4149 
4150                     if (start < entry->start) {
4151                               return (false);
4152                     }
4153 
4154                     /*
4155                      * check protection associated with entry
4156                      */
4157 
4158                     if ((entry->protection & protection) != protection) {
4159                               return (false);
4160                     }
4161                     start = entry->end;
4162                     entry = entry->next;
4163           }
4164           return (true);
4165 }
4166 
4167 /*
4168  * uvmspace_alloc: allocate a vmspace structure.
4169  *
4170  * - structure includes vm_map and pmap
4171  * - XXX: no locking on this structure
4172  * - refcnt set to 1, rest must be init'd by caller
4173  */
4174 struct vmspace *
uvmspace_alloc(vaddr_t vmin,vaddr_t vmax,bool topdown)4175 uvmspace_alloc(vaddr_t vmin, vaddr_t vmax, bool topdown)
4176 {
4177           struct vmspace *vm;
4178           UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
4179 
4180           vm = kmem_alloc(sizeof(*vm), KM_SLEEP);
4181           uvmspace_init(vm, NULL, vmin, vmax, topdown);
4182           UVMHIST_LOG(maphist,"<- done (vm=%#jx)", (uintptr_t)vm, 0, 0, 0);
4183           return (vm);
4184 }
4185 
4186 /*
4187  * uvmspace_init: initialize a vmspace structure.
4188  *
4189  * - XXX: no locking on this structure
4190  * - refcnt set to 1, rest must be init'd by caller
4191  */
4192 void
uvmspace_init(struct vmspace * vm,struct pmap * pmap,vaddr_t vmin,vaddr_t vmax,bool topdown)4193 uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin,
4194     vaddr_t vmax, bool topdown)
4195 {
4196           UVMHIST_FUNC(__func__);
4197           UVMHIST_CALLARGS(maphist, "(vm=%#jx, pmap=%#jx, vmin=%#jx, vmax=%#jx",
4198               (uintptr_t)vm, (uintptr_t)pmap, vmin, vmax);
4199           UVMHIST_LOG(maphist, "   topdown=%ju)", topdown, 0, 0, 0);
4200 
4201           memset(vm, 0, sizeof(*vm));
4202           uvm_map_setup(&vm->vm_map, vmin, vmax, VM_MAP_PAGEABLE
4203               | (topdown ? VM_MAP_TOPDOWN : 0)
4204               );
4205           if (pmap)
4206                     pmap_reference(pmap);
4207           else
4208                     pmap = pmap_create();
4209           vm->vm_map.pmap = pmap;
4210           vm->vm_refcnt = 1;
4211           UVMHIST_LOG(maphist,"<- done",0,0,0,0);
4212 }
4213 
4214 /*
4215  * uvmspace_share: share a vmspace between two processes
4216  *
4217  * - used for vfork, threads(?)
4218  */
4219 
4220 void
uvmspace_share(struct proc * p1,struct proc * p2)4221 uvmspace_share(struct proc *p1, struct proc *p2)
4222 {
4223 
4224           uvmspace_addref(p1->p_vmspace);
4225           p2->p_vmspace = p1->p_vmspace;
4226 }
4227 
#if 0

/*
 * uvmspace_unshare: ensure that process "p" has its own, unshared, vmspace
 *
 * - compiled out by the surrounding "#if 0"; kept for reference only
 * - if the current vmspace has a single reference nothing is done
 * - otherwise the vmspace is forked, the lwp is switched onto the copy,
 *   and the reference on the old vmspace is dropped
 * - XXX: no locking on vmspace
 */

void
uvmspace_unshare(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct vmspace *nvm, *ovm = p->p_vmspace;

	if (ovm->vm_refcnt == 1)
		/* nothing to do: vmspace isn't shared in the first place */
		return;

	/* make a new vmspace, still holding old one */
	nvm = uvmspace_fork(ovm);

	/* swap vmspaces with preemption disabled */
	kpreempt_disable();
	pmap_deactivate(l);		/* unbind old vmspace */
	p->p_vmspace = nvm;
	pmap_activate(l);		/* switch to new vmspace */
	kpreempt_enable();

	uvmspace_free(ovm);		/* drop reference to old vmspace */
}

#endif
4259 
4260 /*
4261  * uvmspace_exec: the process wants to exec a new program
4262  */
4263 
4264 void
uvmspace_exec(struct lwp * l,vaddr_t start,vaddr_t end,bool topdown)4265 uvmspace_exec(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
4266 {
4267           struct proc *p = l->l_proc;
4268           struct vmspace *nvm, *ovm = p->p_vmspace;
4269           struct vm_map *map;
4270           int flags;
4271 
4272           KASSERT(ovm != NULL);
4273 #ifdef __HAVE_CPU_VMSPACE_EXEC
4274           cpu_vmspace_exec(l, start, end);
4275 #endif
4276 
4277           /*
4278            * If p is the only process using the vmspace, we can safely
4279            * recycle it for the program that is being exec'd, rather than
4280            * allocate a new vmspace -- but we have to make sure it's
4281            * empty first.
4282            */
4283           map = &ovm->vm_map;
4284           if (ovm->vm_refcnt == 1 && map->nentries != 0) {
4285                     /*
4286                      * SYSV SHM semantics require us to kill all segments on an exec
4287                      */
4288                     if (uvm_shmexit && ovm->vm_shm)
4289                               (*uvm_shmexit)(ovm);
4290 
4291                     /*
4292                      * POSIX 1003.1b -- "lock future mappings" is revoked
4293                      * when a process execs another program image.
4294                      */
4295                     map->flags &= ~VM_MAP_WIREFUTURE;
4296 
4297                     /*
4298                      * now unmap the old program.
4299                      *
4300                      * XXX set VM_MAP_DYING for the duration, so pmap_update()
4301                      * is not called until the pmap has been totally cleared out
4302                      * after pmap_remove_all(), or it can confuse some pmap
4303                      * implementations.  it would be nice to handle this by
4304                      * deferring the pmap_update() while it is known the address
4305                      * space is not visible to any user LWP other than curlwp,
4306                      * but there isn't an elegant way of inferring that right
4307                      * now.
4308                      */
4309                     flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
4310                     map->flags |= VM_MAP_DYING;
4311                     uvm_unmap1(map, vm_map_min(map), vm_map_max(map), flags);
4312                     map->flags &= ~VM_MAP_DYING;
4313                     pmap_update(map->pmap);
4314                     KASSERT(map->header.prev == &map->header);
4315                     KASSERT(map->nentries == 0);
4316           }
4317 
4318           if (ovm->vm_refcnt == 1) {
4319                     /*
4320                      * The vmspace is not shared and is empty (if it
4321                      * weren't, we would have emptied it above).
4322                      *
4323                      * Resize the map and set topdown as appropriate.
4324                      */
4325                     KASSERT(map->nentries == 0);
4326                     vm_map_setmin(map, start);
4327                     vm_map_setmax(map, end);
4328                     if (topdown) {
4329                               map->flags |= VM_MAP_TOPDOWN;
4330                     } else {
4331                               map->flags &= ~VM_MAP_TOPDOWN;
4332                     }
4333           } else {
4334                     /*
4335                      * p's vmspace is being shared, so we can't reuse it for p since
4336                      * it is still being used for others.   allocate a new vmspace
4337                      * for p
4338                      */
4339                     nvm = uvmspace_alloc(start, end, topdown);
4340 
4341                     /*
4342                      * install new vmspace and drop our ref to the old one.
4343                      */
4344                     kpreempt_disable();
4345                     pmap_deactivate(l);
4346                     p->p_vmspace = nvm;
4347                     pmap_activate(l);
4348                     kpreempt_enable();
4349 
4350                     uvmspace_free(ovm);
4351           }
4352 }
4353 
/*
 * uvmspace_addref: add a reference to a vmspace.
 *
 * => the caller must already hold a reference: the vmspace is asserted
 *    to have a non-zero refcnt and a map not marked VM_MAP_DYING.
 * => the increment itself is atomic.
 */

void
uvmspace_addref(struct vmspace *vm)
{

	/* a map being torn down must not gain new users */
	KASSERT((vm->vm_map.flags & VM_MAP_DYING) == 0);
	KASSERT(vm->vm_refcnt > 0);
	atomic_inc_uint(&vm->vm_refcnt);
}
4366 
4367 /*
4368  * uvmspace_free: free a vmspace data structure
4369  */
4370 
4371 void
uvmspace_free(struct vmspace * vm)4372 uvmspace_free(struct vmspace *vm)
4373 {
4374           struct vm_map_entry *dead_entries;
4375           struct vm_map *map = &vm->vm_map;
4376           int flags;
4377 
4378           UVMHIST_FUNC(__func__);
4379           UVMHIST_CALLARGS(maphist,"(vm=%#jx) ref=%jd", (uintptr_t)vm,
4380               vm->vm_refcnt, 0, 0);
4381 
4382           membar_release();
4383           if (atomic_dec_uint_nv(&vm->vm_refcnt) > 0)
4384                     return;
4385           membar_acquire();
4386 
4387           /*
4388            * at this point, there should be no other references to the map.
4389            * delete all of the mappings, then destroy the pmap.
4390            */
4391 
4392           map->flags |= VM_MAP_DYING;
4393           flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
4394 
4395           /* Get rid of any SYSV shared memory segments. */
4396           if (uvm_shmexit && vm->vm_shm != NULL)
4397                     (*uvm_shmexit)(vm);
4398 
4399           if (map->nentries) {
4400                     vm_map_lock(map);
4401                     uvm_unmap_remove(map, vm_map_min(map), vm_map_max(map),
4402                         &dead_entries, flags);
4403                     vm_map_unlock(map);
4404                     if (dead_entries != NULL)
4405                               uvm_unmap_detach(dead_entries, 0);
4406           }
4407           KASSERT(map->nentries == 0);
4408           KASSERT(map->size == 0);
4409 
4410           mutex_destroy(&map->misc_lock);
4411           rw_destroy(&map->lock);
4412           cv_destroy(&map->cv);
4413           pmap_destroy(map->pmap);
4414           kmem_free(vm, sizeof(*vm));
4415 }
4416 
4417 static struct vm_map_entry *
uvm_mapent_clone(struct vm_map * new_map,struct vm_map_entry * old_entry,int flags)4418 uvm_mapent_clone(struct vm_map *new_map, struct vm_map_entry *old_entry,
4419     int flags)
4420 {
4421           struct vm_map_entry *new_entry;
4422 
4423           new_entry = uvm_mapent_alloc(new_map, 0);
4424           /* old_entry -> new_entry */
4425           uvm_mapent_copy(old_entry, new_entry);
4426 
4427           /* new pmap has nothing wired in it */
4428           new_entry->wired_count = 0;
4429 
4430           /*
4431            * gain reference to object backing the map (can't
4432            * be a submap, already checked this case).
4433            */
4434 
4435           if (new_entry->aref.ar_amap)
4436                     uvm_map_reference_amap(new_entry, flags);
4437 
4438           if (new_entry->object.uvm_obj &&
4439               new_entry->object.uvm_obj->pgops->pgo_reference)
4440                     new_entry->object.uvm_obj->pgops->pgo_reference(
4441                               new_entry->object.uvm_obj);
4442 
4443           /* insert entry at end of new_map's entry list */
4444           uvm_map_entry_link(new_map, new_map->header.prev,
4445               new_entry);
4446 
4447           return new_entry;
4448 }
4449 
4450 /*
4451  * share the mapping: this means we want the old and
4452  * new entries to share amaps and backing objects.
4453  */
4454 static void
uvm_mapent_forkshared(struct vm_map * new_map,struct vm_map * old_map,struct vm_map_entry * old_entry)4455 uvm_mapent_forkshared(struct vm_map *new_map, struct vm_map *old_map,
4456     struct vm_map_entry *old_entry)
4457 {
4458           /*
4459            * if the old_entry needs a new amap (due to prev fork)
4460            * then we need to allocate it now so that we have
4461            * something we own to share with the new_entry.   [in
4462            * other words, we need to clear needs_copy]
4463            */
4464 
4465           if (UVM_ET_ISNEEDSCOPY(old_entry)) {
4466                     /* get our own amap, clears needs_copy */
4467                     amap_copy(old_map, old_entry, AMAP_COPY_NOCHUNK,
4468                         0, 0);
4469                     /* XXXCDC: WAITOK??? */
4470           }
4471 
4472           uvm_mapent_clone(new_map, old_entry, AMAP_SHARED);
4473 }
4474 
4475 
4476 static void
uvm_mapent_forkcopy(struct vm_map * new_map,struct vm_map * old_map,struct vm_map_entry * old_entry)4477 uvm_mapent_forkcopy(struct vm_map *new_map, struct vm_map *old_map,
4478     struct vm_map_entry *old_entry)
4479 {
4480           struct vm_map_entry *new_entry;
4481 
4482           /*
4483            * copy-on-write the mapping (using mmap's
4484            * MAP_PRIVATE semantics)
4485            *
4486            * allocate new_entry, adjust reference counts.
4487            * (note that new references are read-only).
4488            */
4489 
4490           new_entry = uvm_mapent_clone(new_map, old_entry, 0);
4491 
4492           new_entry->etype |=
4493               (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
4494 
4495           /*
4496            * the new entry will need an amap.  it will either
4497            * need to be copied from the old entry or created
4498            * from scratch (if the old entry does not have an
4499            * amap).  can we defer this process until later
4500            * (by setting "needs_copy") or do we need to copy
4501            * the amap now?
4502            *
4503            * we must copy the amap now if any of the following
4504            * conditions hold:
4505            * 1. the old entry has an amap and that amap is
4506            *    being shared.  this means that the old (parent)
4507            *    process is sharing the amap with another
4508            *    process.  if we do not clear needs_copy here
4509            *    we will end up in a situation where both the
4510            *    parent and child process are referring to the
4511            *    same amap with "needs_copy" set.  if the
4512            *    parent write-faults, the fault routine will
4513            *    clear "needs_copy" in the parent by allocating
4514            *    a new amap.   this is wrong because the
4515            *    parent is supposed to be sharing the old amap
4516            *    and the new amap will break that.
4517            *
4518            * 2. if the old entry has an amap and a non-zero
4519            *    wire count then we are going to have to call
4520            *    amap_cow_now to avoid page faults in the
4521            *    parent process.   since amap_cow_now requires
4522            *    "needs_copy" to be clear we might as well
4523            *    clear it here as well.
4524            *
4525            */
4526 
4527           if (old_entry->aref.ar_amap != NULL) {
4528                     if ((amap_flags(old_entry->aref.ar_amap) & AMAP_SHARED) != 0 ||
4529                         VM_MAPENT_ISWIRED(old_entry)) {
4530 
4531                               amap_copy(new_map, new_entry,
4532                                   AMAP_COPY_NOCHUNK, 0, 0);
4533                               /* XXXCDC: M_WAITOK ... ok? */
4534                     }
4535           }
4536 
4537           /*
4538            * if the parent's entry is wired down, then the
4539            * parent process does not want page faults on
4540            * access to that memory.  this means that we
4541            * cannot do copy-on-write because we can't write
4542            * protect the old entry.   in this case we
4543            * resolve all copy-on-write faults now, using
4544            * amap_cow_now.   note that we have already
4545            * allocated any needed amap (above).
4546            */
4547 
4548           if (VM_MAPENT_ISWIRED(old_entry)) {
4549 
4550                     /*
4551                      * resolve all copy-on-write faults now
4552                      * (note that there is nothing to do if
4553                      * the old mapping does not have an amap).
4554                      */
4555                     if (old_entry->aref.ar_amap)
4556                               amap_cow_now(new_map, new_entry);
4557 
4558           } else {
4559                     /*
4560                      * setup mappings to trigger copy-on-write faults
4561                      * we must write-protect the parent if it has
4562                      * an amap and it is not already "needs_copy"...
4563                      * if it is already "needs_copy" then the parent
4564                      * has already been write-protected by a previous
4565                      * fork operation.
4566                      */
4567                     if (old_entry->aref.ar_amap &&
4568                         !UVM_ET_ISNEEDSCOPY(old_entry)) {
4569                               if (old_entry->max_protection & VM_PROT_WRITE) {
4570 #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
4571                                         uvm_map_lock_entry(old_entry, RW_WRITER);
4572 #else
4573                                         uvm_map_lock_entry(old_entry, RW_READER);
4574 #endif
4575                                         pmap_protect(old_map->pmap,
4576                                             old_entry->start, old_entry->end,
4577                                             old_entry->protection & ~VM_PROT_WRITE);
4578                                         uvm_map_unlock_entry(old_entry);
4579                               }
4580                               old_entry->etype |= UVM_ET_NEEDSCOPY;
4581                     }
4582           }
4583 }
4584 
4585 /*
4586  * zero the mapping: the new entry will be zero initialized
4587  */
4588 static void
uvm_mapent_forkzero(struct vm_map * new_map,struct vm_map * old_map,struct vm_map_entry * old_entry)4589 uvm_mapent_forkzero(struct vm_map *new_map, struct vm_map *old_map,
4590     struct vm_map_entry *old_entry)
4591 {
4592           struct vm_map_entry *new_entry;
4593 
4594           new_entry = uvm_mapent_clone(new_map, old_entry, 0);
4595 
4596           new_entry->etype |=
4597               (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
4598 
4599           if (new_entry->aref.ar_amap) {
4600                     uvm_map_unreference_amap(new_entry, 0);
4601                     new_entry->aref.ar_pageoff = 0;
4602                     new_entry->aref.ar_amap = NULL;
4603           }
4604 
4605           if (UVM_ET_ISOBJ(new_entry)) {
4606                     if (new_entry->object.uvm_obj->pgops->pgo_detach)
4607                               new_entry->object.uvm_obj->pgops->pgo_detach(
4608                                   new_entry->object.uvm_obj);
4609                     new_entry->object.uvm_obj = NULL;
4610                     new_entry->offset = 0;
4611                     new_entry->etype &= ~UVM_ET_OBJ;
4612           }
4613 }
4614 
4615 /*
4616  *   F O R K   -   m a i n   e n t r y   p o i n t
4617  */
4618 /*
4619  * uvmspace_fork: fork a process' main map
4620  *
4621  * => create a new vmspace for child process from parent.
4622  * => parent's map must not be locked.
4623  */
4624 
4625 struct vmspace *
uvmspace_fork(struct vmspace * vm1)4626 uvmspace_fork(struct vmspace *vm1)
4627 {
4628           struct vmspace *vm2;
4629           struct vm_map *old_map = &vm1->vm_map;
4630           struct vm_map *new_map;
4631           struct vm_map_entry *old_entry;
4632           UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
4633 
4634           vm_map_lock(old_map);
4635 
4636           vm2 = uvmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
4637               vm1->vm_map.flags & VM_MAP_TOPDOWN);
4638           memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
4639               (char *) (vm1 + 1) - (char *) &vm1->vm_startcopy);
4640           new_map = &vm2->vm_map;                   /* XXX */
4641 
4642           old_entry = old_map->header.next;
4643           new_map->size = old_map->size;
4644 
4645           /*
4646            * go entry-by-entry
4647            */
4648 
4649           while (old_entry != &old_map->header) {
4650 
4651                     /*
4652                      * first, some sanity checks on the old entry
4653                      */
4654 
4655                     KASSERT(!UVM_ET_ISSUBMAP(old_entry));
4656                     KASSERT(UVM_ET_ISCOPYONWRITE(old_entry) ||
4657                               !UVM_ET_ISNEEDSCOPY(old_entry));
4658 
4659                     switch (old_entry->inheritance) {
4660                     case MAP_INHERIT_NONE:
4661                               /*
4662                                * drop the mapping, modify size
4663                                */
4664                               new_map->size -= old_entry->end - old_entry->start;
4665                               break;
4666 
4667                     case MAP_INHERIT_SHARE:
4668                               uvm_mapent_forkshared(new_map, old_map, old_entry);
4669                               break;
4670 
4671                     case MAP_INHERIT_COPY:
4672                               uvm_mapent_forkcopy(new_map, old_map, old_entry);
4673                               break;
4674 
4675                     case MAP_INHERIT_ZERO:
4676                               uvm_mapent_forkzero(new_map, old_map, old_entry);
4677                               break;
4678                     default:
4679                               KASSERT(0);
4680                               break;
4681                     }
4682                     old_entry = old_entry->next;
4683           }
4684 
4685           pmap_update(old_map->pmap);
4686           vm_map_unlock(old_map);
4687 
4688           if (uvm_shmfork && vm1->vm_shm)
4689                     (*uvm_shmfork)(vm1, vm2);
4690 
4691 #ifdef PMAP_FORK
4692           pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap);
4693 #endif
4694 
4695           UVMHIST_LOG(maphist,"<- done",0,0,0,0);
4696           return (vm2);
4697 }
4698 
4699 
4700 /*
4701  * uvm_mapent_trymerge: try to merge an entry with its neighbors.
4702  *
4703  * => called with map locked.
4704  * => return non zero if successfully merged.
4705  */
4706 
4707 int
uvm_mapent_trymerge(struct vm_map * map,struct vm_map_entry * entry,int flags)4708 uvm_mapent_trymerge(struct vm_map *map, struct vm_map_entry *entry, int flags)
4709 {
4710           struct uvm_object *uobj;
4711           struct vm_map_entry *next;
4712           struct vm_map_entry *prev;
4713           vsize_t size;
4714           int merged = 0;
4715           bool copying;
4716           int newetype;
4717 
4718           if (entry->aref.ar_amap != NULL) {
4719                     return 0;
4720           }
4721           if ((entry->flags & UVM_MAP_NOMERGE) != 0) {
4722                     return 0;
4723           }
4724 
4725           uobj = entry->object.uvm_obj;
4726           size = entry->end - entry->start;
4727           copying = (flags & UVM_MERGE_COPYING) != 0;
4728           newetype = copying ? (entry->etype & ~UVM_ET_NEEDSCOPY) : entry->etype;
4729 
4730           next = entry->next;
4731           if (next != &map->header &&
4732               next->start == entry->end &&
4733               ((copying && next->aref.ar_amap != NULL &&
4734               amap_refs(next->aref.ar_amap) == 1) ||
4735               (!copying && next->aref.ar_amap == NULL)) &&
4736               UVM_ET_ISCOMPATIBLE(next, newetype,
4737               uobj, entry->flags, entry->protection,
4738               entry->max_protection, entry->inheritance, entry->advice,
4739               entry->wired_count) &&
4740               (uobj == NULL || entry->offset + size == next->offset)) {
4741                     int error;
4742 
4743                     if (copying) {
4744                               error = amap_extend(next, size,
4745                                   AMAP_EXTEND_NOWAIT|AMAP_EXTEND_BACKWARDS);
4746                     } else {
4747                               error = 0;
4748                     }
4749                     if (error == 0) {
4750                               if (uobj) {
4751                                         if (uobj->pgops->pgo_detach) {
4752                                                   uobj->pgops->pgo_detach(uobj);
4753                                         }
4754                               }
4755 
4756                               entry->end = next->end;
4757                               clear_hints(map, next);
4758                               uvm_map_entry_unlink(map, next);
4759                               if (copying) {
4760                                         entry->aref = next->aref;
4761                                         entry->etype &= ~UVM_ET_NEEDSCOPY;
4762                               }
4763                               uvm_map_check(map, "trymerge forwardmerge");
4764                               uvm_mapent_free(next);
4765                               merged++;
4766                     }
4767           }
4768 
4769           prev = entry->prev;
4770           if (prev != &map->header &&
4771               prev->end == entry->start &&
4772               ((copying && !merged && prev->aref.ar_amap != NULL &&
4773               amap_refs(prev->aref.ar_amap) == 1) ||
4774               (!copying && prev->aref.ar_amap == NULL)) &&
4775               UVM_ET_ISCOMPATIBLE(prev, newetype,
4776               uobj, entry->flags, entry->protection,
4777               entry->max_protection, entry->inheritance, entry->advice,
4778               entry->wired_count) &&
4779               (uobj == NULL ||
4780               prev->offset + prev->end - prev->start == entry->offset)) {
4781                     int error;
4782 
4783                     if (copying) {
4784                               error = amap_extend(prev, size,
4785                                   AMAP_EXTEND_NOWAIT|AMAP_EXTEND_FORWARDS);
4786                     } else {
4787                               error = 0;
4788                     }
4789                     if (error == 0) {
4790                               if (uobj) {
4791                                         if (uobj->pgops->pgo_detach) {
4792                                                   uobj->pgops->pgo_detach(uobj);
4793                                         }
4794                                         entry->offset = prev->offset;
4795                               }
4796 
4797                               entry->start = prev->start;
4798                               clear_hints(map, prev);
4799                               uvm_map_entry_unlink(map, prev);
4800                               if (copying) {
4801                                         entry->aref = prev->aref;
4802                                         entry->etype &= ~UVM_ET_NEEDSCOPY;
4803                               }
4804                               uvm_map_check(map, "trymerge backmerge");
4805                               uvm_mapent_free(prev);
4806                               merged++;
4807                     }
4808           }
4809 
4810           return merged;
4811 }
4812 
4813 /*
4814  * uvm_map_setup: init map
4815  *
4816  * => map must not be in service yet.
4817  */
4818 
4819 void
uvm_map_setup(struct vm_map * map,vaddr_t vmin,vaddr_t vmax,int flags)4820 uvm_map_setup(struct vm_map *map, vaddr_t vmin, vaddr_t vmax, int flags)
4821 {
4822 
4823           rb_tree_init(&map->rb_tree, &uvm_map_tree_ops);
4824           map->header.next = map->header.prev = &map->header;
4825           map->nentries = 0;
4826           map->size = 0;
4827           map->ref_count = 1;
4828           vm_map_setmin(map, vmin);
4829           vm_map_setmax(map, vmax);
4830           map->flags = flags;
4831           map->first_free = &map->header;
4832           map->hint = &map->header;
4833           map->timestamp = 0;
4834           map->busy = NULL;
4835 
4836           rw_init(&map->lock);
4837           cv_init(&map->cv, "vm_map");
4838           mutex_init(&map->misc_lock, MUTEX_DRIVER, IPL_NONE);
4839 }
4840 
4841 /*
4842  *   U N M A P   -   m a i n   e n t r y   p o i n t
4843  */
4844 
4845 /*
4846  * uvm_unmap1: remove mappings from a vm_map (from "start" up to "stop")
4847  *
4848  * => caller must check alignment and size
4849  * => map must be unlocked (we will lock it)
4850  * => flags is UVM_FLAG_QUANTUM or 0.
4851  */
4852 
4853 void
uvm_unmap1(struct vm_map * map,vaddr_t start,vaddr_t end,int flags)4854 uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
4855 {
4856           struct vm_map_entry *dead_entries;
4857           UVMHIST_FUNC(__func__);
4858           UVMHIST_CALLARGS(maphist, "  (map=%#jx, start=%#jx, end=%#jx)",
4859               (uintptr_t)map, start, end, 0);
4860 
4861           KASSERTMSG(start < end,
4862               "%s: map %p: start %#jx < end %#jx", __func__, map,
4863               (uintmax_t)start, (uintmax_t)end);
4864           if (map == kernel_map) {
4865                     LOCKDEBUG_MEM_CHECK((void *)start, end - start);
4866           }
4867 
4868           /*
4869            * work now done by helper functions.   wipe the pmap's and then
4870            * detach from the dead entries...
4871            */
4872           vm_map_lock(map);
4873           uvm_unmap_remove(map, start, end, &dead_entries, flags);
4874           vm_map_unlock(map);
4875 
4876           if (dead_entries != NULL)
4877                     uvm_unmap_detach(dead_entries, 0);
4878 
4879           UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
4880 }
4881 
4882 
4883 /*
4884  * uvm_map_reference: add reference to a map
4885  *
4886  * => map need not be locked
4887  */
4888 
4889 void
uvm_map_reference(struct vm_map * map)4890 uvm_map_reference(struct vm_map *map)
4891 {
4892 
4893           atomic_inc_uint(&map->ref_count);
4894 }
4895 
4896 void
uvm_map_lock_entry(struct vm_map_entry * entry,krw_t op)4897 uvm_map_lock_entry(struct vm_map_entry *entry, krw_t op)
4898 {
4899 
4900           if (entry->aref.ar_amap != NULL) {
4901                     amap_lock(entry->aref.ar_amap, op);
4902           }
4903           if (UVM_ET_ISOBJ(entry)) {
4904                     rw_enter(entry->object.uvm_obj->vmobjlock, op);
4905           }
4906 }
4907 
4908 void
uvm_map_unlock_entry(struct vm_map_entry * entry)4909 uvm_map_unlock_entry(struct vm_map_entry *entry)
4910 {
4911 
4912           if (UVM_ET_ISOBJ(entry)) {
4913                     rw_exit(entry->object.uvm_obj->vmobjlock);
4914           }
4915           if (entry->aref.ar_amap != NULL) {
4916                     amap_unlock(entry->aref.ar_amap);
4917           }
4918 }
4919 
/*
 * The "object" word of a uvm_voaddr is a tagged pointer: the bits under
 * UVM_VOADDR_TYPE_MASK carry the owner type (uvm_object or vm_anon) and
 * the remaining bits carry the owner's address.  Owner pointers must
 * therefore have those low bits clear; UVM_VOADDR_SET_OBJECT asserts it.
 */
#define	UVM_VOADDR_TYPE_MASK	0x3UL
#define	UVM_VOADDR_TYPE_UOBJ	0x1UL
#define	UVM_VOADDR_TYPE_ANON	0x2UL
/* parenthesized so the expansion stays a single operand wherever used */
#define	UVM_VOADDR_OBJECT_MASK	(~UVM_VOADDR_TYPE_MASK)

/* extract the type tag / the owner address from a voaddr */
#define	UVM_VOADDR_GET_TYPE(voa)					\
	((voa)->object & UVM_VOADDR_TYPE_MASK)
#define	UVM_VOADDR_GET_OBJECT(voa)					\
	((voa)->object & UVM_VOADDR_OBJECT_MASK)
/* store an owner address together with its type tag */
#define	UVM_VOADDR_SET_OBJECT(voa, obj, type)				\
do {									\
	KASSERT(((uintptr_t)(obj) & UVM_VOADDR_TYPE_MASK) == 0);	\
	(voa)->object = ((uintptr_t)(obj)) | (type);			\
} while (/*CONSTCOND*/0)

/* typed accessors for a uvm_object owner */
#define	UVM_VOADDR_GET_UOBJ(voa)					\
	((struct uvm_object *)UVM_VOADDR_GET_OBJECT(voa))
#define	UVM_VOADDR_SET_UOBJ(voa, uobj)					\
	UVM_VOADDR_SET_OBJECT(voa, uobj, UVM_VOADDR_TYPE_UOBJ)

/* typed accessors for a vm_anon owner */
#define	UVM_VOADDR_GET_ANON(voa)					\
	((struct vm_anon *)UVM_VOADDR_GET_OBJECT(voa))
#define	UVM_VOADDR_SET_ANON(voa, anon)					\
	UVM_VOADDR_SET_OBJECT(voa, anon, UVM_VOADDR_TYPE_ANON)
4944 
4945 /*
4946  * uvm_voaddr_acquire: returns the virtual object address corresponding
4947  * to the specified virtual address.
4948  *
4949  * => resolves COW so the true page identity is tracked.
4950  *
4951  * => acquires a reference on the page's owner (uvm_object or vm_anon)
4952  */
4953 bool
uvm_voaddr_acquire(struct vm_map * const map,vaddr_t const va,struct uvm_voaddr * const voaddr)4954 uvm_voaddr_acquire(struct vm_map * const map, vaddr_t const va,
4955     struct uvm_voaddr * const voaddr)
4956 {
4957           struct vm_map_entry *entry;
4958           struct vm_anon *anon = NULL;
4959           bool result = false;
4960           bool exclusive = false;
4961           void (*unlock_fn)(struct vm_map *);
4962 
4963           UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
4964           UVMHIST_LOG(maphist,"(map=%#jx,va=%#jx)", (uintptr_t)map, va, 0, 0);
4965 
4966           const vaddr_t start = trunc_page(va);
4967           const vaddr_t end = round_page(va+1);
4968 
4969  lookup_again:
4970           if (__predict_false(exclusive)) {
4971                     vm_map_lock(map);
4972                     unlock_fn = vm_map_unlock;
4973           } else {
4974                     vm_map_lock_read(map);
4975                     unlock_fn = vm_map_unlock_read;
4976           }
4977 
4978           if (__predict_false(!uvm_map_lookup_entry(map, start, &entry))) {
4979                     unlock_fn(map);
4980                     UVMHIST_LOG(maphist,"<- done (no entry)",0,0,0,0);
4981                     return false;
4982           }
4983 
4984           if (__predict_false(entry->protection == VM_PROT_NONE)) {
4985                     unlock_fn(map);
4986                     UVMHIST_LOG(maphist,"<- done (PROT_NONE)",0,0,0,0);
4987                     return false;
4988           }
4989 
4990           /*
4991            * We have a fast path for the common case of "no COW resolution
4992            * needed" whereby we have taken a read lock on the map and if
4993            * we don't encounter any need to create a vm_anon then great!
4994            * But if we do, we loop around again, instead taking an exclusive
4995            * lock so that we can perform the fault.
4996            *
4997            * In the event that we have to resolve the fault, we do nearly the
4998            * same work as uvm_map_pageable() does:
4999            *
5000            * 1: holding the write lock, we create any anonymous maps that need
5001            *    to be created.  however, we do NOT need to clip the map entries
5002            *    in this case.
5003            *
5004            * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
5005            *    in the page (assuming the entry is not already wired).  this
5006            *    is done because we need the vm_anon to be present.
5007            */
5008           if (__predict_true(!VM_MAPENT_ISWIRED(entry))) {
5009 
5010                     bool need_fault = false;
5011 
5012                     /*
5013                      * perform the action of vm_map_lookup that need the
5014                      * write lock on the map: create an anonymous map for
5015                      * a copy-on-write region, or an anonymous map for
5016                      * a zero-fill region.
5017                      */
5018                     if (__predict_false(UVM_ET_ISSUBMAP(entry))) {
5019                               unlock_fn(map);
5020                               UVMHIST_LOG(maphist,"<- done (submap)",0,0,0,0);
5021                               return false;
5022                     }
5023                     if (__predict_false(UVM_ET_ISNEEDSCOPY(entry) &&
5024                         ((entry->max_protection & VM_PROT_WRITE) ||
5025                          (entry->object.uvm_obj == NULL)))) {
5026                               if (!exclusive) {
5027                                         /* need to take the slow path */
5028                                         KASSERT(unlock_fn == vm_map_unlock_read);
5029                                         vm_map_unlock_read(map);
5030                                         exclusive = true;
5031                                         goto lookup_again;
5032                               }
5033                               need_fault = true;
5034                               amap_copy(map, entry, 0, start, end);
5035                               /* XXXCDC: wait OK? */
5036                     }
5037 
5038                     /*
5039                      * do a quick check to see if the fault has already
5040                      * been resolved to the upper layer.
5041                      */
5042                     if (__predict_true(entry->aref.ar_amap != NULL &&
5043                                            need_fault == false)) {
5044                               amap_lock(entry->aref.ar_amap, RW_WRITER);
5045                               anon = amap_lookup(&entry->aref, start - entry->start);
5046                               if (__predict_true(anon != NULL)) {
5047                                         /* amap unlocked below */
5048                                         goto found_anon;
5049                               }
5050                               amap_unlock(entry->aref.ar_amap);
5051                               need_fault = true;
5052                     }
5053 
5054                     /*
5055                      * we predict this test as false because if we reach
5056                      * this point, then we are likely dealing with a
5057                      * shared memory region backed by a uvm_object, in
5058                      * which case a fault to create the vm_anon is not
5059                      * necessary.
5060                      */
5061                     if (__predict_false(need_fault)) {
5062                               if (exclusive) {
5063                                         vm_map_busy(map);
5064                                         vm_map_unlock(map);
5065                                         unlock_fn = vm_map_unbusy;
5066                               }
5067 
5068                               if (uvm_fault_wire(map, start, end,
5069                                                      entry->max_protection, 1)) {
5070                                         /* wiring failed */
5071                                         unlock_fn(map);
5072                                         UVMHIST_LOG(maphist,"<- done (wire failed)",
5073                                                       0,0,0,0);
5074                                         return false;
5075                               }
5076 
5077                               /*
5078                                * now that we have resolved the fault, we can unwire
5079                                * the page.
5080                                */
5081                               if (exclusive) {
5082                                         vm_map_lock(map);
5083                                         vm_map_unbusy(map);
5084                                         unlock_fn = vm_map_unlock;
5085                               }
5086 
5087                               uvm_fault_unwire_locked(map, start, end);
5088                     }
5089           }
5090 
5091           /* check the upper layer */
5092           if (entry->aref.ar_amap) {
5093                     amap_lock(entry->aref.ar_amap, RW_WRITER);
5094                     anon = amap_lookup(&entry->aref, start - entry->start);
5095                     if (anon) {
5096  found_anon:                  KASSERT(anon->an_lock == entry->aref.ar_amap->am_lock);
5097                               anon->an_ref++;
5098                               rw_obj_hold(anon->an_lock);
5099                               KASSERT(anon->an_ref != 0);
5100                               UVM_VOADDR_SET_ANON(voaddr, anon);
5101                               voaddr->offset = va & PAGE_MASK;
5102                               result = true;
5103                     }
5104                     amap_unlock(entry->aref.ar_amap);
5105           }
5106 
5107           /* check the lower layer */
5108           if (!result && UVM_ET_ISOBJ(entry)) {
5109                     struct uvm_object *uobj = entry->object.uvm_obj;
5110 
5111                     KASSERT(uobj != NULL);
5112                     (*uobj->pgops->pgo_reference)(uobj);
5113                     UVM_VOADDR_SET_UOBJ(voaddr, uobj);
5114                     voaddr->offset = entry->offset + (va - entry->start);
5115                     result = true;
5116           }
5117 
5118           unlock_fn(map);
5119 
5120           if (result) {
5121                     UVMHIST_LOG(maphist,
5122                         "<- done OK (type=%jd,owner=%#jx,offset=%#jx)",
5123                         UVM_VOADDR_GET_TYPE(voaddr),
5124                         UVM_VOADDR_GET_OBJECT(voaddr),
5125                         voaddr->offset, 0);
5126           } else {
5127                     UVMHIST_LOG(maphist,"<- done (failed)",0,0,0,0);
5128           }
5129 
5130           return result;
5131 }
5132 
5133 /*
5134  * uvm_voaddr_release: release the references held by the
5135  * vitual object address.
5136  */
5137 void
uvm_voaddr_release(struct uvm_voaddr * const voaddr)5138 uvm_voaddr_release(struct uvm_voaddr * const voaddr)
5139 {
5140 
5141           switch (UVM_VOADDR_GET_TYPE(voaddr)) {
5142           case UVM_VOADDR_TYPE_UOBJ: {
5143                     struct uvm_object * const uobj = UVM_VOADDR_GET_UOBJ(voaddr);
5144 
5145                     KASSERT(uobj != NULL);
5146                     KASSERT(uobj->pgops->pgo_detach != NULL);
5147                     (*uobj->pgops->pgo_detach)(uobj);
5148                     break;
5149               }
5150           case UVM_VOADDR_TYPE_ANON: {
5151                     struct vm_anon * const anon = UVM_VOADDR_GET_ANON(voaddr);
5152                     krwlock_t *lock;
5153 
5154                     KASSERT(anon != NULL);
5155                     rw_enter((lock = anon->an_lock), RW_WRITER);
5156                     KASSERT(anon->an_ref > 0);
5157                     if (--anon->an_ref == 0) {
5158                               uvm_anfree(anon);
5159                     }
5160                     rw_exit(lock);
5161                     rw_obj_free(lock);
5162                     break;
5163               }
5164           default:
5165                     panic("uvm_voaddr_release: bad type");
5166           }
5167           memset(voaddr, 0, sizeof(*voaddr));
5168 }
5169 
5170 /*
5171  * uvm_voaddr_compare: compare two uvm_voaddr objects.
5172  *
5173  * => memcmp() semantics
5174  */
5175 int
uvm_voaddr_compare(const struct uvm_voaddr * const voaddr1,const struct uvm_voaddr * const voaddr2)5176 uvm_voaddr_compare(const struct uvm_voaddr * const voaddr1,
5177     const struct uvm_voaddr * const voaddr2)
5178 {
5179           const uintptr_t type1 = UVM_VOADDR_GET_TYPE(voaddr1);
5180           const uintptr_t type2 = UVM_VOADDR_GET_TYPE(voaddr2);
5181 
5182           KASSERT(type1 == UVM_VOADDR_TYPE_UOBJ ||
5183                     type1 == UVM_VOADDR_TYPE_ANON);
5184 
5185           KASSERT(type2 == UVM_VOADDR_TYPE_UOBJ ||
5186                     type2 == UVM_VOADDR_TYPE_ANON);
5187 
5188           if (type1 < type2)
5189                     return -1;
5190           if (type1 > type2)
5191                     return 1;
5192 
5193           const uintptr_t addr1 = UVM_VOADDR_GET_OBJECT(voaddr1);
5194           const uintptr_t addr2 = UVM_VOADDR_GET_OBJECT(voaddr2);
5195 
5196           if (addr1 < addr2)
5197                     return -1;
5198           if (addr1 > addr2)
5199                     return 1;
5200 
5201           if (voaddr1->offset < voaddr2->offset)
5202                     return -1;
5203           if (voaddr1->offset > voaddr2->offset)
5204                     return 1;
5205 
5206           return 0;
5207 }
5208 
5209 #if defined(DDB) || defined(DEBUGPRINT)
5210 
5211 /*
5212  * uvm_map_printit: actually prints the map
5213  */
5214 
5215 void
uvm_map_printit(struct vm_map * map,bool full,void (* pr)(const char *,...))5216 uvm_map_printit(struct vm_map *map, bool full,
5217     void (*pr)(const char *, ...))
5218 {
5219           struct vm_map_entry *entry;
5220 
5221           (*pr)("MAP %p: [%#lx->%#lx]\n", map, vm_map_min(map),
5222               vm_map_max(map));
5223           (*pr)("\t#ent=%d, sz=%d, ref=%d, version=%d, flags=%#x\n",
5224               map->nentries, map->size, map->ref_count, map->timestamp,
5225               map->flags);
5226           (*pr)("\tpmap=%p(resident=%ld, wired=%ld)\n", map->pmap,
5227               pmap_resident_count(map->pmap), pmap_wired_count(map->pmap));
5228           if (!full)
5229                     return;
5230           for (entry = map->header.next; entry != &map->header;
5231               entry = entry->next) {
5232                     (*pr)(" - %p: %#lx->%#lx: obj=%p/%#llx, amap=%p/%d\n",
5233                         entry, entry->start, entry->end, entry->object.uvm_obj,
5234                         (long long)entry->offset, entry->aref.ar_amap,
5235                         entry->aref.ar_pageoff);
5236                     (*pr)(
5237                         "\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
5238                         "wc=%d, adv=%d%s\n",
5239                         (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
5240                         (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
5241                         (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
5242                         entry->protection, entry->max_protection,
5243                         entry->inheritance, entry->wired_count, entry->advice,
5244                         entry == map->first_free ? " (first_free)" : "");
5245           }
5246 }
5247 
5248 void
uvm_whatis(uintptr_t addr,void (* pr)(const char *,...))5249 uvm_whatis(uintptr_t addr, void (*pr)(const char *, ...))
5250 {
5251           struct vm_map *map;
5252 
5253           for (map = kernel_map;;) {
5254                     struct vm_map_entry *entry;
5255 
5256                     if (!uvm_map_lookup_entry_bytree(map, (vaddr_t)addr, &entry)) {
5257                               break;
5258                     }
5259                     (*pr)("%p is %p+%zu from VMMAP %p\n",
5260                         (void *)addr, (void *)entry->start,
5261                         (size_t)(addr - (uintptr_t)entry->start), map);
5262                     if (!UVM_ET_ISSUBMAP(entry)) {
5263                               break;
5264                     }
5265                     map = entry->object.sub_map;
5266           }
5267 }
5268 
5269 #endif /* DDB || DEBUGPRINT */
5270 
5271 #ifndef __USER_VA0_IS_SAFE
5272 static int
sysctl_user_va0_disable(SYSCTLFN_ARGS)5273 sysctl_user_va0_disable(SYSCTLFN_ARGS)
5274 {
5275           struct sysctlnode node;
5276           int t, error;
5277 
5278           node = *rnode;
5279           node.sysctl_data = &t;
5280           t = user_va0_disable;
5281           error = sysctl_lookup(SYSCTLFN_CALL(&node));
5282           if (error || newp == NULL)
5283                     return (error);
5284 
5285           if (!t && user_va0_disable &&
5286               kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MAP_VA_ZERO, 0,
5287               NULL, NULL, NULL))
5288                     return EPERM;
5289 
5290           user_va0_disable = !!t;
5291           return 0;
5292 }
5293 #endif
5294 
5295 static int
fill_vmentry(struct lwp * l,struct proc * p,struct kinfo_vmentry * kve,struct vm_map * m,struct vm_map_entry * e)5296 fill_vmentry(struct lwp *l, struct proc *p, struct kinfo_vmentry *kve,
5297     struct vm_map *m, struct vm_map_entry *e)
5298 {
5299 #ifndef _RUMPKERNEL
5300           int error;
5301 
5302           memset(kve, 0, sizeof(*kve));
5303           KASSERT(e != NULL);
5304           if (UVM_ET_ISOBJ(e)) {
5305                     struct uvm_object *uobj = e->object.uvm_obj;
5306                     KASSERT(uobj != NULL);
5307                     kve->kve_ref_count = uobj->uo_refs;
5308                     kve->kve_count = uobj->uo_npages;
5309                     if (UVM_OBJ_IS_VNODE(uobj)) {
5310                               struct vattr va;
5311                               struct vnode *vp = (struct vnode *)uobj;
5312                               vn_lock(vp, LK_SHARED | LK_RETRY);
5313                               error = VOP_GETATTR(vp, &va, l->l_cred);
5314                               VOP_UNLOCK(vp);
5315                               kve->kve_type = KVME_TYPE_VNODE;
5316                               if (error == 0) {
5317                                         kve->kve_vn_size = vp->v_size;
5318                                         kve->kve_vn_type = (int)vp->v_type;
5319                                         kve->kve_vn_mode = va.va_mode;
5320                                         kve->kve_vn_rdev = va.va_rdev;
5321                                         kve->kve_vn_fileid = va.va_fileid;
5322                                         kve->kve_vn_fsid = va.va_fsid;
5323                                         error = vnode_to_path(kve->kve_path,
5324                                             sizeof(kve->kve_path) / 2, vp, l, p);
5325                               }
5326                     } else if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
5327                               kve->kve_type = KVME_TYPE_KERN;
5328                     } else if (UVM_OBJ_IS_DEVICE(uobj)) {
5329                               kve->kve_type = KVME_TYPE_DEVICE;
5330                     } else if (UVM_OBJ_IS_AOBJ(uobj)) {
5331                               kve->kve_type = KVME_TYPE_ANON;
5332                     } else {
5333                               kve->kve_type = KVME_TYPE_OBJECT;
5334                     }
5335           } else if (UVM_ET_ISSUBMAP(e)) {
5336                     struct vm_map *map = e->object.sub_map;
5337                     KASSERT(map != NULL);
5338                     kve->kve_ref_count = map->ref_count;
5339                     kve->kve_count = map->nentries;
5340                     kve->kve_type = KVME_TYPE_SUBMAP;
5341           } else
5342                     kve->kve_type = KVME_TYPE_UNKNOWN;
5343 
5344           kve->kve_start = e->start;
5345           kve->kve_end = e->end;
5346           kve->kve_offset = e->offset;
5347           kve->kve_wired_count = e->wired_count;
5348           kve->kve_inheritance = e->inheritance;
5349           kve->kve_attributes = 0; /* unused */
5350           kve->kve_advice = e->advice;
5351 #define PROT(p) (((p) & VM_PROT_READ) ? KVME_PROT_READ : 0) | \
5352           (((p) & VM_PROT_WRITE) ? KVME_PROT_WRITE : 0) | \
5353           (((p) & VM_PROT_EXECUTE) ? KVME_PROT_EXEC : 0)
5354           kve->kve_protection = PROT(e->protection);
5355           kve->kve_max_protection = PROT(e->max_protection);
5356           kve->kve_flags |= (e->etype & UVM_ET_COPYONWRITE)
5357               ? KVME_FLAG_COW : 0;
5358           kve->kve_flags |= (e->etype & UVM_ET_NEEDSCOPY)
5359               ? KVME_FLAG_NEEDS_COPY : 0;
5360           kve->kve_flags |= (m->flags & VM_MAP_TOPDOWN)
5361               ? KVME_FLAG_GROWS_DOWN : KVME_FLAG_GROWS_UP;
5362           kve->kve_flags |= (m->flags & VM_MAP_PAGEABLE)
5363               ? KVME_FLAG_PAGEABLE : 0;
5364 #endif
5365           return 0;
5366 }
5367 
5368 static int
fill_vmentries(struct lwp * l,pid_t pid,u_int elem_size,void * oldp,size_t * oldlenp)5369 fill_vmentries(struct lwp *l, pid_t pid, u_int elem_size, void *oldp,
5370     size_t *oldlenp)
5371 {
5372           int error;
5373           struct proc *p;
5374           struct kinfo_vmentry *vme;
5375           struct vmspace *vm;
5376           struct vm_map *map;
5377           struct vm_map_entry *entry;
5378           char *dp;
5379           size_t count, vmesize;
5380 
5381           if (elem_size == 0 || elem_size > 2 * sizeof(*vme))
5382                     return EINVAL;
5383 
5384           if (oldp) {
5385                     if (*oldlenp > 10UL * 1024UL * 1024UL)
5386                               return E2BIG;
5387                     count = *oldlenp / elem_size;
5388                     if (count == 0)
5389                               return ENOMEM;
5390                     vmesize = count * sizeof(*vme);
5391           } else
5392                     vmesize = 0;
5393 
5394           if ((error = proc_find_locked(l, &p, pid)) != 0)
5395                     return error;
5396 
5397           vme = NULL;
5398           count = 0;
5399 
5400           if ((error = proc_vmspace_getref(p, &vm)) != 0)
5401                     goto out;
5402 
5403           map = &vm->vm_map;
5404           vm_map_lock_read(map);
5405 
5406           dp = oldp;
5407           if (oldp)
5408                     vme = kmem_alloc(vmesize, KM_SLEEP);
5409           for (entry = map->header.next; entry != &map->header;
5410               entry = entry->next) {
5411                     if (oldp && (dp - (char *)oldp) < vmesize) {
5412                               error = fill_vmentry(l, p, &vme[count], map, entry);
5413                               if (error)
5414                                         goto out;
5415                               dp += elem_size;
5416                     }
5417                     count++;
5418           }
5419           vm_map_unlock_read(map);
5420           uvmspace_free(vm);
5421 
5422 out:
5423           if (pid != -1)
5424                     mutex_exit(p->p_lock);
5425           if (error == 0) {
5426                     const u_int esize = uimin(sizeof(*vme), elem_size);
5427                     dp = oldp;
5428                     for (size_t i = 0; i < count; i++) {
5429                               if (oldp && (dp - (char *)oldp) < vmesize) {
5430                                         error = sysctl_copyout(l, &vme[i], dp, esize);
5431                                         if (error)
5432                                                   break;
5433                                         dp += elem_size;
5434                               } else
5435                                         break;
5436                     }
5437                     count *= elem_size;
5438                     if (oldp != NULL && *oldlenp < count)
5439                               error = ENOSPC;
5440                     *oldlenp = count;
5441           }
5442           if (vme)
5443                     kmem_free(vme, vmesize);
5444           return error;
5445 }
5446 
5447 static int
sysctl_vmproc(SYSCTLFN_ARGS)5448 sysctl_vmproc(SYSCTLFN_ARGS)
5449 {
5450           int error;
5451 
5452           if (namelen == 1 && name[0] == CTL_QUERY)
5453                     return (sysctl_query(SYSCTLFN_CALL(rnode)));
5454 
5455           if (namelen == 0)
5456                     return EINVAL;
5457 
5458           switch (name[0]) {
5459           case VM_PROC_MAP:
5460                     if (namelen != 3)
5461                               return EINVAL;
5462                     sysctl_unlock();
5463                     error = fill_vmentries(l, name[1], name[2], oldp, oldlenp);
5464                     sysctl_relock();
5465                     return error;
5466           default:
5467                     return EINVAL;
5468           }
5469 }
5470 
5471 SYSCTL_SETUP(sysctl_uvmmap_setup, "sysctl uvmmap setup")
5472 {
5473 
5474           sysctl_createv(clog, 0, NULL, NULL,
5475                            CTLFLAG_PERMANENT,
5476                            CTLTYPE_STRUCT, "proc",
5477                            SYSCTL_DESCR("Process vm information"),
5478                            sysctl_vmproc, 0, NULL, 0,
5479                            CTL_VM, VM_PROC, CTL_EOL);
5480 #ifndef __USER_VA0_IS_SAFE
5481         sysctl_createv(clog, 0, NULL, NULL,
5482                        CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
5483                        CTLTYPE_INT, "user_va0_disable",
5484                        SYSCTL_DESCR("Disable VA 0"),
5485                        sysctl_user_va0_disable, 0, &user_va0_disable, 0,
5486                        CTL_VM, CTL_CREATE, CTL_EOL);
5487 #endif
5488 }
5489