1 /*        $NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $ */
2 
3 /*
4  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Andrew Doran, and by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 2007 Manuel Bouyer.
34  *
35  * Redistribution and use in source and binary forms, with or without
36  * modification, are permitted provided that the following conditions
37  * are met:
38  * 1. Redistributions of source code must retain the above copyright
39  *    notice, this list of conditions and the following disclaimer.
40  * 2. Redistributions in binary form must reproduce the above copyright
41  *    notice, this list of conditions and the following disclaimer in the
42  *    documentation and/or other materials provided with the distribution.
43  *
44  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54  */
55 
56 /*
57  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58  *
59  * Permission to use, copy, modify, and distribute this software for any
60  * purpose with or without fee is hereby granted, provided that the above
61  * copyright notice and this permission notice appear in all copies.
62  *
63  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70  */
71 
72 /*
73  * Copyright 2001 (c) Wasabi Systems, Inc.
74  * All rights reserved.
75  *
76  * Written by Frank van der Linden for Wasabi Systems, Inc.
77  *
78  * Redistribution and use in source and binary forms, with or without
79  * modification, are permitted provided that the following conditions
80  * are met:
81  * 1. Redistributions of source code must retain the above copyright
82  *    notice, this list of conditions and the following disclaimer.
83  * 2. Redistributions in binary form must reproduce the above copyright
84  *    notice, this list of conditions and the following disclaimer in the
85  *    documentation and/or other materials provided with the distribution.
86  * 3. All advertising materials mentioning features or use of this software
87  *    must display the following acknowledgement:
88  *      This product includes software developed for the NetBSD Project by
89  *      Wasabi Systems, Inc.
90  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91  *    or promote products derived from this software without specific prior
92  *    written permission.
93  *
94  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
98  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104  * POSSIBILITY OF SUCH DAMAGE.
105  */
106 
107 /*
108  * Copyright (c) 1997 Charles D. Cranor and Washington University.
109  * All rights reserved.
110  *
111  * Redistribution and use in source and binary forms, with or without
112  * modification, are permitted provided that the following conditions
113  * are met:
114  * 1. Redistributions of source code must retain the above copyright
115  *    notice, this list of conditions and the following disclaimer.
116  * 2. Redistributions in binary form must reproduce the above copyright
117  *    notice, this list of conditions and the following disclaimer in the
118  *    documentation and/or other materials provided with the distribution.
119  *
120  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130  */
131 
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $");
134 
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 #include "opt_efi.h"
142 
143 #define   __MUTEX_PRIVATE     /* for assertions */
144 
145 #include <sys/param.h>
146 #include <sys/systm.h>
147 #include <sys/proc.h>
148 #include <sys/pool.h>
149 #include <sys/kernel.h>
150 #include <sys/atomic.h>
151 #include <sys/cpu.h>
152 #include <sys/intr.h>
153 #include <sys/xcall.h>
154 #include <sys/kcore.h>
155 #include <sys/kmem.h>
156 #include <sys/asan.h>
157 #include <sys/msan.h>
158 #include <sys/entropy.h>
159 
160 #include <uvm/uvm.h>
161 #include <uvm/pmap/pmap_pvt.h>
162 
163 #include <dev/isa/isareg.h>
164 
165 #include <machine/specialreg.h>
166 #include <machine/gdt.h>
167 #include <machine/isa_machdep.h>
168 #include <machine/cpuvar.h>
169 #include <machine/cputypes.h>
170 #include <machine/pmap_private.h>
171 
172 #include <x86/bootspace.h>
173 #include <x86/pat.h>
174 #include <x86/pmap_pv.h>
175 
176 #include <x86/i82489reg.h>
177 #include <x86/i82489var.h>
178 
179 #ifdef XEN
180 #include <xen/include/public/xen.h>
181 #include <xen/hypervisor.h>
182 #include <xen/xenpmap.h>
183 #endif
184 
185 #ifdef __HAVE_DIRECT_MAP
186 #include <crypto/nist_hash_drbg/nist_hash_drbg.h>
187 #endif
188 
189 /*
190  * general info:
191  *
192  *  - for an explanation of how the x86 MMU hardware works see
193  *    the comments in <machine/pte.h>.
194  *
195  *  - for an explanation of the general memory structure used by
196  *    this pmap (including the recursive mapping), see the comments
197  *    in <machine/pmap.h>.
198  *
199  * this file contains the code for the "pmap module."   the module's
200  * job is to manage the hardware's virtual to physical address mappings.
201  * note that there are two levels of mapping in the VM system:
202  *
203  *  [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
204  *      to map ranges of virtual address space to objects/files.  for
205  *      example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
206  *      to the file /bin/ls starting at offset zero."   note that
207  *      the upper layer mapping is not concerned with how individual
208  *      vm_pages are mapped.
209  *
210  *  [2] the lower layer of the VM system (the pmap) maintains the mappings
211  *      from virtual addresses to physical addresses.   it is concerned
212  *      with which vm_page is mapped where.   for example, when you run
213  *      /bin/ls and start at page 0x1000 the fault routine may lookup the
214  *      correct page of the /bin/ls file and then ask the pmap layer to
215  *      establish a mapping for it.
216  *
217  * note that information in the lower layer of the VM system can be
218  * thrown away since it can easily be reconstructed from the info
219  * in the upper layer.
220  *
221  * data structures we use include:
222  *
223  *  - struct pmap: describes the address space of one thread
224  *  - struct pmap_page: describes one pv-tracked page, without
225  *    necessarily a corresponding vm_page
226  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
227  *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
228  *    physical memory.   the pp_pvlist points to a list of pv_entry
229  *    structures which describe all the <PMAP,VA> pairs that this
230  *    page is mapped in.    this is critical for page based operations
231  *    such as pmap_page_protect() [change protection on _all_ mappings
232  *    of a page]
233  */
234 
235 /*
236  * Locking
237  *
238  * We have the following locks that we must deal with, listed in the order
239  * that they are acquired:
240  *
241  * pg->uobject->vmobjlock, pg->uanon->an_lock
242  *
243  *        For managed pages, these per-object locks are taken by the VM system
244  *        before calling into the pmap module - either a read or write hold.
245  *        The lock hold prevents pages from changing identity while the pmap is
246  *        operating on them.  For example, the same lock is held across a call
247  *        to pmap_remove() and the following call to pmap_update(), so that a
248  *        page does not gain a new identity while its TLB visibility is stale.
249  *
250  * pmap->pm_lock
251  *
252  *        This lock protects the fields in the pmap structure including the
253  *        non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
254  *        structures.  For modifying unmanaged kernel PTEs it is not needed as
255  *        kernel PDEs are never freed, and the kernel is expected to be self
256  *        consistent (and the lock can't be taken for unmanaged kernel PTEs,
257  *        because they can be modified from interrupt context).
258  *
259  * pmaps_lock
260  *
261  *        This lock protects the list of active pmaps (headed by "pmaps").
262  *        It's acquired when adding or removing pmaps or adjusting kernel PDEs.
263  *
264  * pp_lock
265  *
266  *        This per-page lock protects PV entry lists and the embedded PV entry
267  *        in each vm_page, allowing for concurrent operation on pages by
268  *        different pmaps.  This is a spin mutex at IPL_VM, because at the
269  *        points it is taken context switching is usually not tolerable, and
270  *        spin mutexes must block out interrupts that could take kernel_lock.
271  */
272 
273 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
274 #ifdef DIAGNOSTIC
275 #define   PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
276 #define   PMAP_DUMMY_UNLOCK(pm)         rw_exit(&(pm)->pm_dummy_lock)
277 #else
278 #define   PMAP_DUMMY_LOCK(pm)
279 #define   PMAP_DUMMY_UNLOCK(pm)
280 #endif
281 
282 static const struct uvm_pagerops pmap_pager = {
283           /* nothing */
284 };
285 
286 /*
287  * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X)
288  */
289 #define pl_i(va, lvl) \
290         (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1])
291 
292 #define   pl_i_roundup(va, lvl)         pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl))
293 
294 /*
295  * PTP macros:
296  *   a PTP's index is the PD index of the PDE that points to it
297  *   a PTP's offset is the byte-offset in the PTE space that this PTP is at
298  *   a PTP's VA is the first VA mapped by that PTP
299  */
300 
301 #define ptp_va2o(va, lvl)     (pl_i(va, (lvl)+1) * PAGE_SIZE)
302 
303 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
304 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
305 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
306 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
307 const long nbpd[] = NBPD_INITIALIZER;
308 #ifdef i386
309 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
310 #else
311 pd_entry_t *normal_pdes[3];
312 #endif
313 
314 long nkptp[] = NKPTP_INITIALIZER;
315 
316 struct pmap_head pmaps;
317 kmutex_t pmaps_lock __cacheline_aligned;
318 
319 struct pcpu_area *pcpuarea __read_mostly;
320 
321 static vaddr_t pmap_maxkvaddr;
322 
323 /*
324  * Misc. event counters.
325  */
326 struct evcnt pmap_iobmp_evcnt;
327 struct evcnt pmap_ldt_evcnt;
328 
329 /*
330  * PAT
331  */
332 static bool cpu_pat_enabled __read_mostly = false;
333 
334 /*
335  * Global data structures
336  */
337 
338 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
339 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
340 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
341 
342 struct bootspace bootspace __read_mostly;
343 struct slotspace slotspace __read_mostly;
344 
345 /* Set to PTE_NX if supported. */
346 pd_entry_t pmap_pg_nx __read_mostly = 0;
347 
348 /* Set to PTE_G if supported. */
349 pd_entry_t pmap_pg_g __read_mostly = 0;
350 
351 /* Set to true if large pages are supported. */
352 int pmap_largepages __read_mostly = 0;
353 
354 paddr_t lowmem_rsvd __read_mostly;
355 paddr_t avail_start __read_mostly; /* PA of first available physical page */
356 paddr_t avail_end __read_mostly; /* PA of last available physical page */
357 
358 #ifdef XENPV
359 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
360 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
361 #endif
362 
363 #define   VM_PAGE_TO_PP(pg)   (&(pg)->mdpage.mp_pp)
364 #define   PMAP_CHECK_PP(pp) \
365     KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
366 
367 #define PAGE_ALIGNED(pp)      \
368           __builtin_assume_aligned((void *)(pp), PAGE_SIZE)
369 
370 /*
371  * Other data structures
372  */
373 
374 static pt_entry_t protection_codes[8] __read_mostly;
375 
376 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
377 
378 /*
379  * The following two vaddr_t's are used during system startup to keep track of
380  * how much of the kernel's VM space we have used. Once the system is started,
381  * the management of the remaining kernel VM space is turned over to the
382  * kernel_map vm_map.
383  */
384 static vaddr_t virtual_avail __read_mostly;       /* VA of first free KVA */
385 static vaddr_t virtual_end __read_mostly;         /* VA of last free KVA */
386 
387 #ifndef XENPV
388 /*
389  * LAPIC virtual address, and fake physical address.
390  */
391 volatile vaddr_t local_apic_va __read_mostly;
392 paddr_t local_apic_pa __read_mostly;
393 #endif
394 
395 /*
396  * pool that pmap structures are allocated from
397  */
398 struct pool_cache pmap_cache;
399 static int  pmap_ctor(void *, void *, int);
400 static void pmap_dtor(void *, void *);
401 
402 /*
403  * pv_page cache
404  */
405 static struct pool_cache pmap_pvp_cache;
406 
407 #ifdef __HAVE_DIRECT_MAP
408 vaddr_t pmap_direct_base __read_mostly;
409 vaddr_t pmap_direct_end __read_mostly;
410 #endif
411 
412 #ifndef __HAVE_DIRECT_MAP
413 /*
414  * Special VAs and the PTEs that map them
415  */
416 static pt_entry_t *early_zero_pte;
417 static void pmap_vpage_cpualloc(struct cpu_info *);
418 #ifdef XENPV
419 char *early_zerop; /* also referenced from xen_locore() */
420 #else
421 static char *early_zerop;
422 #endif
423 #endif
424 
425 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
426 
427 /* PDP pool and its callbacks */
428 static struct pool pmap_pdp_pool;
429 static void pmap_pdp_init(pd_entry_t *);
430 static void pmap_pdp_fini(pd_entry_t *);
431 
432 #ifdef PAE
433 /* need to allocate items of 4 pages */
434 static void *pmap_pdp_alloc(struct pool *, int);
435 static void pmap_pdp_free(struct pool *, void *);
436 static struct pool_allocator pmap_pdp_allocator = {
437           .pa_alloc = pmap_pdp_alloc,
438           .pa_free = pmap_pdp_free,
439           .pa_pagesz = PAGE_SIZE * PDP_SIZE,
440 };
441 #endif
442 
443 extern vaddr_t idt_vaddr;
444 extern paddr_t idt_paddr;
445 extern vaddr_t gdt_vaddr;
446 extern paddr_t gdt_paddr;
447 extern vaddr_t ldt_vaddr;
448 extern paddr_t ldt_paddr;
449 
450 #ifdef i386
451 /* stuff to fix the pentium f00f bug */
452 extern vaddr_t pentium_idt_vaddr;
453 #endif
454 
/*
 * Array of freshly allocated PTPs, for pmap_get_ptp().
 *
 * Both arrays have PTP_LEVELS + 1 slots.
 * NOTE(review): presumably indexed by page table level, with slot 0
 * unused -- confirm against pmap_get_ptp()/pmap_install_ptp().
 */
struct pmap_ptparray {
	struct vm_page *pg[PTP_LEVELS + 1];	/* page for each slot */
	bool alloced[PTP_LEVELS + 1];		/* per-slot flag; presumably
						 * "pg[i] is newly allocated"
						 * -- TODO confirm */
};
460 
461 /*
462  * PV entries are allocated in page-sized chunks and cached per-pmap to
463  * avoid intense pressure on memory allocators.
464  */
465 
/*
 * Bookkeeping for one chunk of PV entries.
 *
 * NOTE(review): PVE_PER_PVP reserves one pv_entry-sized slot per page,
 * presumably so this header can live at the start of the page it
 * describes -- confirm against pmap_pvp_ctor().
 */
struct pv_page {
	LIST_HEAD(, pv_entry)	pvp_pves;	/* list of pv_entry items;
						 * presumably the free ones */
	LIST_ENTRY(pv_page)	pvp_list;	/* linkage on a list of
						 * pv_pages */
	long			pvp_nfree;	/* presumably number of free
						 * entries -- TODO confirm */
	struct pmap		*pvp_pmap;	/* associated pmap */
};
472 
473 #define   PVE_PER_PVP         ((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
474 
475 /*
476  * PV tree prototypes
477  */
478 
479 static int          pmap_compare_key(void *, const void *, const void *);
480 static int          pmap_compare_nodes(void *, const void *, const void *);
481 
/*
 * Red-black tree operations for trees of pv_entry, linked through
 * pve_rb.  No context pointer is passed to the comparators.
 */
static const rb_tree_ops_t pmap_rbtree_ops = {
	.rbto_compare_nodes = pmap_compare_nodes,
	.rbto_compare_key = pmap_compare_key,
	.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
	.rbto_context = NULL
};
489 
490 /*
491  * Local prototypes
492  */
493 
494 #ifdef __HAVE_PCPU_AREA
495 static void pmap_init_pcpu(void);
496 #endif
497 #ifdef __HAVE_DIRECT_MAP
498 static void pmap_init_directmap(struct pmap *);
499 #endif
500 #if !defined(XENPV)
501 static void pmap_remap_global(void);
502 #endif
503 #ifndef XENPV
504 static void pmap_init_lapic(void);
505 static void pmap_remap_largepages(void);
506 #endif
507 
508 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
509     struct vm_page **);
510 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
511 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
512     pd_entry_t * const *);
513 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
514 static void pmap_freepage(struct pmap *, struct vm_page *, int);
515 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
516     pt_entry_t *, pd_entry_t * const *);
517 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
518     vaddr_t);
519 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
520     vaddr_t);
521 static int pmap_pvp_ctor(void *, void *, int);
522 static void pmap_pvp_dtor(void *, void *);
523 static struct pv_entry *pmap_alloc_pv(struct pmap *);
524 static void pmap_free_pv(struct pmap *, struct pv_entry *);
525 static void pmap_drain_pv(struct pmap *);
526 
527 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
528 
529 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
530 static void pmap_reactivate(struct pmap *);
531 
532 long
pmap_resident_count(struct pmap * pmap)533 pmap_resident_count(struct pmap *pmap)
534 {
535 
536           return pmap->pm_stats.resident_count;
537 }
538 
539 long
pmap_wired_count(struct pmap * pmap)540 pmap_wired_count(struct pmap *pmap)
541 {
542 
543           return pmap->pm_stats.wired_count;
544 }
545 
546 /*
547  * p m a p   h e l p e r   f u n c t i o n s
548  */
549 
550 static inline void
pmap_stats_update(struct pmap * pmap,int resid_diff,int wired_diff)551 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
552 {
553 
554           KASSERT(cold || mutex_owned(&pmap->pm_lock));
555           pmap->pm_stats.resident_count += resid_diff;
556           pmap->pm_stats.wired_count += wired_diff;
557 }
558 
559 static inline void
pmap_stats_update_bypte(struct pmap * pmap,pt_entry_t npte,pt_entry_t opte)560 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
561 {
562           int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
563           int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
564 
565           KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
566           KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
567 
568           pmap_stats_update(pmap, resid_diff, wired_diff);
569 }
570 
571 /*
572  * ptp_to_pmap: lookup pmap by ptp
573  */
574 static inline struct pmap *
ptp_to_pmap(struct vm_page * ptp)575 ptp_to_pmap(struct vm_page *ptp)
576 {
577           struct pmap *pmap;
578 
579           if (ptp == NULL) {
580                     return pmap_kernel();
581           }
582           pmap = (struct pmap *)ptp->uobject;
583           KASSERT(pmap != NULL);
584           KASSERT(&pmap->pm_obj[0] == ptp->uobject);
585           return pmap;
586 }
587 
588 static inline struct pv_pte *
pve_to_pvpte(struct pv_entry * pve)589 pve_to_pvpte(struct pv_entry *pve)
590 {
591 
592           if (pve == NULL)
593                     return NULL;
594           KASSERT((void *)&pve->pve_pte == (void *)pve);
595           return &pve->pve_pte;
596 }
597 
/*
 * pvpte_to_pve: inverse of pve_to_pvpte(); reinterpret a pv_pte pointer
 * as a pointer to the pv_entry that contains it.
 *
 * => asserts that converting back yields the original pointer.
 */
static inline struct pv_entry *
pvpte_to_pve(struct pv_pte *pvpte)
{
	struct pv_entry *entry;

	entry = (void *)pvpte;
	KASSERT(pve_to_pvpte(entry) == pvpte);

	return entry;
}
606 
607 /*
608  * Return true if the pmap page has an embedded PV entry.
609  */
610 static inline bool
pv_pte_embedded(struct pmap_page * pp)611 pv_pte_embedded(struct pmap_page *pp)
612 {
613 
614           KASSERT(mutex_owned(&pp->pp_lock));
615           return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
616 }
617 
618 /*
619  * pv_pte_first, pv_pte_next: PV list iterator.
620  */
621 static inline struct pv_pte *
pv_pte_first(struct pmap_page * pp)622 pv_pte_first(struct pmap_page *pp)
623 {
624 
625           KASSERT(mutex_owned(&pp->pp_lock));
626           if (pv_pte_embedded(pp)) {
627                     return &pp->pp_pte;
628           }
629           return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
630 }
631 
632 static inline struct pv_pte *
pv_pte_next(struct pmap_page * pp,struct pv_pte * pvpte)633 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
634 {
635 
636           KASSERT(mutex_owned(&pp->pp_lock));
637           KASSERT(pvpte != NULL);
638           if (pvpte == &pp->pp_pte) {
639                     return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
640           }
641           return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
642 }
643 
644 static inline uint8_t
pmap_pte_to_pp_attrs(pt_entry_t pte)645 pmap_pte_to_pp_attrs(pt_entry_t pte)
646 {
647           uint8_t ret = 0;
648           if (pte & PTE_D)
649                     ret |= PP_ATTRS_D;
650           if (pte & PTE_A)
651                     ret |= PP_ATTRS_A;
652           if (pte & PTE_W)
653                     ret |= PP_ATTRS_W;
654           return ret;
655 }
656 
657 static inline pt_entry_t
pmap_pp_attrs_to_pte(uint8_t attrs)658 pmap_pp_attrs_to_pte(uint8_t attrs)
659 {
660           pt_entry_t pte = 0;
661           if (attrs & PP_ATTRS_D)
662                     pte |= PTE_D;
663           if (attrs & PP_ATTRS_A)
664                     pte |= PTE_A;
665           if (attrs & PP_ATTRS_W)
666                     pte |= PTE_W;
667           return pte;
668 }
669 
670 /*
671  * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
672  * of course the kernel is always loaded
673  */
674 bool
pmap_is_curpmap(struct pmap * pmap)675 pmap_is_curpmap(struct pmap *pmap)
676 {
677           return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
678 }
679 
680 inline void
pmap_reference(struct pmap * pmap)681 pmap_reference(struct pmap *pmap)
682 {
683 
684           atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
685 }
686 
687 /*
688  * rbtree: compare two nodes.
689  */
690 static int
pmap_compare_nodes(void * context,const void * n1,const void * n2)691 pmap_compare_nodes(void *context, const void *n1, const void *n2)
692 {
693           const struct pv_entry *pve1 = n1;
694           const struct pv_entry *pve2 = n2;
695 
696           KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
697 
698           if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
699                     return -1;
700           }
701           if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
702                     return 1;
703           }
704           return 0;
705 }
706 
707 /*
708  * rbtree: compare a node and a key.
709  */
710 static int
pmap_compare_key(void * context,const void * n,const void * k)711 pmap_compare_key(void *context, const void *n, const void *k)
712 {
713           const struct pv_entry *pve = n;
714           const vaddr_t key = (vaddr_t)k;
715 
716           if (pve->pve_pte.pte_va < key) {
717                     return -1;
718           }
719           if (pve->pve_pte.pte_va > key) {
720                     return 1;
721           }
722           return 0;
723 }
724 
725 /*
726  * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
727  */
728 static inline void
pmap_ptp_range_set(struct vm_page * ptp,vaddr_t va)729 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
730 {
731           vaddr_t *min = (vaddr_t *)&ptp->uanon;
732 
733           if (va < *min) {
734                     *min = va;
735           }
736 }
737 
738 /*
739  * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
740  */
741 static inline void
pmap_ptp_range_clip(struct vm_page * ptp,vaddr_t * startva,pt_entry_t ** pte)742 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
743 {
744           vaddr_t sclip;
745 
746           if (ptp == NULL) {
747                     return;
748           }
749 
750           sclip = (vaddr_t)ptp->uanon;
751           sclip = (*startva < sclip ? sclip : *startva);
752           *pte += (sclip - *startva) / PAGE_SIZE;
753           *startva = sclip;
754 }
755 
756 /*
757  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
758  *
759  * there are several pmaps involved.  some or all of them might be same.
760  *
761  *        - the pmap given by the first argument
762  *                  our caller wants to access this pmap's PTEs.
763  *
764  *        - pmap_kernel()
765  *                  the kernel pmap.  note that it only contains the kernel part
766  *                  of the address space which is shared by any pmap.  ie. any
767  *                  pmap can be used instead of pmap_kernel() for our purpose.
768  *
769  *        - ci->ci_pmap
770  *                  pmap currently loaded on the cpu.
771  *
772  *        - vm_map_pmap(&curproc->p_vmspace->vm_map)
773  *                  current process' pmap.
774  *
775  * => caller must lock pmap first (if not the kernel pmap)
776  * => must be undone with pmap_unmap_ptes before returning
777  * => disables kernel preemption
778  */
779 void
pmap_map_ptes(struct pmap * pmap,struct pmap ** pmap2,pd_entry_t ** ptepp,pd_entry_t * const ** pdeppp)780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
781     pd_entry_t * const **pdeppp)
782 {
783           struct pmap *curpmap;
784           struct cpu_info *ci;
785           lwp_t *l;
786 
787           kpreempt_disable();
788 
789           /* The kernel's pmap is always accessible. */
790           if (pmap == pmap_kernel()) {
791                     *pmap2 = NULL;
792                     *ptepp = PTE_BASE;
793                     *pdeppp = normal_pdes;
794                     return;
795           }
796 
797           KASSERT(mutex_owned(&pmap->pm_lock));
798 
799           l = curlwp;
800           ci = l->l_cpu;
801           curpmap = ci->ci_pmap;
802           if (pmap == curpmap) {
803                     /*
804                      * Already on the CPU: make it valid.  This is very
805                      * often the case during exit(), when we have switched
806                      * to the kernel pmap in order to destroy a user pmap.
807                      */
808                     if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
809                               pmap_reactivate(pmap);
810                     }
811                     *pmap2 = NULL;
812           } else {
813                     /*
814                      * Toss current pmap from CPU and install new pmap, but keep
815                      * a reference to the old one.  Dropping the reference can
816                      * can block as it needs to take locks, so defer that to
817                      * pmap_unmap_ptes().
818                      */
819                     pmap_reference(pmap);
820                     pmap_load1(l, pmap, curpmap);
821                     *pmap2 = curpmap;
822           }
823           KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
824 #ifdef DIAGNOSTIC
825           pmap->pm_pctr = lwp_pctr();
826 #endif
827           *ptepp = PTE_BASE;
828 
829 #if defined(XENPV) && defined(__x86_64__)
830           KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
831           ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
832           *pdeppp = ci->ci_normal_pdes;
833 #else
834           *pdeppp = normal_pdes;
835 #endif
836 }
837 
838 /*
839  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
840  *
841  * => we cannot tolerate context switches while mapped in: assert this.
842  * => reenables kernel preemption.
843  * => does not unlock pmap.
844  */
845 void
pmap_unmap_ptes(struct pmap * pmap,struct pmap * pmap2)846 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
847 {
848           struct cpu_info *ci;
849           struct pmap *mypmap;
850           struct lwp *l;
851 
852           KASSERT(kpreempt_disabled());
853 
854           /* The kernel's pmap is always accessible. */
855           if (pmap == pmap_kernel()) {
856                     kpreempt_enable();
857                     return;
858           }
859 
860           l = curlwp;
861           ci = l->l_cpu;
862 
863           KASSERT(mutex_owned(&pmap->pm_lock));
864           KASSERT(pmap->pm_pctr == lwp_pctr());
865 
866 #if defined(XENPV) && defined(__x86_64__)
867           KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
868           ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
869 #endif
870 
871           /* If not our own pmap, mark whatever's on the CPU now as lazy. */
872           KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
873           mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
874           if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) {
875                     ci->ci_want_pmapload = 0;
876           } else {
877                     ci->ci_want_pmapload = (mypmap != pmap_kernel());
878                     ci->ci_tlbstate = TLBSTATE_LAZY;
879           }
880 
881           /* Now safe to re-enable preemption. */
882           kpreempt_enable();
883 
884           /* Toss reference to other pmap taken earlier. */
885           if (pmap2 != NULL) {
886                     pmap_destroy(pmap2);
887           }
888 }
889 
890 inline static void
pmap_exec_account(struct pmap * pm,vaddr_t va,pt_entry_t opte,pt_entry_t npte)891 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
892 {
893 
894 #if !defined(__x86_64__)
895           if (curproc == NULL || curproc->p_vmspace == NULL ||
896               pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
897                     return;
898 
899           if ((opte ^ npte) & PTE_X)
900                     pmap_update_pg(va);
901 
902           /*
903            * Executability was removed on the last executable change.
904            * Reset the code segment to something conservative and
905            * let the trap handler deal with setting the right limit.
906            * We can't do that because of locking constraints on the vm map.
907            */
908 
909           if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
910                     struct trapframe *tf = curlwp->l_md.md_regs;
911 
912                     tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
913                     pm->pm_hiexec = I386_MAX_EXE_ADDR;
914           }
915 #endif /* !defined(__x86_64__) */
916 }
917 
#if !defined(__x86_64__)
/*
 * pmap_exec_fixup: fix up the code segment so that it covers all potential
 * executable mappings of "map".
 *
 * => walks the map entries under the map read lock to find the last page
 *    of the last executable entry
 * => returns 1 if the code segment in "tf" was switched to GUCODEBIG_SEL
 * => returns 0 if nothing had to change, or if the regular GUCODE_SEL
 *    segment was installed
 * => "pcb" is not used
 */
int
pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
{
	struct pmap *pm = vm_map_pmap(map);
	struct vm_map_entry *entry;
	vaddr_t hiexec = 0;

	vm_map_lock_read(map);
	entry = map->header.next;
	while (entry != &map->header) {
		/*
		 * Each entry has a greater va than the ones before it, so
		 * the last executable entry seen wins.  Record its last
		 * page, not the address past it.
		 */
		if ((entry->protection & VM_PROT_EXECUTE) != 0) {
			hiexec = trunc_page(entry->end) - PAGE_SIZE;
		}
		entry = entry->next;
	}
	vm_map_unlock_read(map);

	/* Already up to date, with the big segment installed. */
	if (hiexec == pm->pm_hiexec &&
	    tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
		return 0;

	pm->pm_hiexec = hiexec;
	if (pm->pm_hiexec <= I386_MAX_EXE_ADDR) {
		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
		return 0;
	}

	tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
	return 1;
}
#endif /* !defined(__x86_64__) */
953 
954 void
pat_init(struct cpu_info * ci)955 pat_init(struct cpu_info *ci)
956 {
957 #ifndef XENPV
958           uint64_t pat;
959 
960           if (!(ci->ci_feat_val[0] & CPUID_PAT))
961                     return;
962 
963           /* We change WT to WC. Leave all other entries the default values. */
964           pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
965                 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
966                 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
967                 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
968 
969           wrmsr(MSR_CR_PAT, pat);
970           cpu_pat_enabled = true;
971 #endif
972 }
973 
974 static pt_entry_t
pmap_pat_flags(u_int flags)975 pmap_pat_flags(u_int flags)
976 {
977           u_int cacheflags = (flags & PMAP_CACHE_MASK);
978 
979           if (!cpu_pat_enabled) {
980                     switch (cacheflags) {
981                     case PMAP_NOCACHE:
982                     case PMAP_NOCACHE_OVR:
983                               /* results in PGC_UCMINUS on cpus which have
984                                * the cpuid PAT but PAT "disabled"
985                                */
986                               return PTE_PCD;
987                     default:
988                               return 0;
989                     }
990           }
991 
992           switch (cacheflags) {
993           case PMAP_NOCACHE:
994                     return PGC_UC;
995           case PMAP_WRITE_COMBINE:
996                     return PGC_WC;
997           case PMAP_WRITE_BACK:
998                     return PGC_WB;
999           case PMAP_NOCACHE_OVR:
1000                     return PGC_UCMINUS;
1001           }
1002 
1003           return 0;
1004 }
1005 
1006 /*
1007  * p m a p   k e n t e r   f u n c t i o n s
1008  *
1009  * functions to quickly enter/remove pages from the kernel address
1010  * space.   pmap_kremove is exported to MI kernel.  we make use of
1011  * the recursive PTE mappings.
1012  */
1013 
1014 /*
1015  * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1016  *
1017  * => no need to lock anything, assume va is already allocated
1018  * => should be faster than normal pmap enter function
1019  */
1020 void
pmap_kenter_pa(vaddr_t va,paddr_t pa,vm_prot_t prot,u_int flags)1021 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1022 {
1023           pt_entry_t *pte, opte, npte;
1024 
1025           KASSERT(!(prot & ~VM_PROT_ALL));
1026 
1027           if (va < VM_MIN_KERNEL_ADDRESS)
1028                     pte = vtopte(va);
1029           else
1030                     pte = kvtopte(va);
1031 #if defined(XENPV) && defined(DOM0OPS)
1032           if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1033 #ifdef DEBUG
1034                     printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
1035                         " outside range\n", __func__, pa, va);
1036 #endif /* DEBUG */
1037                     npte = pa;
1038           } else
1039 #endif /* XENPV && DOM0OPS */
1040                     npte = pmap_pa2pte(pa);
1041           npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
1042           npte |= pmap_pat_flags(flags);
1043           opte = pmap_pte_testset(pte, npte); /* zap! */
1044 
1045           /*
1046            * XXX: make sure we are not dealing with a large page, since the only
1047            * large pages created are for the kernel image, and they should never
1048            * be kentered.
1049            */
1050           KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1051 
1052           if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1053                     /* This should not happen. */
1054                     printf_nolog("%s: mapping already present\n", __func__);
1055                     kpreempt_disable();
1056                     pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1057                     kpreempt_enable();
1058           }
1059 }
1060 
1061 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1062 
1063 #if defined(__x86_64__)
1064 /*
1065  * Change protection for a virtual address. Local for a CPU only, don't
1066  * care about TLB shootdowns.
1067  *
1068  * => must be called with preemption disabled
1069  */
1070 void
pmap_changeprot_local(vaddr_t va,vm_prot_t prot)1071 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1072 {
1073           pt_entry_t *pte, opte, npte;
1074 
1075           KASSERT(kpreempt_disabled());
1076 
1077           if (va < VM_MIN_KERNEL_ADDRESS)
1078                     pte = vtopte(va);
1079           else
1080                     pte = kvtopte(va);
1081 
1082           npte = opte = *pte;
1083 
1084           if ((prot & VM_PROT_WRITE) != 0)
1085                     npte |= PTE_W;
1086           else
1087                     npte &= ~(PTE_W|PTE_D);
1088 
1089           if (opte != npte) {
1090                     pmap_pte_set(pte, npte);
1091                     pmap_pte_flush();
1092                     invlpg(va);
1093           }
1094 }
1095 #endif /* defined(__x86_64__) */
1096 
1097 /*
1098  * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1099  *
1100  * => no need to lock anything
1101  * => caller must dispose of any vm_page mapped in the va range
1102  * => note: not an inline function
1103  * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1104  * => we assume kernel only unmaps valid addresses and thus don't bother
1105  *    checking the valid bit before doing TLB flushing
1106  * => must be followed by call to pmap_update() before reuse of page
1107  */
1108 static void
pmap_kremove1(vaddr_t sva,vsize_t len,bool localonly)1109 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1110 {
1111           pt_entry_t *pte, opte;
1112           vaddr_t va, eva;
1113 
1114           eva = sva + len;
1115 
1116           kpreempt_disable();
1117           for (va = sva; va < eva; va += PAGE_SIZE) {
1118                     pte = kvtopte(va);
1119                     opte = pmap_pte_testset(pte, 0); /* zap! */
1120                     if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1121                               pmap_tlb_shootdown(pmap_kernel(), va, opte,
1122                                   TLBSHOOT_KREMOVE);
1123                     }
1124                     KASSERTMSG((opte & PTE_PS) == 0,
1125                         "va %#" PRIxVADDR " is a large page", va);
1126                     KASSERTMSG((opte & PTE_PVLIST) == 0,
1127                         "va %#" PRIxVADDR " is a pv tracked page", va);
1128           }
1129           if (localonly) {
1130                     tlbflushg();
1131           }
1132           kpreempt_enable();
1133 }
1134 
1135 void
pmap_kremove(vaddr_t sva,vsize_t len)1136 pmap_kremove(vaddr_t sva, vsize_t len)
1137 {
1138 
1139           pmap_kremove1(sva, len, false);
1140 }
1141 
1142 /*
1143  * pmap_kremove_local: like pmap_kremove(), but only worry about
1144  * TLB invalidations on the current CPU.  this is only intended
1145  * for use while writing kernel crash dumps, either after panic
1146  * or via reboot -d.
1147  */
1148 void
pmap_kremove_local(vaddr_t sva,vsize_t len)1149 pmap_kremove_local(vaddr_t sva, vsize_t len)
1150 {
1151 
1152           pmap_kremove1(sva, len, true);
1153 }
1154 
1155 /*
1156  * p m a p   i n i t   f u n c t i o n s
1157  *
1158  * pmap_bootstrap and pmap_init are called during system startup
1159  * to init the pmap module.   pmap_bootstrap() does a low level
1160  * init just to get things rolling.   pmap_init() finishes the job.
1161  */
1162 
1163 /*
1164  * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1165  * This function is to be used before any VM system has been set up.
1166  *
1167  * The va is taken from virtual_avail.
1168  */
1169 static vaddr_t
pmap_bootstrap_valloc(size_t npages)1170 pmap_bootstrap_valloc(size_t npages)
1171 {
1172           vaddr_t va = virtual_avail;
1173           virtual_avail += npages * PAGE_SIZE;
1174           return va;
1175 }
1176 
1177 /*
1178  * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1179  * This function is to be used before any VM system has been set up.
1180  *
1181  * The pa is taken from avail_start.
1182  */
1183 static paddr_t
pmap_bootstrap_palloc(size_t npages)1184 pmap_bootstrap_palloc(size_t npages)
1185 {
1186           paddr_t pa = avail_start;
1187           avail_start += npages * PAGE_SIZE;
1188           return pa;
1189 }
1190 
/*
 * pmap_bootstrap: get the system in a state where it can run with VM properly
 * enabled (called before main()). The VM system is fully init'd later.
 *
 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
 *    kernel, and nkpde PTP's for the kernel.
 * => kva_start is the first free virtual address in kernel space.
 *
 * In order, this function: selects the NX bit, initializes the bootstrap
 * KVA bounds and the protection_codes[] table, initializes the kernel pmap,
 * enables global and large pages where supported (native only), sets up the
 * per-CPU area / direct map / early zero page depending on the
 * configuration, reserves bootstrap VA and PA for the IDT, GDT and LDT (and
 * a few platform-specific pages), reserves dump pages, initializes the
 * global pmap list, flushes the TLB and computes pmap_maxkvaddr.
 */
void
pmap_bootstrap(vaddr_t kva_start)
{
	struct pmap *kpm;
	int i;
	vaddr_t kva;

	/* Use the NX bit in PTEs only if the CPU advertises it. */
	pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);

	/*
	 * Set up our local static global vars that keep track of the usage of
	 * KVM before kernel_map is set up.
	 */
	virtual_avail = kva_start;		/* first free KVA */
	virtual_end = VM_MAX_KERNEL_ADDRESS;	/* last KVA */

	/*
	 * Set up protection_codes: we need to be able to convert from a MI
	 * protection code (some combo of VM_PROT...) to something we can jam
	 * into a x86 PTE.  Every combination without VM_PROT_EXECUTE gets
	 * pmap_pg_nx; every combination with VM_PROT_WRITE gets PTE_W.
	 */
	protection_codes[VM_PROT_NONE] = pmap_pg_nx;
	protection_codes[VM_PROT_EXECUTE] = PTE_X;
	protection_codes[VM_PROT_READ] = pmap_pg_nx;
	protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
	protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
	protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
	protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
	protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;

	/*
	 * Now we init the kernel's pmap.
	 *
	 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
	 * the pm_obj contains the list of active PTPs.
	 */
	kpm = pmap_kernel();
	mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
	rw_init(&kpm->pm_dummy_lock);
	for (i = 0; i < PTP_LEVELS - 1; i++) {
		/* All of the kernel pmap's objects share pm_dummy_lock. */
		uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
		uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
		kpm->pm_ptphint[i] = NULL;
	}
	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */

	/* The page directory is the one recorded in bootspace. */
	kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
	for (i = 0; i < PDP_SIZE; i++)
		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;

	/*
	 * Count everything between VM_MIN_KERNEL_ADDRESS and kva_start as
	 * wired and resident.  This is just a rough estimate and not
	 * critical to the proper operation of the system.
	 */
	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);

	kcpuset_create(&kpm->pm_cpus, true);
	kcpuset_create(&kpm->pm_kernel_cpus, true);

	/* The kernel pmap has no private LDT. */
	kpm->pm_ldt = NULL;
	kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);

#if !defined(XENPV)
	/*
	 * Begin to enable global TLB entries if they are supported: add PTE_G
	 * attribute to already mapped kernel pages. Do that only if SVS is
	 * disabled.
	 *
	 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
	 * happens later in cpu_init().
	 */
#ifdef SVS
	if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
#else
	if (cpu_feature[0] & CPUID_PGE) {
#endif
		pmap_pg_g = PTE_G;
		pmap_remap_global();
	}
#endif

#ifndef XENPV
	/*
	 * Enable large pages if they are supported.
	 */
	if (cpu_feature[0] & CPUID_PSE) {
		lcr4(rcr4() | CR4_PSE);	/* enable hardware (via %cr4) */
		pmap_largepages = 1;	/* enable software */

		/*
		 * The TLB must be flushed after enabling large pages on Pentium
		 * CPUs, according to section 3.6.2.2 of "Intel Architecture
		 * Software Developer's Manual, Volume 3: System Programming".
		 */
		tlbflushg();

		/* Remap the kernel. */
		pmap_remap_largepages();
	}
	pmap_init_lapic();
#endif /* !XENPV */

#ifdef __HAVE_PCPU_AREA
	pmap_init_pcpu();
#endif

#ifdef __HAVE_DIRECT_MAP
	pmap_init_directmap(kpm);
#else
	/* No direct map: set up the boot CPU's vpages and the early zero page. */
	pmap_vpage_cpualloc(&cpu_info_primary);

	if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
		early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
		early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
	} else { /* amd64 */
		/*
		 * zero_pte is stuck at the end of mapped space for the kernel
		 * image (disjunct from kva space). This is done so that it
		 * can safely be used in pmap_growkernel (pmap_get_physpage),
		 * when it's called for the first time.
		 * XXXfvdl fix this for MULTIPROCESSOR later.
		 */
#ifdef XENPV
		/* early_zerop initialized in xen_locore() */
#else
		early_zerop = (void *)bootspace.spareva;
#endif
		early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
	}
#endif

#if defined(XENPV) && defined(__x86_64__)
	extern vaddr_t xen_dummy_page;
	paddr_t xen_dummy_user_pgd;

	/*
	 * We want a dummy page directory for Xen: when deactivating a pmap,
	 * Xen will still consider it active. So we set user PGD to this one
	 * to lift all protection on the now inactive page tables set.
	 */
	xen_dummy_user_pgd = xen_dummy_page - KERNBASE;

	/* Zero fill it, the less checks in Xen it requires the better */
	memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
	/* Mark read-only */
	HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
	    pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
	    UVMF_INVLPG);
	/* Pin as L4 */
	xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
#endif

	/*
	 * Allocate space for the Interrupt Descriptor Table (IDT),
	 * Global Descriptor Table (GDT), and Local Descriptor Table
	 * (LDT).
	 *
	 * Currently there is an initial temporary GDT allocated on the
	 * stack by the caller of init386/init_x86_64, which is (among
	 * other things) needed on i386 for %fs-relative addressing for
	 * CPU-local data (CPUVAR(...), curcpu(), curlwp).  This
	 * initial temporary GDT will be popped off the stack before we
	 * can enter main, so we need to make sure there is space for a
	 * second temporary GDT to continue existing when we enter main
	 * before we allocate space for the permanent GDT with
	 * uvm_km(9) in gdt_init via cpu_startup and switch to that.
	 */
	idt_vaddr = pmap_bootstrap_valloc(1);
	idt_paddr = pmap_bootstrap_palloc(1);

	gdt_vaddr = pmap_bootstrap_valloc(1);
	gdt_paddr = pmap_bootstrap_palloc(1);

	/* With a per-CPU area, the LDT va lives inside it. */
#ifdef __HAVE_PCPU_AREA
	ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
#else
	ldt_vaddr = pmap_bootstrap_valloc(1);
#endif
	ldt_paddr = pmap_bootstrap_palloc(1);

#if !defined(__x86_64__)
	/* pentium f00f bug stuff */
	pentium_idt_vaddr = pmap_bootstrap_valloc(1);
#endif

#if defined(XENPVHVM)
	/* XXX: move to hypervisor.c with appropriate API adjustments */
	extern paddr_t HYPERVISOR_shared_info_pa;
	extern volatile struct xencons_interface *xencons_interface; /* XXX */
	extern struct xenstore_domain_interface *xenstore_interface; /* XXX */

	/* The shared info page is only allocated here when not a PVH guest. */
	if (vm_guest != VM_GUEST_XENPVH) {
		HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
		HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
	}
	/* Only VA is reserved for the console and xenstore interfaces. */
	xencons_interface = (void *) pmap_bootstrap_valloc(1);
	xenstore_interface = (void *) pmap_bootstrap_valloc(1);
#endif
	/*
	 * Now we reserve some VM for mapping pages when doing a crash dump.
	 */
	virtual_avail = reserve_dumppages(virtual_avail);

	/*
	 * Init the global lock and global list.
	 */
	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
	LIST_INIT(&pmaps);

	/*
	 * Ensure the TLB is sync'd with reality by flushing it...
	 */
	tlbflushg();

	/*
	 * Calculate pmap_maxkvaddr from nkptp[]: start at
	 * VM_MIN_KERNEL_ADDRESS and add, for each level from the top one
	 * down to level 1, nkptp[level] times nbpd[level].
	 */
	kva = VM_MIN_KERNEL_ADDRESS;
	for (i = PTP_LEVELS - 1; i >= 1; i--) {
		kva += nkptp[i] * nbpd[i];
	}
	pmap_maxkvaddr = kva;
}
1424 
1425 #ifndef XENPV
1426 static void
1427 pmap_init_lapic(void)
1428 {
1429           /*
1430            * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1431            * x86 implementation relies a lot on this address to be valid; so just
1432            * allocate a fake physical page that will be kentered into
1433            * local_apic_va by machdep.
1434            *
1435            * If the LAPIC is present, the va will be remapped somewhere else
1436            * later in lapic_map.
1437            */
1438           local_apic_va = pmap_bootstrap_valloc(1);
1439           local_apic_pa = pmap_bootstrap_palloc(1);
1440 }
1441 #endif
1442 
1443 #ifdef __x86_64__
1444 static size_t
1445 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1446 {
1447           size_t npages;
1448           npages = (roundup(endva, pgsz) / pgsz) -
1449               (rounddown(startva, pgsz) / pgsz);
1450           return npages;
1451 }
1452 #endif
1453 
#if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
/*
 * slotspace_copy: copy the page directory entries of slotspace area "type"
 * from src to dst.  The entries copied are the nslot slots starting at
 * index sslot, at the same index in both directories.
 */
static inline void
slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
{
	const size_t first = slotspace.area[type].sslot;
	const size_t count = slotspace.area[type].nslot;

	memcpy(dst + first, src + first, count * sizeof(*dst));
}
#endif
1464 
#ifdef __x86_64__
/*
 * Randomize the location of an area. We count the holes in the VM space. We
 * randomly select one hole, and then randomly select an area within that hole.
 * Finally we update the associated entry in the slotspace structure.
 *
 * => type is the index of the slotspace area to place
 * => sz is the size of the area in bytes; it is rounded up to align
 * => align is the alignment applied to the offset chosen within the hole
 * => randhole and randva are caller-supplied random values; they are
 *    reduced modulo the number of holes and modulo the window size
 * => with NO_X86_ASLR both random values are replaced by 0
 * => returns the va chosen for the area
 * => panics if no hole is large enough
 */
vaddr_t
slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
    vaddr_t randva)
{
	/* Candidate holes, as [start, end) L4 slot indices. */
	struct {
		int start;
		int end;
	} holes[SLSPACE_NAREAS+1];
	size_t i, nholes, hole;
	size_t startsl, endsl, nslots, winsize;
	vaddr_t startva, va;

	sz = roundup(sz, align);

	/*
	 * Take one more slot with +NBPD_L4, because we may end up choosing
	 * an area that crosses slots:
	 *     +------+------+------+
	 *     | Slot | Slot | Slot |
	 *     +------+------+------+
	 *        [Chosen Area]
	 * And in that case we must take into account the additional slot
	 * consumed.
	 */
	nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;

	/* Get the holes. */
	nholes = 0;
	size_t curslot = 0 + 256; /* end of SLAREA_USER */
	while (1) {
		/*
		 * Find the first occupied slot after the current one.
		 * The area between the two is a hole.
		 *
		 * 512 doubles as the "nothing found" value (presumably the
		 * number of L4 slots -- confirm).
		 */
		size_t minsslot = 512;
		size_t minnslot = 0;
		for (i = 0; i < SLSPACE_NAREAS; i++) {
			if (!slotspace.area[i].active)
				continue;
			if (slotspace.area[i].sslot >= curslot &&
			    slotspace.area[i].sslot < minsslot) {
				minsslot = slotspace.area[i].sslot;
				minnslot = slotspace.area[i].nslot;
			}
		}

		/*
		 * No hole anymore, stop here.  Note that the space after the
		 * last active area is therefore never registered as a hole.
		 */
		if (minsslot == 512) {
			break;
		}

		/* Register the hole, but only if it can hold nslots slots. */
		if (minsslot - curslot >= nslots) {
			holes[nholes].start = curslot;
			holes[nholes].end = minsslot;
			nholes++;
		}

		/* Skip that hole, and iterate again. */
		curslot = minsslot + minnslot;
	}

	if (nholes == 0) {
		panic("%s: impossible", __func__);
	}

	/* Select a hole. */
	hole = randhole;
#ifdef NO_X86_ASLR
	hole = 0;
#endif
	hole %= nholes;
	startsl = holes[hole].start;
	endsl = holes[hole].end;
	startva = VA_SIGN_NEG(startsl * NBPD_L4);

	/* Select an area within the hole. */
	va = randva;
#ifdef NO_X86_ASLR
	va = 0;
#endif
	/*
	 * winsize is the number of bytes by which the area can slide inside
	 * the hole.  It is never zero: a registered hole spans at least
	 * nslots slots, and nslots * NBPD_L4 >= sz + NBPD_L4.
	 */
	winsize = ((endsl - startsl) * NBPD_L4) - sz;
	va %= winsize;
	va = rounddown(va, align);
	va += startva;

	/* Update the entry. */
	slotspace.area[type].sslot = pl4_i(va);
	slotspace.area[type].nslot =
	    pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
	slotspace.area[type].active = true;

	return va;
}
#endif
1566 
#ifdef __HAVE_PCPU_AREA
/*
 * pmap_pcpu_ptp_alloc: take one page from the bootstrap physical allocator,
 * map it at tmpva through "pte" with "pteflags", zero it, and return its
 * physical address.  The temporary mapping is left in place.
 */
static paddr_t
pmap_pcpu_ptp_alloc(pt_entry_t *pte, vaddr_t tmpva, pd_entry_t pteflags)
{
	const paddr_t pa = pmap_bootstrap_palloc(1);

	*pte = (pa & PTE_FRAME) | pteflags;
	pmap_update_pg(tmpva);
	memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);

	return pa;
}

/*
 * pmap_init_pcpu: build the page tree for the per-CPU area.
 *
 * The area starts at PMAP_PCPU_BASE and is sizeof(struct pcpu_area) bytes
 * long.  Zeroed L4, L3 and L2 entries' target pages are allocated from the
 * bootstrap allocator and installed; the L1 entries are left empty, to be
 * entered later via pmap_kenter_pa.  bootspace.spareva is used as a
 * temporary mapping to zero each new page and is unmapped before returning.
 * Sets pcpuarea and flushes the TLB on the way out.
 */
static void
pmap_init_pcpu(void)
{
	const vaddr_t startva = PMAP_PCPU_BASE;
	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
	size_t nL4e, nL3e, nL2e, nL1e;
	size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
	paddr_t pa;
	vaddr_t endva;
	vaddr_t tmpva;
	pt_entry_t *pte;
	size_t i;	/* same type as the entry counts it is compared with */

	endva = startva + sizeof(struct pcpu_area);

	/* We will use this temporary va. */
	tmpva = bootspace.spareva;
	pte = PTE_BASE + pl1_i(tmpva);

	/* Build L4 */
	L4e_idx = pl4_i(startva);
	nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
	KASSERT(nL4e  == 1);
	for (i = 0; i < nL4e; i++) {
		KASSERT(L4_BASE[L4e_idx+i] == 0);

		pa = pmap_pcpu_ptp_alloc(pte, tmpva, pteflags);
		L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
	}

	/* Build L3 */
	L3e_idx = pl3_i(startva);
	nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
	for (i = 0; i < nL3e; i++) {
		KASSERT(L3_BASE[L3e_idx+i] == 0);

		pa = pmap_pcpu_ptp_alloc(pte, tmpva, pteflags);
		L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
	}

	/* Build L2 */
	L2e_idx = pl2_i(startva);
	nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
	for (i = 0; i < nL2e; i++) {
		KASSERT(L2_BASE[L2e_idx+i] == 0);

		pa = pmap_pcpu_ptp_alloc(pte, tmpva, pteflags);
		L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
	}

	/* Build L1 */
	L1e_idx = pl1_i(startva);
	nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
	for (i = 0; i < nL1e; i++) {
		/*
		 * Nothing to do, the PTEs will be entered via
		 * pmap_kenter_pa.
		 */
		KASSERT(L1_BASE[L1e_idx+i] == 0);
	}

	/* Tear down the temporary mapping. */
	*pte = 0;
	pmap_update_pg(tmpva);

	pcpuarea = (struct pcpu_area *)startva;

	tlbflush();
}
#endif
1654 
1655 #ifdef __HAVE_DIRECT_MAP
1656 static void
1657 randomize_hole(size_t *randholep, vaddr_t *randvap)
1658 {
1659           struct nist_hash_drbg drbg;
1660           uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
1661           const char p[] = "x86/directmap";
1662           int error;
1663 
1664           entropy_extract(seed, sizeof(seed), 0);
1665 
1666           error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
1667               /*nonce*/NULL, 0,
1668               /*personalization*/p, strlen(p));
1669           KASSERTMSG(error == 0, "error=%d", error);
1670 
1671           error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
1672               /*additional*/NULL, 0);
1673           KASSERTMSG(error == 0, "error=%d", error);
1674 
1675           error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
1676               /*additional*/NULL, 0);
1677           KASSERTMSG(error == 0, "error=%d", error);
1678 
1679           explicit_memset(seed, 0, sizeof(seed));
1680           explicit_memset(&drbg, 0, sizeof(drbg));
1681 }
1682 
1683 /*
1684  * Create the amd64 direct map. Called only once at boot time. We map all of
1685  * the physical memory contiguously using 2MB large pages, with RW permissions.
1686  * However there is a hole: the kernel is mapped with RO permissions.
1687  */
1688 static void
1689 pmap_init_directmap(struct pmap *kpm)
1690 {
1691           extern phys_ram_seg_t mem_clusters[];
1692           extern int mem_cluster_cnt;
1693 
1694           vaddr_t startva;
1695           size_t nL4e, nL3e, nL2e;
1696           size_t L4e_idx, L3e_idx, L2e_idx;
1697           size_t spahole, epahole;
1698           paddr_t lastpa, pa;
1699           vaddr_t endva;
1700           vaddr_t tmpva;
1701           pt_entry_t *pte;
1702           phys_ram_seg_t *mc;
1703           int i;
1704           size_t randhole;
1705           vaddr_t randva;
1706 
1707           const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1708           const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1709 
1710           CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1711 
1712           spahole = roundup(bootspace.head.pa, NBPD_L2);
1713           epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1714 
1715           /* Get the last physical address available */
1716           lastpa = 0;
1717           for (i = 0; i < mem_cluster_cnt; i++) {
1718                     mc = &mem_clusters[i];
1719                     lastpa = MAX(lastpa, mc->start + mc->size);
1720           }
1721 
1722           /*
1723            * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1724            */
1725           if (lastpa > MAXPHYSMEM) {
1726                     panic("pmap_init_directmap: lastpa incorrect");
1727           }
1728 
1729           randomize_hole(&randhole, &randva);
1730           startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1731               randhole, randva);
1732           endva = startva + lastpa;
1733 
1734           /* We will use this temporary va. */
1735           tmpva = bootspace.spareva;
1736           pte = PTE_BASE + pl1_i(tmpva);
1737 
1738           /* Build L4 */
1739           L4e_idx = pl4_i(startva);
1740           nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1741           KASSERT(nL4e <= NL4_SLOT_DIRECT);
1742           for (i = 0; i < nL4e; i++) {
1743                     KASSERT(L4_BASE[L4e_idx+i] == 0);
1744 
1745                     pa = pmap_bootstrap_palloc(1);
1746                     *pte = (pa & PTE_FRAME) | pteflags;
1747                     pmap_update_pg(tmpva);
1748                     memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1749 
1750                     L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1751           }
1752 
1753           /* Build L3 */
1754           L3e_idx = pl3_i(startva);
1755           nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1756           for (i = 0; i < nL3e; i++) {
1757                     KASSERT(L3_BASE[L3e_idx+i] == 0);
1758 
1759                     pa = pmap_bootstrap_palloc(1);
1760                     *pte = (pa & PTE_FRAME) | pteflags;
1761                     pmap_update_pg(tmpva);
1762                     memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1763 
1764                     L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1765           }
1766 
1767           /* Build L2 */
1768           L2e_idx = pl2_i(startva);
1769           nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1770           for (i = 0; i < nL2e; i++) {
1771                     KASSERT(L2_BASE[L2e_idx+i] == 0);
1772 
1773                     pa = (paddr_t)(i * NBPD_L2);
1774 
1775                     if (spahole <= pa && pa < epahole) {
1776                               L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1777                                   PTE_PS | pmap_pg_g;
1778                     } else {
1779                               L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1780                                   PTE_PS | pmap_pg_g;
1781                     }
1782           }
1783 
1784           *pte = 0;
1785           pmap_update_pg(tmpva);
1786 
1787           pmap_direct_base = startva;
1788           pmap_direct_end = endva;
1789 
1790           tlbflush();
1791 }
1792 #endif /* __HAVE_DIRECT_MAP */
1793 
1794 #if !defined(XENPV)
1795 /*
1796  * Remap all of the virtual pages created so far with the PTE_G bit.
1797  */
1798 static void
1799 pmap_remap_global(void)
1800 {
1801           vaddr_t kva, kva_end;
1802           unsigned long p1i;
1803           size_t i;
1804 
1805           /* head */
1806           kva = bootspace.head.va;
1807           kva_end = kva + bootspace.head.sz;
1808           for ( ; kva < kva_end; kva += PAGE_SIZE) {
1809                     p1i = pl1_i(kva);
1810                     if (pmap_valid_entry(PTE_BASE[p1i]))
1811                               PTE_BASE[p1i] |= pmap_pg_g;
1812           }
1813 
1814           /* kernel segments */
1815           for (i = 0; i < BTSPACE_NSEGS; i++) {
1816                     if (bootspace.segs[i].type == BTSEG_NONE) {
1817                               continue;
1818                     }
1819                     kva = bootspace.segs[i].va;
1820                     kva_end = kva + bootspace.segs[i].sz;
1821                     for ( ; kva < kva_end; kva += PAGE_SIZE) {
1822                               p1i = pl1_i(kva);
1823                               if (pmap_valid_entry(PTE_BASE[p1i]))
1824                                         PTE_BASE[p1i] |= pmap_pg_g;
1825                     }
1826           }
1827 
1828           /* boot space */
1829           kva = bootspace.boot.va;
1830           kva_end = kva + bootspace.boot.sz;
1831           for ( ; kva < kva_end; kva += PAGE_SIZE) {
1832                     p1i = pl1_i(kva);
1833                     if (pmap_valid_entry(PTE_BASE[p1i]))
1834                               PTE_BASE[p1i] |= pmap_pg_g;
1835           }
1836 }
1837 #endif
1838 
1839 #ifndef XENPV
1840 /*
1841  * Remap several kernel segments with large pages. We cover as many pages as we
1842  * can. Called only once at boot time, if the CPU supports large pages.
1843  */
1844 static void
1845 pmap_remap_largepages(void)
1846 {
1847           pd_entry_t *pde;
1848           vaddr_t kva, kva_end;
1849           paddr_t pa;
1850           size_t i;
1851 
1852           /* Remap the kernel text using large pages. */
1853           for (i = 0; i < BTSPACE_NSEGS; i++) {
1854                     if (bootspace.segs[i].type != BTSEG_TEXT) {
1855                               continue;
1856                     }
1857                     kva = roundup(bootspace.segs[i].va, NBPD_L2);
1858                     if (kva < bootspace.segs[i].va) {
1859                               continue;
1860                     }
1861                     kva_end = rounddown(bootspace.segs[i].va +
1862                               bootspace.segs[i].sz, NBPD_L2);
1863                     pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1864                     for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1865                               pde = &L2_BASE[pl2_i(kva)];
1866                               *pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1867                               tlbflushg();
1868                     }
1869           }
1870 
1871           /* Remap the kernel rodata using large pages. */
1872           for (i = 0; i < BTSPACE_NSEGS; i++) {
1873                     if (bootspace.segs[i].type != BTSEG_RODATA) {
1874                               continue;
1875                     }
1876                     kva = roundup(bootspace.segs[i].va, NBPD_L2);
1877                     if (kva < bootspace.segs[i].va) {
1878                               continue;
1879                     }
1880                     kva_end = rounddown(bootspace.segs[i].va +
1881                               bootspace.segs[i].sz, NBPD_L2);
1882                     pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1883                     for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1884                               pde = &L2_BASE[pl2_i(kva)];
1885                               *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1886                               tlbflushg();
1887                     }
1888           }
1889 
1890           /* Remap the kernel data+bss using large pages. */
1891           for (i = 0; i < BTSPACE_NSEGS; i++) {
1892                     if (bootspace.segs[i].type != BTSEG_DATA) {
1893                               continue;
1894                     }
1895                     kva = roundup(bootspace.segs[i].va, NBPD_L2);
1896                     if (kva < bootspace.segs[i].va) {
1897                               continue;
1898                     }
1899                     kva_end = rounddown(bootspace.segs[i].va +
1900                               bootspace.segs[i].sz, NBPD_L2);
1901                     pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1902                     for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1903                               pde = &L2_BASE[pl2_i(kva)];
1904                               *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1905                               tlbflushg();
1906                     }
1907           }
1908 }
1909 #endif /* !XENPV */
1910 
1911 /*
1912  * pmap_init: called from uvm_init, our job is to get the pmap system ready
1913  * to manage mappings.
1914  */
1915 void
1916 pmap_init(void)
1917 {
1918           int flags;
1919 
1920           /*
1921            * initialize caches.
1922            */
1923 
1924           pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1925               0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1926 
1927 #ifdef XENPV
1928           /*
1929            * pool_cache(9) should not touch cached objects, since they
1930            * are pinned on xen and R/O for the domU
1931            */
1932           flags = PR_NOTOUCH;
1933 #else
1934           flags = 0;
1935 #endif
1936 
1937 #ifdef PAE
1938           pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1939               "pdppl", &pmap_pdp_allocator, IPL_NONE);
1940 #else
1941           pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1942               "pdppl", NULL, IPL_NONE);
1943 #endif
1944           pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1945                0, 0, "pvpage", &pool_allocator_kmem,
1946               IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1947 
1948           pmap_tlb_init();
1949 
1950           /* XXX: Since cpu_hatch() is only for secondary CPUs. */
1951           pmap_tlb_cpu_init(curcpu());
1952 
1953           evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1954               NULL, "x86", "io bitmap copy");
1955           evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1956               NULL, "x86", "ldt sync");
1957 
1958           /*
1959            * The kernel doesn't keep track of PTPs, so there's nowhere handy
1960            * to hang a tree of pv_entry records.  Dynamically allocated
1961            * pv_entry lists are not heavily used in the kernel's pmap (the
1962            * usual case is embedded), so cop out and use a single RB tree
1963            * to cover them.
1964            */
1965           rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1966 
1967           /*
1968            * done: pmap module is up (and ready for business)
1969            */
1970 
1971           pmap_initialized = true;
1972 }
1973 
1974 #ifndef XENPV
1975 /*
1976  * pmap_cpu_init_late: perform late per-CPU initialization.
1977  */
1978 void
1979 pmap_cpu_init_late(struct cpu_info *ci)
1980 {
1981           /*
1982            * The BP has already its own PD page allocated during early
1983            * MD startup.
1984            */
1985           if (ci == &cpu_info_primary)
1986                     return;
1987 #ifdef PAE
1988           cpu_alloc_l3_page(ci);
1989 #endif
1990 }
1991 #endif
1992 
1993 #ifndef __HAVE_DIRECT_MAP
1994 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1995 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1996 
1997 static void
1998 pmap_vpage_cpualloc(struct cpu_info *ci)
1999 {
2000           bool primary = (ci == &cpu_info_primary);
2001           size_t i, npages;
2002           vaddr_t vabase;
2003           vsize_t vrange;
2004 
2005           npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
2006           KASSERT(npages >= VPAGE_MAX);
2007           vrange = npages * PAGE_SIZE;
2008 
2009           if (primary) {
2010                     while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
2011                               /* Waste some pages to align properly */
2012                     }
2013                     /* The base is aligned, allocate the rest (contiguous) */
2014                     pmap_bootstrap_valloc(npages - 1);
2015           } else {
2016                     vabase = uvm_km_alloc(kernel_map, vrange, vrange,
2017                         UVM_KMF_VAONLY);
2018                     if (vabase == 0) {
2019                               panic("%s: failed to allocate tmp VA for CPU %d\n",
2020                                   __func__, cpu_index(ci));
2021                     }
2022           }
2023 
2024           KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
2025 
2026           for (i = 0; i < VPAGE_MAX; i++) {
2027                     ci->vpage[i] = vabase + i * PAGE_SIZE;
2028                     ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
2029           }
2030 }
2031 
2032 void
2033 pmap_vpage_cpu_init(struct cpu_info *ci)
2034 {
2035           if (ci == &cpu_info_primary) {
2036                     /* cpu0 already taken care of in pmap_bootstrap */
2037                     return;
2038           }
2039 
2040           pmap_vpage_cpualloc(ci);
2041 }
2042 #endif
2043 
2044 /*
2045  * p v _ e n t r y   f u n c t i o n s
2046  */
2047 
2048 /*
2049  * pmap_pvp_dtor: pool_cache constructor for PV pages.
2050  */
2051 static int
2052 pmap_pvp_ctor(void *arg, void *obj, int flags)
2053 {
2054           struct pv_page *pvp = (struct pv_page *)obj;
2055           struct pv_entry *pve = (struct pv_entry *)obj + 1;
2056           struct pv_entry *maxpve = pve + PVE_PER_PVP;
2057 
2058           KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
2059           KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
2060 
2061           LIST_INIT(&pvp->pvp_pves);
2062           pvp->pvp_nfree = PVE_PER_PVP;
2063           pvp->pvp_pmap = NULL;
2064 
2065           for (; pve < maxpve; pve++) {
2066                     LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2067           }
2068 
2069           return 0;
2070 }
2071 
2072 /*
2073  * pmap_pvp_dtor: pool_cache destructor for PV pages.
2074  */
2075 static void
2076 pmap_pvp_dtor(void *arg, void *obj)
2077 {
2078           struct pv_page *pvp __diagused = obj;
2079 
2080           KASSERT(pvp->pvp_pmap == NULL);
2081           KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2082 }
2083 
2084 /*
2085  * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
2086  */
2087 static struct pv_entry *
2088 pmap_alloc_pv(struct pmap *pmap)
2089 {
2090           struct pv_entry *pve;
2091           struct pv_page *pvp;
2092 
2093           KASSERT(mutex_owned(&pmap->pm_lock));
2094 
2095           if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2096                     if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2097                               LIST_REMOVE(pvp, pvp_list);
2098                     } else {
2099                               pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2100                     }
2101                     if (__predict_false(pvp == NULL)) {
2102                               return NULL;
2103                     }
2104                     /* full -> part */
2105                     LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2106                     pvp->pvp_pmap = pmap;
2107           }
2108 
2109           KASSERT(pvp->pvp_pmap == pmap);
2110           KASSERT(pvp->pvp_nfree > 0);
2111 
2112           pve = LIST_FIRST(&pvp->pvp_pves);
2113           LIST_REMOVE(pve, pve_list);
2114           pvp->pvp_nfree--;
2115 
2116           if (__predict_false(pvp->pvp_nfree == 0)) {
2117                     /* part -> empty */
2118                     KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2119                     LIST_REMOVE(pvp, pvp_list);
2120                     LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2121           } else {
2122                     KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2123           }
2124 
2125           return pve;
2126 }
2127 
2128 /*
2129  * pmap_free_pv: delayed free of a PV entry.
2130  */
2131 static void
2132 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2133 {
2134           struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2135 
2136           KASSERT(mutex_owned(&pmap->pm_lock));
2137           KASSERT(pvp->pvp_pmap == pmap);
2138           KASSERT(pvp->pvp_nfree >= 0);
2139 
2140           LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2141           pvp->pvp_nfree++;
2142 
2143           if (__predict_false(pvp->pvp_nfree == 1)) {
2144                     /* empty -> part */
2145                     LIST_REMOVE(pvp, pvp_list);
2146                     LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2147           } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2148                     /* part -> full */
2149                     LIST_REMOVE(pvp, pvp_list);
2150                     LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2151           }
2152 }
2153 
2154 /*
2155  * pmap_drain_pv: free full PV pages.
2156  */
2157 static void
2158 pmap_drain_pv(struct pmap *pmap)
2159 {
2160           struct pv_page *pvp;
2161 
2162           KASSERT(mutex_owned(&pmap->pm_lock));
2163 
2164           while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2165                     LIST_REMOVE(pvp, pvp_list);
2166                     KASSERT(pvp->pvp_pmap == pmap);
2167                     KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2168                     pvp->pvp_pmap = NULL;
2169                     pool_cache_put(&pmap_pvp_cache, pvp);
2170           }
2171 }
2172 
2173 /*
2174  * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page
2175  */
2176 static void
2177 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2178     vaddr_t va, bool tracked)
2179 {
2180 #ifdef DEBUG
2181           struct pv_pte *pvpte;
2182 
2183           PMAP_CHECK_PP(pp);
2184 
2185           mutex_spin_enter(&pp->pp_lock);
2186           for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2187                     if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2188                               break;
2189                     }
2190           }
2191           mutex_spin_exit(&pp->pp_lock);
2192 
2193           if (pvpte && !tracked) {
2194                     panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2195           } else if (!pvpte && tracked) {
2196                     panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2197           }
2198 #endif
2199 }
2200 
2201 /*
2202  * pmap_treelookup_pv: search the PV tree for a dynamic entry
2203  *
2204  * => pmap must be locked
2205  */
2206 static struct pv_entry *
2207 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2208     const rb_tree_t *tree, const vaddr_t va)
2209 {
2210           struct pv_entry *pve;
2211           rb_node_t *node;
2212 
2213           /*
2214            * Inlined lookup tailored for exactly what's needed here that is
2215            * quite a bit faster than using rb_tree_find_node().
2216            */
2217           for (node = tree->rbt_root;;) {
2218                     if (__predict_false(RB_SENTINEL_P(node))) {
2219                               return NULL;
2220                     }
2221                     pve = (struct pv_entry *)
2222                         ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2223                     if (pve->pve_pte.pte_va == va) {
2224                               KASSERT(pve->pve_pte.pte_ptp == ptp);
2225                               return pve;
2226                     }
2227                     node = node->rb_nodes[pve->pve_pte.pte_va < va];
2228           }
2229 }
2230 
2231 /*
2232  * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2233  *
2234  * => a PV entry must be known present (doesn't check for existence)
2235  * => pmap must be locked
2236  */
2237 static struct pv_entry *
2238 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2239     const struct pmap_page * const old_pp, const vaddr_t va)
2240 {
2241           struct pv_entry *pve;
2242           const rb_tree_t *tree;
2243 
2244           KASSERT(mutex_owned(&pmap->pm_lock));
2245           KASSERT(ptp != NULL || pmap == pmap_kernel());
2246 
2247           /*
2248            * [This mostly deals with the case of process-private pages, i.e.
2249            * anonymous memory allocations or COW.]
2250            *
2251            * If the page is tracked with an embedded entry then the tree
2252            * lookup can be avoided.  It's safe to check for this specific
2253            * set of values without pp_lock because both will only ever be
2254            * set together for this pmap.
2255            *
2256            */
2257           if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2258               atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2259                     return NULL;
2260           }
2261 
2262           /*
2263            * [This mostly deals with shared mappings, for example shared libs
2264            * and executables.]
2265            *
2266            * Optimise for pmap_remove_ptes() which works by ascending scan:
2267            * look at the lowest numbered node in the tree first.  The tree is
2268            * known non-empty because of the check above.  For short lived
2269            * processes where pmap_remove() isn't used much this gets close to
2270            * a 100% hit rate.
2271            */
2272           tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2273           KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2274           pve = (struct pv_entry *)
2275               ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2276               offsetof(struct pv_entry, pve_rb));
2277           if (__predict_true(pve->pve_pte.pte_va == va)) {
2278                     KASSERT(pve->pve_pte.pte_ptp == ptp);
2279                     return pve;
2280           }
2281 
2282           /* Search the RB tree for the key (uncommon). */
2283           return pmap_treelookup_pv(pmap, ptp, tree, va);
2284 }
2285 
2286 /*
2287  * pmap_enter_pv: enter a mapping onto a pmap_page lst
2288  *
2289  * => pmap must be locked
2290  * => does NOT insert dynamic entries to tree (pmap_enter() does later)
2291  */
2292 static int
2293 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2294     vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2295     bool *samepage, bool *new_embedded, rb_tree_t *tree)
2296 {
2297           struct pv_entry *pve;
2298           int error;
2299 
2300           KASSERT(mutex_owned(&pmap->pm_lock));
2301           KASSERT(ptp_to_pmap(ptp) == pmap);
2302           KASSERT(ptp == NULL || ptp->uobject != NULL);
2303           KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2304           PMAP_CHECK_PP(pp);
2305 
2306           /*
2307            * If entering the same page and it's already tracked with an
2308            * embedded entry, we can avoid the expense below.  It's safe
2309            * to check for this very specific set of values without a lock
2310            * because both will only ever be set together for this pmap.
2311            */
2312           if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2313               atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2314                     *samepage = true;
2315                     pmap_check_pv(pmap, ptp, pp, va, true);
2316                     return 0;
2317           }
2318 
2319           /*
2320            * Check for an existing dynamic mapping at this address.  If it's
2321            * for the same page, then it will be reused and nothing needs to be
2322            * changed.
2323            */
2324           *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2325           if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2326                     *samepage = true;
2327                     pmap_check_pv(pmap, ptp, pp, va, true);
2328                     return 0;
2329           }
2330 
2331           /*
2332            * Need to put a new mapping in place.  Grab a spare pv_entry in
2333            * case it's needed; won't know for sure until the lock is taken.
2334            */
2335           if (pmap->pm_pve == NULL) {
2336                     pmap->pm_pve = pmap_alloc_pv(pmap);
2337           }
2338 
2339           error = 0;
2340           pmap_check_pv(pmap, ptp, pp, va, false);
2341           mutex_spin_enter(&pp->pp_lock);
2342           if (!pv_pte_embedded(pp)) {
2343                     /*
2344                      * Embedded PV tracking available - easy.
2345                      */
2346                     pp->pp_pte.pte_ptp = ptp;
2347                     pp->pp_pte.pte_va = va;
2348                     *new_embedded = true;
2349           } else if (__predict_false(pmap->pm_pve == NULL)) {
2350                     /*
2351                      * No memory.
2352                      */
2353                     error = ENOMEM;
2354           } else {
2355                     /*
2356                      * Install new pv_entry on the page.
2357                      */
2358                     pve = pmap->pm_pve;
2359                     pmap->pm_pve = NULL;
2360                     *new_pve = pve;
2361                     pve->pve_pte.pte_ptp = ptp;
2362                     pve->pve_pte.pte_va = va;
2363                     pve->pve_pp = pp;
2364                     LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2365           }
2366           mutex_spin_exit(&pp->pp_lock);
2367           if (error == 0) {
2368                     pmap_check_pv(pmap, ptp, pp, va, true);
2369           }
2370 
2371           return error;
2372 }
2373 
2374 /*
2375  * pmap_remove_pv: try to remove a mapping from a pv_list
2376  *
2377  * => pmap must be locked
2378  * => removes dynamic entries from tree and frees them
2379  * => caller should adjust ptp's wire_count and free PTP if needed
2380  */
2381 static void
2382 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2383     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2384 {
2385           rb_tree_t *tree = (ptp != NULL ?
2386               &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2387 
2388           KASSERT(mutex_owned(&pmap->pm_lock));
2389           KASSERT(ptp_to_pmap(ptp) == pmap);
2390           KASSERT(ptp == NULL || ptp->uobject != NULL);
2391           KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2392           KASSERT(ptp != NULL || pmap == pmap_kernel());
2393 
2394           pmap_check_pv(pmap, ptp, pp, va, true);
2395 
2396           if (pve == NULL) {
2397                     mutex_spin_enter(&pp->pp_lock);
2398                     KASSERT(pp->pp_pte.pte_ptp == ptp);
2399                     KASSERT(pp->pp_pte.pte_va == va);
2400                     pp->pp_attrs |= oattrs;
2401                     pp->pp_pte.pte_ptp = NULL;
2402                     pp->pp_pte.pte_va = 0;
2403                     mutex_spin_exit(&pp->pp_lock);
2404           } else {
2405                     mutex_spin_enter(&pp->pp_lock);
2406                     KASSERT(pp->pp_pte.pte_ptp != ptp ||
2407                         pp->pp_pte.pte_va != va);
2408                     KASSERT(pve->pve_pte.pte_ptp == ptp);
2409                     KASSERT(pve->pve_pte.pte_va == va);
2410                     KASSERT(pve->pve_pp == pp);
2411                     pp->pp_attrs |= oattrs;
2412                     LIST_REMOVE(pve, pve_list);
2413                     mutex_spin_exit(&pp->pp_lock);
2414 
2415                     KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2416                     rb_tree_remove_node(tree, pve);
2417 #ifdef DIAGNOSTIC
2418                     memset(pve, 0, sizeof(*pve));
2419 #endif
2420                     pmap_free_pv(pmap, pve);
2421           }
2422 
2423           KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2424           pmap_check_pv(pmap, ptp, pp, va, false);
2425 }
2426 
2427 /*
2428  * p t p   f u n c t i o n s
2429  */
2430 
2431 static struct vm_page *
2432 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2433 {
2434           int lidx = level - 1;
2435           off_t off = ptp_va2o(va, level);
2436           struct vm_page *pg;
2437 
2438           KASSERT(mutex_owned(&pmap->pm_lock));
2439 
2440           if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2441                     KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2442                     pg = pmap->pm_ptphint[lidx];
2443                     PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2444                     return pg;
2445           }
2446           PMAP_DUMMY_LOCK(pmap);
2447           pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2448           PMAP_DUMMY_UNLOCK(pmap);
2449           if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2450                     /* This page is queued to be freed - ignore. */
2451                     pg = NULL;
2452           }
2453           if (pg != NULL) {
2454                     PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2455           }
2456           pmap->pm_ptphint[lidx] = pg;
2457           return pg;
2458 }
2459 
2460 static inline void
2461 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2462 {
2463           int lidx;
2464 
2465           KASSERT(ptp->wire_count <= 1);
2466           PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2467 
2468           lidx = level - 1;
2469           pmap_stats_update(pmap, -ptp->wire_count, 0);
2470           if (pmap->pm_ptphint[lidx] == ptp)
2471                     pmap->pm_ptphint[lidx] = NULL;
2472           ptp->wire_count = 0;
2473           ptp->uanon = NULL;
2474           KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2475 
2476           /*
2477            * Enqueue the PTP to be freed by pmap_update().  We can't remove
2478            * the page from the uvm_object, as that can take further locks
2479            * (intolerable right now because the PTEs are likely mapped in).
2480            * Instead mark the PTP as free and if we bump into it again, we'll
2481            * either ignore or reuse (depending on what's useful at the time).
2482            */
2483           LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2484 }
2485 
/*
 * pmap_free_ptp: free a PTP, then any parent PTPs that its removal
 * leaves without entries.
 *
 * => pmap should NOT be pmap_kernel()
 * => pmap should be locked
 * => preemption must be disabled
 * => ptp is the level 1 PTP covering va
 * => ptes and pdes are the mapped PTEs and PDEs; they are used to clear
 *    the directory entries and to compute the addresses to invalidate
 */
static void
pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
	      pt_entry_t *ptes, pd_entry_t * const *pdes)
{
	unsigned long index;
	int level;
	vaddr_t invaladdr;
	pd_entry_t opde;

	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());

	level = 1;
	do {
		/*
		 * Zero the entry one level up that refers to this PTP,
		 * keeping the old value to pass to the shootdown.
		 */
		index = pl_i(va, level + 1);
		opde = pmap_pte_testset(&pdes[level - 1][index], 0);

		/*
		 * On Xen-amd64 or SVS, we need to sync the top level page
		 * directory on each CPU.
		 */
#if defined(XENPV) && defined(__x86_64__)
		if (level == PTP_LEVELS - 1) {
			xen_kpm_sync(pmap, index);
		}
#elif defined(SVS)
		if (svs_enabled && level == PTP_LEVELS - 1 &&
		    pmap_is_user(pmap)) {
			svs_pmap_sync(pmap, index);
		}
#endif

		/*
		 * NOTE(review): invaladdr + index * PAGE_SIZE looks like
		 * the VA through which the PTP being freed is itself
		 * mapped (inside ptes for level 1, inside pdes[level - 2]
		 * above that) - confirm.
		 */
		invaladdr = level == 1 ? (vaddr_t)ptes :
		    (vaddr_t)pdes[level - 2];
		pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
		    opde, TLBSHOOT_FREE_PTP);

#if defined(XENPV)
		/* On Xen PV, process the shootdown right away. */
		pmap_tlb_shootnow();
#endif

		pmap_freepage(pmap, ptp, level);
		if (level < PTP_LEVELS - 1) {
			/*
			 * Drop the parent's count for the entry just
			 * cleared.  If the parent's wire count is still
			 * above 1 we are done; otherwise go around again
			 * and free the parent as well.
			 */
			ptp = pmap_find_ptp(pmap, va, level + 1);
			ptp->wire_count--;
			if (ptp->wire_count > 1)
				break;
		}
	} while (++level < PTP_LEVELS);
	pmap_pte_flush();
}
2538 
/*
 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
 *
 * => pmap should NOT be pmap_kernel()
 * => pmap should be locked
 * => we are not touching any PTEs yet, so they need not be mapped in
 * => pt is cleared and then filled in: pt->pg[i] is the PTP found or
 *    obtained for level index i, pt->alloced[i] says whether this call
 *    obtained it (freshly allocated, or taken back off the free queue)
 * => unless flags has PMAP_CANFAIL, UVM_PGA_USERESERVE is passed to
 *    uvm_pagealloc()
 * => returns 0 with *resultp set to pt->pg[2] on success; returns ENOMEM
 *    after calling pmap_unget_ptp() if a page could not be allocated
 */
static int
pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
    int flags, struct vm_page **resultp)
{
	struct vm_page *ptp;
	int i, aflags;
	struct uvm_object *obj;
	voff_t off;

	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));

	/*
	 * Loop through all page table levels allocating a page
	 * for any level where we don't already have one.
	 */
	memset(pt, 0, sizeof(*pt));
	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
		UVM_PGA_ZERO;
	for (i = PTP_LEVELS; i > 1; i--) {
		/* PTPs for index i are kept in pm_obj[i - 2]. */
		obj = &pmap->pm_obj[i - 2];
		off = ptp_va2o(va, i - 1);

		PMAP_DUMMY_LOCK(pmap);
		pt->pg[i] = uvm_pagelookup(obj, off);

		if (pt->pg[i] == NULL) {
			/* Nothing there yet: try to allocate a zeroed page. */
			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
			pt->alloced[i] = (pt->pg[i] != NULL);
		} else if (pt->pg[i]->wire_count == 0) {
			/* This page was queued to be freed; dequeue it. */
			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
			pt->alloced[i] = true;
		}
		PMAP_DUMMY_UNLOCK(pmap);
		if (pt->pg[i] == NULL) {
			/* Allocation failed: give back what we obtained. */
			pmap_unget_ptp(pmap, pt);
			return ENOMEM;
		} else if (pt->alloced[i]) {
			/*
			 * New or recycled PTP: reset its per-PTP state.
			 * NOTE(review): uanon appears to be reused to hold
			 * the lowest VA entered in the PTP (pmap_zap_ptp()
			 * starts scanning from it), with ~0 as the initial
			 * value - confirm.
			 */
			pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
			    &pmap_rbtree_ops);
			PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
		}
	}
	/* Hand back the lowest PTP and remember it as the first hint. */
	ptp = pt->pg[2];
	KASSERT(ptp != NULL);
	*resultp = ptp;
	pmap->pm_ptphint[0] = ptp;
	return 0;
}
2597 
/*
 * pmap_install_ptp: install any freshly allocated PTPs
 *
 * => pmap should NOT be pmap_kernel()
 * => pmap should be locked
 * => PTEs must be mapped
 * => preemption must be disabled
 * => pt is the array filled in by pmap_get_ptp() for the same va
 * => pdes are the mapped page directories; pdes[i - 2] is the directory
 *    that receives the entry for pt->pg[i]
 */
static void
pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
    pd_entry_t * const *pdes)
{
	struct vm_page *ptp;
	unsigned long index;
	pd_entry_t *pva;
	paddr_t pa;
	int i;

	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());

	/*
	 * Now that we have all the pages looked up or allocated,
	 * loop through again installing any new ones into the tree.
	 */
	for (i = PTP_LEVELS; i > 1; i--) {
		index = pl_i(va, i);
		pva = pdes[i - 2];

		if (pmap_valid_entry(pva[index])) {
			/*
			 * The slot is already populated, so the PTP for it
			 * must be one that was found, not newly obtained.
			 */
			KASSERT(!pt->alloced[i]);
			continue;
		}

		ptp = pt->pg[i];
		ptp->flags &= ~PG_BUSY; /* never busy */
		/* A PTP holding no entries has a wire count of 1. */
		ptp->wire_count = 1;
		pmap->pm_ptphint[i - 2] = ptp;
		pa = VM_PAGE_TO_PHYS(ptp);
		/* Point the directory entry at the new PTP. */
		pmap_pte_set(&pva[index], (pd_entry_t)
		    (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));

		/*
		 * On Xen-amd64 or SVS, we need to sync the top level page
		 * directory on each CPU.
		 */
#if defined(XENPV) && defined(__x86_64__)
		if (i == PTP_LEVELS) {
			xen_kpm_sync(pmap, index);
		}
#elif defined(SVS)
		if (svs_enabled && i == PTP_LEVELS &&
		    pmap_is_user(pmap)) {
			svs_pmap_sync(pmap, index);
		}
#endif

		pmap_pte_flush();
		/* Account for the newly installed PTP in the pmap stats. */
		pmap_stats_update(pmap, 1, 0);

		/*
		 * If we're not in the top level, increase the
		 * wire count of the parent page.
		 */
		if (i < PTP_LEVELS) {
			pt->pg[i + 1]->wire_count++;
		}
	}
}
2668 
2669 /*
2670  * pmap_unget_ptp: free unusued PTPs
2671  *
2672  * => pmap should NOT be pmap_kernel()
2673  * => pmap should be locked
2674  */
2675 static void
2676 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2677 {
2678           int i;
2679 
2680           KASSERT(pmap != pmap_kernel());
2681           KASSERT(mutex_owned(&pmap->pm_lock));
2682 
2683           for (i = PTP_LEVELS; i > 1; i--) {
2684                     if (!pt->alloced[i]) {
2685                               continue;
2686                     }
2687                     KASSERT(pt->pg[i]->wire_count == 0);
2688                     PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2689                     pmap_freepage(pmap, pt->pg[i], i - 1);
2690           }
2691 }
2692 
2693 /*
2694  * p m a p   l i f e c y c l e   f u n c t i o n s
2695  */
2696 
/*
 * pmap_pdp_init: constructor for a new PDP.
 *
 * => pdir is the kernel virtual address of the PDP, which spans
 *    PDP_SIZE pages
 * => the PDP is zeroed, then given its recursive entry/entries and, except
 *    on Xen-amd64, copies of the kernel's top level entries
 * => on XENPV the pages are additionally made read-only in the kernel
 *    pmap and pinned as page tables
 * => runs unlocked; the caller is expected to detect a race with
 *    pmap_growkernel() afterwards (see the NOTE below)
 */
static void
pmap_pdp_init(pd_entry_t *pdir)
{
	paddr_t pdirpa = 0;
	vaddr_t object;
	int i;

#if !defined(XENPV) || !defined(__x86_64__)
	int npde;
#endif
#ifdef XENPV
	int s;
#endif

	memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);

	/*
	 * NOTE: This is all done unlocked, but we will check afterwards
	 * if we have raced with pmap_growkernel().
	 */

#if defined(XENPV) && defined(__x86_64__)
	/* Fetch the physical address of the page directory */
	(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);

	/*
	 * This pdir will NEVER be active in kernel mode, so mark
	 * recursive entry invalid.
	 */
	/* No PTE_P is OR'ed in, which is what leaves the entry invalid. */
	pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);

	/*
	 * PDP constructed this way won't be for the kernel, hence we
	 * don't put kernel mappings on Xen.
	 *
	 * But we need to make pmap_create() happy, so put a dummy
	 * (without PTE_P) value at the right place.
	 */
	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
	     (pd_entry_t)-1 & PTE_FRAME;
#else /* XENPV && __x86_64__*/
	object = (vaddr_t)pdir;
	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
		/* Fetch the physical address of the page directory */
		(void)pmap_extract(pmap_kernel(), object, &pdirpa);

		/* Put in recursive PDE to map the PTEs */
		pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
		    pmap_pg_nx;
#ifndef XENPV
		/* Everywhere except Xen PV the recursive PDE is writable. */
		pdir[PDIR_SLOT_PTE + i] |= PTE_W;
#endif
	}

	/* Copy the kernel's top level PDE */
	npde = nkptp[PTP_LEVELS - 1];

	memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
	    npde * sizeof(pd_entry_t));

	/* Also copy the slot covering KERNBASE when it is a separate one. */
	if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
		int idx = pl_i(KERNBASE, PTP_LEVELS);
		pdir[idx] = PDP_BASE[idx];
	}

	/* Copy the remaining kernel-owned areas that are configured in. */
#ifdef __HAVE_PCPU_AREA
	pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
#endif
#ifdef __HAVE_DIRECT_MAP
	slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
#endif
#ifdef KASAN
	slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
#endif
#ifdef KMSAN
	slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
#endif
#endif /* XENPV  && __x86_64__*/

#ifdef XENPV
	s = splvm();
	/* Make the PDP pages read-only in the kernel pmap before pinning. */
	object = (vaddr_t)pdir;
	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
	    VM_PROT_READ);
	pmap_update(pmap_kernel());
	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
		/*
		 * pin as L2/L4 page, we have to do the page with the
		 * PDIR_SLOT_PTE entries last
		 */
#ifdef PAE
		if (i == l2tol3(PDIR_SLOT_PTE))
			continue;
#endif

		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
#ifdef __x86_64__
		xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
#else
		xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
#endif
	}
#ifdef PAE
	/* Now pin the page skipped above, the one with the recursive slots. */
	object = ((vaddr_t)pdir) + PAGE_SIZE  * l2tol3(PDIR_SLOT_PTE);
	(void)pmap_extract(pmap_kernel(), object, &pdirpa);
	xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
#endif
	splx(s);
#endif /* XENPV */
}
2810 
/*
 * pmap_pdp_fini: destructor for the PDPs.
 *
 * => pdir is the kernel virtual address of the PDP, which spans
 *    PDP_SIZE pages
 * => on XENPV, unpins every page of the PDP and then makes the pages
 *    writable again in the kernel page tables
 * => does nothing when XENPV is not defined
 */
static void
pmap_pdp_fini(pd_entry_t *pdir)
{
#ifdef XENPV
	paddr_t pdirpa = 0; /* XXX: GCC */
	vaddr_t object = (vaddr_t)pdir;
	int i;
	int s = splvm();
	pt_entry_t *pte;

	/* First pass: unpin all of the pages. */
	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
		/* fetch the physical address of the page directory. */
		(void) pmap_extract(pmap_kernel(), object, &pdirpa);
		/* unpin page table */
		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
	}
	/* Second pass: restore write access and invalidate on all CPUs. */
	object = (vaddr_t)pdir;
	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
		/* Set page RW again */
		pte = kvtopte(object);
		pmap_pte_set(pte, *pte | PTE_W);
		xen_bcast_invlpg((vaddr_t)object);
	}
	splx(s);
#endif  /* XENPV */
}
2840 
2841 #ifdef PAE
2842 static void *
2843 pmap_pdp_alloc(struct pool *pp, int flags)
2844 {
2845           return (void *)uvm_km_alloc(kernel_map,
2846               PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2847               ((flags & PR_WAITOK) ? UVM_KMF_WAITVA
2848                     : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2849               UVM_KMF_WIRED);
2850 }
2851 
2852 static void
2853 pmap_pdp_free(struct pool *pp, void *v)
2854 {
2855           uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2856               UVM_KMF_WIRED);
2857 }
2858 #endif /* PAE */
2859 
/*
 * pmap_ctor: constructor for the pmap cache.
 *
 * => arg is unused; obj is the struct pmap being constructed
 * => flags must include PR_WAITOK
 * => initializes the locks, CPU sets and lists, allocates and
 *    initializes the PDP, and puts the pmap on the global pmaps list
 * => always returns 0
 */
static int
pmap_ctor(void *arg, void *obj, int flags)
{
	struct pmap *pmap = obj;
	pt_entry_t p;
	int i;

	KASSERT((flags & PR_WAITOK) != 0);

	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
	rw_init(&pmap->pm_dummy_lock);
	kcpuset_create(&pmap->pm_cpus, true);
	kcpuset_create(&pmap->pm_kernel_cpus, true);
#ifdef XENPV
	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
#endif
	LIST_INIT(&pmap->pm_gc_ptp);
	pmap->pm_pve = NULL;
	LIST_INIT(&pmap->pm_pvp_full);
	LIST_INIT(&pmap->pm_pvp_part);
	LIST_INIT(&pmap->pm_pvp_empty);

	/* allocate and init PDP */
	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);

	/*
	 * pmap_pdp_init() runs unlocked.  Afterwards, with pmaps_lock
	 * held, check that the last kernel slot was filled in; a zero
	 * entry there means the init has to be redone.  The loop is left
	 * with pmaps_lock still held.
	 */
	for (;;) {
		pmap_pdp_init(pmap->pm_pdir);
		mutex_enter(&pmaps_lock);
		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
		if (__predict_true(p != 0)) {
			break;
		}
		mutex_exit(&pmaps_lock);
	}

	/* Record the physical addresses held in the recursive entries. */
	for (i = 0; i < PDP_SIZE; i++)
		pmap->pm_pdirpa[i] =
		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);

	/* Still under pmaps_lock: publish the pmap on the global list. */
	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
	mutex_exit(&pmaps_lock);

	return 0;
}
2907 
/*
 * pmap_dtor: destructor for the pmap cache.
 *
 * => arg is unused; obj is the struct pmap being destructed
 * => takes the pmap off the global pmaps list, then tears down and
 *    releases the PDP, the locks and the CPU sets
 */
static void
pmap_dtor(void *arg, void *obj)
{
	struct pmap *pmap = obj;

	/* Unlink from the global list under pmaps_lock. */
	mutex_enter(&pmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mutex_exit(&pmaps_lock);

	/* Tear down the PDP before returning it to its pool. */
	pmap_pdp_fini(pmap->pm_pdir);
	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
	mutex_destroy(&pmap->pm_lock);
	rw_destroy(&pmap->pm_dummy_lock);
	kcpuset_destroy(pmap->pm_cpus);
	kcpuset_destroy(pmap->pm_kernel_cpus);
#ifdef XENPV
	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
#endif
}
2930 
2931 /*
2932  * pmap_create: create a pmap object.
2933  */
2934 struct pmap *
2935 pmap_create(void)
2936 {
2937           struct pmap *pmap;
2938           int i;
2939 
2940           pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2941 
2942           /* init uvm_object */
2943           for (i = 0; i < PTP_LEVELS - 1; i++) {
2944                     uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2945                     uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2946                     pmap->pm_ptphint[i] = NULL;
2947           }
2948           pmap->pm_stats.wired_count = 0;
2949           /* count the PDP allocd below */
2950           pmap->pm_stats.resident_count = PDP_SIZE;
2951 #if !defined(__x86_64__)
2952           pmap->pm_hiexec = 0;
2953 #endif
2954 
2955           /* Used by NVMM and Xen */
2956           pmap->pm_enter = NULL;
2957           pmap->pm_extract = NULL;
2958           pmap->pm_remove = NULL;
2959           pmap->pm_sync_pv = NULL;
2960           pmap->pm_pp_remove_ent = NULL;
2961           pmap->pm_write_protect = NULL;
2962           pmap->pm_unwire = NULL;
2963           pmap->pm_tlb_flush = NULL;
2964           pmap->pm_data = NULL;
2965 
2966           /* init the LDT */
2967           pmap->pm_ldt = NULL;
2968           pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2969 
2970           return pmap;
2971 }
2972 
2973 /*
2974  * pmap_check_ptps: verify that none of the pmap's page table objects
2975  * have any pages allocated to them.
2976  */
2977 static void
2978 pmap_check_ptps(struct pmap *pmap)
2979 {
2980           int i;
2981 
2982           for (i = 0; i < PTP_LEVELS - 1; i++) {
2983                     KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2984                         "pmap %p level %d still has %d pages",
2985                         pmap, i, (int)pmap->pm_obj[i].uo_npages);
2986           }
2987 }
2988 
/*
 * pmap_check_inuse: panic if any CPU still has this pmap loaded.
 *
 * => only does anything on DEBUG kernels
 * => on Xen-amd64, also panics if a CPU's ci_kpm_pdir still carries one
 *    of the pmap's non-zero entries below PDIR_SLOT_USERLIM
 */
static void
pmap_check_inuse(struct pmap *pmap)
{
#ifdef DEBUG
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		if (ci->ci_pmap == pmap) {
			panic("destroying pmap being used");
		}
#if defined(XENPV) && defined(__x86_64__)
		for (int slot = 0; slot < PDIR_SLOT_USERLIM; slot++) {
			/* Skip empty slots and slots this CPU doesn't share. */
			if (pmap->pm_pdir[slot] == 0 ||
			    ci->ci_kpm_pdir[slot] != pmap->pm_pdir[slot]) {
				continue;
			}
			printf("pmap_destroy(%p) pmap_kernel %p "
			    "curcpu %d cpu %d ci_pmap %p "
			    "ci->ci_kpm_pdir[%d]=%" PRIx64
			    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
			    pmap, pmap_kernel(), curcpu()->ci_index,
			    ci->ci_index, ci->ci_pmap,
			    slot, ci->ci_kpm_pdir[slot],
			    slot, pmap->pm_pdir[slot]);
			panic("%s: used pmap", __func__);
		}
#endif
	}
#endif /* DEBUG */
}
3018 
/*
 * pmap_destroy:  drop reference count on pmap.  free pmap if reference
 * count goes to zero.
 *
 * => we can be called from pmap_unmap_ptes() with a different, unrelated
 *    pmap's lock held.  be careful!
 * => the reference count is the uo_refs field of pm_obj[0]
 */
void
pmap_destroy(struct pmap *pmap)
{
	int i;

	/*
	 * drop reference count and verify not in use.
	 */

	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
		/* Other references remain; nothing more to do. */
		return;
	}
	pmap_check_inuse(pmap);

	/*
	 * handle any deferred frees.
	 */

	mutex_enter(&pmap->pm_lock);
	if (pmap->pm_pve != NULL) {
		pmap_free_pv(pmap, pmap->pm_pve);
		pmap->pm_pve = NULL;
	}
	pmap_drain_pv(pmap);
	mutex_exit(&pmap->pm_lock);
	pmap_update(pmap);

	/*
	 * Reference count is zero, free pmap resources and then free pmap.
	 */

	pmap_check_ptps(pmap);
	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));

#ifdef USER_LDT
	if (pmap->pm_ldt != NULL) {
		/*
		 * No need to switch the LDT; this address space is gone,
		 * nothing is using it.
		 *
		 * No need to lock the pmap for ldt_free (or anything else),
		 * we're the last one to use it.
		 */
		/* XXXAD can't take cpu_lock here - fix soon. */
		mutex_enter(&cpu_lock);
		ldt_free(pmap->pm_ldt_sel);
		mutex_exit(&cpu_lock);
		uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
		    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
	}
#endif

	/* Undo what pmap_create() set up: the per-level uvm_objects. */
	for (i = 0; i < PTP_LEVELS - 1; i++) {
		uvm_obj_destroy(&pmap->pm_obj[i], false);
	}
	/* Clear the CPU sets; they stay allocated with the cached pmap. */
	kcpuset_zero(pmap->pm_cpus);
	kcpuset_zero(pmap->pm_kernel_cpus);
#ifdef XENPV
	kcpuset_zero(pmap->pm_xen_ptp_cpus);
#endif

	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));

	pmap_check_ptps(pmap);
	/*
	 * A pmap with a pm_enter hook installed is destructed outright
	 * instead of being returned to the cache in constructed state.
	 */
	if (__predict_false(pmap->pm_enter != NULL)) {
		/* XXX make this a different cache */
		pool_cache_destruct_object(&pmap_cache, pmap);
	} else {
		pool_cache_put(&pmap_cache, pmap);
	}
}
3099 
3100 /*
3101  * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3102  *
3103  * => caller must hold pmap's lock
3104  * => PTP must be mapped into KVA
3105  * => must be called with kernel preemption disabled
3106  * => does as little work as possible
3107  */
3108 static void
3109 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3110     vaddr_t startva, vaddr_t blkendva)
3111 {
3112 #ifndef XENPV
3113           struct pv_entry *pve;
3114           struct vm_page *pg;
3115           struct pmap_page *pp;
3116           pt_entry_t opte;
3117           rb_tree_t *tree;
3118           vaddr_t va;
3119           int wired;
3120           uint8_t oattrs;
3121           u_int cnt;
3122 
3123           KASSERT(mutex_owned(&pmap->pm_lock));
3124           KASSERT(kpreempt_disabled());
3125           KASSERT(pmap != pmap_kernel());
3126           KASSERT(ptp->wire_count > 1);
3127           KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3128 
3129           /*
3130            * Start at the lowest entered VA, and scan until there are no more
3131            * PTEs in the PTPs.
3132            */
3133           tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3134           pve = RB_TREE_MIN(tree);
3135           wired = 0;
3136           va = (vaddr_t)ptp->uanon;
3137           pte += ((va - startva) >> PAGE_SHIFT);
3138 
3139           for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3140                     /*
3141                      * No need for an atomic to clear the PTE.  Nothing else can
3142                      * see the address space any more and speculative access (if
3143                      * possible) won't modify.  Therefore there's no need to
3144                      * track the accessed/dirty bits.
3145                      */
3146                     opte = *pte;
3147                     if (!pmap_valid_entry(opte)) {
3148                               continue;
3149                     }
3150 
3151                     /*
3152                      * Count the PTE.  If it's not for a managed mapping
3153                      * there's noting more to do.
3154                      */
3155                     cnt--;
3156                     wired -= (opte & PTE_WIRED);
3157                     if ((opte & PTE_PVLIST) == 0) {
3158 #ifndef DOM0OPS
3159                               KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3160                                   "managed page without PTE_PVLIST for %#"
3161                                   PRIxVADDR, va);
3162                               KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3163                                   "pv-tracked page without PTE_PVLIST for %#"
3164                                   PRIxVADDR, va);
3165 #endif
3166                               KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3167                                   &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3168                                   va) == NULL);
3169                               continue;
3170                     }
3171 
3172                     /*
3173                      * "pve" now points to the lowest (by VA) dynamic PV entry
3174                      * in the PTP.  If it's for this VA, take advantage of it to
3175                      * avoid calling PHYS_TO_VM_PAGE().  Avoid modifying the RB
3176                      * tree by skipping to the next VA in the tree whenever
3177                      * there is a match here.  The tree will be cleared out in
3178                      * one pass before return to pmap_remove_all().
3179                      */
3180                     oattrs = pmap_pte_to_pp_attrs(opte);
3181                     if (pve != NULL && pve->pve_pte.pte_va == va) {
3182                               pp = pve->pve_pp;
3183                               KASSERT(pve->pve_pte.pte_ptp == ptp);
3184                               KASSERT(pp->pp_pte.pte_ptp != ptp ||
3185                                   pp->pp_pte.pte_va != va);
3186                               mutex_spin_enter(&pp->pp_lock);
3187                               pp->pp_attrs |= oattrs;
3188                               LIST_REMOVE(pve, pve_list);
3189                               mutex_spin_exit(&pp->pp_lock);
3190 
3191                               /*
3192                                * pve won't be touched again until pmap_drain_pv(),
3193                                * so it's still safe to traverse the tree.
3194                                */
3195                               pmap_free_pv(pmap, pve);
3196                               pve = RB_TREE_NEXT(tree, pve);
3197                               continue;
3198                     }
3199 
3200                     /*
3201                      * No entry in the tree so it must be embedded.  Look up the
3202                      * page and cancel the embedded entry.
3203                      */
3204                     if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3205                               pp = VM_PAGE_TO_PP(pg);
3206                     } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3207                               paddr_t pa = pmap_pte2pa(opte);
3208                               panic("%s: PTE_PVLIST with pv-untracked page"
3209                                   " va = %#"PRIxVADDR"pa = %#"PRIxPADDR
3210                                   "(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3211                     }
3212                     mutex_spin_enter(&pp->pp_lock);
3213                     KASSERT(pp->pp_pte.pte_ptp == ptp);
3214                     KASSERT(pp->pp_pte.pte_va == va);
3215                     pp->pp_attrs |= oattrs;
3216                     pp->pp_pte.pte_ptp = NULL;
3217                     pp->pp_pte.pte_va = 0;
3218                     mutex_spin_exit(&pp->pp_lock);
3219           }
3220 
3221           /* PTP now empty - adjust the tree & stats to match. */
3222           pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3223           ptp->wire_count = 1;
3224 #ifdef DIAGNOSTIC
3225           rb_tree_init(tree, &pmap_rbtree_ops);
3226 #endif
3227 #else     /* !XENPV */
3228           /*
3229            * XXXAD For XEN, it's not clear to me that we can do this, because
3230            * I guess the hypervisor keeps track of PTEs too.
3231            */
3232           pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3233 #endif    /* !XENPV */
3234 }
3235 
/*
 * pmap_remove_all: remove all mappings from pmap in bulk.
 *
 * Ordinarily when removing mappings it's important to hold the UVM object's
 * lock, so that pages do not gain a new identity while retaining stale TLB
 * entries (the same lock hold covers both pmap_remove() and pmap_update()).
 * Here it's known that the address space is no longer visible to any user
 * process, so we don't need to worry about that.
 *
 * => returns false, having done nothing, if the pmap has a pm_remove hook
 * => otherwise returns true once no PTPs are left in pm_obj[0]
 * => works in batches of up to 32 PTPs, dropping the pmap lock and
 *    offering a preemption point between batches
 */
bool
pmap_remove_all(struct pmap *pmap)
{
	struct vm_page *ptps[32];
	vaddr_t va, blkendva;
	struct pmap *pmap2;
	pt_entry_t *ptes;
	pd_entry_t pde __diagused;
	pd_entry_t * const *pdes;
	int lvl __diagused, i, n;

	/* XXX Can't handle EPT just yet. */
	if (pmap->pm_remove != NULL) {
		return false;
	}

	for (;;) {
		/* Fetch a block of PTPs from tree. */
		mutex_enter(&pmap->pm_lock);
		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
		    (void **)ptps, __arraycount(ptps), false);
		if (n == 0) {
			/* No PTPs left in the object: we are finished. */
			mutex_exit(&pmap->pm_lock);
			break;
		}

		/* Remove all mappings in the set of PTPs. */
		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
		for (i = 0; i < n; i++) {
			if (ptps[i]->wire_count == 0) {
				/* It's dead: pmap_update() will expunge. */
				continue;
			}

			/*
			 * Determine range of block.
			 * NOTE(review): the va computation looks like the
			 * inverse of ptp_va2o() for level 1 - confirm.
			 */
			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
			blkendva = x86_round_pdr(va + 1);

			/* Make sure everything squares up... */
			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
			KASSERT(lvl == 1);
			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);

			/* Zap! */
			pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
			    blkendva);

			/* PTP should now be unused - free it. */
			KASSERT(ptps[i]->wire_count == 1);
			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
		}
		pmap_unmap_ptes(pmap, pmap2);
		pmap_drain_pv(pmap);
		/* One shootdown for the whole batch, not one per mapping. */
		pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
		mutex_exit(&pmap->pm_lock);

		/* Process deferred frees. */
		pmap_update(pmap);

		/* A breathing point. */
		preempt_point();
	}

	/* Verify that the pmap is now completely empty. */
	pmap_check_ptps(pmap);
	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
	    "pmap %p not empty", pmap);

	return true;
}
3315 
3316 #if defined(PMAP_FORK)
/*
 * pmap_fork: perform any necessary data structure manipulation when
 * a VM space is forked.
 *
 * => pmap1 is the pmap being copied from, pmap2 the pmap being set up
 * => with USER_LDT, a private LDT in pmap1 is duplicated into newly
 *    allocated storage and installed in pmap2 together with its selector;
 *    without USER_LDT the function body is empty
 * => if the LDT storage or the selector cannot be allocated, a warning
 *    is printed and pmap2 is left untouched
 */
void
pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
{
#ifdef USER_LDT
	union descriptor *new_ldt;
	int sel;

	/* Unlocked fast path: nothing to copy when there is no LDT. */
	if (__predict_true(pmap1->pm_ldt == NULL)) {
		return;
	}

	/*
	 * Copy the LDT into the new process.
	 *
	 * Read pmap1's ldt pointer unlocked; if it changes behind our back
	 * we'll retry. This will starve if there's a stream of LDT changes
	 * in another thread but that should not happen.
	 */

retry:
	if (pmap1->pm_ldt != NULL) {
		/*
		 * Allocate space for the new process's LDT.  This is done
		 * before cpu_lock is taken.
		 */
		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
		    MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
		if (new_ldt == NULL) {
			printf("WARNING: %s: unable to allocate LDT space\n",
			    __func__);
			return;
		}
		mutex_enter(&cpu_lock);
		/* Get a GDT slot for it */
		sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
		if (sel == -1) {
			/* Undo the allocation; the lock is dropped first. */
			mutex_exit(&cpu_lock);
			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
			printf("WARNING: %s: unable to allocate LDT selector\n",
			    __func__);
			return;
		}
	} else {
		/* Wasn't anything there after all. */
		new_ldt = NULL;
		sel = -1;
		/* Take the lock anyway so both paths re-check under it. */
		mutex_enter(&cpu_lock);
	}

	/*
	 * Now that we have cpu_lock, ensure the LDT status is the same.
	 */
	if (pmap1->pm_ldt != NULL) {
		if (new_ldt == NULL) {
			/* A wild LDT just appeared. */
			mutex_exit(&cpu_lock);
			goto retry;
		}

		/* Copy the LDT data and install it in pmap2 */
		memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
		pmap2->pm_ldt = new_ldt;
		pmap2->pm_ldt_sel = sel;
		mutex_exit(&cpu_lock);
	} else {
		if (new_ldt != NULL) {
			/* The LDT disappeared, drop what we did. */
			ldt_free(sel);
			mutex_exit(&cpu_lock);
			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
			return;
		}

		/* We're good, just leave. */
		mutex_exit(&cpu_lock);
	}
#endif /* USER_LDT */
}
3398 #endif /* PMAP_FORK */
3399 
3400 #ifdef USER_LDT
3401 
3402 /*
3403  * pmap_ldt_xcall: cross call used by pmap_ldt_sync.  if the named pmap
3404  * is active, reload LDTR.
3405  */
3406 static void
3407 pmap_ldt_xcall(void *arg1, void *arg2)
3408 {
3409           struct pmap *pm;
3410 
3411           kpreempt_disable();
3412           pm = arg1;
3413           if (curcpu()->ci_pmap == pm) {
3414 #if defined(SVS)
3415                     if (svs_enabled) {
3416                               svs_ldt_sync(pm);
3417                     } else
3418 #endif
3419                     lldt(pm->pm_ldt_sel);
3420           }
3421           kpreempt_enable();
3422 }
3423 
3424 /*
3425  * pmap_ldt_sync: LDT selector for the named pmap is changing.  swap
3426  * in the new selector on all CPUs.
3427  */
3428 void
3429 pmap_ldt_sync(struct pmap *pm)
3430 {
3431           uint64_t where;
3432 
3433           KASSERT(mutex_owned(&cpu_lock));
3434 
3435           pmap_ldt_evcnt.ev_count++;
3436           where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3437           xc_wait(where);
3438 }
3439 
3440 /*
3441  * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3442  * restore the default.
3443  */
3444 void
3445 pmap_ldt_cleanup(struct lwp *l)
3446 {
3447           pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3448           union descriptor *ldt;
3449           int sel;
3450 
3451           if (__predict_true(pmap->pm_ldt == NULL)) {
3452                     return;
3453           }
3454 
3455           mutex_enter(&cpu_lock);
3456           if (pmap->pm_ldt != NULL) {
3457                     sel = pmap->pm_ldt_sel;
3458                     ldt = pmap->pm_ldt;
3459                     pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3460                     pmap->pm_ldt = NULL;
3461                     pmap_ldt_sync(pmap);
3462                     ldt_free(sel);
3463                     uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3464                         UVM_KMF_WIRED);
3465           }
3466           mutex_exit(&cpu_lock);
3467 }
3468 #endif /* USER_LDT */
3469 
3470 /*
3471  * pmap_activate: activate a process' pmap
3472  *
3473  * => must be called with kernel preemption disabled
3474  * => if lwp is the curlwp, then set ci_want_pmapload so that
3475  *    actual MMU context switch will be done by pmap_load() later
3476  */
3477 void
3478 pmap_activate(struct lwp *l)
3479 {
3480           struct cpu_info *ci;
3481           struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3482 
3483           KASSERT(kpreempt_disabled());
3484 
3485           ci = curcpu();
3486 
3487           if (l != ci->ci_curlwp)
3488                     return;
3489 
3490           KASSERT(ci->ci_want_pmapload == 0);
3491           KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3492 
3493           /*
3494            * no need to switch to kernel vmspace because
3495            * it's a subset of any vmspace.
3496            */
3497 
3498           if (pmap == pmap_kernel()) {
3499                     ci->ci_want_pmapload = 0;
3500                     return;
3501           }
3502 
3503           ci->ci_want_pmapload = 1;
3504 }
3505 
/*
 * KASSERT_PDIRPA(pmap): assert that the physical address of pmap's page
 * directory (pmap_pdirpa(pmap, 0)) is the one currently in use on this
 * CPU.  What it is compared against depends on the configuration:
 *
 *	XENPV on x86_64	ci->ci_xen_current_user_pgd (the kernel pmap is
 *			also accepted)
 *	PAE		the frame held in ci->ci_pae_l3_pdir[0]
 *	native, non-PAE	the frame held in %cr3
 *	other XENPV	nothing is checked
 *
 * The first two variants reference a variable named `ci', which must be
 * in scope at the point of use.
 */
#if defined(XENPV) && defined(__x86_64__)
#define	KASSERT_PDIRPA(pmap) \
	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
	    pmap == pmap_kernel())
#elif defined(PAE)
#define	KASSERT_PDIRPA(pmap) \
	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
#elif !defined(XENPV)
#define	KASSERT_PDIRPA(pmap) \
	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
#else
#define	KASSERT_PDIRPA(pmap)	KASSERT(true)	/* nothing to do */
#endif
3519 
/*
 * pmap_reactivate: try to regain reference to the pmap.
 *
 * => Must be called with kernel preemption disabled.
 * => The pmap's page directory must already be the one in use on this
 *    CPU (checked by KASSERT_PDIRPA).
 * => If this CPU is no longer set in pm_cpus, it is set again and the
 *    local TLB is flushed.
 */
static void
pmap_reactivate(struct pmap *pmap)
{
	struct cpu_info * const ci = curcpu();
	const cpuid_t cid = cpu_index(ci);

	KASSERT(kpreempt_disabled());
	KASSERT_PDIRPA(pmap);

	/*
	 * If we still have a lazy reference to this pmap, we can assume
	 * that there was no TLB shootdown for this pmap in the meantime.
	 *
	 * The order of events here is important as we must synchronize
	 * with TLB shootdown interrupts.  Declare interest in invalidations
	 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
	 * change only when the state is TLBSTATE_LAZY.
	 */

	ci->ci_tlbstate = TLBSTATE_VALID;
	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));

	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
		/* We have the reference, state is valid. */
	} else {
		/*
		 * Must reload the TLB: the pmap was changed while it was
		 * deactivated on this CPU.
		 */
		kcpuset_atomic_set(pmap->pm_cpus, cid);

		tlbflush();
	}
}
3559 
/*
 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
 * and relevant LDT info.
 *
 * Ensures that the current process' pmap is loaded on the current CPU's
 * MMU and that there are no stale TLB entries.
 *
 * => The caller should disable kernel preemption or do check-and-retry
 *    to prevent a preemption from undoing our efforts.
 * => This function may block.
 * => Returns immediately when ci_want_pmapload is not set.
 */
void
pmap_load(void)
{
	struct cpu_info *ci;
	struct pmap *pmap, *oldpmap;
	struct lwp *l;
	uint64_t pctr;
	int ilevel __diagused;
	u_long psl __diagused;

	kpreempt_disable();
 retry:
	/* Re-read the CPU on every pass; we may have come back via retry. */
	ci = curcpu();
	if (!ci->ci_want_pmapload) {
		kpreempt_enable();
		return;
	}
	l = ci->ci_curlwp;
	/*
	 * Snapshot lwp_pctr() so that the check at the bottom can tell
	 * whether it changed while we were busy (presumably it advances
	 * when the LWP is switched out -- confirm against lwp_pctr()).
	 */
	pctr = lwp_pctr();
	__insn_barrier();

	/* should be able to take ipis. */
	KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel);
#ifdef XENPV
	/* Check to see if interrupts are enabled (ie; no events are masked) */
	KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl);
#else
	KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl);
#endif

	KASSERT(l != NULL);
	pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	KASSERT(pmap != pmap_kernel());
	oldpmap = ci->ci_pmap;

	if (pmap == oldpmap) {
		/* Same pmap as before: just revalidate it on this CPU. */
		pmap_reactivate(pmap);
		ci->ci_want_pmapload = 0;
		kpreempt_enable();
		return;
	}

	/*
	 * Acquire a reference to the new pmap and perform the switch.
	 */

	pmap_reference(pmap);
	pmap_load1(l, pmap, oldpmap);
	ci->ci_want_pmapload = 0;

	/*
	 * we're now running with the new pmap.  drop the reference
	 * to the old pmap.  if we block, we need to go around again.
	 */

	pmap_destroy(oldpmap);
	__insn_barrier();
	if (lwp_pctr() != pctr) {
		/* The counter moved since the snapshot: start over. */
		goto retry;
	}

	kpreempt_enable();
}
3634 
/*
 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
 * pmap_load().  It's critically important that this function does not
 * block.
 *
 * => must be called with kernel preemption disabled
 * => l is the lwp whose PCB gets the new %cr3 value; the CPU operated
 *    on is l->l_cpu
 * => this CPU must not yet be set in pmap's pm_cpus / pm_kernel_cpus
 * => moves this CPU from oldpmap's CPU sets to pmap's, records pmap in
 *    ci_pmap, updates the PCB (and on native i386 the TSS), loads the
 *    LDT selector and finally calls cpu_load_pmap()
 */
static void
pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
{
	struct cpu_info *ci;
	struct pcb *pcb;
	cpuid_t cid;

	KASSERT(kpreempt_disabled());

	pcb = lwp_getpcb(l);
	ci = l->l_cpu;
	cid = cpu_index(ci);

	/* Stop tracking this CPU as a user of the old pmap. */
	kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
	kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);

	KASSERT_PDIRPA(oldpmap);
	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));

	/*
	 * Mark the pmap in use by this CPU.  Again, we must synchronize
	 * with TLB shootdown interrupts, so set the state VALID first,
	 * then register us for shootdown events on this pmap.
	 */
	ci->ci_tlbstate = TLBSTATE_VALID;
	kcpuset_atomic_set(pmap->pm_cpus, cid);
	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
	ci->ci_pmap = pmap;

	/*
	 * update tss.  now that we have registered for invalidations
	 * from other CPUs, we're good to load the page tables.
	 */
#ifdef PAE
	/* With PAE the PCB gets the per-CPU L3 page directory address. */
	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
#else
	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
#endif

#ifdef i386
#ifndef XENPV
	/* Native i386 only: mirror the LDT selector and %cr3 in the TSS. */
	ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
	ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
#endif
#endif

	/*
	 * Load the LDT: via svs_ldt_sync() when SVS and USER_LDT are both
	 * compiled in and SVS is enabled, via a plain lldt() otherwise.
	 */
#if defined(SVS) && defined(USER_LDT)
	if (svs_enabled) {
		svs_ldt_sync(pmap);
	} else
#endif
	lldt(pmap->pm_ldt_sel);

	cpu_load_pmap(pmap, oldpmap);
}
3696 
3697 /*
3698  * pmap_deactivate: deactivate a process' pmap.
3699  *
3700  * => Must be called with kernel preemption disabled (high IPL is enough).
3701  */
3702 void
3703 pmap_deactivate(struct lwp *l)
3704 {
3705           struct pmap *pmap;
3706           struct cpu_info *ci;
3707 
3708           KASSERT(kpreempt_disabled());
3709 
3710           if (l != curlwp) {
3711                     return;
3712           }
3713 
3714           /*
3715            * Wait for pending TLB shootdowns to complete.  Necessary because
3716            * TLB shootdown state is per-CPU, and the LWP may be coming off
3717            * the CPU before it has a chance to call pmap_update(), e.g. due
3718            * to kernel preemption or blocking routine in between.
3719            */
3720           pmap_tlb_shootnow();
3721 
3722           ci = curcpu();
3723 
3724           if (ci->ci_want_pmapload) {
3725                     /*
3726                      * ci_want_pmapload means that our pmap is not loaded on
3727                      * the CPU or TLB might be stale.  note that pmap_kernel()
3728                      * is always considered loaded.
3729                      */
3730                     KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3731                         != pmap_kernel());
3732                     KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3733                         != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3734 
3735                     /*
3736                      * userspace has not been touched.
3737                      * nothing to do here.
3738                      */
3739 
3740                     ci->ci_want_pmapload = 0;
3741                     return;
3742           }
3743 
3744           pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3745 
3746           if (pmap == pmap_kernel()) {
3747                     return;
3748           }
3749 
3750           KASSERT_PDIRPA(pmap);
3751           KASSERT(ci->ci_pmap == pmap);
3752 
3753           /*
3754            * we aren't interested in TLB invalidations for this pmap,
3755            * at least for the time being.
3756            */
3757 
3758           KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3759           ci->ci_tlbstate = TLBSTATE_LAZY;
3760 }
3761 
3762 #ifdef EFI_RUNTIME
3763 
3764 extern struct pmap *efi_runtime_pmap;
3765 
3766 /*
3767  * pmap_is_user: true if pmap, which must not be the kernel pmap, is
3768  * for an unprivileged user process
3769  */
3770 bool
3771 pmap_is_user(struct pmap *pmap)
3772 {
3773 
3774           KASSERT(pmap != pmap_kernel());
3775           return (pmap != efi_runtime_pmap);
3776 }
3777 
/*
 * pmap_activate_sync: synchronously activate specified pmap.
 *
 * => Must be called with kernel preemption disabled (high IPL is enough).
 * => Must not sleep before pmap_deactivate_sync.
 * => pmap must not be the kernel pmap and must not already be marked as
 *    in use by this CPU.
 * => Returns the pmap that was previously recorded in ci_pmap (possibly
 *    NULL) as an opaque cookie to be handed to pmap_deactivate_sync.
 */
void *
pmap_activate_sync(struct pmap *pmap)
{
	struct cpu_info *ci = curcpu();
	struct pmap *oldpmap = ci->ci_pmap;
	unsigned cid = cpu_index(ci);

	KASSERT(kpreempt_disabled());
	KASSERT(pmap != pmap_kernel());

	KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
	KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));

	if (oldpmap) {
		/* Drop this CPU from the sets of whatever was loaded. */
		KASSERT_PDIRPA(oldpmap);
		kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
		kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
	}

	/*
	 * Same sequence as in pmap_load1(): state VALID first, then join
	 * the new pmap's CPU sets, then record it as current.
	 */
	ci->ci_tlbstate = TLBSTATE_VALID;
	kcpuset_atomic_set(pmap->pm_cpus, cid);
	kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
	ci->ci_pmap = pmap;

#if defined(SVS) && defined(USER_LDT)
	if (svs_enabled) {
		svs_ldt_sync(pmap);
	} else
#endif
	lldt(pmap->pm_ldt_sel);

	cpu_load_pmap(pmap, oldpmap);

	return oldpmap;
}
3819 
/*
 * pmap_deactivate_sync: synchronously deactivate specified pmap and
 * restore whatever was active before pmap_activate_sync.
 *
 * => Must be called with kernel preemption disabled (high IPL is enough).
 * => Must not have slept since pmap_activate_sync.
 * => pmap must be the pmap currently recorded in ci_pmap.
 * => cookie is the value returned by pmap_activate_sync: the previously
 *    active pmap, or NULL if there was none.
 */
void
pmap_deactivate_sync(struct pmap *pmap, void *cookie)
{
	struct cpu_info *ci = curcpu();
	struct pmap *oldpmap = cookie;
	unsigned cid = cpu_index(ci);

	KASSERT(kpreempt_disabled());
	KASSERT(pmap != pmap_kernel());
	KASSERT(ci->ci_pmap == pmap);

	KASSERT_PDIRPA(pmap);

	KASSERT(kcpuset_isset(pmap->pm_cpus, cid));
	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));

	/* Process pending shootdowns before leaving pmap's CPU sets. */
	pmap_tlb_shootnow();

	kcpuset_atomic_clear(pmap->pm_cpus, cid);
	kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid);

	ci->ci_tlbstate = TLBSTATE_VALID;
	ci->ci_pmap = oldpmap;
	if (oldpmap) {
		/* Re-register with the previous pmap and load it again. */
		kcpuset_atomic_set(oldpmap->pm_cpus, cid);
		kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid);
#if defined(SVS) && defined(USER_LDT)
		if (svs_enabled) {
			svs_ldt_sync(oldpmap);
		} else
#endif
		lldt(oldpmap->pm_ldt_sel);
		cpu_load_pmap(oldpmap, pmap);
	} else {
		/* Nothing was active before: load the kernel page directory. */
		lcr3(pmap_pdirpa(pmap_kernel(), 0));
	}
}
3864 
3865 #endif    /* EFI_RUNTIME */
3866 
3867 /*
3868  * some misc. functions
3869  */
3870 
3871 bool
3872 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3873     int *lastlvl)
3874 {
3875           unsigned long index;
3876           pd_entry_t pde;
3877           int i;
3878 
3879           for (i = PTP_LEVELS; i > 1; i--) {
3880                     index = pl_i(va, i);
3881                     pde = pdes[i - 2][index];
3882                     if ((pde & PTE_P) == 0) {
3883                               *lastlvl = i;
3884                               return false;
3885                     }
3886                     if (pde & PTE_PS)
3887                               break;
3888           }
3889           if (lastpde != NULL)
3890                     *lastpde = pde;
3891           *lastlvl = i;
3892           return true;
3893 }
3894 
3895 /*
3896  * pmap_extract: extract a PA for the given VA
3897  */
3898 bool
3899 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3900 {
3901           pt_entry_t *ptes, pte;
3902           pd_entry_t pde;
3903           pd_entry_t * const *pdes;
3904           struct pmap *pmap2;
3905           paddr_t pa;
3906           bool rv;
3907           int lvl;
3908 
3909           if (__predict_false(pmap->pm_extract != NULL)) {
3910                     return (*pmap->pm_extract)(pmap, va, pap);
3911           }
3912 
3913 #ifdef __HAVE_DIRECT_MAP
3914           if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3915                     if (pap != NULL) {
3916                               *pap = PMAP_DIRECT_UNMAP(va);
3917                     }
3918                     return true;
3919           }
3920 #endif
3921 
3922           rv = false;
3923           pa = 0;
3924 
3925           if (pmap != pmap_kernel()) {
3926                     mutex_enter(&pmap->pm_lock);
3927           }
3928           pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3929           if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3930                     if (lvl == 2) {
3931                               pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3932                               rv = true;
3933                     } else {
3934                               KASSERT(lvl == 1);
3935                               pte = ptes[pl1_i(va)];
3936                               if (__predict_true((pte & PTE_P) != 0)) {
3937                                         pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3938                                         rv = true;
3939                               }
3940                     }
3941           }
3942           pmap_unmap_ptes(pmap, pmap2);
3943           if (pmap != pmap_kernel()) {
3944                     mutex_exit(&pmap->pm_lock);
3945           }
3946           if (pap != NULL) {
3947                     *pap = pa;
3948           }
3949 
3950           return rv;
3951 }
3952 
3953 /*
3954  * vtophys: virtual address to physical address.  For use by
3955  * machine-dependent code only.
3956  */
3957 paddr_t
3958 vtophys(vaddr_t va)
3959 {
3960           paddr_t pa;
3961 
3962           if (pmap_extract(pmap_kernel(), va, &pa) == true)
3963                     return pa;
3964           return 0;
3965 }
3966 
3967 __strict_weak_alias(pmap_extract_ma, pmap_extract);
3968 
3969 #ifdef XENPV
3970 /*
3971  * vtomach: virtual address to machine address.  For use by
3972  * machine-dependent code only.
3973  */
3974 paddr_t
3975 vtomach(vaddr_t va)
3976 {
3977           paddr_t pa;
3978 
3979           if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
3980                     return pa;
3981           return 0;
3982 }
3983 #endif
3984 
/*
 * pmap_virtual_space: used during bootup [pmap_steal_memory] to
 * determine the bounds of the kernel virtual address space.
 *
 * => *startp receives virtual_avail, *endp receives virtual_end
 * => both pointers are dereferenced unconditionally
 */
void
pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
{
	*startp = virtual_avail;
	*endp = virtual_end;
}
3995 
/*
 * pmap_zero_page: fill the physical page at pa with zeroes.
 *
 * Three strategies, selected at build time and run time:
 * => with __HAVE_DIRECT_MAP, memset() the page through the direct map
 * => on XENPV with Xen version 3.4 or later supported, xen_pagezero()
 * => otherwise map the page at this CPU's VPAGE_ZER virtual page, with
 *    kernel preemption disabled for the duration, and memset() it there
 */
void
pmap_zero_page(paddr_t pa)
{
#if defined(__HAVE_DIRECT_MAP)
	memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
#else
#if defined(XENPV)
	if (XEN_VERSION_SUPPORTED(3, 4)) {
		xen_pagezero(pa);
		return;
	}
#endif
	struct cpu_info *ci;
	pt_entry_t *zpte;
	vaddr_t zerova;

	/* Writable, already marked accessed and dirty. */
	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;

	/* The vpage slot belongs to the CPU: stay on it while it is used. */
	kpreempt_disable();

	ci = curcpu();
	zerova = ci->vpage[VPAGE_ZER];
	zpte = ci->vpage_pte[VPAGE_ZER];

	/* The slot is expected to be empty on entry. */
	KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");

	pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
	pmap_pte_flush();
	pmap_update_pg(zerova);			/* flush TLB */

	memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);

	/*
	 * The temporary mapping is torn down only in DIAGNOSTIC or XENPV
	 * kernels; in other builds the PTE is left in place.
	 */
#if defined(DIAGNOSTIC) || defined(XENPV)
	pmap_pte_set(zpte, 0);				/* zap ! */
	pmap_pte_flush();
#endif

	kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
}
4036 
/*
 * pmap_copy_page: copy PAGE_SIZE bytes from the physical page at srcpa
 * to the physical page at dstpa.
 *
 * Three strategies, selected at build time and run time:
 * => with __HAVE_DIRECT_MAP, memcpy() between the direct-map addresses
 * => on XENPV with Xen version 3.4 or later supported, xen_copy_page()
 * => otherwise map both pages at this CPU's VPAGE_SRC / VPAGE_DST
 *    virtual pages, with kernel preemption disabled for the duration,
 *    and memcpy() between them
 */
void
pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
{
#if defined(__HAVE_DIRECT_MAP)
	vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
	vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);

	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
#else
#if defined(XENPV)
	if (XEN_VERSION_SUPPORTED(3, 4)) {
		xen_copy_page(srcpa, dstpa);
		return;
	}
#endif
	struct cpu_info *ci;
	pt_entry_t *srcpte, *dstpte;
	vaddr_t srcva, dstva;

	/* Common flags; PTE_D is added for the destination only. */
	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;

	/* The vpage slots belong to the CPU: stay on it while they are used. */
	kpreempt_disable();

	ci = curcpu();
	srcva = ci->vpage[VPAGE_SRC];
	dstva = ci->vpage[VPAGE_DST];
	srcpte = ci->vpage_pte[VPAGE_SRC];
	dstpte = ci->vpage_pte[VPAGE_DST];

	/* Both slots are expected to be empty on entry. */
	KASSERT(*srcpte == 0 && *dstpte == 0);

	pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
	pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
	pmap_pte_flush();
	pmap_update_pg(srcva);
	pmap_update_pg(dstva);

	memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);

	/*
	 * The temporary mappings are torn down only in DIAGNOSTIC or XENPV
	 * kernels; in other builds the PTEs are left in place.
	 */
#if defined(DIAGNOSTIC) || defined(XENPV)
	pmap_pte_set(srcpte, 0);
	pmap_pte_set(dstpte, 0);
	pmap_pte_flush();
#endif

	kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
}
4085 
/*
 * pmap_map_ptp: make the page table page ptp addressable and return a
 * pointer to its first PTE.
 *
 * => with __HAVE_DIRECT_MAP this is just the direct-map address of the
 *    page
 * => otherwise the page is mapped at this CPU's VPAGE_PTP virtual page,
 *    replacing whatever that slot mapped before; the caller must have
 *    kernel preemption disabled
 */
static pt_entry_t *
pmap_map_ptp(struct vm_page *ptp)
{
#ifdef __HAVE_DIRECT_MAP
	return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
#else
	struct cpu_info *ci;
	pt_entry_t *ptppte;
	vaddr_t ptpva;

	KASSERT(kpreempt_disabled());

	/*
	 * The XENPV variant omits PTE_W, i.e. the mapping is read-only
	 * there (presumably because Xen PV does not permit writable
	 * mappings of page table pages -- confirm).
	 */
#ifndef XENPV
	const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
#else
	const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
#endif

	ci = curcpu();
	ptpva = ci->vpage[VPAGE_PTP];
	ptppte = ci->vpage_pte[VPAGE_PTP];

	pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);

	/* Make the new PTE take effect for ptpva on this CPU. */
	pmap_pte_flush();
	pmap_update_pg(ptpva);

	return (pt_entry_t *)ptpva;
#endif
}
4116 
/*
 * pmap_unmap_ptp: undo pmap_map_ptp().
 *
 * => does nothing when a direct map is available
 * => without a direct map, only DIAGNOSTIC and XENPV kernels clear this
 *    CPU's VPAGE_PTP PTE (if it is non-zero); in other builds it is left
 *    alone
 * => when there is work to do, kernel preemption must be disabled
 */
static void
pmap_unmap_ptp(void)
{
#if !defined(__HAVE_DIRECT_MAP) && (defined(DIAGNOSTIC) || defined(XENPV))
	pt_entry_t *slot;

	KASSERT(kpreempt_disabled());

	slot = curcpu()->vpage_pte[VPAGE_PTP];
	if (*slot == 0) {
		/* Nothing is mapped in the slot. */
		return;
	}

	pmap_pte_set(slot, 0);
	pmap_pte_flush();
#endif
}
4137 
4138 static pt_entry_t *
4139 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
4140 {
4141 
4142           KASSERT(kpreempt_disabled());
4143           if (pmap_is_curpmap(pmap)) {
4144                     return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
4145           }
4146           KASSERT(ptp != NULL);
4147           return pmap_map_ptp(ptp) + pl1_pi(va);
4148 }
4149 
/*
 * pmap_unmap_pte: counterpart of pmap_map_pte(); releases the temporary
 * PTP mapping by calling pmap_unmap_ptp().
 *
 * => must be called with kernel preemption disabled
 */
static void
pmap_unmap_pte(void)
{

	KASSERT(kpreempt_disabled());

	pmap_unmap_ptp();
}
4158 
4159 /*
4160  * p m a p   r e m o v e   f u n c t i o n s
4161  *
4162  * functions that remove mappings
4163  */
4164 
4165 /*
4166  * pmap_remove_ptes: remove PTEs from a PTP
4167  *
4168  * => caller must hold pmap's lock
4169  * => PTP must be mapped into KVA
4170  * => PTP should be null if pmap == pmap_kernel()
4171  * => must be called with kernel preemption disabled
4172  * => returns composite pte if at least one page should be shot down
4173  */
4174 static void
4175 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
4176     vaddr_t startva, vaddr_t endva)
4177 {
4178           pt_entry_t *pte = (pt_entry_t *)ptpva;
4179 
4180           KASSERT(mutex_owned(&pmap->pm_lock));
4181           KASSERT(kpreempt_disabled());
4182 
4183           /*
4184            * mappings are very often sparse, so clip the given range to the
4185            * range of PTEs that are known present in the PTP.
4186            */
4187           pmap_ptp_range_clip(ptp, &startva, &pte);
4188 
4189           /*
4190            * note that ptpva points to the PTE that maps startva.   this may
4191            * or may not be the first PTE in the PTP.
4192            *
4193            * we loop through the PTP while there are still PTEs to look at
4194            * and the wire_count is greater than 1 (because we use the wire_count
4195            * to keep track of the number of real PTEs in the PTP).
4196            */
4197           while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4198                     (void)pmap_remove_pte(pmap, ptp, pte, startva);
4199                     startva += PAGE_SIZE;
4200                     pte++;
4201           }
4202 }
4203 
4204 /*
4205  * pmap_remove_pte: remove a single PTE from a PTP.
4206  *
4207  * => caller must hold pmap's lock
4208  * => PTP must be mapped into KVA
4209  * => PTP should be null if pmap == pmap_kernel()
4210  * => returns true if we removed a mapping
4211  * => must be called with kernel preemption disabled
4212  */
4213 static bool
4214 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4215     vaddr_t va)
4216 {
4217           struct pv_entry *pve;
4218           struct vm_page *pg;
4219           struct pmap_page *pp;
4220           pt_entry_t opte;
4221 
4222           KASSERT(mutex_owned(&pmap->pm_lock));
4223           KASSERT(kpreempt_disabled());
4224 
4225           if (!pmap_valid_entry(*pte)) {
4226                     /* VA not mapped. */
4227                     return false;
4228           }
4229 
4230           /* Atomically save the old PTE and zap it. */
4231           opte = pmap_pte_testset(pte, 0);
4232           if (!pmap_valid_entry(opte)) {
4233                     return false;
4234           }
4235 
4236           pmap_exec_account(pmap, va, opte, 0);
4237           pmap_stats_update_bypte(pmap, 0, opte);
4238 
4239           if (ptp) {
4240                     /*
4241                      * Dropping a PTE.  Make sure that the PDE is flushed.
4242                      */
4243                     ptp->wire_count--;
4244                     if (ptp->wire_count <= 1) {
4245                               opte |= PTE_A;
4246                     }
4247           }
4248 
4249           if ((opte & PTE_A) != 0) {
4250                     pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4251           }
4252 
4253           /*
4254            * If we are not on a pv list - we are done.
4255            */
4256           if ((opte & PTE_PVLIST) == 0) {
4257 #ifndef DOM0OPS
4258                     KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4259                         "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4260                     KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4261                         "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4262 #endif
4263                     KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4264                         &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4265                     return true;
4266           }
4267 
4268           if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4269                     pp = VM_PAGE_TO_PP(pg);
4270           } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4271                     paddr_t pa = pmap_pte2pa(opte);
4272                     panic("%s: PTE_PVLIST with pv-untracked page"
4273                         " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4274                         __func__, va, pa, atop(pa));
4275           }
4276 
4277           /* Sync R/M bits. */
4278           pve = pmap_lookup_pv(pmap, ptp, pp, va);
4279           pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4280           return true;
4281 }
4282 
4283 static void
4284 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4285 {
4286           pt_entry_t *ptes;
4287           pd_entry_t pde;
4288           pd_entry_t * const *pdes;
4289           bool result;
4290           vaddr_t blkendva, va = sva;
4291           struct vm_page *ptp;
4292           struct pmap *pmap2;
4293           int lvl;
4294 
4295           KASSERT(mutex_owned(&pmap->pm_lock));
4296 
4297           pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4298 
4299           /*
4300            * removing one page?  take shortcut function.
4301            */
4302 
4303           if (va + PAGE_SIZE == eva) {
4304                     if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4305                               KASSERT(lvl == 1);
4306 
4307                               /* Get PTP if non-kernel mapping. */
4308                               if (pmap != pmap_kernel()) {
4309                                         ptp = pmap_find_ptp(pmap, va, 1);
4310                                         KASSERTMSG(ptp != NULL,
4311                                             "%s: unmanaged PTP detected", __func__);
4312                               } else {
4313                                         /* Never free kernel PTPs. */
4314                                         ptp = NULL;
4315                               }
4316 
4317                               result = pmap_remove_pte(pmap, ptp,
4318                                   &ptes[pl1_i(va)], va);
4319 
4320                               /*
4321                                * if mapping removed and the PTP is no longer
4322                                * being used, free it!
4323                                */
4324 
4325                               if (result && ptp && ptp->wire_count <= 1)
4326                                         pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4327                     }
4328           } else for (/* null */ ; va < eva ; va = blkendva) {
4329                     /* determine range of block */
4330                     blkendva = x86_round_pdr(va+1);
4331                     if (blkendva > eva)
4332                               blkendva = eva;
4333 
4334                     if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4335                               /* Skip a range corresponding to an invalid pde. */
4336                               blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4337                               continue;
4338                     }
4339                     KASSERT(lvl == 1);
4340 
4341                     /* Get PTP if non-kernel mapping. */
4342                     if (pmap != pmap_kernel()) {
4343                               ptp = pmap_find_ptp(pmap, va, 1);
4344                               KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4345                                   __func__);
4346                     } else {
4347                               /* Never free kernel PTPs. */
4348                               ptp = NULL;
4349                     }
4350 
4351                     pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4352                         blkendva);
4353 
4354                     /* If PTP is no longer being used, free it. */
4355                     if (ptp && ptp->wire_count <= 1) {
4356                               pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4357                     }
4358           }
4359           pmap_unmap_ptes(pmap, pmap2);
4360           pmap_drain_pv(pmap);
4361 }
4362 
4363 /*
4364  * pmap_remove: mapping removal function.
4365  *
4366  * => caller should not be holding any pmap locks
4367  */
4368 void
4369 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4370 {
4371           if (__predict_false(pmap->pm_remove != NULL)) {
4372                     (*pmap->pm_remove)(pmap, sva, eva);
4373                     return;
4374           }
4375 
4376           mutex_enter(&pmap->pm_lock);
4377           pmap_remove_locked(pmap, sva, eva);
4378           mutex_exit(&pmap->pm_lock);
4379 }
4380 
4381 /*
4382  * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4383  *
4384  * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4385  * => Caller should disable kernel preemption.
4386  * => issues tlb shootdowns if necessary.
4387  */
4388 static int
4389 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4390     pt_entry_t *optep)
4391 {
4392           struct pmap *pmap;
4393           struct vm_page *ptp;
4394           vaddr_t va;
4395           pt_entry_t *ptep;
4396           pt_entry_t opte;
4397           pt_entry_t npte;
4398           pt_entry_t expect;
4399           bool need_shootdown;
4400 
4401           ptp = pvpte->pte_ptp;
4402           va = pvpte->pte_va;
4403           KASSERT(ptp == NULL || ptp->uobject != NULL);
4404           KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4405           pmap = ptp_to_pmap(ptp);
4406           KASSERT(kpreempt_disabled());
4407 
4408           if (__predict_false(pmap->pm_sync_pv != NULL)) {
4409                     return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4410                         optep);
4411           }
4412 
4413           expect = pmap_pa2pte(pa) | PTE_P;
4414 
4415           if (clearbits != ~0) {
4416                     KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4417                     clearbits = pmap_pp_attrs_to_pte(clearbits);
4418           }
4419 
4420           ptep = pmap_map_pte(pmap, ptp, va);
4421           do {
4422                     opte = *ptep;
4423                     KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4424                     KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4425                     KASSERT(opte == 0 || (opte & PTE_P) != 0);
4426                     if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4427                               /*
4428                                * We lost a race with a V->P operation like
4429                                * pmap_remove().  Wait for the competitor
4430                                * reflecting pte bits into mp_attrs.
4431                                */
4432                               pmap_unmap_pte();
4433                               return EAGAIN;
4434                     }
4435 
4436                     /*
4437                      * Check if there's anything to do on this PTE.
4438                      */
4439                     if ((opte & clearbits) == 0) {
4440                               need_shootdown = false;
4441                               break;
4442                     }
4443 
4444                     /*
4445                      * We need a shootdown if the PTE is cached (PTE_A) ...
4446                      * ... Unless we are clearing only the PTE_W bit and
4447                      * it isn't cached as RW (PTE_D).
4448                      */
4449                     need_shootdown = (opte & PTE_A) != 0 &&
4450                         !(clearbits == PTE_W && (opte & PTE_D) == 0);
4451 
4452                     npte = opte & ~clearbits;
4453 
4454                     /*
4455                      * If we need a shootdown anyway, clear PTE_A and PTE_D.
4456                      */
4457                     if (need_shootdown) {
4458                               npte &= ~(PTE_A | PTE_D);
4459                     }
4460                     KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4461                     KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4462                     KASSERT(npte == 0 || (opte & PTE_P) != 0);
4463           } while (pmap_pte_cas(ptep, opte, npte) != opte);
4464 
4465           if (need_shootdown) {
4466                     pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4467           }
4468           pmap_unmap_pte();
4469 
4470           *oattrs = pmap_pte_to_pp_attrs(opte);
4471           if (optep != NULL)
4472                     *optep = opte;
4473           return 0;
4474 }
4475 
4476 static void
4477 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4478     vaddr_t va)
4479 {
4480           struct pmap *pmap2;
4481           pt_entry_t *ptes;
4482           pd_entry_t * const *pdes;
4483 
4484           KASSERT(mutex_owned(&pmap->pm_lock));
4485 
4486           pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4487           pmap_stats_update_bypte(pmap, 0, opte);
4488           ptp->wire_count--;
4489           if (ptp->wire_count <= 1) {
4490                     pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4491           }
4492           pmap_unmap_ptes(pmap, pmap2);
4493 }
4494 
/*
 * pmap_pp_remove: remove every mapping recorded for a pmap_page.
 *
 * Repeatedly takes the first pv_pte of 'pp', zaps the corresponding PTE
 * via pmap_sync_pv(..., ~0, ...), removes the pv entry and updates the
 * owning pmap's accounting, until pv_pte_first() reports no mappings.
 *
 * => 'pa' is the physical address of the page described by 'pp'
 * => takes pp_lock and each owning pmap's pm_lock itself
 * => panics if a recorded mapping turns out not to be present
 */
static void
pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
{
          struct pv_pte *pvpte;
          struct vm_page *ptp;
          uintptr_t sum;
          uint8_t oattrs;
          bool locked;

          /*
           * Do an unlocked check to see if the page has no mappings, eg when
           * pmap_remove_all() was called before amap_wipeout() for a process
           * private amap - common.  The page being removed must be on the way
           * out, so we don't have to worry about concurrent attempts to enter
           * it (otherwise the caller either doesn't care or has screwed up).
           */
          sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
          sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
          sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
          if (sum == 0) {
                    return;
          }

          kpreempt_disable();
          for (;;) {
                    struct pmap *pmap;
                    struct pv_entry *pve;
                    pt_entry_t opte;
                    vaddr_t va;

                    /* Pick the next mapping; stop once none remain. */
                    mutex_spin_enter(&pp->pp_lock);
                    if ((pvpte = pv_pte_first(pp)) == NULL) {
                              mutex_spin_exit(&pp->pp_lock);
                              break;
                    }

                    /*
                     * Add a reference to the pmap before clearing the pte.
                     * Otherwise the pmap can disappear behind us.
                     * No reference is taken when ptp is NULL (kernel pmap,
                     * as asserted further down).
                     */
                    ptp = pvpte->pte_ptp;
                    pmap = ptp_to_pmap(ptp);
                    KASSERT(pmap->pm_obj[0].uo_refs > 0);
                    if (ptp != NULL) {
                              pmap_reference(pmap);
                    }

                    /*
                     * Now try to lock it.  We need a direct handoff between
                     * pp_lock and pm_lock to know the pv_entry is kept intact
                     * and kept associated with this pmap.  If that can't be
                     * had, wait for the pmap's lock to become free and then
                     * retry.
                     */
                    locked = mutex_tryenter(&pmap->pm_lock);
                    mutex_spin_exit(&pp->pp_lock);
                    if (!locked) {
                              mutex_enter(&pmap->pm_lock);
                              /* nothing, just wait for it */
                              mutex_exit(&pmap->pm_lock);
                              /* Drop the reference taken above before retrying. */
                              if (ptp != NULL) {
                                        pmap_destroy(pmap);
                              }
                              continue;
                    }
                    va = pvpte->pte_va;

                    KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
                        "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
                    KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
                        "va %lx pmap %p ptp %p is free", va, pmap, ptp);
                    KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
                        "va %lx pmap %p ptp %p is empty", va, pmap, ptp);

#ifdef DEBUG
                    /*
                     * Cross-check the pv bookkeeping: the mapping is either
                     * the entry embedded in 'pp' (no tree entry found) or
                     * the pv_entry found in the tree for this VA.
                     */
                    pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
                    rb_tree_t *tree = (ptp != NULL ?
                        &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
                    pve = pmap_treelookup_pv(pmap, ptp, tree, va);
                    if (pve == NULL) {
                              KASSERTMSG(&pp->pp_pte == pvpte,
                                  "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
                                  va, pmap, ptp, pvpte, pve);
                    } else {
                              KASSERTMSG(&pve->pve_pte == pvpte,
                                  "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
                                  va, pmap, ptp, pvpte, pve);
                    }
#endif

                    /*
                     * Zap the PTE (clearbits == ~0).  With pm_lock held a
                     * failure here is treated as fatal.
                     */
                    if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
                              panic("pmap_pp_remove: mapping not present");
                    }

                    /* Drop the pv entry, passing on the old PTE's attributes. */
                    pve = pmap_lookup_pv(pmap, ptp, pp, va);
                    pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);

                    /* Update the PTP reference count. Free if last reference. */
                    if (ptp != NULL) {
                              KASSERT(pmap != pmap_kernel());
                              /*
                               * NOTE(review): presumably flushed here so no
                               * stale translation survives if the PTP is
                               * freed by the call below - confirm.
                               */
                              pmap_tlb_shootnow();
                              if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
                                        (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
                              } else {
                                        pmap_pp_remove_ent(pmap, ptp, opte, va);
                              }
                    } else {
                              /* Kernel mapping: only the statistics change. */
                              KASSERT(pmap == pmap_kernel());
                              pmap_stats_update_bypte(pmap, 0, opte);
                    }
                    pmap_tlb_shootnow();
                    pmap_drain_pv(pmap);
                    mutex_exit(&pmap->pm_lock);
                    /* Release the reference taken before locking. */
                    if (ptp != NULL) {
                              pmap_destroy(pmap);
                    }
          }
          kpreempt_enable();
}
4614 
4615 /*
4616  * pmap_page_remove: remove a managed vm_page from all pmaps that map it
4617  *
4618  * => R/M bits are sync'd back to attrs
4619  */
4620 void
4621 pmap_page_remove(struct vm_page *pg)
4622 {
4623           struct pmap_page *pp;
4624           paddr_t pa;
4625 
4626           pp = VM_PAGE_TO_PP(pg);
4627           pa = VM_PAGE_TO_PHYS(pg);
4628           pmap_pp_remove(pp, pa);
4629 }
4630 
4631 /*
4632  * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4633  * that map it
4634  */
4635 void
4636 pmap_pv_remove(paddr_t pa)
4637 {
4638           struct pmap_page *pp;
4639 
4640           pp = pmap_pv_tracked(pa);
4641           if (pp == NULL)
4642                     panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4643           pmap_pp_remove(pp, pa);
4644 }
4645 
4646 /*
4647  * p m a p   a t t r i b u t e  f u n c t i o n s
4648  * functions that test/change managed page's attributes
4649  * since a page can be mapped multiple times we must check each PTE that
4650  * maps it by going down the pv lists.
4651  */
4652 
4653 /*
4654  * pmap_test_attrs: test a page's attributes
4655  */
4656 bool
4657 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4658 {
4659           struct pmap_page *pp;
4660           struct pv_pte *pvpte;
4661           struct pmap *pmap;
4662           uint8_t oattrs;
4663           u_int result;
4664           paddr_t pa;
4665 
4666           pp = VM_PAGE_TO_PP(pg);
4667           if ((pp->pp_attrs & testbits) != 0) {
4668                     return true;
4669           }
4670           pa = VM_PAGE_TO_PHYS(pg);
4671  startover:
4672           mutex_spin_enter(&pp->pp_lock);
4673           for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4674                     if ((pp->pp_attrs & testbits) != 0) {
4675                               break;
4676                     }
4677                     if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4678                               /*
4679                                * raced with a V->P operation.  wait for the other
4680                                * side to finish by acquiring pmap's lock.  if no
4681                                * wait, updates to pp_attrs by the other side may
4682                                * go unseen.
4683                                */
4684                               pmap = ptp_to_pmap(pvpte->pte_ptp);
4685                               pmap_reference(pmap);
4686                               mutex_spin_exit(&pp->pp_lock);
4687                               mutex_enter(&pmap->pm_lock);
4688                               /* nothing. */
4689                               mutex_exit(&pmap->pm_lock);
4690                               pmap_destroy(pmap);
4691                               goto startover;
4692                     }
4693                     pp->pp_attrs |= oattrs;
4694           }
4695           result = pp->pp_attrs & testbits;
4696           mutex_spin_exit(&pp->pp_lock);
4697 
4698           /*
4699            * note that we will exit the for loop with a non-null pve if
4700            * we have found the bits we are testing for.
4701            */
4702 
4703           return result != 0;
4704 }
4705 
4706 static bool
4707 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4708 {
4709           struct pv_pte *pvpte;
4710           struct pmap *pmap;
4711           uint8_t oattrs;
4712           u_int result;
4713 
4714 startover:
4715           mutex_spin_enter(&pp->pp_lock);
4716           for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4717                     if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4718                               /*
4719                                * raced with a V->P operation.  wait for the other
4720                                * side to finish by acquiring pmap's lock.  it is
4721                                * probably unmapping the page, and it will be gone
4722                                * when the loop is restarted.
4723                                */
4724                               pmap = ptp_to_pmap(pvpte->pte_ptp);
4725                               pmap_reference(pmap);
4726                               mutex_spin_exit(&pp->pp_lock);
4727                               mutex_enter(&pmap->pm_lock);
4728                               /* nothing. */
4729                               mutex_exit(&pmap->pm_lock);
4730                               pmap_destroy(pmap);
4731                               goto startover;
4732                     }
4733                     pp->pp_attrs |= oattrs;
4734           }
4735           result = pp->pp_attrs & clearbits;
4736           pp->pp_attrs &= ~clearbits;
4737           pmap_tlb_shootnow();
4738           mutex_spin_exit(&pp->pp_lock);
4739 
4740           return result != 0;
4741 }
4742 
4743 /*
4744  * pmap_clear_attrs: clear the specified attribute for a page.
4745  *
4746  * => we return true if we cleared one of the bits we were asked to
4747  */
4748 bool
4749 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4750 {
4751           struct pmap_page *pp;
4752           paddr_t pa;
4753 
4754           pp = VM_PAGE_TO_PP(pg);
4755           pa = VM_PAGE_TO_PHYS(pg);
4756 
4757           /*
4758            * If this is a new page, assert it has no mappings and simply zap
4759            * the stored attributes without taking any locks.
4760            */
4761           if ((pg->flags & PG_FAKE) != 0) {
4762                     KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4763                     KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4764                     KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4765                     atomic_store_relaxed(&pp->pp_attrs, 0);
4766                     return false;
4767           } else {
4768                     return pmap_pp_clear_attrs(pp, pa, clearbits);
4769           }
4770 }
4771 
4772 /*
4773  * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4774  * pv-tracked page.
4775  */
4776 bool
4777 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4778 {
4779           struct pmap_page *pp;
4780 
4781           pp = pmap_pv_tracked(pa);
4782           if (pp == NULL)
4783                     panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4784 
4785           return pmap_pp_clear_attrs(pp, pa, clearbits);
4786 }
4787 
4788 /*
4789  * p m a p   p r o t e c t i o n   f u n c t i o n s
4790  */
4791 
4792 /*
4793  * pmap_page_protect: change the protection of all recorded mappings
4794  * of a managed page
4795  *
4796  * => NOTE: this is an inline function in pmap.h
4797  */
4798 
4799 /* see pmap.h */
4800 
4801 /*
4802  * pmap_pv_protect: change the protection of all recorded mappings
4803  * of an unmanaged pv-tracked page
4804  *
4805  * => NOTE: this is an inline function in pmap.h
4806  */
4807 
4808 /* see pmap.h */
4809 
4810 /*
 * pmap_protect: set the protection of the pages in a pmap
4812  *
4813  * => NOTE: this is an inline function in pmap.h
4814  */
4815 
4816 /* see pmap.h */
4817 
4818 /*
4819  * pmap_write_protect: write-protect pages in a pmap.
4820  *
4821  * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4822  * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4823  * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4824  * present the page will still be considered as a kernel page, and the privilege
4825  * separation will be enforced correctly.
4826  */
4827 void
4828 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4829 {
4830           pt_entry_t bit_rem, bit_put;
4831           pt_entry_t *ptes;
4832           pt_entry_t * const *pdes;
4833           struct pmap *pmap2;
4834           vaddr_t blockend, va;
4835           int lvl, i;
4836 
4837           if (__predict_false(pmap->pm_write_protect != NULL)) {
4838                     (*pmap->pm_write_protect)(pmap, sva, eva, prot);
4839                     return;
4840           }
4841 
4842           bit_rem = 0;
4843           if (!(prot & VM_PROT_WRITE))
4844                     bit_rem = PTE_W;
4845 
4846           bit_put = 0;
4847           if (!(prot & VM_PROT_EXECUTE))
4848                     bit_put = pmap_pg_nx;
4849 
4850           sva &= ~PAGE_MASK;
4851           eva &= ~PAGE_MASK;
4852 
4853           /*
4854            * Acquire pmap.  No need to lock the kernel pmap as we won't
4855            * be touching PV entries nor stats and kernel PDEs aren't
4856            * freed.
4857            */
4858           if (pmap != pmap_kernel()) {
4859                     mutex_enter(&pmap->pm_lock);
4860           }
4861           pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4862 
4863           for (va = sva ; va < eva; va = blockend) {
4864                     pt_entry_t *spte, *epte;
4865 
4866                     blockend = x86_round_pdr(va + 1);
4867                     if (blockend > eva)
4868                               blockend = eva;
4869 
4870                     /* Is it a valid block? */
4871                     if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4872                               continue;
4873                     }
4874                     KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4875                     KASSERT(lvl == 1);
4876 
4877                     spte = &ptes[pl1_i(va)];
4878                     epte = &ptes[pl1_i(blockend)];
4879 
4880                     for (i = 0; spte < epte; spte++, i++) {
4881                               pt_entry_t opte, npte;
4882 
4883                               do {
4884                                         opte = *spte;
4885                                         if (!pmap_valid_entry(opte)) {
4886                                                   goto next;
4887                                         }
4888                                         npte = (opte & ~bit_rem) | bit_put;
4889                               } while (pmap_pte_cas(spte, opte, npte) != opte);
4890 
4891                               if ((opte & PTE_D) != 0) {
4892                                         vaddr_t tva = va + x86_ptob(i);
4893                                         pmap_tlb_shootdown(pmap, tva, opte,
4894                                             TLBSHOOT_WRITE_PROTECT);
4895                               }
4896 next:;
4897                     }
4898           }
4899 
4900           /* Release pmap. */
4901           pmap_unmap_ptes(pmap, pmap2);
4902           if (pmap != pmap_kernel()) {
4903                     mutex_exit(&pmap->pm_lock);
4904           }
4905 }
4906 
4907 /*
4908  * pmap_unwire: clear the wired bit in the PTE.
4909  *
4910  * => Mapping should already be present.
4911  */
4912 void
4913 pmap_unwire(struct pmap *pmap, vaddr_t va)
4914 {
4915           pt_entry_t *ptes, *ptep, opte;
4916           pd_entry_t * const *pdes;
4917           struct pmap *pmap2;
4918           int lvl;
4919 
4920           if (__predict_false(pmap->pm_unwire != NULL)) {
4921                     (*pmap->pm_unwire)(pmap, va);
4922                     return;
4923           }
4924 
4925           /*
4926            * Acquire pmap.  Need to lock the kernel pmap only to protect the
4927            * statistics.
4928            */
4929           mutex_enter(&pmap->pm_lock);
4930           pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4931 
4932           if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4933                     panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4934           }
4935           KASSERT(lvl == 1);
4936 
4937           ptep = &ptes[pl1_i(va)];
4938           opte = *ptep;
4939           KASSERT(pmap_valid_entry(opte));
4940 
4941           if (opte & PTE_WIRED) {
4942                     pt_entry_t npte = opte & ~PTE_WIRED;
4943 
4944                     opte = pmap_pte_testset(ptep, npte);
4945                     pmap_stats_update_bypte(pmap, npte, opte);
4946           } else {
4947                     printf("%s: wiring for pmap %p va %#" PRIxVADDR
4948                         " did not change!\n", __func__, pmap, va);
4949           }
4950 
4951           /* Release pmap. */
4952           pmap_unmap_ptes(pmap, pmap2);
4953           mutex_exit(&pmap->pm_lock);
4954 }
4955 
4956 /*
4957  * pmap_copy: copy mappings from one pmap to another
4958  *
4959  * => optional function
4960  * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4961  */
4962 
4963 /*
4964  * defined as macro in pmap.h
4965  */
4966 
4967 __strict_weak_alias(pmap_enter, pmap_enter_default);
4968 
4969 int
4970 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4971     u_int flags)
4972 {
4973           if (__predict_false(pmap->pm_enter != NULL)) {
4974                     return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4975           }
4976 
4977           return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4978 }
4979 
4980 /*
4981  * pmap_enter: enter a mapping into a pmap
4982  *
4983  * => must be done "now" ... no lazy-evaluation
4984  */
4985 int
4986 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4987              vm_prot_t prot, u_int flags, int domid)
4988 {
4989           pt_entry_t *ptes, opte, npte;
4990           pt_entry_t *ptep;
4991           pd_entry_t * const *pdes;
4992           struct vm_page *ptp;
4993           struct vm_page *new_pg, *old_pg;
4994           struct pmap_page *new_pp, *old_pp;
4995           struct pv_entry *old_pve, *new_pve;
4996           bool wired = (flags & PMAP_WIRED) != 0;
4997           struct pmap *pmap2;
4998           struct pmap_ptparray pt;
4999           int error;
5000           bool getptp, samepage, new_embedded;
5001           rb_tree_t *tree;
5002 
5003           KASSERT(pmap_initialized);
5004           KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5005           KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5006               PRIxVADDR " over PDP!", __func__, va);
5007           KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
5008               pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
5009               "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
5010 
5011 #ifdef XENPV
5012           KASSERT(domid == DOMID_SELF || pa == 0);
5013 #endif
5014 
5015           npte = ma | protection_codes[prot] | PTE_P;
5016           npte |= pmap_pat_flags(flags);
5017           if (wired)
5018                     npte |= PTE_WIRED;
5019           if (va < VM_MAXUSER_ADDRESS) {
5020                     KASSERTMSG(pmap != pmap_kernel(),
5021                         "entering user va %#"PRIxVADDR" into kernel pmap",
5022                         va);
5023                     if (pmap_is_user(pmap))
5024                               npte |= PTE_U;
5025           }
5026 
5027           if (pmap == pmap_kernel())
5028                     npte |= pmap_pg_g;
5029           if (flags & VM_PROT_ALL) {
5030                     npte |= PTE_A;
5031                     if (flags & VM_PROT_WRITE) {
5032                               KASSERT((npte & PTE_W) != 0);
5033                               npte |= PTE_D;
5034                     }
5035           }
5036 
5037 #ifdef XENPV
5038           if (domid != DOMID_SELF)
5039                     new_pg = NULL;
5040           else
5041 #endif
5042                     new_pg = PHYS_TO_VM_PAGE(pa);
5043 
5044           if (new_pg != NULL) {
5045                     /* This is a managed page */
5046                     npte |= PTE_PVLIST;
5047                     new_pp = VM_PAGE_TO_PP(new_pg);
5048                     PMAP_CHECK_PP(new_pp);
5049           } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
5050                     /* This is an unmanaged pv-tracked page */
5051                     npte |= PTE_PVLIST;
5052                     PMAP_CHECK_PP(new_pp);
5053           } else {
5054                     new_pp = NULL;
5055           }
5056 
5057           /* Begin by locking the pmap. */
5058           mutex_enter(&pmap->pm_lock);
5059 
5060           /* Look up the PTP.  Allocate if none present. */
5061           ptp = NULL;
5062           getptp = false;
5063           if (pmap != pmap_kernel()) {
5064                     ptp = pmap_find_ptp(pmap, va, 1);
5065                     if (ptp == NULL) {
5066                               getptp = true;
5067                               error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
5068                               if (error != 0) {
5069                                         if (flags & PMAP_CANFAIL) {
5070                                                   mutex_exit(&pmap->pm_lock);
5071                                                   return error;
5072                                         }
5073                                         panic("%s: get ptp failed, error=%d", __func__,
5074                                             error);
5075                               }
5076                     }
5077                     tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5078           } else {
5079                     /* Embedded PV entries rely on this. */
5080                     KASSERT(va != 0);
5081                     tree = &pmap_kernel_rb;
5082           }
5083 
5084           /*
5085            * Look up the old PV entry at this VA (if any), and insert a new PV
5086            * entry if required for the new mapping.  Temporarily track the old
5087            * and new mappings concurrently.  Only after the old mapping is
5088            * evicted from the pmap will we remove its PV entry.  Otherwise,
5089            * our picture of modified/accessed state for either page could get
5090            * out of sync (we need any P->V operation for either page to stall
5091            * on pmap->pm_lock until done here).
5092            */
5093           new_pve = NULL;
5094           old_pve = NULL;
5095           samepage = false;
5096           new_embedded = false;
5097 
5098           if (new_pp != NULL) {
5099                     error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
5100                         &old_pve, &samepage, &new_embedded, tree);
5101 
5102                     /*
5103                      * If a new pv_entry was needed and none was available, we
5104                      * can go no further.
5105                      */
5106                     if (error != 0) {
5107                               if (flags & PMAP_CANFAIL) {
5108                                         if (getptp) {
5109                                                   pmap_unget_ptp(pmap, &pt);
5110                                         }
5111                                         mutex_exit(&pmap->pm_lock);
5112                                         return error;
5113                               }
5114                               panic("%s: alloc pve failed", __func__);
5115                     }
5116           } else {
5117                     old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5118           }
5119 
5120           /* Map PTEs into address space. */
5121           pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5122 
5123           /* Install any newly allocated PTPs. */
5124           if (getptp) {
5125                     pmap_install_ptp(pmap, &pt, va, pdes);
5126           }
5127 
5128           /* Check if there is an existing mapping. */
5129           ptep = &ptes[pl1_i(va)];
5130           opte = *ptep;
5131           bool have_oldpa = pmap_valid_entry(opte);
5132           paddr_t oldpa = pmap_pte2pa(opte);
5133 
5134           /*
5135            * Update the pte.
5136            */
5137           do {
5138                     opte = *ptep;
5139 
5140                     /*
5141                      * if the same page, inherit PTE_A and PTE_D.
5142                      */
5143                     if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5144                               npte |= opte & (PTE_A | PTE_D);
5145                     }
5146 #if defined(XENPV)
5147                     if (domid != DOMID_SELF) {
5148                               /* pmap_pte_cas with error handling */
5149                               int s = splvm();
5150                               if (opte != *ptep) {
5151                                         splx(s);
5152                                         continue;
5153                               }
5154                               error = xpq_update_foreign(
5155                                   vtomach((vaddr_t)ptep), npte, domid, flags);
5156                               splx(s);
5157                               if (error) {
5158                                         /* Undo pv_entry tracking - oof. */
5159                                         if (new_pp != NULL) {
5160                                                   mutex_spin_enter(&new_pp->pp_lock);
5161                                                   if (new_pve != NULL) {
5162                                                             LIST_REMOVE(new_pve, pve_list);
5163                                                             KASSERT(pmap->pm_pve == NULL);
5164                                                             pmap->pm_pve = new_pve;
5165                                                   } else if (new_embedded) {
5166                                                             new_pp->pp_pte.pte_ptp = NULL;
5167                                                             new_pp->pp_pte.pte_va = 0;
5168                                                   }
5169                                                   mutex_spin_exit(&new_pp->pp_lock);
5170                                         }
5171                                         pmap_unmap_ptes(pmap, pmap2);
5172                                         /* Free new PTP. */
5173                                         if (ptp != NULL && ptp->wire_count <= 1) {
5174                                                   pmap_free_ptp(pmap, ptp, va, ptes,
5175                                                       pdes);
5176                                         }
5177                                         mutex_exit(&pmap->pm_lock);
5178                                         return error;
5179                               }
5180                               break;
5181                     }
5182 #endif /* defined(XENPV) */
5183           } while (pmap_pte_cas(ptep, opte, npte) != opte);
5184 
5185           /*
5186            * Done with the PTEs: they can now be unmapped.
5187            */
5188           pmap_unmap_ptes(pmap, pmap2);
5189 
5190           /*
5191            * Update statistics and PTP's reference count.
5192            */
5193           pmap_stats_update_bypte(pmap, npte, opte);
5194           if (ptp != NULL) {
5195                     if (!have_oldpa) {
5196                               ptp->wire_count++;
5197                     }
5198                     /* Remember minimum VA in PTP. */
5199                     pmap_ptp_range_set(ptp, va);
5200           }
5201           KASSERT(ptp == NULL || ptp->wire_count > 1);
5202 
5203           /*
5204            * If the same page, we can skip pv_entry handling.
5205            */
5206           if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5207                     KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5208                     if ((npte & PTE_PVLIST) != 0) {
5209                               KASSERT(samepage);
5210                               pmap_check_pv(pmap, ptp, new_pp, va, true);
5211                     }
5212                     goto same_pa;
5213           } else if ((npte & PTE_PVLIST) != 0) {
5214                     KASSERT(!samepage);
5215           }
5216 
5217           /*
5218            * If old page is pv-tracked, remove pv_entry from its list.
5219            */
5220           if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5221                     if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5222                               old_pp = VM_PAGE_TO_PP(old_pg);
5223                     } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5224                               panic("%s: PTE_PVLIST with pv-untracked page"
5225                                   " va = %#"PRIxVADDR
5226                                   " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5227                                   __func__, va, oldpa, atop(pa));
5228                     }
5229 
5230                     pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5231                         pmap_pte_to_pp_attrs(opte));
5232           } else {
5233                     KASSERT(old_pve == NULL);
5234                     KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5235           }
5236 
5237           /*
5238            * If new page is dynamically PV tracked, insert to tree.
5239            */
5240           if (new_pve != NULL) {
5241                     KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5242                     old_pve = rb_tree_insert_node(tree, new_pve);
5243                     KASSERT(old_pve == new_pve);
5244                     pmap_check_pv(pmap, ptp, new_pp, va, true);
5245           }
5246 
5247 same_pa:
5248           /*
5249            * shootdown tlb if necessary.
5250            */
5251 
5252           if ((~opte & (PTE_P | PTE_A)) == 0 &&
5253               ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5254                     pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5255           }
5256           pmap_drain_pv(pmap);
5257           mutex_exit(&pmap->pm_lock);
5258           return 0;
5259 }
5260 
5261 #if defined(XEN) && defined(DOM0OPS)
5262 
/*
 * Per-pmap record of one grant-mapped VA region.  Records are linked on a
 * singly-linked list whose head is stored in pmap->pm_data; the helpers
 * that walk or modify the list assert that pmap->pm_lock is held.
 */
struct pmap_data_gnt {
          SLIST_ENTRY(pmap_data_gnt) pd_gnt_list; /* linkage on the per-pmap list */
          vaddr_t pd_gnt_sva; /* first VA covered by this gnt */
          vaddr_t pd_gnt_eva; /* range covered by this gnt (end, exclusive) */
          int pd_gnt_refs; /* ref counter: pages of the range currently mapped */
          /*
           * One map operation per page of the range.  Declared with a single
           * element; the record is allocated with room for all entries.
           */
          struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
};
/* List head type; one instance per pmap that has grant mappings. */
SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5271 
5272 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5273 
5274 static struct pmap_data_gnt *
5275 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5276 {
5277           struct pmap_data_gnt_head *headp;
5278           struct pmap_data_gnt *pgnt;
5279 
5280           KASSERT(mutex_owned(&pmap->pm_lock));
5281           headp = pmap->pm_data;
5282           KASSERT(headp != NULL);
5283           SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5284                     if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5285                               return pgnt;
5286                     /* check that we're not overlapping part of a region */
5287                     KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5288           }
5289           return NULL;
5290 }
5291 
5292 static void
5293 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5294     const struct gnttab_map_grant_ref *ops)
5295 {
5296           struct pmap_data_gnt_head *headp;
5297           struct pmap_data_gnt *pgnt;
5298           vaddr_t eva = sva + nentries * PAGE_SIZE;
5299           KASSERT(mutex_owned(&pmap->pm_lock));
5300           KASSERT(nentries >= 1);
5301           if (pmap->pm_remove == NULL) {
5302                     pmap->pm_remove = pmap_remove_gnt;
5303                     KASSERT(pmap->pm_data == NULL);
5304                     headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5305                     SLIST_INIT(headp);
5306                     pmap->pm_data = headp;
5307           } else {
5308                     KASSERT(pmap->pm_remove == pmap_remove_gnt);
5309                     KASSERT(pmap->pm_data != NULL);
5310                     headp = pmap->pm_data;
5311           }
5312 
5313           pgnt = pmap_find_gnt(pmap, sva, eva);
5314           if (pgnt != NULL) {
5315                     KASSERT(pgnt->pd_gnt_sva == sva);
5316                     KASSERT(pgnt->pd_gnt_eva == eva);
5317                     return;
5318           }
5319 
5320           /* new entry */
5321           pgnt = kmem_alloc(sizeof(*pgnt) +
5322               (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5323           pgnt->pd_gnt_sva = sva;
5324           pgnt->pd_gnt_eva = eva;
5325           pgnt->pd_gnt_refs = 0;
5326           memcpy(pgnt->pd_gnt_ops, ops,
5327               sizeof(struct gnttab_map_grant_ref) * nentries);
5328           SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5329 }
5330 
5331 static void
5332 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5333 {
5334           struct pmap_data_gnt_head *headp = pmap->pm_data;
5335           int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5336           KASSERT(nentries >= 1);
5337           KASSERT(mutex_owned(&pmap->pm_lock));
5338           KASSERT(pgnt->pd_gnt_refs == 0);
5339           SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5340           kmem_free(pgnt, sizeof(*pgnt) +
5341                         (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5342           if (SLIST_EMPTY(headp)) {
5343                     kmem_free(headp, sizeof(*headp));
5344                     pmap->pm_data = NULL;
5345                     pmap->pm_remove = NULL;
5346           }
5347 }
5348 
/*
 * pmap_enter_gnt: enter a grant entry into a pmap
 *
 * => must be done "now" ... no lazy-evaluation
 * => pmap must not be the kernel pmap (asserted below)
 * => va is the page to map; sva, nentries and oops describe the whole
 *    grant region that va belongs to, with one map operation per page
 * => returns the error from pmap_get_ptp() if no PTP could be obtained;
 *    otherwise returns the status field of the map operation for va
 *    (GNTST_okay when the grant was mapped)
 */
int
pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
    const struct gnttab_map_grant_ref *oops)
{
          struct pmap_data_gnt *pgnt;
          pt_entry_t *ptes, opte;
#ifndef XENPV
          pt_entry_t npte;
#endif
          pt_entry_t *ptep;
          pd_entry_t * const *pdes;
          struct vm_page *ptp;
          struct vm_page *old_pg;
          struct pmap_page *old_pp;
          struct pv_entry *old_pve;
          struct pmap *pmap2;
          struct pmap_ptparray pt;
          int error;
          bool getptp;
          rb_tree_t *tree;
          struct gnttab_map_grant_ref *op;
          int ret;
          int idx;

          KASSERT(pmap_initialized);
          KASSERT(va < VM_MAX_KERNEL_ADDRESS);
          KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
              PRIxVADDR " over PDP!", __func__, va);
          KASSERT(pmap != pmap_kernel());

          /* Begin by locking the pmap. */
          mutex_enter(&pmap->pm_lock);
          /*
           * Record the region on first use.  When a record for it already
           * exists this call leaves it as is.
           */
          pmap_alloc_gnt(pmap, sva, nentries, oops);

          /* The page being entered must fall inside a recorded region. */
          pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
          KASSERT(pgnt != NULL);

          /* Look up the PTP.  Allocate if none present. */
          ptp = NULL;
          getptp = false;
          ptp = pmap_find_ptp(pmap, va, 1);
          if (ptp == NULL) {
                    getptp = true;
                    error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
                    if (error != 0) {
                              /*
                               * NOTE(review): the grant record set up above
                               * stays on the list on this path.
                               */
                              mutex_exit(&pmap->pm_lock);
                              return error;
                    }
          }
          tree = &VM_PAGE_TO_PP(ptp)->pp_rb;

          /*
           * Look up the old PV entry at this VA (if any), and insert a new PV
           * entry if required for the new mapping.  Temporarily track the old
           * and new mappings concurrently.  Only after the old mapping is
           * evicted from the pmap will we remove its PV entry.  Otherwise,
           * our picture of modified/accessed state for either page could get
           * out of sync (we need any P->V operation for either page to stall
           * on pmap->pm_lock until done here).
           *
           * Note that this function only performs the lookup; no new PV
           * entry is inserted for the grant mapping.
           */
          old_pve = NULL;

          old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);

          /* Map PTEs into address space. */
          pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);

          /* Install any newly allocated PTPs. */
          if (getptp) {
                    pmap_install_ptp(pmap, &pt, va, pdes);
          }

          /* Check if there is an existing mapping. */
          ptep = &ptes[pl1_i(va)];
          opte = *ptep;
          bool have_oldpa = pmap_valid_entry(opte);
          paddr_t oldpa = pmap_pte2pa(opte);

          /*
           * Update the pte.
           */

          /* Pick the map operation that belongs to this page of the region. */
          idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
          op = &pgnt->pd_gnt_ops[idx];

#ifdef XENPV
          /* host_addr is derived from the PTE pointer for this VA. */
          KASSERT(op->flags & GNTMAP_contains_pte);
          op->host_addr = xpmap_ptetomach(ptep);
#else
          /* host_addr was filled in by the caller; it is used for the PTE below. */
          KASSERT((op->flags & GNTMAP_contains_pte) == 0);
          KASSERT(op->flags != 0);
          KASSERT(op->host_addr != 0);
#endif
          op->dev_bus_addr = 0;
          /* Start from a failure status so an untouched op is not taken as success. */
          op->status = GNTST_general_error;
          ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
          if (__predict_false(ret)) {
                    printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
                        __func__, ret);
                    op->status = GNTST_general_error;
          }
          /*
           * Retry while the status is GNTST_eagain, pausing between
           * attempts and giving up after 256 tries.
           */
          for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
                    kpause("gntmap", false, mstohz(1), NULL);
                    ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
                    if (__predict_false(ret)) {
                              printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
                                  __func__, ret);
                              op->status = GNTST_general_error;
                    }
          }
          if (__predict_false(op->status != GNTST_okay)) {
                    printf("%s: GNTTABOP_map_grant_ref status: %d\n",
                        __func__, op->status);
                    if (have_oldpa) { /* XXX did the pte really change if XENPV  ?*/
                              ptp->wire_count--;
                    }
          } else {
#ifndef XENPV
                    /*
                     * Install the PTE ourselves: present, user, pmap_pg_nx,
                     * and writable unless the grant is read-only.
                     */
                    npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P;
                    if ((op->flags & GNTMAP_readonly) == 0)
                              npte |= PTE_W;
                    do {
                              opte = *ptep;
                    } while (pmap_pte_cas(ptep, opte, npte) != opte);
#endif
                    /* One more page of this region is mapped. */
                    pgnt->pd_gnt_refs++;
                    if (!have_oldpa) {
                              ptp->wire_count++;
                    }
                    KASSERT(ptp->wire_count > 1);
                    /* Remember minimum VA in PTP. */
                    pmap_ptp_range_set(ptp, va);
          }
          /*
           * Release the PTP if its wire count shows nothing is using it
           * (presumably a freshly allocated PTP whose grant map failed).
           */
          if (ptp->wire_count <= 1)
                    pmap_free_ptp(pmap, ptp, va, ptes, pdes);

          /*
           * Done with the PTEs: they can now be unmapped.
           */
          pmap_unmap_ptes(pmap, pmap2);

          /*
           * Update statistics and PTP's reference count.
           */
          pmap_stats_update_bypte(pmap, 0, opte);

          /*
           * If old page is pv-tracked, remove pv_entry from its list.
           */
          if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
                    if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
                              old_pp = VM_PAGE_TO_PP(old_pg);
                    } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
                              panic("%s: PTE_PVLIST with pv-untracked page"
                                  " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
                                  __func__, va, oldpa);
                    }

                    pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
                        pmap_pte_to_pp_attrs(opte));
          } else {
                    KASSERT(old_pve == NULL);
                    KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
          }

          pmap_drain_pv(pmap);
          mutex_exit(&pmap->pm_lock);
          /* NOTE(review): op points into pgnt and is read after the lock is dropped. */
          return op->status;
}
5523 
5524 /*
5525  * pmap_remove_gnt: grant mapping removal function.
5526  *
5527  * => caller should not be holding any pmap locks
5528  */
5529 static void
5530 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5531 {
5532           struct pmap_data_gnt *pgnt;
5533           pt_entry_t *ptes;
5534           pd_entry_t pde;
5535           pd_entry_t * const *pdes;
5536           struct vm_page *ptp;
5537           struct pmap *pmap2;
5538           vaddr_t va;
5539           int lvl;
5540           int idx;
5541           struct gnttab_map_grant_ref *op;
5542           struct gnttab_unmap_grant_ref unmap_op;
5543           int ret;
5544 
5545           KASSERT(pmap != pmap_kernel());
5546           KASSERT(pmap->pm_remove == pmap_remove_gnt);
5547 
5548           mutex_enter(&pmap->pm_lock);
5549           for (va = sva; va < eva; va += PAGE_SIZE) {
5550                     pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5551                     if (pgnt == NULL) {
5552                               pmap_remove_locked(pmap, sva, eva);
5553                               continue;
5554                     }
5555 
5556                     pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5557                     if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5558                               panic("pmap_remove_gnt pdes not valid");
5559                     }
5560 
5561                     idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5562                     op = &pgnt->pd_gnt_ops[idx];
5563                     KASSERT(lvl == 1);
5564 
5565                     /* Get PTP if non-kernel mapping. */
5566                     ptp = pmap_find_ptp(pmap, va, 1);
5567                     KASSERTMSG(ptp != NULL,
5568                         "%s: unmanaged PTP detected", __func__);
5569 
5570                     if (op->status == GNTST_okay)  {
5571                               KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5572 #ifdef XENPV
5573                               unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5574 #else
5575                               unmap_op.host_addr = op->host_addr;
5576                               pmap_pte_testset(&ptes[pl1_i(va)], 0);
5577 #endif
5578                               unmap_op.handle = op->handle;
5579                               unmap_op.dev_bus_addr = 0;
5580                               ret = HYPERVISOR_grant_table_op(
5581                                   GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5582                               if (ret) {
5583                                         printf("%s: GNTTABOP_unmap_grant_ref "
5584                                             "failed: %d\n", __func__, ret);
5585                               }
5586 
5587                               ptp->wire_count--;
5588                               pgnt->pd_gnt_refs--;
5589                     }
5590                     if (pgnt->pd_gnt_refs == 0) {
5591                               pmap_free_gnt(pmap, pgnt);
5592                     }
5593                     /*
5594                      * if mapping removed and the PTP is no longer
5595                      * being used, free it!
5596                      */
5597 
5598                     if (ptp->wire_count <= 1)
5599                               pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5600                     pmap_unmap_ptes(pmap, pmap2);
5601           }
5602           mutex_exit(&pmap->pm_lock);
5603 }
5604 #endif /* XEN && DOM0OPS */
5605 
5606 paddr_t
5607 pmap_get_physpage(void)
5608 {
5609           struct vm_page *ptp;
5610           struct pmap *kpm = pmap_kernel();
5611           paddr_t pa;
5612 
5613           if (!uvm.page_init_done) {
5614                     /*
5615                      * We're growing the kernel pmap early (from
5616                      * uvm_pageboot_alloc()). This case must be
5617                      * handled a little differently.
5618                      */
5619 
5620                     if (!uvm_page_physget(&pa))
5621                               panic("%s: out of memory", __func__);
5622 #if defined(__HAVE_DIRECT_MAP)
5623                     memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
5624 #else
5625 #if defined(XENPV)
5626                     if (XEN_VERSION_SUPPORTED(3, 4)) {
5627                               xen_pagezero(pa);
5628                               return pa;
5629                     }
5630 #endif
5631                     kpreempt_disable();
5632                     pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5633                         PTE_W | pmap_pg_nx);
5634                     pmap_pte_flush();
5635                     pmap_update_pg((vaddr_t)early_zerop);
5636                     memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
5637 #if defined(DIAGNOSTIC) || defined(XENPV)
5638                     pmap_pte_set(early_zero_pte, 0);
5639                     pmap_pte_flush();
5640 #endif /* defined(DIAGNOSTIC) */
5641                     kpreempt_enable();
5642 #endif /* defined(__HAVE_DIRECT_MAP) */
5643           } else {
5644                     /* XXX */
5645                     ptp = uvm_pagealloc(NULL, 0, NULL,
5646                                             UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5647                     if (ptp == NULL)
5648                               panic("%s: out of memory", __func__);
5649                     ptp->flags &= ~PG_BUSY;
5650                     ptp->wire_count = 1;
5651                     pa = VM_PAGE_TO_PHYS(ptp);
5652           }
5653           pmap_stats_update(kpm, 1, 0);
5654 
5655           return pa;
5656 }
5657 
5658 /*
5659  * Expand the page tree with the specified amount of PTPs, mapping virtual
5660  * addresses starting at kva. We populate all the levels but the last one
5661  * (L1). The nodes of the tree are created as RW, but the pages covered
5662  * will be kentered in L1, with proper permissions.
5663  *
5664  * Used only by pmap_growkernel.
5665  */
5666 static void
5667 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5668 {
5669           unsigned long i;
5670           paddr_t pa;
5671           unsigned long index, endindex;
5672           int level;
5673           pd_entry_t *pdep;
5674 #ifdef XENPV
5675           int s = splvm(); /* protect xpq_* */
5676 #endif
5677 
5678           for (level = PTP_LEVELS; level > 1; level--) {
5679                     if (level == PTP_LEVELS)
5680                               pdep = cpm->pm_pdir;
5681                     else
5682                               pdep = normal_pdes[level - 2];
5683                     index = pl_i_roundup(kva, level);
5684                     endindex = index + needed_ptps[level - 1] - 1;
5685 
5686                     for (i = index; i <= endindex; i++) {
5687                               pt_entry_t pte;
5688 
5689                               KASSERT(!pmap_valid_entry(pdep[i]));
5690                               pa = pmap_get_physpage();
5691                               pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5692 #ifdef __x86_64__
5693                               pte |= pmap_pg_nx;
5694 #endif
5695                               pmap_pte_set(&pdep[i], pte);
5696 
5697 #ifdef XENPV
5698                               if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5699                                         if (__predict_true(
5700                                             cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5701                                                   /* update per-cpu PMDs on all cpus */
5702                                                   xen_kpm_sync(pmap_kernel(), i);
5703                                         } else {
5704                                                   /*
5705                                                    * too early; update primary CPU
5706                                                    * PMD only (without locks)
5707                                                    */
5708 #ifdef __x86_64__
5709                                                   pd_entry_t *cpu_pdep =
5710                                                             &cpu_info_primary.ci_kpm_pdir[i];
5711 #else
5712                                                   pd_entry_t *cpu_pdep =
5713                                                       &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5714 #endif
5715                                                   pmap_pte_set(cpu_pdep, pte);
5716                                         }
5717                               }
5718 #endif
5719 
5720                               KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5721                                   pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5722                               nkptp[level - 1]++;
5723                     }
5724                     pmap_pte_flush();
5725           }
5726 #ifdef XENPV
5727           splx(s);
5728 #endif
5729 }
5730 
5731 /*
5732  * pmap_growkernel: increase usage of KVM space.
5733  *
5734  * => we allocate new PTPs for the kernel and install them in all
5735  *    the pmaps on the system.
5736  */
5737 vaddr_t
5738 pmap_growkernel(vaddr_t maxkvaddr)
5739 {
5740           struct pmap *kpm = pmap_kernel();
5741           struct pmap *cpm;
5742 #if !defined(XENPV) || !defined(__x86_64__)
5743           struct pmap *pm;
5744           long old;
5745 #endif
5746           int s, i;
5747           long needed_kptp[PTP_LEVELS], target_nptp;
5748           bool invalidate = false;
5749 
5750           s = splvm();        /* to be safe */
5751           mutex_enter(&kpm->pm_lock);
5752 
5753           if (maxkvaddr <= pmap_maxkvaddr) {
5754                     mutex_exit(&kpm->pm_lock);
5755                     splx(s);
5756                     return pmap_maxkvaddr;
5757           }
5758 
5759           maxkvaddr = x86_round_pdr(maxkvaddr);
5760 #if !defined(XENPV) || !defined(__x86_64__)
5761           old = nkptp[PTP_LEVELS - 1];
5762 #endif
5763 
5764           /* Initialize needed_kptp. */
5765           for (i = PTP_LEVELS - 1; i >= 1; i--) {
5766                     target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5767                         pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5768 
5769                     if (target_nptp > nkptpmax[i])
5770                               panic("out of KVA space");
5771                     KASSERT(target_nptp >= nkptp[i]);
5772                     needed_kptp[i] = target_nptp - nkptp[i];
5773           }
5774 
5775 #ifdef XENPV
5776           /* only pmap_kernel(), or the per-cpu map, has kernel entries */
5777           cpm = kpm;
5778 #else
5779           /* Get the current pmap */
5780           if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5781                     cpm = curcpu()->ci_pmap;
5782           } else {
5783                     cpm = kpm;
5784           }
5785 #endif
5786 
5787           kasan_shadow_map((void *)pmap_maxkvaddr,
5788               (size_t)(maxkvaddr - pmap_maxkvaddr));
5789           kmsan_shadow_map((void *)pmap_maxkvaddr,
5790               (size_t)(maxkvaddr - pmap_maxkvaddr));
5791 
5792           pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5793 
5794           /*
5795            * If the number of top level entries changed, update all pmaps.
5796            */
5797           if (needed_kptp[PTP_LEVELS - 1] != 0) {
5798 #ifdef XENPV
5799 #ifdef __x86_64__
5800                     /* nothing, kernel entries are never entered in user pmap */
5801 #else
5802                     int pdkidx;
5803 
5804                     mutex_enter(&pmaps_lock);
5805                     LIST_FOREACH(pm, &pmaps, pm_list) {
5806                               for (pdkidx = PDIR_SLOT_KERN + old;
5807                                   pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5808                                   pdkidx++) {
5809                                         pmap_pte_set(&pm->pm_pdir[pdkidx],
5810                                             kpm->pm_pdir[pdkidx]);
5811                               }
5812                               pmap_pte_flush();
5813                     }
5814                     mutex_exit(&pmaps_lock);
5815 #endif /* __x86_64__ */
5816 #else /* XENPV */
5817                     size_t newpdes;
5818                     newpdes = nkptp[PTP_LEVELS - 1] - old;
5819                     if (cpm != kpm) {
5820                               memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5821                                   &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5822                                   newpdes * sizeof(pd_entry_t));
5823                     }
5824 
5825                     mutex_enter(&pmaps_lock);
5826                     LIST_FOREACH(pm, &pmaps, pm_list) {
5827                               if (__predict_false(pm->pm_enter != NULL)) {
5828                                         /*
5829                                          * Not a native pmap, the kernel is not mapped,
5830                                          * so nothing to synchronize.
5831                                          */
5832                                         continue;
5833                               }
5834                               memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5835                                   &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5836                                   newpdes * sizeof(pd_entry_t));
5837                     }
5838                     mutex_exit(&pmaps_lock);
5839 #endif
5840                     invalidate = true;
5841           }
5842           pmap_maxkvaddr = maxkvaddr;
5843           mutex_exit(&kpm->pm_lock);
5844           splx(s);
5845 
5846           if (invalidate && pmap_initialized) {
5847                     /* Invalidate the pmap cache. */
5848                     pool_cache_invalidate(&pmap_cache);
5849           }
5850 
5851           return maxkvaddr;
5852 }
5853 
#ifdef DEBUG
void pmap_dump(struct pmap *, vaddr_t, vaddr_t);

/*
 * pmap_dump: print every valid mapping of a pmap in [sva, eva)
 *
 * => caller should not be holding any pmap locks
 * => eva is replaced by VM_MAXUSER_ADDRESS when it lies beyond it, or
 *    when the range is empty/inverted (eva <= sva)
 */
void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
	pd_entry_t * const *pdes;
	pt_entry_t *ptes, *pte;
	struct pmap *pmap2;
	vaddr_t blkend;
	int lvl;

	/* Anything but a sane, in-range end address means "dump to max". */
	if (!(eva > sva && eva <= VM_MAXUSER_ADDRESS))
		eva = VM_MAXUSER_ADDRESS;

	mutex_enter(&pmap->pm_lock);
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);

	/*
	 * Walk the range one PTP-sized block at a time.
	 */
	while (sva < eva) {
		/* End of the current block, clipped to the range. */
		blkend = x86_round_pdr(sva + 1);
		if (blkend > eva)
			blkend = eva;

		/* Only look at the PTEs if the block has a valid PTP. */
		if (pmap_pdes_valid(sva, pdes, NULL, &lvl)) {
			KASSERT(lvl == 1);

			for (pte = &ptes[pl1_i(sva)]; sva < blkend;
			    sva += PAGE_SIZE, pte++) {
				if (pmap_valid_entry(*pte)) {
					printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
					    " (pte=%#" PRIxPADDR ")\n",
					    sva, (paddr_t)pmap_pte2pa(*pte),
					    (paddr_t)*pte);
				}
			}
		}

		sva = blkend;
	}
	pmap_unmap_ptes(pmap, pmap2);
	mutex_exit(&pmap->pm_lock);
}
#endif
5911 
5912 /*
5913  * pmap_update: process deferred invalidations and frees.
5914  */
5915 void
5916 pmap_update(struct pmap *pmap)
5917 {
5918           struct pmap_page *pp;
5919           struct vm_page *ptp;
5920 
5921           /*
5922            * Initiate any pending TLB shootdowns.  Wait for them to
5923            * complete before returning control to the caller.
5924            */
5925           kpreempt_disable();
5926           pmap_tlb_shootnow();
5927           kpreempt_enable();
5928 
5929           /*
5930            * Now that shootdowns are complete, process deferred frees.  This
5931            * is an unlocked check, but is safe as we're only interested in
5932            * work done in this LWP - we won't get a false negative.
5933            */
5934           if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5935                     return;
5936           }
5937 
5938           mutex_enter(&pmap->pm_lock);
5939           while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5940                     KASSERT(ptp->wire_count == 0);
5941                     KASSERT(ptp->uanon == NULL);
5942                     LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5943                     pp = VM_PAGE_TO_PP(ptp);
5944                     LIST_INIT(&pp->pp_pvlist);
5945                     pp->pp_attrs = 0;
5946                     pp->pp_pte.pte_ptp = NULL;
5947                     pp->pp_pte.pte_va = 0;
5948                     PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5949 
5950                     /*
5951                      * XXX Hack to avoid extra locking, and lock
5952                      * assertions in uvm_pagefree().  Despite uobject
5953                      * being set, this isn't a managed page.
5954                      */
5955                     PMAP_DUMMY_LOCK(pmap);
5956                     uvm_pagerealloc(ptp, NULL, 0);
5957                     PMAP_DUMMY_UNLOCK(pmap);
5958                     uvm_pagefree(ptp);
5959           }
5960           mutex_exit(&pmap->pm_lock);
5961 }
5962 
5963 #if PTP_LEVELS > 4
5964 #error "Unsupported number of page table mappings"
5965 #endif
5966 
5967 paddr_t
5968 pmap_init_tmp_pgtbl(paddr_t pg)
5969 {
5970           static bool maps_loaded;
5971           static const paddr_t x86_tmp_pml_paddr[] = {
5972               4 * PAGE_SIZE,  /* L1 */
5973               5 * PAGE_SIZE,  /* L2 */
5974               6 * PAGE_SIZE,  /* L3 */
5975               7 * PAGE_SIZE   /* L4 */
5976           };
5977           static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5978 
5979           pd_entry_t *tmp_pml, *kernel_pml;
5980 
5981           int level;
5982 
5983           if (!maps_loaded) {
5984                     for (level = 0; level < PTP_LEVELS; ++level) {
5985                               x86_tmp_pml_vaddr[level] =
5986                                   uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5987                                   UVM_KMF_VAONLY);
5988 
5989                               if (x86_tmp_pml_vaddr[level] == 0)
5990                                         panic("mapping of real mode PML failed\n");
5991                               pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5992                                   x86_tmp_pml_paddr[level],
5993                                   VM_PROT_READ | VM_PROT_WRITE, 0);
5994                     }
5995                     pmap_update(pmap_kernel());
5996                     maps_loaded = true;
5997           }
5998 
5999           /* Zero levels 1-3 */
6000           for (level = 0; level < PTP_LEVELS - 1; ++level) {
6001                     tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6002                     memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
6003           }
6004 
6005           /* Copy PML4 */
6006           kernel_pml = pmap_kernel()->pm_pdir;
6007           tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
6008           memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
6009 
6010 #ifdef PAE
6011           /*
6012            * Use the last 4 entries of the L2 page as L3 PD entries. These
6013            * last entries are unlikely to be used for temporary mappings.
6014            * 508: maps 0->1GB (userland)
6015            * 509: unused
6016            * 510: unused
6017            * 511: maps 3->4GB (kernel)
6018            */
6019           tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
6020           tmp_pml[509] = 0;
6021           tmp_pml[510] = 0;
6022           tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
6023 #endif
6024 
6025           for (level = PTP_LEVELS - 1; level > 0; --level) {
6026                     tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6027 
6028                     tmp_pml[pl_i(pg, level + 1)] =
6029                         (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
6030           }
6031 
6032           tmp_pml = (void *)x86_tmp_pml_vaddr[0];
6033           tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
6034 
6035 #ifdef PAE
6036           /* Return the PA of the L3 page (entry 508 of the L2 page) */
6037           return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
6038 #endif
6039 
6040           return x86_tmp_pml_paddr[PTP_LEVELS - 1];
6041 }
6042 
6043 u_int
6044 x86_mmap_flags(paddr_t mdpgno)
6045 {
6046           u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
6047           u_int pflag = 0;
6048 
6049           if (nflag & X86_MMAP_FLAG_PREFETCH)
6050                     pflag |= PMAP_WRITE_COMBINE;
6051 
6052           return pflag;
6053 }
6054 
6055 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
6056 
6057 /*
6058  * -----------------------------------------------------------------------------
6059  * *****************************************************************************
6060  * *****************************************************************************
6061  * *****************************************************************************
6062  * *****************************************************************************
6063  * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
6064  * *****************************************************************************
6065  * *****************************************************************************
6066  * *****************************************************************************
6067  * *****************************************************************************
6068  * -----------------------------------------------------------------------------
6069  *
6070  * These functions are invoked as callbacks from the code above. Contrary to
6071  * native, EPT does not have a recursive slot; therefore, it is not possible
6072  * to call pmap_map_ptes(). Instead, we use the direct map and walk down the
6073  * tree manually.
6074  *
6075  * Apart from that, the logic is mostly the same as native. Once a pmap has
6076  * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
6077  * After that we're good, and the callbacks will handle the translations
6078  * for us.
6079  *
6080  * -----------------------------------------------------------------------------
6081  */
6082 
/* Hardware bits. */
#define EPT_R		__BIT(0)	/* read */
#define EPT_W		__BIT(1)	/* write */
#define EPT_X		__BIT(2)	/* execute */
#define EPT_T		__BITS(5,3)	/* type */
/* Values for the EPT_T field (shifted in with __SHIFTIN). */
#define		TYPE_UC	0
#define		TYPE_WC	1
#define		TYPE_WT	4
#define		TYPE_WP	5
#define		TYPE_WB	6
#define EPT_NOPAT	__BIT(6)
#define EPT_L		__BIT(7)	/* large */
#define EPT_A		__BIT(8)	/* accessed */
#define EPT_D		__BIT(9)	/* dirty */
/* Software bits. */
#define EPT_PVLIST	__BIT(60)
#define EPT_WIRED	__BIT(61)

/*
 * An EPT entry is considered valid when its read bit is set.  The
 * argument is parenthesized so that an expression argument (for
 * example "a | b") is masked as a whole rather than having only its
 * last operand bind to the '&'.
 */
#define pmap_ept_valid_entry(pte)	((pte) & EPT_R)
6102 
/*
 * When true, the EPT_A/EPT_D bits of an EPT entry are consulted to
 * decide whether a page was accessed or modified.  When false, the code
 * below conservatively treats every valid mapping as both accessed and
 * dirty.  NOTE(review): presumably set by the VMX code according to
 * hardware support for EPT accessed/dirty flags -- confirm at the setter.
 */
bool pmap_ept_has_ad __read_mostly;
6104 
6105 static inline void
6106 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
6107 {
6108           int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
6109           int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
6110 
6111           KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6112           KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6113 
6114           pmap_stats_update(pmap, resid_diff, wired_diff);
6115 }
6116 
6117 static pt_entry_t
6118 pmap_ept_type(u_int flags)
6119 {
6120           u_int cacheflags = (flags & PMAP_CACHE_MASK);
6121           pt_entry_t ret;
6122 
6123           switch (cacheflags) {
6124           case PMAP_NOCACHE:
6125           case PMAP_NOCACHE_OVR:
6126                     ret = __SHIFTIN(TYPE_UC, EPT_T);
6127                     break;
6128           case PMAP_WRITE_COMBINE:
6129                     ret = __SHIFTIN(TYPE_WC, EPT_T);
6130                     break;
6131           case PMAP_WRITE_BACK:
6132           default:
6133                     ret = __SHIFTIN(TYPE_WB, EPT_T);
6134                     break;
6135           }
6136 
6137           ret |= EPT_NOPAT;
6138           return ret;
6139 }
6140 
6141 static inline pt_entry_t
6142 pmap_ept_prot(vm_prot_t prot)
6143 {
6144           pt_entry_t res = 0;
6145 
6146           if (prot & VM_PROT_READ)
6147                     res |= EPT_R;
6148           if (prot & VM_PROT_WRITE)
6149                     res |= EPT_W;
6150           if (prot & VM_PROT_EXECUTE)
6151                     res |= EPT_X;
6152 
6153           return res;
6154 }
6155 
6156 static inline uint8_t
6157 pmap_ept_to_pp_attrs(pt_entry_t ept)
6158 {
6159           uint8_t ret = 0;
6160           if (pmap_ept_has_ad) {
6161                     if (ept & EPT_D)
6162                               ret |= PP_ATTRS_D;
6163                     if (ept & EPT_A)
6164                               ret |= PP_ATTRS_A;
6165           } else {
6166                     ret |= (PP_ATTRS_D|PP_ATTRS_A);
6167           }
6168           if (ept & EPT_W)
6169                     ret |= PP_ATTRS_W;
6170           return ret;
6171 }
6172 
6173 static inline pt_entry_t
6174 pmap_pp_attrs_to_ept(uint8_t attrs)
6175 {
6176           pt_entry_t ept = 0;
6177           if (attrs & PP_ATTRS_D)
6178                     ept |= EPT_D;
6179           if (attrs & PP_ATTRS_A)
6180                     ept |= EPT_A;
6181           if (attrs & PP_ATTRS_W)
6182                     ept |= EPT_W;
6183           return ept;
6184 }
6185 
6186 /*
6187  * Helper for pmap_ept_free_ptp.
6188  * tree[0] = &L2[L2idx]
6189  * tree[1] = &L3[L3idx]
6190  * tree[2] = &L4[L4idx]
6191  */
6192 static void
6193 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
6194 {
6195           pt_entry_t *pteva;
6196           paddr_t ptepa;
6197           int i, index;
6198 
6199           ptepa = pmap->pm_pdirpa[0];
6200           for (i = PTP_LEVELS; i > 1; i--) {
6201                     index = pl_pi(va, i);
6202                     pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6203                     KASSERT(pmap_ept_valid_entry(pteva[index]));
6204                     tree[i - 2] = &pteva[index];
6205                     ptepa = pmap_pte2pa(pteva[index]);
6206           }
6207 }
6208 
6209 static void
6210 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
6211 {
6212           pd_entry_t *tree[3];
6213           int level;
6214 
6215           KASSERT(pmap != pmap_kernel());
6216           KASSERT(mutex_owned(&pmap->pm_lock));
6217           KASSERT(kpreempt_disabled());
6218 
6219           pmap_ept_get_tree(pmap, va, tree);
6220 
6221           level = 1;
6222           do {
6223                     (void)pmap_pte_testset(tree[level - 1], 0);
6224 
6225                     pmap_freepage(pmap, ptp, level);
6226                     if (level < PTP_LEVELS - 1) {
6227                               ptp = pmap_find_ptp(pmap, va, level + 1);
6228                               ptp->wire_count--;
6229                               if (ptp->wire_count > 1)
6230                                         break;
6231                     }
6232           } while (++level < PTP_LEVELS);
6233           pmap_pte_flush();
6234 }
6235 
6236 /* Allocate L4->L3->L2. Return L2. */
6237 static void
6238 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6239 {
6240           struct vm_page *ptp;
6241           unsigned long index;
6242           pd_entry_t *pteva;
6243           paddr_t ptepa;
6244           int i;
6245 
6246           KASSERT(pmap != pmap_kernel());
6247           KASSERT(mutex_owned(&pmap->pm_lock));
6248           KASSERT(kpreempt_disabled());
6249 
6250           /*
6251            * Now that we have all the pages looked up or allocated,
6252            * loop through again installing any new ones into the tree.
6253            */
6254           ptepa = pmap->pm_pdirpa[0];
6255           for (i = PTP_LEVELS; i > 1; i--) {
6256                     index = pl_pi(va, i);
6257                     pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6258 
6259                     if (pmap_ept_valid_entry(pteva[index])) {
6260                               KASSERT(!pt->alloced[i]);
6261                               ptepa = pmap_pte2pa(pteva[index]);
6262                               continue;
6263                     }
6264 
6265                     ptp = pt->pg[i];
6266                     ptp->flags &= ~PG_BUSY; /* never busy */
6267                     ptp->wire_count = 1;
6268                     pmap->pm_ptphint[i - 2] = ptp;
6269                     ptepa = VM_PAGE_TO_PHYS(ptp);
6270                     pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6271 
6272                     pmap_pte_flush();
6273                     pmap_stats_update(pmap, 1, 0);
6274 
6275                     /*
6276                      * If we're not in the top level, increase the
6277                      * wire count of the parent page.
6278                      */
6279                     if (i < PTP_LEVELS) {
6280                               pt->pg[i + 1]->wire_count++;
6281                     }
6282           }
6283 }
6284 
6285 static int
6286 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6287     u_int flags)
6288 {
6289           pt_entry_t *ptes, opte, npte;
6290           pt_entry_t *ptep;
6291           struct vm_page *ptp;
6292           struct vm_page *new_pg, *old_pg;
6293           struct pmap_page *new_pp, *old_pp;
6294           struct pv_entry *old_pve, *new_pve;
6295           bool wired = (flags & PMAP_WIRED) != 0;
6296           bool accessed;
6297           struct pmap_ptparray pt;
6298           int error;
6299           bool getptp, samepage, new_embedded;
6300           rb_tree_t *tree;
6301 
6302           KASSERT(pmap_initialized);
6303           KASSERT(va < VM_MAXUSER_ADDRESS);
6304 
6305           npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6306 
6307           if (wired)
6308                     npte |= EPT_WIRED;
6309           if (flags & VM_PROT_ALL) {
6310                     npte |= EPT_A;
6311                     if (flags & VM_PROT_WRITE) {
6312                               KASSERT((npte & EPT_W) != 0);
6313                               npte |= EPT_D;
6314                     }
6315           }
6316 
6317           new_pg = PHYS_TO_VM_PAGE(pa);
6318           if (new_pg != NULL) {
6319                     /* This is a managed page */
6320                     npte |= EPT_PVLIST;
6321                     new_pp = VM_PAGE_TO_PP(new_pg);
6322           } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6323                     /* This is an unmanaged pv-tracked page */
6324                     npte |= EPT_PVLIST;
6325           } else {
6326                     new_pp = NULL;
6327           }
6328 
6329           /* Begin by locking the pmap. */
6330           mutex_enter(&pmap->pm_lock);
6331 
6332           /* Look up the PTP.  Allocate if none present. */
6333           ptp = NULL;
6334           getptp = false;
6335           if (pmap != pmap_kernel()) {
6336                     ptp = pmap_find_ptp(pmap, va, 1);
6337                     if (ptp == NULL) {
6338                               getptp = true;
6339                               error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6340                               if (error != 0) {
6341                                         if (flags & PMAP_CANFAIL) {
6342                                                   mutex_exit(&pmap->pm_lock);
6343                                                   return error;
6344                                         }
6345                                         panic("%s: get ptp failed, error=%d", __func__,
6346                                             error);
6347                               }
6348                     }
6349                     tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6350           } else {
6351                     /* Embedded PV entries rely on this. */
6352                     KASSERT(va != 0);
6353                     tree = &pmap_kernel_rb;
6354           }
6355 
6356           /*
6357            * Look up the old PV entry at this VA (if any), and insert a new PV
6358            * entry if required for the new mapping.  Temporarily track the old
6359            * and new mappings concurrently.  Only after the old mapping is
6360            * evicted from the pmap will we remove its PV entry.  Otherwise,
6361            * our picture of modified/accessed state for either page could get
6362            * out of sync (we need any P->V operation for either page to stall
6363            * on pmap->pm_lock until done here).
6364            */
6365           new_pve = NULL;
6366           old_pve = NULL;
6367           samepage = false;
6368           new_embedded = false;
6369 
6370           if (new_pp != NULL) {
6371                     error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6372                         &old_pve, &samepage, &new_embedded, tree);
6373 
6374                     /*
6375                      * If a new pv_entry was needed and none was available, we
6376                      * can go no further.
6377                      */
6378                     if (error != 0) {
6379                               if (flags & PMAP_CANFAIL) {
6380                                         if (getptp) {
6381                                                   pmap_unget_ptp(pmap, &pt);
6382                                         }
6383                                         mutex_exit(&pmap->pm_lock);
6384                                         return error;
6385                               }
6386                               panic("%s: alloc pve failed", __func__);
6387                     }
6388           } else {
6389                     old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6390           }
6391 
6392           /* Map PTEs into address space. */
6393           kpreempt_disable();
6394 
6395           /* Install any newly allocated PTPs. */
6396           if (getptp) {
6397                     pmap_ept_install_ptp(pmap, &pt, va);
6398           }
6399 
6400           /* Check if there is an existing mapping. */
6401           ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6402           ptep = &ptes[pl1_pi(va)];
6403           opte = *ptep;
6404           bool have_oldpa = pmap_ept_valid_entry(opte);
6405           paddr_t oldpa = pmap_pte2pa(opte);
6406 
6407           /*
6408            * Update the pte.
6409            */
6410           do {
6411                     opte = *ptep;
6412 
6413                     /*
6414                      * if the same page, inherit PTE_A and PTE_D.
6415                      */
6416                     if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6417                               npte |= opte & (EPT_A | EPT_D);
6418                     }
6419           } while (pmap_pte_cas(ptep, opte, npte) != opte);
6420 
6421           /*
6422            * Done with the PTEs: they can now be unmapped.
6423            */
6424           kpreempt_enable();
6425 
6426           /*
6427            * Update statistics and PTP's reference count.
6428            */
6429           pmap_ept_stats_update_bypte(pmap, npte, opte);
6430           if (ptp != NULL) {
6431                     if (!have_oldpa) {
6432                               ptp->wire_count++;
6433                     }
6434                     /* Remember minimum VA in PTP. */
6435                     pmap_ptp_range_set(ptp, va);
6436           }
6437           KASSERT(ptp == NULL || ptp->wire_count > 1);
6438 
6439           /*
6440            * If the same page, we can skip pv_entry handling.
6441            */
6442           if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6443                     KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6444                     if ((npte & EPT_PVLIST) != 0) {
6445                               KASSERT(samepage);
6446                               pmap_check_pv(pmap, ptp, new_pp, va, true);
6447                     }
6448                     goto same_pa;
6449           } else if ((npte & EPT_PVLIST) != 0) {
6450                     KASSERT(!samepage);
6451           }
6452 
6453           /*
6454            * If old page is pv-tracked, remove pv_entry from its list.
6455            */
6456           if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6457                     if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6458                               old_pp = VM_PAGE_TO_PP(old_pg);
6459                     } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6460                               panic("%s: EPT_PVLIST with pv-untracked page"
6461                                   " va = %#"PRIxVADDR
6462                                   " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6463                                   __func__, va, oldpa, atop(pa));
6464                     }
6465 
6466                     pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6467                         pmap_ept_to_pp_attrs(opte));
6468           } else {
6469                     KASSERT(old_pve == NULL);
6470                     KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6471           }
6472 
6473           /*
6474            * If new page is dynamically PV tracked, insert to tree.
6475            */
6476           if (new_pve != NULL) {
6477                     KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6478                     old_pve = rb_tree_insert_node(tree, new_pve);
6479                     KASSERT(old_pve == new_pve);
6480                     pmap_check_pv(pmap, ptp, new_pp, va, true);
6481           }
6482 
6483 same_pa:
6484           /*
6485            * shootdown tlb if necessary.
6486            */
6487 
6488           if (pmap_ept_has_ad) {
6489                     accessed = (~opte & (EPT_R | EPT_A)) == 0;
6490           } else {
6491                     accessed = (opte & EPT_R) != 0;
6492           }
6493           if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6494                     pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6495           }
6496           pmap_drain_pv(pmap);
6497           mutex_exit(&pmap->pm_lock);
6498           return 0;
6499 }
6500 
6501 /* Pay close attention, this returns L2. */
6502 static int
6503 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6504 {
6505           pt_entry_t *pteva;
6506           paddr_t ptepa;
6507           int i, index;
6508 
6509           KASSERT(mutex_owned(&pmap->pm_lock));
6510 
6511           ptepa = pmap->pm_pdirpa[0];
6512           for (i = PTP_LEVELS; i > 1; i--) {
6513                     pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6514                     index = pl_pi(va, i);
6515                     if (!pmap_ept_valid_entry(pteva[index]))
6516                               return i;
6517                     ptepa = pmap_pte2pa(pteva[index]);
6518           }
6519           if (lastpde != NULL) {
6520                     *lastpde = pteva[index];
6521           }
6522 
6523           return 0;
6524 }
6525 
6526 static bool
6527 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6528 {
6529           pt_entry_t *ptes, pte;
6530           pd_entry_t pde;
6531           paddr_t ptppa, pa;
6532           bool rv;
6533 
6534 #ifdef __HAVE_DIRECT_MAP
6535           if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6536                     if (pap != NULL) {
6537                               *pap = PMAP_DIRECT_UNMAP(va);
6538                     }
6539                     return true;
6540           }
6541 #endif
6542 
6543           rv = false;
6544           pa = 0;
6545 
6546           mutex_enter(&pmap->pm_lock);
6547           kpreempt_disable();
6548 
6549           if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6550                     ptppa = pmap_pte2pa(pde);
6551                     ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6552                     pte = ptes[pl1_pi(va)];
6553                     if (__predict_true((pte & EPT_R) != 0)) {
6554                               pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6555                               rv = true;
6556                     }
6557           }
6558 
6559           kpreempt_enable();
6560           mutex_exit(&pmap->pm_lock);
6561 
6562           if (pap != NULL) {
6563                     *pap = pa;
6564           }
6565           return rv;
6566 }
6567 
6568 static bool
6569 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6570     vaddr_t va)
6571 {
6572           struct pv_entry *pve;
6573           struct vm_page *pg;
6574           struct pmap_page *pp;
6575           pt_entry_t opte;
6576           bool accessed;
6577 
6578           KASSERT(pmap != pmap_kernel());
6579           KASSERT(mutex_owned(&pmap->pm_lock));
6580           KASSERT(kpreempt_disabled());
6581 
6582           if (!pmap_ept_valid_entry(*pte)) {
6583                     /* VA not mapped. */
6584                     return false;
6585           }
6586 
6587           /* Atomically save the old PTE and zap it. */
6588           opte = pmap_pte_testset(pte, 0);
6589           if (!pmap_ept_valid_entry(opte)) {
6590                     return false;
6591           }
6592 
6593           pmap_ept_stats_update_bypte(pmap, 0, opte);
6594 
6595           if (ptp) {
6596                     /*
6597                      * Dropping a PTE.  Make sure that the PDE is flushed.
6598                      */
6599                     ptp->wire_count--;
6600                     if (ptp->wire_count <= 1) {
6601                               opte |= EPT_A;
6602                     }
6603           }
6604 
6605           if (pmap_ept_has_ad) {
6606                     accessed = (opte & EPT_A) != 0;
6607           } else {
6608                     accessed = true;
6609           }
6610           if (accessed) {
6611                     pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6612           }
6613 
6614           /*
6615            * If we are not on a pv list - we are done.
6616            */
6617           if ((opte & EPT_PVLIST) == 0) {
6618                     KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6619                         "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6620                     KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6621                         "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6622                     KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6623                         &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6624                     return true;
6625           }
6626 
6627           if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6628                     pp = VM_PAGE_TO_PP(pg);
6629           } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6630                     paddr_t pa = pmap_pte2pa(opte);
6631                     panic("%s: EPT_PVLIST with pv-untracked page"
6632                         " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6633                         __func__, va, pa, atop(pa));
6634           }
6635 
6636           /* Sync R/M bits. */
6637           pve = pmap_lookup_pv(pmap, ptp, pp, va);
6638           pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6639           return true;
6640 }
6641 
6642 static void
6643 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6644     vaddr_t startva, vaddr_t endva)
6645 {
6646           pt_entry_t *pte = (pt_entry_t *)ptpva;
6647 
6648           KASSERT(pmap != pmap_kernel());
6649           KASSERT(mutex_owned(&pmap->pm_lock));
6650           KASSERT(kpreempt_disabled());
6651 
6652           /*
6653            * mappings are very often sparse, so clip the given range to the
6654            * range of PTEs that are known present in the PTP.
6655            */
6656           pmap_ptp_range_clip(ptp, &startva, &pte);
6657 
6658           /*
6659            * note that ptpva points to the PTE that maps startva.   this may
6660            * or may not be the first PTE in the PTP.
6661            *
6662            * we loop through the PTP while there are still PTEs to look at
6663            * and the wire_count is greater than 1 (because we use the wire_count
6664            * to keep track of the number of real PTEs in the PTP).
6665            */
6666           while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6667                     (void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6668                     startva += PAGE_SIZE;
6669                     pte++;
6670           }
6671 }
6672 
6673 static void
6674 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6675 {
6676           pt_entry_t *ptes;
6677           pd_entry_t pde;
6678           paddr_t ptppa;
6679           vaddr_t blkendva, va = sva;
6680           struct vm_page *ptp;
6681 
6682           mutex_enter(&pmap->pm_lock);
6683           kpreempt_disable();
6684 
6685           for (/* null */ ; va < eva ; va = blkendva) {
6686                     int lvl;
6687 
6688                     /* determine range of block */
6689                     blkendva = x86_round_pdr(va+1);
6690                     if (blkendva > eva)
6691                               blkendva = eva;
6692 
6693                     lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6694                     if (lvl != 0) {
6695                               /* Skip a range corresponding to an invalid pde. */
6696                               blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6697                               continue;
6698                     }
6699 
6700                     /* PA of the PTP */
6701                     ptppa = pmap_pte2pa(pde);
6702 
6703                     ptp = pmap_find_ptp(pmap, va, 1);
6704                     KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6705                         __func__);
6706 
6707                     ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6708 
6709                     pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6710                         blkendva);
6711 
6712                     /* If PTP is no longer being used, free it. */
6713                     if (ptp && ptp->wire_count <= 1) {
6714                               pmap_ept_free_ptp(pmap, ptp, va);
6715                     }
6716           }
6717 
6718           kpreempt_enable();
6719           pmap_drain_pv(pmap);
6720           mutex_exit(&pmap->pm_lock);
6721 }
6722 
6723 static int
6724 pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
6725     uint8_t *oattrs, pt_entry_t *optep)
6726 {
6727           struct pmap *pmap;
6728           pt_entry_t *ptep;
6729           pt_entry_t opte;
6730           pt_entry_t npte;
6731           pt_entry_t expect;
6732           bool need_shootdown;
6733 
6734           expect = pmap_pa2pte(pa) | EPT_R;
6735           pmap = ptp_to_pmap(ptp);
6736 
6737           if (clearbits != ~0) {
6738                     KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
6739                     clearbits = pmap_pp_attrs_to_ept(clearbits);
6740           }
6741 
6742           ptep = pmap_map_pte(pmap, ptp, va);
6743           do {
6744                     opte = *ptep;
6745                     KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
6746                     KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
6747                     KASSERT(opte == 0 || (opte & EPT_R) != 0);
6748                     if ((opte & (PTE_FRAME | EPT_R)) != expect) {
6749                               /*
6750                                * We lost a race with a V->P operation like
6751                                * pmap_remove().  Wait for the competitor
6752                                * reflecting pte bits into mp_attrs.
6753                                */
6754                               pmap_unmap_pte();
6755                               return EAGAIN;
6756                     }
6757 
6758                     /*
6759                      * Check if there's anything to do on this PTE.
6760                      */
6761                     if ((opte & clearbits) == 0) {
6762                               need_shootdown = false;
6763                               break;
6764                     }
6765 
6766                     /*
6767                      * We need a shootdown if the PTE is cached (EPT_A) ...
6768                      * ... Unless we are clearing only the EPT_W bit and
6769                      * it isn't cached as RW (EPT_D).
6770                      */
6771                     if (pmap_ept_has_ad) {
6772                               need_shootdown = (opte & EPT_A) != 0 &&
6773                                   !(clearbits == EPT_W && (opte & EPT_D) == 0);
6774                     } else {
6775                               need_shootdown = true;
6776                     }
6777 
6778                     npte = opte & ~clearbits;
6779 
6780                     /*
6781                      * If we need a shootdown anyway, clear EPT_A and EPT_D.
6782                      */
6783                     if (need_shootdown) {
6784                               npte &= ~(EPT_A | EPT_D);
6785                     }
6786                     KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
6787                     KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
6788                     KASSERT(npte == 0 || (opte & EPT_R) != 0);
6789           } while (pmap_pte_cas(ptep, opte, npte) != opte);
6790 
6791           if (need_shootdown) {
6792                     pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
6793           }
6794           pmap_unmap_pte();
6795 
6796           *oattrs = pmap_ept_to_pp_attrs(opte);
6797           if (optep != NULL)
6798                     *optep = opte;
6799           return 0;
6800 }
6801 
6802 static void
6803 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6804     vaddr_t va)
6805 {
6806 
6807           KASSERT(mutex_owned(&pmap->pm_lock));
6808 
6809           pmap_ept_stats_update_bypte(pmap, 0, opte);
6810           ptp->wire_count--;
6811           if (ptp->wire_count <= 1) {
6812                     pmap_ept_free_ptp(pmap, ptp, va);
6813           }
6814 }
6815 
6816 static void
6817 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6818 {
6819           pt_entry_t bit_rem;
6820           pt_entry_t *ptes, *spte;
6821           pt_entry_t opte, npte;
6822           pd_entry_t pde;
6823           paddr_t ptppa;
6824           vaddr_t va;
6825           bool modified;
6826 
6827           bit_rem = 0;
6828           if (!(prot & VM_PROT_WRITE))
6829                     bit_rem = EPT_W;
6830 
6831           sva &= PTE_FRAME;
6832           eva &= PTE_FRAME;
6833 
6834           /* Acquire pmap. */
6835           mutex_enter(&pmap->pm_lock);
6836           kpreempt_disable();
6837 
6838           for (va = sva; va < eva; va += PAGE_SIZE) {
6839                     if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6840                               continue;
6841                     }
6842 
6843                     ptppa = pmap_pte2pa(pde);
6844                     ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6845                     spte = &ptes[pl1_pi(va)];
6846 
6847                     do {
6848                               opte = *spte;
6849                               if (!pmap_ept_valid_entry(opte)) {
6850                                         goto next;
6851                               }
6852                               npte = (opte & ~bit_rem);
6853                     } while (pmap_pte_cas(spte, opte, npte) != opte);
6854 
6855                     if (pmap_ept_has_ad) {
6856                               modified = (opte & EPT_D) != 0;
6857                     } else {
6858                               modified = true;
6859                     }
6860                     if (modified) {
6861                               vaddr_t tva = x86_ptob(spte - ptes);
6862                               pmap_tlb_shootdown(pmap, tva, 0,
6863                                   TLBSHOOT_WRITE_PROTECT);
6864                     }
6865 next:;
6866           }
6867 
6868           kpreempt_enable();
6869           mutex_exit(&pmap->pm_lock);
6870 }
6871 
6872 static void
6873 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6874 {
6875           pt_entry_t *ptes, *ptep, opte;
6876           pd_entry_t pde;
6877           paddr_t ptppa;
6878 
6879           /* Acquire pmap. */
6880           mutex_enter(&pmap->pm_lock);
6881           kpreempt_disable();
6882 
6883           if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6884                     panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6885           }
6886 
6887           ptppa = pmap_pte2pa(pde);
6888           ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6889           ptep = &ptes[pl1_pi(va)];
6890           opte = *ptep;
6891           KASSERT(pmap_ept_valid_entry(opte));
6892 
6893           if (opte & EPT_WIRED) {
6894                     pt_entry_t npte = opte & ~EPT_WIRED;
6895 
6896                     opte = pmap_pte_testset(ptep, npte);
6897                     pmap_ept_stats_update_bypte(pmap, npte, opte);
6898           } else {
6899                     printf("%s: wiring for pmap %p va %#" PRIxVADDR
6900                         "did not change!\n", __func__, pmap, va);
6901           }
6902 
6903           /* Release pmap. */
6904           kpreempt_enable();
6905           mutex_exit(&pmap->pm_lock);
6906 }
6907 
6908 /* -------------------------------------------------------------------------- */
6909 
6910 void
6911 pmap_ept_transform(struct pmap *pmap)
6912 {
6913           pmap->pm_enter = pmap_ept_enter;
6914           pmap->pm_extract = pmap_ept_extract;
6915           pmap->pm_remove = pmap_ept_remove;
6916           pmap->pm_sync_pv = pmap_ept_sync_pv;
6917           pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6918           pmap->pm_write_protect = pmap_ept_write_protect;
6919           pmap->pm_unwire = pmap_ept_unwire;
6920 
6921           memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6922 }
6923 
6924 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6925