1 /* $NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $ */
2
3 /*
4 * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran, and by Maxime Villard.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 2007 Manuel Bouyer.
34 *
35 * Redistribution and use in source and binary forms, with or without
36 * modification, are permitted provided that the following conditions
37 * are met:
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 *
44 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
45 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
46 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
47 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
48 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
49 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
50 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
51 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
52 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
53 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54 */
55
56 /*
57 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
58 *
59 * Permission to use, copy, modify, and distribute this software for any
60 * purpose with or without fee is hereby granted, provided that the above
61 * copyright notice and this permission notice appear in all copies.
62 *
63 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
64 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
65 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
66 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
67 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
68 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
69 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
70 */
71
72 /*
73 * Copyright 2001 (c) Wasabi Systems, Inc.
74 * All rights reserved.
75 *
76 * Written by Frank van der Linden for Wasabi Systems, Inc.
77 *
78 * Redistribution and use in source and binary forms, with or without
79 * modification, are permitted provided that the following conditions
80 * are met:
81 * 1. Redistributions of source code must retain the above copyright
82 * notice, this list of conditions and the following disclaimer.
83 * 2. Redistributions in binary form must reproduce the above copyright
84 * notice, this list of conditions and the following disclaimer in the
85 * documentation and/or other materials provided with the distribution.
86 * 3. All advertising materials mentioning features or use of this software
87 * must display the following acknowledgement:
88 * This product includes software developed for the NetBSD Project by
89 * Wasabi Systems, Inc.
90 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
91 * or promote products derived from this software without specific prior
92 * written permission.
93 *
94 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
95 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
96 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
97 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
98 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
99 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
100 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
101 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
102 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
103 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
104 * POSSIBILITY OF SUCH DAMAGE.
105 */
106
107 /*
108 * Copyright (c) 1997 Charles D. Cranor and Washington University.
109 * All rights reserved.
110 *
111 * Redistribution and use in source and binary forms, with or without
112 * modification, are permitted provided that the following conditions
113 * are met:
114 * 1. Redistributions of source code must retain the above copyright
115 * notice, this list of conditions and the following disclaimer.
116 * 2. Redistributions in binary form must reproduce the above copyright
117 * notice, this list of conditions and the following disclaimer in the
118 * documentation and/or other materials provided with the distribution.
119 *
120 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
121 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
122 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
123 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
124 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
125 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
126 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
127 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
128 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
129 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130 */
131
132 #include <sys/cdefs.h>
133 __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.427 2024/10/08 21:09:08 riastradh Exp $");
134
135 #include "opt_user_ldt.h"
136 #include "opt_lockdebug.h"
137 #include "opt_multiprocessor.h"
138 #include "opt_xen.h"
139 #include "opt_svs.h"
140 #include "opt_kaslr.h"
141 #include "opt_efi.h"
142
143 #define __MUTEX_PRIVATE /* for assertions */
144
145 #include <sys/param.h>
146 #include <sys/systm.h>
147 #include <sys/proc.h>
148 #include <sys/pool.h>
149 #include <sys/kernel.h>
150 #include <sys/atomic.h>
151 #include <sys/cpu.h>
152 #include <sys/intr.h>
153 #include <sys/xcall.h>
154 #include <sys/kcore.h>
155 #include <sys/kmem.h>
156 #include <sys/asan.h>
157 #include <sys/msan.h>
158 #include <sys/entropy.h>
159
160 #include <uvm/uvm.h>
161 #include <uvm/pmap/pmap_pvt.h>
162
163 #include <dev/isa/isareg.h>
164
165 #include <machine/specialreg.h>
166 #include <machine/gdt.h>
167 #include <machine/isa_machdep.h>
168 #include <machine/cpuvar.h>
169 #include <machine/cputypes.h>
170 #include <machine/pmap_private.h>
171
172 #include <x86/bootspace.h>
173 #include <x86/pat.h>
174 #include <x86/pmap_pv.h>
175
176 #include <x86/i82489reg.h>
177 #include <x86/i82489var.h>
178
179 #ifdef XEN
180 #include <xen/include/public/xen.h>
181 #include <xen/hypervisor.h>
182 #include <xen/xenpmap.h>
183 #endif
184
185 #ifdef __HAVE_DIRECT_MAP
186 #include <crypto/nist_hash_drbg/nist_hash_drbg.h>
187 #endif
188
189 /*
190 * general info:
191 *
192 * - for an explanation of how the x86 MMU hardware works see
193 * the comments in <machine/pte.h>.
194 *
195 * - for an explanation of the general memory structure used by
196 * this pmap (including the recursive mapping), see the comments
197 * in <machine/pmap.h>.
198 *
199 * this file contains the code for the "pmap module." the module's
200 * job is to manage the hardware's virtual to physical address mappings.
201 * note that there are two levels of mapping in the VM system:
202 *
203 * [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
204 * to map ranges of virtual address space to objects/files. for
205 * example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
206 * to the file /bin/ls starting at offset zero." note that
207 * the upper layer mapping is not concerned with how individual
208 * vm_pages are mapped.
209 *
210 * [2] the lower layer of the VM system (the pmap) maintains the mappings
211 * from virtual to physical addresses. it is concerned with which vm_page is
212 * mapped where. for example, when you run /bin/ls and start
213 * at page 0x1000 the fault routine may lookup the correct page
214 * of the /bin/ls file and then ask the pmap layer to establish
215 * a mapping for it.
216 *
217 * note that information in the lower layer of the VM system can be
218 * thrown away since it can easily be reconstructed from the info
219 * in the upper layer.
220 *
221 * data structures we use include:
222 *
223 * - struct pmap: describes the address space of one thread
224 * - struct pmap_page: describes one pv-tracked page, without
225 * necessarily a corresponding vm_page
226 * - struct pv_entry: describes one <PMAP,VA> mapping of a PA
227 * - pmap_page::pp_pvlist: there is one list per pv-tracked page of
228 * physical memory. the pp_pvlist points to a list of pv_entry
229 * structures which describe all the <PMAP,VA> pairs that this
230 * page is mapped in. this is critical for page based operations
231 * such as pmap_page_protect() [change protection on _all_ mappings
232 * of a page]
233 */
234
235 /*
236 * Locking
237 *
238 * We have the following locks that we must deal with, listed in the order
239 * that they are acquired:
240 *
241 * pg->uobject->vmobjlock, pg->uanon->an_lock
242 *
243 * For managed pages, these per-object locks are taken by the VM system
244 * before calling into the pmap module - either a read or write hold.
245 * Holding the lock prevents pages from changing identity while the pmap is
246 * operating on them. For example, the same lock is held across a call
247 * to pmap_remove() and the following call to pmap_update(), so that a
248 * page does not gain a new identity while its TLB visibility is stale.
249 *
250 * pmap->pm_lock
251 *
252 * This lock protects the fields in the pmap structure including the
253 * non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
254 * structures. For modifying unmanaged kernel PTEs it is not needed as
255 * kernel PDEs are never freed, and the kernel is expected to be self
256 * consistent (and the lock can't be taken for unmanaged kernel PTEs,
257 * because they can be modified from interrupt context).
258 *
259 * pmaps_lock
260 *
261 * This lock protects the list of active pmaps (headed by "pmaps").
262 * It's acquired when adding or removing pmaps or adjusting kernel PDEs.
263 *
264 * pp_lock
265 *
266 * This per-page lock protects PV entry lists and the embedded PV entry
267 * in each vm_page, allowing for concurrent operation on pages by
268 * different pmaps. This is a spin mutex at IPL_VM, because at the
269 * points it is taken context switching is usually not tolerable, and
270 * spin mutexes must block out interrupts that could take kernel_lock.
271 */
272
273 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
274 #ifdef DIAGNOSTIC
275 #define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
276 #define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock)
277 #else
278 #define PMAP_DUMMY_LOCK(pm)
279 #define PMAP_DUMMY_UNLOCK(pm)
280 #endif
281
282 static const struct uvm_pagerops pmap_pager = {
283 /* nothing */
284 };
285
286 /*
287 * pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X)
288 */
289 #define pl_i(va, lvl) \
290 (((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1])
291
292 #define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl))
293
294 /*
295 * PTP macros:
296 * a PTP's index is the PD index of the PDE that points to it
297 * a PTP's offset is the byte-offset in the PTE space that this PTP is at
298 * a PTP's VA is the first VA mapped by that PTP
299 */
300
301 #define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE)
302
303 const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
304 const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
305 const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
306 const long nkptpmax[] = NKPTPMAX_INITIALIZER;
307 const long nbpd[] = NBPD_INITIALIZER;
308 #ifdef i386
309 pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
310 #else
311 pd_entry_t *normal_pdes[3];
312 #endif
313
314 long nkptp[] = NKPTP_INITIALIZER;
315
316 struct pmap_head pmaps;
317 kmutex_t pmaps_lock __cacheline_aligned;
318
319 struct pcpu_area *pcpuarea __read_mostly;
320
321 static vaddr_t pmap_maxkvaddr;
322
323 /*
324 * Misc. event counters.
325 */
326 struct evcnt pmap_iobmp_evcnt;
327 struct evcnt pmap_ldt_evcnt;
328
329 /*
330 * PAT
331 */
332 static bool cpu_pat_enabled __read_mostly = false;
333
334 /*
335 * Global data structures
336 */
337
338 static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
339 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
340 static rb_tree_t pmap_kernel_rb __cacheline_aligned;
341
342 struct bootspace bootspace __read_mostly;
343 struct slotspace slotspace __read_mostly;
344
345 /* Set to PTE_NX if supported. */
346 pd_entry_t pmap_pg_nx __read_mostly = 0;
347
348 /* Set to PTE_G if supported. */
349 pd_entry_t pmap_pg_g __read_mostly = 0;
350
351 /* Set to true if large pages are supported. */
352 int pmap_largepages __read_mostly = 0;
353
354 paddr_t lowmem_rsvd __read_mostly;
355 paddr_t avail_start __read_mostly; /* PA of first available physical page */
356 paddr_t avail_end __read_mostly; /* PA of last available physical page */
357
358 #ifdef XENPV
359 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
360 paddr_t pmap_pa_end; /* PA of last physical page for this domain */
361 #endif
362
363 #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp)
364 #define PMAP_CHECK_PP(pp) \
365 KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
366
367 #define PAGE_ALIGNED(pp) \
368 __builtin_assume_aligned((void *)(pp), PAGE_SIZE)
369
370 /*
371 * Other data structures
372 */
373
374 static pt_entry_t protection_codes[8] __read_mostly;
375
376 static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
377
378 /*
379 * The following two vaddr_t's are used during system startup to keep track of
380 * how much of the kernel's VM space we have used. Once the system is started,
381 * the management of the remaining kernel VM space is turned over to the
382 * kernel_map vm_map.
383 */
384 static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */
385 static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */
386
387 #ifndef XENPV
388 /*
389 * LAPIC virtual address, and fake physical address.
390 */
391 volatile vaddr_t local_apic_va __read_mostly;
392 paddr_t local_apic_pa __read_mostly;
393 #endif
394
395 /*
396 * pool that pmap structures are allocated from
397 */
398 struct pool_cache pmap_cache;
399 static int pmap_ctor(void *, void *, int);
400 static void pmap_dtor(void *, void *);
401
402 /*
403 * pv_page cache
404 */
405 static struct pool_cache pmap_pvp_cache;
406
407 #ifdef __HAVE_DIRECT_MAP
408 vaddr_t pmap_direct_base __read_mostly;
409 vaddr_t pmap_direct_end __read_mostly;
410 #endif
411
412 #ifndef __HAVE_DIRECT_MAP
413 /*
414 * Special VAs and the PTEs that map them
415 */
416 static pt_entry_t *early_zero_pte;
417 static void pmap_vpage_cpualloc(struct cpu_info *);
418 #ifdef XENPV
419 char *early_zerop; /* also referenced from xen_locore() */
420 #else
421 static char *early_zerop;
422 #endif
423 #endif
424
425 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
426
427 /* PDP pool and its callbacks */
428 static struct pool pmap_pdp_pool;
429 static void pmap_pdp_init(pd_entry_t *);
430 static void pmap_pdp_fini(pd_entry_t *);
431
432 #ifdef PAE
433 /* need to allocate items of 4 pages */
434 static void *pmap_pdp_alloc(struct pool *, int);
435 static void pmap_pdp_free(struct pool *, void *);
436 static struct pool_allocator pmap_pdp_allocator = {
437 .pa_alloc = pmap_pdp_alloc,
438 .pa_free = pmap_pdp_free,
439 .pa_pagesz = PAGE_SIZE * PDP_SIZE,
440 };
441 #endif
442
443 extern vaddr_t idt_vaddr;
444 extern paddr_t idt_paddr;
445 extern vaddr_t gdt_vaddr;
446 extern paddr_t gdt_paddr;
447 extern vaddr_t ldt_vaddr;
448 extern paddr_t ldt_paddr;
449
450 #ifdef i386
451 /* stuff to fix the pentium f00f bug */
452 extern vaddr_t pentium_idt_vaddr;
453 #endif
454
455 /* Array of freshly allocated PTPs, for pmap_get_ptp(). */
456 struct pmap_ptparray {
457 struct vm_page *pg[PTP_LEVELS + 1];
458 bool alloced[PTP_LEVELS + 1];
459 };
460
461 /*
462 * PV entries are allocated in page-sized chunks and cached per-pmap to
463 * avoid intense pressure on memory allocators.
464 */
465
466 struct pv_page {
467 LIST_HEAD(, pv_entry) pvp_pves;
468 LIST_ENTRY(pv_page) pvp_list;
469 long pvp_nfree;
470 struct pmap *pvp_pmap;
471 };
472
473 #define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
474
475 /*
476 * PV tree prototypes
477 */
478
479 static int pmap_compare_key(void *, const void *, const void *);
480 static int pmap_compare_nodes(void *, const void *, const void *);
481
482 /* Read-black tree */
483 static const rb_tree_ops_t pmap_rbtree_ops = {
484 .rbto_compare_nodes = pmap_compare_nodes,
485 .rbto_compare_key = pmap_compare_key,
486 .rbto_node_offset = offsetof(struct pv_entry, pve_rb),
487 .rbto_context = NULL
488 };
489
490 /*
491 * Local prototypes
492 */
493
494 #ifdef __HAVE_PCPU_AREA
495 static void pmap_init_pcpu(void);
496 #endif
497 #ifdef __HAVE_DIRECT_MAP
498 static void pmap_init_directmap(struct pmap *);
499 #endif
500 #if !defined(XENPV)
501 static void pmap_remap_global(void);
502 #endif
503 #ifndef XENPV
504 static void pmap_init_lapic(void);
505 static void pmap_remap_largepages(void);
506 #endif
507
508 static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
509 struct vm_page **);
510 static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
511 static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
512 pd_entry_t * const *);
513 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
514 static void pmap_freepage(struct pmap *, struct vm_page *, int);
515 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
516 pt_entry_t *, pd_entry_t * const *);
517 static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
518 vaddr_t);
519 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
520 vaddr_t);
521 static int pmap_pvp_ctor(void *, void *, int);
522 static void pmap_pvp_dtor(void *, void *);
523 static struct pv_entry *pmap_alloc_pv(struct pmap *);
524 static void pmap_free_pv(struct pmap *, struct pv_entry *);
525 static void pmap_drain_pv(struct pmap *);
526
527 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
528
529 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
530 static void pmap_reactivate(struct pmap *);
531
532 long
pmap_resident_count(struct pmap * pmap)533 pmap_resident_count(struct pmap *pmap)
534 {
535
536 return pmap->pm_stats.resident_count;
537 }
538
539 long
pmap_wired_count(struct pmap * pmap)540 pmap_wired_count(struct pmap *pmap)
541 {
542
543 return pmap->pm_stats.wired_count;
544 }
545
546 /*
547 * p m a p h e l p e r f u n c t i o n s
548 */
549
550 static inline void
pmap_stats_update(struct pmap * pmap,int resid_diff,int wired_diff)551 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
552 {
553
554 KASSERT(cold || mutex_owned(&pmap->pm_lock));
555 pmap->pm_stats.resident_count += resid_diff;
556 pmap->pm_stats.wired_count += wired_diff;
557 }
558
559 static inline void
pmap_stats_update_bypte(struct pmap * pmap,pt_entry_t npte,pt_entry_t opte)560 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
561 {
562 int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
563 int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) - ((opte & PTE_WIRED) ? 1 : 0);
564
565 KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
566 KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
567
568 pmap_stats_update(pmap, resid_diff, wired_diff);
569 }
570
571 /*
572 * ptp_to_pmap: lookup pmap by ptp
573 */
574 static inline struct pmap *
ptp_to_pmap(struct vm_page * ptp)575 ptp_to_pmap(struct vm_page *ptp)
576 {
577 struct pmap *pmap;
578
579 if (ptp == NULL) {
580 return pmap_kernel();
581 }
582 pmap = (struct pmap *)ptp->uobject;
583 KASSERT(pmap != NULL);
584 KASSERT(&pmap->pm_obj[0] == ptp->uobject);
585 return pmap;
586 }
587
588 static inline struct pv_pte *
pve_to_pvpte(struct pv_entry * pve)589 pve_to_pvpte(struct pv_entry *pve)
590 {
591
592 if (pve == NULL)
593 return NULL;
594 KASSERT((void *)&pve->pve_pte == (void *)pve);
595 return &pve->pve_pte;
596 }
597
/*
 * pvpte_to_pve: convert a pv_pte pointer back to its enclosing pv_entry
 *
 * => the inverse of pve_to_pvpte(); the round trip is asserted
 */
static inline struct pv_entry *
pvpte_to_pve(struct pv_pte *pvpte)
{
	struct pv_entry *entry;

	entry = (void *)pvpte;
	KASSERT(pve_to_pvpte(entry) == pvpte);
	return entry;
}
606
607 /*
608 * Return true if the pmap page has an embedded PV entry.
609 */
610 static inline bool
pv_pte_embedded(struct pmap_page * pp)611 pv_pte_embedded(struct pmap_page *pp)
612 {
613
614 KASSERT(mutex_owned(&pp->pp_lock));
615 return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
616 }
617
618 /*
619 * pv_pte_first, pv_pte_next: PV list iterator.
620 */
621 static inline struct pv_pte *
pv_pte_first(struct pmap_page * pp)622 pv_pte_first(struct pmap_page *pp)
623 {
624
625 KASSERT(mutex_owned(&pp->pp_lock));
626 if (pv_pte_embedded(pp)) {
627 return &pp->pp_pte;
628 }
629 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
630 }
631
632 static inline struct pv_pte *
pv_pte_next(struct pmap_page * pp,struct pv_pte * pvpte)633 pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
634 {
635
636 KASSERT(mutex_owned(&pp->pp_lock));
637 KASSERT(pvpte != NULL);
638 if (pvpte == &pp->pp_pte) {
639 return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
640 }
641 return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
642 }
643
644 static inline uint8_t
pmap_pte_to_pp_attrs(pt_entry_t pte)645 pmap_pte_to_pp_attrs(pt_entry_t pte)
646 {
647 uint8_t ret = 0;
648 if (pte & PTE_D)
649 ret |= PP_ATTRS_D;
650 if (pte & PTE_A)
651 ret |= PP_ATTRS_A;
652 if (pte & PTE_W)
653 ret |= PP_ATTRS_W;
654 return ret;
655 }
656
657 static inline pt_entry_t
pmap_pp_attrs_to_pte(uint8_t attrs)658 pmap_pp_attrs_to_pte(uint8_t attrs)
659 {
660 pt_entry_t pte = 0;
661 if (attrs & PP_ATTRS_D)
662 pte |= PTE_D;
663 if (attrs & PP_ATTRS_A)
664 pte |= PTE_A;
665 if (attrs & PP_ATTRS_W)
666 pte |= PTE_W;
667 return pte;
668 }
669
670 /*
671 * pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
672 * of course the kernel is always loaded
673 */
674 bool
pmap_is_curpmap(struct pmap * pmap)675 pmap_is_curpmap(struct pmap *pmap)
676 {
677 return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
678 }
679
680 inline void
pmap_reference(struct pmap * pmap)681 pmap_reference(struct pmap *pmap)
682 {
683
684 atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
685 }
686
687 /*
688 * rbtree: compare two nodes.
689 */
690 static int
pmap_compare_nodes(void * context,const void * n1,const void * n2)691 pmap_compare_nodes(void *context, const void *n1, const void *n2)
692 {
693 const struct pv_entry *pve1 = n1;
694 const struct pv_entry *pve2 = n2;
695
696 KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
697
698 if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
699 return -1;
700 }
701 if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
702 return 1;
703 }
704 return 0;
705 }
706
707 /*
708 * rbtree: compare a node and a key.
709 */
710 static int
pmap_compare_key(void * context,const void * n,const void * k)711 pmap_compare_key(void *context, const void *n, const void *k)
712 {
713 const struct pv_entry *pve = n;
714 const vaddr_t key = (vaddr_t)k;
715
716 if (pve->pve_pte.pte_va < key) {
717 return -1;
718 }
719 if (pve->pve_pte.pte_va > key) {
720 return 1;
721 }
722 return 0;
723 }
724
725 /*
726 * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
727 */
728 static inline void
pmap_ptp_range_set(struct vm_page * ptp,vaddr_t va)729 pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
730 {
731 vaddr_t *min = (vaddr_t *)&ptp->uanon;
732
733 if (va < *min) {
734 *min = va;
735 }
736 }
737
738 /*
739 * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
740 */
741 static inline void
pmap_ptp_range_clip(struct vm_page * ptp,vaddr_t * startva,pt_entry_t ** pte)742 pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
743 {
744 vaddr_t sclip;
745
746 if (ptp == NULL) {
747 return;
748 }
749
750 sclip = (vaddr_t)ptp->uanon;
751 sclip = (*startva < sclip ? sclip : *startva);
752 *pte += (sclip - *startva) / PAGE_SIZE;
753 *startva = sclip;
754 }
755
756 /*
757 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
758 *
759 * there are several pmaps involved. some or all of them might be same.
760 *
761 * - the pmap given by the first argument
762 * our caller wants to access this pmap's PTEs.
763 *
764 * - pmap_kernel()
765 * the kernel pmap. note that it only contains the kernel part
766 * of the address space which is shared by any pmap. ie. any
767 * pmap can be used instead of pmap_kernel() for our purpose.
768 *
769 * - ci->ci_pmap
770 * pmap currently loaded on the cpu.
771 *
772 * - vm_map_pmap(&curproc->p_vmspace->vm_map)
773 * current process' pmap.
774 *
775 * => caller must lock pmap first (if not the kernel pmap)
776 * => must be undone with pmap_unmap_ptes before returning
777 * => disables kernel preemption
778 */
779 void
pmap_map_ptes(struct pmap * pmap,struct pmap ** pmap2,pd_entry_t ** ptepp,pd_entry_t * const ** pdeppp)780 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
781 pd_entry_t * const **pdeppp)
782 {
783 struct pmap *curpmap;
784 struct cpu_info *ci;
785 lwp_t *l;
786
787 kpreempt_disable();
788
789 /* The kernel's pmap is always accessible. */
790 if (pmap == pmap_kernel()) {
791 *pmap2 = NULL;
792 *ptepp = PTE_BASE;
793 *pdeppp = normal_pdes;
794 return;
795 }
796
797 KASSERT(mutex_owned(&pmap->pm_lock));
798
799 l = curlwp;
800 ci = l->l_cpu;
801 curpmap = ci->ci_pmap;
802 if (pmap == curpmap) {
803 /*
804 * Already on the CPU: make it valid. This is very
805 * often the case during exit(), when we have switched
806 * to the kernel pmap in order to destroy a user pmap.
807 */
808 if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
809 pmap_reactivate(pmap);
810 }
811 *pmap2 = NULL;
812 } else {
813 /*
814 * Toss current pmap from CPU and install new pmap, but keep
815 * a reference to the old one. Dropping the reference can
816 * can block as it needs to take locks, so defer that to
817 * pmap_unmap_ptes().
818 */
819 pmap_reference(pmap);
820 pmap_load1(l, pmap, curpmap);
821 *pmap2 = curpmap;
822 }
823 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
824 #ifdef DIAGNOSTIC
825 pmap->pm_pctr = lwp_pctr();
826 #endif
827 *ptepp = PTE_BASE;
828
829 #if defined(XENPV) && defined(__x86_64__)
830 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
831 ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
832 *pdeppp = ci->ci_normal_pdes;
833 #else
834 *pdeppp = normal_pdes;
835 #endif
836 }
837
838 /*
839 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
840 *
841 * => we cannot tolerate context switches while mapped in: assert this.
842 * => reenables kernel preemption.
843 * => does not unlock pmap.
844 */
845 void
pmap_unmap_ptes(struct pmap * pmap,struct pmap * pmap2)846 pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
847 {
848 struct cpu_info *ci;
849 struct pmap *mypmap;
850 struct lwp *l;
851
852 KASSERT(kpreempt_disabled());
853
854 /* The kernel's pmap is always accessible. */
855 if (pmap == pmap_kernel()) {
856 kpreempt_enable();
857 return;
858 }
859
860 l = curlwp;
861 ci = l->l_cpu;
862
863 KASSERT(mutex_owned(&pmap->pm_lock));
864 KASSERT(pmap->pm_pctr == lwp_pctr());
865
866 #if defined(XENPV) && defined(__x86_64__)
867 KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
868 ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
869 #endif
870
871 /* If not our own pmap, mark whatever's on the CPU now as lazy. */
872 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
873 mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
874 if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) {
875 ci->ci_want_pmapload = 0;
876 } else {
877 ci->ci_want_pmapload = (mypmap != pmap_kernel());
878 ci->ci_tlbstate = TLBSTATE_LAZY;
879 }
880
881 /* Now safe to re-enable preemption. */
882 kpreempt_enable();
883
884 /* Toss reference to other pmap taken earlier. */
885 if (pmap2 != NULL) {
886 pmap_destroy(pmap2);
887 }
888 }
889
890 inline static void
pmap_exec_account(struct pmap * pm,vaddr_t va,pt_entry_t opte,pt_entry_t npte)891 pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
892 {
893
894 #if !defined(__x86_64__)
895 if (curproc == NULL || curproc->p_vmspace == NULL ||
896 pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
897 return;
898
899 if ((opte ^ npte) & PTE_X)
900 pmap_update_pg(va);
901
902 /*
903 * Executability was removed on the last executable change.
904 * Reset the code segment to something conservative and
905 * let the trap handler deal with setting the right limit.
906 * We can't do that because of locking constraints on the vm map.
907 */
908
909 if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
910 struct trapframe *tf = curlwp->l_md.md_regs;
911
912 tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
913 pm->pm_hiexec = I386_MAX_EXE_ADDR;
914 }
915 #endif /* !defined(__x86_64__) */
916 }
917
#if !defined(__x86_64__)
/*
 * pmap_exec_fixup: fix up the code segment to cover all potential
 * executable mappings
 *
 * => walks the map under its read lock and records in pm_hiexec the
 *    last page of the last entry that has VM_PROT_EXECUTE
 * => returns 1 if tf_cs was switched to the GUCODEBIG_SEL selector;
 *    returns 0 if nothing needed changing or if tf_cs was set to the
 *    GUCODE_SEL selector
 * => pcb is unused
 */
int
pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
{
	struct pmap *pm = vm_map_pmap(map);
	struct vm_map_entry *ent;
	vaddr_t hiexec = 0;

	vm_map_lock_read(map);
	ent = map->header.next;
	while (ent != &map->header) {
		/*
		 * This entry has greater va than the entries before.
		 * We need to make it point to the last page, not past it.
		 */
		if ((ent->protection & VM_PROT_EXECUTE) != 0) {
			hiexec = trunc_page(ent->end) - PAGE_SIZE;
		}
		ent = ent->next;
	}
	vm_map_unlock_read(map);

	/* Limit unchanged and the big selector already in place. */
	if (hiexec == pm->pm_hiexec &&
	    tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL)) {
		return 0;
	}

	pm->pm_hiexec = hiexec;
	if (pm->pm_hiexec <= I386_MAX_EXE_ADDR) {
		tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
		return 0;
	}
	tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
	return 1;
}
#endif /* !defined(__x86_64__) */
953
954 void
pat_init(struct cpu_info * ci)955 pat_init(struct cpu_info *ci)
956 {
957 #ifndef XENPV
958 uint64_t pat;
959
960 if (!(ci->ci_feat_val[0] & CPUID_PAT))
961 return;
962
963 /* We change WT to WC. Leave all other entries the default values. */
964 pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
965 PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
966 PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
967 PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
968
969 wrmsr(MSR_CR_PAT, pat);
970 cpu_pat_enabled = true;
971 #endif
972 }
973
974 static pt_entry_t
pmap_pat_flags(u_int flags)975 pmap_pat_flags(u_int flags)
976 {
977 u_int cacheflags = (flags & PMAP_CACHE_MASK);
978
979 if (!cpu_pat_enabled) {
980 switch (cacheflags) {
981 case PMAP_NOCACHE:
982 case PMAP_NOCACHE_OVR:
983 /* results in PGC_UCMINUS on cpus which have
984 * the cpuid PAT but PAT "disabled"
985 */
986 return PTE_PCD;
987 default:
988 return 0;
989 }
990 }
991
992 switch (cacheflags) {
993 case PMAP_NOCACHE:
994 return PGC_UC;
995 case PMAP_WRITE_COMBINE:
996 return PGC_WC;
997 case PMAP_WRITE_BACK:
998 return PGC_WB;
999 case PMAP_NOCACHE_OVR:
1000 return PGC_UCMINUS;
1001 }
1002
1003 return 0;
1004 }
1005
1006 /*
1007 * p m a p k e n t e r f u n c t i o n s
1008 *
1009 * functions to quickly enter/remove pages from the kernel address
1010 * space. pmap_kremove is exported to MI kernel. we make use of
1011 * the recursive PTE mappings.
1012 */
1013
1014 /*
1015 * pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
1016 *
1017 * => no need to lock anything, assume va is already allocated
1018 * => should be faster than normal pmap enter function
1019 */
1020 void
pmap_kenter_pa(vaddr_t va,paddr_t pa,vm_prot_t prot,u_int flags)1021 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
1022 {
1023 pt_entry_t *pte, opte, npte;
1024
1025 KASSERT(!(prot & ~VM_PROT_ALL));
1026
1027 if (va < VM_MIN_KERNEL_ADDRESS)
1028 pte = vtopte(va);
1029 else
1030 pte = kvtopte(va);
1031 #if defined(XENPV) && defined(DOM0OPS)
1032 if (pa < pmap_pa_start || pa >= pmap_pa_end) {
1033 #ifdef DEBUG
1034 printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
1035 " outside range\n", __func__, pa, va);
1036 #endif /* DEBUG */
1037 npte = pa;
1038 } else
1039 #endif /* XENPV && DOM0OPS */
1040 npte = pmap_pa2pte(pa);
1041 npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
1042 npte |= pmap_pat_flags(flags);
1043 opte = pmap_pte_testset(pte, npte); /* zap! */
1044
1045 /*
1046 * XXX: make sure we are not dealing with a large page, since the only
1047 * large pages created are for the kernel image, and they should never
1048 * be kentered.
1049 */
1050 KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
1051
1052 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
1053 /* This should not happen. */
1054 printf_nolog("%s: mapping already present\n", __func__);
1055 kpreempt_disable();
1056 pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
1057 kpreempt_enable();
1058 }
1059 }
1060
1061 __strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
1062
1063 #if defined(__x86_64__)
1064 /*
1065 * Change protection for a virtual address. Local for a CPU only, don't
1066 * care about TLB shootdowns.
1067 *
1068 * => must be called with preemption disabled
1069 */
1070 void
pmap_changeprot_local(vaddr_t va,vm_prot_t prot)1071 pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
1072 {
1073 pt_entry_t *pte, opte, npte;
1074
1075 KASSERT(kpreempt_disabled());
1076
1077 if (va < VM_MIN_KERNEL_ADDRESS)
1078 pte = vtopte(va);
1079 else
1080 pte = kvtopte(va);
1081
1082 npte = opte = *pte;
1083
1084 if ((prot & VM_PROT_WRITE) != 0)
1085 npte |= PTE_W;
1086 else
1087 npte &= ~(PTE_W|PTE_D);
1088
1089 if (opte != npte) {
1090 pmap_pte_set(pte, npte);
1091 pmap_pte_flush();
1092 invlpg(va);
1093 }
1094 }
1095 #endif /* defined(__x86_64__) */
1096
1097 /*
1098 * pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
1099 *
1100 * => no need to lock anything
1101 * => caller must dispose of any vm_page mapped in the va range
1102 * => note: not an inline function
1103 * => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
1104 * => we assume kernel only unmaps valid addresses and thus don't bother
1105 * checking the valid bit before doing TLB flushing
1106 * => must be followed by call to pmap_update() before reuse of page
1107 */
1108 static void
pmap_kremove1(vaddr_t sva,vsize_t len,bool localonly)1109 pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
1110 {
1111 pt_entry_t *pte, opte;
1112 vaddr_t va, eva;
1113
1114 eva = sva + len;
1115
1116 kpreempt_disable();
1117 for (va = sva; va < eva; va += PAGE_SIZE) {
1118 pte = kvtopte(va);
1119 opte = pmap_pte_testset(pte, 0); /* zap! */
1120 if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) && !localonly) {
1121 pmap_tlb_shootdown(pmap_kernel(), va, opte,
1122 TLBSHOOT_KREMOVE);
1123 }
1124 KASSERTMSG((opte & PTE_PS) == 0,
1125 "va %#" PRIxVADDR " is a large page", va);
1126 KASSERTMSG((opte & PTE_PVLIST) == 0,
1127 "va %#" PRIxVADDR " is a pv tracked page", va);
1128 }
1129 if (localonly) {
1130 tlbflushg();
1131 }
1132 kpreempt_enable();
1133 }
1134
1135 void
pmap_kremove(vaddr_t sva,vsize_t len)1136 pmap_kremove(vaddr_t sva, vsize_t len)
1137 {
1138
1139 pmap_kremove1(sva, len, false);
1140 }
1141
1142 /*
1143 * pmap_kremove_local: like pmap_kremove(), but only worry about
1144 * TLB invalidations on the current CPU. this is only intended
1145 * for use while writing kernel crash dumps, either after panic
1146 * or via reboot -d.
1147 */
1148 void
pmap_kremove_local(vaddr_t sva,vsize_t len)1149 pmap_kremove_local(vaddr_t sva, vsize_t len)
1150 {
1151
1152 pmap_kremove1(sva, len, true);
1153 }
1154
1155 /*
1156 * p m a p i n i t f u n c t i o n s
1157 *
1158 * pmap_bootstrap and pmap_init are called during system startup
1159 * to init the pmap module. pmap_bootstrap() does a low level
1160 * init just to get things rolling. pmap_init() finishes the job.
1161 */
1162
1163 /*
1164 * pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
1165 * This function is to be used before any VM system has been set up.
1166 *
1167 * The va is taken from virtual_avail.
1168 */
1169 static vaddr_t
pmap_bootstrap_valloc(size_t npages)1170 pmap_bootstrap_valloc(size_t npages)
1171 {
1172 vaddr_t va = virtual_avail;
1173 virtual_avail += npages * PAGE_SIZE;
1174 return va;
1175 }
1176
1177 /*
1178 * pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
1179 * This function is to be used before any VM system has been set up.
1180 *
1181 * The pa is taken from avail_start.
1182 */
1183 static paddr_t
pmap_bootstrap_palloc(size_t npages)1184 pmap_bootstrap_palloc(size_t npages)
1185 {
1186 paddr_t pa = avail_start;
1187 avail_start += npages * PAGE_SIZE;
1188 return pa;
1189 }
1190
1191 /*
1192 * pmap_bootstrap: get the system in a state where it can run with VM properly
1193 * enabled (called before main()). The VM system is fully init'd later.
1194 *
1195 * => on i386, locore.S has already enabled the MMU by allocating a PDP for the
1196 * kernel, and nkpde PTP's for the kernel.
1197 * => kva_start is the first free virtual address in kernel space.
1198 */
1199 void
pmap_bootstrap(vaddr_t kva_start)1200 pmap_bootstrap(vaddr_t kva_start)
1201 {
1202 struct pmap *kpm;
1203 int i;
1204 vaddr_t kva;
1205
1206 pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
1207
1208 /*
1209 * Set up our local static global vars that keep track of the usage of
1210 * KVM before kernel_map is set up.
1211 */
1212 virtual_avail = kva_start; /* first free KVA */
1213 virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */
1214
1215 /*
1216 * Set up protection_codes: we need to be able to convert from a MI
1217 * protection code (some combo of VM_PROT...) to something we can jam
1218 * into a x86 PTE.
1219 */
1220 protection_codes[VM_PROT_NONE] = pmap_pg_nx;
1221 protection_codes[VM_PROT_EXECUTE] = PTE_X;
1222 protection_codes[VM_PROT_READ] = pmap_pg_nx;
1223 protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
1224 protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
1225 protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
1226 protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
1227 protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
1228
1229 /*
1230 * Now we init the kernel's pmap.
1231 *
1232 * The kernel pmap's pm_obj is not used for much. However, in user pmaps
1233 * the pm_obj contains the list of active PTPs.
1234 */
1235 kpm = pmap_kernel();
1236 mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
1237 rw_init(&kpm->pm_dummy_lock);
1238 for (i = 0; i < PTP_LEVELS - 1; i++) {
1239 uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
1240 uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
1241 kpm->pm_ptphint[i] = NULL;
1242 }
1243 memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
1244
1245 kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
1246 for (i = 0; i < PDP_SIZE; i++)
1247 kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
1248
1249 kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
1250 x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
1251
1252 kcpuset_create(&kpm->pm_cpus, true);
1253 kcpuset_create(&kpm->pm_kernel_cpus, true);
1254
1255 kpm->pm_ldt = NULL;
1256 kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
1257
1258 /*
1259 * the above is just a rough estimate and not critical to the proper
1260 * operation of the system.
1261 */
1262
1263 #if !defined(XENPV)
1264 /*
1265 * Begin to enable global TLB entries if they are supported: add PTE_G
1266 * attribute to already mapped kernel pages. Do that only if SVS is
1267 * disabled.
1268 *
1269 * The G bit has no effect until the CR4_PGE bit is set in CR4, which
1270 * happens later in cpu_init().
1271 */
1272 #ifdef SVS
1273 if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
1274 #else
1275 if (cpu_feature[0] & CPUID_PGE) {
1276 #endif
1277 pmap_pg_g = PTE_G;
1278 pmap_remap_global();
1279 }
1280 #endif
1281
1282 #ifndef XENPV
1283 /*
1284 * Enable large pages if they are supported.
1285 */
1286 if (cpu_feature[0] & CPUID_PSE) {
1287 lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */
1288 pmap_largepages = 1; /* enable software */
1289
1290 /*
1291 * The TLB must be flushed after enabling large pages on Pentium
1292 * CPUs, according to section 3.6.2.2 of "Intel Architecture
1293 * Software Developer's Manual, Volume 3: System Programming".
1294 */
1295 tlbflushg();
1296
1297 /* Remap the kernel. */
1298 pmap_remap_largepages();
1299 }
1300 pmap_init_lapic();
1301 #endif /* !XENPV */
1302
1303 #ifdef __HAVE_PCPU_AREA
1304 pmap_init_pcpu();
1305 #endif
1306
1307 #ifdef __HAVE_DIRECT_MAP
1308 pmap_init_directmap(kpm);
1309 #else
1310 pmap_vpage_cpualloc(&cpu_info_primary);
1311
1312 if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
1313 early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
1314 early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
1315 } else { /* amd64 */
1316 /*
1317 * zero_pte is stuck at the end of mapped space for the kernel
1318 * image (disjunct from kva space). This is done so that it
1319 * can safely be used in pmap_growkernel (pmap_get_physpage),
1320 * when it's called for the first time.
1321 * XXXfvdl fix this for MULTIPROCESSOR later.
1322 */
1323 #ifdef XENPV
1324 /* early_zerop initialized in xen_locore() */
1325 #else
1326 early_zerop = (void *)bootspace.spareva;
1327 #endif
1328 early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
1329 }
1330 #endif
1331
1332 #if defined(XENPV) && defined(__x86_64__)
1333 extern vaddr_t xen_dummy_page;
1334 paddr_t xen_dummy_user_pgd;
1335
1336 /*
1337 * We want a dummy page directory for Xen: when deactivating a pmap,
1338 * Xen will still consider it active. So we set user PGD to this one
1339 * to lift all protection on the now inactive page tables set.
1340 */
1341 xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
1342
1343 /* Zero fill it, the less checks in Xen it requires the better */
1344 memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
1345 /* Mark read-only */
1346 HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
1347 pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
1348 UVMF_INVLPG);
1349 /* Pin as L4 */
1350 xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
1351 #endif
1352
1353 /*
1354 * Allocate space for the Interrupt Descriptor Table (IDT),
1355 * Global Descriptor Table (GDT), and Local Descriptor Table
1356 * (LDT).
1357 *
1358 * Currently there is an initial temporary GDT allocated on the
1359 * stack by the caller of init386/init_x86_64, which is (among
1360 * other things) needed on i386 for %fs-relative addressing for
1361 * CPU-local data (CPUVAR(...), curcpu(), curlwp). This
1362 * initial temporary GDT will be popped off the stack before we
1363 * can enter main, so we need to make sure there is space for a
1364 * second temporary GDT to continue existing when we enter main
1365 * before we allocate space for the permanent GDT with
1366 * uvm_km(9) in gdt_init via cpu_startup and switch to that.
1367 */
1368 idt_vaddr = pmap_bootstrap_valloc(1);
1369 idt_paddr = pmap_bootstrap_palloc(1);
1370
1371 gdt_vaddr = pmap_bootstrap_valloc(1);
1372 gdt_paddr = pmap_bootstrap_palloc(1);
1373
1374 #ifdef __HAVE_PCPU_AREA
1375 ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
1376 #else
1377 ldt_vaddr = pmap_bootstrap_valloc(1);
1378 #endif
1379 ldt_paddr = pmap_bootstrap_palloc(1);
1380
1381 #if !defined(__x86_64__)
1382 /* pentium f00f bug stuff */
1383 pentium_idt_vaddr = pmap_bootstrap_valloc(1);
1384 #endif
1385
1386 #if defined(XENPVHVM)
1387 /* XXX: move to hypervisor.c with appropriate API adjustments */
1388 extern paddr_t HYPERVISOR_shared_info_pa;
1389 extern volatile struct xencons_interface *xencons_interface; /* XXX */
1390 extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
1391
1392 if (vm_guest != VM_GUEST_XENPVH) {
1393 HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
1394 HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
1395 }
1396 xencons_interface = (void *) pmap_bootstrap_valloc(1);
1397 xenstore_interface = (void *) pmap_bootstrap_valloc(1);
1398 #endif
1399 /*
1400 * Now we reserve some VM for mapping pages when doing a crash dump.
1401 */
1402 virtual_avail = reserve_dumppages(virtual_avail);
1403
1404 /*
1405 * Init the global lock and global list.
1406 */
1407 mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
1408 LIST_INIT(&pmaps);
1409
1410 /*
1411 * Ensure the TLB is sync'd with reality by flushing it...
1412 */
1413 tlbflushg();
1414
1415 /*
1416 * Calculate pmap_maxkvaddr from nkptp[].
1417 */
1418 kva = VM_MIN_KERNEL_ADDRESS;
1419 for (i = PTP_LEVELS - 1; i >= 1; i--) {
1420 kva += nkptp[i] * nbpd[i];
1421 }
1422 pmap_maxkvaddr = kva;
1423 }
1424
1425 #ifndef XENPV
1426 static void
1427 pmap_init_lapic(void)
1428 {
1429 /*
1430 * On CPUs that have no LAPIC, local_apic_va is never kentered. But our
1431 * x86 implementation relies a lot on this address to be valid; so just
1432 * allocate a fake physical page that will be kentered into
1433 * local_apic_va by machdep.
1434 *
1435 * If the LAPIC is present, the va will be remapped somewhere else
1436 * later in lapic_map.
1437 */
1438 local_apic_va = pmap_bootstrap_valloc(1);
1439 local_apic_pa = pmap_bootstrap_palloc(1);
1440 }
1441 #endif
1442
1443 #ifdef __x86_64__
1444 static size_t
1445 pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
1446 {
1447 size_t npages;
1448 npages = (roundup(endva, pgsz) / pgsz) -
1449 (rounddown(startva, pgsz) / pgsz);
1450 return npages;
1451 }
1452 #endif
1453
1454 #if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
1455 static inline void
1456 slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
1457 {
1458 size_t sslot = slotspace.area[type].sslot;
1459 size_t nslot = slotspace.area[type].nslot;
1460
1461 memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
1462 }
1463 #endif
1464
1465 #ifdef __x86_64__
1466 /*
1467 * Randomize the location of an area. We count the holes in the VM space. We
1468 * randomly select one hole, and then randomly select an area within that hole.
1469 * Finally we update the associated entry in the slotspace structure.
1470 */
1471 vaddr_t
1472 slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
1473 vaddr_t randva)
1474 {
1475 struct {
1476 int start;
1477 int end;
1478 } holes[SLSPACE_NAREAS+1];
1479 size_t i, nholes, hole;
1480 size_t startsl, endsl, nslots, winsize;
1481 vaddr_t startva, va;
1482
1483 sz = roundup(sz, align);
1484
1485 /*
1486 * Take one more slot with +NBPD_L4, because we may end up choosing
1487 * an area that crosses slots:
1488 * +------+------+------+
1489 * | Slot | Slot | Slot |
1490 * +------+------+------+
1491 * [Chosen Area]
1492 * And in that case we must take into account the additional slot
1493 * consumed.
1494 */
1495 nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
1496
1497 /* Get the holes. */
1498 nholes = 0;
1499 size_t curslot = 0 + 256; /* end of SLAREA_USER */
1500 while (1) {
1501 /*
1502 * Find the first occupied slot after the current one.
1503 * The area between the two is a hole.
1504 */
1505 size_t minsslot = 512;
1506 size_t minnslot = 0;
1507 for (i = 0; i < SLSPACE_NAREAS; i++) {
1508 if (!slotspace.area[i].active)
1509 continue;
1510 if (slotspace.area[i].sslot >= curslot &&
1511 slotspace.area[i].sslot < minsslot) {
1512 minsslot = slotspace.area[i].sslot;
1513 minnslot = slotspace.area[i].nslot;
1514 }
1515 }
1516
1517 /* No hole anymore, stop here. */
1518 if (minsslot == 512) {
1519 break;
1520 }
1521
1522 /* Register the hole. */
1523 if (minsslot - curslot >= nslots) {
1524 holes[nholes].start = curslot;
1525 holes[nholes].end = minsslot;
1526 nholes++;
1527 }
1528
1529 /* Skip that hole, and iterate again. */
1530 curslot = minsslot + minnslot;
1531 }
1532
1533 if (nholes == 0) {
1534 panic("%s: impossible", __func__);
1535 }
1536
1537 /* Select a hole. */
1538 hole = randhole;
1539 #ifdef NO_X86_ASLR
1540 hole = 0;
1541 #endif
1542 hole %= nholes;
1543 startsl = holes[hole].start;
1544 endsl = holes[hole].end;
1545 startva = VA_SIGN_NEG(startsl * NBPD_L4);
1546
1547 /* Select an area within the hole. */
1548 va = randva;
1549 #ifdef NO_X86_ASLR
1550 va = 0;
1551 #endif
1552 winsize = ((endsl - startsl) * NBPD_L4) - sz;
1553 va %= winsize;
1554 va = rounddown(va, align);
1555 va += startva;
1556
1557 /* Update the entry. */
1558 slotspace.area[type].sslot = pl4_i(va);
1559 slotspace.area[type].nslot =
1560 pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
1561 slotspace.area[type].active = true;
1562
1563 return va;
1564 }
1565 #endif
1566
1567 #ifdef __HAVE_PCPU_AREA
1568 static void
1569 pmap_init_pcpu(void)
1570 {
1571 const vaddr_t startva = PMAP_PCPU_BASE;
1572 size_t nL4e, nL3e, nL2e, nL1e;
1573 size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
1574 paddr_t pa;
1575 vaddr_t endva;
1576 vaddr_t tmpva;
1577 pt_entry_t *pte;
1578 size_t size;
1579 int i;
1580
1581 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1582
1583 size = sizeof(struct pcpu_area);
1584
1585 endva = startva + size;
1586
1587 /* We will use this temporary va. */
1588 tmpva = bootspace.spareva;
1589 pte = PTE_BASE + pl1_i(tmpva);
1590
1591 /* Build L4 */
1592 L4e_idx = pl4_i(startva);
1593 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1594 KASSERT(nL4e == 1);
1595 for (i = 0; i < nL4e; i++) {
1596 KASSERT(L4_BASE[L4e_idx+i] == 0);
1597
1598 pa = pmap_bootstrap_palloc(1);
1599 *pte = (pa & PTE_FRAME) | pteflags;
1600 pmap_update_pg(tmpva);
1601 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1602
1603 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1604 }
1605
1606 /* Build L3 */
1607 L3e_idx = pl3_i(startva);
1608 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1609 for (i = 0; i < nL3e; i++) {
1610 KASSERT(L3_BASE[L3e_idx+i] == 0);
1611
1612 pa = pmap_bootstrap_palloc(1);
1613 *pte = (pa & PTE_FRAME) | pteflags;
1614 pmap_update_pg(tmpva);
1615 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1616
1617 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1618 }
1619
1620 /* Build L2 */
1621 L2e_idx = pl2_i(startva);
1622 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1623 for (i = 0; i < nL2e; i++) {
1624
1625 KASSERT(L2_BASE[L2e_idx+i] == 0);
1626
1627 pa = pmap_bootstrap_palloc(1);
1628 *pte = (pa & PTE_FRAME) | pteflags;
1629 pmap_update_pg(tmpva);
1630 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1631
1632 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
1633 }
1634
1635 /* Build L1 */
1636 L1e_idx = pl1_i(startva);
1637 nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
1638 for (i = 0; i < nL1e; i++) {
1639 /*
1640 * Nothing to do, the PTEs will be entered via
1641 * pmap_kenter_pa.
1642 */
1643 KASSERT(L1_BASE[L1e_idx+i] == 0);
1644 }
1645
1646 *pte = 0;
1647 pmap_update_pg(tmpva);
1648
1649 pcpuarea = (struct pcpu_area *)startva;
1650
1651 tlbflush();
1652 }
1653 #endif
1654
1655 #ifdef __HAVE_DIRECT_MAP
1656 static void
1657 randomize_hole(size_t *randholep, vaddr_t *randvap)
1658 {
1659 struct nist_hash_drbg drbg;
1660 uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
1661 const char p[] = "x86/directmap";
1662 int error;
1663
1664 entropy_extract(seed, sizeof(seed), 0);
1665
1666 error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
1667 /*nonce*/NULL, 0,
1668 /*personalization*/p, strlen(p));
1669 KASSERTMSG(error == 0, "error=%d", error);
1670
1671 error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
1672 /*additional*/NULL, 0);
1673 KASSERTMSG(error == 0, "error=%d", error);
1674
1675 error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
1676 /*additional*/NULL, 0);
1677 KASSERTMSG(error == 0, "error=%d", error);
1678
1679 explicit_memset(seed, 0, sizeof(seed));
1680 explicit_memset(&drbg, 0, sizeof(drbg));
1681 }
1682
1683 /*
1684 * Create the amd64 direct map. Called only once at boot time. We map all of
1685 * the physical memory contiguously using 2MB large pages, with RW permissions.
1686 * However there is a hole: the kernel is mapped with RO permissions.
1687 */
1688 static void
1689 pmap_init_directmap(struct pmap *kpm)
1690 {
1691 extern phys_ram_seg_t mem_clusters[];
1692 extern int mem_cluster_cnt;
1693
1694 vaddr_t startva;
1695 size_t nL4e, nL3e, nL2e;
1696 size_t L4e_idx, L3e_idx, L2e_idx;
1697 size_t spahole, epahole;
1698 paddr_t lastpa, pa;
1699 vaddr_t endva;
1700 vaddr_t tmpva;
1701 pt_entry_t *pte;
1702 phys_ram_seg_t *mc;
1703 int i;
1704 size_t randhole;
1705 vaddr_t randva;
1706
1707 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
1708 const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
1709
1710 CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
1711
1712 spahole = roundup(bootspace.head.pa, NBPD_L2);
1713 epahole = rounddown(bootspace.boot.pa, NBPD_L2);
1714
1715 /* Get the last physical address available */
1716 lastpa = 0;
1717 for (i = 0; i < mem_cluster_cnt; i++) {
1718 mc = &mem_clusters[i];
1719 lastpa = MAX(lastpa, mc->start + mc->size);
1720 }
1721
1722 /*
1723 * x86_add_cluster should have truncated the memory to MAXPHYSMEM.
1724 */
1725 if (lastpa > MAXPHYSMEM) {
1726 panic("pmap_init_directmap: lastpa incorrect");
1727 }
1728
1729 randomize_hole(&randhole, &randva);
1730 startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
1731 randhole, randva);
1732 endva = startva + lastpa;
1733
1734 /* We will use this temporary va. */
1735 tmpva = bootspace.spareva;
1736 pte = PTE_BASE + pl1_i(tmpva);
1737
1738 /* Build L4 */
1739 L4e_idx = pl4_i(startva);
1740 nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
1741 KASSERT(nL4e <= NL4_SLOT_DIRECT);
1742 for (i = 0; i < nL4e; i++) {
1743 KASSERT(L4_BASE[L4e_idx+i] == 0);
1744
1745 pa = pmap_bootstrap_palloc(1);
1746 *pte = (pa & PTE_FRAME) | pteflags;
1747 pmap_update_pg(tmpva);
1748 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1749
1750 L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
1751 }
1752
1753 /* Build L3 */
1754 L3e_idx = pl3_i(startva);
1755 nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
1756 for (i = 0; i < nL3e; i++) {
1757 KASSERT(L3_BASE[L3e_idx+i] == 0);
1758
1759 pa = pmap_bootstrap_palloc(1);
1760 *pte = (pa & PTE_FRAME) | pteflags;
1761 pmap_update_pg(tmpva);
1762 memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
1763
1764 L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
1765 }
1766
1767 /* Build L2 */
1768 L2e_idx = pl2_i(startva);
1769 nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
1770 for (i = 0; i < nL2e; i++) {
1771 KASSERT(L2_BASE[L2e_idx+i] == 0);
1772
1773 pa = (paddr_t)(i * NBPD_L2);
1774
1775 if (spahole <= pa && pa < epahole) {
1776 L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
1777 PTE_PS | pmap_pg_g;
1778 } else {
1779 L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
1780 PTE_PS | pmap_pg_g;
1781 }
1782 }
1783
1784 *pte = 0;
1785 pmap_update_pg(tmpva);
1786
1787 pmap_direct_base = startva;
1788 pmap_direct_end = endva;
1789
1790 tlbflush();
1791 }
1792 #endif /* __HAVE_DIRECT_MAP */
1793
1794 #if !defined(XENPV)
1795 /*
1796 * Remap all of the virtual pages created so far with the PTE_G bit.
1797 */
1798 static void
1799 pmap_remap_global(void)
1800 {
1801 vaddr_t kva, kva_end;
1802 unsigned long p1i;
1803 size_t i;
1804
1805 /* head */
1806 kva = bootspace.head.va;
1807 kva_end = kva + bootspace.head.sz;
1808 for ( ; kva < kva_end; kva += PAGE_SIZE) {
1809 p1i = pl1_i(kva);
1810 if (pmap_valid_entry(PTE_BASE[p1i]))
1811 PTE_BASE[p1i] |= pmap_pg_g;
1812 }
1813
1814 /* kernel segments */
1815 for (i = 0; i < BTSPACE_NSEGS; i++) {
1816 if (bootspace.segs[i].type == BTSEG_NONE) {
1817 continue;
1818 }
1819 kva = bootspace.segs[i].va;
1820 kva_end = kva + bootspace.segs[i].sz;
1821 for ( ; kva < kva_end; kva += PAGE_SIZE) {
1822 p1i = pl1_i(kva);
1823 if (pmap_valid_entry(PTE_BASE[p1i]))
1824 PTE_BASE[p1i] |= pmap_pg_g;
1825 }
1826 }
1827
1828 /* boot space */
1829 kva = bootspace.boot.va;
1830 kva_end = kva + bootspace.boot.sz;
1831 for ( ; kva < kva_end; kva += PAGE_SIZE) {
1832 p1i = pl1_i(kva);
1833 if (pmap_valid_entry(PTE_BASE[p1i]))
1834 PTE_BASE[p1i] |= pmap_pg_g;
1835 }
1836 }
1837 #endif
1838
1839 #ifndef XENPV
1840 /*
1841 * Remap several kernel segments with large pages. We cover as many pages as we
1842 * can. Called only once at boot time, if the CPU supports large pages.
1843 */
1844 static void
1845 pmap_remap_largepages(void)
1846 {
1847 pd_entry_t *pde;
1848 vaddr_t kva, kva_end;
1849 paddr_t pa;
1850 size_t i;
1851
1852 /* Remap the kernel text using large pages. */
1853 for (i = 0; i < BTSPACE_NSEGS; i++) {
1854 if (bootspace.segs[i].type != BTSEG_TEXT) {
1855 continue;
1856 }
1857 kva = roundup(bootspace.segs[i].va, NBPD_L2);
1858 if (kva < bootspace.segs[i].va) {
1859 continue;
1860 }
1861 kva_end = rounddown(bootspace.segs[i].va +
1862 bootspace.segs[i].sz, NBPD_L2);
1863 pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1864 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1865 pde = &L2_BASE[pl2_i(kva)];
1866 *pde = pa | pmap_pg_g | PTE_PS | PTE_P;
1867 tlbflushg();
1868 }
1869 }
1870
1871 /* Remap the kernel rodata using large pages. */
1872 for (i = 0; i < BTSPACE_NSEGS; i++) {
1873 if (bootspace.segs[i].type != BTSEG_RODATA) {
1874 continue;
1875 }
1876 kva = roundup(bootspace.segs[i].va, NBPD_L2);
1877 if (kva < bootspace.segs[i].va) {
1878 continue;
1879 }
1880 kva_end = rounddown(bootspace.segs[i].va +
1881 bootspace.segs[i].sz, NBPD_L2);
1882 pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1883 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1884 pde = &L2_BASE[pl2_i(kva)];
1885 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
1886 tlbflushg();
1887 }
1888 }
1889
1890 /* Remap the kernel data+bss using large pages. */
1891 for (i = 0; i < BTSPACE_NSEGS; i++) {
1892 if (bootspace.segs[i].type != BTSEG_DATA) {
1893 continue;
1894 }
1895 kva = roundup(bootspace.segs[i].va, NBPD_L2);
1896 if (kva < bootspace.segs[i].va) {
1897 continue;
1898 }
1899 kva_end = rounddown(bootspace.segs[i].va +
1900 bootspace.segs[i].sz, NBPD_L2);
1901 pa = roundup(bootspace.segs[i].pa, NBPD_L2);
1902 for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
1903 pde = &L2_BASE[pl2_i(kva)];
1904 *pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
1905 tlbflushg();
1906 }
1907 }
1908 }
1909 #endif /* !XENPV */
1910
1911 /*
1912 * pmap_init: called from uvm_init, our job is to get the pmap system ready
1913 * to manage mappings.
1914 */
1915 void
1916 pmap_init(void)
1917 {
1918 int flags;
1919
1920 /*
1921 * initialize caches.
1922 */
1923
1924 pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
1925 0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
1926
1927 #ifdef XENPV
1928 /*
1929 * pool_cache(9) should not touch cached objects, since they
1930 * are pinned on xen and R/O for the domU
1931 */
1932 flags = PR_NOTOUCH;
1933 #else
1934 flags = 0;
1935 #endif
1936
1937 #ifdef PAE
1938 pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
1939 "pdppl", &pmap_pdp_allocator, IPL_NONE);
1940 #else
1941 pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
1942 "pdppl", NULL, IPL_NONE);
1943 #endif
1944 pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
1945 0, 0, "pvpage", &pool_allocator_kmem,
1946 IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
1947
1948 pmap_tlb_init();
1949
1950 /* XXX: Since cpu_hatch() is only for secondary CPUs. */
1951 pmap_tlb_cpu_init(curcpu());
1952
1953 evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
1954 NULL, "x86", "io bitmap copy");
1955 evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
1956 NULL, "x86", "ldt sync");
1957
1958 /*
1959 * The kernel doesn't keep track of PTPs, so there's nowhere handy
1960 * to hang a tree of pv_entry records. Dynamically allocated
1961 * pv_entry lists are not heavily used in the kernel's pmap (the
1962 * usual case is embedded), so cop out and use a single RB tree
1963 * to cover them.
1964 */
1965 rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
1966
1967 /*
1968 * done: pmap module is up (and ready for business)
1969 */
1970
1971 pmap_initialized = true;
1972 }
1973
1974 #ifndef XENPV
1975 /*
1976 * pmap_cpu_init_late: perform late per-CPU initialization.
1977 */
1978 void
1979 pmap_cpu_init_late(struct cpu_info *ci)
1980 {
1981 /*
1982 * The BP has already its own PD page allocated during early
1983 * MD startup.
1984 */
1985 if (ci == &cpu_info_primary)
1986 return;
1987 #ifdef PAE
1988 cpu_alloc_l3_page(ci);
1989 #endif
1990 }
1991 #endif
1992
1993 #ifndef __HAVE_DIRECT_MAP
1994 CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
1995 CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
1996
1997 static void
1998 pmap_vpage_cpualloc(struct cpu_info *ci)
1999 {
2000 bool primary = (ci == &cpu_info_primary);
2001 size_t i, npages;
2002 vaddr_t vabase;
2003 vsize_t vrange;
2004
2005 npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
2006 KASSERT(npages >= VPAGE_MAX);
2007 vrange = npages * PAGE_SIZE;
2008
2009 if (primary) {
2010 while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
2011 /* Waste some pages to align properly */
2012 }
2013 /* The base is aligned, allocate the rest (contiguous) */
2014 pmap_bootstrap_valloc(npages - 1);
2015 } else {
2016 vabase = uvm_km_alloc(kernel_map, vrange, vrange,
2017 UVM_KMF_VAONLY);
2018 if (vabase == 0) {
2019 panic("%s: failed to allocate tmp VA for CPU %d\n",
2020 __func__, cpu_index(ci));
2021 }
2022 }
2023
2024 KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
2025
2026 for (i = 0; i < VPAGE_MAX; i++) {
2027 ci->vpage[i] = vabase + i * PAGE_SIZE;
2028 ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
2029 }
2030 }
2031
2032 void
2033 pmap_vpage_cpu_init(struct cpu_info *ci)
2034 {
2035 if (ci == &cpu_info_primary) {
2036 /* cpu0 already taken care of in pmap_bootstrap */
2037 return;
2038 }
2039
2040 pmap_vpage_cpualloc(ci);
2041 }
2042 #endif
2043
2044 /*
2045 * p v _ e n t r y f u n c t i o n s
2046 */
2047
2048 /*
2049 * pmap_pvp_dtor: pool_cache constructor for PV pages.
2050 */
2051 static int
2052 pmap_pvp_ctor(void *arg, void *obj, int flags)
2053 {
2054 struct pv_page *pvp = (struct pv_page *)obj;
2055 struct pv_entry *pve = (struct pv_entry *)obj + 1;
2056 struct pv_entry *maxpve = pve + PVE_PER_PVP;
2057
2058 KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
2059 KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
2060
2061 LIST_INIT(&pvp->pvp_pves);
2062 pvp->pvp_nfree = PVE_PER_PVP;
2063 pvp->pvp_pmap = NULL;
2064
2065 for (; pve < maxpve; pve++) {
2066 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2067 }
2068
2069 return 0;
2070 }
2071
2072 /*
2073 * pmap_pvp_dtor: pool_cache destructor for PV pages.
2074 */
2075 static void
2076 pmap_pvp_dtor(void *arg, void *obj)
2077 {
2078 struct pv_page *pvp __diagused = obj;
2079
2080 KASSERT(pvp->pvp_pmap == NULL);
2081 KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2082 }
2083
2084 /*
2085 * pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
2086 */
2087 static struct pv_entry *
2088 pmap_alloc_pv(struct pmap *pmap)
2089 {
2090 struct pv_entry *pve;
2091 struct pv_page *pvp;
2092
2093 KASSERT(mutex_owned(&pmap->pm_lock));
2094
2095 if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
2096 if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2097 LIST_REMOVE(pvp, pvp_list);
2098 } else {
2099 pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
2100 }
2101 if (__predict_false(pvp == NULL)) {
2102 return NULL;
2103 }
2104 /* full -> part */
2105 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2106 pvp->pvp_pmap = pmap;
2107 }
2108
2109 KASSERT(pvp->pvp_pmap == pmap);
2110 KASSERT(pvp->pvp_nfree > 0);
2111
2112 pve = LIST_FIRST(&pvp->pvp_pves);
2113 LIST_REMOVE(pve, pve_list);
2114 pvp->pvp_nfree--;
2115
2116 if (__predict_false(pvp->pvp_nfree == 0)) {
2117 /* part -> empty */
2118 KASSERT(LIST_EMPTY(&pvp->pvp_pves));
2119 LIST_REMOVE(pvp, pvp_list);
2120 LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
2121 } else {
2122 KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
2123 }
2124
2125 return pve;
2126 }
2127
2128 /*
2129 * pmap_free_pv: delayed free of a PV entry.
2130 */
2131 static void
2132 pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
2133 {
2134 struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
2135
2136 KASSERT(mutex_owned(&pmap->pm_lock));
2137 KASSERT(pvp->pvp_pmap == pmap);
2138 KASSERT(pvp->pvp_nfree >= 0);
2139
2140 LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
2141 pvp->pvp_nfree++;
2142
2143 if (__predict_false(pvp->pvp_nfree == 1)) {
2144 /* empty -> part */
2145 LIST_REMOVE(pvp, pvp_list);
2146 LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
2147 } else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
2148 /* part -> full */
2149 LIST_REMOVE(pvp, pvp_list);
2150 LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
2151 }
2152 }
2153
2154 /*
2155 * pmap_drain_pv: free full PV pages.
2156 */
2157 static void
2158 pmap_drain_pv(struct pmap *pmap)
2159 {
2160 struct pv_page *pvp;
2161
2162 KASSERT(mutex_owned(&pmap->pm_lock));
2163
2164 while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
2165 LIST_REMOVE(pvp, pvp_list);
2166 KASSERT(pvp->pvp_pmap == pmap);
2167 KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
2168 pvp->pvp_pmap = NULL;
2169 pool_cache_put(&pmap_pvp_cache, pvp);
2170 }
2171 }
2172
2173 /*
2174 * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page
2175 */
2176 static void
2177 pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
2178 vaddr_t va, bool tracked)
2179 {
2180 #ifdef DEBUG
2181 struct pv_pte *pvpte;
2182
2183 PMAP_CHECK_PP(pp);
2184
2185 mutex_spin_enter(&pp->pp_lock);
2186 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
2187 if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
2188 break;
2189 }
2190 }
2191 mutex_spin_exit(&pp->pp_lock);
2192
2193 if (pvpte && !tracked) {
2194 panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
2195 } else if (!pvpte && tracked) {
2196 panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
2197 }
2198 #endif
2199 }
2200
2201 /*
2202 * pmap_treelookup_pv: search the PV tree for a dynamic entry
2203 *
2204 * => pmap must be locked
2205 */
2206 static struct pv_entry *
2207 pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2208 const rb_tree_t *tree, const vaddr_t va)
2209 {
2210 struct pv_entry *pve;
2211 rb_node_t *node;
2212
2213 /*
2214 * Inlined lookup tailored for exactly what's needed here that is
2215 * quite a bit faster than using rb_tree_find_node().
2216 */
2217 for (node = tree->rbt_root;;) {
2218 if (__predict_false(RB_SENTINEL_P(node))) {
2219 return NULL;
2220 }
2221 pve = (struct pv_entry *)
2222 ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
2223 if (pve->pve_pte.pte_va == va) {
2224 KASSERT(pve->pve_pte.pte_ptp == ptp);
2225 return pve;
2226 }
2227 node = node->rb_nodes[pve->pve_pte.pte_va < va];
2228 }
2229 }
2230
2231 /*
2232 * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
2233 *
2234 * => a PV entry must be known present (doesn't check for existence)
2235 * => pmap must be locked
2236 */
2237 static struct pv_entry *
2238 pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
2239 const struct pmap_page * const old_pp, const vaddr_t va)
2240 {
2241 struct pv_entry *pve;
2242 const rb_tree_t *tree;
2243
2244 KASSERT(mutex_owned(&pmap->pm_lock));
2245 KASSERT(ptp != NULL || pmap == pmap_kernel());
2246
2247 /*
2248 * [This mostly deals with the case of process-private pages, i.e.
2249 * anonymous memory allocations or COW.]
2250 *
2251 * If the page is tracked with an embedded entry then the tree
2252 * lookup can be avoided. It's safe to check for this specific
2253 * set of values without pp_lock because both will only ever be
2254 * set together for this pmap.
2255 *
2256 */
2257 if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
2258 atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
2259 return NULL;
2260 }
2261
2262 /*
2263 * [This mostly deals with shared mappings, for example shared libs
2264 * and executables.]
2265 *
2266 * Optimise for pmap_remove_ptes() which works by ascending scan:
2267 * look at the lowest numbered node in the tree first. The tree is
2268 * known non-empty because of the check above. For short lived
2269 * processes where pmap_remove() isn't used much this gets close to
2270 * a 100% hit rate.
2271 */
2272 tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2273 KASSERT(!RB_SENTINEL_P(tree->rbt_root));
2274 pve = (struct pv_entry *)
2275 ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
2276 offsetof(struct pv_entry, pve_rb));
2277 if (__predict_true(pve->pve_pte.pte_va == va)) {
2278 KASSERT(pve->pve_pte.pte_ptp == ptp);
2279 return pve;
2280 }
2281
2282 /* Search the RB tree for the key (uncommon). */
2283 return pmap_treelookup_pv(pmap, ptp, tree, va);
2284 }
2285
2286 /*
2287 * pmap_enter_pv: enter a mapping onto a pmap_page lst
2288 *
2289 * => pmap must be locked
2290 * => does NOT insert dynamic entries to tree (pmap_enter() does later)
2291 */
2292 static int
2293 pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2294 vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
2295 bool *samepage, bool *new_embedded, rb_tree_t *tree)
2296 {
2297 struct pv_entry *pve;
2298 int error;
2299
2300 KASSERT(mutex_owned(&pmap->pm_lock));
2301 KASSERT(ptp_to_pmap(ptp) == pmap);
2302 KASSERT(ptp == NULL || ptp->uobject != NULL);
2303 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2304 PMAP_CHECK_PP(pp);
2305
2306 /*
2307 * If entering the same page and it's already tracked with an
2308 * embedded entry, we can avoid the expense below. It's safe
2309 * to check for this very specific set of values without a lock
2310 * because both will only ever be set together for this pmap.
2311 */
2312 if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
2313 atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
2314 *samepage = true;
2315 pmap_check_pv(pmap, ptp, pp, va, true);
2316 return 0;
2317 }
2318
2319 /*
2320 * Check for an existing dynamic mapping at this address. If it's
2321 * for the same page, then it will be reused and nothing needs to be
2322 * changed.
2323 */
2324 *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
2325 if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
2326 *samepage = true;
2327 pmap_check_pv(pmap, ptp, pp, va, true);
2328 return 0;
2329 }
2330
2331 /*
2332 * Need to put a new mapping in place. Grab a spare pv_entry in
2333 * case it's needed; won't know for sure until the lock is taken.
2334 */
2335 if (pmap->pm_pve == NULL) {
2336 pmap->pm_pve = pmap_alloc_pv(pmap);
2337 }
2338
2339 error = 0;
2340 pmap_check_pv(pmap, ptp, pp, va, false);
2341 mutex_spin_enter(&pp->pp_lock);
2342 if (!pv_pte_embedded(pp)) {
2343 /*
2344 * Embedded PV tracking available - easy.
2345 */
2346 pp->pp_pte.pte_ptp = ptp;
2347 pp->pp_pte.pte_va = va;
2348 *new_embedded = true;
2349 } else if (__predict_false(pmap->pm_pve == NULL)) {
2350 /*
2351 * No memory.
2352 */
2353 error = ENOMEM;
2354 } else {
2355 /*
2356 * Install new pv_entry on the page.
2357 */
2358 pve = pmap->pm_pve;
2359 pmap->pm_pve = NULL;
2360 *new_pve = pve;
2361 pve->pve_pte.pte_ptp = ptp;
2362 pve->pve_pte.pte_va = va;
2363 pve->pve_pp = pp;
2364 LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
2365 }
2366 mutex_spin_exit(&pp->pp_lock);
2367 if (error == 0) {
2368 pmap_check_pv(pmap, ptp, pp, va, true);
2369 }
2370
2371 return error;
2372 }
2373
2374 /*
2375 * pmap_remove_pv: try to remove a mapping from a pv_list
2376 *
2377 * => pmap must be locked
2378 * => removes dynamic entries from tree and frees them
2379 * => caller should adjust ptp's wire_count and free PTP if needed
2380 */
2381 static void
2382 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
2383 vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
2384 {
2385 rb_tree_t *tree = (ptp != NULL ?
2386 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
2387
2388 KASSERT(mutex_owned(&pmap->pm_lock));
2389 KASSERT(ptp_to_pmap(ptp) == pmap);
2390 KASSERT(ptp == NULL || ptp->uobject != NULL);
2391 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
2392 KASSERT(ptp != NULL || pmap == pmap_kernel());
2393
2394 pmap_check_pv(pmap, ptp, pp, va, true);
2395
2396 if (pve == NULL) {
2397 mutex_spin_enter(&pp->pp_lock);
2398 KASSERT(pp->pp_pte.pte_ptp == ptp);
2399 KASSERT(pp->pp_pte.pte_va == va);
2400 pp->pp_attrs |= oattrs;
2401 pp->pp_pte.pte_ptp = NULL;
2402 pp->pp_pte.pte_va = 0;
2403 mutex_spin_exit(&pp->pp_lock);
2404 } else {
2405 mutex_spin_enter(&pp->pp_lock);
2406 KASSERT(pp->pp_pte.pte_ptp != ptp ||
2407 pp->pp_pte.pte_va != va);
2408 KASSERT(pve->pve_pte.pte_ptp == ptp);
2409 KASSERT(pve->pve_pte.pte_va == va);
2410 KASSERT(pve->pve_pp == pp);
2411 pp->pp_attrs |= oattrs;
2412 LIST_REMOVE(pve, pve_list);
2413 mutex_spin_exit(&pp->pp_lock);
2414
2415 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
2416 rb_tree_remove_node(tree, pve);
2417 #ifdef DIAGNOSTIC
2418 memset(pve, 0, sizeof(*pve));
2419 #endif
2420 pmap_free_pv(pmap, pve);
2421 }
2422
2423 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
2424 pmap_check_pv(pmap, ptp, pp, va, false);
2425 }
2426
2427 /*
2428 * p t p f u n c t i o n s
2429 */
2430
2431 static struct vm_page *
2432 pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
2433 {
2434 int lidx = level - 1;
2435 off_t off = ptp_va2o(va, level);
2436 struct vm_page *pg;
2437
2438 KASSERT(mutex_owned(&pmap->pm_lock));
2439
2440 if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
2441 KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
2442 pg = pmap->pm_ptphint[lidx];
2443 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2444 return pg;
2445 }
2446 PMAP_DUMMY_LOCK(pmap);
2447 pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
2448 PMAP_DUMMY_UNLOCK(pmap);
2449 if (pg != NULL && __predict_false(pg->wire_count == 0)) {
2450 /* This page is queued to be freed - ignore. */
2451 pg = NULL;
2452 }
2453 if (pg != NULL) {
2454 PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
2455 }
2456 pmap->pm_ptphint[lidx] = pg;
2457 return pg;
2458 }
2459
2460 static inline void
2461 pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
2462 {
2463 int lidx;
2464
2465 KASSERT(ptp->wire_count <= 1);
2466 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
2467
2468 lidx = level - 1;
2469 pmap_stats_update(pmap, -ptp->wire_count, 0);
2470 if (pmap->pm_ptphint[lidx] == ptp)
2471 pmap->pm_ptphint[lidx] = NULL;
2472 ptp->wire_count = 0;
2473 ptp->uanon = NULL;
2474 KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
2475
2476 /*
2477 * Enqueue the PTP to be freed by pmap_update(). We can't remove
2478 * the page from the uvm_object, as that can take further locks
2479 * (intolerable right now because the PTEs are likely mapped in).
2480 * Instead mark the PTP as free and if we bump into it again, we'll
2481 * either ignore or reuse (depending on what's useful at the time).
2482 */
2483 LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
2484 }
2485
2486 static void
2487 pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
2488 pt_entry_t *ptes, pd_entry_t * const *pdes)
2489 {
2490 unsigned long index;
2491 int level;
2492 vaddr_t invaladdr;
2493 pd_entry_t opde;
2494
2495 KASSERT(pmap != pmap_kernel());
2496 KASSERT(mutex_owned(&pmap->pm_lock));
2497 KASSERT(kpreempt_disabled());
2498
2499 level = 1;
2500 do {
2501 index = pl_i(va, level + 1);
2502 opde = pmap_pte_testset(&pdes[level - 1][index], 0);
2503
2504 /*
2505 * On Xen-amd64 or SVS, we need to sync the top level page
2506 * directory on each CPU.
2507 */
2508 #if defined(XENPV) && defined(__x86_64__)
2509 if (level == PTP_LEVELS - 1) {
2510 xen_kpm_sync(pmap, index);
2511 }
2512 #elif defined(SVS)
2513 if (svs_enabled && level == PTP_LEVELS - 1 &&
2514 pmap_is_user(pmap)) {
2515 svs_pmap_sync(pmap, index);
2516 }
2517 #endif
2518
2519 invaladdr = level == 1 ? (vaddr_t)ptes :
2520 (vaddr_t)pdes[level - 2];
2521 pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
2522 opde, TLBSHOOT_FREE_PTP);
2523
2524 #if defined(XENPV)
2525 pmap_tlb_shootnow();
2526 #endif
2527
2528 pmap_freepage(pmap, ptp, level);
2529 if (level < PTP_LEVELS - 1) {
2530 ptp = pmap_find_ptp(pmap, va, level + 1);
2531 ptp->wire_count--;
2532 if (ptp->wire_count > 1)
2533 break;
2534 }
2535 } while (++level < PTP_LEVELS);
2536 pmap_pte_flush();
2537 }
2538
2539 /*
2540 * pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
2541 *
2542 * => pmap should NOT be pmap_kernel()
2543 * => pmap should be locked
2544 * => we are not touching any PTEs yet, so they need not be mapped in
2545 */
2546 static int
2547 pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2548 int flags, struct vm_page **resultp)
2549 {
2550 struct vm_page *ptp;
2551 int i, aflags;
2552 struct uvm_object *obj;
2553 voff_t off;
2554
2555 KASSERT(pmap != pmap_kernel());
2556 KASSERT(mutex_owned(&pmap->pm_lock));
2557
2558 /*
2559 * Loop through all page table levels allocating a page
2560 * for any level where we don't already have one.
2561 */
2562 memset(pt, 0, sizeof(*pt));
2563 aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
2564 UVM_PGA_ZERO;
2565 for (i = PTP_LEVELS; i > 1; i--) {
2566 obj = &pmap->pm_obj[i - 2];
2567 off = ptp_va2o(va, i - 1);
2568
2569 PMAP_DUMMY_LOCK(pmap);
2570 pt->pg[i] = uvm_pagelookup(obj, off);
2571
2572 if (pt->pg[i] == NULL) {
2573 pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
2574 pt->alloced[i] = (pt->pg[i] != NULL);
2575 } else if (pt->pg[i]->wire_count == 0) {
2576 /* This page was queued to be freed; dequeue it. */
2577 LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
2578 pt->alloced[i] = true;
2579 }
2580 PMAP_DUMMY_UNLOCK(pmap);
2581 if (pt->pg[i] == NULL) {
2582 pmap_unget_ptp(pmap, pt);
2583 return ENOMEM;
2584 } else if (pt->alloced[i]) {
2585 pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
2586 rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
2587 &pmap_rbtree_ops);
2588 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2589 }
2590 }
2591 ptp = pt->pg[2];
2592 KASSERT(ptp != NULL);
2593 *resultp = ptp;
2594 pmap->pm_ptphint[0] = ptp;
2595 return 0;
2596 }
2597
2598 /*
2599 * pmap_install_ptp: install any freshly allocated PTPs
2600 *
2601 * => pmap should NOT be pmap_kernel()
2602 * => pmap should be locked
2603 * => PTEs must be mapped
2604 * => preemption must be disabled
2605 */
2606 static void
2607 pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
2608 pd_entry_t * const *pdes)
2609 {
2610 struct vm_page *ptp;
2611 unsigned long index;
2612 pd_entry_t *pva;
2613 paddr_t pa;
2614 int i;
2615
2616 KASSERT(pmap != pmap_kernel());
2617 KASSERT(mutex_owned(&pmap->pm_lock));
2618 KASSERT(kpreempt_disabled());
2619
2620 /*
2621 * Now that we have all the pages looked up or allocated,
2622 * loop through again installing any new ones into the tree.
2623 */
2624 for (i = PTP_LEVELS; i > 1; i--) {
2625 index = pl_i(va, i);
2626 pva = pdes[i - 2];
2627
2628 if (pmap_valid_entry(pva[index])) {
2629 KASSERT(!pt->alloced[i]);
2630 continue;
2631 }
2632
2633 ptp = pt->pg[i];
2634 ptp->flags &= ~PG_BUSY; /* never busy */
2635 ptp->wire_count = 1;
2636 pmap->pm_ptphint[i - 2] = ptp;
2637 pa = VM_PAGE_TO_PHYS(ptp);
2638 pmap_pte_set(&pva[index], (pd_entry_t)
2639 (pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
2640
2641 /*
2642 * On Xen-amd64 or SVS, we need to sync the top level page
2643 * directory on each CPU.
2644 */
2645 #if defined(XENPV) && defined(__x86_64__)
2646 if (i == PTP_LEVELS) {
2647 xen_kpm_sync(pmap, index);
2648 }
2649 #elif defined(SVS)
2650 if (svs_enabled && i == PTP_LEVELS &&
2651 pmap_is_user(pmap)) {
2652 svs_pmap_sync(pmap, index);
2653 }
2654 #endif
2655
2656 pmap_pte_flush();
2657 pmap_stats_update(pmap, 1, 0);
2658
2659 /*
2660 * If we're not in the top level, increase the
2661 * wire count of the parent page.
2662 */
2663 if (i < PTP_LEVELS) {
2664 pt->pg[i + 1]->wire_count++;
2665 }
2666 }
2667 }
2668
2669 /*
2670 * pmap_unget_ptp: free unusued PTPs
2671 *
2672 * => pmap should NOT be pmap_kernel()
2673 * => pmap should be locked
2674 */
2675 static void
2676 pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
2677 {
2678 int i;
2679
2680 KASSERT(pmap != pmap_kernel());
2681 KASSERT(mutex_owned(&pmap->pm_lock));
2682
2683 for (i = PTP_LEVELS; i > 1; i--) {
2684 if (!pt->alloced[i]) {
2685 continue;
2686 }
2687 KASSERT(pt->pg[i]->wire_count == 0);
2688 PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
2689 pmap_freepage(pmap, pt->pg[i], i - 1);
2690 }
2691 }
2692
2693 /*
2694 * p m a p l i f e c y c l e f u n c t i o n s
2695 */
2696
2697 /*
2698 * pmap_pdp_init: constructor a new PDP.
2699 */
2700 static void
2701 pmap_pdp_init(pd_entry_t *pdir)
2702 {
2703 paddr_t pdirpa = 0;
2704 vaddr_t object;
2705 int i;
2706
2707 #if !defined(XENPV) || !defined(__x86_64__)
2708 int npde;
2709 #endif
2710 #ifdef XENPV
2711 int s;
2712 #endif
2713
2714 memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
2715
2716 /*
2717 * NOTE: This is all done unlocked, but we will check afterwards
2718 * if we have raced with pmap_growkernel().
2719 */
2720
2721 #if defined(XENPV) && defined(__x86_64__)
2722 /* Fetch the physical address of the page directory */
2723 (void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
2724
2725 /*
2726 * This pdir will NEVER be active in kernel mode, so mark
2727 * recursive entry invalid.
2728 */
2729 pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
2730
2731 /*
2732 * PDP constructed this way won't be for the kernel, hence we
2733 * don't put kernel mappings on Xen.
2734 *
2735 * But we need to make pmap_create() happy, so put a dummy
2736 * (without PTE_P) value at the right place.
2737 */
2738 pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
2739 (pd_entry_t)-1 & PTE_FRAME;
2740 #else /* XENPV && __x86_64__*/
2741 object = (vaddr_t)pdir;
2742 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2743 /* Fetch the physical address of the page directory */
2744 (void)pmap_extract(pmap_kernel(), object, &pdirpa);
2745
2746 /* Put in recursive PDE to map the PTEs */
2747 pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
2748 pmap_pg_nx;
2749 #ifndef XENPV
2750 pdir[PDIR_SLOT_PTE + i] |= PTE_W;
2751 #endif
2752 }
2753
2754 /* Copy the kernel's top level PDE */
2755 npde = nkptp[PTP_LEVELS - 1];
2756
2757 memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
2758 npde * sizeof(pd_entry_t));
2759
2760 if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
2761 int idx = pl_i(KERNBASE, PTP_LEVELS);
2762 pdir[idx] = PDP_BASE[idx];
2763 }
2764
2765 #ifdef __HAVE_PCPU_AREA
2766 pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
2767 #endif
2768 #ifdef __HAVE_DIRECT_MAP
2769 slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
2770 #endif
2771 #ifdef KASAN
2772 slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
2773 #endif
2774 #ifdef KMSAN
2775 slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
2776 #endif
2777 #endif /* XENPV && __x86_64__*/
2778
2779 #ifdef XENPV
2780 s = splvm();
2781 object = (vaddr_t)pdir;
2782 pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
2783 VM_PROT_READ);
2784 pmap_update(pmap_kernel());
2785 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2786 /*
2787 * pin as L2/L4 page, we have to do the page with the
2788 * PDIR_SLOT_PTE entries last
2789 */
2790 #ifdef PAE
2791 if (i == l2tol3(PDIR_SLOT_PTE))
2792 continue;
2793 #endif
2794
2795 (void) pmap_extract(pmap_kernel(), object, &pdirpa);
2796 #ifdef __x86_64__
2797 xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
2798 #else
2799 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2800 #endif
2801 }
2802 #ifdef PAE
2803 object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE);
2804 (void)pmap_extract(pmap_kernel(), object, &pdirpa);
2805 xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
2806 #endif
2807 splx(s);
2808 #endif /* XENPV */
2809 }
2810
2811 /*
2812 * pmap_pdp_fini: destructor for the PDPs.
2813 */
2814 static void
2815 pmap_pdp_fini(pd_entry_t *pdir)
2816 {
2817 #ifdef XENPV
2818 paddr_t pdirpa = 0; /* XXX: GCC */
2819 vaddr_t object = (vaddr_t)pdir;
2820 int i;
2821 int s = splvm();
2822 pt_entry_t *pte;
2823
2824 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2825 /* fetch the physical address of the page directory. */
2826 (void) pmap_extract(pmap_kernel(), object, &pdirpa);
2827 /* unpin page table */
2828 xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
2829 }
2830 object = (vaddr_t)pdir;
2831 for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
2832 /* Set page RW again */
2833 pte = kvtopte(object);
2834 pmap_pte_set(pte, *pte | PTE_W);
2835 xen_bcast_invlpg((vaddr_t)object);
2836 }
2837 splx(s);
2838 #endif /* XENPV */
2839 }
2840
2841 #ifdef PAE
2842 static void *
2843 pmap_pdp_alloc(struct pool *pp, int flags)
2844 {
2845 return (void *)uvm_km_alloc(kernel_map,
2846 PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
2847 ((flags & PR_WAITOK) ? UVM_KMF_WAITVA
2848 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
2849 UVM_KMF_WIRED);
2850 }
2851
2852 static void
2853 pmap_pdp_free(struct pool *pp, void *v)
2854 {
2855 uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
2856 UVM_KMF_WIRED);
2857 }
2858 #endif /* PAE */
2859
2860 /*
2861 * pmap_ctor: constructor for the pmap cache.
2862 */
2863 static int
2864 pmap_ctor(void *arg, void *obj, int flags)
2865 {
2866 struct pmap *pmap = obj;
2867 pt_entry_t p;
2868 int i;
2869
2870 KASSERT((flags & PR_WAITOK) != 0);
2871
2872 mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
2873 rw_init(&pmap->pm_dummy_lock);
2874 kcpuset_create(&pmap->pm_cpus, true);
2875 kcpuset_create(&pmap->pm_kernel_cpus, true);
2876 #ifdef XENPV
2877 kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
2878 #endif
2879 LIST_INIT(&pmap->pm_gc_ptp);
2880 pmap->pm_pve = NULL;
2881 LIST_INIT(&pmap->pm_pvp_full);
2882 LIST_INIT(&pmap->pm_pvp_part);
2883 LIST_INIT(&pmap->pm_pvp_empty);
2884
2885 /* allocate and init PDP */
2886 pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
2887
2888 for (;;) {
2889 pmap_pdp_init(pmap->pm_pdir);
2890 mutex_enter(&pmaps_lock);
2891 p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
2892 if (__predict_true(p != 0)) {
2893 break;
2894 }
2895 mutex_exit(&pmaps_lock);
2896 }
2897
2898 for (i = 0; i < PDP_SIZE; i++)
2899 pmap->pm_pdirpa[i] =
2900 pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
2901
2902 LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
2903 mutex_exit(&pmaps_lock);
2904
2905 return 0;
2906 }
2907
2908 /*
2909 * pmap_ctor: destructor for the pmap cache.
2910 */
2911 static void
2912 pmap_dtor(void *arg, void *obj)
2913 {
2914 struct pmap *pmap = obj;
2915
2916 mutex_enter(&pmaps_lock);
2917 LIST_REMOVE(pmap, pm_list);
2918 mutex_exit(&pmaps_lock);
2919
2920 pmap_pdp_fini(pmap->pm_pdir);
2921 pool_put(&pmap_pdp_pool, pmap->pm_pdir);
2922 mutex_destroy(&pmap->pm_lock);
2923 rw_destroy(&pmap->pm_dummy_lock);
2924 kcpuset_destroy(pmap->pm_cpus);
2925 kcpuset_destroy(pmap->pm_kernel_cpus);
2926 #ifdef XENPV
2927 kcpuset_destroy(pmap->pm_xen_ptp_cpus);
2928 #endif
2929 }
2930
2931 /*
2932 * pmap_create: create a pmap object.
2933 */
2934 struct pmap *
2935 pmap_create(void)
2936 {
2937 struct pmap *pmap;
2938 int i;
2939
2940 pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
2941
2942 /* init uvm_object */
2943 for (i = 0; i < PTP_LEVELS - 1; i++) {
2944 uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
2945 uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
2946 pmap->pm_ptphint[i] = NULL;
2947 }
2948 pmap->pm_stats.wired_count = 0;
2949 /* count the PDP allocd below */
2950 pmap->pm_stats.resident_count = PDP_SIZE;
2951 #if !defined(__x86_64__)
2952 pmap->pm_hiexec = 0;
2953 #endif
2954
2955 /* Used by NVMM and Xen */
2956 pmap->pm_enter = NULL;
2957 pmap->pm_extract = NULL;
2958 pmap->pm_remove = NULL;
2959 pmap->pm_sync_pv = NULL;
2960 pmap->pm_pp_remove_ent = NULL;
2961 pmap->pm_write_protect = NULL;
2962 pmap->pm_unwire = NULL;
2963 pmap->pm_tlb_flush = NULL;
2964 pmap->pm_data = NULL;
2965
2966 /* init the LDT */
2967 pmap->pm_ldt = NULL;
2968 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
2969
2970 return pmap;
2971 }
2972
2973 /*
2974 * pmap_check_ptps: verify that none of the pmap's page table objects
2975 * have any pages allocated to them.
2976 */
2977 static void
2978 pmap_check_ptps(struct pmap *pmap)
2979 {
2980 int i;
2981
2982 for (i = 0; i < PTP_LEVELS - 1; i++) {
2983 KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
2984 "pmap %p level %d still has %d pages",
2985 pmap, i, (int)pmap->pm_obj[i].uo_npages);
2986 }
2987 }
2988
/*
 * pmap_check_inuse: panic if any CPU still uses the pmap.
 *
 * => only does anything in DEBUG kernels
 * => on Xen-amd64, also panics if a CPU's ci_kpm_pdir still carries
 *    one of the pmap's non-zero user slots
 */
static void
pmap_check_inuse(struct pmap *pmap)
{
#ifdef DEBUG
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		if (ci->ci_pmap == pmap) {
			panic("destroying pmap being used");
		}
#if defined(XENPV) && defined(__x86_64__)
		for (int slot = 0; slot < PDIR_SLOT_USERLIM; slot++) {
			if (pmap->pm_pdir[slot] == 0 ||
			    ci->ci_kpm_pdir[slot] != pmap->pm_pdir[slot]) {
				continue;
			}
			printf("pmap_destroy(%p) pmap_kernel %p "
			    "curcpu %d cpu %d ci_pmap %p "
			    "ci->ci_kpm_pdir[%d]=%" PRIx64
			    " pmap->pm_pdir[%d]=%" PRIx64 "\n",
			    pmap, pmap_kernel(), curcpu()->ci_index,
			    ci->ci_index, ci->ci_pmap,
			    slot, ci->ci_kpm_pdir[slot],
			    slot, pmap->pm_pdir[slot]);
			panic("%s: used pmap", __func__);
		}
#endif
	}
#endif /* DEBUG */
}
3018
3019 /*
3020 * pmap_destroy: drop reference count on pmap. free pmap if reference
3021 * count goes to zero.
3022 *
3023 * => we can be called from pmap_unmap_ptes() with a different, unrelated
3024 * pmap's lock held. be careful!
3025 */
3026 void
3027 pmap_destroy(struct pmap *pmap)
3028 {
3029 int i;
3030
3031 /*
3032 * drop reference count and verify not in use.
3033 */
3034
3035 if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
3036 return;
3037 }
3038 pmap_check_inuse(pmap);
3039
3040 /*
3041 * handle any deferred frees.
3042 */
3043
3044 mutex_enter(&pmap->pm_lock);
3045 if (pmap->pm_pve != NULL) {
3046 pmap_free_pv(pmap, pmap->pm_pve);
3047 pmap->pm_pve = NULL;
3048 }
3049 pmap_drain_pv(pmap);
3050 mutex_exit(&pmap->pm_lock);
3051 pmap_update(pmap);
3052
3053 /*
3054 * Reference count is zero, free pmap resources and then free pmap.
3055 */
3056
3057 pmap_check_ptps(pmap);
3058 KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
3059
3060 #ifdef USER_LDT
3061 if (pmap->pm_ldt != NULL) {
3062 /*
3063 * No need to switch the LDT; this address space is gone,
3064 * nothing is using it.
3065 *
3066 * No need to lock the pmap for ldt_free (or anything else),
3067 * we're the last one to use it.
3068 */
3069 /* XXXAD can't take cpu_lock here - fix soon. */
3070 mutex_enter(&cpu_lock);
3071 ldt_free(pmap->pm_ldt_sel);
3072 mutex_exit(&cpu_lock);
3073 uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
3074 MAX_USERLDT_SIZE, UVM_KMF_WIRED);
3075 }
3076 #endif
3077
3078 for (i = 0; i < PTP_LEVELS - 1; i++) {
3079 uvm_obj_destroy(&pmap->pm_obj[i], false);
3080 }
3081 kcpuset_zero(pmap->pm_cpus);
3082 kcpuset_zero(pmap->pm_kernel_cpus);
3083 #ifdef XENPV
3084 kcpuset_zero(pmap->pm_xen_ptp_cpus);
3085 #endif
3086
3087 KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
3088 KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
3089 KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
3090
3091 pmap_check_ptps(pmap);
3092 if (__predict_false(pmap->pm_enter != NULL)) {
3093 /* XXX make this a different cache */
3094 pool_cache_destruct_object(&pmap_cache, pmap);
3095 } else {
3096 pool_cache_put(&pmap_cache, pmap);
3097 }
3098 }
3099
3100 /*
3101 * pmap_zap_ptp: clear out an entire PTP without modifying PTEs
3102 *
3103 * => caller must hold pmap's lock
3104 * => PTP must be mapped into KVA
3105 * => must be called with kernel preemption disabled
3106 * => does as little work as possible
3107 */
3108 static void
3109 pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
3110 vaddr_t startva, vaddr_t blkendva)
3111 {
3112 #ifndef XENPV
3113 struct pv_entry *pve;
3114 struct vm_page *pg;
3115 struct pmap_page *pp;
3116 pt_entry_t opte;
3117 rb_tree_t *tree;
3118 vaddr_t va;
3119 int wired;
3120 uint8_t oattrs;
3121 u_int cnt;
3122
3123 KASSERT(mutex_owned(&pmap->pm_lock));
3124 KASSERT(kpreempt_disabled());
3125 KASSERT(pmap != pmap_kernel());
3126 KASSERT(ptp->wire_count > 1);
3127 KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
3128
3129 /*
3130 * Start at the lowest entered VA, and scan until there are no more
3131 * PTEs in the PTPs.
3132 */
3133 tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
3134 pve = RB_TREE_MIN(tree);
3135 wired = 0;
3136 va = (vaddr_t)ptp->uanon;
3137 pte += ((va - startva) >> PAGE_SHIFT);
3138
3139 for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
3140 /*
3141 * No need for an atomic to clear the PTE. Nothing else can
3142 * see the address space any more and speculative access (if
3143 * possible) won't modify. Therefore there's no need to
3144 * track the accessed/dirty bits.
3145 */
3146 opte = *pte;
3147 if (!pmap_valid_entry(opte)) {
3148 continue;
3149 }
3150
3151 /*
3152 * Count the PTE. If it's not for a managed mapping
3153 * there's noting more to do.
3154 */
3155 cnt--;
3156 wired -= (opte & PTE_WIRED);
3157 if ((opte & PTE_PVLIST) == 0) {
3158 #ifndef DOM0OPS
3159 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
3160 "managed page without PTE_PVLIST for %#"
3161 PRIxVADDR, va);
3162 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
3163 "pv-tracked page without PTE_PVLIST for %#"
3164 PRIxVADDR, va);
3165 #endif
3166 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
3167 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
3168 va) == NULL);
3169 continue;
3170 }
3171
3172 /*
3173 * "pve" now points to the lowest (by VA) dynamic PV entry
3174 * in the PTP. If it's for this VA, take advantage of it to
3175 * avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB
3176 * tree by skipping to the next VA in the tree whenever
3177 * there is a match here. The tree will be cleared out in
3178 * one pass before return to pmap_remove_all().
3179 */
3180 oattrs = pmap_pte_to_pp_attrs(opte);
3181 if (pve != NULL && pve->pve_pte.pte_va == va) {
3182 pp = pve->pve_pp;
3183 KASSERT(pve->pve_pte.pte_ptp == ptp);
3184 KASSERT(pp->pp_pte.pte_ptp != ptp ||
3185 pp->pp_pte.pte_va != va);
3186 mutex_spin_enter(&pp->pp_lock);
3187 pp->pp_attrs |= oattrs;
3188 LIST_REMOVE(pve, pve_list);
3189 mutex_spin_exit(&pp->pp_lock);
3190
3191 /*
3192 * pve won't be touched again until pmap_drain_pv(),
3193 * so it's still safe to traverse the tree.
3194 */
3195 pmap_free_pv(pmap, pve);
3196 pve = RB_TREE_NEXT(tree, pve);
3197 continue;
3198 }
3199
3200 /*
3201 * No entry in the tree so it must be embedded. Look up the
3202 * page and cancel the embedded entry.
3203 */
3204 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
3205 pp = VM_PAGE_TO_PP(pg);
3206 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
3207 paddr_t pa = pmap_pte2pa(opte);
3208 panic("%s: PTE_PVLIST with pv-untracked page"
3209 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR
3210 "(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
3211 }
3212 mutex_spin_enter(&pp->pp_lock);
3213 KASSERT(pp->pp_pte.pte_ptp == ptp);
3214 KASSERT(pp->pp_pte.pte_va == va);
3215 pp->pp_attrs |= oattrs;
3216 pp->pp_pte.pte_ptp = NULL;
3217 pp->pp_pte.pte_va = 0;
3218 mutex_spin_exit(&pp->pp_lock);
3219 }
3220
3221 /* PTP now empty - adjust the tree & stats to match. */
3222 pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
3223 ptp->wire_count = 1;
3224 #ifdef DIAGNOSTIC
3225 rb_tree_init(tree, &pmap_rbtree_ops);
3226 #endif
3227 #else /* !XENPV */
3228 /*
3229 * XXXAD For XEN, it's not clear to me that we can do this, because
3230 * I guess the hypervisor keeps track of PTEs too.
3231 */
3232 pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
3233 #endif /* !XENPV */
3234 }
3235
3236 /*
3237 * pmap_remove_all: remove all mappings from pmap in bulk.
3238 *
3239 * Ordinarily when removing mappings it's important to hold the UVM object's
3240 * lock, so that pages do not gain a new identity while retaining stale TLB
3241 * entries (the same lock hold covers both pmap_remove() and pmap_update()).
3242 * Here it's known that the address space is no longer visible to any user
3243 * process, so we don't need to worry about that.
3244 */
3245 bool
3246 pmap_remove_all(struct pmap *pmap)
3247 {
3248 struct vm_page *ptps[32];
3249 vaddr_t va, blkendva;
3250 struct pmap *pmap2;
3251 pt_entry_t *ptes;
3252 pd_entry_t pde __diagused;
3253 pd_entry_t * const *pdes;
3254 int lvl __diagused, i, n;
3255
3256 /* XXX Can't handle EPT just yet. */
3257 if (pmap->pm_remove != NULL) {
3258 return false;
3259 }
3260
3261 for (;;) {
3262 /* Fetch a block of PTPs from tree. */
3263 mutex_enter(&pmap->pm_lock);
3264 n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
3265 (void **)ptps, __arraycount(ptps), false);
3266 if (n == 0) {
3267 mutex_exit(&pmap->pm_lock);
3268 break;
3269 }
3270
3271 /* Remove all mappings in the set of PTPs. */
3272 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3273 for (i = 0; i < n; i++) {
3274 if (ptps[i]->wire_count == 0) {
3275 /* It's dead: pmap_update() will expunge. */
3276 continue;
3277 }
3278
3279 /* Determine range of block. */
3280 va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
3281 blkendva = x86_round_pdr(va + 1);
3282
3283 /* Make sure everything squares up... */
3284 KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
3285 KASSERT(lvl == 1);
3286 KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
3287
3288 /* Zap! */
3289 pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
3290 blkendva);
3291
3292 /* PTP should now be unused - free it. */
3293 KASSERT(ptps[i]->wire_count == 1);
3294 pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
3295 }
3296 pmap_unmap_ptes(pmap, pmap2);
3297 pmap_drain_pv(pmap);
3298 pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
3299 mutex_exit(&pmap->pm_lock);
3300
3301 /* Process deferred frees. */
3302 pmap_update(pmap);
3303
3304 /* A breathing point. */
3305 preempt_point();
3306 }
3307
3308 /* Verify that the pmap is now completely empty. */
3309 pmap_check_ptps(pmap);
3310 KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
3311 "pmap %p not empty", pmap);
3312
3313 return true;
3314 }
3315
#if defined(PMAP_FORK)
/*
 * pmap_fork: perform any necessary data structure manipulation when
 * a VM space is forked.
 *
 * With USER_LDT this gives pmap2 a private copy of pmap1's LDT, if pmap1
 * has one.  Allocation failures are reported with a printf and leave
 * pmap2 without an LDT.  Without USER_LDT the function is empty.
 */
void
pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
{
#ifdef USER_LDT
	union descriptor *new_ldt;
	int sel;

	if (__predict_true(pmap1->pm_ldt == NULL)) {
		return;
	}

	/*
	 * Copy the LDT into the new process.
	 *
	 * Read pmap1's ldt pointer unlocked; if it changes behind our back
	 * we'll retry.  This will starve if there's a stream of LDT changes
	 * in another thread but that should not happen.
	 */

retry:
	if (pmap1->pm_ldt == NULL) {
		/* Wasn't anything there after all. */
		new_ldt = NULL;
		sel = -1;
		mutex_enter(&cpu_lock);
	} else {
		/* Allocate space for the new process's LDT */
		new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
		    MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
		if (new_ldt == NULL) {
			printf("WARNING: %s: unable to allocate LDT space\n",
			    __func__);
			return;
		}
		mutex_enter(&cpu_lock);
		/* Get a GDT slot for it */
		sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
		if (sel == -1) {
			mutex_exit(&cpu_lock);
			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
			printf("WARNING: %s: unable to allocate LDT selector\n",
			    __func__);
			return;
		}
	}

	/*
	 * Now that we have cpu_lock, ensure the LDT status is the same.
	 */
	if (pmap1->pm_ldt == NULL) {
		if (new_ldt != NULL) {
			/* The LDT disappeared, drop what we did. */
			ldt_free(sel);
			mutex_exit(&cpu_lock);
			uvm_km_free(kernel_map, (vaddr_t)new_ldt,
			    MAX_USERLDT_SIZE, UVM_KMF_WIRED);
		} else {
			/* We're good, just leave. */
			mutex_exit(&cpu_lock);
		}
		return;
	}

	if (new_ldt == NULL) {
		/* A wild LDT just appeared. */
		mutex_exit(&cpu_lock);
		goto retry;
	}

	/* Copy the LDT data and install it in pmap2 */
	memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
	pmap2->pm_ldt = new_ldt;
	pmap2->pm_ldt_sel = sel;
	mutex_exit(&cpu_lock);
#endif /* USER_LDT */
}
#endif /* PMAP_FORK */
3399
3400 #ifdef USER_LDT
3401
3402 /*
3403 * pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap
3404 * is active, reload LDTR.
3405 */
3406 static void
3407 pmap_ldt_xcall(void *arg1, void *arg2)
3408 {
3409 struct pmap *pm;
3410
3411 kpreempt_disable();
3412 pm = arg1;
3413 if (curcpu()->ci_pmap == pm) {
3414 #if defined(SVS)
3415 if (svs_enabled) {
3416 svs_ldt_sync(pm);
3417 } else
3418 #endif
3419 lldt(pm->pm_ldt_sel);
3420 }
3421 kpreempt_enable();
3422 }
3423
3424 /*
3425 * pmap_ldt_sync: LDT selector for the named pmap is changing. swap
3426 * in the new selector on all CPUs.
3427 */
3428 void
3429 pmap_ldt_sync(struct pmap *pm)
3430 {
3431 uint64_t where;
3432
3433 KASSERT(mutex_owned(&cpu_lock));
3434
3435 pmap_ldt_evcnt.ev_count++;
3436 where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
3437 xc_wait(where);
3438 }
3439
3440 /*
3441 * pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
3442 * restore the default.
3443 */
3444 void
3445 pmap_ldt_cleanup(struct lwp *l)
3446 {
3447 pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
3448 union descriptor *ldt;
3449 int sel;
3450
3451 if (__predict_true(pmap->pm_ldt == NULL)) {
3452 return;
3453 }
3454
3455 mutex_enter(&cpu_lock);
3456 if (pmap->pm_ldt != NULL) {
3457 sel = pmap->pm_ldt_sel;
3458 ldt = pmap->pm_ldt;
3459 pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
3460 pmap->pm_ldt = NULL;
3461 pmap_ldt_sync(pmap);
3462 ldt_free(sel);
3463 uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
3464 UVM_KMF_WIRED);
3465 }
3466 mutex_exit(&cpu_lock);
3467 }
3468 #endif /* USER_LDT */
3469
3470 /*
3471 * pmap_activate: activate a process' pmap
3472 *
3473 * => must be called with kernel preemption disabled
3474 * => if lwp is the curlwp, then set ci_want_pmapload so that
3475 * actual MMU context switch will be done by pmap_load() later
3476 */
3477 void
3478 pmap_activate(struct lwp *l)
3479 {
3480 struct cpu_info *ci;
3481 struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3482
3483 KASSERT(kpreempt_disabled());
3484
3485 ci = curcpu();
3486
3487 if (l != ci->ci_curlwp)
3488 return;
3489
3490 KASSERT(ci->ci_want_pmapload == 0);
3491 KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
3492
3493 /*
3494 * no need to switch to kernel vmspace because
3495 * it's a subset of any vmspace.
3496 */
3497
3498 if (pmap == pmap_kernel()) {
3499 ci->ci_want_pmapload = 0;
3500 return;
3501 }
3502
3503 ci->ci_want_pmapload = 1;
3504 }
3505
/*
 * KASSERT_PDIRPA: assert that the page directory of "pmap" is the one
 * recorded for this CPU.  Which record is compared depends on the kernel
 * configuration.  The XENPV amd64 and PAE variants read a local variable
 * named "ci".
 */
#if defined(XENPV)
# if defined(__x86_64__)
#  define KASSERT_PDIRPA(pmap) \
	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
	    pmap == pmap_kernel())
# elif defined(PAE)
#  define KASSERT_PDIRPA(pmap) \
	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
# else
#  define KASSERT_PDIRPA(pmap)	KASSERT(true)	/* nothing to do */
# endif
#elif defined(PAE)
# define KASSERT_PDIRPA(pmap) \
	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
#else
# define KASSERT_PDIRPA(pmap) \
	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
#endif
3519
3520 /*
3521 * pmap_reactivate: try to regain reference to the pmap.
3522 *
3523 * => Must be called with kernel preemption disabled.
3524 */
3525 static void
3526 pmap_reactivate(struct pmap *pmap)
3527 {
3528 struct cpu_info * const ci = curcpu();
3529 const cpuid_t cid = cpu_index(ci);
3530
3531 KASSERT(kpreempt_disabled());
3532 KASSERT_PDIRPA(pmap);
3533
3534 /*
3535 * If we still have a lazy reference to this pmap, we can assume
3536 * that there was no TLB shootdown for this pmap in the meantime.
3537 *
3538 * The order of events here is important as we must synchronize
3539 * with TLB shootdown interrupts. Declare interest in invalidations
3540 * (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
3541 * change only when the state is TLBSTATE_LAZY.
3542 */
3543
3544 ci->ci_tlbstate = TLBSTATE_VALID;
3545 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3546
3547 if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
3548 /* We have the reference, state is valid. */
3549 } else {
3550 /*
3551 * Must reload the TLB, pmap has been changed during
3552 * deactivated.
3553 */
3554 kcpuset_atomic_set(pmap->pm_cpus, cid);
3555
3556 tlbflush();
3557 }
3558 }
3559
3560 /*
3561 * pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
3562 * and relevant LDT info.
3563 *
3564 * Ensures that the current process' pmap is loaded on the current CPU's
3565 * MMU and that there are no stale TLB entries.
3566 *
3567 * => The caller should disable kernel preemption or do check-and-retry
3568 * to prevent a preemption from undoing our efforts.
3569 * => This function may block.
3570 */
3571 void
3572 pmap_load(void)
3573 {
3574 struct cpu_info *ci;
3575 struct pmap *pmap, *oldpmap;
3576 struct lwp *l;
3577 uint64_t pctr;
3578 int ilevel __diagused;
3579 u_long psl __diagused;
3580
3581 kpreempt_disable();
3582 retry:
3583 ci = curcpu();
3584 if (!ci->ci_want_pmapload) {
3585 kpreempt_enable();
3586 return;
3587 }
3588 l = ci->ci_curlwp;
3589 pctr = lwp_pctr();
3590 __insn_barrier();
3591
3592 /* should be able to take ipis. */
3593 KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel);
3594 #ifdef XENPV
3595 /* Check to see if interrupts are enabled (ie; no events are masked) */
3596 KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl);
3597 #else
3598 KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl);
3599 #endif
3600
3601 KASSERT(l != NULL);
3602 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3603 KASSERT(pmap != pmap_kernel());
3604 oldpmap = ci->ci_pmap;
3605
3606 if (pmap == oldpmap) {
3607 pmap_reactivate(pmap);
3608 ci->ci_want_pmapload = 0;
3609 kpreempt_enable();
3610 return;
3611 }
3612
3613 /*
3614 * Acquire a reference to the new pmap and perform the switch.
3615 */
3616
3617 pmap_reference(pmap);
3618 pmap_load1(l, pmap, oldpmap);
3619 ci->ci_want_pmapload = 0;
3620
3621 /*
3622 * we're now running with the new pmap. drop the reference
3623 * to the old pmap. if we block, we need to go around again.
3624 */
3625
3626 pmap_destroy(oldpmap);
3627 __insn_barrier();
3628 if (lwp_pctr() != pctr) {
3629 goto retry;
3630 }
3631
3632 kpreempt_enable();
3633 }
3634
3635 /*
3636 * pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
3637 * pmap_load(). It's critically important that this function does not
3638 * block.
3639 */
3640 static void
3641 pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
3642 {
3643 struct cpu_info *ci;
3644 struct pcb *pcb;
3645 cpuid_t cid;
3646
3647 KASSERT(kpreempt_disabled());
3648
3649 pcb = lwp_getpcb(l);
3650 ci = l->l_cpu;
3651 cid = cpu_index(ci);
3652
3653 kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3654 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3655
3656 KASSERT_PDIRPA(oldpmap);
3657 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3658 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3659
3660 /*
3661 * Mark the pmap in use by this CPU. Again, we must synchronize
3662 * with TLB shootdown interrupts, so set the state VALID first,
3663 * then register us for shootdown events on this pmap.
3664 */
3665 ci->ci_tlbstate = TLBSTATE_VALID;
3666 kcpuset_atomic_set(pmap->pm_cpus, cid);
3667 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3668 ci->ci_pmap = pmap;
3669
3670 /*
3671 * update tss. now that we have registered for invalidations
3672 * from other CPUs, we're good to load the page tables.
3673 */
3674 #ifdef PAE
3675 pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
3676 #else
3677 pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
3678 #endif
3679
3680 #ifdef i386
3681 #ifndef XENPV
3682 ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
3683 ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
3684 #endif
3685 #endif
3686
3687 #if defined(SVS) && defined(USER_LDT)
3688 if (svs_enabled) {
3689 svs_ldt_sync(pmap);
3690 } else
3691 #endif
3692 lldt(pmap->pm_ldt_sel);
3693
3694 cpu_load_pmap(pmap, oldpmap);
3695 }
3696
3697 /*
3698 * pmap_deactivate: deactivate a process' pmap.
3699 *
3700 * => Must be called with kernel preemption disabled (high IPL is enough).
3701 */
3702 void
3703 pmap_deactivate(struct lwp *l)
3704 {
3705 struct pmap *pmap;
3706 struct cpu_info *ci;
3707
3708 KASSERT(kpreempt_disabled());
3709
3710 if (l != curlwp) {
3711 return;
3712 }
3713
3714 /*
3715 * Wait for pending TLB shootdowns to complete. Necessary because
3716 * TLB shootdown state is per-CPU, and the LWP may be coming off
3717 * the CPU before it has a chance to call pmap_update(), e.g. due
3718 * to kernel preemption or blocking routine in between.
3719 */
3720 pmap_tlb_shootnow();
3721
3722 ci = curcpu();
3723
3724 if (ci->ci_want_pmapload) {
3725 /*
3726 * ci_want_pmapload means that our pmap is not loaded on
3727 * the CPU or TLB might be stale. note that pmap_kernel()
3728 * is always considered loaded.
3729 */
3730 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3731 != pmap_kernel());
3732 KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
3733 != ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
3734
3735 /*
3736 * userspace has not been touched.
3737 * nothing to do here.
3738 */
3739
3740 ci->ci_want_pmapload = 0;
3741 return;
3742 }
3743
3744 pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
3745
3746 if (pmap == pmap_kernel()) {
3747 return;
3748 }
3749
3750 KASSERT_PDIRPA(pmap);
3751 KASSERT(ci->ci_pmap == pmap);
3752
3753 /*
3754 * we aren't interested in TLB invalidations for this pmap,
3755 * at least for the time being.
3756 */
3757
3758 KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
3759 ci->ci_tlbstate = TLBSTATE_LAZY;
3760 }
3761
3762 #ifdef EFI_RUNTIME
3763
3764 extern struct pmap *efi_runtime_pmap;
3765
3766 /*
3767 * pmap_is_user: true if pmap, which must not be the kernel pmap, is
3768 * for an unprivileged user process
3769 */
3770 bool
3771 pmap_is_user(struct pmap *pmap)
3772 {
3773
3774 KASSERT(pmap != pmap_kernel());
3775 return (pmap != efi_runtime_pmap);
3776 }
3777
3778 /*
3779 * pmap_activate_sync: synchronously activate specified pmap.
3780 *
3781 * => Must be called with kernel preemption disabled (high IPL is enough).
3782 * => Must not sleep before pmap_deactivate_sync.
3783 */
3784 void *
3785 pmap_activate_sync(struct pmap *pmap)
3786 {
3787 struct cpu_info *ci = curcpu();
3788 struct pmap *oldpmap = ci->ci_pmap;
3789 unsigned cid = cpu_index(ci);
3790
3791 KASSERT(kpreempt_disabled());
3792 KASSERT(pmap != pmap_kernel());
3793
3794 KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
3795 KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
3796
3797 if (oldpmap) {
3798 KASSERT_PDIRPA(oldpmap);
3799 kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
3800 kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
3801 }
3802
3803 ci->ci_tlbstate = TLBSTATE_VALID;
3804 kcpuset_atomic_set(pmap->pm_cpus, cid);
3805 kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
3806 ci->ci_pmap = pmap;
3807
3808 #if defined(SVS) && defined(USER_LDT)
3809 if (svs_enabled) {
3810 svs_ldt_sync(pmap);
3811 } else
3812 #endif
3813 lldt(pmap->pm_ldt_sel);
3814
3815 cpu_load_pmap(pmap, oldpmap);
3816
3817 return oldpmap;
3818 }
3819
3820 /*
3821 * pmap_deactivate_sync: synchronously deactivate specified pmap and
3822 * restore whatever was active before pmap_activate_sync.
3823 *
3824 * => Must be called with kernel preemption disabled (high IPL is enough).
3825 * => Must not have slept since pmap_activate_sync.
3826 */
3827 void
3828 pmap_deactivate_sync(struct pmap *pmap, void *cookie)
3829 {
3830 struct cpu_info *ci = curcpu();
3831 struct pmap *oldpmap = cookie;
3832 unsigned cid = cpu_index(ci);
3833
3834 KASSERT(kpreempt_disabled());
3835 KASSERT(pmap != pmap_kernel());
3836 KASSERT(ci->ci_pmap == pmap);
3837
3838 KASSERT_PDIRPA(pmap);
3839
3840 KASSERT(kcpuset_isset(pmap->pm_cpus, cid));
3841 KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
3842
3843 pmap_tlb_shootnow();
3844
3845 kcpuset_atomic_clear(pmap->pm_cpus, cid);
3846 kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid);
3847
3848 ci->ci_tlbstate = TLBSTATE_VALID;
3849 ci->ci_pmap = oldpmap;
3850 if (oldpmap) {
3851 kcpuset_atomic_set(oldpmap->pm_cpus, cid);
3852 kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid);
3853 #if defined(SVS) && defined(USER_LDT)
3854 if (svs_enabled) {
3855 svs_ldt_sync(oldpmap);
3856 } else
3857 #endif
3858 lldt(oldpmap->pm_ldt_sel);
3859 cpu_load_pmap(oldpmap, pmap);
3860 } else {
3861 lcr3(pmap_pdirpa(pmap_kernel(), 0));
3862 }
3863 }
3864
3865 #endif /* EFI_RUNTIME */
3866
3867 /*
3868 * some misc. functions
3869 */
3870
3871 bool
3872 pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
3873 int *lastlvl)
3874 {
3875 unsigned long index;
3876 pd_entry_t pde;
3877 int i;
3878
3879 for (i = PTP_LEVELS; i > 1; i--) {
3880 index = pl_i(va, i);
3881 pde = pdes[i - 2][index];
3882 if ((pde & PTE_P) == 0) {
3883 *lastlvl = i;
3884 return false;
3885 }
3886 if (pde & PTE_PS)
3887 break;
3888 }
3889 if (lastpde != NULL)
3890 *lastpde = pde;
3891 *lastlvl = i;
3892 return true;
3893 }
3894
3895 /*
3896 * pmap_extract: extract a PA for the given VA
3897 */
3898 bool
3899 pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
3900 {
3901 pt_entry_t *ptes, pte;
3902 pd_entry_t pde;
3903 pd_entry_t * const *pdes;
3904 struct pmap *pmap2;
3905 paddr_t pa;
3906 bool rv;
3907 int lvl;
3908
3909 if (__predict_false(pmap->pm_extract != NULL)) {
3910 return (*pmap->pm_extract)(pmap, va, pap);
3911 }
3912
3913 #ifdef __HAVE_DIRECT_MAP
3914 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
3915 if (pap != NULL) {
3916 *pap = PMAP_DIRECT_UNMAP(va);
3917 }
3918 return true;
3919 }
3920 #endif
3921
3922 rv = false;
3923 pa = 0;
3924
3925 if (pmap != pmap_kernel()) {
3926 mutex_enter(&pmap->pm_lock);
3927 }
3928 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
3929 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
3930 if (lvl == 2) {
3931 pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
3932 rv = true;
3933 } else {
3934 KASSERT(lvl == 1);
3935 pte = ptes[pl1_i(va)];
3936 if (__predict_true((pte & PTE_P) != 0)) {
3937 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
3938 rv = true;
3939 }
3940 }
3941 }
3942 pmap_unmap_ptes(pmap, pmap2);
3943 if (pmap != pmap_kernel()) {
3944 mutex_exit(&pmap->pm_lock);
3945 }
3946 if (pap != NULL) {
3947 *pap = pa;
3948 }
3949
3950 return rv;
3951 }
3952
3953 /*
3954 * vtophys: virtual address to physical address. For use by
3955 * machine-dependent code only.
3956 */
3957 paddr_t
3958 vtophys(vaddr_t va)
3959 {
3960 paddr_t pa;
3961
3962 if (pmap_extract(pmap_kernel(), va, &pa) == true)
3963 return pa;
3964 return 0;
3965 }
3966
/* pmap_extract_ma is a weak alias for pmap_extract. */
__strict_weak_alias(pmap_extract_ma, pmap_extract);

#ifdef XENPV
/*
 * vtomach: virtual address to machine address.  For use by
 * machine-dependent code only.
 *
 * => looks va up in the kernel pmap with pmap_extract_ma(); returns 0
 *    if it is not mapped
 */
paddr_t
vtomach(vaddr_t va)
{
	paddr_t ma;

	return pmap_extract_ma(pmap_kernel(), va, &ma) ? ma : 0;
}
#endif
3984
3985 /*
3986 * pmap_virtual_space: used during bootup [pmap_steal_memory] to
3987 * determine the bounds of the kernel virtual address space.
3988 */
3989 void
3990 pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
3991 {
3992 *startp = virtual_avail;
3993 *endp = virtual_end;
3994 }
3995
3996 void
3997 pmap_zero_page(paddr_t pa)
3998 {
3999 #if defined(__HAVE_DIRECT_MAP)
4000 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
4001 #else
4002 #if defined(XENPV)
4003 if (XEN_VERSION_SUPPORTED(3, 4)) {
4004 xen_pagezero(pa);
4005 return;
4006 }
4007 #endif
4008 struct cpu_info *ci;
4009 pt_entry_t *zpte;
4010 vaddr_t zerova;
4011
4012 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
4013
4014 kpreempt_disable();
4015
4016 ci = curcpu();
4017 zerova = ci->vpage[VPAGE_ZER];
4018 zpte = ci->vpage_pte[VPAGE_ZER];
4019
4020 KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
4021
4022 pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
4023 pmap_pte_flush();
4024 pmap_update_pg(zerova); /* flush TLB */
4025
4026 memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
4027
4028 #if defined(DIAGNOSTIC) || defined(XENPV)
4029 pmap_pte_set(zpte, 0); /* zap ! */
4030 pmap_pte_flush();
4031 #endif
4032
4033 kpreempt_enable();
4034 #endif /* defined(__HAVE_DIRECT_MAP) */
4035 }
4036
4037 void
4038 pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
4039 {
4040 #if defined(__HAVE_DIRECT_MAP)
4041 vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
4042 vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
4043
4044 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4045 #else
4046 #if defined(XENPV)
4047 if (XEN_VERSION_SUPPORTED(3, 4)) {
4048 xen_copy_page(srcpa, dstpa);
4049 return;
4050 }
4051 #endif
4052 struct cpu_info *ci;
4053 pt_entry_t *srcpte, *dstpte;
4054 vaddr_t srcva, dstva;
4055
4056 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
4057
4058 kpreempt_disable();
4059
4060 ci = curcpu();
4061 srcva = ci->vpage[VPAGE_SRC];
4062 dstva = ci->vpage[VPAGE_DST];
4063 srcpte = ci->vpage_pte[VPAGE_SRC];
4064 dstpte = ci->vpage_pte[VPAGE_DST];
4065
4066 KASSERT(*srcpte == 0 && *dstpte == 0);
4067
4068 pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
4069 pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
4070 pmap_pte_flush();
4071 pmap_update_pg(srcva);
4072 pmap_update_pg(dstva);
4073
4074 memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
4075
4076 #if defined(DIAGNOSTIC) || defined(XENPV)
4077 pmap_pte_set(srcpte, 0);
4078 pmap_pte_set(dstpte, 0);
4079 pmap_pte_flush();
4080 #endif
4081
4082 kpreempt_enable();
4083 #endif /* defined(__HAVE_DIRECT_MAP) */
4084 }
4085
4086 static pt_entry_t *
4087 pmap_map_ptp(struct vm_page *ptp)
4088 {
4089 #ifdef __HAVE_DIRECT_MAP
4090 return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
4091 #else
4092 struct cpu_info *ci;
4093 pt_entry_t *ptppte;
4094 vaddr_t ptpva;
4095
4096 KASSERT(kpreempt_disabled());
4097
4098 #ifndef XENPV
4099 const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
4100 #else
4101 const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
4102 #endif
4103
4104 ci = curcpu();
4105 ptpva = ci->vpage[VPAGE_PTP];
4106 ptppte = ci->vpage_pte[VPAGE_PTP];
4107
4108 pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
4109
4110 pmap_pte_flush();
4111 pmap_update_pg(ptpva);
4112
4113 return (pt_entry_t *)ptpva;
4114 #endif
4115 }
4116
/*
 * pmap_unmap_ptp: undo pmap_map_ptp.
 *
 * => only does work on kernels without a direct map that are built with
 *    DIAGNOSTIC or XENPV; there it clears this CPU's VPAGE_PTP slot if
 *    it is in use
 * => in that case kernel preemption must be disabled
 */
static void
pmap_unmap_ptp(void)
{
#if !defined(__HAVE_DIRECT_MAP) && (defined(DIAGNOSTIC) || defined(XENPV))
	struct cpu_info *ci;
	pt_entry_t *slot;

	KASSERT(kpreempt_disabled());

	ci = curcpu();
	slot = ci->vpage_pte[VPAGE_PTP];

	if (*slot == 0) {
		return;
	}
	pmap_pte_set(slot, 0);
	pmap_pte_flush();
#endif
}
4137
4138 static pt_entry_t *
4139 pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
4140 {
4141
4142 KASSERT(kpreempt_disabled());
4143 if (pmap_is_curpmap(pmap)) {
4144 return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
4145 }
4146 KASSERT(ptp != NULL);
4147 return pmap_map_ptp(ptp) + pl1_pi(va);
4148 }
4149
/*
 * pmap_unmap_pte: counterpart of pmap_map_pte; defers to
 * pmap_unmap_ptp().
 *
 * => must be called with kernel preemption disabled
 */
static void
pmap_unmap_pte(void)
{
	KASSERT(kpreempt_disabled());
	pmap_unmap_ptp();
}
4158
4159 /*
4160 * p m a p r e m o v e f u n c t i o n s
4161 *
4162 * functions that remove mappings
4163 */
4164
4165 /*
4166 * pmap_remove_ptes: remove PTEs from a PTP
4167 *
4168 * => caller must hold pmap's lock
4169 * => PTP must be mapped into KVA
4170 * => PTP should be null if pmap == pmap_kernel()
4171 * => must be called with kernel preemption disabled
4172 * => returns composite pte if at least one page should be shot down
4173 */
4174 static void
4175 pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
4176 vaddr_t startva, vaddr_t endva)
4177 {
4178 pt_entry_t *pte = (pt_entry_t *)ptpva;
4179
4180 KASSERT(mutex_owned(&pmap->pm_lock));
4181 KASSERT(kpreempt_disabled());
4182
4183 /*
4184 * mappings are very often sparse, so clip the given range to the
4185 * range of PTEs that are known present in the PTP.
4186 */
4187 pmap_ptp_range_clip(ptp, &startva, &pte);
4188
4189 /*
4190 * note that ptpva points to the PTE that maps startva. this may
4191 * or may not be the first PTE in the PTP.
4192 *
4193 * we loop through the PTP while there are still PTEs to look at
4194 * and the wire_count is greater than 1 (because we use the wire_count
4195 * to keep track of the number of real PTEs in the PTP).
4196 */
4197 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
4198 (void)pmap_remove_pte(pmap, ptp, pte, startva);
4199 startva += PAGE_SIZE;
4200 pte++;
4201 }
4202 }
4203
4204 /*
4205 * pmap_remove_pte: remove a single PTE from a PTP.
4206 *
4207 * => caller must hold pmap's lock
4208 * => PTP must be mapped into KVA
4209 * => PTP should be null if pmap == pmap_kernel()
4210 * => returns true if we removed a mapping
4211 * => must be called with kernel preemption disabled
4212 */
4213 static bool
4214 pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
4215 vaddr_t va)
4216 {
4217 struct pv_entry *pve;
4218 struct vm_page *pg;
4219 struct pmap_page *pp;
4220 pt_entry_t opte;
4221
4222 KASSERT(mutex_owned(&pmap->pm_lock));
4223 KASSERT(kpreempt_disabled());
4224
4225 if (!pmap_valid_entry(*pte)) {
4226 /* VA not mapped. */
4227 return false;
4228 }
4229
4230 /* Atomically save the old PTE and zap it. */
4231 opte = pmap_pte_testset(pte, 0);
4232 if (!pmap_valid_entry(opte)) {
4233 return false;
4234 }
4235
4236 pmap_exec_account(pmap, va, opte, 0);
4237 pmap_stats_update_bypte(pmap, 0, opte);
4238
4239 if (ptp) {
4240 /*
4241 * Dropping a PTE. Make sure that the PDE is flushed.
4242 */
4243 ptp->wire_count--;
4244 if (ptp->wire_count <= 1) {
4245 opte |= PTE_A;
4246 }
4247 }
4248
4249 if ((opte & PTE_A) != 0) {
4250 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
4251 }
4252
4253 /*
4254 * If we are not on a pv list - we are done.
4255 */
4256 if ((opte & PTE_PVLIST) == 0) {
4257 #ifndef DOM0OPS
4258 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
4259 "managed page without PTE_PVLIST for %#"PRIxVADDR, va);
4260 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
4261 "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
4262 #endif
4263 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
4264 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
4265 return true;
4266 }
4267
4268 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
4269 pp = VM_PAGE_TO_PP(pg);
4270 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
4271 paddr_t pa = pmap_pte2pa(opte);
4272 panic("%s: PTE_PVLIST with pv-untracked page"
4273 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
4274 __func__, va, pa, atop(pa));
4275 }
4276
4277 /* Sync R/M bits. */
4278 pve = pmap_lookup_pv(pmap, ptp, pp, va);
4279 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
4280 return true;
4281 }
4282
4283 static void
4284 pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4285 {
4286 pt_entry_t *ptes;
4287 pd_entry_t pde;
4288 pd_entry_t * const *pdes;
4289 bool result;
4290 vaddr_t blkendva, va = sva;
4291 struct vm_page *ptp;
4292 struct pmap *pmap2;
4293 int lvl;
4294
4295 KASSERT(mutex_owned(&pmap->pm_lock));
4296
4297 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4298
4299 /*
4300 * removing one page? take shortcut function.
4301 */
4302
4303 if (va + PAGE_SIZE == eva) {
4304 if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4305 KASSERT(lvl == 1);
4306
4307 /* Get PTP if non-kernel mapping. */
4308 if (pmap != pmap_kernel()) {
4309 ptp = pmap_find_ptp(pmap, va, 1);
4310 KASSERTMSG(ptp != NULL,
4311 "%s: unmanaged PTP detected", __func__);
4312 } else {
4313 /* Never free kernel PTPs. */
4314 ptp = NULL;
4315 }
4316
4317 result = pmap_remove_pte(pmap, ptp,
4318 &ptes[pl1_i(va)], va);
4319
4320 /*
4321 * if mapping removed and the PTP is no longer
4322 * being used, free it!
4323 */
4324
4325 if (result && ptp && ptp->wire_count <= 1)
4326 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4327 }
4328 } else for (/* null */ ; va < eva ; va = blkendva) {
4329 /* determine range of block */
4330 blkendva = x86_round_pdr(va+1);
4331 if (blkendva > eva)
4332 blkendva = eva;
4333
4334 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
4335 /* Skip a range corresponding to an invalid pde. */
4336 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
4337 continue;
4338 }
4339 KASSERT(lvl == 1);
4340
4341 /* Get PTP if non-kernel mapping. */
4342 if (pmap != pmap_kernel()) {
4343 ptp = pmap_find_ptp(pmap, va, 1);
4344 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
4345 __func__);
4346 } else {
4347 /* Never free kernel PTPs. */
4348 ptp = NULL;
4349 }
4350
4351 pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
4352 blkendva);
4353
4354 /* If PTP is no longer being used, free it. */
4355 if (ptp && ptp->wire_count <= 1) {
4356 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4357 }
4358 }
4359 pmap_unmap_ptes(pmap, pmap2);
4360 pmap_drain_pv(pmap);
4361 }
4362
4363 /*
4364 * pmap_remove: mapping removal function.
4365 *
4366 * => caller should not be holding any pmap locks
4367 */
4368 void
4369 pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
4370 {
4371 if (__predict_false(pmap->pm_remove != NULL)) {
4372 (*pmap->pm_remove)(pmap, sva, eva);
4373 return;
4374 }
4375
4376 mutex_enter(&pmap->pm_lock);
4377 pmap_remove_locked(pmap, sva, eva);
4378 mutex_exit(&pmap->pm_lock);
4379 }
4380
4381 /*
4382 * pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
4383 *
4384 * => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
4385 * => Caller should disable kernel preemption.
4386 * => issues tlb shootdowns if necessary.
4387 */
4388 static int
4389 pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
4390 pt_entry_t *optep)
4391 {
4392 struct pmap *pmap;
4393 struct vm_page *ptp;
4394 vaddr_t va;
4395 pt_entry_t *ptep;
4396 pt_entry_t opte;
4397 pt_entry_t npte;
4398 pt_entry_t expect;
4399 bool need_shootdown;
4400
4401 ptp = pvpte->pte_ptp;
4402 va = pvpte->pte_va;
4403 KASSERT(ptp == NULL || ptp->uobject != NULL);
4404 KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
4405 pmap = ptp_to_pmap(ptp);
4406 KASSERT(kpreempt_disabled());
4407
4408 if (__predict_false(pmap->pm_sync_pv != NULL)) {
4409 return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
4410 optep);
4411 }
4412
4413 expect = pmap_pa2pte(pa) | PTE_P;
4414
4415 if (clearbits != ~0) {
4416 KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
4417 clearbits = pmap_pp_attrs_to_pte(clearbits);
4418 }
4419
4420 ptep = pmap_map_pte(pmap, ptp, va);
4421 do {
4422 opte = *ptep;
4423 KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
4424 KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
4425 KASSERT(opte == 0 || (opte & PTE_P) != 0);
4426 if ((opte & (PTE_FRAME | PTE_P)) != expect) {
4427 /*
4428 * We lost a race with a V->P operation like
4429 * pmap_remove(). Wait for the competitor
4430 * reflecting pte bits into mp_attrs.
4431 */
4432 pmap_unmap_pte();
4433 return EAGAIN;
4434 }
4435
4436 /*
4437 * Check if there's anything to do on this PTE.
4438 */
4439 if ((opte & clearbits) == 0) {
4440 need_shootdown = false;
4441 break;
4442 }
4443
4444 /*
4445 * We need a shootdown if the PTE is cached (PTE_A) ...
4446 * ... Unless we are clearing only the PTE_W bit and
4447 * it isn't cached as RW (PTE_D).
4448 */
4449 need_shootdown = (opte & PTE_A) != 0 &&
4450 !(clearbits == PTE_W && (opte & PTE_D) == 0);
4451
4452 npte = opte & ~clearbits;
4453
4454 /*
4455 * If we need a shootdown anyway, clear PTE_A and PTE_D.
4456 */
4457 if (need_shootdown) {
4458 npte &= ~(PTE_A | PTE_D);
4459 }
4460 KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
4461 KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
4462 KASSERT(npte == 0 || (opte & PTE_P) != 0);
4463 } while (pmap_pte_cas(ptep, opte, npte) != opte);
4464
4465 if (need_shootdown) {
4466 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
4467 }
4468 pmap_unmap_pte();
4469
4470 *oattrs = pmap_pte_to_pp_attrs(opte);
4471 if (optep != NULL)
4472 *optep = opte;
4473 return 0;
4474 }
4475
4476 static void
4477 pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
4478 vaddr_t va)
4479 {
4480 struct pmap *pmap2;
4481 pt_entry_t *ptes;
4482 pd_entry_t * const *pdes;
4483
4484 KASSERT(mutex_owned(&pmap->pm_lock));
4485
4486 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4487 pmap_stats_update_bypte(pmap, 0, opte);
4488 ptp->wire_count--;
4489 if (ptp->wire_count <= 1) {
4490 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
4491 }
4492 pmap_unmap_ptes(pmap, pmap2);
4493 }
4494
4495 static void
4496 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
4497 {
4498 struct pv_pte *pvpte;
4499 struct vm_page *ptp;
4500 uintptr_t sum;
4501 uint8_t oattrs;
4502 bool locked;
4503
4504 /*
4505 * Do an unlocked check to see if the page has no mappings, eg when
4506 * pmap_remove_all() was called before amap_wipeout() for a process
4507 * private amap - common. The page being removed must be on the way
4508 * out, so we don't have to worry about concurrent attempts to enter
4509 * it (otherwise the caller either doesn't care or has screwed up).
4510 */
4511 sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
4512 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
4513 sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
4514 if (sum == 0) {
4515 return;
4516 }
4517
4518 kpreempt_disable();
4519 for (;;) {
4520 struct pmap *pmap;
4521 struct pv_entry *pve;
4522 pt_entry_t opte;
4523 vaddr_t va;
4524
4525 mutex_spin_enter(&pp->pp_lock);
4526 if ((pvpte = pv_pte_first(pp)) == NULL) {
4527 mutex_spin_exit(&pp->pp_lock);
4528 break;
4529 }
4530
4531 /*
4532 * Add a reference to the pmap before clearing the pte.
4533 * Otherwise the pmap can disappear behind us.
4534 */
4535 ptp = pvpte->pte_ptp;
4536 pmap = ptp_to_pmap(ptp);
4537 KASSERT(pmap->pm_obj[0].uo_refs > 0);
4538 if (ptp != NULL) {
4539 pmap_reference(pmap);
4540 }
4541
4542 /*
4543 * Now try to lock it. We need a direct handoff between
4544 * pp_lock and pm_lock to know the pv_entry is kept intact
4545 * and kept associated with this pmap. If that can't be
4546 * had, wait for the pmap's lock to become free and then
4547 * retry.
4548 */
4549 locked = mutex_tryenter(&pmap->pm_lock);
4550 mutex_spin_exit(&pp->pp_lock);
4551 if (!locked) {
4552 mutex_enter(&pmap->pm_lock);
4553 /* nothing, just wait for it */
4554 mutex_exit(&pmap->pm_lock);
4555 if (ptp != NULL) {
4556 pmap_destroy(pmap);
4557 }
4558 continue;
4559 }
4560 va = pvpte->pte_va;
4561
4562 KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
4563 "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4564 KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
4565 "va %lx pmap %p ptp %p is free", va, pmap, ptp);
4566 KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
4567 "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
4568
4569 #ifdef DEBUG
4570 pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
4571 rb_tree_t *tree = (ptp != NULL ?
4572 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
4573 pve = pmap_treelookup_pv(pmap, ptp, tree, va);
4574 if (pve == NULL) {
4575 KASSERTMSG(&pp->pp_pte == pvpte,
4576 "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
4577 va, pmap, ptp, pvpte, pve);
4578 } else {
4579 KASSERTMSG(&pve->pve_pte == pvpte,
4580 "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
4581 va, pmap, ptp, pvpte, pve);
4582 }
4583 #endif
4584
4585 if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
4586 panic("pmap_pp_remove: mapping not present");
4587 }
4588
4589 pve = pmap_lookup_pv(pmap, ptp, pp, va);
4590 pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
4591
4592 /* Update the PTP reference count. Free if last reference. */
4593 if (ptp != NULL) {
4594 KASSERT(pmap != pmap_kernel());
4595 pmap_tlb_shootnow();
4596 if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
4597 (*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
4598 } else {
4599 pmap_pp_remove_ent(pmap, ptp, opte, va);
4600 }
4601 } else {
4602 KASSERT(pmap == pmap_kernel());
4603 pmap_stats_update_bypte(pmap, 0, opte);
4604 }
4605 pmap_tlb_shootnow();
4606 pmap_drain_pv(pmap);
4607 mutex_exit(&pmap->pm_lock);
4608 if (ptp != NULL) {
4609 pmap_destroy(pmap);
4610 }
4611 }
4612 kpreempt_enable();
4613 }
4614
/*
 * pmap_page_remove: remove a managed vm_page from all pmaps that map it
 *
 * => R/M bits are sync'd back to attrs
 * => thin wrapper: looks up the page's pmap_page and physical address
 *    and hands both to pmap_pp_remove()
 */
void
pmap_page_remove(struct vm_page *pg)
{
	pmap_pp_remove(VM_PAGE_TO_PP(pg), VM_PAGE_TO_PHYS(pg));
}
4630
4631 /*
4632 * pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
4633 * that map it
4634 */
4635 void
4636 pmap_pv_remove(paddr_t pa)
4637 {
4638 struct pmap_page *pp;
4639
4640 pp = pmap_pv_tracked(pa);
4641 if (pp == NULL)
4642 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4643 pmap_pp_remove(pp, pa);
4644 }
4645
/*
 * pmap attribute functions
 *
 * Functions that test or change a managed page's attributes.  Since a
 * page can be mapped multiple times, we must check each PTE that maps it
 * by going down the pv lists.
 */
4652
4653 /*
4654 * pmap_test_attrs: test a page's attributes
4655 */
4656 bool
4657 pmap_test_attrs(struct vm_page *pg, unsigned testbits)
4658 {
4659 struct pmap_page *pp;
4660 struct pv_pte *pvpte;
4661 struct pmap *pmap;
4662 uint8_t oattrs;
4663 u_int result;
4664 paddr_t pa;
4665
4666 pp = VM_PAGE_TO_PP(pg);
4667 if ((pp->pp_attrs & testbits) != 0) {
4668 return true;
4669 }
4670 pa = VM_PAGE_TO_PHYS(pg);
4671 startover:
4672 mutex_spin_enter(&pp->pp_lock);
4673 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4674 if ((pp->pp_attrs & testbits) != 0) {
4675 break;
4676 }
4677 if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
4678 /*
4679 * raced with a V->P operation. wait for the other
4680 * side to finish by acquiring pmap's lock. if no
4681 * wait, updates to pp_attrs by the other side may
4682 * go unseen.
4683 */
4684 pmap = ptp_to_pmap(pvpte->pte_ptp);
4685 pmap_reference(pmap);
4686 mutex_spin_exit(&pp->pp_lock);
4687 mutex_enter(&pmap->pm_lock);
4688 /* nothing. */
4689 mutex_exit(&pmap->pm_lock);
4690 pmap_destroy(pmap);
4691 goto startover;
4692 }
4693 pp->pp_attrs |= oattrs;
4694 }
4695 result = pp->pp_attrs & testbits;
4696 mutex_spin_exit(&pp->pp_lock);
4697
4698 /*
4699 * note that we will exit the for loop with a non-null pve if
4700 * we have found the bits we are testing for.
4701 */
4702
4703 return result != 0;
4704 }
4705
4706 static bool
4707 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
4708 {
4709 struct pv_pte *pvpte;
4710 struct pmap *pmap;
4711 uint8_t oattrs;
4712 u_int result;
4713
4714 startover:
4715 mutex_spin_enter(&pp->pp_lock);
4716 for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
4717 if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
4718 /*
4719 * raced with a V->P operation. wait for the other
4720 * side to finish by acquiring pmap's lock. it is
4721 * probably unmapping the page, and it will be gone
4722 * when the loop is restarted.
4723 */
4724 pmap = ptp_to_pmap(pvpte->pte_ptp);
4725 pmap_reference(pmap);
4726 mutex_spin_exit(&pp->pp_lock);
4727 mutex_enter(&pmap->pm_lock);
4728 /* nothing. */
4729 mutex_exit(&pmap->pm_lock);
4730 pmap_destroy(pmap);
4731 goto startover;
4732 }
4733 pp->pp_attrs |= oattrs;
4734 }
4735 result = pp->pp_attrs & clearbits;
4736 pp->pp_attrs &= ~clearbits;
4737 pmap_tlb_shootnow();
4738 mutex_spin_exit(&pp->pp_lock);
4739
4740 return result != 0;
4741 }
4742
4743 /*
4744 * pmap_clear_attrs: clear the specified attribute for a page.
4745 *
4746 * => we return true if we cleared one of the bits we were asked to
4747 */
4748 bool
4749 pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
4750 {
4751 struct pmap_page *pp;
4752 paddr_t pa;
4753
4754 pp = VM_PAGE_TO_PP(pg);
4755 pa = VM_PAGE_TO_PHYS(pg);
4756
4757 /*
4758 * If this is a new page, assert it has no mappings and simply zap
4759 * the stored attributes without taking any locks.
4760 */
4761 if ((pg->flags & PG_FAKE) != 0) {
4762 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
4763 KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
4764 KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
4765 atomic_store_relaxed(&pp->pp_attrs, 0);
4766 return false;
4767 } else {
4768 return pmap_pp_clear_attrs(pp, pa, clearbits);
4769 }
4770 }
4771
4772 /*
4773 * pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
4774 * pv-tracked page.
4775 */
4776 bool
4777 pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
4778 {
4779 struct pmap_page *pp;
4780
4781 pp = pmap_pv_tracked(pa);
4782 if (pp == NULL)
4783 panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
4784
4785 return pmap_pp_clear_attrs(pp, pa, clearbits);
4786 }
4787
/*
 * pmap protection functions
 */
4791
4792 /*
4793 * pmap_page_protect: change the protection of all recorded mappings
4794 * of a managed page
4795 *
4796 * => NOTE: this is an inline function in pmap.h
4797 */
4798
4799 /* see pmap.h */
4800
4801 /*
4802 * pmap_pv_protect: change the protection of all recorded mappings
4803 * of an unmanaged pv-tracked page
4804 *
4805 * => NOTE: this is an inline function in pmap.h
4806 */
4807
4808 /* see pmap.h */
4809
4810 /*
 * pmap_protect: set the protection of the pages in a pmap
4812 *
4813 * => NOTE: this is an inline function in pmap.h
4814 */
4815
4816 /* see pmap.h */
4817
4818 /*
4819 * pmap_write_protect: write-protect pages in a pmap.
4820 *
4821 * Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
4822 * don't need to remove this bit when re-entering the PTEs here: Xen tracks the
4823 * kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
4824 * present the page will still be considered as a kernel page, and the privilege
4825 * separation will be enforced correctly.
4826 */
4827 void
4828 pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
4829 {
4830 pt_entry_t bit_rem, bit_put;
4831 pt_entry_t *ptes;
4832 pt_entry_t * const *pdes;
4833 struct pmap *pmap2;
4834 vaddr_t blockend, va;
4835 int lvl, i;
4836
4837 if (__predict_false(pmap->pm_write_protect != NULL)) {
4838 (*pmap->pm_write_protect)(pmap, sva, eva, prot);
4839 return;
4840 }
4841
4842 bit_rem = 0;
4843 if (!(prot & VM_PROT_WRITE))
4844 bit_rem = PTE_W;
4845
4846 bit_put = 0;
4847 if (!(prot & VM_PROT_EXECUTE))
4848 bit_put = pmap_pg_nx;
4849
4850 sva &= ~PAGE_MASK;
4851 eva &= ~PAGE_MASK;
4852
4853 /*
4854 * Acquire pmap. No need to lock the kernel pmap as we won't
4855 * be touching PV entries nor stats and kernel PDEs aren't
4856 * freed.
4857 */
4858 if (pmap != pmap_kernel()) {
4859 mutex_enter(&pmap->pm_lock);
4860 }
4861 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4862
4863 for (va = sva ; va < eva; va = blockend) {
4864 pt_entry_t *spte, *epte;
4865
4866 blockend = x86_round_pdr(va + 1);
4867 if (blockend > eva)
4868 blockend = eva;
4869
4870 /* Is it a valid block? */
4871 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4872 continue;
4873 }
4874 KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
4875 KASSERT(lvl == 1);
4876
4877 spte = &ptes[pl1_i(va)];
4878 epte = &ptes[pl1_i(blockend)];
4879
4880 for (i = 0; spte < epte; spte++, i++) {
4881 pt_entry_t opte, npte;
4882
4883 do {
4884 opte = *spte;
4885 if (!pmap_valid_entry(opte)) {
4886 goto next;
4887 }
4888 npte = (opte & ~bit_rem) | bit_put;
4889 } while (pmap_pte_cas(spte, opte, npte) != opte);
4890
4891 if ((opte & PTE_D) != 0) {
4892 vaddr_t tva = va + x86_ptob(i);
4893 pmap_tlb_shootdown(pmap, tva, opte,
4894 TLBSHOOT_WRITE_PROTECT);
4895 }
4896 next:;
4897 }
4898 }
4899
4900 /* Release pmap. */
4901 pmap_unmap_ptes(pmap, pmap2);
4902 if (pmap != pmap_kernel()) {
4903 mutex_exit(&pmap->pm_lock);
4904 }
4905 }
4906
4907 /*
4908 * pmap_unwire: clear the wired bit in the PTE.
4909 *
4910 * => Mapping should already be present.
4911 */
4912 void
4913 pmap_unwire(struct pmap *pmap, vaddr_t va)
4914 {
4915 pt_entry_t *ptes, *ptep, opte;
4916 pd_entry_t * const *pdes;
4917 struct pmap *pmap2;
4918 int lvl;
4919
4920 if (__predict_false(pmap->pm_unwire != NULL)) {
4921 (*pmap->pm_unwire)(pmap, va);
4922 return;
4923 }
4924
4925 /*
4926 * Acquire pmap. Need to lock the kernel pmap only to protect the
4927 * statistics.
4928 */
4929 mutex_enter(&pmap->pm_lock);
4930 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
4931
4932 if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
4933 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
4934 }
4935 KASSERT(lvl == 1);
4936
4937 ptep = &ptes[pl1_i(va)];
4938 opte = *ptep;
4939 KASSERT(pmap_valid_entry(opte));
4940
4941 if (opte & PTE_WIRED) {
4942 pt_entry_t npte = opte & ~PTE_WIRED;
4943
4944 opte = pmap_pte_testset(ptep, npte);
4945 pmap_stats_update_bypte(pmap, npte, opte);
4946 } else {
4947 printf("%s: wiring for pmap %p va %#" PRIxVADDR
4948 " did not change!\n", __func__, pmap, va);
4949 }
4950
4951 /* Release pmap. */
4952 pmap_unmap_ptes(pmap, pmap2);
4953 mutex_exit(&pmap->pm_lock);
4954 }
4955
4956 /*
4957 * pmap_copy: copy mappings from one pmap to another
4958 *
4959 * => optional function
4960 * void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
4961 */
4962
4963 /*
4964 * defined as macro in pmap.h
4965 */
4966
4967 __strict_weak_alias(pmap_enter, pmap_enter_default);
4968
4969 int
4970 pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
4971 u_int flags)
4972 {
4973 if (__predict_false(pmap->pm_enter != NULL)) {
4974 return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
4975 }
4976
4977 return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
4978 }
4979
4980 /*
4981 * pmap_enter: enter a mapping into a pmap
4982 *
4983 * => must be done "now" ... no lazy-evaluation
4984 */
4985 int
4986 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
4987 vm_prot_t prot, u_int flags, int domid)
4988 {
4989 pt_entry_t *ptes, opte, npte;
4990 pt_entry_t *ptep;
4991 pd_entry_t * const *pdes;
4992 struct vm_page *ptp;
4993 struct vm_page *new_pg, *old_pg;
4994 struct pmap_page *new_pp, *old_pp;
4995 struct pv_entry *old_pve, *new_pve;
4996 bool wired = (flags & PMAP_WIRED) != 0;
4997 struct pmap *pmap2;
4998 struct pmap_ptparray pt;
4999 int error;
5000 bool getptp, samepage, new_embedded;
5001 rb_tree_t *tree;
5002
5003 KASSERT(pmap_initialized);
5004 KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5005 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5006 PRIxVADDR " over PDP!", __func__, va);
5007 KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
5008 pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
5009 "%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
5010
5011 #ifdef XENPV
5012 KASSERT(domid == DOMID_SELF || pa == 0);
5013 #endif
5014
5015 npte = ma | protection_codes[prot] | PTE_P;
5016 npte |= pmap_pat_flags(flags);
5017 if (wired)
5018 npte |= PTE_WIRED;
5019 if (va < VM_MAXUSER_ADDRESS) {
5020 KASSERTMSG(pmap != pmap_kernel(),
5021 "entering user va %#"PRIxVADDR" into kernel pmap",
5022 va);
5023 if (pmap_is_user(pmap))
5024 npte |= PTE_U;
5025 }
5026
5027 if (pmap == pmap_kernel())
5028 npte |= pmap_pg_g;
5029 if (flags & VM_PROT_ALL) {
5030 npte |= PTE_A;
5031 if (flags & VM_PROT_WRITE) {
5032 KASSERT((npte & PTE_W) != 0);
5033 npte |= PTE_D;
5034 }
5035 }
5036
5037 #ifdef XENPV
5038 if (domid != DOMID_SELF)
5039 new_pg = NULL;
5040 else
5041 #endif
5042 new_pg = PHYS_TO_VM_PAGE(pa);
5043
5044 if (new_pg != NULL) {
5045 /* This is a managed page */
5046 npte |= PTE_PVLIST;
5047 new_pp = VM_PAGE_TO_PP(new_pg);
5048 PMAP_CHECK_PP(new_pp);
5049 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
5050 /* This is an unmanaged pv-tracked page */
5051 npte |= PTE_PVLIST;
5052 PMAP_CHECK_PP(new_pp);
5053 } else {
5054 new_pp = NULL;
5055 }
5056
5057 /* Begin by locking the pmap. */
5058 mutex_enter(&pmap->pm_lock);
5059
5060 /* Look up the PTP. Allocate if none present. */
5061 ptp = NULL;
5062 getptp = false;
5063 if (pmap != pmap_kernel()) {
5064 ptp = pmap_find_ptp(pmap, va, 1);
5065 if (ptp == NULL) {
5066 getptp = true;
5067 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
5068 if (error != 0) {
5069 if (flags & PMAP_CANFAIL) {
5070 mutex_exit(&pmap->pm_lock);
5071 return error;
5072 }
5073 panic("%s: get ptp failed, error=%d", __func__,
5074 error);
5075 }
5076 }
5077 tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5078 } else {
5079 /* Embedded PV entries rely on this. */
5080 KASSERT(va != 0);
5081 tree = &pmap_kernel_rb;
5082 }
5083
5084 /*
5085 * Look up the old PV entry at this VA (if any), and insert a new PV
5086 * entry if required for the new mapping. Temporarily track the old
5087 * and new mappings concurrently. Only after the old mapping is
5088 * evicted from the pmap will we remove its PV entry. Otherwise,
5089 * our picture of modified/accessed state for either page could get
5090 * out of sync (we need any P->V operation for either page to stall
5091 * on pmap->pm_lock until done here).
5092 */
5093 new_pve = NULL;
5094 old_pve = NULL;
5095 samepage = false;
5096 new_embedded = false;
5097
5098 if (new_pp != NULL) {
5099 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
5100 &old_pve, &samepage, &new_embedded, tree);
5101
5102 /*
5103 * If a new pv_entry was needed and none was available, we
5104 * can go no further.
5105 */
5106 if (error != 0) {
5107 if (flags & PMAP_CANFAIL) {
5108 if (getptp) {
5109 pmap_unget_ptp(pmap, &pt);
5110 }
5111 mutex_exit(&pmap->pm_lock);
5112 return error;
5113 }
5114 panic("%s: alloc pve failed", __func__);
5115 }
5116 } else {
5117 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5118 }
5119
5120 /* Map PTEs into address space. */
5121 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5122
5123 /* Install any newly allocated PTPs. */
5124 if (getptp) {
5125 pmap_install_ptp(pmap, &pt, va, pdes);
5126 }
5127
5128 /* Check if there is an existing mapping. */
5129 ptep = &ptes[pl1_i(va)];
5130 opte = *ptep;
5131 bool have_oldpa = pmap_valid_entry(opte);
5132 paddr_t oldpa = pmap_pte2pa(opte);
5133
5134 /*
5135 * Update the pte.
5136 */
5137 do {
5138 opte = *ptep;
5139
5140 /*
5141 * if the same page, inherit PTE_A and PTE_D.
5142 */
5143 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5144 npte |= opte & (PTE_A | PTE_D);
5145 }
5146 #if defined(XENPV)
5147 if (domid != DOMID_SELF) {
5148 /* pmap_pte_cas with error handling */
5149 int s = splvm();
5150 if (opte != *ptep) {
5151 splx(s);
5152 continue;
5153 }
5154 error = xpq_update_foreign(
5155 vtomach((vaddr_t)ptep), npte, domid, flags);
5156 splx(s);
5157 if (error) {
5158 /* Undo pv_entry tracking - oof. */
5159 if (new_pp != NULL) {
5160 mutex_spin_enter(&new_pp->pp_lock);
5161 if (new_pve != NULL) {
5162 LIST_REMOVE(new_pve, pve_list);
5163 KASSERT(pmap->pm_pve == NULL);
5164 pmap->pm_pve = new_pve;
5165 } else if (new_embedded) {
5166 new_pp->pp_pte.pte_ptp = NULL;
5167 new_pp->pp_pte.pte_va = 0;
5168 }
5169 mutex_spin_exit(&new_pp->pp_lock);
5170 }
5171 pmap_unmap_ptes(pmap, pmap2);
5172 /* Free new PTP. */
5173 if (ptp != NULL && ptp->wire_count <= 1) {
5174 pmap_free_ptp(pmap, ptp, va, ptes,
5175 pdes);
5176 }
5177 mutex_exit(&pmap->pm_lock);
5178 return error;
5179 }
5180 break;
5181 }
5182 #endif /* defined(XENPV) */
5183 } while (pmap_pte_cas(ptep, opte, npte) != opte);
5184
5185 /*
5186 * Done with the PTEs: they can now be unmapped.
5187 */
5188 pmap_unmap_ptes(pmap, pmap2);
5189
5190 /*
5191 * Update statistics and PTP's reference count.
5192 */
5193 pmap_stats_update_bypte(pmap, npte, opte);
5194 if (ptp != NULL) {
5195 if (!have_oldpa) {
5196 ptp->wire_count++;
5197 }
5198 /* Remember minimum VA in PTP. */
5199 pmap_ptp_range_set(ptp, va);
5200 }
5201 KASSERT(ptp == NULL || ptp->wire_count > 1);
5202
5203 /*
5204 * If the same page, we can skip pv_entry handling.
5205 */
5206 if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
5207 KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
5208 if ((npte & PTE_PVLIST) != 0) {
5209 KASSERT(samepage);
5210 pmap_check_pv(pmap, ptp, new_pp, va, true);
5211 }
5212 goto same_pa;
5213 } else if ((npte & PTE_PVLIST) != 0) {
5214 KASSERT(!samepage);
5215 }
5216
5217 /*
5218 * If old page is pv-tracked, remove pv_entry from its list.
5219 */
5220 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5221 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5222 old_pp = VM_PAGE_TO_PP(old_pg);
5223 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5224 panic("%s: PTE_PVLIST with pv-untracked page"
5225 " va = %#"PRIxVADDR
5226 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
5227 __func__, va, oldpa, atop(pa));
5228 }
5229
5230 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5231 pmap_pte_to_pp_attrs(opte));
5232 } else {
5233 KASSERT(old_pve == NULL);
5234 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5235 }
5236
5237 /*
5238 * If new page is dynamically PV tracked, insert to tree.
5239 */
5240 if (new_pve != NULL) {
5241 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5242 old_pve = rb_tree_insert_node(tree, new_pve);
5243 KASSERT(old_pve == new_pve);
5244 pmap_check_pv(pmap, ptp, new_pp, va, true);
5245 }
5246
5247 same_pa:
5248 /*
5249 * shootdown tlb if necessary.
5250 */
5251
5252 if ((~opte & (PTE_P | PTE_A)) == 0 &&
5253 ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
5254 pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
5255 }
5256 pmap_drain_pv(pmap);
5257 mutex_exit(&pmap->pm_lock);
5258 return 0;
5259 }
5260
5261 #if defined(XEN) && defined(DOM0OPS)
5262
5263 struct pmap_data_gnt {
5264 SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
5265 vaddr_t pd_gnt_sva;
5266 vaddr_t pd_gnt_eva; /* range covered by this gnt */
5267 int pd_gnt_refs; /* ref counter */
5268 struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
5269 };
5270 SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
5271
5272 static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
5273
5274 static struct pmap_data_gnt *
5275 pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5276 {
5277 struct pmap_data_gnt_head *headp;
5278 struct pmap_data_gnt *pgnt;
5279
5280 KASSERT(mutex_owned(&pmap->pm_lock));
5281 headp = pmap->pm_data;
5282 KASSERT(headp != NULL);
5283 SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
5284 if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
5285 return pgnt;
5286 /* check that we're not overlapping part of a region */
5287 KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
5288 }
5289 return NULL;
5290 }
5291
5292 static void
5293 pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
5294 const struct gnttab_map_grant_ref *ops)
5295 {
5296 struct pmap_data_gnt_head *headp;
5297 struct pmap_data_gnt *pgnt;
5298 vaddr_t eva = sva + nentries * PAGE_SIZE;
5299 KASSERT(mutex_owned(&pmap->pm_lock));
5300 KASSERT(nentries >= 1);
5301 if (pmap->pm_remove == NULL) {
5302 pmap->pm_remove = pmap_remove_gnt;
5303 KASSERT(pmap->pm_data == NULL);
5304 headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
5305 SLIST_INIT(headp);
5306 pmap->pm_data = headp;
5307 } else {
5308 KASSERT(pmap->pm_remove == pmap_remove_gnt);
5309 KASSERT(pmap->pm_data != NULL);
5310 headp = pmap->pm_data;
5311 }
5312
5313 pgnt = pmap_find_gnt(pmap, sva, eva);
5314 if (pgnt != NULL) {
5315 KASSERT(pgnt->pd_gnt_sva == sva);
5316 KASSERT(pgnt->pd_gnt_eva == eva);
5317 return;
5318 }
5319
5320 /* new entry */
5321 pgnt = kmem_alloc(sizeof(*pgnt) +
5322 (nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
5323 pgnt->pd_gnt_sva = sva;
5324 pgnt->pd_gnt_eva = eva;
5325 pgnt->pd_gnt_refs = 0;
5326 memcpy(pgnt->pd_gnt_ops, ops,
5327 sizeof(struct gnttab_map_grant_ref) * nentries);
5328 SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
5329 }
5330
5331 static void
5332 pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
5333 {
5334 struct pmap_data_gnt_head *headp = pmap->pm_data;
5335 int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
5336 KASSERT(nentries >= 1);
5337 KASSERT(mutex_owned(&pmap->pm_lock));
5338 KASSERT(pgnt->pd_gnt_refs == 0);
5339 SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
5340 kmem_free(pgnt, sizeof(*pgnt) +
5341 (nentries - 1) * sizeof(struct gnttab_map_grant_ref));
5342 if (SLIST_EMPTY(headp)) {
5343 kmem_free(headp, sizeof(*headp));
5344 pmap->pm_data = NULL;
5345 pmap->pm_remove = NULL;
5346 }
5347 }
5348
5349 /*
5350 * pmap_enter_gnt: enter a grant entry into a pmap
5351 *
5352 * => must be done "now" ... no lazy-evaluation
5353 */
5354 int
5355 pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
5356 const struct gnttab_map_grant_ref *oops)
5357 {
5358 struct pmap_data_gnt *pgnt;
5359 pt_entry_t *ptes, opte;
5360 #ifndef XENPV
5361 pt_entry_t npte;
5362 #endif
5363 pt_entry_t *ptep;
5364 pd_entry_t * const *pdes;
5365 struct vm_page *ptp;
5366 struct vm_page *old_pg;
5367 struct pmap_page *old_pp;
5368 struct pv_entry *old_pve;
5369 struct pmap *pmap2;
5370 struct pmap_ptparray pt;
5371 int error;
5372 bool getptp;
5373 rb_tree_t *tree;
5374 struct gnttab_map_grant_ref *op;
5375 int ret;
5376 int idx;
5377
5378 KASSERT(pmap_initialized);
5379 KASSERT(va < VM_MAX_KERNEL_ADDRESS);
5380 KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
5381 PRIxVADDR " over PDP!", __func__, va);
5382 KASSERT(pmap != pmap_kernel());
5383
5384 /* Begin by locking the pmap. */
5385 mutex_enter(&pmap->pm_lock);
5386 pmap_alloc_gnt(pmap, sva, nentries, oops);
5387
5388 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5389 KASSERT(pgnt != NULL);
5390
5391 /* Look up the PTP. Allocate if none present. */
5392 ptp = NULL;
5393 getptp = false;
5394 ptp = pmap_find_ptp(pmap, va, 1);
5395 if (ptp == NULL) {
5396 getptp = true;
5397 error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
5398 if (error != 0) {
5399 mutex_exit(&pmap->pm_lock);
5400 return error;
5401 }
5402 }
5403 tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
5404
5405 /*
5406 * Look up the old PV entry at this VA (if any), and insert a new PV
5407 * entry if required for the new mapping. Temporarily track the old
5408 * and new mappings concurrently. Only after the old mapping is
5409 * evicted from the pmap will we remove its PV entry. Otherwise,
5410 * our picture of modified/accessed state for either page could get
5411 * out of sync (we need any P->V operation for either page to stall
5412 * on pmap->pm_lock until done here).
5413 */
5414 old_pve = NULL;
5415
5416 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
5417
5418 /* Map PTEs into address space. */
5419 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5420
5421 /* Install any newly allocated PTPs. */
5422 if (getptp) {
5423 pmap_install_ptp(pmap, &pt, va, pdes);
5424 }
5425
5426 /* Check if there is an existing mapping. */
5427 ptep = &ptes[pl1_i(va)];
5428 opte = *ptep;
5429 bool have_oldpa = pmap_valid_entry(opte);
5430 paddr_t oldpa = pmap_pte2pa(opte);
5431
5432 /*
5433 * Update the pte.
5434 */
5435
5436 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5437 op = &pgnt->pd_gnt_ops[idx];
5438
5439 #ifdef XENPV
5440 KASSERT(op->flags & GNTMAP_contains_pte);
5441 op->host_addr = xpmap_ptetomach(ptep);
5442 #else
5443 KASSERT((op->flags & GNTMAP_contains_pte) == 0);
5444 KASSERT(op->flags != 0);
5445 KASSERT(op->host_addr != 0);
5446 #endif
5447 op->dev_bus_addr = 0;
5448 op->status = GNTST_general_error;
5449 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5450 if (__predict_false(ret)) {
5451 printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5452 __func__, ret);
5453 op->status = GNTST_general_error;
5454 }
5455 for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
5456 kpause("gntmap", false, mstohz(1), NULL);
5457 ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
5458 if (__predict_false(ret)) {
5459 printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
5460 __func__, ret);
5461 op->status = GNTST_general_error;
5462 }
5463 }
5464 if (__predict_false(op->status != GNTST_okay)) {
5465 printf("%s: GNTTABOP_map_grant_ref status: %d\n",
5466 __func__, op->status);
5467 if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/
5468 ptp->wire_count--;
5469 }
5470 } else {
5471 #ifndef XENPV
5472 npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P;
5473 if ((op->flags & GNTMAP_readonly) == 0)
5474 npte |= PTE_W;
5475 do {
5476 opte = *ptep;
5477 } while (pmap_pte_cas(ptep, opte, npte) != opte);
5478 #endif
5479 pgnt->pd_gnt_refs++;
5480 if (!have_oldpa) {
5481 ptp->wire_count++;
5482 }
5483 KASSERT(ptp->wire_count > 1);
5484 /* Remember minimum VA in PTP. */
5485 pmap_ptp_range_set(ptp, va);
5486 }
5487 if (ptp->wire_count <= 1)
5488 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5489
5490 /*
5491 * Done with the PTEs: they can now be unmapped.
5492 */
5493 pmap_unmap_ptes(pmap, pmap2);
5494
5495 /*
5496 * Update statistics and PTP's reference count.
5497 */
5498 pmap_stats_update_bypte(pmap, 0, opte);
5499
5500 /*
5501 * If old page is pv-tracked, remove pv_entry from its list.
5502 */
5503 if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
5504 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
5505 old_pp = VM_PAGE_TO_PP(old_pg);
5506 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
5507 panic("%s: PTE_PVLIST with pv-untracked page"
5508 " va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
5509 __func__, va, oldpa);
5510 }
5511
5512 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
5513 pmap_pte_to_pp_attrs(opte));
5514 } else {
5515 KASSERT(old_pve == NULL);
5516 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
5517 }
5518
5519 pmap_drain_pv(pmap);
5520 mutex_exit(&pmap->pm_lock);
5521 return op->status;
5522 }
5523
5524 /*
5525 * pmap_remove_gnt: grant mapping removal function.
5526 *
5527 * => caller should not be holding any pmap locks
5528 */
5529 static void
5530 pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
5531 {
5532 struct pmap_data_gnt *pgnt;
5533 pt_entry_t *ptes;
5534 pd_entry_t pde;
5535 pd_entry_t * const *pdes;
5536 struct vm_page *ptp;
5537 struct pmap *pmap2;
5538 vaddr_t va;
5539 int lvl;
5540 int idx;
5541 struct gnttab_map_grant_ref *op;
5542 struct gnttab_unmap_grant_ref unmap_op;
5543 int ret;
5544
5545 KASSERT(pmap != pmap_kernel());
5546 KASSERT(pmap->pm_remove == pmap_remove_gnt);
5547
5548 mutex_enter(&pmap->pm_lock);
5549 for (va = sva; va < eva; va += PAGE_SIZE) {
5550 pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
5551 if (pgnt == NULL) {
5552 pmap_remove_locked(pmap, sva, eva);
5553 continue;
5554 }
5555
5556 pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
5557 if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
5558 panic("pmap_remove_gnt pdes not valid");
5559 }
5560
5561 idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
5562 op = &pgnt->pd_gnt_ops[idx];
5563 KASSERT(lvl == 1);
5564
5565 /* Get PTP if non-kernel mapping. */
5566 ptp = pmap_find_ptp(pmap, va, 1);
5567 KASSERTMSG(ptp != NULL,
5568 "%s: unmanaged PTP detected", __func__);
5569
5570 if (op->status == GNTST_okay) {
5571 KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
5572 #ifdef XENPV
5573 unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
5574 #else
5575 unmap_op.host_addr = op->host_addr;
5576 pmap_pte_testset(&ptes[pl1_i(va)], 0);
5577 #endif
5578 unmap_op.handle = op->handle;
5579 unmap_op.dev_bus_addr = 0;
5580 ret = HYPERVISOR_grant_table_op(
5581 GNTTABOP_unmap_grant_ref, &unmap_op, 1);
5582 if (ret) {
5583 printf("%s: GNTTABOP_unmap_grant_ref "
5584 "failed: %d\n", __func__, ret);
5585 }
5586
5587 ptp->wire_count--;
5588 pgnt->pd_gnt_refs--;
5589 }
5590 if (pgnt->pd_gnt_refs == 0) {
5591 pmap_free_gnt(pmap, pgnt);
5592 }
5593 /*
5594 * if mapping removed and the PTP is no longer
5595 * being used, free it!
5596 */
5597
5598 if (ptp->wire_count <= 1)
5599 pmap_free_ptp(pmap, ptp, va, ptes, pdes);
5600 pmap_unmap_ptes(pmap, pmap2);
5601 }
5602 mutex_exit(&pmap->pm_lock);
5603 }
5604 #endif /* XEN && DOM0OPS */
5605
5606 paddr_t
5607 pmap_get_physpage(void)
5608 {
5609 struct vm_page *ptp;
5610 struct pmap *kpm = pmap_kernel();
5611 paddr_t pa;
5612
5613 if (!uvm.page_init_done) {
5614 /*
5615 * We're growing the kernel pmap early (from
5616 * uvm_pageboot_alloc()). This case must be
5617 * handled a little differently.
5618 */
5619
5620 if (!uvm_page_physget(&pa))
5621 panic("%s: out of memory", __func__);
5622 #if defined(__HAVE_DIRECT_MAP)
5623 memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
5624 #else
5625 #if defined(XENPV)
5626 if (XEN_VERSION_SUPPORTED(3, 4)) {
5627 xen_pagezero(pa);
5628 return pa;
5629 }
5630 #endif
5631 kpreempt_disable();
5632 pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
5633 PTE_W | pmap_pg_nx);
5634 pmap_pte_flush();
5635 pmap_update_pg((vaddr_t)early_zerop);
5636 memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
5637 #if defined(DIAGNOSTIC) || defined(XENPV)
5638 pmap_pte_set(early_zero_pte, 0);
5639 pmap_pte_flush();
5640 #endif /* defined(DIAGNOSTIC) */
5641 kpreempt_enable();
5642 #endif /* defined(__HAVE_DIRECT_MAP) */
5643 } else {
5644 /* XXX */
5645 ptp = uvm_pagealloc(NULL, 0, NULL,
5646 UVM_PGA_USERESERVE|UVM_PGA_ZERO);
5647 if (ptp == NULL)
5648 panic("%s: out of memory", __func__);
5649 ptp->flags &= ~PG_BUSY;
5650 ptp->wire_count = 1;
5651 pa = VM_PAGE_TO_PHYS(ptp);
5652 }
5653 pmap_stats_update(kpm, 1, 0);
5654
5655 return pa;
5656 }
5657
5658 /*
5659 * Expand the page tree with the specified amount of PTPs, mapping virtual
5660 * addresses starting at kva. We populate all the levels but the last one
5661 * (L1). The nodes of the tree are created as RW, but the pages covered
5662 * will be kentered in L1, with proper permissions.
5663 *
5664 * Used only by pmap_growkernel.
5665 */
5666 static void
5667 pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
5668 {
5669 unsigned long i;
5670 paddr_t pa;
5671 unsigned long index, endindex;
5672 int level;
5673 pd_entry_t *pdep;
5674 #ifdef XENPV
5675 int s = splvm(); /* protect xpq_* */
5676 #endif
5677
5678 for (level = PTP_LEVELS; level > 1; level--) {
5679 if (level == PTP_LEVELS)
5680 pdep = cpm->pm_pdir;
5681 else
5682 pdep = normal_pdes[level - 2];
5683 index = pl_i_roundup(kva, level);
5684 endindex = index + needed_ptps[level - 1] - 1;
5685
5686 for (i = index; i <= endindex; i++) {
5687 pt_entry_t pte;
5688
5689 KASSERT(!pmap_valid_entry(pdep[i]));
5690 pa = pmap_get_physpage();
5691 pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
5692 #ifdef __x86_64__
5693 pte |= pmap_pg_nx;
5694 #endif
5695 pmap_pte_set(&pdep[i], pte);
5696
5697 #ifdef XENPV
5698 if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
5699 if (__predict_true(
5700 cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5701 /* update per-cpu PMDs on all cpus */
5702 xen_kpm_sync(pmap_kernel(), i);
5703 } else {
5704 /*
5705 * too early; update primary CPU
5706 * PMD only (without locks)
5707 */
5708 #ifdef __x86_64__
5709 pd_entry_t *cpu_pdep =
5710 &cpu_info_primary.ci_kpm_pdir[i];
5711 #else
5712 pd_entry_t *cpu_pdep =
5713 &cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
5714 #endif
5715 pmap_pte_set(cpu_pdep, pte);
5716 }
5717 }
5718 #endif
5719
5720 KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
5721 pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
5722 nkptp[level - 1]++;
5723 }
5724 pmap_pte_flush();
5725 }
5726 #ifdef XENPV
5727 splx(s);
5728 #endif
5729 }
5730
5731 /*
5732 * pmap_growkernel: increase usage of KVM space.
5733 *
5734 * => we allocate new PTPs for the kernel and install them in all
5735 * the pmaps on the system.
5736 */
5737 vaddr_t
5738 pmap_growkernel(vaddr_t maxkvaddr)
5739 {
5740 struct pmap *kpm = pmap_kernel();
5741 struct pmap *cpm;
5742 #if !defined(XENPV) || !defined(__x86_64__)
5743 struct pmap *pm;
5744 long old;
5745 #endif
5746 int s, i;
5747 long needed_kptp[PTP_LEVELS], target_nptp;
5748 bool invalidate = false;
5749
5750 s = splvm(); /* to be safe */
5751 mutex_enter(&kpm->pm_lock);
5752
5753 if (maxkvaddr <= pmap_maxkvaddr) {
5754 mutex_exit(&kpm->pm_lock);
5755 splx(s);
5756 return pmap_maxkvaddr;
5757 }
5758
5759 maxkvaddr = x86_round_pdr(maxkvaddr);
5760 #if !defined(XENPV) || !defined(__x86_64__)
5761 old = nkptp[PTP_LEVELS - 1];
5762 #endif
5763
5764 /* Initialize needed_kptp. */
5765 for (i = PTP_LEVELS - 1; i >= 1; i--) {
5766 target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
5767 pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
5768
5769 if (target_nptp > nkptpmax[i])
5770 panic("out of KVA space");
5771 KASSERT(target_nptp >= nkptp[i]);
5772 needed_kptp[i] = target_nptp - nkptp[i];
5773 }
5774
5775 #ifdef XENPV
5776 /* only pmap_kernel(), or the per-cpu map, has kernel entries */
5777 cpm = kpm;
5778 #else
5779 /* Get the current pmap */
5780 if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
5781 cpm = curcpu()->ci_pmap;
5782 } else {
5783 cpm = kpm;
5784 }
5785 #endif
5786
5787 kasan_shadow_map((void *)pmap_maxkvaddr,
5788 (size_t)(maxkvaddr - pmap_maxkvaddr));
5789 kmsan_shadow_map((void *)pmap_maxkvaddr,
5790 (size_t)(maxkvaddr - pmap_maxkvaddr));
5791
5792 pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
5793
5794 /*
5795 * If the number of top level entries changed, update all pmaps.
5796 */
5797 if (needed_kptp[PTP_LEVELS - 1] != 0) {
5798 #ifdef XENPV
5799 #ifdef __x86_64__
5800 /* nothing, kernel entries are never entered in user pmap */
5801 #else
5802 int pdkidx;
5803
5804 mutex_enter(&pmaps_lock);
5805 LIST_FOREACH(pm, &pmaps, pm_list) {
5806 for (pdkidx = PDIR_SLOT_KERN + old;
5807 pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
5808 pdkidx++) {
5809 pmap_pte_set(&pm->pm_pdir[pdkidx],
5810 kpm->pm_pdir[pdkidx]);
5811 }
5812 pmap_pte_flush();
5813 }
5814 mutex_exit(&pmaps_lock);
5815 #endif /* __x86_64__ */
5816 #else /* XENPV */
5817 size_t newpdes;
5818 newpdes = nkptp[PTP_LEVELS - 1] - old;
5819 if (cpm != kpm) {
5820 memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
5821 &cpm->pm_pdir[PDIR_SLOT_KERN + old],
5822 newpdes * sizeof(pd_entry_t));
5823 }
5824
5825 mutex_enter(&pmaps_lock);
5826 LIST_FOREACH(pm, &pmaps, pm_list) {
5827 if (__predict_false(pm->pm_enter != NULL)) {
5828 /*
5829 * Not a native pmap, the kernel is not mapped,
5830 * so nothing to synchronize.
5831 */
5832 continue;
5833 }
5834 memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
5835 &kpm->pm_pdir[PDIR_SLOT_KERN + old],
5836 newpdes * sizeof(pd_entry_t));
5837 }
5838 mutex_exit(&pmaps_lock);
5839 #endif
5840 invalidate = true;
5841 }
5842 pmap_maxkvaddr = maxkvaddr;
5843 mutex_exit(&kpm->pm_lock);
5844 splx(s);
5845
5846 if (invalidate && pmap_initialized) {
5847 /* Invalidate the pmap cache. */
5848 pool_cache_invalidate(&pmap_cache);
5849 }
5850
5851 return maxkvaddr;
5852 }
5853
#ifdef DEBUG
void pmap_dump(struct pmap *, vaddr_t, vaddr_t);

/*
 * pmap_dump: print every valid mapping of a pmap within [sva, eva)
 *
 * => an eva beyond VM_MAXUSER_ADDRESS, or not above sva, is replaced
 *    by VM_MAXUSER_ADDRESS
 * => caller should not be holding any pmap locks
 */
void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
	pd_entry_t * const *pdes;
	pt_entry_t *ptes, *ptep;
	struct pmap *pmap2;
	vaddr_t blkend;
	int lvl;

	/* Normalise the end of the range. */
	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
		eva = VM_MAXUSER_ADDRESS;

	mutex_enter(&pmap->pm_lock);
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);

	/*
	 * Walk the range one PTP-sized block at a time.
	 */
	for (; sva < eva; sva = blkend) {

		/* end of this block, clipped to the range */
		blkend = x86_round_pdr(sva + 1);
		if (blkend > eva)
			blkend = eva;

		/* skip blocks that have no valid directory entries */
		if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
			continue;
		KASSERT(lvl == 1);

		for (ptep = &ptes[pl1_i(sva)]; sva < blkend;
		    sva += PAGE_SIZE, ptep++) {
			if (pmap_valid_entry(*ptep)) {
				printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
				    " (pte=%#" PRIxPADDR ")\n",
				    sva, (paddr_t)pmap_pte2pa(*ptep),
				    (paddr_t)*ptep);
			}
		}
	}
	pmap_unmap_ptes(pmap, pmap2);
	mutex_exit(&pmap->pm_lock);
}
#endif
5911
5912 /*
5913 * pmap_update: process deferred invalidations and frees.
5914 */
5915 void
5916 pmap_update(struct pmap *pmap)
5917 {
5918 struct pmap_page *pp;
5919 struct vm_page *ptp;
5920
5921 /*
5922 * Initiate any pending TLB shootdowns. Wait for them to
5923 * complete before returning control to the caller.
5924 */
5925 kpreempt_disable();
5926 pmap_tlb_shootnow();
5927 kpreempt_enable();
5928
5929 /*
5930 * Now that shootdowns are complete, process deferred frees. This
5931 * is an unlocked check, but is safe as we're only interested in
5932 * work done in this LWP - we won't get a false negative.
5933 */
5934 if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
5935 return;
5936 }
5937
5938 mutex_enter(&pmap->pm_lock);
5939 while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
5940 KASSERT(ptp->wire_count == 0);
5941 KASSERT(ptp->uanon == NULL);
5942 LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
5943 pp = VM_PAGE_TO_PP(ptp);
5944 LIST_INIT(&pp->pp_pvlist);
5945 pp->pp_attrs = 0;
5946 pp->pp_pte.pte_ptp = NULL;
5947 pp->pp_pte.pte_va = 0;
5948 PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
5949
5950 /*
5951 * XXX Hack to avoid extra locking, and lock
5952 * assertions in uvm_pagefree(). Despite uobject
5953 * being set, this isn't a managed page.
5954 */
5955 PMAP_DUMMY_LOCK(pmap);
5956 uvm_pagerealloc(ptp, NULL, 0);
5957 PMAP_DUMMY_UNLOCK(pmap);
5958 uvm_pagefree(ptp);
5959 }
5960 mutex_exit(&pmap->pm_lock);
5961 }
5962
5963 #if PTP_LEVELS > 4
5964 #error "Unsupported number of page table mappings"
5965 #endif
5966
5967 paddr_t
5968 pmap_init_tmp_pgtbl(paddr_t pg)
5969 {
5970 static bool maps_loaded;
5971 static const paddr_t x86_tmp_pml_paddr[] = {
5972 4 * PAGE_SIZE, /* L1 */
5973 5 * PAGE_SIZE, /* L2 */
5974 6 * PAGE_SIZE, /* L3 */
5975 7 * PAGE_SIZE /* L4 */
5976 };
5977 static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
5978
5979 pd_entry_t *tmp_pml, *kernel_pml;
5980
5981 int level;
5982
5983 if (!maps_loaded) {
5984 for (level = 0; level < PTP_LEVELS; ++level) {
5985 x86_tmp_pml_vaddr[level] =
5986 uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
5987 UVM_KMF_VAONLY);
5988
5989 if (x86_tmp_pml_vaddr[level] == 0)
5990 panic("mapping of real mode PML failed\n");
5991 pmap_kenter_pa(x86_tmp_pml_vaddr[level],
5992 x86_tmp_pml_paddr[level],
5993 VM_PROT_READ | VM_PROT_WRITE, 0);
5994 }
5995 pmap_update(pmap_kernel());
5996 maps_loaded = true;
5997 }
5998
5999 /* Zero levels 1-3 */
6000 for (level = 0; level < PTP_LEVELS - 1; ++level) {
6001 tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6002 memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
6003 }
6004
6005 /* Copy PML4 */
6006 kernel_pml = pmap_kernel()->pm_pdir;
6007 tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
6008 memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
6009
6010 #ifdef PAE
6011 /*
6012 * Use the last 4 entries of the L2 page as L3 PD entries. These
6013 * last entries are unlikely to be used for temporary mappings.
6014 * 508: maps 0->1GB (userland)
6015 * 509: unused
6016 * 510: unused
6017 * 511: maps 3->4GB (kernel)
6018 */
6019 tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
6020 tmp_pml[509] = 0;
6021 tmp_pml[510] = 0;
6022 tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
6023 #endif
6024
6025 for (level = PTP_LEVELS - 1; level > 0; --level) {
6026 tmp_pml = (void *)x86_tmp_pml_vaddr[level];
6027
6028 tmp_pml[pl_i(pg, level + 1)] =
6029 (x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
6030 }
6031
6032 tmp_pml = (void *)x86_tmp_pml_vaddr[0];
6033 tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
6034
6035 #ifdef PAE
6036 /* Return the PA of the L3 page (entry 508 of the L2 page) */
6037 return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
6038 #endif
6039
6040 return x86_tmp_pml_paddr[PTP_LEVELS - 1];
6041 }
6042
6043 u_int
6044 x86_mmap_flags(paddr_t mdpgno)
6045 {
6046 u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
6047 u_int pflag = 0;
6048
6049 if (nflag & X86_MMAP_FLAG_PREFETCH)
6050 pflag |= PMAP_WRITE_COMBINE;
6051
6052 return pflag;
6053 }
6054
6055 #if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
6056
6057 /*
6058 * -----------------------------------------------------------------------------
6059 * *****************************************************************************
6060 * *****************************************************************************
6061 * *****************************************************************************
6062 * *****************************************************************************
6063 * **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
6064 * *****************************************************************************
6065 * *****************************************************************************
6066 * *****************************************************************************
6067 * *****************************************************************************
6068 * -----------------------------------------------------------------------------
6069 *
6070 * These functions are invoked as callbacks from the code above. Contrary to
6071 * native, EPT does not have a recursive slot; therefore, it is not possible
6072 * to call pmap_map_ptes(). Instead, we use the direct map and walk down the
6073 * tree manually.
6074 *
6075 * Apart from that, the logic is mostly the same as native. Once a pmap has
6076 * been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
6077 * After that we're good, and the callbacks will handle the translations
6078 * for us.
6079 *
6080 * -----------------------------------------------------------------------------
6081 */
6082
/* Hardware bits. */
#define EPT_R		__BIT(0)	/* read */
#define EPT_W		__BIT(1)	/* write */
#define EPT_X		__BIT(2)	/* execute */
#define EPT_T		__BITS(5,3)	/* type */
#define		TYPE_UC	0
#define		TYPE_WC	1
#define		TYPE_WT	4
#define		TYPE_WP	5
#define		TYPE_WB	6
#define EPT_NOPAT	__BIT(6)
#define EPT_L		__BIT(7)	/* large */
#define EPT_A		__BIT(8)	/* accessed */
#define EPT_D		__BIT(9)	/* dirty */
/* Software bits. */
#define EPT_PVLIST	__BIT(60)
#define EPT_WIRED	__BIT(61)

/*
 * An EPT entry is considered valid when its read bit is set.  The
 * argument is parenthesized so that any expression may be passed.
 */
#define pmap_ept_valid_entry(pte)	((pte) & EPT_R)
6102
/*
 * When false, the EPT code below does not consult the EPT_A/EPT_D bits
 * of an entry and instead treats valid mappings as accessed (and as
 * dirty when converting to pmap_page attributes).
 */
bool pmap_ept_has_ad __read_mostly;
6104
6105 static inline void
6106 pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
6107 {
6108 int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
6109 int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
6110
6111 KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6112 KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
6113
6114 pmap_stats_update(pmap, resid_diff, wired_diff);
6115 }
6116
6117 static pt_entry_t
6118 pmap_ept_type(u_int flags)
6119 {
6120 u_int cacheflags = (flags & PMAP_CACHE_MASK);
6121 pt_entry_t ret;
6122
6123 switch (cacheflags) {
6124 case PMAP_NOCACHE:
6125 case PMAP_NOCACHE_OVR:
6126 ret = __SHIFTIN(TYPE_UC, EPT_T);
6127 break;
6128 case PMAP_WRITE_COMBINE:
6129 ret = __SHIFTIN(TYPE_WC, EPT_T);
6130 break;
6131 case PMAP_WRITE_BACK:
6132 default:
6133 ret = __SHIFTIN(TYPE_WB, EPT_T);
6134 break;
6135 }
6136
6137 ret |= EPT_NOPAT;
6138 return ret;
6139 }
6140
6141 static inline pt_entry_t
6142 pmap_ept_prot(vm_prot_t prot)
6143 {
6144 pt_entry_t res = 0;
6145
6146 if (prot & VM_PROT_READ)
6147 res |= EPT_R;
6148 if (prot & VM_PROT_WRITE)
6149 res |= EPT_W;
6150 if (prot & VM_PROT_EXECUTE)
6151 res |= EPT_X;
6152
6153 return res;
6154 }
6155
6156 static inline uint8_t
6157 pmap_ept_to_pp_attrs(pt_entry_t ept)
6158 {
6159 uint8_t ret = 0;
6160 if (pmap_ept_has_ad) {
6161 if (ept & EPT_D)
6162 ret |= PP_ATTRS_D;
6163 if (ept & EPT_A)
6164 ret |= PP_ATTRS_A;
6165 } else {
6166 ret |= (PP_ATTRS_D|PP_ATTRS_A);
6167 }
6168 if (ept & EPT_W)
6169 ret |= PP_ATTRS_W;
6170 return ret;
6171 }
6172
6173 static inline pt_entry_t
6174 pmap_pp_attrs_to_ept(uint8_t attrs)
6175 {
6176 pt_entry_t ept = 0;
6177 if (attrs & PP_ATTRS_D)
6178 ept |= EPT_D;
6179 if (attrs & PP_ATTRS_A)
6180 ept |= EPT_A;
6181 if (attrs & PP_ATTRS_W)
6182 ept |= EPT_W;
6183 return ept;
6184 }
6185
6186 /*
6187 * Helper for pmap_ept_free_ptp.
6188 * tree[0] = &L2[L2idx]
6189 * tree[1] = &L3[L3idx]
6190 * tree[2] = &L4[L4idx]
6191 */
6192 static void
6193 pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
6194 {
6195 pt_entry_t *pteva;
6196 paddr_t ptepa;
6197 int i, index;
6198
6199 ptepa = pmap->pm_pdirpa[0];
6200 for (i = PTP_LEVELS; i > 1; i--) {
6201 index = pl_pi(va, i);
6202 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6203 KASSERT(pmap_ept_valid_entry(pteva[index]));
6204 tree[i - 2] = &pteva[index];
6205 ptepa = pmap_pte2pa(pteva[index]);
6206 }
6207 }
6208
6209 static void
6210 pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
6211 {
6212 pd_entry_t *tree[3];
6213 int level;
6214
6215 KASSERT(pmap != pmap_kernel());
6216 KASSERT(mutex_owned(&pmap->pm_lock));
6217 KASSERT(kpreempt_disabled());
6218
6219 pmap_ept_get_tree(pmap, va, tree);
6220
6221 level = 1;
6222 do {
6223 (void)pmap_pte_testset(tree[level - 1], 0);
6224
6225 pmap_freepage(pmap, ptp, level);
6226 if (level < PTP_LEVELS - 1) {
6227 ptp = pmap_find_ptp(pmap, va, level + 1);
6228 ptp->wire_count--;
6229 if (ptp->wire_count > 1)
6230 break;
6231 }
6232 } while (++level < PTP_LEVELS);
6233 pmap_pte_flush();
6234 }
6235
6236 /* Allocate L4->L3->L2. Return L2. */
6237 static void
6238 pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
6239 {
6240 struct vm_page *ptp;
6241 unsigned long index;
6242 pd_entry_t *pteva;
6243 paddr_t ptepa;
6244 int i;
6245
6246 KASSERT(pmap != pmap_kernel());
6247 KASSERT(mutex_owned(&pmap->pm_lock));
6248 KASSERT(kpreempt_disabled());
6249
6250 /*
6251 * Now that we have all the pages looked up or allocated,
6252 * loop through again installing any new ones into the tree.
6253 */
6254 ptepa = pmap->pm_pdirpa[0];
6255 for (i = PTP_LEVELS; i > 1; i--) {
6256 index = pl_pi(va, i);
6257 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6258
6259 if (pmap_ept_valid_entry(pteva[index])) {
6260 KASSERT(!pt->alloced[i]);
6261 ptepa = pmap_pte2pa(pteva[index]);
6262 continue;
6263 }
6264
6265 ptp = pt->pg[i];
6266 ptp->flags &= ~PG_BUSY; /* never busy */
6267 ptp->wire_count = 1;
6268 pmap->pm_ptphint[i - 2] = ptp;
6269 ptepa = VM_PAGE_TO_PHYS(ptp);
6270 pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
6271
6272 pmap_pte_flush();
6273 pmap_stats_update(pmap, 1, 0);
6274
6275 /*
6276 * If we're not in the top level, increase the
6277 * wire count of the parent page.
6278 */
6279 if (i < PTP_LEVELS) {
6280 pt->pg[i + 1]->wire_count++;
6281 }
6282 }
6283 }
6284
6285 static int
6286 pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
6287 u_int flags)
6288 {
6289 pt_entry_t *ptes, opte, npte;
6290 pt_entry_t *ptep;
6291 struct vm_page *ptp;
6292 struct vm_page *new_pg, *old_pg;
6293 struct pmap_page *new_pp, *old_pp;
6294 struct pv_entry *old_pve, *new_pve;
6295 bool wired = (flags & PMAP_WIRED) != 0;
6296 bool accessed;
6297 struct pmap_ptparray pt;
6298 int error;
6299 bool getptp, samepage, new_embedded;
6300 rb_tree_t *tree;
6301
6302 KASSERT(pmap_initialized);
6303 KASSERT(va < VM_MAXUSER_ADDRESS);
6304
6305 npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
6306
6307 if (wired)
6308 npte |= EPT_WIRED;
6309 if (flags & VM_PROT_ALL) {
6310 npte |= EPT_A;
6311 if (flags & VM_PROT_WRITE) {
6312 KASSERT((npte & EPT_W) != 0);
6313 npte |= EPT_D;
6314 }
6315 }
6316
6317 new_pg = PHYS_TO_VM_PAGE(pa);
6318 if (new_pg != NULL) {
6319 /* This is a managed page */
6320 npte |= EPT_PVLIST;
6321 new_pp = VM_PAGE_TO_PP(new_pg);
6322 } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
6323 /* This is an unmanaged pv-tracked page */
6324 npte |= EPT_PVLIST;
6325 } else {
6326 new_pp = NULL;
6327 }
6328
6329 /* Begin by locking the pmap. */
6330 mutex_enter(&pmap->pm_lock);
6331
6332 /* Look up the PTP. Allocate if none present. */
6333 ptp = NULL;
6334 getptp = false;
6335 if (pmap != pmap_kernel()) {
6336 ptp = pmap_find_ptp(pmap, va, 1);
6337 if (ptp == NULL) {
6338 getptp = true;
6339 error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
6340 if (error != 0) {
6341 if (flags & PMAP_CANFAIL) {
6342 mutex_exit(&pmap->pm_lock);
6343 return error;
6344 }
6345 panic("%s: get ptp failed, error=%d", __func__,
6346 error);
6347 }
6348 }
6349 tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
6350 } else {
6351 /* Embedded PV entries rely on this. */
6352 KASSERT(va != 0);
6353 tree = &pmap_kernel_rb;
6354 }
6355
6356 /*
6357 * Look up the old PV entry at this VA (if any), and insert a new PV
6358 * entry if required for the new mapping. Temporarily track the old
6359 * and new mappings concurrently. Only after the old mapping is
6360 * evicted from the pmap will we remove its PV entry. Otherwise,
6361 * our picture of modified/accessed state for either page could get
6362 * out of sync (we need any P->V operation for either page to stall
6363 * on pmap->pm_lock until done here).
6364 */
6365 new_pve = NULL;
6366 old_pve = NULL;
6367 samepage = false;
6368 new_embedded = false;
6369
6370 if (new_pp != NULL) {
6371 error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
6372 &old_pve, &samepage, &new_embedded, tree);
6373
6374 /*
6375 * If a new pv_entry was needed and none was available, we
6376 * can go no further.
6377 */
6378 if (error != 0) {
6379 if (flags & PMAP_CANFAIL) {
6380 if (getptp) {
6381 pmap_unget_ptp(pmap, &pt);
6382 }
6383 mutex_exit(&pmap->pm_lock);
6384 return error;
6385 }
6386 panic("%s: alloc pve failed", __func__);
6387 }
6388 } else {
6389 old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
6390 }
6391
6392 /* Map PTEs into address space. */
6393 kpreempt_disable();
6394
6395 /* Install any newly allocated PTPs. */
6396 if (getptp) {
6397 pmap_ept_install_ptp(pmap, &pt, va);
6398 }
6399
6400 /* Check if there is an existing mapping. */
6401 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
6402 ptep = &ptes[pl1_pi(va)];
6403 opte = *ptep;
6404 bool have_oldpa = pmap_ept_valid_entry(opte);
6405 paddr_t oldpa = pmap_pte2pa(opte);
6406
6407 /*
6408 * Update the pte.
6409 */
6410 do {
6411 opte = *ptep;
6412
6413 /*
6414 * if the same page, inherit PTE_A and PTE_D.
6415 */
6416 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6417 npte |= opte & (EPT_A | EPT_D);
6418 }
6419 } while (pmap_pte_cas(ptep, opte, npte) != opte);
6420
6421 /*
6422 * Done with the PTEs: they can now be unmapped.
6423 */
6424 kpreempt_enable();
6425
6426 /*
6427 * Update statistics and PTP's reference count.
6428 */
6429 pmap_ept_stats_update_bypte(pmap, npte, opte);
6430 if (ptp != NULL) {
6431 if (!have_oldpa) {
6432 ptp->wire_count++;
6433 }
6434 /* Remember minimum VA in PTP. */
6435 pmap_ptp_range_set(ptp, va);
6436 }
6437 KASSERT(ptp == NULL || ptp->wire_count > 1);
6438
6439 /*
6440 * If the same page, we can skip pv_entry handling.
6441 */
6442 if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
6443 KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
6444 if ((npte & EPT_PVLIST) != 0) {
6445 KASSERT(samepage);
6446 pmap_check_pv(pmap, ptp, new_pp, va, true);
6447 }
6448 goto same_pa;
6449 } else if ((npte & EPT_PVLIST) != 0) {
6450 KASSERT(!samepage);
6451 }
6452
6453 /*
6454 * If old page is pv-tracked, remove pv_entry from its list.
6455 */
6456 if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
6457 if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
6458 old_pp = VM_PAGE_TO_PP(old_pg);
6459 } else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
6460 panic("%s: EPT_PVLIST with pv-untracked page"
6461 " va = %#"PRIxVADDR
6462 " pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
6463 __func__, va, oldpa, atop(pa));
6464 }
6465
6466 pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
6467 pmap_ept_to_pp_attrs(opte));
6468 } else {
6469 KASSERT(old_pve == NULL);
6470 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6471 }
6472
6473 /*
6474 * If new page is dynamically PV tracked, insert to tree.
6475 */
6476 if (new_pve != NULL) {
6477 KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
6478 old_pve = rb_tree_insert_node(tree, new_pve);
6479 KASSERT(old_pve == new_pve);
6480 pmap_check_pv(pmap, ptp, new_pp, va, true);
6481 }
6482
6483 same_pa:
6484 /*
6485 * shootdown tlb if necessary.
6486 */
6487
6488 if (pmap_ept_has_ad) {
6489 accessed = (~opte & (EPT_R | EPT_A)) == 0;
6490 } else {
6491 accessed = (opte & EPT_R) != 0;
6492 }
6493 if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
6494 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
6495 }
6496 pmap_drain_pv(pmap);
6497 mutex_exit(&pmap->pm_lock);
6498 return 0;
6499 }
6500
6501 /* Pay close attention, this returns L2. */
6502 static int
6503 pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
6504 {
6505 pt_entry_t *pteva;
6506 paddr_t ptepa;
6507 int i, index;
6508
6509 KASSERT(mutex_owned(&pmap->pm_lock));
6510
6511 ptepa = pmap->pm_pdirpa[0];
6512 for (i = PTP_LEVELS; i > 1; i--) {
6513 pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
6514 index = pl_pi(va, i);
6515 if (!pmap_ept_valid_entry(pteva[index]))
6516 return i;
6517 ptepa = pmap_pte2pa(pteva[index]);
6518 }
6519 if (lastpde != NULL) {
6520 *lastpde = pteva[index];
6521 }
6522
6523 return 0;
6524 }
6525
6526 static bool
6527 pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
6528 {
6529 pt_entry_t *ptes, pte;
6530 pd_entry_t pde;
6531 paddr_t ptppa, pa;
6532 bool rv;
6533
6534 #ifdef __HAVE_DIRECT_MAP
6535 if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
6536 if (pap != NULL) {
6537 *pap = PMAP_DIRECT_UNMAP(va);
6538 }
6539 return true;
6540 }
6541 #endif
6542
6543 rv = false;
6544 pa = 0;
6545
6546 mutex_enter(&pmap->pm_lock);
6547 kpreempt_disable();
6548
6549 if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
6550 ptppa = pmap_pte2pa(pde);
6551 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6552 pte = ptes[pl1_pi(va)];
6553 if (__predict_true((pte & EPT_R) != 0)) {
6554 pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
6555 rv = true;
6556 }
6557 }
6558
6559 kpreempt_enable();
6560 mutex_exit(&pmap->pm_lock);
6561
6562 if (pap != NULL) {
6563 *pap = pa;
6564 }
6565 return rv;
6566 }
6567
6568 static bool
6569 pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
6570 vaddr_t va)
6571 {
6572 struct pv_entry *pve;
6573 struct vm_page *pg;
6574 struct pmap_page *pp;
6575 pt_entry_t opte;
6576 bool accessed;
6577
6578 KASSERT(pmap != pmap_kernel());
6579 KASSERT(mutex_owned(&pmap->pm_lock));
6580 KASSERT(kpreempt_disabled());
6581
6582 if (!pmap_ept_valid_entry(*pte)) {
6583 /* VA not mapped. */
6584 return false;
6585 }
6586
6587 /* Atomically save the old PTE and zap it. */
6588 opte = pmap_pte_testset(pte, 0);
6589 if (!pmap_ept_valid_entry(opte)) {
6590 return false;
6591 }
6592
6593 pmap_ept_stats_update_bypte(pmap, 0, opte);
6594
6595 if (ptp) {
6596 /*
6597 * Dropping a PTE. Make sure that the PDE is flushed.
6598 */
6599 ptp->wire_count--;
6600 if (ptp->wire_count <= 1) {
6601 opte |= EPT_A;
6602 }
6603 }
6604
6605 if (pmap_ept_has_ad) {
6606 accessed = (opte & EPT_A) != 0;
6607 } else {
6608 accessed = true;
6609 }
6610 if (accessed) {
6611 pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
6612 }
6613
6614 /*
6615 * If we are not on a pv list - we are done.
6616 */
6617 if ((opte & EPT_PVLIST) == 0) {
6618 KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
6619 "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
6620 KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
6621 "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
6622 KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
6623 &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
6624 return true;
6625 }
6626
6627 if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
6628 pp = VM_PAGE_TO_PP(pg);
6629 } else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
6630 paddr_t pa = pmap_pte2pa(opte);
6631 panic("%s: EPT_PVLIST with pv-untracked page"
6632 " va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
6633 __func__, va, pa, atop(pa));
6634 }
6635
6636 /* Sync R/M bits. */
6637 pve = pmap_lookup_pv(pmap, ptp, pp, va);
6638 pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
6639 return true;
6640 }
6641
6642 static void
6643 pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
6644 vaddr_t startva, vaddr_t endva)
6645 {
6646 pt_entry_t *pte = (pt_entry_t *)ptpva;
6647
6648 KASSERT(pmap != pmap_kernel());
6649 KASSERT(mutex_owned(&pmap->pm_lock));
6650 KASSERT(kpreempt_disabled());
6651
6652 /*
6653 * mappings are very often sparse, so clip the given range to the
6654 * range of PTEs that are known present in the PTP.
6655 */
6656 pmap_ptp_range_clip(ptp, &startva, &pte);
6657
6658 /*
6659 * note that ptpva points to the PTE that maps startva. this may
6660 * or may not be the first PTE in the PTP.
6661 *
6662 * we loop through the PTP while there are still PTEs to look at
6663 * and the wire_count is greater than 1 (because we use the wire_count
6664 * to keep track of the number of real PTEs in the PTP).
6665 */
6666 while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
6667 (void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
6668 startva += PAGE_SIZE;
6669 pte++;
6670 }
6671 }
6672
6673 static void
6674 pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
6675 {
6676 pt_entry_t *ptes;
6677 pd_entry_t pde;
6678 paddr_t ptppa;
6679 vaddr_t blkendva, va = sva;
6680 struct vm_page *ptp;
6681
6682 mutex_enter(&pmap->pm_lock);
6683 kpreempt_disable();
6684
6685 for (/* null */ ; va < eva ; va = blkendva) {
6686 int lvl;
6687
6688 /* determine range of block */
6689 blkendva = x86_round_pdr(va+1);
6690 if (blkendva > eva)
6691 blkendva = eva;
6692
6693 lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
6694 if (lvl != 0) {
6695 /* Skip a range corresponding to an invalid pde. */
6696 blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
6697 continue;
6698 }
6699
6700 /* PA of the PTP */
6701 ptppa = pmap_pte2pa(pde);
6702
6703 ptp = pmap_find_ptp(pmap, va, 1);
6704 KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
6705 __func__);
6706
6707 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6708
6709 pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
6710 blkendva);
6711
6712 /* If PTP is no longer being used, free it. */
6713 if (ptp && ptp->wire_count <= 1) {
6714 pmap_ept_free_ptp(pmap, ptp, va);
6715 }
6716 }
6717
6718 kpreempt_enable();
6719 pmap_drain_pv(pmap);
6720 mutex_exit(&pmap->pm_lock);
6721 }
6722
/*
 * pmap_ept_sync_pv: clear bits in the EPT PTE that maps va in the pmap
 * owning ptp, and report the attributes the PTE had beforehand.
 *
 * => pa is the physical address the PTE is expected to map
 * => clearbits is either ~0, in which case it is used as-is, or a
 *    combination of PP_ATTRS_D, PP_ATTRS_A and PP_ATTRS_W, which is
 *    converted to the corresponding EPT bits
 * => returns EAGAIN, without touching *oattrs or *optep, if the PTE does
 *    not (or no longer) map pa with EPT_R set
 * => returns 0 otherwise; *oattrs receives the old PTE's bits converted
 *    to PP attributes and, if optep is not NULL, *optep the old PTE
 */
static int
pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
    uint8_t *oattrs, pt_entry_t *optep)
{
	struct pmap *pmap;
	pt_entry_t *ptep;
	pt_entry_t opte;
	pt_entry_t npte;
	pt_entry_t expect;
	bool need_shootdown;

	/* What a still-intact mapping of pa must look like. */
	expect = pmap_pa2pte(pa) | EPT_R;
	pmap = ptp_to_pmap(ptp);

	if (clearbits != ~0) {
		KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
		clearbits = pmap_pp_attrs_to_ept(clearbits);
	}

	ptep = pmap_map_pte(pmap, ptp, va);
	/*
	 * Compare-and-swap loop: recompute npte from a fresh read of the
	 * PTE every time the PTE turns out to have changed under us.
	 */
	do {
		opte = *ptep;
		KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
		KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
		KASSERT(opte == 0 || (opte & EPT_R) != 0);
		if ((opte & (PTE_FRAME | EPT_R)) != expect) {
			/*
			 * We lost a race with a V->P operation like
			 * pmap_remove().  Wait for the competitor
			 * reflecting pte bits into mp_attrs.
			 */
			pmap_unmap_pte();
			return EAGAIN;
		}

		/*
		 * Check if there's anything to do on this PTE.
		 */
		if ((opte & clearbits) == 0) {
			/* None of the requested bits is set: leave it. */
			need_shootdown = false;
			break;
		}

		/*
		 * We need a shootdown if the PTE is cached (EPT_A) ...
		 * ... Unless we are clearing only the EPT_W bit and
		 * it isn't cached as RW (EPT_D).
		 *
		 * If pmap_ept_has_ad is false, the A/D bits are not
		 * consulted and a shootdown is always done.
		 */
		if (pmap_ept_has_ad) {
			need_shootdown = (opte & EPT_A) != 0 &&
			    !(clearbits == EPT_W && (opte & EPT_D) == 0);
		} else {
			need_shootdown = true;
		}

		npte = opte & ~clearbits;

		/*
		 * If we need a shootdown anyway, clear EPT_A and EPT_D.
		 */
		if (need_shootdown) {
			npte &= ~(EPT_A | EPT_D);
		}
		KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
		KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
		KASSERT(npte == 0 || (opte & EPT_R) != 0);
	} while (pmap_pte_cas(ptep, opte, npte) != opte);

	if (need_shootdown) {
		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
	}
	pmap_unmap_pte();

	/* Hand the pre-update state of the PTE back to the caller. */
	*oattrs = pmap_ept_to_pp_attrs(opte);
	if (optep != NULL)
		*optep = opte;
	return 0;
}
6801
6802 static void
6803 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
6804 vaddr_t va)
6805 {
6806
6807 KASSERT(mutex_owned(&pmap->pm_lock));
6808
6809 pmap_ept_stats_update_bypte(pmap, 0, opte);
6810 ptp->wire_count--;
6811 if (ptp->wire_count <= 1) {
6812 pmap_ept_free_ptp(pmap, ptp, va);
6813 }
6814 }
6815
6816 static void
6817 pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
6818 {
6819 pt_entry_t bit_rem;
6820 pt_entry_t *ptes, *spte;
6821 pt_entry_t opte, npte;
6822 pd_entry_t pde;
6823 paddr_t ptppa;
6824 vaddr_t va;
6825 bool modified;
6826
6827 bit_rem = 0;
6828 if (!(prot & VM_PROT_WRITE))
6829 bit_rem = EPT_W;
6830
6831 sva &= PTE_FRAME;
6832 eva &= PTE_FRAME;
6833
6834 /* Acquire pmap. */
6835 mutex_enter(&pmap->pm_lock);
6836 kpreempt_disable();
6837
6838 for (va = sva; va < eva; va += PAGE_SIZE) {
6839 if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6840 continue;
6841 }
6842
6843 ptppa = pmap_pte2pa(pde);
6844 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6845 spte = &ptes[pl1_pi(va)];
6846
6847 do {
6848 opte = *spte;
6849 if (!pmap_ept_valid_entry(opte)) {
6850 goto next;
6851 }
6852 npte = (opte & ~bit_rem);
6853 } while (pmap_pte_cas(spte, opte, npte) != opte);
6854
6855 if (pmap_ept_has_ad) {
6856 modified = (opte & EPT_D) != 0;
6857 } else {
6858 modified = true;
6859 }
6860 if (modified) {
6861 vaddr_t tva = x86_ptob(spte - ptes);
6862 pmap_tlb_shootdown(pmap, tva, 0,
6863 TLBSHOOT_WRITE_PROTECT);
6864 }
6865 next:;
6866 }
6867
6868 kpreempt_enable();
6869 mutex_exit(&pmap->pm_lock);
6870 }
6871
6872 static void
6873 pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
6874 {
6875 pt_entry_t *ptes, *ptep, opte;
6876 pd_entry_t pde;
6877 paddr_t ptppa;
6878
6879 /* Acquire pmap. */
6880 mutex_enter(&pmap->pm_lock);
6881 kpreempt_disable();
6882
6883 if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
6884 panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
6885 }
6886
6887 ptppa = pmap_pte2pa(pde);
6888 ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
6889 ptep = &ptes[pl1_pi(va)];
6890 opte = *ptep;
6891 KASSERT(pmap_ept_valid_entry(opte));
6892
6893 if (opte & EPT_WIRED) {
6894 pt_entry_t npte = opte & ~EPT_WIRED;
6895
6896 opte = pmap_pte_testset(ptep, npte);
6897 pmap_ept_stats_update_bypte(pmap, npte, opte);
6898 } else {
6899 printf("%s: wiring for pmap %p va %#" PRIxVADDR
6900 "did not change!\n", __func__, pmap, va);
6901 }
6902
6903 /* Release pmap. */
6904 kpreempt_enable();
6905 mutex_exit(&pmap->pm_lock);
6906 }
6907
6908 /* -------------------------------------------------------------------------- */
6909
6910 void
6911 pmap_ept_transform(struct pmap *pmap)
6912 {
6913 pmap->pm_enter = pmap_ept_enter;
6914 pmap->pm_extract = pmap_ept_extract;
6915 pmap->pm_remove = pmap_ept_remove;
6916 pmap->pm_sync_pv = pmap_ept_sync_pv;
6917 pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
6918 pmap->pm_write_protect = pmap_ept_write_protect;
6919 pmap->pm_unwire = pmap_ept_unwire;
6920
6921 memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
6922 }
6923
6924 #endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
6925