xref: /freebsd-11-stable/sys/i386/i386/pmap.c (revision 806cf5513cac5458772952cb269b2ad049715bfd)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
9  * All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * the Systems Programming Group of the University of Utah Computer
13  * Science Department and William Jolitz of UUNET Technologies Inc.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  * 3. All advertising materials mentioning features or use of this software
24  *    must display the following acknowledgement:
25  *	This product includes software developed by the University of
26  *	California, Berkeley and its contributors.
27  * 4. Neither the name of the University nor the names of its contributors
28  *    may be used to endorse or promote products derived from this software
29  *    without specific prior written permission.
30  *
31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41  * SUCH DAMAGE.
42  *
43  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
44  */
45 /*-
46  * Copyright (c) 2003 Networks Associates Technology, Inc.
47  * All rights reserved.
48  *
49  * This software was developed for the FreeBSD Project by Jake Burkholder,
50  * Safeport Network Services, and Network Associates Laboratories, the
51  * Security Research Division of Network Associates, Inc. under
52  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53  * CHATS research program.
54  *
55  * Redistribution and use in source and binary forms, with or without
56  * modification, are permitted provided that the following conditions
57  * are met:
58  * 1. Redistributions of source code must retain the above copyright
59  *    notice, this list of conditions and the following disclaimer.
60  * 2. Redistributions in binary form must reproduce the above copyright
61  *    notice, this list of conditions and the following disclaimer in the
62  *    documentation and/or other materials provided with the distribution.
63  *
64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74  * SUCH DAMAGE.
75  */
76 
77 #include <sys/cdefs.h>
78 __FBSDID("$FreeBSD$");
79 
80 /*
81  *	Manages physical address maps.
82  *
83  *	Since the information managed by this module is
84  *	also stored by the logical address mapping module,
85  *	this module may throw away valid virtual-to-physical
86  *	mappings at almost any time.  However, invalidations
87  *	of virtual-to-physical mappings must be done as
88  *	requested.
89  *
90  *	In order to cope with hardware architectures which
91  *	make virtual-to-physical map invalidates expensive,
92  *	this module may delay invalidate or reduced protection
93  *	operations until such time as they are actually
94  *	necessary.  This module is given full information as
95  *	to which processors are currently using which maps,
96  *	and to when physical maps must be made correct.
97  */
98 
99 #include "opt_apic.h"
100 #include "opt_cpu.h"
101 #include "opt_pmap.h"
102 #include "opt_smp.h"
103 #include "opt_vm.h"
104 #include "opt_xbox.h"
105 
106 #include <sys/param.h>
107 #include <sys/systm.h>
108 #include <sys/kernel.h>
109 #include <sys/ktr.h>
110 #include <sys/lock.h>
111 #include <sys/malloc.h>
112 #include <sys/mman.h>
113 #include <sys/msgbuf.h>
114 #include <sys/mutex.h>
115 #include <sys/proc.h>
116 #include <sys/rwlock.h>
117 #include <sys/sf_buf.h>
118 #include <sys/sx.h>
119 #include <sys/vmmeter.h>
120 #include <sys/sched.h>
121 #include <sys/sysctl.h>
122 #include <sys/smp.h>
123 
124 #include <vm/vm.h>
125 #include <vm/vm_param.h>
126 #include <vm/vm_kern.h>
127 #include <vm/vm_page.h>
128 #include <vm/vm_map.h>
129 #include <vm/vm_object.h>
130 #include <vm/vm_extern.h>
131 #include <vm/vm_pageout.h>
132 #include <vm/vm_pager.h>
133 #include <vm/vm_phys.h>
134 #include <vm/vm_radix.h>
135 #include <vm/vm_reserv.h>
136 #include <vm/uma.h>
137 
138 #ifdef DEV_APIC
139 #include <sys/bus.h>
140 #include <machine/intr_machdep.h>
141 #include <x86/apicvar.h>
142 #endif
143 #include <machine/cpu.h>
144 #include <machine/cputypes.h>
145 #include <machine/md_var.h>
146 #include <machine/pcb.h>
147 #include <machine/specialreg.h>
148 #ifdef SMP
149 #include <machine/smp.h>
150 #endif
151 
152 #ifdef XBOX
153 #include <machine/xbox.h>
154 #endif
155 
/*
 * Default value for "shpgperproc" (see its definition below), which
 * pmap_init() multiplies by maxproc when sizing pv_entry_max.  A build
 * may override it by predefining PMAP_SHPGPERPROC.
 */
156 #ifndef PMAP_SHPGPERPROC
157 #define PMAP_SHPGPERPROC 200
158 #endif
159 
/*
 * PMAP_INLINE is the storage/inline specifier applied to small helpers.
 * Without DIAGNOSTIC it expands to an inline specifier (the gnu_inline
 * attribute form when the compiler advertises __GNUC_GNU_INLINE__,
 * "extern inline" otherwise); with DIAGNOSTIC it expands to nothing, so
 * the helpers become ordinary functions.
 */
160 #if !defined(DIAGNOSTIC)
161 #ifdef __GNUC_GNU_INLINE__
162 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
163 #else
164 #define PMAP_INLINE	extern inline
165 #endif
166 #else
167 #define PMAP_INLINE
168 #endif
169 
/*
 * PV_STAT(x) executes the statement "x" only when PV_STATS is defined;
 * otherwise it is an empty statement and "x" is never evaluated.
 */
170 #ifdef PV_STATS
171 #define PV_STAT(x)	do { x ; } while (0)
172 #else
173 #define PV_STAT(x)	do { } while (0)
174 #endif
175 
/*
 * pa_index() converts a physical address into the index of the
 * (1 << PDRSHIFT)-byte frame that contains it; pa_to_pvh() yields the
 * address of the pv_table slot for that frame.
 */
#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

/*
 * Get PDEs and PTEs for user/kernel address space.
 *
 * pmap_pde(m, v) is a pointer to the page directory entry covering
 * virtual address "v" in pmap "m".  pdir_pde(m, v) is the entry itself
 * (usable as an lvalue) taken from the page directory array "m".
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define	pdir_pde(m, v)	((m)[(vm_offset_t)(v) >> PDRSHIFT])

/*
 * Flag tests on the entry that "pte" points to.  Each macro reads the
 * entry through an "int *", so only the first int-sized word of the
 * entry is inspected.  The argument is fully parenthesized so that
 * pointer expressions such as "p + 1" are handled correctly.
 */
#define	pmap_pde_v(pte)		((*(int *)(pte) & PG_V) != 0)
#define	pmap_pte_w(pte)		((*(int *)(pte) & PG_W) != 0)
#define	pmap_pte_m(pte)		((*(int *)(pte) & PG_M) != 0)
#define	pmap_pte_u(pte)		((*(int *)(pte) & PG_A) != 0)
#define	pmap_pte_v(pte)		((*(int *)(pte) & PG_V) != 0)

/*
 * pmap_pte_set_w() atomically sets (v != 0) or clears (v == 0) PG_W.
 * pmap_pte_set_prot() replaces the PG_PROT field with "v" using two
 * plain (non-atomic) read-modify-write steps; note that it evaluates
 * "pte" twice.
 */
#define	pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
    atomic_clear_int((u_int *)(pte), PG_W))
#define	pmap_pte_set_prot(pte, v) \
    ((*(int *)(pte) &= ~PG_PROT), (*(int *)(pte) |= (v)))
194 
/*
 * Statically allocated pmap storage (presumably what "kernel_pmap"
 * refers to -- confirm in the pmap header), and the list of all pmaps.
 * pmap_bootstrap() initializes "allpmaps" and inserts the kernel pmap
 * into it while holding "allpmaps_lock", which it creates as a spin mutex.
 */
195 struct pmap kernel_pmap_store;
196 LIST_HEAD(pmaplist, pmap);
197 static struct pmaplist allpmaps;
198 static struct mtx allpmaps_lock;
199 
200 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
201 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
202 int pgeflag = 0;		/* PG_G or-in */
203 int pseflag = 0;		/* PG_PS or-in */
204 
/*
 * "nkpt" starts at NKPT; pmap_bootstrap() uses it as the number of
 * preallocated kernel page table pages beginning at physical address
 * KPTphys.  KERNend and KPTphys are defined outside this file.
 */
205 static int nkpt = NKPT;
206 vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
207 extern u_int32_t KERNend;
208 extern u_int32_t KPTphys;
209 
/*
 * PAE-only state.  "pdptzone" is the UMA zone created by pmap_init()
 * with pmap_pdpt_allocf() installed as its backing allocator.
 */
210 #if defined(PAE) || defined(PAE_TABLES)
211 pt_entry_t pg_nx;
212 static uma_zone_t pdptzone;
213 #endif
214 
215 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
216 
/*
 * pmap_init_pat() clears "pat_works" when the CPU has no PAT support or
 * is an Intel part covered by the errata check there.
 */
217 static int pat_works = 1;
218 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 0,
219     "Is page attribute table fully functional?");
220 
/*
 * Tunable fetched by pmap_init(), which also forces it to 0 when
 * "pseflag" is 0.
 */
221 static int pg_ps_enabled = 1;
222 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
223     &pg_ps_enabled, 0, "Are large page mappings enabled?");
224 
/*
 * Filled in by pmap_init_pat().  A negative entry marks a caching mode
 * that has no PAT index; pmap_cache_bits() panics on such a mode.
 */
225 #define	PAT_INDEX_SIZE	8
226 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
227 
228 /*
229  * pmap_mapdev support pre initialization (i.e. console)
 *
 * pmap_init() sets "pmap_initialized" to 1 once it has finished and,
 * when bootverbose is set, prints every slot of this table whose "va"
 * is non-zero.
230  */
231 #define	PMAP_PREINIT_MAPPING_COUNT	8
232 static struct pmap_preinit_mapping {
233 	vm_paddr_t	pa;
234 	vm_offset_t	va;
235 	vm_size_t	sz;
236 	int		mode;
237 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
238 static int pmap_initialized;
239 
/* Global pv list lock; initialized by pmap_bootstrap() with rw_init(). */
240 static struct rwlock_padalign pvh_global_lock;
241 
242 /*
243  * Data for the pv entry allocation mechanism
 *
 * pmap_init() computes pv_entry_max (from shpgperproc, maxproc and the
 * page count, overridable by the "vm.pmap.pv_entries" tunable) and sets
 * pv_entry_high_water to roughly 90% of it.  It also allocates
 * "pv_table", initializing the pv_list of each element, and reserves
 * the KVA described by pv_chunkbase/pv_maxchunks, threading it onto the
 * "pv_vafree" freelist.
244  */
245 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
246 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
247 static struct md_page *pv_table;
248 static int shpgperproc = PMAP_SHPGPERPROC;
249 
250 struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
251 int pv_maxchunks;			/* How many chunks we have KVA for */
252 vm_offset_t pv_vafree;			/* freelist stored in the PTE */
253 
254 /*
255  * All those kernel PT submaps that BSD is so fond of
 *
 * These are assigned by the SYSMAP() invocations in pmap_bootstrap():
 * CMAP3 receives a PTE pointer and CADDR3 the matching virtual address;
 * ptvmmap and msgbufp receive virtual addresses only; KPTD receives the
 * PTE pointer for the KPTmap window.
256  */
257 pt_entry_t *CMAP3;
258 static pd_entry_t *KPTD;
259 caddr_t ptvmmap = 0;
260 caddr_t CADDR3;
261 struct msgbuf *msgbufp = NULL;
262 
263 /*
264  * Crashdump maps.
 * (MAXDUMPPGS pages of KVA, reserved by pmap_bootstrap().)
265  */
266 static caddr_t crashdumpmap;
267 
/*
 * PMAP1/PMAP2 are PTE pointers and PADDR1/PADDR2 the virtual addresses
 * those PTEs map; both pairs are set up by pmap_bootstrap(), which also
 * initializes PMAP2mutex.  The counters below are exported read-only
 * under the "debug" sysctl tree.
 */
268 static pt_entry_t *PMAP1 = NULL, *PMAP2;
269 static pt_entry_t *PADDR1 = NULL, *PADDR2;
270 #ifdef SMP
271 static int PMAP1cpu;
272 static int PMAP1changedcpu;
273 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
274 	   &PMAP1changedcpu, 0,
275 	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
276 #endif
277 static int PMAP1changed;
278 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
279 	   &PMAP1changed, 0,
280 	   "Number of times pmap_pte_quick changed PMAP1");
281 static int PMAP1unchanged;
282 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
283 	   &PMAP1unchanged, 0,
284 	   "Number of times pmap_pte_quick didn't change PMAP1");
285 static struct mtx PMAP2mutex;
286 
/* NOTE(review): not referenced in the visible part of this file. */
287 int pti;
288 
289 /*
290  * Internal flags for pmap_mapdev_internal().
291  */
292 #define	MAPDEV_SETATTR		0x0000001	/* Modify existing attrs. */
293 
294 static void	free_pv_chunk(struct pv_chunk *pc);
295 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
296 static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
297 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
298 static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
299 #if VM_NRESERVLEVEL > 0
300 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
301 #endif
302 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
303 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
304 		    vm_offset_t va);
305 static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
306 
307 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
308 static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
309     vm_prot_t prot);
310 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
311     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
312 static void pmap_flush_page(vm_page_t m);
313 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
314 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
315 		    pd_entry_t pde);
316 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
317 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
318 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
319 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
320 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
321 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
322 #if VM_NRESERVLEVEL > 0
323 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
324 #endif
325 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
326     vm_prot_t prot);
327 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
328 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
329     struct spglist *free);
330 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
331     struct spglist *free);
332 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
333 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
334     struct spglist *free);
335 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
336 					vm_offset_t va);
337 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
338 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
339     vm_page_t m);
340 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
341     pd_entry_t newpde);
342 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
343 
344 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
345 
346 static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
347 static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
348 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
349 static void pmap_pte_release(pt_entry_t *pte);
350 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
351 #if defined(PAE) || defined(PAE_TABLES)
352 static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
353     int wait);
354 #endif
355 static void pmap_set_pg(void);
356 
357 static __inline void pagezero(void *page);
358 
359 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
360 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
361 
362 /*
363  * If you get an error here, then you set KVA_PAGES wrong! See the
364  * description of KVA_PAGES in sys/i386/include/pmap.h. It must be a
365  * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE kernel.
366  */
367 CTASSERT(KERNBASE % (1 << 24) == 0);
368 
369 /*
370  *	Bootstrap the system enough to run with virtual memory.
371  *
372  *	On the i386 this is called after mapping has already been enabled
373  *	and just syncs the pmap module with what has already been done.
374  *	[We can't call it easily with mapping off since the kernel is not
375  *	mapped with PA == VA, hence we would have to relocate every address
376  *	from the linked base (virtual) address "KERNBASE" to the actual
377  *	(physical) address starting relative to 0]
 *
 *	"firstaddr" is a physical address: the pages between KERNLOAD and
 *	"firstaddr" are counted as resident in the kernel pmap, and
 *	KERNBASE + firstaddr becomes the first candidate kernel virtual
 *	address.  (Presumably it is the first physical address not
 *	consumed by locore -- confirm against the caller.)
 *
 *	The statements below are order-dependent: every SYSMAP() call
 *	consumes the running "va"/"pte" cursors left by the previous one.
378  */
379 void
pmap_bootstrap(vm_paddr_t firstaddr)380 pmap_bootstrap(vm_paddr_t firstaddr)
381 {
382 	vm_offset_t va;
383 	pt_entry_t *pte, *unused;
384 	struct pcpu *pc;
385 	u_long res;
386 	int i;
387 
	/* Number of pages from KERNLOAD up to "firstaddr". */
388 	res = atop(firstaddr - (vm_paddr_t)KERNLOAD);
389 
390 	/*
391 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
392 	 * preallocated kernel page table pages so that vm_page structures
393 	 * representing these pages will be created.  The vm_page structures
394 	 * are required for promotion of the corresponding kernel virtual
395 	 * addresses to superpage mappings.
396 	 */
397 	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
398 
399 	/*
400 	 * Initialize the first available kernel virtual address.  However,
401 	 * using "firstaddr" may waste a few pages of the kernel virtual
402 	 * address space, because locore may not have mapped every physical
403 	 * page that it allocated.  Preferably, locore would provide a first
404 	 * unused virtual address in addition to "firstaddr".
405 	 */
406 	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
407 	virtual_end = VM_MAX_KERNEL_ADDRESS;
408 
409 	/*
410 	 * Initialize the kernel pmap (which is statically allocated).
411 	 * Count bootstrap data as being resident in case any of this data is
412 	 * later unmapped (using pmap_remove()) and freed.
413 	 */
414 	PMAP_LOCK_INIT(kernel_pmap);
415 	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
416 #if defined(PAE) || defined(PAE_TABLES)
417 	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
418 #endif
419 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
420 	kernel_pmap->pm_stats.resident_count = res;
421 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
422 
423  	/*
424 	 * Initialize the global pv list lock.
425 	 */
426 	rw_init(&pvh_global_lock, "pmap pv global");
427 
428 	LIST_INIT(&allpmaps);
429 
430 	/*
431 	 * Request a spin mutex so that changes to allpmaps cannot be
432 	 * preempted by smp_rendezvous_cpus().  Otherwise,
433 	 * pmap_update_pde_kernel() could access allpmaps while it is
434 	 * being changed.
435 	 */
436 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
437 	mtx_lock_spin(&allpmaps_lock);
438 	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
439 	mtx_unlock_spin(&allpmaps_lock);
440 
441 	/*
442 	 * Reserve some special page table entries/VA space for temporary
443 	 * mapping of pages.
	 *
	 * SYSMAP(c, p, v, n) hands out the next "n" pages of KVA: "v"
	 * receives the current "va" cast to type "c", "p" receives the
	 * current "pte" pointer, and both cursors then advance by "n"
	 * pages/entries.
444 	 */
445 #define	SYSMAP(c, p, v, n)	\
446 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
447 
448 	va = virtual_avail;
449 	pte = vtopte(va);
450 
451 
452 	/*
453 	 * Initialize temporary map objects on the current CPU for use
454 	 * during early boot.
455 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
456 	 * CMAP3 is used for the idle process page zeroing.
457 	 */
458 	pc = get_pcpu();
459 	mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
460 	SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
461 	SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
	/*
	 * Passing "pte" as the PTE destination makes that assignment a
	 * self-assignment: only the address is recorded for the qmap slot.
	 */
462 	SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)
463 
464 	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
465 
466 	/*
467 	 * Crashdump maps.
468 	 */
469 	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
470 
471 	/*
472 	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
473 	 */
474 	SYSMAP(caddr_t, unused, ptvmmap, 1)
475 
476 	/*
477 	 * msgbufp is used to map the system message buffer.
478 	 */
479 	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
480 
481 	/*
482 	 * KPTmap is used by pmap_kextract().
483 	 *
484 	 * KPTmap is first initialized by locore.  However, that initial
485 	 * KPTmap can only support NKPT page table pages.  Here, a larger
486 	 * KPTmap is created that can support KVA_PAGES page table pages.
487 	 */
488 	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
489 
	/* Map the NKPT preallocated page table pages through the new window. */
490 	for (i = 0; i < NKPT; i++)
491 		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;
492 
493 	/*
494 	 * Adjust the start of the KPTD and KPTmap so that the implementation
495 	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
496 	 */
497 	KPTD -= KPTDI;
498 	KPTmap -= i386_btop(KPTDI << PDRSHIFT);
499 
500 	/*
501 	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
502 	 * respectively.
503 	 */
504 	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
505 	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
506 
507 	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
508 
	/* Everything below "va" has now been handed out. */
509 	virtual_avail = va;
510 
511 	/*
512 	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
513 	 * physical memory region that is used by the ACPI wakeup code.  This
514 	 * mapping must not have PG_G set.
	 *
	 * The loop starts at index 1, so PTD[0] is the entry that is kept;
	 * PTD[1] through PTD[NKPT - 1] are cleared.
515 	 */
516 #ifdef XBOX
517 	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
518 	 * an early stage, we cannot yet neatly map video memory ... :-(
519 	 * Better fixes are very welcome! */
520 	if (!arch_i386_is_xbox)
521 #endif
522 	for (i = 1; i < NKPT; i++)
523 		PTD[i] = 0;
524 
525 	/*
526 	 * Initialize the PAT MSR if present.
527 	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
528 	 * side-effect, invalidates stale PG_G TLB entries that might
529 	 * have been created in our pre-boot environment.  We assume
530 	 * that PAT support implies PGE and in reverse, PGE presence
531 	 * comes with PAT.  Both features were added for Pentium Pro.
532 	 */
533 	pmap_init_pat();
534 
535 	/* Turn on PG_G on kernel page(s) */
536 	pmap_set_pg();
537 }
538 
539 static void
pmap_init_reserved_pages(void)540 pmap_init_reserved_pages(void)
541 {
542 	struct pcpu *pc;
543 	vm_offset_t pages;
544 	int i;
545 
546 	CPU_FOREACH(i) {
547 		pc = pcpu_find(i);
548 		/*
549 		 * Skip if the mapping has already been initialized,
550 		 * i.e. this is the BSP.
551 		 */
552 		if (pc->pc_cmap_addr1 != 0)
553 			continue;
554 		mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
555 		pages = kva_alloc(PAGE_SIZE * 3);
556 		if (pages == 0)
557 			panic("%s: unable to allocate KVA", __func__);
558 		pc->pc_cmap_pte1 = vtopte(pages);
559 		pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE);
560 		pc->pc_cmap_addr1 = (caddr_t)pages;
561 		pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE);
562 		pc->pc_qmap_addr = pages + (PAGE_SIZE * 2);
563 	}
564 }
565 
566 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);
567 
568 /*
569  * Setup the PAT MSR.
570  */
571 void
pmap_init_pat(void)572 pmap_init_pat(void)
573 {
574 	int pat_table[PAT_INDEX_SIZE];
575 	uint64_t pat_msr;
576 	u_long cr0, cr4;
577 	int i;
578 
579 	/* Set default PAT index table. */
580 	for (i = 0; i < PAT_INDEX_SIZE; i++)
581 		pat_table[i] = -1;
582 	pat_table[PAT_WRITE_BACK] = 0;
583 	pat_table[PAT_WRITE_THROUGH] = 1;
584 	pat_table[PAT_UNCACHEABLE] = 3;
585 	pat_table[PAT_WRITE_COMBINING] = 3;
586 	pat_table[PAT_WRITE_PROTECTED] = 3;
587 	pat_table[PAT_UNCACHED] = 3;
588 
589 	/*
590 	 * Bail if this CPU doesn't implement PAT.
591 	 * We assume that PAT support implies PGE.
592 	 */
593 	if ((cpu_feature & CPUID_PAT) == 0) {
594 		for (i = 0; i < PAT_INDEX_SIZE; i++)
595 			pat_index[i] = pat_table[i];
596 		pat_works = 0;
597 		return;
598 	}
599 
600 	/*
601 	 * Due to some Intel errata, we can only safely use the lower 4
602 	 * PAT entries.
603 	 *
604 	 *   Intel Pentium III Processor Specification Update
605 	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
606 	 * or Mode C Paging)
607 	 *
608 	 *   Intel Pentium IV  Processor Specification Update
609 	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
610 	 */
611 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
612 	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
613 		pat_works = 0;
614 
615 	/* Initialize default PAT entries. */
616 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
617 	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
618 	    PAT_VALUE(2, PAT_UNCACHED) |
619 	    PAT_VALUE(3, PAT_UNCACHEABLE) |
620 	    PAT_VALUE(4, PAT_WRITE_BACK) |
621 	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
622 	    PAT_VALUE(6, PAT_UNCACHED) |
623 	    PAT_VALUE(7, PAT_UNCACHEABLE);
624 
625 	if (pat_works) {
626 		/*
627 		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
628 		 * Program 5 and 6 as WP and WC.
629 		 * Leave 4 and 7 as WB and UC.
630 		 */
631 		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
632 		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
633 		    PAT_VALUE(6, PAT_WRITE_COMBINING);
634 		pat_table[PAT_UNCACHED] = 2;
635 		pat_table[PAT_WRITE_PROTECTED] = 5;
636 		pat_table[PAT_WRITE_COMBINING] = 6;
637 	} else {
638 		/*
639 		 * Just replace PAT Index 2 with WC instead of UC-.
640 		 */
641 		pat_msr &= ~PAT_MASK(2);
642 		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
643 		pat_table[PAT_WRITE_COMBINING] = 2;
644 	}
645 
646 	/* Disable PGE. */
647 	cr4 = rcr4();
648 	load_cr4(cr4 & ~CR4_PGE);
649 
650 	/* Disable caches (CD = 1, NW = 0). */
651 	cr0 = rcr0();
652 	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
653 
654 	/* Flushes caches and TLBs. */
655 	wbinvd();
656 	invltlb();
657 
658 	/* Update PAT and index table. */
659 	wrmsr(MSR_PAT, pat_msr);
660 	for (i = 0; i < PAT_INDEX_SIZE; i++)
661 		pat_index[i] = pat_table[i];
662 
663 	/* Flush caches and TLBs again. */
664 	wbinvd();
665 	invltlb();
666 
667 	/* Restore caches and PGE. */
668 	load_cr0(cr0);
669 	load_cr4(cr4);
670 }
671 
672 /*
673  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
674  */
675 static void
pmap_set_pg(void)676 pmap_set_pg(void)
677 {
678 	pt_entry_t *pte;
679 	vm_offset_t va, endva;
680 
681 	if (pgeflag == 0)
682 		return;
683 
684 	endva = KERNBASE + KERNend;
685 
686 	if (pseflag) {
687 		va = KERNBASE + KERNLOAD;
688 		while (va  < endva) {
689 			pdir_pde(PTD, va) |= pgeflag;
690 			invltlb();	/* Flush non-PG_G entries. */
691 			va += NBPDR;
692 		}
693 	} else {
694 		va = (vm_offset_t)btext;
695 		while (va < endva) {
696 			pte = vtopte(va);
697 			if (*pte)
698 				*pte |= pgeflag;
699 			invltlb();	/* Flush non-PG_G entries. */
700 			va += PAGE_SIZE;
701 		}
702 	}
703 }
704 
705 /*
706  * Initialize a vm_page's machine-dependent fields.
707  */
708 void
pmap_page_init(vm_page_t m)709 pmap_page_init(vm_page_t m)
710 {
711 
712 	TAILQ_INIT(&m->md.pv_list);
713 	m->md.pat_mode = PAT_WRITE_BACK;
714 }
715 
#if defined(PAE) || defined(PAE_TABLES)
/*
 * Backing allocator for the PDPT zone (installed by pmap_init()).
 * Requests "bytes" from kmem_alloc_contig() with an address range of
 * 0 through 0xffffffff and reports the slab as UMA_SLAB_KERNEL.
 * The "zone" argument is not used.
 */
static void *
pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
{
	void *mem;

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;

	mem = (void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT);
	return (mem);
}
#endif
727 
728 /*
729  * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
730  * Requirements:
731  *  - Must deal with pages in order to ensure that none of the PG_* bits
732  *    are ever set, PG_V in particular.
733  *  - Assumes we can write to ptes without pte_store() atomic ops, even
734  *    on PAE systems.  This should be ok.
735  *  - Assumes nothing will ever test these addresses for 0 to indicate
736  *    no mapping instead of correctly checking PG_V.
737  *  - Assumes a vm_offset_t will fit in a pte (true for i386).
738  * Because PG_V is never set, there can be no mappings to invalidate.
739  */
740 static vm_offset_t
pmap_ptelist_alloc(vm_offset_t * head)741 pmap_ptelist_alloc(vm_offset_t *head)
742 {
743 	pt_entry_t *pte;
744 	vm_offset_t va;
745 
746 	va = *head;
747 	if (va == 0)
748 		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
749 	pte = vtopte(va);
750 	*head = *pte;
751 	if (*head & PG_V)
752 		panic("pmap_ptelist_alloc: va with PG_V set!");
753 	*pte = 0;
754 	return (va);
755 }
756 
757 static void
pmap_ptelist_free(vm_offset_t * head,vm_offset_t va)758 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
759 {
760 	pt_entry_t *pte;
761 
762 	if (va & PG_V)
763 		panic("pmap_ptelist_free: freeing va with PG_V set!");
764 	pte = vtopte(va);
765 	*pte = *head;		/* virtual! PG_V is 0 though */
766 	*head = va;
767 }
768 
769 static void
pmap_ptelist_init(vm_offset_t * head,void * base,int npages)770 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
771 {
772 	int i;
773 	vm_offset_t va;
774 
775 	*head = 0;
776 	for (i = npages - 1; i >= 0; i--) {
777 		va = (vm_offset_t)base + i * PAGE_SIZE;
778 		pmap_ptelist_free(head, va);
779 	}
780 }
781 
782 
783 /*
784  *	Initialize the pmap module.
785  *	Called by vm_init, to initialize any structures that the pmap
786  *	system needs to map virtual memory.
787  */
788 void
pmap_init(void)789 pmap_init(void)
790 {
791 	struct pmap_preinit_mapping *ppim;
792 	vm_page_t mpte;
793 	vm_size_t s;
794 	int i, pv_npg;
795 
796 	/*
797 	 * Initialize the vm page array entries for the kernel pmap's
798 	 * page table pages.
799 	 */
800 	PMAP_LOCK(kernel_pmap);
801 	for (i = 0; i < NKPT; i++) {
802 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
803 		KASSERT(mpte >= vm_page_array &&
804 		    mpte < &vm_page_array[vm_page_array_size],
805 		    ("pmap_init: page table page is out of range"));
806 		mpte->pindex = i + KPTDI;
807 		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
808 		mpte->wire_count = 1;
809 		if (pseflag != 0 &&
810 		    KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend &&
811 		    pmap_insert_pt_page(kernel_pmap, mpte))
812 			panic("pmap_init: pmap_insert_pt_page failed");
813 	}
814 	PMAP_UNLOCK(kernel_pmap);
815 	atomic_add_int(&vm_cnt.v_wire_count, NKPT);
816 
817 	/*
818 	 * Initialize the address space (zone) for the pv entries.  Set a
819 	 * high water mark so that the system can recover from excessive
820 	 * numbers of pv entries.
821 	 */
822 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
823 	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
824 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
825 	pv_entry_max = roundup(pv_entry_max, _NPCPV);
826 	pv_entry_high_water = 9 * (pv_entry_max / 10);
827 
828 	/*
829 	 * If the kernel is running on a virtual machine, then it must assume
830 	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
831 	 * be prepared for the hypervisor changing the vendor and family that
832 	 * are reported by CPUID.  Consequently, the workaround for AMD Family
833 	 * 10h Erratum 383 is enabled if the processor's feature set does not
834 	 * include at least one feature that is only supported by older Intel
835 	 * or newer AMD processors.
836 	 */
837 	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
838 	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
839 	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
840 	    AMDID2_FMA4)) == 0)
841 		workaround_erratum383 = 1;
842 
843 	/*
844 	 * Are large page mappings supported and enabled?
845 	 */
846 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
847 	if (pseflag == 0)
848 		pg_ps_enabled = 0;
849 	else if (pg_ps_enabled) {
850 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
851 		    ("pmap_init: can't assign to pagesizes[1]"));
852 		pagesizes[1] = NBPDR;
853 	}
854 
855 	/*
856 	 * Calculate the size of the pv head table for superpages.
857 	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
858 	 */
859 	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
860 	    PAGE_SIZE) / NBPDR + 1;
861 
862 	/*
863 	 * Allocate memory for the pv head table for superpages.
864 	 */
865 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
866 	s = round_page(s);
867 	pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
868 	    M_WAITOK | M_ZERO);
869 	for (i = 0; i < pv_npg; i++)
870 		TAILQ_INIT(&pv_table[i].pv_list);
871 
872 	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
873 	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
874 	if (pv_chunkbase == NULL)
875 		panic("pmap_init: not enough kvm for pv chunks");
876 	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
877 #if defined(PAE) || defined(PAE_TABLES)
878 	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
879 	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
880 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
881 	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
882 #endif
883 
884 	pmap_initialized = 1;
885 	if (!bootverbose)
886 		return;
887 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
888 		ppim = pmap_preinit_mapping + i;
889 		if (ppim->va == 0)
890 			continue;
891 		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
892 		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
893 	}
894 }
895 
896 
897 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
898 	"Max number of PV entries");
899 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
900 	"Page share factor per proc");
901 
902 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
903     "2/4MB page mapping counters");
904 
905 static u_long pmap_pde_demotions;
906 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
907     &pmap_pde_demotions, 0, "2/4MB page demotions");
908 
909 static u_long pmap_pde_mappings;
910 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
911     &pmap_pde_mappings, 0, "2/4MB page mappings");
912 
913 static u_long pmap_pde_p_failures;
914 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
915     &pmap_pde_p_failures, 0, "2/4MB page promotion failures");
916 
917 static u_long pmap_pde_promotions;
918 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
919     &pmap_pde_promotions, 0, "2/4MB page promotions");
920 
921 /***************************************************
922  * Low level helper routines.....
923  ***************************************************/
924 
925 /*
926  * Determine the appropriate bits to set in a PTE or PDE for a specified
927  * caching mode.
928  */
929 int
pmap_cache_bits(int mode,boolean_t is_pde)930 pmap_cache_bits(int mode, boolean_t is_pde)
931 {
932 	int cache_bits, pat_flag, pat_idx;
933 
934 	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
935 		panic("Unknown caching mode %d\n", mode);
936 
937 	/* The PAT bit is different for PTE's and PDE's. */
938 	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
939 
940 	/* Map the caching mode to a PAT index. */
941 	pat_idx = pat_index[mode];
942 
943 	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
944 	cache_bits = 0;
945 	if (pat_idx & 0x4)
946 		cache_bits |= pat_flag;
947 	if (pat_idx & 0x2)
948 		cache_bits |= PG_NC_PCD;
949 	if (pat_idx & 0x1)
950 		cache_bits |= PG_NC_PWT;
951 	return (cache_bits);
952 }
953 
954 /*
955  * The caller is responsible for maintaining TLB consistency.
956  */
957 static void
pmap_kenter_pde(vm_offset_t va,pd_entry_t newpde)958 pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
959 {
960 	pd_entry_t *pde;
961 	pmap_t pmap;
962 	boolean_t PTD_updated;
963 
964 	PTD_updated = FALSE;
965 	mtx_lock_spin(&allpmaps_lock);
966 	LIST_FOREACH(pmap, &allpmaps, pm_list) {
967 		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
968 		    PG_FRAME))
969 			PTD_updated = TRUE;
970 		pde = pmap_pde(pmap, va);
971 		pde_store(pde, newpde);
972 	}
973 	mtx_unlock_spin(&allpmaps_lock);
974 	KASSERT(PTD_updated,
975 	    ("pmap_kenter_pde: current page table is not in allpmaps"));
976 }
977 
978 /*
979  * After changing the page size for the specified virtual address in the page
980  * table, flush the corresponding entries from the processor's TLB.  Only the
981  * calling processor's TLB is affected.
982  *
983  * The calling thread must be pinned to a processor.
984  */
985 static void
pmap_update_pde_invalidate(vm_offset_t va,pd_entry_t newpde)986 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
987 {
988 	u_long cr4;
989 
990 	if ((newpde & PG_PS) == 0)
991 		/* Demotion: flush a specific 2MB page mapping. */
992 		invlpg(va);
993 	else if ((newpde & PG_G) == 0)
994 		/*
995 		 * Promotion: flush every 4KB page mapping from the TLB
996 		 * because there are too many to flush individually.
997 		 */
998 		invltlb();
999 	else {
1000 		/*
1001 		 * Promotion: flush every 4KB page mapping from the TLB,
1002 		 * including any global (PG_G) mappings.
1003 		 */
1004 		cr4 = rcr4();
1005 		load_cr4(cr4 & ~CR4_PGE);
1006 		/*
1007 		 * Although preemption at this point could be detrimental to
1008 		 * performance, it would not lead to an error.  PG_G is simply
1009 		 * ignored if CR4.PGE is clear.  Moreover, in case this block
1010 		 * is re-entered, the load_cr4() either above or below will
1011 		 * modify CR4.PGE flushing the TLB.
1012 		 */
1013 		load_cr4(cr4 | CR4_PGE);
1014 	}
1015 }
1016 
1017 void
invltlb_glob(void)1018 invltlb_glob(void)
1019 {
1020 	uint64_t cr4;
1021 
1022 	if (pgeflag == 0) {
1023 		invltlb();
1024 	} else {
1025 		cr4 = rcr4();
1026 		load_cr4(cr4 & ~CR4_PGE);
1027 		load_cr4(cr4 | CR4_PGE);
1028 	}
1029 }
1030 
1031 
1032 #ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
1052 void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va)1053 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1054 {
1055 	cpuset_t *mask, other_cpus;
1056 	u_int cpuid;
1057 
1058 	sched_pin();
1059 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1060 		invlpg(va);
1061 		mask = &all_cpus;
1062 	} else {
1063 		cpuid = PCPU_GET(cpuid);
1064 		other_cpus = all_cpus;
1065 		CPU_CLR(cpuid, &other_cpus);
1066 		if (CPU_ISSET(cpuid, &pmap->pm_active))
1067 			invlpg(va);
1068 		CPU_AND(&other_cpus, &pmap->pm_active);
1069 		mask = &other_cpus;
1070 	}
1071 	smp_masked_invlpg(*mask, va, pmap);
1072 	sched_unpin();
1073 }
1074 
1075 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
1076 #define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
1077 
1078 void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)1079 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1080 {
1081 	cpuset_t *mask, other_cpus;
1082 	vm_offset_t addr;
1083 	u_int cpuid;
1084 
1085 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
1086 		pmap_invalidate_all(pmap);
1087 		return;
1088 	}
1089 
1090 	sched_pin();
1091 	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
1092 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1093 			invlpg(addr);
1094 		mask = &all_cpus;
1095 	} else {
1096 		cpuid = PCPU_GET(cpuid);
1097 		other_cpus = all_cpus;
1098 		CPU_CLR(cpuid, &other_cpus);
1099 		if (CPU_ISSET(cpuid, &pmap->pm_active))
1100 			for (addr = sva; addr < eva; addr += PAGE_SIZE)
1101 				invlpg(addr);
1102 		CPU_AND(&other_cpus, &pmap->pm_active);
1103 		mask = &other_cpus;
1104 	}
1105 	smp_masked_invlpg_range(*mask, sva, eva, pmap);
1106 	sched_unpin();
1107 }
1108 
1109 void
pmap_invalidate_all(pmap_t pmap)1110 pmap_invalidate_all(pmap_t pmap)
1111 {
1112 	cpuset_t *mask, other_cpus;
1113 	u_int cpuid;
1114 
1115 	sched_pin();
1116 	if (pmap == kernel_pmap) {
1117 		invltlb_glob();
1118 		mask = &all_cpus;
1119 	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
1120 		invltlb();
1121 		mask = &all_cpus;
1122 	} else {
1123 		cpuid = PCPU_GET(cpuid);
1124 		other_cpus = all_cpus;
1125 		CPU_CLR(cpuid, &other_cpus);
1126 		if (CPU_ISSET(cpuid, &pmap->pm_active))
1127 			invltlb();
1128 		CPU_AND(&other_cpus, &pmap->pm_active);
1129 		mask = &other_cpus;
1130 	}
1131 	smp_masked_invltlb(*mask, pmap);
1132 	sched_unpin();
1133 }
1134 
/*
 * Execute WBINVD on the calling processor and then call smp_cache_flush(),
 * with the thread pinned for the duration.
 */
void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();		/* this processor */
	smp_cache_flush();
	sched_unpin();
}
1144 
1145 struct pde_action {
1146 	cpuset_t invalidate;	/* processors that invalidate their TLB */
1147 	vm_offset_t va;
1148 	pd_entry_t *pde;
1149 	pd_entry_t newpde;
1150 	u_int store;		/* processor that updates the PDE */
1151 };
1152 
1153 static void
pmap_update_pde_kernel(void * arg)1154 pmap_update_pde_kernel(void *arg)
1155 {
1156 	struct pde_action *act = arg;
1157 	pd_entry_t *pde;
1158 	pmap_t pmap;
1159 
1160 	if (act->store == PCPU_GET(cpuid)) {
1161 
1162 		/*
1163 		 * Elsewhere, this operation requires allpmaps_lock for
1164 		 * synchronization.  Here, it does not because it is being
1165 		 * performed in the context of an all_cpus rendezvous.
1166 		 */
1167 		LIST_FOREACH(pmap, &allpmaps, pm_list) {
1168 			pde = pmap_pde(pmap, act->va);
1169 			pde_store(pde, act->newpde);
1170 		}
1171 	}
1172 }
1173 
1174 static void
pmap_update_pde_user(void * arg)1175 pmap_update_pde_user(void *arg)
1176 {
1177 	struct pde_action *act = arg;
1178 
1179 	if (act->store == PCPU_GET(cpuid))
1180 		pde_store(act->pde, act->newpde);
1181 }
1182 
1183 static void
pmap_update_pde_teardown(void * arg)1184 pmap_update_pde_teardown(void *arg)
1185 {
1186 	struct pde_action *act = arg;
1187 
1188 	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1189 		pmap_update_pde_invalidate(act->va, act->newpde);
1190 }
1191 
1192 /*
1193  * Change the page size for the specified virtual address in a way that
1194  * prevents any possibility of the TLB ever having two entries that map the
1195  * same virtual address using different page sizes.  This is the recommended
1196  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1197  * machine check exception for a TLB state that is improperly diagnosed as a
1198  * hardware error.
1199  */
1200 static void
pmap_update_pde(pmap_t pmap,vm_offset_t va,pd_entry_t * pde,pd_entry_t newpde)1201 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1202 {
1203 	struct pde_action act;
1204 	cpuset_t active, other_cpus;
1205 	u_int cpuid;
1206 
1207 	sched_pin();
1208 	cpuid = PCPU_GET(cpuid);
1209 	other_cpus = all_cpus;
1210 	CPU_CLR(cpuid, &other_cpus);
1211 	if (pmap == kernel_pmap)
1212 		active = all_cpus;
1213 	else
1214 		active = pmap->pm_active;
1215 	if (CPU_OVERLAP(&active, &other_cpus)) {
1216 		act.store = cpuid;
1217 		act.invalidate = active;
1218 		act.va = va;
1219 		act.pde = pde;
1220 		act.newpde = newpde;
1221 		CPU_SET(cpuid, &active);
1222 		smp_rendezvous_cpus(active,
1223 		    smp_no_rendezvous_barrier, pmap == kernel_pmap ?
1224 		    pmap_update_pde_kernel : pmap_update_pde_user,
1225 		    pmap_update_pde_teardown, &act);
1226 	} else {
1227 		if (pmap == kernel_pmap)
1228 			pmap_kenter_pde(va, newpde);
1229 		else
1230 			pde_store(pde, newpde);
1231 		if (CPU_ISSET(cpuid, &active))
1232 			pmap_update_pde_invalidate(va, newpde);
1233 	}
1234 	sched_unpin();
1235 }
1236 #else /* !SMP */
1237 /*
1238  * Normal, non-SMP, 486+ invalidation functions.
1239  * We inline these within pmap.c for speed.
1240  */
1241 PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va)1242 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1243 {
1244 
1245 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1246 		invlpg(va);
1247 }
1248 
1249 PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)1250 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1251 {
1252 	vm_offset_t addr;
1253 
1254 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1255 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
1256 			invlpg(addr);
1257 }
1258 
1259 PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)1260 pmap_invalidate_all(pmap_t pmap)
1261 {
1262 
1263 	if (pmap == kernel_pmap)
1264 		invltlb_glob();
1265 	else if (!CPU_EMPTY(&pmap->pm_active))
1266 		invltlb();
1267 }
1268 
1269 PMAP_INLINE void
pmap_invalidate_cache(void)1270 pmap_invalidate_cache(void)
1271 {
1272 
1273 	wbinvd();
1274 }
1275 
1276 static void
pmap_update_pde(pmap_t pmap,vm_offset_t va,pd_entry_t * pde,pd_entry_t newpde)1277 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1278 {
1279 
1280 	if (pmap == kernel_pmap)
1281 		pmap_kenter_pde(va, newpde);
1282 	else
1283 		pde_store(pde, newpde);
1284 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
1285 		pmap_update_pde_invalidate(va, newpde);
1286 }
1287 #endif /* !SMP */
1288 
1289 static void
pmap_invalidate_pde_page(pmap_t pmap,vm_offset_t va,pd_entry_t pde)1290 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
1291 {
1292 
1293 	/*
1294 	 * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was
1295 	 * created by a promotion that did not invalidate the 512 or 1024 4KB
1296 	 * page mappings that might exist in the TLB.  Consequently, at this
1297 	 * point, the TLB may hold both 4KB and 2- or 4MB page mappings for
1298 	 * the address range [va, va + NBPDR).  Therefore, the entire range
1299 	 * must be invalidated here.  In contrast, when PG_PROMOTED is clear,
1300 	 * the TLB will not hold any 4KB page mappings for the address range
1301 	 * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the
1302 	 * 2- or 4MB page mapping from the TLB.
1303 	 */
1304 	if ((pde & PG_PROMOTED) != 0)
1305 		pmap_invalidate_range(pmap, va, va + NBPDR - 1);
1306 	else
1307 		pmap_invalidate_page(pmap, va);
1308 }
1309 
1310 #define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)
1311 
1312 void
pmap_invalidate_cache_range(vm_offset_t sva,vm_offset_t eva,boolean_t force)1313 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force)
1314 {
1315 
1316 	if (force) {
1317 		sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
1318 	} else {
1319 		KASSERT((sva & PAGE_MASK) == 0,
1320 		    ("pmap_invalidate_cache_range: sva not page-aligned"));
1321 		KASSERT((eva & PAGE_MASK) == 0,
1322 		    ("pmap_invalidate_cache_range: eva not page-aligned"));
1323 	}
1324 
1325 	if ((cpu_feature & CPUID_SS) != 0 && !force)
1326 		; /* If "Self Snoop" is supported and allowed, do nothing. */
1327 	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 &&
1328 	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1329 #ifdef DEV_APIC
1330 		/*
1331 		 * XXX: Some CPUs fault, hang, or trash the local APIC
1332 		 * registers if we use CLFLUSH on the local APIC
1333 		 * range.  The local APIC is always uncached, so we
1334 		 * don't need to flush for that range anyway.
1335 		 */
1336 		if (pmap_kextract(sva) == lapic_paddr)
1337 			return;
1338 #endif
1339 		/*
1340 		 * Otherwise, do per-cache line flush.  Use the sfence
1341 		 * instruction to insure that previous stores are
1342 		 * included in the write-back.  The processor
1343 		 * propagates flush to other processors in the cache
1344 		 * coherence domain.
1345 		 */
1346 		sfence();
1347 		for (; sva < eva; sva += cpu_clflush_line_size)
1348 			clflushopt(sva);
1349 		sfence();
1350 	} else if ((cpu_feature & CPUID_CLFSH) != 0 &&
1351 	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {
1352 #ifdef DEV_APIC
1353 		if (pmap_kextract(sva) == lapic_paddr)
1354 			return;
1355 #endif
1356 		/*
1357 		 * Writes are ordered by CLFLUSH on Intel CPUs.
1358 		 */
1359 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1360 			mfence();
1361 		for (; sva < eva; sva += cpu_clflush_line_size)
1362 			clflush(sva);
1363 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
1364 			mfence();
1365 	} else {
1366 
1367 		/*
1368 		 * No targeted cache flush methods are supported by CPU,
1369 		 * or the supplied range is bigger than 2MB.
1370 		 * Globally invalidate cache.
1371 		 */
1372 		pmap_invalidate_cache();
1373 	}
1374 }
1375 
1376 void
pmap_invalidate_cache_pages(vm_page_t * pages,int count)1377 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
1378 {
1379 	int i;
1380 
1381 	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
1382 	    (cpu_feature & CPUID_CLFSH) == 0) {
1383 		pmap_invalidate_cache();
1384 	} else {
1385 		for (i = 0; i < count; i++)
1386 			pmap_flush_page(pages[i]);
1387 	}
1388 }
1389 
1390 /*
1391  * Are we current address space or kernel?
1392  */
1393 static __inline int
pmap_is_current(pmap_t pmap)1394 pmap_is_current(pmap_t pmap)
1395 {
1396 
1397 	return (pmap == kernel_pmap || pmap ==
1398 	    vmspace_pmap(curthread->td_proc->p_vmspace));
1399 }
1400 
1401 /*
1402  * If the given pmap is not the current or kernel pmap, the returned pte must
1403  * be released by passing it to pmap_pte_release().
1404  */
1405 pt_entry_t *
pmap_pte(pmap_t pmap,vm_offset_t va)1406 pmap_pte(pmap_t pmap, vm_offset_t va)
1407 {
1408 	pd_entry_t newpf;
1409 	pd_entry_t *pde;
1410 
1411 	pde = pmap_pde(pmap, va);
1412 	if (*pde & PG_PS)
1413 		return (pde);
1414 	if (*pde != 0) {
1415 		/* are we current address space or kernel? */
1416 		if (pmap_is_current(pmap))
1417 			return (vtopte(va));
1418 		mtx_lock(&PMAP2mutex);
1419 		newpf = *pde & PG_FRAME;
1420 		if ((*PMAP2 & PG_FRAME) != newpf) {
1421 			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
1422 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
1423 		}
1424 		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
1425 	}
1426 	return (NULL);
1427 }
1428 
1429 /*
1430  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
1431  * being NULL.
1432  */
1433 static __inline void
pmap_pte_release(pt_entry_t * pte)1434 pmap_pte_release(pt_entry_t *pte)
1435 {
1436 
1437 	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
1438 		mtx_unlock(&PMAP2mutex);
1439 }
1440 
1441 /*
1442  * NB:  The sequence of updating a page table followed by accesses to the
1443  * corresponding pages is subject to the situation described in the "AMD64
1444  * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
1445  * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
1446  * right after modifying the PTE bits is crucial.
1447  */
1448 static __inline void
invlcaddr(void * caddr)1449 invlcaddr(void *caddr)
1450 {
1451 
1452 	invlpg((u_int)caddr);
1453 }
1454 
1455 /*
1456  * Super fast pmap_pte routine best used when scanning
1457  * the pv lists.  This eliminates many coarse-grained
1458  * invltlb calls.  Note that many of the pv list
1459  * scans are across different pmaps.  It is very wasteful
1460  * to do an entire invltlb for checking a single mapping.
1461  *
1462  * If the given pmap is not the current pmap, pvh_global_lock
1463  * must be held and curthread pinned to a CPU.
1464  */
1465 static pt_entry_t *
pmap_pte_quick(pmap_t pmap,vm_offset_t va)1466 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
1467 {
1468 	pd_entry_t newpf;
1469 	pd_entry_t *pde;
1470 
1471 	pde = pmap_pde(pmap, va);
1472 	if (*pde & PG_PS)
1473 		return (pde);
1474 	if (*pde != 0) {
1475 		/* are we current address space or kernel? */
1476 		if (pmap_is_current(pmap))
1477 			return (vtopte(va));
1478 		rw_assert(&pvh_global_lock, RA_WLOCKED);
1479 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1480 		newpf = *pde & PG_FRAME;
1481 		if ((*PMAP1 & PG_FRAME) != newpf) {
1482 			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
1483 #ifdef SMP
1484 			PMAP1cpu = PCPU_GET(cpuid);
1485 #endif
1486 			invlcaddr(PADDR1);
1487 			PMAP1changed++;
1488 		} else
1489 #ifdef SMP
1490 		if (PMAP1cpu != PCPU_GET(cpuid)) {
1491 			PMAP1cpu = PCPU_GET(cpuid);
1492 			invlcaddr(PADDR1);
1493 			PMAP1changedcpu++;
1494 		} else
1495 #endif
1496 			PMAP1unchanged++;
1497 		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
1498 	}
1499 	return (0);
1500 }
1501 
1502 /*
1503  *	Routine:	pmap_extract
1504  *	Function:
1505  *		Extract the physical page address associated
1506  *		with the given map/virtual_address pair.
1507  */
1508 vm_paddr_t
pmap_extract(pmap_t pmap,vm_offset_t va)1509 pmap_extract(pmap_t pmap, vm_offset_t va)
1510 {
1511 	vm_paddr_t rtval;
1512 	pt_entry_t *pte;
1513 	pd_entry_t pde;
1514 
1515 	rtval = 0;
1516 	PMAP_LOCK(pmap);
1517 	pde = pmap->pm_pdir[va >> PDRSHIFT];
1518 	if (pde != 0) {
1519 		if ((pde & PG_PS) != 0)
1520 			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
1521 		else {
1522 			pte = pmap_pte(pmap, va);
1523 			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1524 			pmap_pte_release(pte);
1525 		}
1526 	}
1527 	PMAP_UNLOCK(pmap);
1528 	return (rtval);
1529 }
1530 
1531 /*
1532  *	Routine:	pmap_extract_and_hold
1533  *	Function:
1534  *		Atomically extract and hold the physical page
1535  *		with the given pmap and virtual address pair
1536  *		if that mapping permits the given protection.
1537  */
1538 vm_page_t
pmap_extract_and_hold(pmap_t pmap,vm_offset_t va,vm_prot_t prot)1539 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1540 {
1541 	pd_entry_t pde;
1542 	pt_entry_t pte, *ptep;
1543 	vm_page_t m;
1544 	vm_paddr_t pa;
1545 
1546 	pa = 0;
1547 	m = NULL;
1548 	PMAP_LOCK(pmap);
1549 retry:
1550 	pde = *pmap_pde(pmap, va);
1551 	if (pde != 0) {
1552 		if (pde & PG_PS) {
1553 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1554 				if (vm_page_pa_tryrelock(pmap, (pde &
1555 				    PG_PS_FRAME) | (va & PDRMASK), &pa))
1556 					goto retry;
1557 				m = PHYS_TO_VM_PAGE(pa);
1558 			}
1559 		} else {
1560 			ptep = pmap_pte(pmap, va);
1561 			pte = *ptep;
1562 			pmap_pte_release(ptep);
1563 			if (pte != 0 &&
1564 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1565 				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
1566 				    &pa))
1567 					goto retry;
1568 				m = PHYS_TO_VM_PAGE(pa);
1569 			}
1570 		}
1571 		if (m != NULL)
1572 			vm_page_hold(m);
1573 	}
1574 	PA_UNLOCK_COND(pa);
1575 	PMAP_UNLOCK(pmap);
1576 	return (m);
1577 }
1578 
1579 /***************************************************
1580  * Low level mapping routines.....
1581  ***************************************************/
1582 
1583 /*
1584  * Add a wired page to the kva.
1585  * Note: not SMP coherent.
1586  *
1587  * This function may be used before pmap_bootstrap() is called.
1588  */
1589 PMAP_INLINE void
pmap_kenter(vm_offset_t va,vm_paddr_t pa)1590 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1591 {
1592 	pt_entry_t *pte;
1593 
1594 	pte = vtopte(va);
1595 	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1596 }
1597 
1598 static __inline void
pmap_kenter_attr(vm_offset_t va,vm_paddr_t pa,int mode)1599 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1600 {
1601 	pt_entry_t *pte;
1602 
1603 	pte = vtopte(va);
1604 	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1605 }
1606 
1607 /*
1608  * Remove a page from the kernel pagetables.
1609  * Note: not SMP coherent.
1610  *
1611  * This function may be used before pmap_bootstrap() is called.
1612  */
1613 PMAP_INLINE void
pmap_kremove(vm_offset_t va)1614 pmap_kremove(vm_offset_t va)
1615 {
1616 	pt_entry_t *pte;
1617 
1618 	pte = vtopte(va);
1619 	pte_clear(pte);
1620 }
1621 
1622 /*
1623  *	Used to map a range of physical addresses into kernel
1624  *	virtual address space.
1625  *
1626  *	The value passed in '*virt' is a suggested virtual address for
1627  *	the mapping. Architectures which can support a direct-mapped
1628  *	physical to virtual region can return the appropriate address
1629  *	within that region, leaving '*virt' unchanged. Other
1630  *	architectures should map the pages starting at '*virt' and
1631  *	update '*virt' with the first usable address after the mapped
1632  *	region.
1633  */
1634 vm_offset_t
pmap_map(vm_offset_t * virt,vm_paddr_t start,vm_paddr_t end,int prot)1635 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1636 {
1637 	vm_offset_t va, sva;
1638 	vm_paddr_t superpage_offset;
1639 	pd_entry_t newpde;
1640 
1641 	va = *virt;
1642 	/*
1643 	 * Does the physical address range's size and alignment permit at
1644 	 * least one superpage mapping to be created?
1645 	 */
1646 	superpage_offset = start & PDRMASK;
1647 	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
1648 		/*
1649 		 * Increase the starting virtual address so that its alignment
1650 		 * does not preclude the use of superpage mappings.
1651 		 */
1652 		if ((va & PDRMASK) < superpage_offset)
1653 			va = (va & ~PDRMASK) + superpage_offset;
1654 		else if ((va & PDRMASK) > superpage_offset)
1655 			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
1656 	}
1657 	sva = va;
1658 	while (start < end) {
1659 		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
1660 		    pseflag) {
1661 			KASSERT((va & PDRMASK) == 0,
1662 			    ("pmap_map: misaligned va %#x", va));
1663 			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
1664 			pmap_kenter_pde(va, newpde);
1665 			va += NBPDR;
1666 			start += NBPDR;
1667 		} else {
1668 			pmap_kenter(va, start);
1669 			va += PAGE_SIZE;
1670 			start += PAGE_SIZE;
1671 		}
1672 	}
1673 	pmap_invalidate_range(kernel_pmap, sva, va);
1674 	*virt = va;
1675 	return (sva);
1676 }
1677 
1678 
1679 /*
1680  * Add a list of wired pages to the kva
1681  * this routine is only used for temporary
1682  * kernel mappings that do not need to have
1683  * page modification or references recorded.
1684  * Note that old mappings are simply written
1685  * over.  The page *must* be wired.
1686  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1687  */
1688 void
pmap_qenter(vm_offset_t sva,vm_page_t * ma,int count)1689 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1690 {
1691 	pt_entry_t *endpte, oldpte, pa, *pte;
1692 	vm_page_t m;
1693 
1694 	oldpte = 0;
1695 	pte = vtopte(sva);
1696 	endpte = pte + count;
1697 	while (pte < endpte) {
1698 		m = *ma++;
1699 		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
1700 		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
1701 			oldpte |= *pte;
1702 			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
1703 		}
1704 		pte++;
1705 	}
1706 	if (__predict_false((oldpte & PG_V) != 0))
1707 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
1708 		    PAGE_SIZE);
1709 }
1710 
1711 /*
1712  * This routine tears out page mappings from the
1713  * kernel -- it is meant only for temporary mappings.
1714  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1715  */
1716 void
pmap_qremove(vm_offset_t sva,int count)1717 pmap_qremove(vm_offset_t sva, int count)
1718 {
1719 	vm_offset_t va;
1720 
1721 	va = sva;
1722 	while (count-- > 0) {
1723 		pmap_kremove(va);
1724 		va += PAGE_SIZE;
1725 	}
1726 	pmap_invalidate_range(kernel_pmap, sva, va);
1727 }
1728 
1729 /***************************************************
1730  * Page table page management routines.....
1731  ***************************************************/
1732 static __inline void
pmap_free_zero_pages(struct spglist * free)1733 pmap_free_zero_pages(struct spglist *free)
1734 {
1735 	vm_page_t m;
1736 	int count;
1737 
1738 	for (count = 0; (m = SLIST_FIRST(free)) != NULL; count++) {
1739 		SLIST_REMOVE_HEAD(free, plinks.s.ss);
1740 		/* Preserve the page's PG_ZERO setting. */
1741 		vm_page_free_toq(m);
1742 	}
1743 	atomic_subtract_int(&vm_cnt.v_wire_count, count);
1744 }
1745 
1746 /*
1747  * Schedule the specified unused page table page to be freed.  Specifically,
1748  * add the page to the specified list of pages that will be released to the
1749  * physical memory manager after the TLB has been updated.
1750  */
1751 static __inline void
pmap_add_delayed_free_list(vm_page_t m,struct spglist * free,boolean_t set_PG_ZERO)1752 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1753     boolean_t set_PG_ZERO)
1754 {
1755 
1756 	if (set_PG_ZERO)
1757 		m->flags |= PG_ZERO;
1758 	else
1759 		m->flags &= ~PG_ZERO;
1760 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1761 }
1762 
1763 /*
1764  * Inserts the specified page table page into the specified pmap's collection
1765  * of idle page table pages.  Each of a pmap's page table pages is responsible
1766  * for mapping a distinct range of virtual addresses.  The pmap's collection is
1767  * ordered by this virtual address range.
1768  */
1769 static __inline int
pmap_insert_pt_page(pmap_t pmap,vm_page_t mpte)1770 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
1771 {
1772 
1773 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1774 	return (vm_radix_insert(&pmap->pm_root, mpte));
1775 }
1776 
1777 /*
1778  * Removes the page table page mapping the specified virtual address from the
1779  * specified pmap's collection of idle page table pages, and returns it.
1780  * Otherwise, returns NULL if there is no page table page corresponding to the
1781  * specified virtual address.
1782  */
1783 static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap,vm_offset_t va)1784 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1785 {
1786 
1787 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1788 	return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
1789 }
1790 
1791 /*
1792  * Decrements a page table page's wire count, which is used to record the
1793  * number of valid page table entries within the page.  If the wire count
1794  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1795  * page table page was unmapped and FALSE otherwise.
1796  */
1797 static inline boolean_t
pmap_unwire_ptp(pmap_t pmap,vm_page_t m,struct spglist * free)1798 pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1799 {
1800 
1801 	--m->wire_count;
1802 	if (m->wire_count == 0) {
1803 		_pmap_unwire_ptp(pmap, m, free);
1804 		return (TRUE);
1805 	} else
1806 		return (FALSE);
1807 }
1808 
1809 static void
_pmap_unwire_ptp(pmap_t pmap,vm_page_t m,struct spglist * free)1810 _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
1811 {
1812 	vm_offset_t pteva;
1813 
1814 	/*
1815 	 * unmap the page table page
1816 	 */
1817 	pmap->pm_pdir[m->pindex] = 0;
1818 	--pmap->pm_stats.resident_count;
1819 
1820 	/*
1821 	 * Do an invltlb to make the invalidated mapping
1822 	 * take effect immediately.
1823 	 */
1824 	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1825 	pmap_invalidate_page(pmap, pteva);
1826 
1827 	/*
1828 	 * Put page on a list so that it is released after
1829 	 * *ALL* TLB shootdown is done
1830 	 */
1831 	pmap_add_delayed_free_list(m, free, TRUE);
1832 }
1833 
1834 /*
1835  * After removing a page table entry, this routine is used to
1836  * conditionally free the page, and manage the hold/wire counts.
1837  */
1838 static int
pmap_unuse_pt(pmap_t pmap,vm_offset_t va,struct spglist * free)1839 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
1840 {
1841 	pd_entry_t ptepde;
1842 	vm_page_t mpte;
1843 
1844 	if (va >= VM_MAXUSER_ADDRESS)
1845 		return (0);
1846 	ptepde = *pmap_pde(pmap, va);
1847 	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1848 	return (pmap_unwire_ptp(pmap, mpte, free));
1849 }
1850 
1851 /*
1852  * Initialize the pmap for the swapper process.
1853  */
1854 void
pmap_pinit0(pmap_t pmap)1855 pmap_pinit0(pmap_t pmap)
1856 {
1857 
1858 	PMAP_LOCK_INIT(pmap);
1859 	/*
1860 	 * Since the page table directory is shared with the kernel pmap,
1861 	 * which is already included in the list "allpmaps", this pmap does
1862 	 * not need to be inserted into that list.
1863 	 */
1864 	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1865 #if defined(PAE) || defined(PAE_TABLES)
1866 	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1867 #endif
1868 	pmap->pm_root.rt_root = 0;
1869 	CPU_ZERO(&pmap->pm_active);
1870 	TAILQ_INIT(&pmap->pm_pvchunk);
1871 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1872 	pmap_activate_boot(pmap);
1873 }
1874 
1875 /*
1876  * Initialize a preallocated and zeroed pmap structure,
1877  * such as one in a vmspace structure.
1878  */
1879 int
pmap_pinit(pmap_t pmap)1880 pmap_pinit(pmap_t pmap)
1881 {
1882 	vm_page_t m, ptdpg[NPGPTD];
1883 	vm_paddr_t pa;
1884 	int i;
1885 
1886 	/*
1887 	 * No need to allocate page table space yet but we do need a valid
1888 	 * page directory table.
1889 	 */
1890 	if (pmap->pm_pdir == NULL) {
1891 		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
1892 		if (pmap->pm_pdir == NULL)
1893 			return (0);
1894 #if defined(PAE) || defined(PAE_TABLES)
1895 		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1896 		KASSERT(((vm_offset_t)pmap->pm_pdpt &
1897 		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1898 		    ("pmap_pinit: pdpt misaligned"));
1899 		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1900 		    ("pmap_pinit: pdpt above 4g"));
1901 #endif
1902 		pmap->pm_root.rt_root = 0;
1903 	}
1904 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
1905 	    ("pmap_pinit: pmap has reserved page table page(s)"));
1906 
1907 	/*
1908 	 * allocate the page directory page(s)
1909 	 */
1910 	for (i = 0; i < NPGPTD;) {
1911 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1912 		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1913 		if (m == NULL)
1914 			VM_WAIT;
1915 		else {
1916 			ptdpg[i++] = m;
1917 		}
1918 	}
1919 
1920 	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1921 
1922 	for (i = 0; i < NPGPTD; i++)
1923 		if ((ptdpg[i]->flags & PG_ZERO) == 0)
1924 			pagezero(pmap->pm_pdir + (i * NPDEPG));
1925 
1926 	mtx_lock_spin(&allpmaps_lock);
1927 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1928 	/* Copy the kernel page table directory entries. */
1929 	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1930 	mtx_unlock_spin(&allpmaps_lock);
1931 
1932 	/* install self-referential address mapping entry(s) */
1933 	for (i = 0; i < NPGPTD; i++) {
1934 		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1935 		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1936 #if defined(PAE) || defined(PAE_TABLES)
1937 		pmap->pm_pdpt[i] = pa | PG_V;
1938 #endif
1939 	}
1940 
1941 	CPU_ZERO(&pmap->pm_active);
1942 	TAILQ_INIT(&pmap->pm_pvchunk);
1943 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1944 
1945 	return (1);
1946 }
1947 
1948 /*
1949  * this routine is called if the page table page is not
1950  * mapped correctly.
1951  */
1952 static vm_page_t
_pmap_allocpte(pmap_t pmap,u_int ptepindex,u_int flags)1953 _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
1954 {
1955 	vm_paddr_t ptepa;
1956 	vm_page_t m;
1957 
1958 	/*
1959 	 * Allocate a page table page.
1960 	 */
1961 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1962 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1963 		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
1964 			PMAP_UNLOCK(pmap);
1965 			rw_wunlock(&pvh_global_lock);
1966 			VM_WAIT;
1967 			rw_wlock(&pvh_global_lock);
1968 			PMAP_LOCK(pmap);
1969 		}
1970 
1971 		/*
1972 		 * Indicate the need to retry.  While waiting, the page table
1973 		 * page may have been allocated.
1974 		 */
1975 		return (NULL);
1976 	}
1977 	if ((m->flags & PG_ZERO) == 0)
1978 		pmap_zero_page(m);
1979 
1980 	/*
1981 	 * Map the pagetable page into the process address space, if
1982 	 * it isn't already there.
1983 	 */
1984 
1985 	pmap->pm_stats.resident_count++;
1986 
1987 	ptepa = VM_PAGE_TO_PHYS(m);
1988 	pmap->pm_pdir[ptepindex] =
1989 		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1990 
1991 	return (m);
1992 }
1993 
1994 static vm_page_t
pmap_allocpte(pmap_t pmap,vm_offset_t va,u_int flags)1995 pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
1996 {
1997 	u_int ptepindex;
1998 	pd_entry_t ptepa;
1999 	vm_page_t m;
2000 
2001 	/*
2002 	 * Calculate pagetable page index
2003 	 */
2004 	ptepindex = va >> PDRSHIFT;
2005 retry:
2006 	/*
2007 	 * Get the page directory entry
2008 	 */
2009 	ptepa = pmap->pm_pdir[ptepindex];
2010 
2011 	/*
2012 	 * This supports switching from a 4MB page to a
2013 	 * normal 4K page.
2014 	 */
2015 	if (ptepa & PG_PS) {
2016 		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
2017 		ptepa = pmap->pm_pdir[ptepindex];
2018 	}
2019 
2020 	/*
2021 	 * If the page table page is mapped, we just increment the
2022 	 * hold count, and activate it.
2023 	 */
2024 	if (ptepa) {
2025 		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
2026 		m->wire_count++;
2027 	} else {
2028 		/*
2029 		 * Here if the pte page isn't mapped, or if it has
2030 		 * been deallocated.
2031 		 */
2032 		m = _pmap_allocpte(pmap, ptepindex, flags);
2033 		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
2034 			goto retry;
2035 	}
2036 	return (m);
2037 }
2038 
2039 
/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/
2043 
2044 /*
2045  * Release any resources held by the given physical map.
2046  * Called when a pmap initialized by pmap_pinit is being released.
2047  * Should only be called if the map contains no valid mappings.
2048  */
2049 void
pmap_release(pmap_t pmap)2050 pmap_release(pmap_t pmap)
2051 {
2052 	vm_page_t m, ptdpg[NPGPTD];
2053 	int i;
2054 
2055 	KASSERT(pmap->pm_stats.resident_count == 0,
2056 	    ("pmap_release: pmap resident count %ld != 0",
2057 	    pmap->pm_stats.resident_count));
2058 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2059 	    ("pmap_release: pmap has reserved page table page(s)"));
2060 	KASSERT(CPU_EMPTY(&pmap->pm_active),
2061 	    ("releasing active pmap %p", pmap));
2062 
2063 	mtx_lock_spin(&allpmaps_lock);
2064 	LIST_REMOVE(pmap, pm_list);
2065 	mtx_unlock_spin(&allpmaps_lock);
2066 
2067 	for (i = 0; i < NPGPTD; i++)
2068 		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
2069 		    PG_FRAME);
2070 
2071 	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
2072 	    sizeof(*pmap->pm_pdir));
2073 
2074 	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
2075 
2076 	for (i = 0; i < NPGPTD; i++) {
2077 		m = ptdpg[i];
2078 #if defined(PAE) || defined(PAE_TABLES)
2079 		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
2080 		    ("pmap_release: got wrong ptd page"));
2081 #endif
2082 		m->wire_count--;
2083 		vm_page_free_zero(m);
2084 	}
2085 	atomic_subtract_int(&vm_cnt.v_wire_count, NPGPTD);
2086 }
2087 
2088 static int
kvm_size(SYSCTL_HANDLER_ARGS)2089 kvm_size(SYSCTL_HANDLER_ARGS)
2090 {
2091 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
2092 
2093 	return (sysctl_handle_long(oidp, &ksize, 0, req));
2094 }
2095 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
2096     0, 0, kvm_size, "IU", "Size of KVM");
2097 
2098 static int
kvm_free(SYSCTL_HANDLER_ARGS)2099 kvm_free(SYSCTL_HANDLER_ARGS)
2100 {
2101 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2102 
2103 	return (sysctl_handle_long(oidp, &kfree, 0, req));
2104 }
2105 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
2106     0, 0, kvm_free, "IU", "Amount of KVM free");
2107 
2108 /*
2109  * grow the number of kernel page table entries, if needed
2110  */
2111 void
pmap_growkernel(vm_offset_t addr)2112 pmap_growkernel(vm_offset_t addr)
2113 {
2114 	vm_paddr_t ptppaddr;
2115 	vm_page_t nkpg;
2116 	pd_entry_t newpdir;
2117 
2118 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2119 	addr = roundup2(addr, NBPDR);
2120 	if (addr - 1 >= vm_map_max(kernel_map))
2121 		addr = vm_map_max(kernel_map);
2122 	while (kernel_vm_end < addr) {
2123 		if (pdir_pde(PTD, kernel_vm_end)) {
2124 			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2125 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2126 				kernel_vm_end = vm_map_max(kernel_map);
2127 				break;
2128 			}
2129 			continue;
2130 		}
2131 
2132 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
2133 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2134 		    VM_ALLOC_ZERO);
2135 		if (nkpg == NULL)
2136 			panic("pmap_growkernel: no memory to grow kernel");
2137 
2138 		nkpt++;
2139 
2140 		if ((nkpg->flags & PG_ZERO) == 0)
2141 			pmap_zero_page(nkpg);
2142 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
2143 		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
2144 		pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir;
2145 
2146 		pmap_kenter_pde(kernel_vm_end, newpdir);
2147 		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
2148 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2149 			kernel_vm_end = vm_map_max(kernel_map);
2150 			break;
2151 		}
2152 	}
2153 }
2154 
2155 
2156 /***************************************************
2157  * page management routines.
2158  ***************************************************/
2159 
2160 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2161 CTASSERT(_NPCM == 11);
2162 CTASSERT(_NPCPV == 336);
2163 
2164 static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)2165 pv_to_chunk(pv_entry_t pv)
2166 {
2167 
2168 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2169 }
2170 
2171 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2172 
2173 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
2174 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
2175 
2176 static const uint32_t pc_freemask[_NPCM] = {
2177 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2178 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2179 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
2180 	PC_FREE0_9, PC_FREE10
2181 };
2182 
2183 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2184 	"Current number of pv entries");
2185 
2186 #ifdef PV_STATS
2187 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2188 
2189 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2190 	"Current number of pv entry chunks");
2191 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2192 	"Current number of pv entry chunks allocated");
2193 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2194 	"Current number of pv entry chunks frees");
2195 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2196 	"Number of times tried to get a chunk page but failed.");
2197 
2198 static long pv_entry_frees, pv_entry_allocs;
2199 static int pv_entry_spare;
2200 
2201 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2202 	"Current number of pv entry frees");
2203 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2204 	"Current number of pv entry allocs");
2205 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2206 	"Current number of spare pv entries");
2207 #endif
2208 
2209 /*
2210  * We are in a serious low memory condition.  Resort to
2211  * drastic measures to free some pages so we can allocate
2212  * another pv entry chunk.
2213  */
2214 static vm_page_t
pmap_pv_reclaim(pmap_t locked_pmap)2215 pmap_pv_reclaim(pmap_t locked_pmap)
2216 {
2217 	struct pch newtail;
2218 	struct pv_chunk *pc;
2219 	struct md_page *pvh;
2220 	pd_entry_t *pde;
2221 	pmap_t pmap;
2222 	pt_entry_t *pte, tpte;
2223 	pv_entry_t pv;
2224 	vm_offset_t va;
2225 	vm_page_t m, m_pc;
2226 	struct spglist free;
2227 	uint32_t inuse;
2228 	int bit, field, freed;
2229 
2230 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2231 	pmap = NULL;
2232 	m_pc = NULL;
2233 	SLIST_INIT(&free);
2234 	TAILQ_INIT(&newtail);
2235 	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
2236 	    SLIST_EMPTY(&free))) {
2237 		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2238 		if (pmap != pc->pc_pmap) {
2239 			if (pmap != NULL) {
2240 				pmap_invalidate_all(pmap);
2241 				if (pmap != locked_pmap)
2242 					PMAP_UNLOCK(pmap);
2243 			}
2244 			pmap = pc->pc_pmap;
2245 			/* Avoid deadlock and lock recursion. */
2246 			if (pmap > locked_pmap)
2247 				PMAP_LOCK(pmap);
2248 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
2249 				pmap = NULL;
2250 				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2251 				continue;
2252 			}
2253 		}
2254 
2255 		/*
2256 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2257 		 */
2258 		freed = 0;
2259 		for (field = 0; field < _NPCM; field++) {
2260 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2261 			    inuse != 0; inuse &= ~(1UL << bit)) {
2262 				bit = bsfl(inuse);
2263 				pv = &pc->pc_pventry[field * 32 + bit];
2264 				va = pv->pv_va;
2265 				pde = pmap_pde(pmap, va);
2266 				if ((*pde & PG_PS) != 0)
2267 					continue;
2268 				pte = pmap_pte(pmap, va);
2269 				tpte = *pte;
2270 				if ((tpte & PG_W) == 0)
2271 					tpte = pte_load_clear(pte);
2272 				pmap_pte_release(pte);
2273 				if ((tpte & PG_W) != 0)
2274 					continue;
2275 				KASSERT(tpte != 0,
2276 				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
2277 				    pmap, va));
2278 				if ((tpte & PG_G) != 0)
2279 					pmap_invalidate_page(pmap, va);
2280 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
2281 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2282 					vm_page_dirty(m);
2283 				if ((tpte & PG_A) != 0)
2284 					vm_page_aflag_set(m, PGA_REFERENCED);
2285 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2286 				if (TAILQ_EMPTY(&m->md.pv_list) &&
2287 				    (m->flags & PG_FICTITIOUS) == 0) {
2288 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2289 					if (TAILQ_EMPTY(&pvh->pv_list)) {
2290 						vm_page_aflag_clear(m,
2291 						    PGA_WRITEABLE);
2292 					}
2293 				}
2294 				pc->pc_map[field] |= 1UL << bit;
2295 				pmap_unuse_pt(pmap, va, &free);
2296 				freed++;
2297 			}
2298 		}
2299 		if (freed == 0) {
2300 			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2301 			continue;
2302 		}
2303 		/* Every freed mapping is for a 4 KB page. */
2304 		pmap->pm_stats.resident_count -= freed;
2305 		PV_STAT(pv_entry_frees += freed);
2306 		PV_STAT(pv_entry_spare += freed);
2307 		pv_entry_count -= freed;
2308 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2309 		for (field = 0; field < _NPCM; field++)
2310 			if (pc->pc_map[field] != pc_freemask[field]) {
2311 				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2312 				    pc_list);
2313 				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
2314 
2315 				/*
2316 				 * One freed pv entry in locked_pmap is
2317 				 * sufficient.
2318 				 */
2319 				if (pmap == locked_pmap)
2320 					goto out;
2321 				break;
2322 			}
2323 		if (field == _NPCM) {
2324 			PV_STAT(pv_entry_spare -= _NPCPV);
2325 			PV_STAT(pc_chunk_count--);
2326 			PV_STAT(pc_chunk_frees++);
2327 			/* Entire chunk is free; return it. */
2328 			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2329 			pmap_qremove((vm_offset_t)pc, 1);
2330 			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2331 			break;
2332 		}
2333 	}
2334 out:
2335 	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
2336 	if (pmap != NULL) {
2337 		pmap_invalidate_all(pmap);
2338 		if (pmap != locked_pmap)
2339 			PMAP_UNLOCK(pmap);
2340 	}
2341 	if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) {
2342 		m_pc = SLIST_FIRST(&free);
2343 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2344 		/* Recycle a freed page table page. */
2345 		m_pc->wire_count = 1;
2346 	}
2347 	pmap_free_zero_pages(&free);
2348 	return (m_pc);
2349 }
2350 
2351 /*
2352  * free the pv_entry back to the free list
2353  */
2354 static void
free_pv_entry(pmap_t pmap,pv_entry_t pv)2355 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2356 {
2357 	struct pv_chunk *pc;
2358 	int idx, field, bit;
2359 
2360 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2361 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2362 	PV_STAT(pv_entry_frees++);
2363 	PV_STAT(pv_entry_spare++);
2364 	pv_entry_count--;
2365 	pc = pv_to_chunk(pv);
2366 	idx = pv - &pc->pc_pventry[0];
2367 	field = idx / 32;
2368 	bit = idx % 32;
2369 	pc->pc_map[field] |= 1ul << bit;
2370 	for (idx = 0; idx < _NPCM; idx++)
2371 		if (pc->pc_map[idx] != pc_freemask[idx]) {
2372 			/*
2373 			 * 98% of the time, pc is already at the head of the
2374 			 * list.  If it isn't already, move it to the head.
2375 			 */
2376 			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
2377 			    pc)) {
2378 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2379 				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
2380 				    pc_list);
2381 			}
2382 			return;
2383 		}
2384 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2385 	free_pv_chunk(pc);
2386 }
2387 
2388 static void
free_pv_chunk(struct pv_chunk * pc)2389 free_pv_chunk(struct pv_chunk *pc)
2390 {
2391 	vm_page_t m;
2392 
2393  	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2394 	PV_STAT(pv_entry_spare -= _NPCPV);
2395 	PV_STAT(pc_chunk_count--);
2396 	PV_STAT(pc_chunk_frees++);
2397 	/* entire chunk is free, return it */
2398 	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2399 	pmap_qremove((vm_offset_t)pc, 1);
2400 	vm_page_unwire(m, PQ_NONE);
2401 	vm_page_free(m);
2402 	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
2403 }
2404 
2405 /*
2406  * get a new pv_entry, allocating a block from the system
2407  * when needed.
2408  */
2409 static pv_entry_t
get_pv_entry(pmap_t pmap,boolean_t try)2410 get_pv_entry(pmap_t pmap, boolean_t try)
2411 {
2412 	static const struct timeval printinterval = { 60, 0 };
2413 	static struct timeval lastprint;
2414 	int bit, field;
2415 	pv_entry_t pv;
2416 	struct pv_chunk *pc;
2417 	vm_page_t m;
2418 
2419 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2420 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2421 	PV_STAT(pv_entry_allocs++);
2422 	pv_entry_count++;
2423 	if (pv_entry_count > pv_entry_high_water)
2424 		if (ratecheck(&lastprint, &printinterval))
2425 			printf("Approaching the limit on PV entries, consider "
2426 			    "increasing either the vm.pmap.shpgperproc or the "
2427 			    "vm.pmap.pv_entries tunable.\n");
2428 retry:
2429 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2430 	if (pc != NULL) {
2431 		for (field = 0; field < _NPCM; field++) {
2432 			if (pc->pc_map[field]) {
2433 				bit = bsfl(pc->pc_map[field]);
2434 				break;
2435 			}
2436 		}
2437 		if (field < _NPCM) {
2438 			pv = &pc->pc_pventry[field * 32 + bit];
2439 			pc->pc_map[field] &= ~(1ul << bit);
2440 			/* If this was the last item, move it to tail */
2441 			for (field = 0; field < _NPCM; field++)
2442 				if (pc->pc_map[field] != 0) {
2443 					PV_STAT(pv_entry_spare--);
2444 					return (pv);	/* not full, return */
2445 				}
2446 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2447 			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2448 			PV_STAT(pv_entry_spare--);
2449 			return (pv);
2450 		}
2451 	}
2452 	/*
2453 	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
2454 	 * global lock.  If "pv_vafree" is currently non-empty, it will
2455 	 * remain non-empty until pmap_ptelist_alloc() completes.
2456 	 */
2457 	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2458 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
2459 		if (try) {
2460 			pv_entry_count--;
2461 			PV_STAT(pc_chunk_tryfail++);
2462 			return (NULL);
2463 		}
2464 		m = pmap_pv_reclaim(pmap);
2465 		if (m == NULL)
2466 			goto retry;
2467 	}
2468 	PV_STAT(pc_chunk_count++);
2469 	PV_STAT(pc_chunk_allocs++);
2470 	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
2471 	pmap_qenter((vm_offset_t)pc, &m, 1);
2472 	pc->pc_pmap = pmap;
2473 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
2474 	for (field = 1; field < _NPCM; field++)
2475 		pc->pc_map[field] = pc_freemask[field];
2476 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2477 	pv = &pc->pc_pventry[0];
2478 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2479 	PV_STAT(pv_entry_spare += _NPCPV - 1);
2480 	return (pv);
2481 }
2482 
2483 static __inline pv_entry_t
pmap_pvh_remove(struct md_page * pvh,pmap_t pmap,vm_offset_t va)2484 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2485 {
2486 	pv_entry_t pv;
2487 
2488 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2489 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2490 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2491 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2492 			break;
2493 		}
2494 	}
2495 	return (pv);
2496 }
2497 
2498 static void
pmap_pv_demote_pde(pmap_t pmap,vm_offset_t va,vm_paddr_t pa)2499 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2500 {
2501 	struct md_page *pvh;
2502 	pv_entry_t pv;
2503 	vm_offset_t va_last;
2504 	vm_page_t m;
2505 
2506 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2507 	KASSERT((pa & PDRMASK) == 0,
2508 	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
2509 
2510 	/*
2511 	 * Transfer the 4mpage's pv entry for this mapping to the first
2512 	 * page's pv list.
2513 	 */
2514 	pvh = pa_to_pvh(pa);
2515 	va = trunc_4mpage(va);
2516 	pv = pmap_pvh_remove(pvh, pmap, va);
2517 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
2518 	m = PHYS_TO_VM_PAGE(pa);
2519 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2520 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
2521 	va_last = va + NBPDR - PAGE_SIZE;
2522 	do {
2523 		m++;
2524 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2525 		    ("pmap_pv_demote_pde: page %p is not managed", m));
2526 		va += PAGE_SIZE;
2527 		pmap_insert_entry(pmap, va, m);
2528 	} while (va < va_last);
2529 }
2530 
#if VM_NRESERVLEVEL > 0
/*
 * Collapse the per-page pv entries of the 4 KB mappings that make up
 * the superpage at "va" into a single pv entry on the superpage's pv
 * list.  "pa" is the superpage-aligned physical address.
 */
static void
pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t last_va;
	vm_page_t pg;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));

	/*
	 * Reuse the first page's pv entry as the superpage's pv entry.
	 * Besides saving a call to get_pv_entry(), moving the entry
	 * avoids the possibility that get_pv_entry() has to reclaim pv
	 * entries via pmap_pv_reclaim() and thereby removes one of the
	 * mappings that is being promoted.
	 */
	va = trunc_4mpage(va);
	pg = PHYS_TO_VM_PAGE(pa);
	pv = pmap_pvh_remove(&pg->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);

	/* Free the pv entries of the remaining NPTEPG - 1 pages. */
	last_va = va + NBPDR - PAGE_SIZE;
	do {
		pg++;
		va += PAGE_SIZE;
		pmap_pvh_free(&pg->md, pmap, va);
	} while (va < last_va);
}
#endif /* VM_NRESERVLEVEL > 0 */
2566 
2567 static void
pmap_pvh_free(struct md_page * pvh,pmap_t pmap,vm_offset_t va)2568 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2569 {
2570 	pv_entry_t pv;
2571 
2572 	pv = pmap_pvh_remove(pvh, pmap, va);
2573 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2574 	free_pv_entry(pmap, pv);
2575 }
2576 
2577 static void
pmap_remove_entry(pmap_t pmap,vm_page_t m,vm_offset_t va)2578 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
2579 {
2580 	struct md_page *pvh;
2581 
2582 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2583 	pmap_pvh_free(&m->md, pmap, va);
2584 	if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) {
2585 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2586 		if (TAILQ_EMPTY(&pvh->pv_list))
2587 			vm_page_aflag_clear(m, PGA_WRITEABLE);
2588 	}
2589 }
2590 
2591 /*
2592  * Create a pv entry for page at pa for
2593  * (pmap, va).
2594  */
2595 static void
pmap_insert_entry(pmap_t pmap,vm_offset_t va,vm_page_t m)2596 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2597 {
2598 	pv_entry_t pv;
2599 
2600 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2601 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2602 	pv = get_pv_entry(pmap, FALSE);
2603 	pv->pv_va = va;
2604 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2605 }
2606 
2607 /*
2608  * Conditionally create a pv entry.
2609  */
2610 static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap,vm_offset_t va,vm_page_t m)2611 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
2612 {
2613 	pv_entry_t pv;
2614 
2615 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2616 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2617 	if (pv_entry_count < pv_entry_high_water &&
2618 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2619 		pv->pv_va = va;
2620 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2621 		return (TRUE);
2622 	} else
2623 		return (FALSE);
2624 }
2625 
2626 /*
2627  * Create the pv entries for each of the pages within a superpage.
2628  */
2629 static boolean_t
pmap_pv_insert_pde(pmap_t pmap,vm_offset_t va,vm_paddr_t pa)2630 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
2631 {
2632 	struct md_page *pvh;
2633 	pv_entry_t pv;
2634 
2635 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2636 	if (pv_entry_count < pv_entry_high_water &&
2637 	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
2638 		pv->pv_va = va;
2639 		pvh = pa_to_pvh(pa);
2640 		TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2641 		return (TRUE);
2642 	} else
2643 		return (FALSE);
2644 }
2645 
2646 /*
2647  * Fills a page table page with mappings to consecutive physical pages.
2648  */
2649 static void
pmap_fill_ptp(pt_entry_t * firstpte,pt_entry_t newpte)2650 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
2651 {
2652 	pt_entry_t *pte;
2653 
2654 	for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
2655 		*pte = newpte;
2656 		newpte += PAGE_SIZE;
2657 	}
2658 }
2659 
2660 /*
2661  * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
2662  * 2- or 4MB page mapping is invalidated.
2663  */
2664 static boolean_t
pmap_demote_pde(pmap_t pmap,pd_entry_t * pde,vm_offset_t va)2665 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2666 {
2667 	pd_entry_t newpde, oldpde;
2668 	pt_entry_t *firstpte, newpte;
2669 	vm_paddr_t mptepa;
2670 	vm_page_t mpte;
2671 	struct spglist free;
2672 	vm_offset_t sva;
2673 
2674 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2675 	oldpde = *pde;
2676 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
2677 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
2678 	if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
2679 	    NULL) {
2680 		KASSERT((oldpde & PG_W) == 0,
2681 		    ("pmap_demote_pde: page table page for a wired mapping"
2682 		    " is missing"));
2683 
2684 		/*
2685 		 * Invalidate the 2- or 4MB page mapping and return
2686 		 * "failure" if the mapping was never accessed or the
2687 		 * allocation of the new page table page fails.
2688 		 */
2689 		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2690 		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2691 		    VM_ALLOC_WIRED)) == NULL) {
2692 			SLIST_INIT(&free);
2693 			sva = trunc_4mpage(va);
2694 			pmap_remove_pde(pmap, pde, sva, &free);
2695 			if ((oldpde & PG_G) == 0)
2696 				pmap_invalidate_pde_page(pmap, sva, oldpde);
2697 			pmap_free_zero_pages(&free);
2698 			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
2699 			    " in pmap %p", va, pmap);
2700 			return (FALSE);
2701 		}
2702 		if (va < VM_MAXUSER_ADDRESS)
2703 			pmap->pm_stats.resident_count++;
2704 	}
2705 	mptepa = VM_PAGE_TO_PHYS(mpte);
2706 
2707 	/*
2708 	 * If the page mapping is in the kernel's address space, then the
2709 	 * KPTmap can provide access to the page table page.  Otherwise,
2710 	 * temporarily map the page table page (mpte) into the kernel's
2711 	 * address space at either PADDR1 or PADDR2.
2712 	 */
2713 	if (va >= KERNBASE)
2714 		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
2715 	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
2716 		if ((*PMAP1 & PG_FRAME) != mptepa) {
2717 			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2718 #ifdef SMP
2719 			PMAP1cpu = PCPU_GET(cpuid);
2720 #endif
2721 			invlcaddr(PADDR1);
2722 			PMAP1changed++;
2723 		} else
2724 #ifdef SMP
2725 		if (PMAP1cpu != PCPU_GET(cpuid)) {
2726 			PMAP1cpu = PCPU_GET(cpuid);
2727 			invlcaddr(PADDR1);
2728 			PMAP1changedcpu++;
2729 		} else
2730 #endif
2731 			PMAP1unchanged++;
2732 		firstpte = PADDR1;
2733 	} else {
2734 		mtx_lock(&PMAP2mutex);
2735 		if ((*PMAP2 & PG_FRAME) != mptepa) {
2736 			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
2737 			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
2738 		}
2739 		firstpte = PADDR2;
2740 	}
2741 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2742 	KASSERT((oldpde & PG_A) != 0,
2743 	    ("pmap_demote_pde: oldpde is missing PG_A"));
2744 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2745 	    ("pmap_demote_pde: oldpde is missing PG_M"));
2746 	newpte = oldpde & ~PG_PS;
2747 	if ((newpte & PG_PDE_PAT) != 0)
2748 		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2749 
2750 	/*
2751 	 * If the page table page is new, initialize it.
2752 	 */
2753 	if (mpte->wire_count == 1) {
2754 		mpte->wire_count = NPTEPG;
2755 		pmap_fill_ptp(firstpte, newpte);
2756 	}
2757 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2758 	    ("pmap_demote_pde: firstpte and newpte map different physical"
2759 	    " addresses"));
2760 
2761 	/*
2762 	 * If the mapping has changed attributes, update the page table
2763 	 * entries.
2764 	 */
2765 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2766 		pmap_fill_ptp(firstpte, newpte);
2767 
2768 	/*
2769 	 * Demote the mapping.  This pmap is locked.  The old PDE has
2770 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
2771 	 * set.  Thus, there is no danger of a race with another
2772 	 * processor changing the setting of PG_A and/or PG_M between
2773 	 * the read above and the store below.
2774 	 */
2775 	if (workaround_erratum383)
2776 		pmap_update_pde(pmap, va, pde, newpde);
2777 	else if (pmap == kernel_pmap)
2778 		pmap_kenter_pde(va, newpde);
2779 	else
2780 		pde_store(pde, newpde);
2781 	if (firstpte == PADDR2)
2782 		mtx_unlock(&PMAP2mutex);
2783 
2784 	/*
2785 	 * Invalidate the recursive mapping of the page table page.
2786 	 */
2787 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2788 
2789 	/*
2790 	 * Demote the pv entry.  This depends on the earlier demotion
2791 	 * of the mapping.  Specifically, the (re)creation of a per-
2792 	 * page pv entry might trigger the execution of pmap_collect(),
2793 	 * which might reclaim a newly (re)created per-page pv entry
2794 	 * and destroy the associated mapping.  In order to destroy
2795 	 * the mapping, the PDE must have already changed from mapping
2796 	 * the 2mpage to referencing the page table page.
2797 	 */
2798 	if ((oldpde & PG_MANAGED) != 0)
2799 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2800 
2801 	pmap_pde_demotions++;
2802 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
2803 	    " in pmap %p", va, pmap);
2804 	return (TRUE);
2805 }
2806 
2807 /*
2808  * Removes a 2- or 4MB page mapping from the kernel pmap.
2809  */
2810 static void
pmap_remove_kernel_pde(pmap_t pmap,pd_entry_t * pde,vm_offset_t va)2811 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
2812 {
2813 	pd_entry_t newpde;
2814 	vm_paddr_t mptepa;
2815 	vm_page_t mpte;
2816 
2817 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2818 	mpte = pmap_remove_pt_page(pmap, va);
2819 	if (mpte == NULL)
2820 		panic("pmap_remove_kernel_pde: Missing pt page.");
2821 
2822 	mptepa = VM_PAGE_TO_PHYS(mpte);
2823 	newpde = mptepa | PG_M | PG_A | PG_RW | PG_V;
2824 
2825 	/*
2826 	 * Initialize the page table page.
2827 	 */
2828 	pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
2829 
2830 	/*
2831 	 * Remove the mapping.
2832 	 */
2833 	if (workaround_erratum383)
2834 		pmap_update_pde(pmap, va, pde, newpde);
2835 	else
2836 		pmap_kenter_pde(va, newpde);
2837 
2838 	/*
2839 	 * Invalidate the recursive mapping of the page table page.
2840 	 */
2841 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2842 }
2843 
2844 /*
2845  * pmap_remove_pde: do the things to unmap a superpage in a process
2846  */
2847 static void
pmap_remove_pde(pmap_t pmap,pd_entry_t * pdq,vm_offset_t sva,struct spglist * free)2848 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2849     struct spglist *free)
2850 {
2851 	struct md_page *pvh;
2852 	pd_entry_t oldpde;
2853 	vm_offset_t eva, va;
2854 	vm_page_t m, mpte;
2855 
2856 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2857 	KASSERT((sva & PDRMASK) == 0,
2858 	    ("pmap_remove_pde: sva is not 4mpage aligned"));
2859 	oldpde = pte_load_clear(pdq);
2860 	if (oldpde & PG_W)
2861 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2862 
2863 	/*
2864 	 * Machines that don't support invlpg, also don't support
2865 	 * PG_G.
2866 	 */
2867 	if ((oldpde & PG_G) != 0)
2868 		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
2869 
2870 	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2871 	if (oldpde & PG_MANAGED) {
2872 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2873 		pmap_pvh_free(pvh, pmap, sva);
2874 		eva = sva + NBPDR;
2875 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2876 		    va < eva; va += PAGE_SIZE, m++) {
2877 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2878 				vm_page_dirty(m);
2879 			if (oldpde & PG_A)
2880 				vm_page_aflag_set(m, PGA_REFERENCED);
2881 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2882 			    TAILQ_EMPTY(&pvh->pv_list))
2883 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2884 		}
2885 	}
2886 	if (pmap == kernel_pmap) {
2887 		pmap_remove_kernel_pde(pmap, pdq, sva);
2888 	} else {
2889 		mpte = pmap_remove_pt_page(pmap, sva);
2890 		if (mpte != NULL) {
2891 			pmap->pm_stats.resident_count--;
2892 			KASSERT(mpte->wire_count == NPTEPG,
2893 			    ("pmap_remove_pde: pte page wire count error"));
2894 			mpte->wire_count = 0;
2895 			pmap_add_delayed_free_list(mpte, free, FALSE);
2896 		}
2897 	}
2898 }
2899 
2900 /*
2901  * pmap_remove_pte: do the things to unmap a page in a process
2902  */
2903 static int
pmap_remove_pte(pmap_t pmap,pt_entry_t * ptq,vm_offset_t va,struct spglist * free)2904 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2905     struct spglist *free)
2906 {
2907 	pt_entry_t oldpte;
2908 	vm_page_t m;
2909 
2910 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2911 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2912 	oldpte = pte_load_clear(ptq);
2913 	KASSERT(oldpte != 0,
2914 	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
2915 	if (oldpte & PG_W)
2916 		pmap->pm_stats.wired_count -= 1;
2917 	/*
2918 	 * Machines that don't support invlpg, also don't support
2919 	 * PG_G.
2920 	 */
2921 	if (oldpte & PG_G)
2922 		pmap_invalidate_page(kernel_pmap, va);
2923 	pmap->pm_stats.resident_count -= 1;
2924 	if (oldpte & PG_MANAGED) {
2925 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2926 		if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2927 			vm_page_dirty(m);
2928 		if (oldpte & PG_A)
2929 			vm_page_aflag_set(m, PGA_REFERENCED);
2930 		pmap_remove_entry(pmap, m, va);
2931 	}
2932 	return (pmap_unuse_pt(pmap, va, free));
2933 }
2934 
2935 /*
2936  * Remove a single page from a process address space
2937  */
2938 static void
pmap_remove_page(pmap_t pmap,vm_offset_t va,struct spglist * free)2939 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
2940 {
2941 	pt_entry_t *pte;
2942 
2943 	rw_assert(&pvh_global_lock, RA_WLOCKED);
2944 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2945 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2946 	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2947 		return;
2948 	pmap_remove_pte(pmap, pte, va, free);
2949 	pmap_invalidate_page(pmap, va);
2950 }
2951 
2952 /*
2953  *	Remove the given range of addresses from the specified map.
2954  *
2955  *	It is assumed that the start and end are properly
2956  *	rounded to the page size.
2957  */
2958 void
pmap_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)2959 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2960 {
2961 	vm_offset_t pdnxt;
2962 	pd_entry_t ptpaddr;
2963 	pt_entry_t *pte;
2964 	struct spglist free;
2965 	int anyvalid;
2966 
2967 	/*
2968 	 * Perform an unsynchronized read.  This is, however, safe.
2969 	 */
2970 	if (pmap->pm_stats.resident_count == 0)
2971 		return;
2972 
2973 	anyvalid = 0;
2974 	SLIST_INIT(&free);
2975 
2976 	rw_wlock(&pvh_global_lock);
2977 	sched_pin();
2978 	PMAP_LOCK(pmap);
2979 
2980 	/*
2981 	 * special handling of removing one page.  a very
2982 	 * common operation and easy to short circuit some
2983 	 * code.
2984 	 */
2985 	if ((sva + PAGE_SIZE == eva) &&
2986 	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2987 		pmap_remove_page(pmap, sva, &free);
2988 		goto out;
2989 	}
2990 
2991 	for (; sva < eva; sva = pdnxt) {
2992 		u_int pdirindex;
2993 
2994 		/*
2995 		 * Calculate index for next page table.
2996 		 */
2997 		pdnxt = (sva + NBPDR) & ~PDRMASK;
2998 		if (pdnxt < sva)
2999 			pdnxt = eva;
3000 		if (pmap->pm_stats.resident_count == 0)
3001 			break;
3002 
3003 		pdirindex = sva >> PDRSHIFT;
3004 		ptpaddr = pmap->pm_pdir[pdirindex];
3005 
3006 		/*
3007 		 * Weed out invalid mappings. Note: we assume that the page
3008 		 * directory table is always allocated, and in kernel virtual.
3009 		 */
3010 		if (ptpaddr == 0)
3011 			continue;
3012 
3013 		/*
3014 		 * Check for large page.
3015 		 */
3016 		if ((ptpaddr & PG_PS) != 0) {
3017 			/*
3018 			 * Are we removing the entire large page?  If not,
3019 			 * demote the mapping and fall through.
3020 			 */
3021 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3022 				/*
3023 				 * The TLB entry for a PG_G mapping is
3024 				 * invalidated by pmap_remove_pde().
3025 				 */
3026 				if ((ptpaddr & PG_G) == 0)
3027 					anyvalid = 1;
3028 				pmap_remove_pde(pmap,
3029 				    &pmap->pm_pdir[pdirindex], sva, &free);
3030 				continue;
3031 			} else if (!pmap_demote_pde(pmap,
3032 			    &pmap->pm_pdir[pdirindex], sva)) {
3033 				/* The large page mapping was destroyed. */
3034 				continue;
3035 			}
3036 		}
3037 
3038 		/*
3039 		 * Limit our scan to either the end of the va represented
3040 		 * by the current page table page, or to the end of the
3041 		 * range being removed.
3042 		 */
3043 		if (pdnxt > eva)
3044 			pdnxt = eva;
3045 
3046 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3047 		    sva += PAGE_SIZE) {
3048 			if (*pte == 0)
3049 				continue;
3050 
3051 			/*
3052 			 * The TLB entry for a PG_G mapping is invalidated
3053 			 * by pmap_remove_pte().
3054 			 */
3055 			if ((*pte & PG_G) == 0)
3056 				anyvalid = 1;
3057 			if (pmap_remove_pte(pmap, pte, sva, &free))
3058 				break;
3059 		}
3060 	}
3061 out:
3062 	sched_unpin();
3063 	if (anyvalid)
3064 		pmap_invalidate_all(pmap);
3065 	rw_wunlock(&pvh_global_lock);
3066 	PMAP_UNLOCK(pmap);
3067 	pmap_free_zero_pages(&free);
3068 }
3069 
3070 /*
3071  *	Routine:	pmap_remove_all
3072  *	Function:
3073  *		Removes this physical page from
3074  *		all physical maps in which it resides.
3075  *		Reflects back modify bits to the pager.
3076  *
3077  *	Notes:
3078  *		Original versions of this routine were very
3079  *		inefficient because they iteratively called
3080  *		pmap_remove (slow...)
3081  */
3082 
3083 void
pmap_remove_all(vm_page_t m)3084 pmap_remove_all(vm_page_t m)
3085 {
3086 	struct md_page *pvh;
3087 	pv_entry_t pv;
3088 	pmap_t pmap;
3089 	pt_entry_t *pte, tpte;
3090 	pd_entry_t *pde;
3091 	vm_offset_t va;
3092 	struct spglist free;
3093 
3094 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3095 	    ("pmap_remove_all: page %p is not managed", m));
3096 	SLIST_INIT(&free);
3097 	rw_wlock(&pvh_global_lock);
3098 	sched_pin();
3099 	if ((m->flags & PG_FICTITIOUS) != 0)
3100 		goto small_mappings;
3101 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3102 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3103 		va = pv->pv_va;
3104 		pmap = PV_PMAP(pv);
3105 		PMAP_LOCK(pmap);
3106 		pde = pmap_pde(pmap, va);
3107 		(void)pmap_demote_pde(pmap, pde, va);
3108 		PMAP_UNLOCK(pmap);
3109 	}
3110 small_mappings:
3111 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3112 		pmap = PV_PMAP(pv);
3113 		PMAP_LOCK(pmap);
3114 		pmap->pm_stats.resident_count--;
3115 		pde = pmap_pde(pmap, pv->pv_va);
3116 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
3117 		    " a 4mpage in page %p's pv list", m));
3118 		pte = pmap_pte_quick(pmap, pv->pv_va);
3119 		tpte = pte_load_clear(pte);
3120 		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
3121 		    pmap, pv->pv_va));
3122 		if (tpte & PG_W)
3123 			pmap->pm_stats.wired_count--;
3124 		if (tpte & PG_A)
3125 			vm_page_aflag_set(m, PGA_REFERENCED);
3126 
3127 		/*
3128 		 * Update the vm_page_t clean and reference bits.
3129 		 */
3130 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3131 			vm_page_dirty(m);
3132 		pmap_unuse_pt(pmap, pv->pv_va, &free);
3133 		pmap_invalidate_page(pmap, pv->pv_va);
3134 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3135 		free_pv_entry(pmap, pv);
3136 		PMAP_UNLOCK(pmap);
3137 	}
3138 	vm_page_aflag_clear(m, PGA_WRITEABLE);
3139 	sched_unpin();
3140 	rw_wunlock(&pvh_global_lock);
3141 	pmap_free_zero_pages(&free);
3142 }
3143 
3144 /*
3145  * pmap_protect_pde: do the things to protect a 4mpage in a process
3146  */
3147 static boolean_t
pmap_protect_pde(pmap_t pmap,pd_entry_t * pde,vm_offset_t sva,vm_prot_t prot)3148 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
3149 {
3150 	pd_entry_t newpde, oldpde;
3151 	vm_offset_t eva, va;
3152 	vm_page_t m;
3153 	boolean_t anychanged;
3154 
3155 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3156 	KASSERT((sva & PDRMASK) == 0,
3157 	    ("pmap_protect_pde: sva is not 4mpage aligned"));
3158 	anychanged = FALSE;
3159 retry:
3160 	oldpde = newpde = *pde;
3161 	if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
3162 	    (PG_MANAGED | PG_M | PG_RW)) {
3163 		eva = sva + NBPDR;
3164 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
3165 		    va < eva; va += PAGE_SIZE, m++)
3166 			vm_page_dirty(m);
3167 	}
3168 	if ((prot & VM_PROT_WRITE) == 0)
3169 		newpde &= ~(PG_RW | PG_M);
3170 #if defined(PAE) || defined(PAE_TABLES)
3171 	if ((prot & VM_PROT_EXECUTE) == 0)
3172 		newpde |= pg_nx;
3173 #endif
3174 	if (newpde != oldpde) {
3175 		/*
3176 		 * As an optimization to future operations on this PDE, clear
3177 		 * PG_PROMOTED.  The impending invalidation will remove any
3178 		 * lingering 4KB page mappings from the TLB.
3179 		 */
3180 		if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED))
3181 			goto retry;
3182 		if ((oldpde & PG_G) != 0)
3183 			pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
3184 		else
3185 			anychanged = TRUE;
3186 	}
3187 	return (anychanged);
3188 }
3189 
3190 /*
3191  *	Set the physical protection on the
3192  *	specified range of this map as requested.
3193  */
3194 void
pmap_protect(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_prot_t prot)3195 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3196 {
3197 	vm_offset_t pdnxt;
3198 	pd_entry_t ptpaddr;
3199 	pt_entry_t *pte;
3200 	boolean_t anychanged, pv_lists_locked;
3201 
3202 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3203 	if (prot == VM_PROT_NONE) {
3204 		pmap_remove(pmap, sva, eva);
3205 		return;
3206 	}
3207 
3208 #if defined(PAE) || defined(PAE_TABLES)
3209 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
3210 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
3211 		return;
3212 #else
3213 	if (prot & VM_PROT_WRITE)
3214 		return;
3215 #endif
3216 
3217 	if (pmap_is_current(pmap))
3218 		pv_lists_locked = FALSE;
3219 	else {
3220 		pv_lists_locked = TRUE;
3221 resume:
3222 		rw_wlock(&pvh_global_lock);
3223 		sched_pin();
3224 	}
3225 	anychanged = FALSE;
3226 
3227 	PMAP_LOCK(pmap);
3228 	for (; sva < eva; sva = pdnxt) {
3229 		pt_entry_t obits, pbits;
3230 		u_int pdirindex;
3231 
3232 		pdnxt = (sva + NBPDR) & ~PDRMASK;
3233 		if (pdnxt < sva)
3234 			pdnxt = eva;
3235 
3236 		pdirindex = sva >> PDRSHIFT;
3237 		ptpaddr = pmap->pm_pdir[pdirindex];
3238 
3239 		/*
3240 		 * Weed out invalid mappings. Note: we assume that the page
3241 		 * directory table is always allocated, and in kernel virtual.
3242 		 */
3243 		if (ptpaddr == 0)
3244 			continue;
3245 
3246 		/*
3247 		 * Check for large page.
3248 		 */
3249 		if ((ptpaddr & PG_PS) != 0) {
3250 			/*
3251 			 * Are we protecting the entire large page?  If not,
3252 			 * demote the mapping and fall through.
3253 			 */
3254 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
3255 				/*
3256 				 * The TLB entry for a PG_G mapping is
3257 				 * invalidated by pmap_protect_pde().
3258 				 */
3259 				if (pmap_protect_pde(pmap,
3260 				    &pmap->pm_pdir[pdirindex], sva, prot))
3261 					anychanged = TRUE;
3262 				continue;
3263 			} else {
3264 				if (!pv_lists_locked) {
3265 					pv_lists_locked = TRUE;
3266 					if (!rw_try_wlock(&pvh_global_lock)) {
3267 						if (anychanged)
3268 							pmap_invalidate_all(
3269 							    pmap);
3270 						PMAP_UNLOCK(pmap);
3271 						goto resume;
3272 					}
3273 					sched_pin();
3274 				}
3275 				if (!pmap_demote_pde(pmap,
3276 				    &pmap->pm_pdir[pdirindex], sva)) {
3277 					/*
3278 					 * The large page mapping was
3279 					 * destroyed.
3280 					 */
3281 					continue;
3282 				}
3283 			}
3284 		}
3285 
3286 		if (pdnxt > eva)
3287 			pdnxt = eva;
3288 
3289 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
3290 		    sva += PAGE_SIZE) {
3291 			vm_page_t m;
3292 
3293 retry:
3294 			/*
3295 			 * Regardless of whether a pte is 32 or 64 bits in
3296 			 * size, PG_RW, PG_A, and PG_M are among the least
3297 			 * significant 32 bits.
3298 			 */
3299 			obits = pbits = *pte;
3300 			if ((pbits & PG_V) == 0)
3301 				continue;
3302 
3303 			if ((prot & VM_PROT_WRITE) == 0) {
3304 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
3305 				    (PG_MANAGED | PG_M | PG_RW)) {
3306 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
3307 					vm_page_dirty(m);
3308 				}
3309 				pbits &= ~(PG_RW | PG_M);
3310 			}
3311 #if defined(PAE) || defined(PAE_TABLES)
3312 			if ((prot & VM_PROT_EXECUTE) == 0)
3313 				pbits |= pg_nx;
3314 #endif
3315 
3316 			if (pbits != obits) {
3317 #if defined(PAE) || defined(PAE_TABLES)
3318 				if (!atomic_cmpset_64(pte, obits, pbits))
3319 					goto retry;
3320 #else
3321 				if (!atomic_cmpset_int((u_int *)pte, obits,
3322 				    pbits))
3323 					goto retry;
3324 #endif
3325 				if (obits & PG_G)
3326 					pmap_invalidate_page(pmap, sva);
3327 				else
3328 					anychanged = TRUE;
3329 			}
3330 		}
3331 	}
3332 	if (anychanged)
3333 		pmap_invalidate_all(pmap);
3334 	if (pv_lists_locked) {
3335 		sched_unpin();
3336 		rw_wunlock(&pvh_global_lock);
3337 	}
3338 	PMAP_UNLOCK(pmap);
3339 }
3340 
#if VM_NRESERVLEVEL > 0
/*
 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are
 * within a single page table page (PTP) to a single 2- or 4MB page mapping.
 * For promotion to occur, two conditions must be met: (1) the 4KB page
 * mappings must map aligned, contiguous physical memory and (2) the 4KB page
 * mappings must have identical characteristics.
 *
 * Managed (PG_MANAGED) mappings within the kernel address space are not
 * promoted.  The reason is that kernel PDEs are replicated in each pmap but
 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
 * pmap.
 *
 * Every failed attempt increments pmap_pde_p_failures and leaves the PDE
 * unchanged; a successful one increments pmap_pde_promotions.  PTEs that
 * are writeable but not yet modified have PG_RW cleared as a side effect,
 * even if the promotion fails afterwards.
 *
 * The caller must hold the pmap lock.
 */
static void
pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
	pd_entry_t newpde;
	pt_entry_t *firstpte, oldpte, pa, *pte;
	vm_offset_t oldpteva;
	vm_page_t mpte;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);

	/*
	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
	 * either invalid, unused, or does not map the first 4KB physical page
	 * within a 2- or 4MB page.
	 */
	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
setpde:
	newpde = *firstpte;
	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
		pmap_pde_p_failures++;
		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
		    " in pmap %p", va, pmap);
		return;
	}
	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
		pmap_pde_p_failures++;
		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
		    " in pmap %p", va, pmap);
		return;
	}
	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
		/*
		 * When PG_M is already clear, PG_RW can be cleared without
		 * a TLB invalidation.
		 */
		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
		    ~PG_RW))
			goto setpde;
		newpde &= ~PG_RW;
	}

	/*
	 * Examine each of the other PTEs in the specified PTP.  Abort if this
	 * PTE maps an unexpected 4KB physical page or does not have identical
	 * characteristics to the first PTE.
	 *
	 * "pa" holds the frame address, PG_A, and PG_V that the PTE under
	 * examination must have.  The scan runs from the last PTE down to
	 * the second.
	 */
	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
setpte:
		oldpte = *pte;
		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
			pmap_pde_p_failures++;
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
			    " in pmap %p", va, pmap);
			return;
		}
		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
			/*
			 * When PG_M is already clear, PG_RW can be cleared
			 * without a TLB invalidation.
			 */
			if (!atomic_cmpset_int((u_int *)pte, oldpte,
			    oldpte & ~PG_RW))
				goto setpte;
			oldpte &= ~PG_RW;
			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
			    (va & ~PDRMASK);
			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
			    " in pmap %p", oldpteva, pmap);
		}
		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
			pmap_pde_p_failures++;
			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
			    " in pmap %p", va, pmap);
			return;
		}
		pa -= PAGE_SIZE;
	}

	/*
	 * Save the page table page in its current state until the PDE
	 * mapping the superpage is demoted by pmap_demote_pde() or
	 * destroyed by pmap_remove_pde().
	 */
	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
	KASSERT(mpte >= vm_page_array &&
	    mpte < &vm_page_array[vm_page_array_size],
	    ("pmap_promote_pde: page table page is out of range"));
	KASSERT(mpte->pindex == va >> PDRSHIFT,
	    ("pmap_promote_pde: page table page's pindex is wrong"));
	if (pmap_insert_pt_page(pmap, mpte)) {
		pmap_pde_p_failures++;
		CTR2(KTR_PMAP,
		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
		    pmap);
		return;
	}

	/*
	 * Promote the pv entries.
	 */
	if ((newpde & PG_MANAGED) != 0)
		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);

	/*
	 * Propagate the PAT index to its proper position.
	 */
	if ((newpde & PG_PTE_PAT) != 0)
		newpde ^= PG_PDE_PAT | PG_PTE_PAT;

	/*
	 * Map the superpage.
	 */
	if (workaround_erratum383)
		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
	else if (pmap == kernel_pmap)
		pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde);
	else
		pde_store(pde, PG_PROMOTED | PG_PS | newpde);

	pmap_pde_promotions++;
	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
	    " in pmap %p", va, pmap);
}
#endif /* VM_NRESERVLEVEL > 0 */
3479 
3480 /*
3481  *	Insert the given physical page (p) at
3482  *	the specified virtual address (v) in the
3483  *	target physical map with the protection requested.
3484  *
3485  *	If specified, the page will be wired down, meaning
3486  *	that the related pte can not be reclaimed.
3487  *
3488  *	NB:  This is the only routine which MAY NOT lazy-evaluate
3489  *	or lose information.  That is, this routine must actually
3490  *	insert this page into the given map NOW.
3491  */
3492 int
pmap_enter(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,u_int flags,int8_t psind)3493 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3494     u_int flags, int8_t psind)
3495 {
3496 	pd_entry_t *pde;
3497 	pt_entry_t *pte;
3498 	pt_entry_t newpte, origpte;
3499 	pv_entry_t pv;
3500 	vm_paddr_t opa, pa;
3501 	vm_page_t mpte, om;
3502 	boolean_t invlva, wired;
3503 
3504 	va = trunc_page(va);
3505 	mpte = NULL;
3506 	wired = (flags & PMAP_ENTER_WIRED) != 0;
3507 
3508 	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
3509 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
3510 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)",
3511 	    va));
3512 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
3513 		VM_OBJECT_ASSERT_LOCKED(m->object);
3514 
3515 	rw_wlock(&pvh_global_lock);
3516 	PMAP_LOCK(pmap);
3517 	sched_pin();
3518 
3519 	pde = pmap_pde(pmap, va);
3520 	if (va < VM_MAXUSER_ADDRESS) {
3521 		/*
3522 		 * va is for UVA.
3523 		 * In the case that a page table page is not resident,
3524 		 * we are creating it here.  pmap_allocpte() handles
3525 		 * demotion.
3526 		 */
3527 		mpte = pmap_allocpte(pmap, va, flags);
3528 		if (mpte == NULL) {
3529 			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
3530 			    ("pmap_allocpte failed with sleep allowed"));
3531 			sched_unpin();
3532 			rw_wunlock(&pvh_global_lock);
3533 			PMAP_UNLOCK(pmap);
3534 			return (KERN_RESOURCE_SHORTAGE);
3535 		}
3536 	} else {
3537 		/*
3538 		 * va is for KVA, so pmap_demote_pde() will never fail
3539 		 * to install a page table page.  PG_V is also
3540 		 * asserted by pmap_demote_pde().
3541 		 */
3542 		KASSERT(pde != NULL && (*pde & PG_V) != 0,
3543 		    ("KVA %#x invalid pde pdir %#jx", va,
3544 		    (uintmax_t)pmap->pm_pdir[PTDPTDI]));
3545 		if ((*pde & PG_PS) != 0)
3546 			pmap_demote_pde(pmap, pde, va);
3547 	}
3548 	pte = pmap_pte_quick(pmap, va);
3549 
3550 	/*
3551 	 * Page Directory table entry is not valid, which should not
3552 	 * happen.  We should have either allocated the page table
3553 	 * page or demoted the existing mapping above.
3554 	 */
3555 	if (pte == NULL) {
3556 		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
3557 		    (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
3558 	}
3559 
3560 	pa = VM_PAGE_TO_PHYS(m);
3561 	om = NULL;
3562 	origpte = *pte;
3563 	opa = origpte & PG_FRAME;
3564 
3565 	/*
3566 	 * Mapping has not changed, must be protection or wiring change.
3567 	 */
3568 	if (origpte && (opa == pa)) {
3569 		/*
3570 		 * Wiring change, just update stats. We don't worry about
3571 		 * wiring PT pages as they remain resident as long as there
3572 		 * are valid mappings in them. Hence, if a user page is wired,
3573 		 * the PT page will be also.
3574 		 */
3575 		if (wired && ((origpte & PG_W) == 0))
3576 			pmap->pm_stats.wired_count++;
3577 		else if (!wired && (origpte & PG_W))
3578 			pmap->pm_stats.wired_count--;
3579 
3580 		/*
3581 		 * Remove extra pte reference
3582 		 */
3583 		if (mpte)
3584 			mpte->wire_count--;
3585 
3586 		if (origpte & PG_MANAGED) {
3587 			om = m;
3588 			pa |= PG_MANAGED;
3589 		}
3590 		goto validate;
3591 	}
3592 
3593 	pv = NULL;
3594 
3595 	/*
3596 	 * Mapping has changed, invalidate old range and fall through to
3597 	 * handle validating new mapping.
3598 	 */
3599 	if (opa) {
3600 		if (origpte & PG_W)
3601 			pmap->pm_stats.wired_count--;
3602 		if (origpte & PG_MANAGED) {
3603 			om = PHYS_TO_VM_PAGE(opa);
3604 			pv = pmap_pvh_remove(&om->md, pmap, va);
3605 		}
3606 		if (mpte != NULL) {
3607 			mpte->wire_count--;
3608 			KASSERT(mpte->wire_count > 0,
3609 			    ("pmap_enter: missing reference to page table page,"
3610 			     " va: 0x%x", va));
3611 		}
3612 	} else
3613 		pmap->pm_stats.resident_count++;
3614 
3615 	/*
3616 	 * Enter on the PV list if part of our managed memory.
3617 	 */
3618 	if ((m->oflags & VPO_UNMANAGED) == 0) {
3619 		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
3620 		    ("pmap_enter: managed mapping within the clean submap"));
3621 		if (pv == NULL)
3622 			pv = get_pv_entry(pmap, FALSE);
3623 		pv->pv_va = va;
3624 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3625 		pa |= PG_MANAGED;
3626 	} else if (pv != NULL)
3627 		free_pv_entry(pmap, pv);
3628 
3629 	/*
3630 	 * Increment counters
3631 	 */
3632 	if (wired)
3633 		pmap->pm_stats.wired_count++;
3634 
3635 validate:
3636 	/*
3637 	 * Now validate mapping with desired protection/wiring.
3638 	 */
3639 	newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V);
3640 	if ((prot & VM_PROT_WRITE) != 0) {
3641 		newpte |= PG_RW;
3642 		if ((newpte & PG_MANAGED) != 0)
3643 			vm_page_aflag_set(m, PGA_WRITEABLE);
3644 	}
3645 #if defined(PAE) || defined(PAE_TABLES)
3646 	if ((prot & VM_PROT_EXECUTE) == 0)
3647 		newpte |= pg_nx;
3648 #endif
3649 	if (wired)
3650 		newpte |= PG_W;
3651 	if (va < VM_MAXUSER_ADDRESS)
3652 		newpte |= PG_U;
3653 	if (pmap == kernel_pmap)
3654 		newpte |= pgeflag;
3655 
3656 	/*
3657 	 * if the mapping or permission bits are different, we need
3658 	 * to update the pte.
3659 	 */
3660 	if ((origpte & ~(PG_M|PG_A)) != newpte) {
3661 		newpte |= PG_A;
3662 		if ((flags & VM_PROT_WRITE) != 0)
3663 			newpte |= PG_M;
3664 		if (origpte & PG_V) {
3665 			invlva = FALSE;
3666 			origpte = pte_load_store(pte, newpte);
3667 			if (origpte & PG_A) {
3668 				if (origpte & PG_MANAGED)
3669 					vm_page_aflag_set(om, PGA_REFERENCED);
3670 				if (opa != VM_PAGE_TO_PHYS(m))
3671 					invlva = TRUE;
3672 #if defined(PAE) || defined(PAE_TABLES)
3673 				if ((origpte & PG_NX) == 0 &&
3674 				    (newpte & PG_NX) != 0)
3675 					invlva = TRUE;
3676 #endif
3677 			}
3678 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3679 				if ((origpte & PG_MANAGED) != 0)
3680 					vm_page_dirty(om);
3681 				if ((prot & VM_PROT_WRITE) == 0)
3682 					invlva = TRUE;
3683 			}
3684 			if ((origpte & PG_MANAGED) != 0 &&
3685 			    TAILQ_EMPTY(&om->md.pv_list) &&
3686 			    ((om->flags & PG_FICTITIOUS) != 0 ||
3687 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3688 				vm_page_aflag_clear(om, PGA_WRITEABLE);
3689 			if (invlva)
3690 				pmap_invalidate_page(pmap, va);
3691 		} else
3692 			pte_store(pte, newpte);
3693 	}
3694 
3695 #if VM_NRESERVLEVEL > 0
3696 	/*
3697 	 * If both the page table page and the reservation are fully
3698 	 * populated, then attempt promotion.
3699 	 */
3700 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
3701 	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
3702 	    vm_reserv_level_iffullpop(m) == 0)
3703 		pmap_promote_pde(pmap, pde, va);
3704 #endif
3705 
3706 	sched_unpin();
3707 	rw_wunlock(&pvh_global_lock);
3708 	PMAP_UNLOCK(pmap);
3709 	return (KERN_SUCCESS);
3710 }
3711 
3712 /*
3713  * Tries to create a 2- or 4MB page mapping.  Returns TRUE if successful and
3714  * FALSE otherwise.  Fails if (1) a page table page cannot be allocated without
3715  * blocking, (2) a mapping already exists at the specified virtual address, or
3716  * (3) a pv entry cannot be allocated without reclaiming another pv entry.
3717  */
3718 static boolean_t
pmap_enter_pde(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)3719 pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3720 {
3721 	pd_entry_t *pde, newpde;
3722 
3723 	rw_assert(&pvh_global_lock, RA_WLOCKED);
3724 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3725 	pde = pmap_pde(pmap, va);
3726 	if (*pde != 0) {
3727 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3728 		    " in pmap %p", va, pmap);
3729 		return (FALSE);
3730 	}
3731 	newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) |
3732 	    PG_PS | PG_V;
3733 	if ((m->oflags & VPO_UNMANAGED) == 0) {
3734 		newpde |= PG_MANAGED;
3735 
3736 		/*
3737 		 * Abort this mapping if its PV entry could not be created.
3738 		 */
3739 		if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) {
3740 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3741 			    " in pmap %p", va, pmap);
3742 			return (FALSE);
3743 		}
3744 	}
3745 #if defined(PAE) || defined(PAE_TABLES)
3746 	if ((prot & VM_PROT_EXECUTE) == 0)
3747 		newpde |= pg_nx;
3748 #endif
3749 	if (va < VM_MAXUSER_ADDRESS)
3750 		newpde |= PG_U;
3751 
3752 	/*
3753 	 * Increment counters.
3754 	 */
3755 	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
3756 
3757 	/*
3758 	 * Map the superpage.  (This is not a promoted mapping; there will not
3759 	 * be any lingering 4KB page mappings in the TLB.)
3760 	 */
3761 	pde_store(pde, newpde);
3762 
3763 	pmap_pde_mappings++;
3764 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3765 	    " in pmap %p", va, pmap);
3766 	return (TRUE);
3767 }
3768 
3769 /*
3770  * Maps a sequence of resident pages belonging to the same object.
3771  * The sequence begins with the given page m_start.  This page is
3772  * mapped at the given virtual address start.  Each subsequent page is
3773  * mapped at a virtual address that is offset from start by the same
3774  * amount as the page is offset from m_start within the object.  The
3775  * last page in the sequence is the page with the largest offset from
3776  * m_start that can be mapped at a virtual address less than the given
3777  * virtual address end.  Not every virtual page between start and end
3778  * is mapped; only those for which a resident page exists with the
3779  * corresponding offset from m_start are mapped.
3780  */
3781 void
pmap_enter_object(pmap_t pmap,vm_offset_t start,vm_offset_t end,vm_page_t m_start,vm_prot_t prot)3782 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3783     vm_page_t m_start, vm_prot_t prot)
3784 {
3785 	vm_offset_t va;
3786 	vm_page_t m, mpte;
3787 	vm_pindex_t diff, psize;
3788 
3789 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3790 
3791 	psize = atop(end - start);
3792 	mpte = NULL;
3793 	m = m_start;
3794 	rw_wlock(&pvh_global_lock);
3795 	PMAP_LOCK(pmap);
3796 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3797 		va = start + ptoa(diff);
3798 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3799 		    m->psind == 1 && pg_ps_enabled &&
3800 		    pmap_enter_pde(pmap, va, m, prot))
3801 			m = &m[NBPDR / PAGE_SIZE - 1];
3802 		else
3803 			mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3804 			    mpte);
3805 		m = TAILQ_NEXT(m, listq);
3806 	}
3807 	rw_wunlock(&pvh_global_lock);
3808 	PMAP_UNLOCK(pmap);
3809 }
3810 
3811 /*
3812  * this code makes some *MAJOR* assumptions:
3813  * 1. Current pmap & pmap exists.
3814  * 2. Not wired.
3815  * 3. Read access.
3816  * 4. No page table pages.
3817  * but is *MUCH* faster than pmap_enter...
3818  */
3819 
3820 void
pmap_enter_quick(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)3821 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3822 {
3823 
3824 	rw_wlock(&pvh_global_lock);
3825 	PMAP_LOCK(pmap);
3826 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3827 	rw_wunlock(&pvh_global_lock);
3828 	PMAP_UNLOCK(pmap);
3829 }
3830 
3831 static vm_page_t
pmap_enter_quick_locked(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,vm_page_t mpte)3832 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3833     vm_prot_t prot, vm_page_t mpte)
3834 {
3835 	pt_entry_t newpte, *pte;
3836 	struct spglist free;
3837 
3838 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3839 	    (m->oflags & VPO_UNMANAGED) != 0,
3840 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3841 	rw_assert(&pvh_global_lock, RA_WLOCKED);
3842 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3843 
3844 	/*
3845 	 * In the case that a page table page is not
3846 	 * resident, we are creating it here.
3847 	 */
3848 	if (va < VM_MAXUSER_ADDRESS) {
3849 		u_int ptepindex;
3850 		pd_entry_t ptepa;
3851 
3852 		/*
3853 		 * Calculate pagetable page index
3854 		 */
3855 		ptepindex = va >> PDRSHIFT;
3856 		if (mpte && (mpte->pindex == ptepindex)) {
3857 			mpte->wire_count++;
3858 		} else {
3859 			/*
3860 			 * Get the page directory entry
3861 			 */
3862 			ptepa = pmap->pm_pdir[ptepindex];
3863 
3864 			/*
3865 			 * If the page table page is mapped, we just increment
3866 			 * the hold count, and activate it.
3867 			 */
3868 			if (ptepa) {
3869 				if (ptepa & PG_PS)
3870 					return (NULL);
3871 				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
3872 				mpte->wire_count++;
3873 			} else {
3874 				mpte = _pmap_allocpte(pmap, ptepindex,
3875 				    PMAP_ENTER_NOSLEEP);
3876 				if (mpte == NULL)
3877 					return (mpte);
3878 			}
3879 		}
3880 	} else {
3881 		mpte = NULL;
3882 	}
3883 
3884 	/*
3885 	 * This call to vtopte makes the assumption that we are
3886 	 * entering the page into the current pmap.  In order to support
3887 	 * quick entry into any pmap, one would likely use pmap_pte_quick.
3888 	 * But that isn't as quick as vtopte.
3889 	 */
3890 	pte = vtopte(va);
3891 	if (*pte) {
3892 		if (mpte != NULL) {
3893 			mpte->wire_count--;
3894 			mpte = NULL;
3895 		}
3896 		return (mpte);
3897 	}
3898 
3899 	/*
3900 	 * Enter on the PV list if part of our managed memory.
3901 	 */
3902 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3903 	    !pmap_try_insert_pv_entry(pmap, va, m)) {
3904 		if (mpte != NULL) {
3905 			SLIST_INIT(&free);
3906 			if (pmap_unwire_ptp(pmap, mpte, &free)) {
3907 				pmap_invalidate_page(pmap, va);
3908 				pmap_free_zero_pages(&free);
3909 			}
3910 
3911 			mpte = NULL;
3912 		}
3913 		return (mpte);
3914 	}
3915 
3916 	/*
3917 	 * Increment counters
3918 	 */
3919 	pmap->pm_stats.resident_count++;
3920 
3921 	newpte = VM_PAGE_TO_PHYS(m) | PG_V |
3922 	    pmap_cache_bits(m->md.pat_mode, 0);
3923 	if ((m->oflags & VPO_UNMANAGED) == 0)
3924 		newpte |= PG_MANAGED;
3925 #if defined(PAE) || defined(PAE_TABLES)
3926 	if ((prot & VM_PROT_EXECUTE) == 0)
3927 		newpte |= pg_nx;
3928 #endif
3929 	if (pmap != kernel_pmap)
3930 		newpte |= PG_U;
3931 	pte_store(pte, newpte);
3932 	return (mpte);
3933 }
3934 
3935 /*
3936  * Make a temporary mapping for a physical address.  This is only intended
3937  * to be used for panic dumps.
3938  */
3939 void *
pmap_kenter_temporary(vm_paddr_t pa,int i)3940 pmap_kenter_temporary(vm_paddr_t pa, int i)
3941 {
3942 	vm_offset_t va;
3943 
3944 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3945 	pmap_kenter(va, pa);
3946 	invlpg(va);
3947 	return ((void *)crashdumpmap);
3948 }
3949 
3950 /*
3951  * This code maps large physical mmap regions into the
3952  * processor address space.  Note that some shortcuts
3953  * are taken, but the code works.
3954  */
3955 void
pmap_object_init_pt(pmap_t pmap,vm_offset_t addr,vm_object_t object,vm_pindex_t pindex,vm_size_t size)3956 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3957     vm_pindex_t pindex, vm_size_t size)
3958 {
3959 	pd_entry_t *pde;
3960 	vm_paddr_t pa, ptepa;
3961 	vm_page_t p;
3962 	int pat_mode;
3963 
3964 	VM_OBJECT_ASSERT_WLOCKED(object);
3965 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3966 	    ("pmap_object_init_pt: non-device object"));
3967 	if (pseflag &&
3968 	    (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
3969 		if (!vm_object_populate(object, pindex, pindex + atop(size)))
3970 			return;
3971 		p = vm_page_lookup(object, pindex);
3972 		KASSERT(p->valid == VM_PAGE_BITS_ALL,
3973 		    ("pmap_object_init_pt: invalid page %p", p));
3974 		pat_mode = p->md.pat_mode;
3975 
3976 		/*
3977 		 * Abort the mapping if the first page is not physically
3978 		 * aligned to a 2/4MB page boundary.
3979 		 */
3980 		ptepa = VM_PAGE_TO_PHYS(p);
3981 		if (ptepa & (NBPDR - 1))
3982 			return;
3983 
3984 		/*
3985 		 * Skip the first page.  Abort the mapping if the rest of
3986 		 * the pages are not physically contiguous or have differing
3987 		 * memory attributes.
3988 		 */
3989 		p = TAILQ_NEXT(p, listq);
3990 		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3991 		    pa += PAGE_SIZE) {
3992 			KASSERT(p->valid == VM_PAGE_BITS_ALL,
3993 			    ("pmap_object_init_pt: invalid page %p", p));
3994 			if (pa != VM_PAGE_TO_PHYS(p) ||
3995 			    pat_mode != p->md.pat_mode)
3996 				return;
3997 			p = TAILQ_NEXT(p, listq);
3998 		}
3999 
4000 		/*
4001 		 * Map using 2/4MB pages.  Since "ptepa" is 2/4M aligned and
4002 		 * "size" is a multiple of 2/4M, adding the PAT setting to
4003 		 * "pa" will not affect the termination of this loop.
4004 		 */
4005 		PMAP_LOCK(pmap);
4006 		for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa +
4007 		    size; pa += NBPDR) {
4008 			pde = pmap_pde(pmap, addr);
4009 			if (*pde == 0) {
4010 				pde_store(pde, pa | PG_PS | PG_M | PG_A |
4011 				    PG_U | PG_RW | PG_V);
4012 				pmap->pm_stats.resident_count += NBPDR /
4013 				    PAGE_SIZE;
4014 				pmap_pde_mappings++;
4015 			}
4016 			/* Else continue on if the PDE is already valid. */
4017 			addr += NBPDR;
4018 		}
4019 		PMAP_UNLOCK(pmap);
4020 	}
4021 }
4022 
4023 /*
4024  *	Clear the wired attribute from the mappings for the specified range of
4025  *	addresses in the given pmap.  Every valid mapping within that range
4026  *	must have the wired attribute set.  In contrast, invalid mappings
4027  *	cannot have the wired attribute set, so they are ignored.
4028  *
4029  *	The wired attribute of the page table entry is not a hardware feature,
4030  *	so there is no need to invalidate any TLB entries.
4031  */
4032 void
pmap_unwire(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4033 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4034 {
4035 	vm_offset_t pdnxt;
4036 	pd_entry_t *pde;
4037 	pt_entry_t *pte;
4038 	boolean_t pv_lists_locked;
4039 
4040 	if (pmap_is_current(pmap))
4041 		pv_lists_locked = FALSE;
4042 	else {
4043 		pv_lists_locked = TRUE;
4044 resume:
4045 		rw_wlock(&pvh_global_lock);
4046 		sched_pin();
4047 	}
4048 	PMAP_LOCK(pmap);
4049 	for (; sva < eva; sva = pdnxt) {
4050 		pdnxt = (sva + NBPDR) & ~PDRMASK;
4051 		if (pdnxt < sva)
4052 			pdnxt = eva;
4053 		pde = pmap_pde(pmap, sva);
4054 		if ((*pde & PG_V) == 0)
4055 			continue;
4056 		if ((*pde & PG_PS) != 0) {
4057 			if ((*pde & PG_W) == 0)
4058 				panic("pmap_unwire: pde %#jx is missing PG_W",
4059 				    (uintmax_t)*pde);
4060 
4061 			/*
4062 			 * Are we unwiring the entire large page?  If not,
4063 			 * demote the mapping and fall through.
4064 			 */
4065 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
4066 				/*
4067 				 * Regardless of whether a pde (or pte) is 32
4068 				 * or 64 bits in size, PG_W is among the least
4069 				 * significant 32 bits.
4070 				 */
4071 				atomic_clear_int((u_int *)pde, PG_W);
4072 				pmap->pm_stats.wired_count -= NBPDR /
4073 				    PAGE_SIZE;
4074 				continue;
4075 			} else {
4076 				if (!pv_lists_locked) {
4077 					pv_lists_locked = TRUE;
4078 					if (!rw_try_wlock(&pvh_global_lock)) {
4079 						PMAP_UNLOCK(pmap);
4080 						/* Repeat sva. */
4081 						goto resume;
4082 					}
4083 					sched_pin();
4084 				}
4085 				if (!pmap_demote_pde(pmap, pde, sva))
4086 					panic("pmap_unwire: demotion failed");
4087 			}
4088 		}
4089 		if (pdnxt > eva)
4090 			pdnxt = eva;
4091 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
4092 		    sva += PAGE_SIZE) {
4093 			if ((*pte & PG_V) == 0)
4094 				continue;
4095 			if ((*pte & PG_W) == 0)
4096 				panic("pmap_unwire: pte %#jx is missing PG_W",
4097 				    (uintmax_t)*pte);
4098 
4099 			/*
4100 			 * PG_W must be cleared atomically.  Although the pmap
4101 			 * lock synchronizes access to PG_W, another processor
4102 			 * could be setting PG_M and/or PG_A concurrently.
4103 			 *
4104 			 * PG_W is among the least significant 32 bits.
4105 			 */
4106 			atomic_clear_int((u_int *)pte, PG_W);
4107 			pmap->pm_stats.wired_count--;
4108 		}
4109 	}
4110 	if (pv_lists_locked) {
4111 		sched_unpin();
4112 		rw_wunlock(&pvh_global_lock);
4113 	}
4114 	PMAP_UNLOCK(pmap);
4115 }
4116 
4117 
4118 /*
4119  *	Copy the range specified by src_addr/len
4120  *	from the source map to the range dst_addr/len
4121  *	in the destination map.
4122  *
4123  *	This routine is only advisory and need not do anything.
4124  */
4125 
4126 void
pmap_copy(pmap_t dst_pmap,pmap_t src_pmap,vm_offset_t dst_addr,vm_size_t len,vm_offset_t src_addr)4127 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4128     vm_offset_t src_addr)
4129 {
4130 	struct spglist free;
4131 	vm_offset_t addr;
4132 	vm_offset_t end_addr = src_addr + len;
4133 	vm_offset_t pdnxt;
4134 
4135 	if (dst_addr != src_addr)
4136 		return;
4137 
4138 	if (!pmap_is_current(src_pmap))
4139 		return;
4140 
4141 	rw_wlock(&pvh_global_lock);
4142 	if (dst_pmap < src_pmap) {
4143 		PMAP_LOCK(dst_pmap);
4144 		PMAP_LOCK(src_pmap);
4145 	} else {
4146 		PMAP_LOCK(src_pmap);
4147 		PMAP_LOCK(dst_pmap);
4148 	}
4149 	sched_pin();
4150 	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
4151 		pt_entry_t *src_pte, *dst_pte;
4152 		vm_page_t dstmpte, srcmpte;
4153 		pd_entry_t srcptepaddr;
4154 		u_int ptepindex;
4155 
4156 		KASSERT(addr < UPT_MIN_ADDRESS,
4157 		    ("pmap_copy: invalid to pmap_copy page tables"));
4158 
4159 		pdnxt = (addr + NBPDR) & ~PDRMASK;
4160 		if (pdnxt < addr)
4161 			pdnxt = end_addr;
4162 		ptepindex = addr >> PDRSHIFT;
4163 
4164 		srcptepaddr = src_pmap->pm_pdir[ptepindex];
4165 		if (srcptepaddr == 0)
4166 			continue;
4167 
4168 		if (srcptepaddr & PG_PS) {
4169 			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
4170 				continue;
4171 			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
4172 			    ((srcptepaddr & PG_MANAGED) == 0 ||
4173 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr &
4174 			    PG_PS_FRAME))) {
4175 				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
4176 				    ~PG_W;
4177 				dst_pmap->pm_stats.resident_count +=
4178 				    NBPDR / PAGE_SIZE;
4179 				pmap_pde_mappings++;
4180 			}
4181 			continue;
4182 		}
4183 
4184 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
4185 		KASSERT(srcmpte->wire_count > 0,
4186 		    ("pmap_copy: source page table page is unused"));
4187 
4188 		if (pdnxt > end_addr)
4189 			pdnxt = end_addr;
4190 
4191 		src_pte = vtopte(addr);
4192 		while (addr < pdnxt) {
4193 			pt_entry_t ptetemp;
4194 			ptetemp = *src_pte;
4195 			/*
4196 			 * we only virtual copy managed pages
4197 			 */
4198 			if ((ptetemp & PG_MANAGED) != 0) {
4199 				dstmpte = pmap_allocpte(dst_pmap, addr,
4200 				    PMAP_ENTER_NOSLEEP);
4201 				if (dstmpte == NULL)
4202 					goto out;
4203 				dst_pte = pmap_pte_quick(dst_pmap, addr);
4204 				if (*dst_pte == 0 &&
4205 				    pmap_try_insert_pv_entry(dst_pmap, addr,
4206 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
4207 					/*
4208 					 * Clear the wired, modified, and
4209 					 * accessed (referenced) bits
4210 					 * during the copy.
4211 					 */
4212 					*dst_pte = ptetemp & ~(PG_W | PG_M |
4213 					    PG_A);
4214 					dst_pmap->pm_stats.resident_count++;
4215 	 			} else {
4216 					SLIST_INIT(&free);
4217 					if (pmap_unwire_ptp(dst_pmap, dstmpte,
4218 					    &free)) {
4219 						pmap_invalidate_page(dst_pmap,
4220 						    addr);
4221 						pmap_free_zero_pages(&free);
4222 					}
4223 					goto out;
4224 				}
4225 				if (dstmpte->wire_count >= srcmpte->wire_count)
4226 					break;
4227 			}
4228 			addr += PAGE_SIZE;
4229 			src_pte++;
4230 		}
4231 	}
4232 out:
4233 	sched_unpin();
4234 	rw_wunlock(&pvh_global_lock);
4235 	PMAP_UNLOCK(src_pmap);
4236 	PMAP_UNLOCK(dst_pmap);
4237 }
4238 
4239 static __inline void
pagezero(void * page)4240 pagezero(void *page)
4241 {
4242 #if defined(I686_CPU)
4243 	if (cpu_class == CPUCLASS_686) {
4244 		if (cpu_feature & CPUID_SSE2)
4245 			sse2_pagezero(page);
4246 		else
4247 			i686_pagezero(page);
4248 	} else
4249 #endif
4250 		bzero(page, PAGE_SIZE);
4251 }
4252 
4253 /*
4254  *	pmap_zero_page zeros the specified hardware page by mapping
4255  *	the page into KVM and using bzero to clear its contents.
4256  */
4257 void
pmap_zero_page(vm_page_t m)4258 pmap_zero_page(vm_page_t m)
4259 {
4260 	pt_entry_t *cmap_pte2;
4261 	struct pcpu *pc;
4262 
4263 	sched_pin();
4264 	pc = get_pcpu();
4265 	cmap_pte2 = pc->pc_cmap_pte2;
4266 	mtx_lock(&pc->pc_cmap_lock);
4267 	if (*cmap_pte2)
4268 		panic("pmap_zero_page: CMAP2 busy");
4269 	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4270 	    pmap_cache_bits(m->md.pat_mode, 0);
4271 	invlcaddr(pc->pc_cmap_addr2);
4272 	pagezero(pc->pc_cmap_addr2);
4273 	*cmap_pte2 = 0;
4274 
4275 	/*
4276 	 * Unpin the thread before releasing the lock.  Otherwise the thread
4277 	 * could be rescheduled while still bound to the current CPU, only
4278 	 * to unpin itself immediately upon resuming execution.
4279 	 */
4280 	sched_unpin();
4281 	mtx_unlock(&pc->pc_cmap_lock);
4282 }
4283 
4284 /*
4285  *	pmap_zero_page_area zeros the specified hardware page by mapping
4286  *	the page into KVM and using bzero to clear its contents.
4287  *
4288  *	off and size may not cover an area beyond a single hardware page.
4289  */
4290 void
pmap_zero_page_area(vm_page_t m,int off,int size)4291 pmap_zero_page_area(vm_page_t m, int off, int size)
4292 {
4293 	pt_entry_t *cmap_pte2;
4294 	struct pcpu *pc;
4295 
4296 	sched_pin();
4297 	pc = get_pcpu();
4298 	cmap_pte2 = pc->pc_cmap_pte2;
4299 	mtx_lock(&pc->pc_cmap_lock);
4300 	if (*cmap_pte2)
4301 		panic("pmap_zero_page_area: CMAP2 busy");
4302 	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4303 	    pmap_cache_bits(m->md.pat_mode, 0);
4304 	invlcaddr(pc->pc_cmap_addr2);
4305 	if (off == 0 && size == PAGE_SIZE)
4306 		pagezero(pc->pc_cmap_addr2);
4307 	else
4308 		bzero(pc->pc_cmap_addr2 + off, size);
4309 	*cmap_pte2 = 0;
4310 	sched_unpin();
4311 	mtx_unlock(&pc->pc_cmap_lock);
4312 }
4313 
4314 /*
4315  *	pmap_zero_page_idle zeros the specified hardware page by mapping
4316  *	the page into KVM and using bzero to clear its contents.  This
4317  *	is intended to be called from the vm_pagezero process only and
4318  *	outside of Giant.
4319  */
4320 void
pmap_zero_page_idle(vm_page_t m)4321 pmap_zero_page_idle(vm_page_t m)
4322 {
4323 
4324 	if (*CMAP3)
4325 		panic("pmap_zero_page_idle: CMAP3 busy");
4326 	sched_pin();
4327 	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
4328 	    pmap_cache_bits(m->md.pat_mode, 0);
4329 	invlcaddr(CADDR3);
4330 	pagezero(CADDR3);
4331 	*CMAP3 = 0;
4332 	sched_unpin();
4333 }
4334 
4335 /*
4336  *	pmap_copy_page copies the specified (machine independent)
4337  *	page by mapping the page into virtual memory and using
4338  *	bcopy to copy the page, one machine dependent page at a
4339  *	time.
4340  */
4341 void
pmap_copy_page(vm_page_t src,vm_page_t dst)4342 pmap_copy_page(vm_page_t src, vm_page_t dst)
4343 {
4344 	pt_entry_t *cmap_pte1, *cmap_pte2;
4345 	struct pcpu *pc;
4346 
4347 	sched_pin();
4348 	pc = get_pcpu();
4349 	cmap_pte1 = pc->pc_cmap_pte1;
4350 	cmap_pte2 = pc->pc_cmap_pte2;
4351 	mtx_lock(&pc->pc_cmap_lock);
4352 	if (*cmap_pte1)
4353 		panic("pmap_copy_page: CMAP1 busy");
4354 	if (*cmap_pte2)
4355 		panic("pmap_copy_page: CMAP2 busy");
4356 	*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A |
4357 	    pmap_cache_bits(src->md.pat_mode, 0);
4358 	invlcaddr(pc->pc_cmap_addr1);
4359 	*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M |
4360 	    pmap_cache_bits(dst->md.pat_mode, 0);
4361 	invlcaddr(pc->pc_cmap_addr2);
4362 	bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE);
4363 	*cmap_pte1 = 0;
4364 	*cmap_pte2 = 0;
4365 	sched_unpin();
4366 	mtx_unlock(&pc->pc_cmap_lock);
4367 }
4368 
/*
 * Global flag, initialised to 1 (enabled).
 * NOTE(review): presumably advertises to the buffer cache that unmapped
 * buffers may be used with this pmap -- confirm against its consumers.
 */
int unmapped_buf_allowed = 1;
4370 
4371 void
pmap_copy_pages(vm_page_t ma[],vm_offset_t a_offset,vm_page_t mb[],vm_offset_t b_offset,int xfersize)4372 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4373     vm_offset_t b_offset, int xfersize)
4374 {
4375 	vm_page_t a_pg, b_pg;
4376 	char *a_cp, *b_cp;
4377 	vm_offset_t a_pg_offset, b_pg_offset;
4378 	pt_entry_t *cmap_pte1, *cmap_pte2;
4379 	struct pcpu *pc;
4380 	int cnt;
4381 
4382 	sched_pin();
4383 	pc = get_pcpu();
4384 	cmap_pte1 = pc->pc_cmap_pte1;
4385 	cmap_pte2 = pc->pc_cmap_pte2;
4386 	mtx_lock(&pc->pc_cmap_lock);
4387 	if (*cmap_pte1 != 0)
4388 		panic("pmap_copy_pages: CMAP1 busy");
4389 	if (*cmap_pte2 != 0)
4390 		panic("pmap_copy_pages: CMAP2 busy");
4391 	while (xfersize > 0) {
4392 		a_pg = ma[a_offset >> PAGE_SHIFT];
4393 		a_pg_offset = a_offset & PAGE_MASK;
4394 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4395 		b_pg = mb[b_offset >> PAGE_SHIFT];
4396 		b_pg_offset = b_offset & PAGE_MASK;
4397 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4398 		*cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A |
4399 		    pmap_cache_bits(a_pg->md.pat_mode, 0);
4400 		invlcaddr(pc->pc_cmap_addr1);
4401 		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A |
4402 		    PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0);
4403 		invlcaddr(pc->pc_cmap_addr2);
4404 		a_cp = pc->pc_cmap_addr1 + a_pg_offset;
4405 		b_cp = pc->pc_cmap_addr2 + b_pg_offset;
4406 		bcopy(a_cp, b_cp, cnt);
4407 		a_offset += cnt;
4408 		b_offset += cnt;
4409 		xfersize -= cnt;
4410 	}
4411 	*cmap_pte1 = 0;
4412 	*cmap_pte2 = 0;
4413 	sched_unpin();
4414 	mtx_unlock(&pc->pc_cmap_lock);
4415 }
4416 
4417 /*
4418  * Returns true if the pmap's pv is one of the first
4419  * 16 pvs linked to from this page.  This count may
4420  * be changed upwards or downwards in the future; it
4421  * is only necessary that true be returned for a small
4422  * subset of pmaps for proper page aging.
4423  */
4424 boolean_t
pmap_page_exists_quick(pmap_t pmap,vm_page_t m)4425 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4426 {
4427 	struct md_page *pvh;
4428 	pv_entry_t pv;
4429 	int loops = 0;
4430 	boolean_t rv;
4431 
4432 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4433 	    ("pmap_page_exists_quick: page %p is not managed", m));
4434 	rv = FALSE;
4435 	rw_wlock(&pvh_global_lock);
4436 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4437 		if (PV_PMAP(pv) == pmap) {
4438 			rv = TRUE;
4439 			break;
4440 		}
4441 		loops++;
4442 		if (loops >= 16)
4443 			break;
4444 	}
4445 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4446 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4447 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4448 			if (PV_PMAP(pv) == pmap) {
4449 				rv = TRUE;
4450 				break;
4451 			}
4452 			loops++;
4453 			if (loops >= 16)
4454 				break;
4455 		}
4456 	}
4457 	rw_wunlock(&pvh_global_lock);
4458 	return (rv);
4459 }
4460 
4461 /*
4462  *	pmap_page_wired_mappings:
4463  *
4464  *	Return the number of managed mappings to the given physical page
4465  *	that are wired.
4466  */
4467 int
pmap_page_wired_mappings(vm_page_t m)4468 pmap_page_wired_mappings(vm_page_t m)
4469 {
4470 	int count;
4471 
4472 	count = 0;
4473 	if ((m->oflags & VPO_UNMANAGED) != 0)
4474 		return (count);
4475 	rw_wlock(&pvh_global_lock);
4476 	count = pmap_pvh_wired_mappings(&m->md, count);
4477 	if ((m->flags & PG_FICTITIOUS) == 0) {
4478 	    count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)),
4479 	        count);
4480 	}
4481 	rw_wunlock(&pvh_global_lock);
4482 	return (count);
4483 }
4484 
4485 /*
4486  *	pmap_pvh_wired_mappings:
4487  *
4488  *	Return the updated number "count" of managed mappings that are wired.
4489  */
4490 static int
pmap_pvh_wired_mappings(struct md_page * pvh,int count)4491 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
4492 {
4493 	pmap_t pmap;
4494 	pt_entry_t *pte;
4495 	pv_entry_t pv;
4496 
4497 	rw_assert(&pvh_global_lock, RA_WLOCKED);
4498 	sched_pin();
4499 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4500 		pmap = PV_PMAP(pv);
4501 		PMAP_LOCK(pmap);
4502 		pte = pmap_pte_quick(pmap, pv->pv_va);
4503 		if ((*pte & PG_W) != 0)
4504 			count++;
4505 		PMAP_UNLOCK(pmap);
4506 	}
4507 	sched_unpin();
4508 	return (count);
4509 }
4510 
4511 /*
4512  * Returns TRUE if the given page is mapped individually or as part of
4513  * a 4mpage.  Otherwise, returns FALSE.
4514  */
4515 boolean_t
pmap_page_is_mapped(vm_page_t m)4516 pmap_page_is_mapped(vm_page_t m)
4517 {
4518 	boolean_t rv;
4519 
4520 	if ((m->oflags & VPO_UNMANAGED) != 0)
4521 		return (FALSE);
4522 	rw_wlock(&pvh_global_lock);
4523 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4524 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4525 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
4526 	rw_wunlock(&pvh_global_lock);
4527 	return (rv);
4528 }
4529 
/*
 * Remove all pages from specified address space
 * this aids process exit speeds.  Also, this code
 * is special cased for current process only, but
 * can have the more generic (and slightly slower)
 * mode enabled.  This is much faster than pmap_remove
 * in the case of running down an entire address space.
 *
 * The pmap's pv chunks are walked instead of its page tables, so only
 * mappings that have a pv entry are visited.  Wired mappings are left in
 * place, and a chunk that still holds one is not freed.  If "pmap" is
 * not the current CPU's pmap, a warning is printed and nothing is done.
 */
void
pmap_remove_pages(pmap_t pmap)
{
	pt_entry_t *pte, tpte;
	vm_page_t m, mpte, mt;
	pv_entry_t pv;
	struct md_page *pvh;
	struct pv_chunk *pc, *npc;
	struct spglist free;
	int field, idx;
	int32_t bit;
	uint32_t inuse, bitmask;
	int allfree;

	if (pmap != PCPU_GET(curpmap)) {
		printf("warning: pmap_remove_pages called with non-current pmap\n");
		return;
	}
	/* Page table pages released below are collected on "free". */
	SLIST_INIT(&free);
	rw_wlock(&pvh_global_lock);
	PMAP_LOCK(pmap);
	sched_pin();
	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
		    pc->pc_pmap));
		/* Cleared as soon as one entry of this chunk must stay. */
		allfree = 1;
		for (field = 0; field < _NPCM; field++) {
			/* A clear bit in pc_map marks an allocated entry. */
			inuse = ~pc->pc_map[field] & pc_freemask[field];
			while (inuse != 0) {
				/* Take the lowest allocated entry. */
				bit = bsfl(inuse);
				bitmask = 1UL << bit;
				idx = field * 32 + bit;
				pv = &pc->pc_pventry[idx];
				inuse &= ~bitmask;

				/*
				 * Start from the page directory entry; if
				 * it is not a superpage, switch to the PTE.
				 * NOTE(review): PG_PTE_PAT is masked out of
				 * the PTE value presumably because it shares
				 * its bit position with PG_PS, which is
				 * tested on "tpte" below -- confirm.
				 */
				pte = pmap_pde(pmap, pv->pv_va);
				tpte = *pte;
				if ((tpte & PG_PS) == 0) {
					pte = vtopte(pv->pv_va);
					tpte = *pte & ~PG_PTE_PAT;
				}

				/* A pv entry without a mapping is fatal. */
				if (tpte == 0) {
					printf(
					    "TPTE at %p  IS ZERO @ VA %08x\n",
					    pte, pv->pv_va);
					panic("bad pte");
				}

/*
 * We cannot remove wired pages from a process' mapping at this time
 */
				if (tpte & PG_W) {
					allfree = 0;
					continue;
				}

				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
				KASSERT(m->phys_addr == (tpte & PG_FRAME),
				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
				    m, (uintmax_t)m->phys_addr,
				    (uintmax_t)tpte));

				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
				    m < &vm_page_array[vm_page_array_size],
				    ("pmap_remove_pages: bad tpte %#jx",
				    (uintmax_t)tpte));

				/* Destroy the mapping itself. */
				pte_clear(pte);

				/*
				 * Update the vm_page_t clean/reference bits.
				 * A dirty superpage dirties every 4KB page
				 * that it spans.
				 */
				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
					if ((tpte & PG_PS) != 0) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							vm_page_dirty(mt);
					} else
						vm_page_dirty(m);
				}

				/* Mark free */
				PV_STAT(pv_entry_frees++);
				PV_STAT(pv_entry_spare++);
				pv_entry_count--;
				pc->pc_map[field] |= bitmask;
				if ((tpte & PG_PS) != 0) {
					/*
					 * Superpage: unlink the pv entry from
					 * the superpage list and, if that list
					 * is now empty, clear PGA_WRITEABLE on
					 * each constituent page that has no
					 * 4KB mappings either.
					 */
					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
					if (TAILQ_EMPTY(&pvh->pv_list)) {
						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
							if (TAILQ_EMPTY(&mt->md.pv_list))
								vm_page_aflag_clear(mt, PGA_WRITEABLE);
					}
					/*
					 * If a page table page was kept for
					 * this superpage, queue it for freeing.
					 */
					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
					if (mpte != NULL) {
						pmap->pm_stats.resident_count--;
						KASSERT(mpte->wire_count == NPTEPG,
						    ("pmap_remove_pages: pte page wire count error"));
						mpte->wire_count = 0;
						pmap_add_delayed_free_list(mpte, &free, FALSE);
					}
				} else {
					/*
					 * 4KB page: unlink the pv entry, clear
					 * PGA_WRITEABLE once no mapping of
					 * either size remains, and drop the
					 * page table page reference.
					 */
					pmap->pm_stats.resident_count--;
					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
					if (TAILQ_EMPTY(&m->md.pv_list) &&
					    (m->flags & PG_FICTITIOUS) == 0) {
						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
						if (TAILQ_EMPTY(&pvh->pv_list))
							vm_page_aflag_clear(m, PGA_WRITEABLE);
					}
					pmap_unuse_pt(pmap, pv->pv_va, &free);
				}
			}
		}
		/* Release the chunk only if no wired entry was skipped. */
		if (allfree) {
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			free_pv_chunk(pc);
		}
	}
	sched_unpin();
	/* One invalidation covers every mapping removed above. */
	pmap_invalidate_all(pmap);
	rw_wunlock(&pvh_global_lock);
	PMAP_UNLOCK(pmap);
	/* The collected page table pages are freed after unlocking. */
	pmap_free_zero_pages(&free);
}
4665 
4666 /*
4667  *	pmap_is_modified:
4668  *
4669  *	Return whether or not the specified physical page was modified
4670  *	in any physical maps.
4671  */
4672 boolean_t
pmap_is_modified(vm_page_t m)4673 pmap_is_modified(vm_page_t m)
4674 {
4675 	boolean_t rv;
4676 
4677 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4678 	    ("pmap_is_modified: page %p is not managed", m));
4679 
4680 	/*
4681 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4682 	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
4683 	 * is clear, no PTEs can have PG_M set.
4684 	 */
4685 	VM_OBJECT_ASSERT_WLOCKED(m->object);
4686 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4687 		return (FALSE);
4688 	rw_wlock(&pvh_global_lock);
4689 	rv = pmap_is_modified_pvh(&m->md) ||
4690 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4691 	    pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4692 	rw_wunlock(&pvh_global_lock);
4693 	return (rv);
4694 }
4695 
4696 /*
4697  * Returns TRUE if any of the given mappings were used to modify
4698  * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
4699  * mappings are supported.
4700  */
4701 static boolean_t
pmap_is_modified_pvh(struct md_page * pvh)4702 pmap_is_modified_pvh(struct md_page *pvh)
4703 {
4704 	pv_entry_t pv;
4705 	pt_entry_t *pte;
4706 	pmap_t pmap;
4707 	boolean_t rv;
4708 
4709 	rw_assert(&pvh_global_lock, RA_WLOCKED);
4710 	rv = FALSE;
4711 	sched_pin();
4712 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4713 		pmap = PV_PMAP(pv);
4714 		PMAP_LOCK(pmap);
4715 		pte = pmap_pte_quick(pmap, pv->pv_va);
4716 		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4717 		PMAP_UNLOCK(pmap);
4718 		if (rv)
4719 			break;
4720 	}
4721 	sched_unpin();
4722 	return (rv);
4723 }
4724 
4725 /*
4726  *	pmap_is_prefaultable:
4727  *
4728  *	Return whether or not the specified virtual address is elgible
4729  *	for prefault.
4730  */
4731 boolean_t
pmap_is_prefaultable(pmap_t pmap,vm_offset_t addr)4732 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4733 {
4734 	pd_entry_t *pde;
4735 	pt_entry_t *pte;
4736 	boolean_t rv;
4737 
4738 	rv = FALSE;
4739 	PMAP_LOCK(pmap);
4740 	pde = pmap_pde(pmap, addr);
4741 	if (*pde != 0 && (*pde & PG_PS) == 0) {
4742 		pte = vtopte(addr);
4743 		rv = *pte == 0;
4744 	}
4745 	PMAP_UNLOCK(pmap);
4746 	return (rv);
4747 }
4748 
4749 /*
4750  *	pmap_is_referenced:
4751  *
4752  *	Return whether or not the specified physical page was referenced
4753  *	in any physical maps.
4754  */
4755 boolean_t
pmap_is_referenced(vm_page_t m)4756 pmap_is_referenced(vm_page_t m)
4757 {
4758 	boolean_t rv;
4759 
4760 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4761 	    ("pmap_is_referenced: page %p is not managed", m));
4762 	rw_wlock(&pvh_global_lock);
4763 	rv = pmap_is_referenced_pvh(&m->md) ||
4764 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4765 	    pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
4766 	rw_wunlock(&pvh_global_lock);
4767 	return (rv);
4768 }
4769 
4770 /*
4771  * Returns TRUE if any of the given mappings were referenced and FALSE
4772  * otherwise.  Both page and 4mpage mappings are supported.
4773  */
4774 static boolean_t
pmap_is_referenced_pvh(struct md_page * pvh)4775 pmap_is_referenced_pvh(struct md_page *pvh)
4776 {
4777 	pv_entry_t pv;
4778 	pt_entry_t *pte;
4779 	pmap_t pmap;
4780 	boolean_t rv;
4781 
4782 	rw_assert(&pvh_global_lock, RA_WLOCKED);
4783 	rv = FALSE;
4784 	sched_pin();
4785 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4786 		pmap = PV_PMAP(pv);
4787 		PMAP_LOCK(pmap);
4788 		pte = pmap_pte_quick(pmap, pv->pv_va);
4789 		rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4790 		PMAP_UNLOCK(pmap);
4791 		if (rv)
4792 			break;
4793 	}
4794 	sched_unpin();
4795 	return (rv);
4796 }
4797 
4798 /*
4799  * Clear the write and modified bits in each of the given page's mappings.
4800  */
4801 void
pmap_remove_write(vm_page_t m)4802 pmap_remove_write(vm_page_t m)
4803 {
4804 	struct md_page *pvh;
4805 	pv_entry_t next_pv, pv;
4806 	pmap_t pmap;
4807 	pd_entry_t *pde;
4808 	pt_entry_t oldpte, *pte;
4809 	vm_offset_t va;
4810 
4811 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4812 	    ("pmap_remove_write: page %p is not managed", m));
4813 
4814 	/*
4815 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4816 	 * set by another thread while the object is locked.  Thus,
4817 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
4818 	 */
4819 	VM_OBJECT_ASSERT_WLOCKED(m->object);
4820 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4821 		return;
4822 	rw_wlock(&pvh_global_lock);
4823 	sched_pin();
4824 	if ((m->flags & PG_FICTITIOUS) != 0)
4825 		goto small_mappings;
4826 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4827 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4828 		va = pv->pv_va;
4829 		pmap = PV_PMAP(pv);
4830 		PMAP_LOCK(pmap);
4831 		pde = pmap_pde(pmap, va);
4832 		if ((*pde & PG_RW) != 0)
4833 			(void)pmap_demote_pde(pmap, pde, va);
4834 		PMAP_UNLOCK(pmap);
4835 	}
4836 small_mappings:
4837 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4838 		pmap = PV_PMAP(pv);
4839 		PMAP_LOCK(pmap);
4840 		pde = pmap_pde(pmap, pv->pv_va);
4841 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found"
4842 		    " a 4mpage in page %p's pv list", m));
4843 		pte = pmap_pte_quick(pmap, pv->pv_va);
4844 retry:
4845 		oldpte = *pte;
4846 		if ((oldpte & PG_RW) != 0) {
4847 			/*
4848 			 * Regardless of whether a pte is 32 or 64 bits
4849 			 * in size, PG_RW and PG_M are among the least
4850 			 * significant 32 bits.
4851 			 */
4852 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
4853 			    oldpte & ~(PG_RW | PG_M)))
4854 				goto retry;
4855 			if ((oldpte & PG_M) != 0)
4856 				vm_page_dirty(m);
4857 			pmap_invalidate_page(pmap, pv->pv_va);
4858 		}
4859 		PMAP_UNLOCK(pmap);
4860 	}
4861 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4862 	sched_unpin();
4863 	rw_wunlock(&pvh_global_lock);
4864 }
4865 
4866 /*
4867  *	pmap_ts_referenced:
4868  *
4869  *	Return a count of reference bits for a page, clearing those bits.
4870  *	It is not necessary for every reference bit to be cleared, but it
4871  *	is necessary that 0 only be returned when there are truly no
4872  *	reference bits set.
4873  *
4874  *	As an optimization, update the page's dirty field if a modified bit is
4875  *	found while counting reference bits.  This opportunistic update can be
4876  *	performed at low cost and can eliminate the need for some future calls
4877  *	to pmap_is_modified().  However, since this function stops after
4878  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4879  *	dirty pages.  Those dirty pages will only be detected by a future call
4880  *	to pmap_is_modified().
4881  */
4882 int
pmap_ts_referenced(vm_page_t m)4883 pmap_ts_referenced(vm_page_t m)
4884 {
4885 	struct md_page *pvh;
4886 	pv_entry_t pv, pvf;
4887 	pmap_t pmap;
4888 	pd_entry_t *pde;
4889 	pt_entry_t *pte;
4890 	vm_paddr_t pa;
4891 	int rtval = 0;
4892 
4893 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4894 	    ("pmap_ts_referenced: page %p is not managed", m));
4895 	pa = VM_PAGE_TO_PHYS(m);
4896 	pvh = pa_to_pvh(pa);
	/*
	 * Both PV lists are walked (and rotated) with pvh_global_lock
	 * write-held and the thread pinned.  Each pmap is locked only while
	 * its own page table entry is being examined.
	 */
4897 	rw_wlock(&pvh_global_lock);
4898 	sched_pin();
	/*
	 * Fictitious pages, and pages whose 2/4MB PV list is empty, go
	 * straight to the 4KB mappings.
	 */
4899 	if ((m->flags & PG_FICTITIOUS) != 0 ||
4900 	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4901 		goto small_mappings;
	/*
	 * "pvf" remembers the original head of the list.  Every entry that
	 * is visited is moved to the tail, so the walk is complete when
	 * "pvf" shows up at the head again.
	 */
4902 	pv = pvf;
4903 	do {
4904 		pmap = PV_PMAP(pv);
4905 		PMAP_LOCK(pmap);
4906 		pde = pmap_pde(pmap, pv->pv_va);
4907 		if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
4908 			/*
4909 			 * Although "*pde" is mapping a 2/4MB page, because
4910 			 * this function is called at a 4KB page granularity,
4911 			 * we only update the 4KB page under test.
4912 			 */
4913 			vm_page_dirty(m);
4914 		}
4915 		if ((*pde & PG_A) != 0) {
4916 			/*
4917 			 * Since this reference bit is shared by either 1024
4918 			 * or 512 4KB pages, it should not be cleared every
4919 			 * time it is tested.  Apply a simple "hash" function
4920 			 * on the physical page number, the virtual superpage
4921 			 * number, and the pmap address to select one 4KB page
4922 			 * out of the 1024 or 512 on which testing the
4923 			 * reference bit will result in clearing that bit.
4924 			 * This function is designed to avoid the selection of
4925 			 * the same 4KB page for every 2- or 4MB page mapping.
4926 			 *
4927 			 * On demotion, a mapping that hasn't been referenced
4928 			 * is simply destroyed.  To avoid the possibility of a
4929 			 * subsequent page fault on a demoted wired mapping,
4930 			 * always leave its reference bit set.  Moreover,
4931 			 * since the superpage is wired, the current state of
4932 			 * its reference bit won't affect page replacement.
4933 			 */
4934 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
4935 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
4936 			    (*pde & PG_W) == 0) {
4937 				atomic_clear_int((u_int *)pde, PG_A);
4938 				pmap_invalidate_page(pmap, pv->pv_va);
4939 			}
			/*
			 * The reference is counted whether or not PG_A was
			 * actually cleared above.
			 */
4940 			rtval++;
4941 		}
4942 		PMAP_UNLOCK(pmap);
4943 		/* Rotate the PV list if it has more than one entry. */
4944 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4945 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4946 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4947 		}
		/* Enough references found: skip the 4KB mappings entirely. */
4948 		if (rtval >= PMAP_TS_REFERENCED_MAX)
4949 			goto out;
4950 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4951 small_mappings:
	/* Same rotate-until-the-old-head-returns walk over 4KB mappings. */
4952 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4953 		goto out;
4954 	pv = pvf;
4955 	do {
4956 		pmap = PV_PMAP(pv);
4957 		PMAP_LOCK(pmap);
4958 		pde = pmap_pde(pmap, pv->pv_va);
4959 		KASSERT((*pde & PG_PS) == 0,
4960 		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
4961 		    m));
4962 		pte = pmap_pte_quick(pmap, pv->pv_va);
		/* Opportunistically record a dirty page while we are here. */
4963 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4964 			vm_page_dirty(m);
		/* A 4KB mapping's reference bit is always cleared when set. */
4965 		if ((*pte & PG_A) != 0) {
4966 			atomic_clear_int((u_int *)pte, PG_A);
4967 			pmap_invalidate_page(pmap, pv->pv_va);
4968 			rtval++;
4969 		}
4970 		PMAP_UNLOCK(pmap);
4971 		/* Rotate the PV list if it has more than one entry. */
4972 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
4973 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4974 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4975 		}
4976 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
4977 	    PMAP_TS_REFERENCED_MAX);
4978 out:
4979 	sched_unpin();
4980 	rw_wunlock(&pvh_global_lock);
4981 	return (rtval);
4982 }
4983 
4984 /*
4985  *	Apply the given advice to the specified range of addresses within the
4986  *	given pmap.  Depending on the advice, clear the referenced and/or
4987  *	modified flags in each mapping and set the mapped page's dirty field.
4988  */
4989 void
pmap_advise(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,int advice)4990 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4991 {
4992 	pd_entry_t oldpde, *pde;
4993 	pt_entry_t *pte;
4994 	vm_offset_t va, pdnxt;
4995 	vm_page_t m;
4996 	boolean_t anychanged, pv_lists_locked;
4997 
	/* Any advice other than MADV_DONTNEED or MADV_FREE is ignored. */
4998 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
4999 		return;
	/*
	 * For the current pmap, pvh_global_lock is not taken up front; it is
	 * acquired below only if a managed 2/4MB mapping is encountered.
	 * For any other pmap it is taken, and the thread pinned, before the
	 * pmap lock.
	 */
5000 	if (pmap_is_current(pmap))
5001 		pv_lists_locked = FALSE;
5002 	else {
5003 		pv_lists_locked = TRUE;
	/*
	 * "resume" is also entered by goto after a failed try-lock below.
	 * The pmap lock has been dropped at that point, so it is safe to
	 * block here.  "sva" is not reset, so the scan continues from where
	 * it stopped.
	 */
5004 resume:
5005 		rw_wlock(&pvh_global_lock);
5006 		sched_pin();
5007 	}
5008 	anychanged = FALSE;
5009 	PMAP_LOCK(pmap);
5010 	for (; sva < eva; sva = pdnxt) {
		/*
		 * "pdnxt" is the start of the next page-directory-sized
		 * region.  If the addition wrapped around, clamp it to "eva".
		 */
5011 		pdnxt = (sva + NBPDR) & ~PDRMASK;
5012 		if (pdnxt < sva)
5013 			pdnxt = eva;
5014 		pde = pmap_pde(pmap, sva);
5015 		oldpde = *pde;
5016 		if ((oldpde & PG_V) == 0)
5017 			continue;
5018 		else if ((oldpde & PG_PS) != 0) {
5019 			if ((oldpde & PG_MANAGED) == 0)
5020 				continue;
5021 			if (!pv_lists_locked) {
5022 				pv_lists_locked = TRUE;
				/*
				 * Only a non-blocking attempt is made while
				 * the pmap lock is held.  On failure, flush
				 * what has been changed so far, drop the
				 * pmap lock and restart at "resume".
				 */
5023 				if (!rw_try_wlock(&pvh_global_lock)) {
5024 					if (anychanged)
5025 						pmap_invalidate_all(pmap);
5026 					PMAP_UNLOCK(pmap);
5027 					goto resume;
5028 				}
5029 				sched_pin();
5030 			}
5031 			if (!pmap_demote_pde(pmap, pde, sva)) {
5032 				/*
5033 				 * The large page mapping was destroyed.
5034 				 */
5035 				continue;
5036 			}
5037 
5038 			/*
5039 			 * Unless the page mappings are wired, remove the
5040 			 * mapping to a single page so that a subsequent
5041 			 * access may repromote.  Since the underlying page
5042 			 * table page is fully populated, this removal never
5043 			 * frees a page table page.
5044 			 */
5045 			if ((oldpde & PG_W) == 0) {
5046 				pte = pmap_pte_quick(pmap, sva);
5047 				KASSERT((*pte & PG_V) != 0,
5048 				    ("pmap_advise: invalid PTE"));
5049 				pmap_remove_pte(pmap, pte, sva, NULL);
5050 				anychanged = TRUE;
5051 			}
5052 		}
5053 		if (pdnxt > eva)
5054 			pdnxt = eva;
		/*
		 * "va" is the start of a pending run of PG_G mappings whose
		 * bits were cleared and that still need a ranged
		 * invalidation.  "va == pdnxt" means that no run is pending.
		 */
5055 		va = pdnxt;
5056 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
5057 		    sva += PAGE_SIZE) {
5058 			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
5059 				goto maybe_invlrng;
5060 			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5061 				if (advice == MADV_DONTNEED) {
5062 					/*
5063 					 * Future calls to pmap_is_modified()
5064 					 * can be avoided by making the page
5065 					 * dirty now.
5066 					 */
5067 					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
5068 					vm_page_dirty(m);
5069 				}
5070 				atomic_clear_int((u_int *)pte, PG_M | PG_A);
5071 			} else if ((*pte & PG_A) != 0)
5072 				atomic_clear_int((u_int *)pte, PG_A);
5073 			else
5074 				goto maybe_invlrng;
			/*
			 * A bit was cleared.  A PG_G mapping starts or
			 * extends the pending run; any other mapping is
			 * covered by the pmap_invalidate_all() at the end.
			 */
5075 			if ((*pte & PG_G) != 0) {
5076 				if (va == pdnxt)
5077 					va = sva;
5078 			} else
5079 				anychanged = TRUE;
5080 			continue;
			/* Nothing changed for this PTE: flush any pending run. */
5081 maybe_invlrng:
5082 			if (va != pdnxt) {
5083 				pmap_invalidate_range(pmap, va, sva);
5084 				va = pdnxt;
5085 			}
5086 		}
		/* Flush a run that extends to the end of this region. */
5087 		if (va != pdnxt)
5088 			pmap_invalidate_range(pmap, va, sva);
5089 	}
5090 	if (anychanged)
5091 		pmap_invalidate_all(pmap);
5092 	if (pv_lists_locked) {
5093 		sched_unpin();
5094 		rw_wunlock(&pvh_global_lock);
5095 	}
5096 	PMAP_UNLOCK(pmap);
5097 }
5098 
5099 /*
5100  *	Clear the modify bits on the specified physical page.
5101  */
5102 void
pmap_clear_modify(vm_page_t m)5103 pmap_clear_modify(vm_page_t m)
5104 {
5105 	struct md_page *pvh;
5106 	pv_entry_t next_pv, pv;
5107 	pmap_t pmap;
5108 	pd_entry_t oldpde, *pde;
5109 	pt_entry_t oldpte, *pte;
5110 	vm_offset_t va;
5111 
5112 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5113 	    ("pmap_clear_modify: page %p is not managed", m));
5114 	VM_OBJECT_ASSERT_WLOCKED(m->object);
5115 	KASSERT(!vm_page_xbusied(m),
5116 	    ("pmap_clear_modify: page %p is exclusive busied", m));
5117 
5118 	/*
5119 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
5120 	 * If the object containing the page is locked and the page is not
5121 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
5122 	 */
5123 	if ((m->aflags & PGA_WRITEABLE) == 0)
5124 		return;
5125 	rw_wlock(&pvh_global_lock);
5126 	sched_pin();
	/* Fictitious pages skip the 2/4MB PV list. */
5127 	if ((m->flags & PG_FICTITIOUS) != 0)
5128 		goto small_mappings;
5129 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	/*
	 * Pass 1: 2/4MB mappings.  Each writable one is demoted.  The
	 * "safe" iterator is used; presumably demotion can take "pv" off
	 * this list -- TODO confirm against pmap_demote_pde().
	 */
5130 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5131 		va = pv->pv_va;
5132 		pmap = PV_PMAP(pv);
5133 		PMAP_LOCK(pmap);
5134 		pde = pmap_pde(pmap, va);
5135 		oldpde = *pde;
5136 		if ((oldpde & PG_RW) != 0) {
5137 			if (pmap_demote_pde(pmap, pde, va)) {
5138 				if ((oldpde & PG_W) == 0) {
5139 					/*
5140 					 * Write protect the mapping to a
5141 					 * single page so that a subsequent
5142 					 * write access may repromote.
5143 					 */
					/*
					 * Advance "va" by the offset of "m"
					 * within the old 2/4MB frame, giving
					 * the address of the 4KB mapping of
					 * "m" itself.
					 */
5144 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
5145 					    PG_PS_FRAME);
5146 					pte = pmap_pte_quick(pmap, va);
5147 					oldpte = *pte;
5148 					if ((oldpte & PG_V) != 0) {
5149 						/*
5150 						 * Regardless of whether a pte is 32 or 64 bits
5151 						 * in size, PG_RW and PG_M are among the least
5152 						 * significant 32 bits.
5153 						 */
						/*
						 * Retry with a freshly read
						 * PTE until the
						 * compare-and-set succeeds.
						 */
5154 						while (!atomic_cmpset_int((u_int *)pte,
5155 						    oldpte,
5156 						    oldpte & ~(PG_M | PG_RW)))
5157 							oldpte = *pte;
5158 						vm_page_dirty(m);
5159 						pmap_invalidate_page(pmap, va);
5160 					}
5161 				}
5162 			}
5163 		}
5164 		PMAP_UNLOCK(pmap);
5165 	}
5166 small_mappings:
	/*
	 * Pass 2: 4KB mappings.  PG_M is cleared only where the mapping is
	 * both modified and writable; PG_RW is left untouched.
	 */
5167 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5168 		pmap = PV_PMAP(pv);
5169 		PMAP_LOCK(pmap);
5170 		pde = pmap_pde(pmap, pv->pv_va);
5171 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
5172 		    " a 4mpage in page %p's pv list", m));
5173 		pte = pmap_pte_quick(pmap, pv->pv_va);
5174 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5175 			/*
5176 			 * Regardless of whether a pte is 32 or 64 bits
5177 			 * in size, PG_M is among the least significant
5178 			 * 32 bits.
5179 			 */
5180 			atomic_clear_int((u_int *)pte, PG_M);
5181 			pmap_invalidate_page(pmap, pv->pv_va);
5182 		}
5183 		PMAP_UNLOCK(pmap);
5184 	}
5185 	sched_unpin();
5186 	rw_wunlock(&pvh_global_lock);
5187 }
5188 
5189 /*
5190  * Miscellaneous support routines follow
5191  */
5192 
5193 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
5194 static __inline void
pmap_pte_attr(pt_entry_t * pte,int cache_bits)5195 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
5196 {
5197 	u_int opte, npte;
5198 
5199 	/*
5200 	 * The cache mode bits are all in the low 32-bits of the
5201 	 * PTE, so we can just spin on updating the low 32-bits.
5202 	 */
5203 	do {
5204 		opte = *(u_int *)pte;
5205 		npte = opte & ~PG_PTE_CACHE;
5206 		npte |= cache_bits;
5207 	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
5208 }
5209 
5210 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
5211 static __inline void
pmap_pde_attr(pd_entry_t * pde,int cache_bits)5212 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
5213 {
5214 	u_int opde, npde;
5215 
5216 	/*
5217 	 * The cache mode bits are all in the low 32-bits of the
5218 	 * PDE, so we can just spin on updating the low 32-bits.
5219 	 */
5220 	do {
5221 		opde = *(u_int *)pde;
5222 		npde = opde & ~PG_PDE_CACHE;
5223 		npde |= cache_bits;
5224 	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
5225 }
5226 
5227 /*
5228  * Map a set of physical memory pages into the kernel virtual
5229  * address space. Return a pointer to where it is mapped. This
5230  * routine is intended to be used for mapping device memory,
5231  * NOT real memory.
5232  */
5233 static void *
pmap_mapdev_internal(vm_paddr_t pa,vm_size_t size,int mode,int flags)5234 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags)
5235 {
5236 	struct pmap_preinit_mapping *ppim;
5237 	vm_offset_t va, offset;
5238 	vm_page_t m;
5239 	vm_size_t tmpsize;
5240 	int i;
5241 
5242 	offset = pa & PAGE_MASK;
5243 	size = round_page(offset + size);
5244 	pa = pa & PG_FRAME;
5245 
5246 	if (pa < KERNLOAD && pa + size <= KERNLOAD) {
5247 		va = KERNBASE + pa;
5248 		if ((flags & MAPDEV_SETATTR) == 0)
5249 			return ((void *)(va + offset));
5250 	} else if (!pmap_initialized) {
5251 		va = 0;
5252 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5253 			ppim = pmap_preinit_mapping + i;
5254 			if (ppim->va == 0) {
5255 				ppim->pa = pa;
5256 				ppim->sz = size;
5257 				ppim->mode = mode;
5258 				ppim->va = virtual_avail;
5259 				virtual_avail += size;
5260 				va = ppim->va;
5261 				break;
5262 			}
5263 		}
5264 		if (va == 0)
5265 			panic("%s: too many preinit mappings", __func__);
5266 	} else {
5267 		/*
5268 		 * If we have a preinit mapping, re-use it.
5269 		 */
5270 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5271 			ppim = pmap_preinit_mapping + i;
5272 			if (ppim->pa == pa && ppim->sz == size &&
5273 			    (ppim->mode == mode ||
5274 			    (flags & MAPDEV_SETATTR) == 0))
5275 				return ((void *)(ppim->va + offset));
5276 		}
5277 		va = kva_alloc(size);
5278 		if (va == 0)
5279 			panic("%s: Couldn't allocate KVA", __func__);
5280 	}
5281 	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) {
5282 		if ((flags & MAPDEV_SETATTR) == 0 && pmap_initialized) {
5283 			m = PHYS_TO_VM_PAGE(pa);
5284 			if (m != NULL && VM_PAGE_TO_PHYS(m) == pa) {
5285 				pmap_kenter_attr(va + tmpsize, pa + tmpsize,
5286 				    m->md.pat_mode);
5287 				continue;
5288 			}
5289 		}
5290 		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
5291 	}
5292 	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
5293 	pmap_invalidate_cache_range(va, va + size, FALSE);
5294 	return ((void *)(va + offset));
5295 }
5296 
5297 void *
pmap_mapdev_attr(vm_paddr_t pa,vm_size_t size,int mode)5298 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
5299 {
5300 
5301 	return (pmap_mapdev_internal(pa, size, mode, MAPDEV_SETATTR));
5302 }
5303 
5304 void *
pmap_mapdev(vm_paddr_t pa,vm_size_t size)5305 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
5306 {
5307 
5308 	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
5309 }
5310 
5311 void *
pmap_mapbios(vm_paddr_t pa,vm_size_t size)5312 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5313 {
5314 
5315 	return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 0));
5316 }
5317 
5318 void
pmap_unmapdev(vm_offset_t va,vm_size_t size)5319 pmap_unmapdev(vm_offset_t va, vm_size_t size)
5320 {
5321 	struct pmap_preinit_mapping *ppim;
5322 	vm_offset_t offset;
5323 	int i;
5324 
5325 	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
5326 		return;
5327 	offset = va & PAGE_MASK;
5328 	size = round_page(offset + size);
5329 	va = trunc_page(va);
5330 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5331 		ppim = pmap_preinit_mapping + i;
5332 		if (ppim->va == va && ppim->sz == size) {
5333 			if (pmap_initialized)
5334 				return;
5335 			ppim->pa = 0;
5336 			ppim->va = 0;
5337 			ppim->sz = 0;
5338 			ppim->mode = 0;
5339 			if (va + size == virtual_avail)
5340 				virtual_avail = va;
5341 			return;
5342 		}
5343 	}
5344 	if (pmap_initialized)
5345 		kva_free(va, size);
5346 }
5347 
5348 /*
5349  * Sets the memory attribute for the specified page.
5350  */
5351 void
pmap_page_set_memattr(vm_page_t m,vm_memattr_t ma)5352 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5353 {
5354 
5355 	m->md.pat_mode = ma;
5356 	if ((m->flags & PG_FICTITIOUS) != 0)
5357 		return;
5358 
5359 	/*
5360 	 * If "m" is a normal page, flush it from the cache.
5361 	 * See pmap_invalidate_cache_range().
5362 	 *
5363 	 * First, try to find an existing mapping of the page by sf
5364 	 * buffer. sf_buf_invalidate_cache() modifies mapping and
5365 	 * flushes the cache.
5366 	 */
5367 	if (sf_buf_invalidate_cache(m))
5368 		return;
5369 
5370 	/*
5371 	 * If page is not mapped by sf buffer, but CPU does not
5372 	 * support self snoop, map the page transient and do
5373 	 * invalidation. In the worst case, whole cache is flushed by
5374 	 * pmap_invalidate_cache_range().
5375 	 */
5376 	if ((cpu_feature & CPUID_SS) == 0)
5377 		pmap_flush_page(m);
5378 }
5379 
5380 static void
pmap_flush_page(vm_page_t m)5381 pmap_flush_page(vm_page_t m)
5382 {
5383 	pt_entry_t *cmap_pte2;
5384 	struct pcpu *pc;
5385 	vm_offset_t sva, eva;
5386 	bool useclflushopt;
5387 
5388 	useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
5389 	if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) {
5390 		sched_pin();
5391 		pc = get_pcpu();
5392 		cmap_pte2 = pc->pc_cmap_pte2;
5393 		mtx_lock(&pc->pc_cmap_lock);
5394 		if (*cmap_pte2)
5395 			panic("pmap_flush_page: CMAP2 busy");
5396 		*cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
5397 		    PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
5398 		invlcaddr(pc->pc_cmap_addr2);
5399 		sva = (vm_offset_t)pc->pc_cmap_addr2;
5400 		eva = sva + PAGE_SIZE;
5401 
5402 		/*
5403 		 * Use mfence or sfence despite the ordering implied by
5404 		 * mtx_{un,}lock() because clflush on non-Intel CPUs
5405 		 * and clflushopt are not guaranteed to be ordered by
5406 		 * any other instruction.
5407 		 */
5408 		if (useclflushopt)
5409 			sfence();
5410 		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
5411 			mfence();
5412 		for (; sva < eva; sva += cpu_clflush_line_size) {
5413 			if (useclflushopt)
5414 				clflushopt(sva);
5415 			else
5416 				clflush(sva);
5417 		}
5418 		if (useclflushopt)
5419 			sfence();
5420 		else if (cpu_vendor_id != CPU_VENDOR_INTEL)
5421 			mfence();
5422 		*cmap_pte2 = 0;
5423 		sched_unpin();
5424 		mtx_unlock(&pc->pc_cmap_lock);
5425 	} else
5426 		pmap_invalidate_cache();
5427 }
5428 
5429 /*
5430  * Changes the specified virtual address range's memory type to that given by
5431  * the parameter "mode".  The specified virtual address range must be
5432  * completely contained within either the kernel map.
5433  *
5434  * Returns zero if the change completed successfully, and either EINVAL or
5435  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5436  * of the virtual address range was not mapped, and ENOMEM is returned if
5437  * there was insufficient memory available to complete the change.
5438  */
5439 int
pmap_change_attr(vm_offset_t va,vm_size_t size,int mode)5440 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5441 {
5442 	vm_offset_t base, offset, tmpva;
5443 	pd_entry_t *pde;
5444 	pt_entry_t *pte;
5445 	int cache_bits_pte, cache_bits_pde;
5446 	boolean_t changed;
5447 
5448 	base = trunc_page(va);
5449 	offset = va & PAGE_MASK;
5450 	size = round_page(offset + size);
5451 
5452 	/*
5453 	 * Only supported on kernel virtual addresses above the recursive map.
5454 	 */
5455 	if (base < VM_MIN_KERNEL_ADDRESS)
5456 		return (EINVAL);
5457 
5458 	cache_bits_pde = pmap_cache_bits(mode, 1);
5459 	cache_bits_pte = pmap_cache_bits(mode, 0);
5460 	changed = FALSE;
5461 
5462 	/*
5463 	 * Pages that aren't mapped aren't supported.  Also break down
5464 	 * 2/4MB pages into 4KB pages if required.
5465 	 */
5466 	PMAP_LOCK(kernel_pmap);
5467 	for (tmpva = base; tmpva < base + size; ) {
5468 		pde = pmap_pde(kernel_pmap, tmpva);
5469 		if (*pde == 0) {
5470 			PMAP_UNLOCK(kernel_pmap);
5471 			return (EINVAL);
5472 		}
5473 		if (*pde & PG_PS) {
5474 			/*
5475 			 * If the current 2/4MB page already has
5476 			 * the required memory type, then we need not
5477 			 * demote this page.  Just increment tmpva to
5478 			 * the next 2/4MB page frame.
5479 			 */
5480 			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
5481 				tmpva = trunc_4mpage(tmpva) + NBPDR;
5482 				continue;
5483 			}
5484 
5485 			/*
5486 			 * If the current offset aligns with a 2/4MB
5487 			 * page frame and there is at least 2/4MB left
5488 			 * within the range, then we need not break
5489 			 * down this page into 4KB pages.
5490 			 */
5491 			if ((tmpva & PDRMASK) == 0 &&
5492 			    tmpva + PDRMASK < base + size) {
5493 				tmpva += NBPDR;
5494 				continue;
5495 			}
5496 			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
5497 				PMAP_UNLOCK(kernel_pmap);
5498 				return (ENOMEM);
5499 			}
5500 		}
5501 		pte = vtopte(tmpva);
5502 		if (*pte == 0) {
5503 			PMAP_UNLOCK(kernel_pmap);
5504 			return (EINVAL);
5505 		}
5506 		tmpva += PAGE_SIZE;
5507 	}
5508 	PMAP_UNLOCK(kernel_pmap);
5509 
5510 	/*
5511 	 * Ok, all the pages exist, so run through them updating their
5512 	 * cache mode if required.
5513 	 */
5514 	for (tmpva = base; tmpva < base + size; ) {
5515 		pde = pmap_pde(kernel_pmap, tmpva);
5516 		if (*pde & PG_PS) {
5517 			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
5518 				pmap_pde_attr(pde, cache_bits_pde);
5519 				changed = TRUE;
5520 			}
5521 			tmpva = trunc_4mpage(tmpva) + NBPDR;
5522 		} else {
5523 			pte = vtopte(tmpva);
5524 			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
5525 				pmap_pte_attr(pte, cache_bits_pte);
5526 				changed = TRUE;
5527 			}
5528 			tmpva += PAGE_SIZE;
5529 		}
5530 	}
5531 
5532 	/*
5533 	 * Flush CPU caches to make sure any data isn't cached that
5534 	 * shouldn't be, etc.
5535 	 */
5536 	if (changed) {
5537 		pmap_invalidate_range(kernel_pmap, base, tmpva);
5538 		pmap_invalidate_cache_range(base, tmpva, FALSE);
5539 	}
5540 	return (0);
5541 }
5542 
5543 /*
5544  * perform the pmap work for mincore
5545  */
5546 int
pmap_mincore(pmap_t pmap,vm_offset_t addr,vm_paddr_t * locked_pa)5547 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5548 {
5549 	pd_entry_t *pdep;
5550 	pt_entry_t *ptep, pte;
5551 	vm_paddr_t pa;
5552 	int val;
5553 
5554 	PMAP_LOCK(pmap);
5555 retry:
5556 	pdep = pmap_pde(pmap, addr);
5557 	if (*pdep != 0) {
5558 		if (*pdep & PG_PS) {
5559 			pte = *pdep;
5560 			/* Compute the physical address of the 4KB page. */
5561 			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
5562 			    PG_FRAME;
5563 			val = MINCORE_SUPER;
5564 		} else {
5565 			ptep = pmap_pte(pmap, addr);
5566 			pte = *ptep;
5567 			pmap_pte_release(ptep);
5568 			pa = pte & PG_FRAME;
5569 			val = 0;
5570 		}
5571 	} else {
5572 		pte = 0;
5573 		pa = 0;
5574 		val = 0;
5575 	}
5576 	if ((pte & PG_V) != 0) {
5577 		val |= MINCORE_INCORE;
5578 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5579 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5580 		if ((pte & PG_A) != 0)
5581 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5582 	}
5583 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5584 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5585 	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5586 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
5587 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
5588 			goto retry;
5589 	} else
5590 		PA_UNLOCK_COND(*locked_pa);
5591 	PMAP_UNLOCK(pmap);
5592 	return (val);
5593 }
5594 
5595 void
pmap_activate(struct thread * td)5596 pmap_activate(struct thread *td)
5597 {
5598 	pmap_t	pmap, oldpmap;
5599 	u_int	cpuid;
5600 	u_int32_t  cr3;
5601 
5602 	critical_enter();
5603 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
5604 	oldpmap = PCPU_GET(curpmap);
5605 	cpuid = PCPU_GET(cpuid);
5606 #if defined(SMP)
5607 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
5608 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5609 #else
5610 	CPU_CLR(cpuid, &oldpmap->pm_active);
5611 	CPU_SET(cpuid, &pmap->pm_active);
5612 #endif
5613 #if defined(PAE) || defined(PAE_TABLES)
5614 	cr3 = vtophys(pmap->pm_pdpt);
5615 #else
5616 	cr3 = vtophys(pmap->pm_pdir);
5617 #endif
5618 	/*
5619 	 * pmap_activate is for the current thread on the current cpu
5620 	 */
5621 	td->td_pcb->pcb_cr3 = cr3;
5622 	load_cr3(cr3);
5623 	PCPU_SET(curpmap, pmap);
5624 	critical_exit();
5625 }
5626 
5627 void
pmap_activate_boot(pmap_t pmap)5628 pmap_activate_boot(pmap_t pmap)
5629 {
5630 	u_int cpuid;
5631 
5632 	cpuid = PCPU_GET(cpuid);
5633 #if defined(SMP)
5634 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
5635 #else
5636 	CPU_SET(cpuid, &pmap->pm_active);
5637 #endif
5638 	PCPU_SET(curpmap, pmap);
5639 }
5640 
5641 void
pmap_sync_icache(pmap_t pm,vm_offset_t va,vm_size_t sz)5642 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5643 {
5644 }
5645 
5646 /*
5647  *	Increase the starting virtual address of the given mapping if a
5648  *	different alignment might result in more superpage mappings.
5649  */
5650 void
pmap_align_superpage(vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t size)5651 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5652     vm_offset_t *addr, vm_size_t size)
5653 {
5654 	vm_offset_t superpage_offset;
5655 
5656 	if (size < NBPDR)
5657 		return;
5658 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5659 		offset += ptoa(object->pg_color);
5660 	superpage_offset = offset & PDRMASK;
5661 	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
5662 	    (*addr & PDRMASK) == superpage_offset)
5663 		return;
5664 	if ((*addr & PDRMASK) < superpage_offset)
5665 		*addr = (*addr & ~PDRMASK) + superpage_offset;
5666 	else
5667 		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
5668 }
5669 
5670 vm_offset_t
pmap_quick_enter_page(vm_page_t m)5671 pmap_quick_enter_page(vm_page_t m)
5672 {
5673 	vm_offset_t qaddr;
5674 	pt_entry_t *pte;
5675 
5676 	critical_enter();
5677 	qaddr = PCPU_GET(qmap_addr);
5678 	pte = vtopte(qaddr);
5679 
5680 	KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy"));
5681 	*pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M |
5682 	    pmap_cache_bits(pmap_page_get_memattr(m), 0);
5683 	invlpg(qaddr);
5684 
5685 	return (qaddr);
5686 }
5687 
5688 void
pmap_quick_remove_page(vm_offset_t addr)5689 pmap_quick_remove_page(vm_offset_t addr)
5690 {
5691 	vm_offset_t qaddr;
5692 	pt_entry_t *pte;
5693 
5694 	qaddr = PCPU_GET(qmap_addr);
5695 	pte = vtopte(qaddr);
5696 
5697 	KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use"));
5698 	KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address"));
5699 
5700 	*pte = 0;
5701 	critical_exit();
5702 }
5703 
#if defined(PMAP_DEBUG)
/*
 * Debugging aid: print the valid page table entries that map addresses
 * below VM_MIN_KERNEL_ADDRESS in the process whose id is "pid", two
 * entries per output line, and return the number of entries printed.
 * The walk ends at the first address at or above VM_MIN_KERNEL_ADDRESS.
 *
 * NOTE(review): the pmap itself is not locked here; only allproc_lock is
 * held.  Confirm that this is acceptable for the intended debugging use.
 */
int
pmap_pid_dump(int pid)
{
	pmap_t pmap;
	struct proc *p;
	pd_entry_t *pde;
	pt_entry_t *pte, pteval;
	vm_offset_t base, va;
	vm_page_t m;
	int i, j, index, npte, valid;

	npte = 0;
	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		if (p->p_pid != pid || p->p_vmspace == NULL)
			continue;

		index = 0;
		pmap = vmspace_pmap(p->p_vmspace);
		for (i = 0; i < NPDEPTD; i++) {
			/*
			 * Widen before shifting: "i << PDRSHIFT" evaluated
			 * in signed int can overflow for the upper entries.
			 */
			base = (vm_offset_t)i << PDRSHIFT;
			pde = &pmap->pm_pdir[i];
			if (!pmap_pde_v(pde))
				continue;
			for (j = 0; j < NPTEPG; j++) {
				va = base + ((vm_offset_t)j << PAGE_SHIFT);
				if (va >= (vm_offset_t)VM_MIN_KERNEL_ADDRESS) {
					/* Finish a half-filled output line. */
					if (index)
						printf("\n");
					sx_sunlock(&allproc_lock);
					return (npte);
				}
				pte = pmap_pte(pmap, va);
				if (pte == NULL)
					continue;
				/*
				 * Copy what is needed and release the
				 * mapping right away, as every pmap_pte()
				 * must be paired with pmap_pte_release().
				 */
				valid = pmap_pte_v(pte);
				pteval = *pte;
				pmap_pte_release(pte);
				if (!valid)
					continue;

				/*
				 * A PTE may be wider than int, so print it
				 * through an explicit 64-bit conversion.
				 * Not every frame has a vm_page; omit the
				 * page fields when there is none.
				 */
				m = PHYS_TO_VM_PAGE(pteval & PG_FRAME);
				if (m != NULL)
					printf("va: 0x%x, pt: 0x%llx, "
					    "h: %d, w: %d, f: 0x%x",
					    va, (unsigned long long)pteval,
					    m->hold_count, m->wire_count,
					    m->flags);
				else
					printf("va: 0x%x, pt: 0x%llx",
					    va, (unsigned long long)pteval);
				npte++;
				index++;
				if (index >= 2) {
					index = 0;
					printf("\n");
				} else {
					printf(" ");
				}
			}
		}
	}
	sx_sunlock(&allproc_lock);
	return (npte);
}
#endif
5764