1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2002-2006 Rice University
5 * Copyright (c) 2007 Alan L. Cox <alc@cs.rice.edu>
6 * All rights reserved.
7 *
8 * This software was developed for the FreeBSD Project by Alan L. Cox,
9 * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
26 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
27 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
28 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
30 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 /*
35 * Physical memory system implementation
36 *
37 * Any external functions defined by this module are only to be used by the
38 * virtual memory system.
39 */
40
41 #include <sys/cdefs.h>
42 #include "opt_ddb.h"
43 #include "opt_vm.h"
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/domainset.h>
48 #include <sys/lock.h>
49 #include <sys/kernel.h>
50 #include <sys/kthread.h>
51 #include <sys/malloc.h>
52 #include <sys/mutex.h>
53 #include <sys/proc.h>
54 #include <sys/queue.h>
55 #include <sys/rwlock.h>
56 #include <sys/sbuf.h>
57 #include <sys/sched.h>
58 #include <sys/sysctl.h>
59 #include <sys/tree.h>
60 #include <sys/tslog.h>
61 #include <sys/unistd.h>
62 #include <sys/vmmeter.h>
63
64 #include <ddb/ddb.h>
65
66 #include <vm/vm.h>
67 #include <vm/vm_extern.h>
68 #include <vm/vm_param.h>
69 #include <vm/vm_kern.h>
70 #include <vm/vm_page.h>
71 #include <vm/vm_phys.h>
72 #include <vm/vm_pagequeue.h>
73
/*
 * Compile-time sanity checks on integer widths: a long must hold at least
 * VM_PHYSSEG_MAX bits (sizeof(long) * NBBY), and vm_paddr_t must be no wider
 * than long long so that ffsll() and flsll() can be applied to it.
 */
_Static_assert(sizeof(long) * NBBY >= VM_PHYSSEG_MAX,
    "Too many physsegs.");
_Static_assert(sizeof(long long) >= sizeof(vm_paddr_t),
    "vm_paddr_t too big for ffsll, flsll.");
78
#ifdef NUMA
/*
 * Affinity table registered by vm_phys_register_domains().  The code in this
 * file scans it until it reaches an entry whose "end" field is zero.
 */
struct mem_affinity __read_mostly *mem_affinity;
/*
 * Locality table registered by vm_phys_register_domains(); read as
 * mem_locality[f * vm_ndomains + t] by vm_phys_mem_affinity().
 */
int __read_mostly *mem_locality;

/* Loaded from the "vm.numa.disabled" tunable in vm_phys_register_domains(). */
static int numa_disabled;
static SYSCTL_NODE(_vm, OID_AUTO, numa, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "NUMA options");
SYSCTL_INT(_vm_numa, OID_AUTO, disabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &numa_disabled, 0, "NUMA-awareness in the allocators is disabled");
#endif
89
/*
 * Number of memory domains.  Remains 1 unless vm_phys_register_domains()
 * registers more than one domain.
 */
int __read_mostly vm_ndomains = 1;
/*
 * Set of all memory domains.  vm_phys_register_domains() adds one bit per
 * domain when the kernel is built with NUMA.
 */
domainset_t __read_mostly all_domains = DOMAINSET_T_INITIALIZER(0x1);

/*
 * Table of physical memory segments and the number of entries in use.
 * _vm_phys_create_seg() inserts entries in ascending address order.
 */
struct vm_phys_seg __read_mostly vm_phys_segs[VM_PHYSSEG_MAX];
int __read_mostly vm_phys_nsegs;
/*
 * NOTE(review): presumably holds segments registered before the allocator is
 * initialized; confirm against the users of these variables.
 */
static struct vm_phys_seg vm_phys_early_segs[8];
static int vm_phys_early_nsegs;
97
/*
 * Fictitious memory ranges are kept in a red-black tree (fict_tree) ordered
 * by vm_phys_fictitious_cmp().  The structure is declared early so that the
 * comparison function's prototype can refer to it.
 */
struct vm_phys_fictitious_seg;
static int vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *,
    struct vm_phys_fictitious_seg *);

RB_HEAD(fict_tree, vm_phys_fictitious_seg) vm_phys_fictitious_tree =
    RB_INITIALIZER(&vm_phys_fictitious_tree);

/*
 * One range of fictitious physical memory.  Lookups treat the range as
 * [start, end); first_page is the vm_page for the address "start", and the
 * page for address pa is first_page[atop(pa - start)].
 */
struct vm_phys_fictitious_seg {
	RB_ENTRY(vm_phys_fictitious_seg) node;
	/* Memory region data */
	vm_paddr_t	start;
	vm_paddr_t	end;
	vm_page_t	first_page;
};

RB_GENERATE_STATIC(fict_tree, vm_phys_fictitious_seg, node,
    vm_phys_fictitious_cmp);

/* Read-locked around lookups in vm_phys_fictitious_tree. */
static struct rwlock_padalign vm_phys_fictitious_reg_lock;
MALLOC_DEFINE(M_FICT_PAGES, "vm_fictitious", "Fictitious VM pages");
118
/*
 * The free page queues, indexed as [domain][free list index][pool][order].
 */
static struct vm_freelist __aligned(CACHE_LINE_SIZE)
    vm_phys_free_queues[MAXMEMDOM][VM_NFREELIST][VM_NFREEPOOL]
    [VM_NFREEORDER_MAX];

/* Number of free lists in use; computed by vm_phys_init(). */
static int __read_mostly vm_nfreelists;
124
/*
 * These "avail lists" are globals used to communicate boot-time physical
 * memory layout to other parts of the kernel.  Each physically contiguous
 * region of memory is defined by a start address at an even index and an
 * end address at the following odd index.  Each list is terminated by a
 * pair of zero entries.
 *
 * dump_avail tells the dump code what regions to include in a crash dump, and
 * phys_avail is all of the remaining physical memory that is available for
 * the vm system.
 *
 * Initially dump_avail and phys_avail are identical.  Boot time memory
 * allocations remove extents from phys_avail that may still be included
 * in dumps.
 */
vm_paddr_t phys_avail[PHYS_AVAIL_COUNT];
vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];

/*
 * Provides the mapping from VM_FREELIST_* to free list indices (flind).
 * The table is filled in by vm_phys_init().  An entry may end up negative,
 * which the allocation functions treat as "free list not present".
 */
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
/*
 * First pool examined when an allocation has to take pages from pools other
 * than the requested one.  Selected by vm_phys_init().
 */
static int __read_mostly vm_default_freepool;

/* The default free list must be free list number 0. */
CTASSERT(VM_FREELIST_DEFAULT == 0);

#ifdef VM_FREELIST_DMA32
/* Physical address 1 << 32 (4 GB); segments ending at or below it are DMA32. */
#define	VM_DMA32_BOUNDARY	((vm_paddr_t)1 << 32)
#endif

/*
 * Enforce the assumptions made by vm_phys_add_seg() and vm_phys_init() about
 * the ordering of the free list boundaries.
 */
#if defined(VM_LOWMEM_BOUNDARY) && defined(VM_DMA32_BOUNDARY)
CTASSERT(VM_LOWMEM_BOUNDARY < VM_DMA32_BOUNDARY);
#endif
162
/* vm.phys_free: read-only string generated by sysctl_vm_phys_free(). */
static int sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_free,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_free, "A",
    "Phys Free Info");

/* vm.phys_segs: read-only string generated by sysctl_vm_phys_segs(). */
static int sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_segs,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_segs, "A",
    "Phys Seg Info");

#ifdef NUMA
/* vm.phys_locality: read-only string generated by sysctl_vm_phys_locality(). */
static int sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS);
SYSCTL_OID(_vm, OID_AUTO, phys_locality,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_vm_phys_locality, "A",
    "Phys Locality Info");
#endif

/* vm.ndomains: read-only view of vm_ndomains. */
SYSCTL_INT(_vm, OID_AUTO, ndomains, CTLFLAG_RD,
    &vm_ndomains, 0, "Number of physical memory domains available.");

/* Prototypes for static functions that are used before their definitions. */
static void _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain);
static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
    int order, int pool, int tail);
190
191 static bool __diagused
vm_phys_pool_valid(int pool)192 vm_phys_pool_valid(int pool)
193 {
194 #ifdef VM_FREEPOOL_LAZYINIT
195 if (pool == VM_FREEPOOL_LAZYINIT)
196 return (false);
197 #endif
198 return (pool >= 0 && pool < VM_NFREEPOOL);
199 }
200
201 /*
202 * Red-black tree helpers for vm fictitious range management.
203 */
204 static inline int
vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg * p,struct vm_phys_fictitious_seg * range)205 vm_phys_fictitious_in_range(struct vm_phys_fictitious_seg *p,
206 struct vm_phys_fictitious_seg *range)
207 {
208
209 KASSERT(range->start != 0 && range->end != 0,
210 ("Invalid range passed on search for vm_fictitious page"));
211 if (p->start >= range->end)
212 return (1);
213 if (p->start < range->start)
214 return (-1);
215
216 return (0);
217 }
218
219 static int
vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg * p1,struct vm_phys_fictitious_seg * p2)220 vm_phys_fictitious_cmp(struct vm_phys_fictitious_seg *p1,
221 struct vm_phys_fictitious_seg *p2)
222 {
223
224 /* Check if this is a search for a page */
225 if (p1->end == 0)
226 return (vm_phys_fictitious_in_range(p1, p2));
227
228 KASSERT(p2->end != 0,
229 ("Invalid range passed as second parameter to vm fictitious comparison"));
230
231 /* Searching to add a new range */
232 if (p1->end <= p2->start)
233 return (-1);
234 if (p1->start >= p2->end)
235 return (1);
236
237 panic("Trying to add overlapping vm fictitious ranges:\n"
238 "[%#jx:%#jx] and [%#jx:%#jx]", (uintmax_t)p1->start,
239 (uintmax_t)p1->end, (uintmax_t)p2->start, (uintmax_t)p2->end);
240 }
241
242 int
vm_phys_domain_match(int prefer __numa_used,vm_paddr_t low __numa_used,vm_paddr_t high __numa_used)243 vm_phys_domain_match(int prefer __numa_used, vm_paddr_t low __numa_used,
244 vm_paddr_t high __numa_used)
245 {
246 #ifdef NUMA
247 domainset_t mask;
248 int i;
249
250 if (vm_ndomains == 1 || mem_affinity == NULL)
251 return (0);
252
253 DOMAINSET_ZERO(&mask);
254 /*
255 * Check for any memory that overlaps low, high.
256 */
257 for (i = 0; mem_affinity[i].end != 0; i++)
258 if (mem_affinity[i].start <= high &&
259 mem_affinity[i].end >= low)
260 DOMAINSET_SET(mem_affinity[i].domain, &mask);
261 if (prefer != -1 && DOMAINSET_ISSET(prefer, &mask))
262 return (prefer);
263 if (DOMAINSET_EMPTY(&mask))
264 panic("vm_phys_domain_match: Impossible constraint");
265 return (DOMAINSET_FFS(&mask) - 1);
266 #else
267 return (0);
268 #endif
269 }
270
271 /*
272 * Outputs the state of the physical memory allocator, specifically,
273 * the amount of physical memory in each free list.
274 */
275 static int
sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)276 sysctl_vm_phys_free(SYSCTL_HANDLER_ARGS)
277 {
278 struct sbuf sbuf;
279 struct vm_freelist *fl;
280 int dom, error, flind, oind, pind;
281
282 error = sysctl_wire_old_buffer(req, 0);
283 if (error != 0)
284 return (error);
285 sbuf_new_for_sysctl(&sbuf, NULL, 128 * vm_ndomains, req);
286 for (dom = 0; dom < vm_ndomains; dom++) {
287 sbuf_printf(&sbuf,"\nDOMAIN %d:\n", dom);
288 for (flind = 0; flind < vm_nfreelists; flind++) {
289 sbuf_printf(&sbuf, "\nFREE LIST %d:\n"
290 "\n ORDER (SIZE) | NUMBER"
291 "\n ", flind);
292 for (pind = 0; pind < VM_NFREEPOOL; pind++)
293 sbuf_printf(&sbuf, " | POOL %d", pind);
294 sbuf_printf(&sbuf, "\n-- ");
295 for (pind = 0; pind < VM_NFREEPOOL; pind++)
296 sbuf_printf(&sbuf, "-- -- ");
297 sbuf_printf(&sbuf, "--\n");
298 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
299 sbuf_printf(&sbuf, " %2d (%6dK)", oind,
300 1 << (PAGE_SHIFT - 10 + oind));
301 for (pind = 0; pind < VM_NFREEPOOL; pind++) {
302 fl = vm_phys_free_queues[dom][flind][pind];
303 sbuf_printf(&sbuf, " | %6d",
304 fl[oind].lcnt);
305 }
306 sbuf_printf(&sbuf, "\n");
307 }
308 }
309 }
310 error = sbuf_finish(&sbuf);
311 sbuf_delete(&sbuf);
312 return (error);
313 }
314
315 /*
316 * Outputs the set of physical memory segments.
317 */
318 static int
sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)319 sysctl_vm_phys_segs(SYSCTL_HANDLER_ARGS)
320 {
321 struct sbuf sbuf;
322 struct vm_phys_seg *seg;
323 int error, segind;
324
325 error = sysctl_wire_old_buffer(req, 0);
326 if (error != 0)
327 return (error);
328 sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
329 for (segind = 0; segind < vm_phys_nsegs; segind++) {
330 sbuf_printf(&sbuf, "\nSEGMENT %d:\n\n", segind);
331 seg = &vm_phys_segs[segind];
332 sbuf_printf(&sbuf, "start: %#jx\n",
333 (uintmax_t)seg->start);
334 sbuf_printf(&sbuf, "end: %#jx\n",
335 (uintmax_t)seg->end);
336 sbuf_printf(&sbuf, "domain: %d\n", seg->domain);
337 sbuf_printf(&sbuf, "free list: %p\n", seg->free_queues);
338 }
339 error = sbuf_finish(&sbuf);
340 sbuf_delete(&sbuf);
341 return (error);
342 }
343
344 /*
345 * Return affinity, or -1 if there's no affinity information.
346 */
347 int
vm_phys_mem_affinity(int f __numa_used,int t __numa_used)348 vm_phys_mem_affinity(int f __numa_used, int t __numa_used)
349 {
350
351 #ifdef NUMA
352 if (mem_locality == NULL)
353 return (-1);
354 if (f >= vm_ndomains || t >= vm_ndomains)
355 return (-1);
356 return (mem_locality[f * vm_ndomains + t]);
357 #else
358 return (-1);
359 #endif
360 }
361
#ifdef NUMA
/*
 * Sysctl handler that prints the VM locality table: one line per domain,
 * holding the value of vm_phys_mem_affinity() for that domain paired with
 * every domain in turn.
 */
static int
sysctl_vm_phys_locality(SYSCTL_HANDLER_ARGS)
{
	struct sbuf sb;
	int error, from, to;

	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
		return (error);
	sbuf_new_for_sysctl(&sb, NULL, 128, req);

	sbuf_printf(&sb, "\n");

	for (from = 0; from < vm_ndomains; from++) {
		sbuf_printf(&sb, "%d: ", from);
		for (to = 0; to < vm_ndomains; to++)
			sbuf_printf(&sb, "%d ",
			    vm_phys_mem_affinity(from, to));
		sbuf_printf(&sb, "\n");
	}
	error = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (error);
}
#endif
391
392 static void
vm_freelist_add(struct vm_freelist * fl,vm_page_t m,int order,int pool,int tail)393 vm_freelist_add(struct vm_freelist *fl, vm_page_t m, int order, int pool,
394 int tail)
395 {
396
397 m->order = order;
398 m->pool = pool;
399 if (tail)
400 TAILQ_INSERT_TAIL(&fl[order].pl, m, listq);
401 else
402 TAILQ_INSERT_HEAD(&fl[order].pl, m, listq);
403 fl[order].lcnt++;
404 }
405
406 static void
vm_freelist_rem(struct vm_freelist * fl,vm_page_t m,int order)407 vm_freelist_rem(struct vm_freelist *fl, vm_page_t m, int order)
408 {
409
410 TAILQ_REMOVE(&fl[order].pl, m, listq);
411 fl[order].lcnt--;
412 m->order = VM_NFREEORDER;
413 }
414
415 /*
416 * Create a physical memory segment.
417 */
418 static void
_vm_phys_create_seg(vm_paddr_t start,vm_paddr_t end,int domain)419 _vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end, int domain)
420 {
421 struct vm_phys_seg *seg;
422
423 if (!(0 <= domain && domain < vm_ndomains))
424 panic("%s: Invalid domain %d ('vm_ndomains' is %d)",
425 __func__, domain, vm_ndomains);
426 if (vm_phys_nsegs >= VM_PHYSSEG_MAX)
427 panic("Not enough storage for physical segments, "
428 "increase VM_PHYSSEG_MAX");
429
430 seg = &vm_phys_segs[vm_phys_nsegs++];
431 while (seg > vm_phys_segs && seg[-1].start >= end) {
432 *seg = *(seg - 1);
433 seg--;
434 }
435 seg->start = start;
436 seg->end = end;
437 seg->domain = domain;
438 if (seg != vm_phys_segs && seg[-1].end > start)
439 panic("Overlapping physical segments: Current [%#jx,%#jx) "
440 "at index %zu, previous [%#jx,%#jx)",
441 (uintmax_t)start, (uintmax_t)end, seg - vm_phys_segs,
442 (uintmax_t)seg[-1].start, (uintmax_t)seg[-1].end);
443 }
444
445 static void
vm_phys_create_seg(vm_paddr_t start,vm_paddr_t end)446 vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end)
447 {
448 #ifdef NUMA
449 int i;
450
451 if (mem_affinity == NULL) {
452 _vm_phys_create_seg(start, end, 0);
453 return;
454 }
455
456 for (i = 0;; i++) {
457 if (mem_affinity[i].end == 0)
458 panic("Reached end of affinity info");
459 if (mem_affinity[i].end <= start)
460 continue;
461 if (mem_affinity[i].start > start)
462 panic("No affinity info for start %jx",
463 (uintmax_t)start);
464 if (mem_affinity[i].end >= end) {
465 _vm_phys_create_seg(start, end,
466 mem_affinity[i].domain);
467 break;
468 }
469 _vm_phys_create_seg(start, mem_affinity[i].end,
470 mem_affinity[i].domain);
471 start = mem_affinity[i].end;
472 }
473 #else
474 _vm_phys_create_seg(start, end, 0);
475 #endif
476 }
477
478 /*
479 * Add a physical memory segment.
480 */
481 void
vm_phys_add_seg(vm_paddr_t start,vm_paddr_t end)482 vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end)
483 {
484 vm_paddr_t paddr;
485
486 if ((start & PAGE_MASK) != 0)
487 panic("%s: start (%jx) is not page aligned", __func__,
488 (uintmax_t)start);
489 if ((end & PAGE_MASK) != 0)
490 panic("%s: end (%jx) is not page aligned", __func__,
491 (uintmax_t)end);
492 if (start > end)
493 panic("%s: start (%jx) > end (%jx)!", __func__,
494 (uintmax_t)start, (uintmax_t)end);
495
496 if (start == end)
497 return;
498
499 /*
500 * Split the physical memory segment if it spans two or more free
501 * list boundaries.
502 */
503 paddr = start;
504 #ifdef VM_FREELIST_LOWMEM
505 if (paddr < VM_LOWMEM_BOUNDARY && end > VM_LOWMEM_BOUNDARY) {
506 vm_phys_create_seg(paddr, VM_LOWMEM_BOUNDARY);
507 paddr = VM_LOWMEM_BOUNDARY;
508 }
509 #endif
510 #ifdef VM_FREELIST_DMA32
511 if (paddr < VM_DMA32_BOUNDARY && end > VM_DMA32_BOUNDARY) {
512 vm_phys_create_seg(paddr, VM_DMA32_BOUNDARY);
513 paddr = VM_DMA32_BOUNDARY;
514 }
515 #endif
516 vm_phys_create_seg(paddr, end);
517 }
518
/*
 * Initialize the physical memory allocator.
 *
 * Requires that vm_page_array is initialized!
 *
 * The steps are, in order: decide which free lists exist and build
 * vm_freelist_to_flind[]; point every segment at its first vm_page and at its
 * free queues; merge adjacent segments that share free queues; initialize
 * the queue heads; select vm_default_freepool; and initialize the lock that
 * guards lookups in the fictitious range tree.
 */
void
vm_phys_init(void)
{
	struct vm_freelist *fl;
	struct vm_phys_seg *end_seg, *prev_seg, *seg, *tmp_seg;
#if defined(VM_DMA32_NPAGES_THRESHOLD) || defined(VM_PHYSSEG_SPARSE)
	u_long npages;
#endif
	int dom, flind, freelist, oind, pind, segind;

	/*
	 * Compute the number of free lists, and generate the mapping from the
	 * manifest constants VM_FREELIST_* to the free list indices.
	 *
	 * Initially, the entries of vm_freelist_to_flind[] are set to either
	 * 0 or 1 to indicate which free lists should be created.
	 */
#ifdef VM_DMA32_NPAGES_THRESHOLD
	npages = 0;
#endif
	/*
	 * The segments are visited from the last table entry to the first.
	 * npages accumulates the size of every segment assigned to the
	 * default free list so far, which is the quantity that the DMA32
	 * threshold test below examines.
	 */
	for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
		seg = &vm_phys_segs[segind];
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_LOWMEM] = 1;
		else
#endif
#ifdef VM_FREELIST_DMA32
		if (
#ifdef VM_DMA32_NPAGES_THRESHOLD
		    /*
		     * Create the DMA32 free list only if the amount of
		     * physical memory above physical address 4G exceeds the
		     * given threshold.
		     */
		    npages > VM_DMA32_NPAGES_THRESHOLD &&
#endif
		    seg->end <= VM_DMA32_BOUNDARY)
			vm_freelist_to_flind[VM_FREELIST_DMA32] = 1;
		else
#endif
		{
#ifdef VM_DMA32_NPAGES_THRESHOLD
			npages += atop(seg->end - seg->start);
#endif
			vm_freelist_to_flind[VM_FREELIST_DEFAULT] = 1;
		}
	}
	/* Change each entry into a running total of the free lists. */
	for (freelist = 1; freelist < VM_NFREELIST; freelist++) {
		vm_freelist_to_flind[freelist] +=
		    vm_freelist_to_flind[freelist - 1];
	}
	/* The last running total is the number of free lists created. */
	vm_nfreelists = vm_freelist_to_flind[VM_NFREELIST - 1];
	KASSERT(vm_nfreelists > 0, ("vm_phys_init: no free lists"));
	/* Change each entry into a free list index. */
	for (freelist = 0; freelist < VM_NFREELIST; freelist++)
		vm_freelist_to_flind[freelist]--;

	/*
	 * Initialize the first_page and free_queues fields of each physical
	 * memory segment.
	 */
#ifdef VM_PHYSSEG_SPARSE
	/* Running index into vm_page_array of the next segment's first page. */
	npages = 0;
#endif
	for (segind = 0; segind < vm_phys_nsegs; segind++) {
		seg = &vm_phys_segs[segind];
#ifdef VM_PHYSSEG_SPARSE
		seg->first_page = &vm_page_array[npages];
		npages += atop(seg->end - seg->start);
#else
		seg->first_page = PHYS_TO_VM_PAGE(seg->start);
#endif
		/* Pick the free list using the same boundaries as above. */
#ifdef VM_FREELIST_LOWMEM
		if (seg->end <= VM_LOWMEM_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_LOWMEM];
			KASSERT(flind >= 0,
			    ("vm_phys_init: LOWMEM flind < 0"));
		} else
#endif
#ifdef VM_FREELIST_DMA32
		if (seg->end <= VM_DMA32_BOUNDARY) {
			flind = vm_freelist_to_flind[VM_FREELIST_DMA32];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DMA32 flind < 0"));
		} else
#endif
		{
			flind = vm_freelist_to_flind[VM_FREELIST_DEFAULT];
			KASSERT(flind >= 0,
			    ("vm_phys_init: DEFAULT flind < 0"));
		}
		seg->free_queues = &vm_phys_free_queues[seg->domain][flind];
	}

	/*
	 * Coalesce physical memory segments that are contiguous and share the
	 * same per-domain free queues.
	 */
	prev_seg = vm_phys_segs;
	seg = &vm_phys_segs[1];
	end_seg = &vm_phys_segs[vm_phys_nsegs];
	while (seg < end_seg) {
		if (prev_seg->end == seg->start &&
		    prev_seg->free_queues == seg->free_queues) {
			/* Absorb "seg" into "prev_seg". */
			prev_seg->end = seg->end;
			KASSERT(prev_seg->domain == seg->domain,
			    ("vm_phys_init: free queues cannot span domains"));
			vm_phys_nsegs--;
			end_seg--;
			/*
			 * Close the gap by moving the following entries down.
			 * "seg" is not advanced, so the entry that moved into
			 * its slot is compared with the enlarged prev_seg.
			 */
			for (tmp_seg = seg; tmp_seg < end_seg; tmp_seg++)
				*tmp_seg = *(tmp_seg + 1);
		} else {
			prev_seg = seg;
			seg++;
		}
	}

	/*
	 * Initialize the free queues.
	 */
	for (dom = 0; dom < vm_ndomains; dom++) {
		for (flind = 0; flind < vm_nfreelists; flind++) {
			for (pind = 0; pind < VM_NFREEPOOL; pind++) {
				fl = vm_phys_free_queues[dom][flind][pind];
				for (oind = 0; oind < VM_NFREEORDER; oind++)
					TAILQ_INIT(&fl[oind].pl);
			}
		}
	}

	/* Choose the pool at which scans over all pools begin. */
#ifdef VM_FREEPOOL_LAZYINIT
	vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
	vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif

	rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}
664
665 /*
666 * Register info about the NUMA topology of the system.
667 *
668 * Invoked by platform-dependent code prior to vm_phys_init().
669 */
670 void
vm_phys_register_domains(int ndomains __numa_used,struct mem_affinity * affinity __numa_used,int * locality __numa_used)671 vm_phys_register_domains(int ndomains __numa_used,
672 struct mem_affinity *affinity __numa_used, int *locality __numa_used)
673 {
674 #ifdef NUMA
675 int i;
676
677 /*
678 * For now the only override value that we support is 1, which
679 * effectively disables NUMA-awareness in the allocators.
680 */
681 TUNABLE_INT_FETCH("vm.numa.disabled", &numa_disabled);
682 if (numa_disabled)
683 ndomains = 1;
684
685 if (ndomains > 1) {
686 vm_ndomains = ndomains;
687 mem_affinity = affinity;
688 mem_locality = locality;
689 }
690
691 for (i = 0; i < vm_ndomains; i++)
692 DOMAINSET_SET(i, &all_domains);
693 #endif
694 }
695
696 /*
697 * Split a contiguous, power of two-sized set of physical pages.
698 *
699 * When this function is called by a page allocation function, the caller
700 * should request insertion at the head unless the order [order, oind) queues
701 * are known to be empty. The objective being to reduce the likelihood of
702 * long-term fragmentation by promoting contemporaneous allocation and
703 * (hopefully) deallocation.
704 */
705 static __inline void
vm_phys_split_pages(vm_page_t m,int oind,struct vm_freelist * fl,int order,int pool,int tail)706 vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl, int order,
707 int pool, int tail)
708 {
709 vm_page_t m_buddy;
710
711 while (oind > order) {
712 oind--;
713 m_buddy = &m[1 << oind];
714 KASSERT(m_buddy->order == VM_NFREEORDER,
715 ("vm_phys_split_pages: page %p has unexpected order %d",
716 m_buddy, m_buddy->order));
717 vm_freelist_add(fl, m_buddy, oind, pool, tail);
718 }
719 }
720
721 static void
vm_phys_enq_chunk(struct vm_freelist * fl,vm_page_t m,int order,int pool,int tail)722 vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int pool,
723 int tail)
724 {
725 KASSERT(order >= 0 && order < VM_NFREEORDER,
726 ("%s: invalid order %d", __func__, order));
727
728 vm_freelist_add(fl, m, order, pool, tail);
729 #ifdef VM_FREEPOOL_LAZYINIT
730 if (__predict_false(pool == VM_FREEPOOL_LAZYINIT)) {
731 vm_page_t m_next;
732 vm_paddr_t pa;
733 int npages;
734
735 npages = 1 << order;
736 m_next = m + npages;
737 pa = m->phys_addr + ptoa(npages);
738 if (pa < vm_phys_segs[m->segind].end) {
739 vm_page_init_page(m_next, pa, m->segind,
740 VM_FREEPOOL_LAZYINIT);
741 }
742 }
743 #endif
744 }
745
746 /*
747 * Add the physical pages [m, m + npages) at the beginning of a power-of-two
748 * aligned and sized set to the specified free list.
749 *
750 * When this function is called by a page allocation function, the caller
751 * should request insertion at the head unless the lower-order queues are
752 * known to be empty. The objective being to reduce the likelihood of long-
753 * term fragmentation by promoting contemporaneous allocation and (hopefully)
754 * deallocation.
755 *
756 * The physical page m's buddy must not be free.
757 */
758 static void
vm_phys_enq_beg(vm_page_t m,u_int npages,struct vm_freelist * fl,int pool,int tail)759 vm_phys_enq_beg(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
760 int tail)
761 {
762 int order;
763
764 KASSERT(npages == 0 ||
765 (VM_PAGE_TO_PHYS(m) &
766 ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
767 ("%s: page %p and npages %u are misaligned",
768 __func__, m, npages));
769 while (npages > 0) {
770 KASSERT(m->order == VM_NFREEORDER,
771 ("%s: page %p has unexpected order %d",
772 __func__, m, m->order));
773 order = ilog2(npages);
774 KASSERT(order < VM_NFREEORDER,
775 ("%s: order %d is out of range", __func__, order));
776 vm_phys_enq_chunk(fl, m, order, pool, tail);
777 m += 1 << order;
778 npages -= 1 << order;
779 }
780 }
781
782 /*
783 * Add the physical pages [m, m + npages) at the end of a power-of-two aligned
784 * and sized set to the specified free list.
785 *
786 * When this function is called by a page allocation function, the caller
787 * should request insertion at the head unless the lower-order queues are
788 * known to be empty. The objective being to reduce the likelihood of long-
789 * term fragmentation by promoting contemporaneous allocation and (hopefully)
790 * deallocation.
791 *
792 * If npages is zero, this function does nothing and ignores the physical page
793 * parameter m. Otherwise, the physical page m's buddy must not be free.
794 */
795 static vm_page_t
vm_phys_enq_range(vm_page_t m,u_int npages,struct vm_freelist * fl,int pool,int tail)796 vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int pool,
797 int tail)
798 {
799 int order;
800
801 KASSERT(npages == 0 ||
802 ((VM_PAGE_TO_PHYS(m) + npages * PAGE_SIZE) &
803 ((PAGE_SIZE << ilog2(npages)) - 1)) == 0,
804 ("vm_phys_enq_range: page %p and npages %u are misaligned",
805 m, npages));
806 while (npages > 0) {
807 KASSERT(m->order == VM_NFREEORDER,
808 ("vm_phys_enq_range: page %p has unexpected order %d",
809 m, m->order));
810 order = ffs(npages) - 1;
811 vm_phys_enq_chunk(fl, m, order, pool, tail);
812 m += 1 << order;
813 npages -= 1 << order;
814 }
815 return (m);
816 }
817
818 /*
819 * Complete initialization a contiguous, power of two-sized set of physical
820 * pages.
821 *
822 * If the pages currently belong to the lazy init pool, then the corresponding
823 * page structures must be initialized. In this case it is assumed that the
824 * first page in the run has already been initialized.
825 */
826 static void
vm_phys_finish_init(vm_page_t m,int order)827 vm_phys_finish_init(vm_page_t m, int order)
828 {
829 #ifdef VM_FREEPOOL_LAZYINIT
830 if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
831 vm_paddr_t pa;
832 int segind;
833
834 TSENTER();
835 pa = m->phys_addr + PAGE_SIZE;
836 segind = m->segind;
837 for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
838 m_tmp++, pa += PAGE_SIZE)
839 vm_page_init_page(m_tmp, pa, segind, VM_NFREEPOOL);
840 TSEXIT();
841 }
842 #endif
843 }
844
845 /*
846 * Tries to allocate the specified number of pages from the specified pool
847 * within the specified domain. Returns the actual number of allocated pages
848 * and a pointer to each page through the array ma[].
849 *
850 * The returned pages may not be physically contiguous. However, in contrast
851 * to performing multiple, back-to-back calls to vm_phys_alloc_pages(..., 0),
852 * calling this function once to allocate the desired number of pages will
853 * avoid wasted time in vm_phys_split_pages(). The allocated pages have no
854 * valid pool field set.
855 *
856 * The free page queues for the specified domain must be locked.
857 */
858 int
vm_phys_alloc_npages(int domain,int pool,int npages,vm_page_t ma[])859 vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
860 {
861 struct vm_freelist *alt, *fl;
862 vm_page_t m;
863 int avail, end, flind, freelist, i, oind, pind;
864
865 KASSERT(domain >= 0 && domain < vm_ndomains,
866 ("vm_phys_alloc_npages: domain %d is out of range", domain));
867 KASSERT(vm_phys_pool_valid(pool),
868 ("vm_phys_alloc_npages: pool %d is out of range", pool));
869 KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
870 ("vm_phys_alloc_npages: npages %d is out of range", npages));
871 vm_domain_free_assert_locked(VM_DOMAIN(domain));
872 i = 0;
873 for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
874 flind = vm_freelist_to_flind[freelist];
875 if (flind < 0)
876 continue;
877 fl = vm_phys_free_queues[domain][flind][pool];
878 for (oind = 0; oind < VM_NFREEORDER; oind++) {
879 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
880 vm_freelist_rem(fl, m, oind);
881 avail = i + (1 << oind);
882 end = imin(npages, avail);
883 while (i < end)
884 ma[i++] = m++;
885 if (i == npages) {
886 /*
887 * Return excess pages to fl. Its order
888 * [0, oind) queues are empty.
889 */
890 vm_phys_enq_range(m, avail - i, fl,
891 pool, 1);
892 return (npages);
893 }
894 }
895 }
896 for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
897 for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
898 pind++) {
899 alt = vm_phys_free_queues[domain][flind][pind];
900 while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
901 NULL) {
902 vm_freelist_rem(alt, m, oind);
903 vm_phys_finish_init(m, oind);
904 avail = i + (1 << oind);
905 end = imin(npages, avail);
906 while (i < end)
907 ma[i++] = m++;
908 if (i == npages) {
909 /*
910 * Return excess pages to fl.
911 * Its order [0, oind) queues
912 * are empty.
913 */
914 vm_phys_enq_range(m, avail - i,
915 fl, pool, 1);
916 return (npages);
917 }
918 }
919 }
920 }
921 }
922 return (i);
923 }
924
925 /*
926 * Allocate a contiguous, power of two-sized set of physical pages from the
927 * specified free list. The free list must be specified using one of the
928 * manifest constants VM_FREELIST_*.
929 *
930 * The free page queues must be locked.
931 */
932 static vm_page_t
vm_phys_alloc_freelist_pages(int domain,int freelist,int pool,int order)933 vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
934 {
935 struct vm_freelist *alt, *fl;
936 vm_page_t m;
937 int oind, pind, flind;
938
939 KASSERT(domain >= 0 && domain < vm_ndomains,
940 ("vm_phys_alloc_freelist_pages: domain %d is out of range",
941 domain));
942 KASSERT(freelist < VM_NFREELIST,
943 ("vm_phys_alloc_freelist_pages: freelist %d is out of range",
944 freelist));
945 KASSERT(vm_phys_pool_valid(pool),
946 ("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
947 KASSERT(order < VM_NFREEORDER,
948 ("vm_phys_alloc_freelist_pages: order %d is out of range", order));
949
950 flind = vm_freelist_to_flind[freelist];
951 /* Check if freelist is present */
952 if (flind < 0)
953 return (NULL);
954
955 vm_domain_free_assert_locked(VM_DOMAIN(domain));
956 fl = &vm_phys_free_queues[domain][flind][pool][0];
957 for (oind = order; oind < VM_NFREEORDER; oind++) {
958 m = TAILQ_FIRST(&fl[oind].pl);
959 if (m != NULL) {
960 vm_freelist_rem(fl, m, oind);
961 /* The order [order, oind) queues are empty. */
962 vm_phys_split_pages(m, oind, fl, order, pool, 1);
963 return (m);
964 }
965 }
966
967 /*
968 * The given pool was empty. Find the largest
969 * contiguous, power-of-two-sized set of pages in any
970 * pool. Transfer these pages to the given pool, and
971 * use them to satisfy the allocation.
972 */
973 for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
974 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
975 alt = &vm_phys_free_queues[domain][flind][pind][0];
976 m = TAILQ_FIRST(&alt[oind].pl);
977 if (m != NULL) {
978 vm_freelist_rem(alt, m, oind);
979 vm_phys_finish_init(m, oind);
980 /* The order [order, oind) queues are empty. */
981 vm_phys_split_pages(m, oind, fl, order, pool, 1);
982 return (m);
983 }
984 }
985 }
986 return (NULL);
987 }
988
989 /*
990 * Allocate a contiguous, power of two-sized set of physical pages
991 * from the free lists.
992 *
993 * The free page queues must be locked.
994 */
995 vm_page_t
vm_phys_alloc_pages(int domain,int pool,int order)996 vm_phys_alloc_pages(int domain, int pool, int order)
997 {
998 vm_page_t m;
999 int freelist;
1000
1001 for (freelist = 0; freelist < VM_NFREELIST; freelist++) {
1002 m = vm_phys_alloc_freelist_pages(domain, freelist, pool, order);
1003 if (m != NULL)
1004 return (m);
1005 }
1006 return (NULL);
1007 }
1008
1009 /*
1010 * Find the vm_page corresponding to the given physical address, which must lie
1011 * within the given physical memory segment.
1012 */
1013 vm_page_t
vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg * seg,vm_paddr_t pa)1014 vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa)
1015 {
1016 KASSERT(pa >= seg->start && pa < seg->end,
1017 ("%s: pa %#jx is out of range", __func__, (uintmax_t)pa));
1018
1019 return (&seg->first_page[atop(pa - seg->start)]);
1020 }
1021
1022 /*
1023 * Find the vm_page corresponding to the given physical address.
1024 */
1025 vm_page_t
vm_phys_paddr_to_vm_page(vm_paddr_t pa)1026 vm_phys_paddr_to_vm_page(vm_paddr_t pa)
1027 {
1028 struct vm_phys_seg *seg;
1029
1030 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
1031 return (vm_phys_seg_paddr_to_vm_page(seg, pa));
1032 return (NULL);
1033 }
1034
1035 vm_page_t
vm_phys_fictitious_to_vm_page(vm_paddr_t pa)1036 vm_phys_fictitious_to_vm_page(vm_paddr_t pa)
1037 {
1038 struct vm_phys_fictitious_seg tmp, *seg;
1039 vm_page_t m;
1040
1041 m = NULL;
1042 tmp.start = pa;
1043 tmp.end = 0;
1044
1045 rw_rlock(&vm_phys_fictitious_reg_lock);
1046 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1047 rw_runlock(&vm_phys_fictitious_reg_lock);
1048 if (seg == NULL)
1049 return (NULL);
1050
1051 m = &seg->first_page[atop(pa - seg->start)];
1052 KASSERT((m->flags & PG_FICTITIOUS) != 0, ("%p not fictitious", m));
1053
1054 return (m);
1055 }
1056
1057 static inline void
vm_phys_fictitious_init_range(vm_page_t range,vm_paddr_t start,long page_count,vm_memattr_t memattr)1058 vm_phys_fictitious_init_range(vm_page_t range, vm_paddr_t start,
1059 long page_count, vm_memattr_t memattr)
1060 {
1061 long i;
1062
1063 bzero(range, page_count * sizeof(*range));
1064 for (i = 0; i < page_count; i++) {
1065 vm_page_initfake(&range[i], start + PAGE_SIZE * i, memattr);
1066 range[i].oflags &= ~VPO_UNMANAGED;
1067 range[i].busy_lock = VPB_UNBUSIED;
1068 }
1069 }
1070
1071 int
vm_phys_fictitious_reg_range(vm_paddr_t start,vm_paddr_t end,vm_memattr_t memattr)1072 vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
1073 vm_memattr_t memattr)
1074 {
1075 struct vm_phys_fictitious_seg *seg;
1076 vm_page_t fp;
1077 long page_count;
1078 #ifdef VM_PHYSSEG_DENSE
1079 long pi, pe;
1080 long dpage_count;
1081 #endif
1082
1083 KASSERT(start < end,
1084 ("Start of segment isn't less than end (start: %jx end: %jx)",
1085 (uintmax_t)start, (uintmax_t)end));
1086
1087 page_count = (end - start) / PAGE_SIZE;
1088
1089 #ifdef VM_PHYSSEG_DENSE
1090 pi = atop(start);
1091 pe = atop(end);
1092 if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1093 fp = &vm_page_array[pi - first_page];
1094 if ((pe - first_page) > vm_page_array_size) {
1095 /*
1096 * We have a segment that starts inside
1097 * of vm_page_array, but ends outside of it.
1098 *
1099 * Use vm_page_array pages for those that are
1100 * inside of the vm_page_array range, and
1101 * allocate the remaining ones.
1102 */
1103 dpage_count = vm_page_array_size - (pi - first_page);
1104 vm_phys_fictitious_init_range(fp, start, dpage_count,
1105 memattr);
1106 page_count -= dpage_count;
1107 start += ptoa(dpage_count);
1108 goto alloc;
1109 }
1110 /*
1111 * We can allocate the full range from vm_page_array,
1112 * so there's no need to register the range in the tree.
1113 */
1114 vm_phys_fictitious_init_range(fp, start, page_count, memattr);
1115 return (0);
1116 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1117 /*
1118 * We have a segment that ends inside of vm_page_array,
1119 * but starts outside of it.
1120 */
1121 fp = &vm_page_array[0];
1122 dpage_count = pe - first_page;
1123 vm_phys_fictitious_init_range(fp, ptoa(first_page), dpage_count,
1124 memattr);
1125 end -= ptoa(dpage_count);
1126 page_count -= dpage_count;
1127 goto alloc;
1128 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1129 /*
1130 * Trying to register a fictitious range that expands before
1131 * and after vm_page_array.
1132 */
1133 return (EINVAL);
1134 } else {
1135 alloc:
1136 #endif
1137 fp = malloc(page_count * sizeof(struct vm_page), M_FICT_PAGES,
1138 M_WAITOK);
1139 #ifdef VM_PHYSSEG_DENSE
1140 }
1141 #endif
1142 vm_phys_fictitious_init_range(fp, start, page_count, memattr);
1143
1144 seg = malloc(sizeof(*seg), M_FICT_PAGES, M_WAITOK | M_ZERO);
1145 seg->start = start;
1146 seg->end = end;
1147 seg->first_page = fp;
1148
1149 rw_wlock(&vm_phys_fictitious_reg_lock);
1150 RB_INSERT(fict_tree, &vm_phys_fictitious_tree, seg);
1151 rw_wunlock(&vm_phys_fictitious_reg_lock);
1152
1153 return (0);
1154 }
1155
1156 void
vm_phys_fictitious_unreg_range(vm_paddr_t start,vm_paddr_t end)1157 vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end)
1158 {
1159 struct vm_phys_fictitious_seg *seg, tmp;
1160 #ifdef VM_PHYSSEG_DENSE
1161 long pi, pe;
1162 #endif
1163
1164 KASSERT(start < end,
1165 ("Start of segment isn't less than end (start: %jx end: %jx)",
1166 (uintmax_t)start, (uintmax_t)end));
1167
1168 #ifdef VM_PHYSSEG_DENSE
1169 pi = atop(start);
1170 pe = atop(end);
1171 if (pi >= first_page && (pi - first_page) < vm_page_array_size) {
1172 if ((pe - first_page) <= vm_page_array_size) {
1173 /*
1174 * This segment was allocated using vm_page_array
1175 * only, there's nothing to do since those pages
1176 * were never added to the tree.
1177 */
1178 return;
1179 }
1180 /*
1181 * We have a segment that starts inside
1182 * of vm_page_array, but ends outside of it.
1183 *
1184 * Calculate how many pages were added to the
1185 * tree and free them.
1186 */
1187 start = ptoa(first_page + vm_page_array_size);
1188 } else if (pe > first_page && (pe - first_page) < vm_page_array_size) {
1189 /*
1190 * We have a segment that ends inside of vm_page_array,
1191 * but starts outside of it.
1192 */
1193 end = ptoa(first_page);
1194 } else if (pi < first_page && pe > (first_page + vm_page_array_size)) {
1195 /* Since it's not possible to register such a range, panic. */
1196 panic(
1197 "Unregistering not registered fictitious range [%#jx:%#jx]",
1198 (uintmax_t)start, (uintmax_t)end);
1199 }
1200 #endif
1201 tmp.start = start;
1202 tmp.end = 0;
1203
1204 rw_wlock(&vm_phys_fictitious_reg_lock);
1205 seg = RB_FIND(fict_tree, &vm_phys_fictitious_tree, &tmp);
1206 if (seg->start != start || seg->end != end) {
1207 rw_wunlock(&vm_phys_fictitious_reg_lock);
1208 panic(
1209 "Unregistering not registered fictitious range [%#jx:%#jx]",
1210 (uintmax_t)start, (uintmax_t)end);
1211 }
1212 RB_REMOVE(fict_tree, &vm_phys_fictitious_tree, seg);
1213 rw_wunlock(&vm_phys_fictitious_reg_lock);
1214 free(seg->first_page, M_FICT_PAGES);
1215 free(seg, M_FICT_PAGES);
1216 }
1217
1218 /*
1219 * Free a contiguous, power of two-sized set of physical pages.
1220 * The pool field in the first page determines the destination pool.
1221 *
1222 * The free page queues must be locked.
1223 */
1224 void
vm_phys_free_pages(vm_page_t m,int pool,int order)1225 vm_phys_free_pages(vm_page_t m, int pool, int order)
1226 {
1227 struct vm_freelist *fl;
1228 struct vm_phys_seg *seg;
1229 vm_paddr_t pa;
1230 vm_page_t m_buddy;
1231
1232 KASSERT(m->order == VM_NFREEORDER,
1233 ("%s: page %p has unexpected order %d",
1234 __func__, m, m->order));
1235 KASSERT(vm_phys_pool_valid(pool),
1236 ("%s: unexpected pool param %d", __func__, pool));
1237 KASSERT(order < VM_NFREEORDER,
1238 ("%s: order %d is out of range", __func__, order));
1239 seg = &vm_phys_segs[m->segind];
1240 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1241 if (order < VM_NFREEORDER - 1) {
1242 pa = VM_PAGE_TO_PHYS(m);
1243 do {
1244 pa ^= ((vm_paddr_t)1 << (PAGE_SHIFT + order));
1245 if (pa < seg->start || pa >= seg->end)
1246 break;
1247 m_buddy = vm_phys_seg_paddr_to_vm_page(seg, pa);
1248 if (m_buddy->order != order)
1249 break;
1250 fl = (*seg->free_queues)[m_buddy->pool];
1251 vm_freelist_rem(fl, m_buddy, order);
1252 vm_phys_finish_init(m_buddy, order);
1253 order++;
1254 pa &= ~(((vm_paddr_t)1 << (PAGE_SHIFT + order)) - 1);
1255 m = vm_phys_seg_paddr_to_vm_page(seg, pa);
1256 } while (order < VM_NFREEORDER - 1);
1257 }
1258 fl = (*seg->free_queues)[pool];
1259 vm_freelist_add(fl, m, order, pool, 1);
1260 }
1261
1262 #ifdef VM_FREEPOOL_LAZYINIT
1263 /*
1264 * Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
1265 * them to the default pool. This is a prerequisite for some rare operations
1266 * which need to scan the page array and thus depend on all pages being
1267 * initialized.
1268 */
1269 static void
vm_phys_lazy_init_domain(int domain,bool locked)1270 vm_phys_lazy_init_domain(int domain, bool locked)
1271 {
1272 static bool initdone[MAXMEMDOM];
1273 struct vm_domain *vmd;
1274 struct vm_freelist *fl;
1275 vm_page_t m;
1276 int pind;
1277 bool unlocked;
1278
1279 if (__predict_true(atomic_load_bool(&initdone[domain])))
1280 return;
1281
1282 vmd = VM_DOMAIN(domain);
1283 if (locked)
1284 vm_domain_free_assert_locked(vmd);
1285 else
1286 vm_domain_free_lock(vmd);
1287 if (atomic_load_bool(&initdone[domain]))
1288 goto out;
1289 pind = VM_FREEPOOL_LAZYINIT;
1290 for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
1291 int flind;
1292
1293 flind = vm_freelist_to_flind[freelist];
1294 if (flind < 0)
1295 continue;
1296 fl = vm_phys_free_queues[domain][flind][pind];
1297 for (int oind = 0; oind < VM_NFREEORDER; oind++) {
1298 if (atomic_load_int(&fl[oind].lcnt) == 0)
1299 continue;
1300 while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
1301 /*
1302 * Avoid holding the lock across the
1303 * initialization unless there's a free page
1304 * shortage.
1305 */
1306 vm_freelist_rem(fl, m, oind);
1307 unlocked = vm_domain_allocate(vmd,
1308 VM_ALLOC_NORMAL, 1 << oind);
1309 if (unlocked)
1310 vm_domain_free_unlock(vmd);
1311 vm_phys_finish_init(m, oind);
1312 if (unlocked) {
1313 vm_domain_freecnt_inc(vmd, 1 << oind);
1314 vm_domain_free_lock(vmd);
1315 }
1316 vm_phys_free_pages(m, VM_FREEPOOL_DEFAULT,
1317 oind);
1318 }
1319 }
1320 }
1321 atomic_store_bool(&initdone[domain], true);
1322 out:
1323 if (!locked)
1324 vm_domain_free_unlock(vmd);
1325 }
1326
1327 static void
vm_phys_lazy_init(void)1328 vm_phys_lazy_init(void)
1329 {
1330 for (int domain = 0; domain < vm_ndomains; domain++)
1331 vm_phys_lazy_init_domain(domain, false);
1332 atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
1333 }
1334
1335 static void
vm_phys_lazy_init_kthr(void * arg __unused)1336 vm_phys_lazy_init_kthr(void *arg __unused)
1337 {
1338 vm_phys_lazy_init();
1339 kthread_exit();
1340 }
1341
1342 static void
vm_phys_lazy_sysinit(void * arg __unused)1343 vm_phys_lazy_sysinit(void *arg __unused)
1344 {
1345 struct thread *td;
1346 int error;
1347
1348 error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
1349 RFSTOPPED, 0, "vmlazyinit");
1350 if (error == 0) {
1351 thread_lock(td);
1352 sched_prio(td, PRI_MIN_IDLE);
1353 sched_add(td, SRQ_BORING);
1354 } else {
1355 printf("%s: could not create lazy init thread: %d\n",
1356 __func__, error);
1357 vm_phys_lazy_init();
1358 }
1359 }
1360 SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
1361 NULL);
1362 #endif /* VM_FREEPOOL_LAZYINIT */
1363
1364 /*
1365 * Free a contiguous, arbitrarily sized set of physical pages, without
1366 * merging across set boundaries. Assumes no pages have a valid pool field.
1367 *
1368 * The free page queues must be locked.
1369 */
1370 void
vm_phys_enqueue_contig(vm_page_t m,int pool,u_long npages)1371 vm_phys_enqueue_contig(vm_page_t m, int pool, u_long npages)
1372 {
1373 struct vm_freelist *fl;
1374 struct vm_phys_seg *seg;
1375 vm_page_t m_end;
1376 vm_paddr_t diff, lo;
1377 int order;
1378
1379 /*
1380 * Avoid unnecessary coalescing by freeing the pages in the largest
1381 * possible power-of-two-sized subsets.
1382 */
1383 vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1384 seg = &vm_phys_segs[m->segind];
1385 fl = (*seg->free_queues)[pool];
1386 m_end = m + npages;
1387 /* Free blocks of increasing size. */
1388 lo = atop(VM_PAGE_TO_PHYS(m));
1389 if (m < m_end &&
1390 (diff = lo ^ (lo + npages - 1)) != 0) {
1391 order = min(ilog2(diff), VM_NFREEORDER - 1);
1392 m = vm_phys_enq_range(m, roundup2(lo, 1 << order) - lo, fl,
1393 pool, 1);
1394 }
1395
1396 /* Free blocks of maximum size. */
1397 order = VM_NFREEORDER - 1;
1398 while (m + (1 << order) <= m_end) {
1399 KASSERT(seg == &vm_phys_segs[m->segind],
1400 ("%s: page range [%p,%p) spans multiple segments",
1401 __func__, m_end - npages, m));
1402 vm_phys_enq_chunk(fl, m, order, pool, 1);
1403 m += 1 << order;
1404 }
1405 /* Free blocks of diminishing size. */
1406 vm_phys_enq_beg(m, m_end - m, fl, pool, 1);
1407 }
1408
1409 /*
1410 * Free a contiguous, arbitrarily sized set of physical pages.
1411 * Assumes that every page but the first has no valid pool field.
1412 * Uses the pool value in the first page if valid, otherwise default.
1413 *
1414 * The free page queues must be locked.
1415 */
1416 void
vm_phys_free_contig(vm_page_t m,int pool,u_long npages)1417 vm_phys_free_contig(vm_page_t m, int pool, u_long npages)
1418 {
1419 vm_paddr_t lo;
1420 vm_page_t m_start, m_end;
1421 unsigned max_order, order_start, order_end;
1422
1423 vm_domain_free_assert_locked(vm_pagequeue_domain(m));
1424
1425 lo = atop(VM_PAGE_TO_PHYS(m));
1426 max_order = min(ilog2(lo ^ (lo + npages)), VM_NFREEORDER - 1);
1427
1428 m_start = m;
1429 order_start = ffsll(lo) - 1;
1430 if (order_start < max_order)
1431 m_start += 1 << order_start;
1432 m_end = m + npages;
1433 order_end = ffsll(lo + npages) - 1;
1434 if (order_end < max_order)
1435 m_end -= 1 << order_end;
1436 /*
1437 * Avoid unnecessary coalescing by freeing the pages at the start and
1438 * end of the range last.
1439 */
1440 if (m_start < m_end)
1441 vm_phys_enqueue_contig(m_start, pool, m_end - m_start);
1442 if (order_start < max_order)
1443 vm_phys_free_pages(m, pool, order_start);
1444 if (order_end < max_order)
1445 vm_phys_free_pages(m_end, pool, order_end);
1446 }
1447
1448 /*
1449 * Identify the first address range within segment segind or greater
1450 * that matches the domain, lies within the low/high range, and has
1451 * enough pages. Return -1 if there is none.
1452 */
1453 int
vm_phys_find_range(vm_page_t bounds[],int segind,int domain,u_long npages,vm_paddr_t low,vm_paddr_t high)1454 vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
1455 u_long npages, vm_paddr_t low, vm_paddr_t high)
1456 {
1457 vm_paddr_t pa_end, pa_start;
1458 struct vm_phys_seg *end_seg, *seg;
1459
1460 KASSERT(npages > 0, ("npages is zero"));
1461 KASSERT(domain >= 0 && domain < vm_ndomains, ("domain out of range"));
1462 end_seg = &vm_phys_segs[vm_phys_nsegs];
1463 for (seg = &vm_phys_segs[segind]; seg < end_seg; seg++) {
1464 if (seg->domain != domain)
1465 continue;
1466 if (seg->start >= high)
1467 return (-1);
1468 pa_start = MAX(low, seg->start);
1469 pa_end = MIN(high, seg->end);
1470 if (pa_end - pa_start < ptoa(npages))
1471 continue;
1472 #ifdef VM_FREEPOOL_LAZYINIT
1473 /*
1474 * The pages on the free lists must be initialized.
1475 */
1476 vm_phys_lazy_init_domain(domain, false);
1477 #endif
1478 bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
1479 bounds[1] = &seg->first_page[atop(pa_end - seg->start)];
1480 return (seg - vm_phys_segs);
1481 }
1482 return (-1);
1483 }
1484
1485 /*
1486 * Search for the given physical page "m" in the free lists. If the search
1487 * succeeds, remove "m" from the free lists and return true. Otherwise, return
1488 * false, indicating that "m" is not in the free lists.
1489 *
1490 * The free page queues must be locked.
1491 */
1492 bool
vm_phys_unfree_page(vm_paddr_t pa)1493 vm_phys_unfree_page(vm_paddr_t pa)
1494 {
1495 struct vm_freelist *fl;
1496 struct vm_phys_seg *seg;
1497 vm_paddr_t pa_half;
1498 vm_page_t m, m_set, m_tmp;
1499 int order, pool;
1500
1501 seg = vm_phys_paddr_to_seg(pa);
1502 vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
1503
1504 #ifdef VM_FREEPOOL_LAZYINIT
1505 /*
1506 * The pages on the free lists must be initialized.
1507 */
1508 vm_phys_lazy_init_domain(seg->domain, true);
1509 #endif
1510
1511 /*
1512 * First, find the contiguous, power of two-sized set of free
1513 * physical pages containing the given physical page "m" and
1514 * assign it to "m_set".
1515 */
1516 m = vm_phys_paddr_to_vm_page(pa);
1517 for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
1518 order < VM_NFREEORDER - 1; ) {
1519 order++;
1520 pa = m->phys_addr & (~(vm_paddr_t)0 << (PAGE_SHIFT + order));
1521 if (pa >= seg->start)
1522 m_set = vm_phys_seg_paddr_to_vm_page(seg, pa);
1523 else
1524 return (false);
1525 }
1526 if (m_set->order < order)
1527 return (false);
1528 if (m_set->order == VM_NFREEORDER)
1529 return (false);
1530 KASSERT(m_set->order < VM_NFREEORDER,
1531 ("vm_phys_unfree_page: page %p has unexpected order %d",
1532 m_set, m_set->order));
1533
1534 /*
1535 * Next, remove "m_set" from the free lists. Finally, extract
1536 * "m" from "m_set" using an iterative algorithm: While "m_set"
1537 * is larger than a page, shrink "m_set" by returning the half
1538 * of "m_set" that does not contain "m" to the free lists.
1539 */
1540 pool = m_set->pool;
1541 fl = (*seg->free_queues)[pool];
1542 order = m_set->order;
1543 vm_freelist_rem(fl, m_set, order);
1544 while (order > 0) {
1545 order--;
1546 pa_half = m_set->phys_addr ^ (1 << (PAGE_SHIFT + order));
1547 if (m->phys_addr < pa_half)
1548 m_tmp = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
1549 else {
1550 m_tmp = m_set;
1551 m_set = vm_phys_seg_paddr_to_vm_page(seg, pa_half);
1552 }
1553 vm_freelist_add(fl, m_tmp, order, pool, 0);
1554 }
1555 KASSERT(m_set == m, ("vm_phys_unfree_page: fatal inconsistency"));
1556 return (true);
1557 }
1558
1559 /*
1560 * Find a run of contiguous physical pages, meeting alignment requirements, from
1561 * a list of max-sized page blocks, where we need at least two consecutive
1562 * blocks to satisfy the (large) page request.
1563 */
1564 static vm_page_t
vm_phys_find_freelist_contig(struct vm_freelist * fl,u_long npages,vm_paddr_t low,vm_paddr_t high,u_long alignment,vm_paddr_t boundary)1565 vm_phys_find_freelist_contig(struct vm_freelist *fl, u_long npages,
1566 vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary)
1567 {
1568 struct vm_phys_seg *seg;
1569 vm_page_t m, m_iter, m_ret;
1570 vm_paddr_t max_size, size;
1571 int max_order;
1572
1573 max_order = VM_NFREEORDER - 1;
1574 size = npages << PAGE_SHIFT;
1575 max_size = (vm_paddr_t)1 << (PAGE_SHIFT + max_order);
1576 KASSERT(size > max_size, ("size is too small"));
1577
1578 /*
1579 * In order to avoid examining any free max-sized page block more than
1580 * twice, identify the ones that are first in a physically-contiguous
1581 * sequence of such blocks, and only for those walk the sequence to
1582 * check if there are enough free blocks starting at a properly aligned
1583 * block. Thus, no block is checked for free-ness more than twice.
1584 */
1585 TAILQ_FOREACH(m, &fl[max_order].pl, listq) {
1586 /*
1587 * Skip m unless it is first in a sequence of free max page
1588 * blocks >= low in its segment.
1589 */
1590 seg = &vm_phys_segs[m->segind];
1591 if (VM_PAGE_TO_PHYS(m) < MAX(low, seg->start))
1592 continue;
1593 if (VM_PAGE_TO_PHYS(m) >= max_size &&
1594 VM_PAGE_TO_PHYS(m) - max_size >= MAX(low, seg->start) &&
1595 max_order == m[-1 << max_order].order)
1596 continue;
1597
1598 /*
1599 * Advance m_ret from m to the first of the sequence, if any,
1600 * that satisfies alignment conditions and might leave enough
1601 * space.
1602 */
1603 m_ret = m;
1604 while (!vm_addr_ok(VM_PAGE_TO_PHYS(m_ret),
1605 size, alignment, boundary) &&
1606 VM_PAGE_TO_PHYS(m_ret) + size <= MIN(high, seg->end) &&
1607 max_order == m_ret[1 << max_order].order)
1608 m_ret += 1 << max_order;
1609
1610 /*
1611 * Skip m unless some block m_ret in the sequence is properly
1612 * aligned, and begins a sequence of enough pages less than
1613 * high, and in the same segment.
1614 */
1615 if (VM_PAGE_TO_PHYS(m_ret) + size > MIN(high, seg->end))
1616 continue;
1617
1618 /*
1619 * Skip m unless the blocks to allocate starting at m_ret are
1620 * all free.
1621 */
1622 for (m_iter = m_ret;
1623 m_iter < m_ret + npages && max_order == m_iter->order;
1624 m_iter += 1 << max_order) {
1625 }
1626 if (m_iter < m_ret + npages)
1627 continue;
1628 return (m_ret);
1629 }
1630 return (NULL);
1631 }
1632
1633 /*
1634 * Find a run of contiguous physical pages from the specified free list
1635 * table.
1636 */
1637 static vm_page_t
vm_phys_find_queues_contig(struct vm_freelist (* queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],u_long npages,vm_paddr_t low,vm_paddr_t high,u_long alignment,vm_paddr_t boundary)1638 vm_phys_find_queues_contig(
1639 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX],
1640 u_long npages, vm_paddr_t low, vm_paddr_t high,
1641 u_long alignment, vm_paddr_t boundary)
1642 {
1643 struct vm_freelist *fl;
1644 vm_page_t m_ret;
1645 vm_paddr_t pa, pa_end, size;
1646 int oind, order, pind;
1647
1648 KASSERT(npages > 0, ("npages is 0"));
1649 KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1650 KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1651 /* Compute the queue that is the best fit for npages. */
1652 order = flsl(npages - 1);
1653 /* Search for a large enough free block. */
1654 size = npages << PAGE_SHIFT;
1655 for (oind = order; oind < VM_NFREEORDER; oind++) {
1656 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1657 fl = (*queues)[pind];
1658 TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
1659 /*
1660 * Determine if the address range starting at pa
1661 * is within the given range, satisfies the
1662 * given alignment, and does not cross the given
1663 * boundary.
1664 */
1665 pa = VM_PAGE_TO_PHYS(m_ret);
1666 pa_end = pa + size;
1667 if (low <= pa && pa_end <= high &&
1668 vm_addr_ok(pa, size, alignment, boundary))
1669 return (m_ret);
1670 }
1671 }
1672 }
1673 if (order < VM_NFREEORDER)
1674 return (NULL);
1675 /* Search for a long-enough sequence of max-order blocks. */
1676 for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
1677 fl = (*queues)[pind];
1678 m_ret = vm_phys_find_freelist_contig(fl, npages,
1679 low, high, alignment, boundary);
1680 if (m_ret != NULL)
1681 return (m_ret);
1682 }
1683 return (NULL);
1684 }
1685
1686 /*
1687 * Allocate a contiguous set of physical pages of the given size
1688 * "npages" from the free lists. All of the physical pages must be at
1689 * or above the given physical address "low" and below the given
1690 * physical address "high". The given value "alignment" determines the
1691 * alignment of the first physical page in the set. If the given value
1692 * "boundary" is non-zero, then the set of physical pages cannot cross
1693 * any physical address boundary that is a multiple of that value. Both
1694 * "alignment" and "boundary" must be a power of two. Sets the pool
1695 * field to DEFAULT in the first allocated page.
1696 */
1697 vm_page_t
vm_phys_alloc_contig(int domain,u_long npages,vm_paddr_t low,vm_paddr_t high,u_long alignment,vm_paddr_t boundary)1698 vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high,
1699 u_long alignment, vm_paddr_t boundary)
1700 {
1701 vm_paddr_t pa_end, pa_start;
1702 struct vm_freelist *fl;
1703 vm_page_t m, m_run;
1704 struct vm_phys_seg *seg;
1705 struct vm_freelist (*queues)[VM_NFREEPOOL][VM_NFREEORDER_MAX];
1706 int oind, segind;
1707
1708 KASSERT(npages > 0, ("npages is 0"));
1709 KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
1710 KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
1711 vm_domain_free_assert_locked(VM_DOMAIN(domain));
1712 if (low >= high)
1713 return (NULL);
1714 queues = NULL;
1715 m_run = NULL;
1716 for (segind = vm_phys_nsegs - 1; segind >= 0; segind--) {
1717 seg = &vm_phys_segs[segind];
1718 if (seg->start >= high || seg->domain != domain)
1719 continue;
1720 if (low >= seg->end)
1721 break;
1722 if (low <= seg->start)
1723 pa_start = seg->start;
1724 else
1725 pa_start = low;
1726 if (high < seg->end)
1727 pa_end = high;
1728 else
1729 pa_end = seg->end;
1730 if (pa_end - pa_start < ptoa(npages))
1731 continue;
1732 /*
1733 * If a previous segment led to a search using
1734 * the same free lists as would this segment, then
1735 * we've actually already searched within this
1736 * too. So skip it.
1737 */
1738 if (seg->free_queues == queues)
1739 continue;
1740 queues = seg->free_queues;
1741 m_run = vm_phys_find_queues_contig(queues, npages,
1742 low, high, alignment, boundary);
1743 if (m_run != NULL)
1744 break;
1745 }
1746 if (m_run == NULL)
1747 return (NULL);
1748
1749 /* Allocate pages from the page-range found. */
1750 for (m = m_run; m < &m_run[npages]; m = &m[1 << oind]) {
1751 fl = (*queues)[m->pool];
1752 oind = m->order;
1753 vm_freelist_rem(fl, m, oind);
1754 vm_phys_finish_init(m, oind);
1755 }
1756 /* Return excess pages to the free lists. */
1757 fl = (*queues)[VM_FREEPOOL_DEFAULT];
1758 vm_phys_enq_range(&m_run[npages], m - &m_run[npages], fl,
1759 VM_FREEPOOL_DEFAULT, 0);
1760
1761 /* Return page verified to satisfy conditions of request. */
1762 pa_start = VM_PAGE_TO_PHYS(m_run);
1763 KASSERT(low <= pa_start,
1764 ("memory allocated below minimum requested range"));
1765 KASSERT(pa_start + ptoa(npages) <= high,
1766 ("memory allocated above maximum requested range"));
1767 seg = &vm_phys_segs[m_run->segind];
1768 KASSERT(seg->domain == domain,
1769 ("memory not allocated from specified domain"));
1770 KASSERT(vm_addr_ok(pa_start, ptoa(npages), alignment, boundary),
1771 ("memory alignment/boundary constraints not satisfied"));
1772 return (m_run);
1773 }
1774
1775 /*
1776 * Return the index of the first unused slot which may be the terminating
1777 * entry.
1778 */
1779 static int
vm_phys_avail_count(void)1780 vm_phys_avail_count(void)
1781 {
1782 int i;
1783
1784 for (i = 0; i < PHYS_AVAIL_COUNT; i += 2)
1785 if (phys_avail[i] == 0 && phys_avail[i + 1] == 0)
1786 return (i);
1787 panic("Improperly terminated phys_avail[]");
1788 }
1789
1790 /*
1791 * Assert that a phys_avail entry is valid.
1792 */
1793 static void
vm_phys_avail_check(int i)1794 vm_phys_avail_check(int i)
1795 {
1796 if (i % 2 != 0)
1797 panic("Chunk start index %d is not even.", i);
1798 if (phys_avail[i] & PAGE_MASK)
1799 panic("Unaligned phys_avail[%d]: %#jx", i,
1800 (intmax_t)phys_avail[i]);
1801 if (phys_avail[i + 1] & PAGE_MASK)
1802 panic("Unaligned phys_avail[%d + 1]: %#jx", i,
1803 (intmax_t)phys_avail[i + 1]);
1804 if (phys_avail[i + 1] < phys_avail[i])
1805 panic("phys_avail[%d]: start %#jx > end %#jx", i,
1806 (intmax_t)phys_avail[i], (intmax_t)phys_avail[i + 1]);
1807 }
1808
/*
 * Return the index of the phys_avail[] chunk that contains the physical
 * address "pa" (start <= pa < end), or -1 if there is none.  The scan stops
 * at the first chunk whose end is zero.
 */
#ifdef NUMA
static int
vm_phys_avail_find(vm_paddr_t pa)
{
	int idx;

	idx = 0;
	while (phys_avail[idx + 1] != 0) {
		if (pa >= phys_avail[idx] && pa < phys_avail[idx + 1])
			return (idx);
		idx += 2;
	}
	return (-1);
}
#endif
1824
1825 /*
1826 * Return the index of the largest entry.
1827 */
1828 int
vm_phys_avail_largest(void)1829 vm_phys_avail_largest(void)
1830 {
1831 vm_paddr_t sz, largesz;
1832 int largest;
1833 int i;
1834
1835 largest = 0;
1836 largesz = 0;
1837 for (i = 0; phys_avail[i + 1]; i += 2) {
1838 sz = vm_phys_avail_size(i);
1839 if (sz > largesz) {
1840 largesz = sz;
1841 largest = i;
1842 }
1843 }
1844
1845 return (largest);
1846 }
1847
1848 vm_paddr_t
vm_phys_avail_size(int i)1849 vm_phys_avail_size(int i)
1850 {
1851
1852 return (phys_avail[i + 1] - phys_avail[i]);
1853 }
1854
1855 /*
1856 * Split a chunk in phys_avail[] at the address 'pa'.
1857 *
1858 * 'pa' must be within a chunk (slots i and i + 1) or one of its boundaries.
1859 * Returns zero on actual split, in which case the two new chunks occupy slots
1860 * i to i + 3, else EJUSTRETURN if 'pa' was one of the boundaries (and no split
1861 * actually occurred) else ENOSPC if there are not enough slots in phys_avail[]
1862 * to represent the additional chunk caused by the split.
1863 */
1864 static int
vm_phys_avail_split(vm_paddr_t pa,int i)1865 vm_phys_avail_split(vm_paddr_t pa, int i)
1866 {
1867 int cnt;
1868
1869 vm_phys_avail_check(i);
1870 if (pa < phys_avail[i] || pa > phys_avail[i + 1])
1871 panic("%s: Address %#jx not in range at slot %d [%#jx;%#jx].",
1872 __func__, (uintmax_t)pa, i,
1873 (uintmax_t)phys_avail[i], (uintmax_t)phys_avail[i + 1]);
1874 if (pa == phys_avail[i] || pa == phys_avail[i + 1])
1875 return (EJUSTRETURN);
1876 cnt = vm_phys_avail_count();
1877 if (cnt >= PHYS_AVAIL_ENTRIES)
1878 return (ENOSPC);
1879 memmove(&phys_avail[i + 2], &phys_avail[i],
1880 (cnt - i) * sizeof(phys_avail[0]));
1881 phys_avail[i + 1] = pa;
1882 phys_avail[i + 2] = pa;
1883 vm_phys_avail_check(i);
1884 vm_phys_avail_check(i+2);
1885
1886 return (0);
1887 }
1888
1889 /*
1890 * Check if a given physical address can be included as part of a crash dump.
1891 */
1892 bool
vm_phys_is_dumpable(vm_paddr_t pa)1893 vm_phys_is_dumpable(vm_paddr_t pa)
1894 {
1895 vm_page_t m;
1896 int i;
1897
1898 if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
1899 return ((m->flags & PG_NODUMP) == 0);
1900
1901 for (i = 0; dump_avail[i] != 0 || dump_avail[i + 1] != 0; i += 2) {
1902 if (pa >= dump_avail[i] && pa < dump_avail[i + 1])
1903 return (true);
1904 }
1905 return (false);
1906 }
1907
1908 void
vm_phys_early_add_seg(vm_paddr_t start,vm_paddr_t end)1909 vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end)
1910 {
1911 struct vm_phys_seg *seg;
1912
1913 if (vm_phys_early_nsegs == -1)
1914 panic("%s: called after initialization", __func__);
1915 if (vm_phys_early_nsegs == nitems(vm_phys_early_segs))
1916 panic("%s: ran out of early segments", __func__);
1917
1918 seg = &vm_phys_early_segs[vm_phys_early_nsegs++];
1919 seg->start = start;
1920 seg->end = end;
1921 }
1922
1923 /*
1924 * This routine allocates NUMA node specific memory before the page
1925 * allocator is bootstrapped.
1926 */
1927 vm_paddr_t
vm_phys_early_alloc(int domain,size_t alloc_size)1928 vm_phys_early_alloc(int domain, size_t alloc_size)
1929 {
1930 #ifdef NUMA
1931 int mem_index;
1932 #endif
1933 int i, biggestone;
1934 vm_paddr_t pa, mem_start, mem_end, size, biggestsize, align;
1935
1936 KASSERT(domain == -1 || (domain >= 0 && domain < vm_ndomains),
1937 ("%s: invalid domain index %d", __func__, domain));
1938
1939 /*
1940 * Search the mem_affinity array for the biggest address
1941 * range in the desired domain. This is used to constrain
1942 * the phys_avail selection below.
1943 */
1944 biggestsize = 0;
1945 mem_start = 0;
1946 mem_end = -1;
1947 #ifdef NUMA
1948 mem_index = 0;
1949 if (mem_affinity != NULL) {
1950 for (i = 0;; i++) {
1951 size = mem_affinity[i].end - mem_affinity[i].start;
1952 if (size == 0)
1953 break;
1954 if (domain != -1 && mem_affinity[i].domain != domain)
1955 continue;
1956 if (size > biggestsize) {
1957 mem_index = i;
1958 biggestsize = size;
1959 }
1960 }
1961 mem_start = mem_affinity[mem_index].start;
1962 mem_end = mem_affinity[mem_index].end;
1963 }
1964 #endif
1965
1966 /*
1967 * Now find biggest physical segment in within the desired
1968 * numa domain.
1969 */
1970 biggestsize = 0;
1971 biggestone = 0;
1972 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1973 /* skip regions that are out of range */
1974 if (phys_avail[i+1] - alloc_size < mem_start ||
1975 phys_avail[i+1] > mem_end)
1976 continue;
1977 size = vm_phys_avail_size(i);
1978 if (size > biggestsize) {
1979 biggestone = i;
1980 biggestsize = size;
1981 }
1982 }
1983 alloc_size = round_page(alloc_size);
1984
1985 /*
1986 * Grab single pages from the front to reduce fragmentation.
1987 */
1988 if (alloc_size == PAGE_SIZE) {
1989 pa = phys_avail[biggestone];
1990 phys_avail[biggestone] += PAGE_SIZE;
1991 vm_phys_avail_check(biggestone);
1992 return (pa);
1993 }
1994
1995 /*
1996 * Naturally align large allocations.
1997 */
1998 align = phys_avail[biggestone + 1] & (alloc_size - 1);
1999 if (alloc_size + align > biggestsize)
2000 panic("cannot find a large enough size\n");
2001 if (align != 0 &&
2002 vm_phys_avail_split(phys_avail[biggestone + 1] - align,
2003 biggestone) != 0)
2004 /* Wasting memory. */
2005 phys_avail[biggestone + 1] -= align;
2006
2007 phys_avail[biggestone + 1] -= alloc_size;
2008 vm_phys_avail_check(biggestone);
2009 pa = phys_avail[biggestone + 1];
2010 return (pa);
2011 }
2012
2013 void
vm_phys_early_startup(void)2014 vm_phys_early_startup(void)
2015 {
2016 struct vm_phys_seg *seg;
2017 int i;
2018
2019 if (phys_avail[1] == 0)
2020 panic("phys_avail[] is empty");
2021
2022 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
2023 phys_avail[i] = round_page(phys_avail[i]);
2024 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
2025 }
2026
2027 for (i = 0; i < vm_phys_early_nsegs; i++) {
2028 seg = &vm_phys_early_segs[i];
2029 vm_phys_add_seg(seg->start, seg->end);
2030 }
2031 vm_phys_early_nsegs = -1;
2032
2033 #ifdef NUMA
2034 /* Force phys_avail to be split by domain. */
2035 if (mem_affinity != NULL) {
2036 int idx;
2037
2038 for (i = 0; mem_affinity[i].end != 0; i++) {
2039 idx = vm_phys_avail_find(mem_affinity[i].start);
2040 if (idx != -1)
2041 vm_phys_avail_split(mem_affinity[i].start, idx);
2042 idx = vm_phys_avail_find(mem_affinity[i].end);
2043 if (idx != -1)
2044 vm_phys_avail_split(mem_affinity[i].end, idx);
2045 }
2046 }
2047 #endif
2048 }
2049
#ifdef DDB
/*
 * DDB "show freepages" command.  For every domain and every free list,
 * print a table with one row per order (highest order first) and one
 * column per pool; each cell is the 'lcnt' field of the corresponding
 * queue in vm_phys_free_queues.
 */
DB_SHOW_COMMAND_FLAGS(freepages, db_show_freepages, DB_CMD_MEMSAFE)
{
	int domain, flist, kbytes, order, pool;

	for (domain = 0; domain < vm_ndomains; domain++) {
		db_printf("DOMAIN: %d\n", domain);
		for (flist = 0; flist < vm_nfreelists; flist++) {
			/* Table header: title, column names, separator. */
			db_printf("FREE LIST %d:\n"
			    "\n ORDER (SIZE) | NUMBER"
			    "\n ", flist);
			for (pool = 0; pool < VM_NFREEPOOL; pool++)
				db_printf(" | POOL %d", pool);
			db_printf("\n-- ");
			for (pool = 0; pool < VM_NFREEPOOL; pool++)
				db_printf("-- -- ");
			db_printf("--\n");

			/* One row per order, walking from the top down. */
			for (order = VM_NFREEORDER; order-- > 0;) {
				/*
				 * (1 << (PAGE_SHIFT + order)) bytes,
				 * expressed in kilobytes.
				 */
				kbytes = 1 << (PAGE_SHIFT - 10 + order);
				db_printf(" %2.2d (%6.6dK)", order, kbytes);
				for (pool = 0; pool < VM_NFREEPOOL; pool++)
					db_printf(" | %6.6d",
					    vm_phys_free_queues[domain][flist]
					    [pool][order].lcnt);
				db_printf("\n");
			}
			db_printf("\n");
		}
		db_printf("\n");
	}
}
#endif
2086