xref: /freebsd-13-stable/sys/vm/vm_radix.c (revision 3bc80996974a61a4223eae4c1ccd47b6ee32a48a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 EMC Corp.
5  * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
6  * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 
32 /*
33  * Path-compressed radix trie implementation.
34  * The following code is not generalized into a general purpose library
35  * because there are way too many parameters embedded that should really
36  * be decided by the library consumers.  At the same time, consumers
37  * of this code must achieve highest possible performance.
38  *
39  * The implementation takes into account the following rationale:
40  * - Size of the nodes should be as small as possible but still big enough
41  *   to avoid a large maximum depth for the trie.  This is a balance
42  *   between the necessity to not wire too much physical memory for the nodes
43  *   and the necessity to avoid too much cache pollution during the trie
44  *   operations.
45  * - There is not a huge bias toward the number of lookup operations over
46  *   the number of insert and remove operations.  This basically implies
47  *   that optimizations supposedly helping one operation but hurting the
48  *   other might be carefully evaluated.
49  * - On average not many nodes are expected to be fully populated, hence
50  *   level compression may just complicate things.
51  */
52 
53 #include <sys/cdefs.h>
54 #include "opt_ddb.h"
55 
56 #include <sys/param.h>
57 #include <sys/systm.h>
58 #include <sys/kernel.h>
59 #include <sys/proc.h>
60 #include <sys/vmmeter.h>
61 #include <sys/smr.h>
62 #include <sys/smr_types.h>
63 
64 #include <vm/uma.h>
65 #include <vm/vm.h>
66 #include <vm/vm_param.h>
67 #include <vm/vm_object.h>
68 #include <vm/vm_page.h>
69 #include <vm/vm_radix.h>
70 
71 #ifdef DDB
72 #include <ddb/ddb.h>
73 #endif
74 
75 /*
76  * These widths should allow the pointers to a node's children to fit within
77  * a single cache line.  The extra levels from a narrow width should not be
78  * a problem thanks to path compression.
79  */
80 #ifdef __LP64__
81 #define	VM_RADIX_WIDTH	4
82 #else
83 #define	VM_RADIX_WIDTH	3
84 #endif
85 
86 #define	VM_RADIX_COUNT	(1 << VM_RADIX_WIDTH)
87 #define	VM_RADIX_MASK	(VM_RADIX_COUNT - 1)
88 #define	VM_RADIX_LIMIT							\
89 	(howmany(sizeof(vm_pindex_t) * NBBY, VM_RADIX_WIDTH) - 1)
90 
91 /* Flag bits stored in node pointers. */
92 #define	VM_RADIX_ISLEAF	0x1
93 #define	VM_RADIX_FLAGS	0x1
94 #define	VM_RADIX_PAD	VM_RADIX_FLAGS
95 
96 /* Returns one unit associated with specified level. */
97 #define	VM_RADIX_UNITLEVEL(lev)						\
98 	((vm_pindex_t)1 << ((lev) * VM_RADIX_WIDTH))
99 
100 enum vm_radix_access { SMR, LOCKED, UNSERIALIZED };
101 
102 struct vm_radix_node;
103 typedef SMR_POINTER(struct vm_radix_node *) smrnode_t;
104 
105 struct vm_radix_node {
106 	vm_pindex_t	rn_owner;			/* Owner of record. */
107 	uint16_t	rn_count;			/* Valid children. */
108 	uint8_t		rn_clev;			/* Current level. */
109 	int8_t		rn_last;			/* zero last ptr. */
110 	smrnode_t	rn_child[VM_RADIX_COUNT];	/* Child nodes. */
111 };
112 
113 static uma_zone_t vm_radix_node_zone;
114 static smr_t vm_radix_smr;
115 
116 static void vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
117     enum vm_radix_access access);
118 
119 /*
120  * Allocate a radix node.
121  */
122 static struct vm_radix_node *
vm_radix_node_get(vm_pindex_t owner,uint16_t count,uint16_t clevel)123 vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
124 {
125 	struct vm_radix_node *rnode;
126 
127 	rnode = uma_zalloc_smr(vm_radix_node_zone, M_NOWAIT);
128 	if (rnode == NULL)
129 		return (NULL);
130 
131 	/*
132 	 * We want to clear the last child pointer after the final section
133 	 * has exited so lookup can not return false negatives.  It is done
134 	 * here because it will be cache-cold in the dtor callback.
135 	 */
136 	if (rnode->rn_last != 0) {
137 		vm_radix_node_store(&rnode->rn_child[rnode->rn_last - 1],
138 		    NULL, UNSERIALIZED);
139 		rnode->rn_last = 0;
140 	}
141 	rnode->rn_owner = owner;
142 	rnode->rn_count = count;
143 	rnode->rn_clev = clevel;
144 	return (rnode);
145 }
146 
147 /*
148  * Free radix node.
149  */
150 static __inline void
vm_radix_node_put(struct vm_radix_node * rnode,int8_t last)151 vm_radix_node_put(struct vm_radix_node *rnode, int8_t last)
152 {
153 #ifdef INVARIANTS
154 	int slot;
155 
156 	KASSERT(rnode->rn_count == 0,
157 	    ("vm_radix_node_put: rnode %p has %d children", rnode,
158 	    rnode->rn_count));
159 	for (slot = 0; slot < VM_RADIX_COUNT; slot++) {
160 		if (slot == last)
161 			continue;
162 		KASSERT(smr_unserialized_load(&rnode->rn_child[slot], true) ==
163 		    NULL, ("vm_radix_node_put: rnode %p has a child", rnode));
164 	}
165 #endif
166 	/* Off by one so a freshly zero'd node is not assigned to. */
167 	rnode->rn_last = last + 1;
168 	uma_zfree_smr(vm_radix_node_zone, rnode);
169 }
170 
171 /*
172  * Return the position in the array for a given level.
173  */
174 static __inline int
vm_radix_slot(vm_pindex_t index,uint16_t level)175 vm_radix_slot(vm_pindex_t index, uint16_t level)
176 {
177 
178 	return ((index >> (level * VM_RADIX_WIDTH)) & VM_RADIX_MASK);
179 }
180 
181 /* Trims the key after the specified level. */
182 static __inline vm_pindex_t
vm_radix_trimkey(vm_pindex_t index,uint16_t level)183 vm_radix_trimkey(vm_pindex_t index, uint16_t level)
184 {
185 	vm_pindex_t ret;
186 
187 	ret = index;
188 	if (level > 0) {
189 		ret >>= level * VM_RADIX_WIDTH;
190 		ret <<= level * VM_RADIX_WIDTH;
191 	}
192 	return (ret);
193 }
194 
195 /*
196  * Fetch a node pointer from a slot in another node.
197  */
198 static __inline struct vm_radix_node *
vm_radix_node_load(smrnode_t * p,enum vm_radix_access access)199 vm_radix_node_load(smrnode_t *p, enum vm_radix_access access)
200 {
201 
202 	switch (access) {
203 	case UNSERIALIZED:
204 		return (smr_unserialized_load(p, true));
205 	case LOCKED:
206 		return (smr_serialized_load(p, true));
207 	case SMR:
208 		return (smr_entered_load(p, vm_radix_smr));
209 	}
210 	__assert_unreachable();
211 }
212 
213 static __inline void
vm_radix_node_store(smrnode_t * p,struct vm_radix_node * v,enum vm_radix_access access)214 vm_radix_node_store(smrnode_t *p, struct vm_radix_node *v,
215     enum vm_radix_access access)
216 {
217 
218 	switch (access) {
219 	case UNSERIALIZED:
220 		smr_unserialized_store(p, v, true);
221 		break;
222 	case LOCKED:
223 		smr_serialized_store(p, v, true);
224 		break;
225 	case SMR:
226 		panic("vm_radix_node_store: Not supported in smr section.");
227 	}
228 }
229 
230 /*
231  * Get the root node for a radix tree.
232  */
233 static __inline struct vm_radix_node *
vm_radix_root_load(struct vm_radix * rtree,enum vm_radix_access access)234 vm_radix_root_load(struct vm_radix *rtree, enum vm_radix_access access)
235 {
236 
237 	return (vm_radix_node_load((smrnode_t *)&rtree->rt_root, access));
238 }
239 
240 /*
241  * Set the root node for a radix tree.
242  */
243 static __inline void
vm_radix_root_store(struct vm_radix * rtree,struct vm_radix_node * rnode,enum vm_radix_access access)244 vm_radix_root_store(struct vm_radix *rtree, struct vm_radix_node *rnode,
245     enum vm_radix_access access)
246 {
247 
248 	vm_radix_node_store((smrnode_t *)&rtree->rt_root, rnode, access);
249 }
250 
251 /*
252  * Returns TRUE if the specified radix node is a leaf and FALSE otherwise.
253  */
254 static __inline boolean_t
vm_radix_isleaf(struct vm_radix_node * rnode)255 vm_radix_isleaf(struct vm_radix_node *rnode)
256 {
257 
258 	return (((uintptr_t)rnode & VM_RADIX_ISLEAF) != 0);
259 }
260 
261 /*
262  * Returns the associated page extracted from rnode.
263  */
264 static __inline vm_page_t
vm_radix_topage(struct vm_radix_node * rnode)265 vm_radix_topage(struct vm_radix_node *rnode)
266 {
267 
268 	return ((vm_page_t)((uintptr_t)rnode & ~VM_RADIX_FLAGS));
269 }
270 
271 /*
272  * Adds the page as a child of the provided node.
273  */
274 static __inline void
vm_radix_addpage(struct vm_radix_node * rnode,vm_pindex_t index,uint16_t clev,vm_page_t page,enum vm_radix_access access)275 vm_radix_addpage(struct vm_radix_node *rnode, vm_pindex_t index, uint16_t clev,
276     vm_page_t page, enum vm_radix_access access)
277 {
278 	int slot;
279 
280 	slot = vm_radix_slot(index, clev);
281 	vm_radix_node_store(&rnode->rn_child[slot],
282 	    (struct vm_radix_node *)((uintptr_t)page | VM_RADIX_ISLEAF), access);
283 }
284 
285 /*
286  * Returns the slot where two keys differ.
287  * It cannot accept 2 equal keys.
288  */
289 static __inline uint16_t
vm_radix_keydiff(vm_pindex_t index1,vm_pindex_t index2)290 vm_radix_keydiff(vm_pindex_t index1, vm_pindex_t index2)
291 {
292 	uint16_t clev;
293 
294 	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
295 	    __func__, (uintmax_t)index1));
296 
297 	index1 ^= index2;
298 	for (clev = VM_RADIX_LIMIT;; clev--)
299 		if (vm_radix_slot(index1, clev) != 0)
300 			return (clev);
301 }
302 
303 /*
304  * Returns TRUE if it can be determined that key does not belong to the
305  * specified rnode.  Otherwise, returns FALSE.
306  */
307 static __inline boolean_t
vm_radix_keybarr(struct vm_radix_node * rnode,vm_pindex_t idx)308 vm_radix_keybarr(struct vm_radix_node *rnode, vm_pindex_t idx)
309 {
310 
311 	if (rnode->rn_clev < VM_RADIX_LIMIT) {
312 		idx = vm_radix_trimkey(idx, rnode->rn_clev + 1);
313 		return (idx != rnode->rn_owner);
314 	}
315 	return (FALSE);
316 }
317 
318 /*
319  * Internal helper for vm_radix_reclaim_allnodes().
320  * This function is recursive.
321  */
322 static void
vm_radix_reclaim_allnodes_int(struct vm_radix_node * rnode)323 vm_radix_reclaim_allnodes_int(struct vm_radix_node *rnode)
324 {
325 	struct vm_radix_node *child;
326 	int slot;
327 
328 	KASSERT(rnode->rn_count <= VM_RADIX_COUNT,
329 	    ("vm_radix_reclaim_allnodes_int: bad count in rnode %p", rnode));
330 	for (slot = 0; rnode->rn_count != 0; slot++) {
331 		child = vm_radix_node_load(&rnode->rn_child[slot], UNSERIALIZED);
332 		if (child == NULL)
333 			continue;
334 		if (!vm_radix_isleaf(child))
335 			vm_radix_reclaim_allnodes_int(child);
336 		vm_radix_node_store(&rnode->rn_child[slot], NULL, UNSERIALIZED);
337 		rnode->rn_count--;
338 	}
339 	vm_radix_node_put(rnode, -1);
340 }
341 
342 #ifndef UMA_MD_SMALL_ALLOC
343 void vm_radix_reserve_kva(void);
344 /*
345  * Reserve the KVA necessary to satisfy the node allocation.
346  * This is mandatory in architectures not supporting direct
347  * mapping as they will need otherwise to carve into the kernel maps for
348  * every node allocation, resulting into deadlocks for consumers already
349  * working with kernel maps.
350  */
351 void
vm_radix_reserve_kva(void)352 vm_radix_reserve_kva(void)
353 {
354 
355 	/*
356 	 * Calculate the number of reserved nodes, discounting the pages that
357 	 * are needed to store them.
358 	 */
359 	if (!uma_zone_reserve_kva(vm_radix_node_zone,
360 	    ((vm_paddr_t)vm_cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
361 	    sizeof(struct vm_radix_node))))
362 		panic("%s: unable to reserve KVA", __func__);
363 }
364 #endif
365 
366 /*
367  * Initialize the UMA slab zone.
368  */
369 void
vm_radix_zinit(void)370 vm_radix_zinit(void)
371 {
372 
373 	vm_radix_node_zone = uma_zcreate("RADIX NODE",
374 	    sizeof(struct vm_radix_node), NULL, NULL, NULL, NULL,
375 	    VM_RADIX_PAD, UMA_ZONE_VM | UMA_ZONE_SMR | UMA_ZONE_ZINIT);
376 	vm_radix_smr = uma_zone_get_smr(vm_radix_node_zone);
377 }
378 
379 /*
380  * Inserts the key-value pair into the trie.
381  * Panics if the key already exists.
382  */
383 int
vm_radix_insert(struct vm_radix * rtree,vm_page_t page)384 vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
385 {
386 	vm_pindex_t index, newind;
387 	struct vm_radix_node *rnode, *tmp;
388 	smrnode_t *parentp;
389 	vm_page_t m;
390 	int slot;
391 	uint16_t clev;
392 
393 	index = page->pindex;
394 
395 	/*
396 	 * The owner of record for root is not really important because it
397 	 * will never be used.
398 	 */
399 	rnode = vm_radix_root_load(rtree, LOCKED);
400 	if (rnode == NULL) {
401 		rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
402 		return (0);
403 	}
404 	parentp = (smrnode_t *)&rtree->rt_root;
405 	for (;;) {
406 		if (vm_radix_isleaf(rnode)) {
407 			m = vm_radix_topage(rnode);
408 			if (m->pindex == index)
409 				panic("%s: key %jx is already present",
410 				    __func__, (uintmax_t)index);
411 			clev = vm_radix_keydiff(m->pindex, index);
412 			tmp = vm_radix_node_get(vm_radix_trimkey(index,
413 			    clev + 1), 2, clev);
414 			if (tmp == NULL)
415 				return (ENOMEM);
416 			/* These writes are not yet visible due to ordering. */
417 			vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
418 			vm_radix_addpage(tmp, m->pindex, clev, m, UNSERIALIZED);
419 			/* Synchronize to make leaf visible. */
420 			vm_radix_node_store(parentp, tmp, LOCKED);
421 			return (0);
422 		} else if (vm_radix_keybarr(rnode, index))
423 			break;
424 		slot = vm_radix_slot(index, rnode->rn_clev);
425 		parentp = &rnode->rn_child[slot];
426 		tmp = vm_radix_node_load(parentp, LOCKED);
427 		if (tmp == NULL) {
428 			rnode->rn_count++;
429 			vm_radix_addpage(rnode, index, rnode->rn_clev, page,
430 			    LOCKED);
431 			return (0);
432 		}
433 		rnode = tmp;
434 	}
435 
436 	/*
437 	 * A new node is needed because the right insertion level is reached.
438 	 * Setup the new intermediate node and add the 2 children: the
439 	 * new object and the older edge.
440 	 */
441 	newind = rnode->rn_owner;
442 	clev = vm_radix_keydiff(newind, index);
443 	tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
444 	if (tmp == NULL)
445 		return (ENOMEM);
446 	slot = vm_radix_slot(newind, clev);
447 	/* These writes are not yet visible due to ordering. */
448 	vm_radix_addpage(tmp, index, clev, page, UNSERIALIZED);
449 	vm_radix_node_store(&tmp->rn_child[slot], rnode, UNSERIALIZED);
450 	/* Serializing write to make the above visible. */
451 	vm_radix_node_store(parentp, tmp, LOCKED);
452 
453 	return (0);
454 }
455 
456 /*
457  * Returns TRUE if the specified radix tree contains a single leaf and FALSE
458  * otherwise.
459  */
460 boolean_t
vm_radix_is_singleton(struct vm_radix * rtree)461 vm_radix_is_singleton(struct vm_radix *rtree)
462 {
463 	struct vm_radix_node *rnode;
464 
465 	rnode = vm_radix_root_load(rtree, LOCKED);
466 	if (rnode == NULL)
467 		return (FALSE);
468 	return (vm_radix_isleaf(rnode));
469 }
470 
471 /*
472  * Returns the value stored at the index.  If the index is not present,
473  * NULL is returned.
474  */
475 static __always_inline vm_page_t
_vm_radix_lookup(struct vm_radix * rtree,vm_pindex_t index,enum vm_radix_access access)476 _vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index,
477     enum vm_radix_access access)
478 {
479 	struct vm_radix_node *rnode;
480 	vm_page_t m;
481 	int slot;
482 
483 	rnode = vm_radix_root_load(rtree, access);
484 	while (rnode != NULL) {
485 		if (vm_radix_isleaf(rnode)) {
486 			m = vm_radix_topage(rnode);
487 			if (m->pindex == index)
488 				return (m);
489 			break;
490 		}
491 		if (vm_radix_keybarr(rnode, index))
492 			break;
493 		slot = vm_radix_slot(index, rnode->rn_clev);
494 		rnode = vm_radix_node_load(&rnode->rn_child[slot], access);
495 	}
496 	return (NULL);
497 }
498 
499 /*
500  * Returns the value stored at the index assuming there is an external lock.
501  *
502  * If the index is not present, NULL is returned.
503  */
504 vm_page_t
vm_radix_lookup(struct vm_radix * rtree,vm_pindex_t index)505 vm_radix_lookup(struct vm_radix *rtree, vm_pindex_t index)
506 {
507 
508 	return _vm_radix_lookup(rtree, index, LOCKED);
509 }
510 
511 /*
512  * Returns the value stored at the index without requiring an external lock.
513  *
514  * If the index is not present, NULL is returned.
515  */
516 vm_page_t
vm_radix_lookup_unlocked(struct vm_radix * rtree,vm_pindex_t index)517 vm_radix_lookup_unlocked(struct vm_radix *rtree, vm_pindex_t index)
518 {
519 	vm_page_t m;
520 
521 	smr_enter(vm_radix_smr);
522 	m = _vm_radix_lookup(rtree, index, SMR);
523 	smr_exit(vm_radix_smr);
524 
525 	return (m);
526 }
527 
528 /*
529  * Look up the nearest entry at a position greater than or equal to index.
530  */
531 vm_page_t
vm_radix_lookup_ge(struct vm_radix * rtree,vm_pindex_t index)532 vm_radix_lookup_ge(struct vm_radix *rtree, vm_pindex_t index)
533 {
534 	struct vm_radix_node *stack[VM_RADIX_LIMIT];
535 	vm_pindex_t inc;
536 	vm_page_t m;
537 	struct vm_radix_node *child, *rnode;
538 #ifdef INVARIANTS
539 	int loops = 0;
540 #endif
541 	int slot, tos;
542 
543 	rnode = vm_radix_root_load(rtree, LOCKED);
544 	if (rnode == NULL)
545 		return (NULL);
546 	else if (vm_radix_isleaf(rnode)) {
547 		m = vm_radix_topage(rnode);
548 		if (m->pindex >= index)
549 			return (m);
550 		else
551 			return (NULL);
552 	}
553 	tos = 0;
554 	for (;;) {
555 		/*
556 		 * If the keys differ before the current bisection node,
557 		 * then the search key might rollback to the earliest
558 		 * available bisection node or to the smallest key
559 		 * in the current node (if the owner is greater than the
560 		 * search key).
561 		 */
562 		if (vm_radix_keybarr(rnode, index)) {
563 			if (index > rnode->rn_owner) {
564 ascend:
565 				KASSERT(++loops < 1000,
566 				    ("vm_radix_lookup_ge: too many loops"));
567 
568 				/*
569 				 * Pop nodes from the stack until either the
570 				 * stack is empty or a node that could have a
571 				 * matching descendant is found.
572 				 */
573 				do {
574 					if (tos == 0)
575 						return (NULL);
576 					rnode = stack[--tos];
577 				} while (vm_radix_slot(index,
578 				    rnode->rn_clev) == (VM_RADIX_COUNT - 1));
579 
580 				/*
581 				 * The following computation cannot overflow
582 				 * because index's slot at the current level
583 				 * is less than VM_RADIX_COUNT - 1.
584 				 */
585 				index = vm_radix_trimkey(index,
586 				    rnode->rn_clev);
587 				index += VM_RADIX_UNITLEVEL(rnode->rn_clev);
588 			} else
589 				index = rnode->rn_owner;
590 			KASSERT(!vm_radix_keybarr(rnode, index),
591 			    ("vm_radix_lookup_ge: keybarr failed"));
592 		}
593 		slot = vm_radix_slot(index, rnode->rn_clev);
594 		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
595 		if (vm_radix_isleaf(child)) {
596 			m = vm_radix_topage(child);
597 			if (m->pindex >= index)
598 				return (m);
599 		} else if (child != NULL)
600 			goto descend;
601 
602 		/*
603 		 * Look for an available edge or page within the current
604 		 * bisection node.
605 		 */
606                 if (slot < (VM_RADIX_COUNT - 1)) {
607 			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
608 			index = vm_radix_trimkey(index, rnode->rn_clev);
609 			do {
610 				index += inc;
611 				slot++;
612 				child = vm_radix_node_load(&rnode->rn_child[slot],
613 				    LOCKED);
614 				if (vm_radix_isleaf(child)) {
615 					m = vm_radix_topage(child);
616 					if (m->pindex >= index)
617 						return (m);
618 				} else if (child != NULL)
619 					goto descend;
620 			} while (slot < (VM_RADIX_COUNT - 1));
621 		}
622 		KASSERT(child == NULL || vm_radix_isleaf(child),
623 		    ("vm_radix_lookup_ge: child is radix node"));
624 
625 		/*
626 		 * If a page or edge greater than the search slot is not found
627 		 * in the current node, ascend to the next higher-level node.
628 		 */
629 		goto ascend;
630 descend:
631 		KASSERT(rnode->rn_clev > 0,
632 		    ("vm_radix_lookup_ge: pushing leaf's parent"));
633 		KASSERT(tos < VM_RADIX_LIMIT,
634 		    ("vm_radix_lookup_ge: stack overflow"));
635 		stack[tos++] = rnode;
636 		rnode = child;
637 	}
638 }
639 
640 /*
641  * Look up the nearest entry at a position less than or equal to index.
642  */
643 vm_page_t
vm_radix_lookup_le(struct vm_radix * rtree,vm_pindex_t index)644 vm_radix_lookup_le(struct vm_radix *rtree, vm_pindex_t index)
645 {
646 	struct vm_radix_node *stack[VM_RADIX_LIMIT];
647 	vm_pindex_t inc;
648 	vm_page_t m;
649 	struct vm_radix_node *child, *rnode;
650 #ifdef INVARIANTS
651 	int loops = 0;
652 #endif
653 	int slot, tos;
654 
655 	rnode = vm_radix_root_load(rtree, LOCKED);
656 	if (rnode == NULL)
657 		return (NULL);
658 	else if (vm_radix_isleaf(rnode)) {
659 		m = vm_radix_topage(rnode);
660 		if (m->pindex <= index)
661 			return (m);
662 		else
663 			return (NULL);
664 	}
665 	tos = 0;
666 	for (;;) {
667 		/*
668 		 * If the keys differ before the current bisection node,
669 		 * then the search key might rollback to the earliest
670 		 * available bisection node or to the largest key
671 		 * in the current node (if the owner is smaller than the
672 		 * search key).
673 		 */
674 		if (vm_radix_keybarr(rnode, index)) {
675 			if (index > rnode->rn_owner) {
676 				index = rnode->rn_owner + VM_RADIX_COUNT *
677 				    VM_RADIX_UNITLEVEL(rnode->rn_clev);
678 			} else {
679 ascend:
680 				KASSERT(++loops < 1000,
681 				    ("vm_radix_lookup_le: too many loops"));
682 
683 				/*
684 				 * Pop nodes from the stack until either the
685 				 * stack is empty or a node that could have a
686 				 * matching descendant is found.
687 				 */
688 				do {
689 					if (tos == 0)
690 						return (NULL);
691 					rnode = stack[--tos];
692 				} while (vm_radix_slot(index,
693 				    rnode->rn_clev) == 0);
694 
695 				/*
696 				 * The following computation cannot overflow
697 				 * because index's slot at the current level
698 				 * is greater than 0.
699 				 */
700 				index = vm_radix_trimkey(index,
701 				    rnode->rn_clev);
702 			}
703 			index--;
704 			KASSERT(!vm_radix_keybarr(rnode, index),
705 			    ("vm_radix_lookup_le: keybarr failed"));
706 		}
707 		slot = vm_radix_slot(index, rnode->rn_clev);
708 		child = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
709 		if (vm_radix_isleaf(child)) {
710 			m = vm_radix_topage(child);
711 			if (m->pindex <= index)
712 				return (m);
713 		} else if (child != NULL)
714 			goto descend;
715 
716 		/*
717 		 * Look for an available edge or page within the current
718 		 * bisection node.
719 		 */
720 		if (slot > 0) {
721 			inc = VM_RADIX_UNITLEVEL(rnode->rn_clev);
722 			index |= inc - 1;
723 			do {
724 				index -= inc;
725 				slot--;
726 				child = vm_radix_node_load(&rnode->rn_child[slot],
727 				    LOCKED);
728 				if (vm_radix_isleaf(child)) {
729 					m = vm_radix_topage(child);
730 					if (m->pindex <= index)
731 						return (m);
732 				} else if (child != NULL)
733 					goto descend;
734 			} while (slot > 0);
735 		}
736 		KASSERT(child == NULL || vm_radix_isleaf(child),
737 		    ("vm_radix_lookup_le: child is radix node"));
738 
739 		/*
740 		 * If a page or edge smaller than the search slot is not found
741 		 * in the current node, ascend to the next higher-level node.
742 		 */
743 		goto ascend;
744 descend:
745 		KASSERT(rnode->rn_clev > 0,
746 		    ("vm_radix_lookup_le: pushing leaf's parent"));
747 		KASSERT(tos < VM_RADIX_LIMIT,
748 		    ("vm_radix_lookup_le: stack overflow"));
749 		stack[tos++] = rnode;
750 		rnode = child;
751 	}
752 }
753 
754 /*
755  * Remove the specified index from the trie, and return the value stored at
756  * that index.  If the index is not present, return NULL.
757  */
758 vm_page_t
vm_radix_remove(struct vm_radix * rtree,vm_pindex_t index)759 vm_radix_remove(struct vm_radix *rtree, vm_pindex_t index)
760 {
761 	struct vm_radix_node *rnode, *parent, *tmp;
762 	vm_page_t m;
763 	int i, slot;
764 
765 	rnode = vm_radix_root_load(rtree, LOCKED);
766 	if (vm_radix_isleaf(rnode)) {
767 		m = vm_radix_topage(rnode);
768 		if (m->pindex != index)
769 			return (NULL);
770 		vm_radix_root_store(rtree, NULL, LOCKED);
771 		return (m);
772 	}
773 	parent = NULL;
774 	for (;;) {
775 		if (rnode == NULL)
776 			return (NULL);
777 		slot = vm_radix_slot(index, rnode->rn_clev);
778 		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
779 		if (vm_radix_isleaf(tmp)) {
780 			m = vm_radix_topage(tmp);
781 			if (m->pindex != index)
782 				return (NULL);
783 			vm_radix_node_store(&rnode->rn_child[slot], NULL, LOCKED);
784 			rnode->rn_count--;
785 			if (rnode->rn_count > 1)
786 				return (m);
787 			for (i = 0; i < VM_RADIX_COUNT; i++)
788 				if (vm_radix_node_load(&rnode->rn_child[i],
789 				    LOCKED) != NULL)
790 					break;
791 			KASSERT(i != VM_RADIX_COUNT,
792 			    ("%s: invalid node configuration", __func__));
793 			tmp = vm_radix_node_load(&rnode->rn_child[i], LOCKED);
794 			if (parent == NULL)
795 				vm_radix_root_store(rtree, tmp, LOCKED);
796 			else {
797 				slot = vm_radix_slot(index, parent->rn_clev);
798 				KASSERT(vm_radix_node_load(
799 				    &parent->rn_child[slot], LOCKED) == rnode,
800 				    ("%s: invalid child value", __func__));
801 				vm_radix_node_store(&parent->rn_child[slot],
802 				    tmp, LOCKED);
803 			}
804 			/*
805 			 * The child is still valid and we can not zero the
806 			 * pointer until all smr references are gone.
807 			 */
808 			rnode->rn_count--;
809 			vm_radix_node_put(rnode, i);
810 			return (m);
811 		}
812 		parent = rnode;
813 		rnode = tmp;
814 	}
815 }
816 
817 /*
818  * Remove and free all the nodes from the radix tree.
819  * This function is recursive but there is a tight control on it as the
820  * maximum depth of the tree is fixed.
821  */
822 void
vm_radix_reclaim_allnodes(struct vm_radix * rtree)823 vm_radix_reclaim_allnodes(struct vm_radix *rtree)
824 {
825 	struct vm_radix_node *root;
826 
827 	root = vm_radix_root_load(rtree, LOCKED);
828 	if (root == NULL)
829 		return;
830 	vm_radix_root_store(rtree, NULL, UNSERIALIZED);
831 	if (!vm_radix_isleaf(root))
832 		vm_radix_reclaim_allnodes_int(root);
833 }
834 
835 /*
836  * Replace an existing page in the trie with another one.
837  * Panics if there is not an old page in the trie at the new page's index.
838  */
839 vm_page_t
vm_radix_replace(struct vm_radix * rtree,vm_page_t newpage)840 vm_radix_replace(struct vm_radix *rtree, vm_page_t newpage)
841 {
842 	struct vm_radix_node *rnode, *tmp;
843 	vm_page_t m;
844 	vm_pindex_t index;
845 	int slot;
846 
847 	index = newpage->pindex;
848 	rnode = vm_radix_root_load(rtree, LOCKED);
849 	if (rnode == NULL)
850 		panic("%s: replacing page on an empty trie", __func__);
851 	if (vm_radix_isleaf(rnode)) {
852 		m = vm_radix_topage(rnode);
853 		if (m->pindex != index)
854 			panic("%s: original replacing root key not found",
855 			    __func__);
856 		rtree->rt_root = (uintptr_t)newpage | VM_RADIX_ISLEAF;
857 		return (m);
858 	}
859 	for (;;) {
860 		slot = vm_radix_slot(index, rnode->rn_clev);
861 		tmp = vm_radix_node_load(&rnode->rn_child[slot], LOCKED);
862 		if (vm_radix_isleaf(tmp)) {
863 			m = vm_radix_topage(tmp);
864 			if (m->pindex == index) {
865 				vm_radix_node_store(&rnode->rn_child[slot],
866 				    (struct vm_radix_node *)((uintptr_t)newpage |
867 				    VM_RADIX_ISLEAF), LOCKED);
868 				return (m);
869 			} else
870 				break;
871 		} else if (tmp == NULL || vm_radix_keybarr(tmp, index))
872 			break;
873 		rnode = tmp;
874 	}
875 	panic("%s: original replacing page not found", __func__);
876 }
877 
878 void
vm_radix_wait(void)879 vm_radix_wait(void)
880 {
881 	uma_zwait(vm_radix_node_zone);
882 }
883 
884 #ifdef DDB
885 /*
886  * Show details about the given radix node.
887  */
DB_SHOW_COMMAND(radixnode,db_show_radixnode)888 DB_SHOW_COMMAND(radixnode, db_show_radixnode)
889 {
890 	struct vm_radix_node *rnode, *tmp;
891 	int i;
892 
893         if (!have_addr)
894                 return;
895 	rnode = (struct vm_radix_node *)addr;
896 	db_printf("radixnode %p, owner %jx, children count %u, level %u:\n",
897 	    (void *)rnode, (uintmax_t)rnode->rn_owner, rnode->rn_count,
898 	    rnode->rn_clev);
899 	for (i = 0; i < VM_RADIX_COUNT; i++) {
900 		tmp = vm_radix_node_load(&rnode->rn_child[i], UNSERIALIZED);
901 		if (tmp != NULL)
902 			db_printf("slot: %d, val: %p, page: %p, clev: %d\n",
903 			    i, (void *)tmp,
904 			    vm_radix_isleaf(tmp) ?  vm_radix_topage(tmp) : NULL,
905 			    rnode->rn_clev);
906 	}
907 }
908 #endif /* DDB */
909