xref: /freebsd-13-stable/sys/kern/subr_pctrie.c (revision 3bc80996974a61a4223eae4c1ccd47b6ee32a48a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2013 EMC Corp.
5  * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
6  * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 
32 /*
33  * Path-compressed radix trie implementation.
34  *
35  * The implementation takes into account the following rationale:
36  * - Size of the nodes should be as small as possible but still big enough
37  *   to avoid a large maximum depth for the trie.  This is a balance
38  *   between the necessity to not wire too much physical memory for the nodes
39  *   and the necessity to avoid too much cache pollution during the trie
40  *   operations.
41  * - There is not a huge bias toward the number of lookup operations over
42  *   the number of insert and remove operations.  This basically implies
43  *   that optimizations supposedly helping one operation but hurting the
44  *   other might be carefully evaluated.
45  * - On average not many nodes are expected to be fully populated, hence
46  *   level compression may just complicate things.
47  */
48 
49 #include <sys/cdefs.h>
50 #include "opt_ddb.h"
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/kernel.h>
55 #include <sys/pctrie.h>
56 #include <sys/proc.h>	/* smr.h depends on struct thread. */
57 #include <sys/smr.h>
58 #include <sys/smr_types.h>
59 
60 #ifdef DDB
61 #include <ddb/ddb.h>
62 #endif
63 
64 #define	PCTRIE_MASK	(PCTRIE_COUNT - 1)
65 #define	PCTRIE_LIMIT	(howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
66 
67 /* Flag bits stored in node pointers. */
68 #define	PCTRIE_ISLEAF	0x1
69 #define	PCTRIE_FLAGS	0x1
70 #define	PCTRIE_PAD	PCTRIE_FLAGS
71 
72 /* Returns one unit associated with specified level. */
73 #define	PCTRIE_UNITLEVEL(lev)						\
74 	((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
75 
76 struct pctrie_node;
77 typedef SMR_POINTER(struct pctrie_node *) smr_pctnode_t;
78 
79 struct pctrie_node {
80 	uint64_t	pn_owner;			/* Owner of record. */
81 	uint16_t	pn_count;			/* Valid children. */
82 	uint8_t		pn_clev;			/* Current level. */
83 	int8_t		pn_last;			/* Zero last ptr. */
84 	smr_pctnode_t	pn_child[PCTRIE_COUNT];		/* Child nodes. */
85 };
86 
87 enum pctrie_access { PCTRIE_SMR, PCTRIE_LOCKED, PCTRIE_UNSERIALIZED };
88 
89 static __inline void pctrie_node_store(smr_pctnode_t *p, void *val,
90     enum pctrie_access access);
91 
92 /*
93  * Allocate a node.  Pre-allocation should ensure that the request
94  * will always be satisfied.
95  */
96 static struct pctrie_node *
pctrie_node_get(struct pctrie * ptree,pctrie_alloc_t allocfn,uint64_t owner,uint16_t count,uint16_t clevel)97 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
98     uint16_t count, uint16_t clevel)
99 {
100 	struct pctrie_node *node;
101 
102 	node = allocfn(ptree);
103 	if (node == NULL)
104 		return (NULL);
105 
106 	/*
107 	 * We want to clear the last child pointer after the final section
108 	 * has exited so lookup can not return false negatives.  It is done
109 	 * here because it will be cache-cold in the dtor callback.
110 	 */
111 	if (node->pn_last != 0) {
112 		pctrie_node_store(&node->pn_child[node->pn_last - 1], NULL,
113 		    PCTRIE_UNSERIALIZED);
114 		node->pn_last = 0;
115 	}
116 	node->pn_owner = owner;
117 	node->pn_count = count;
118 	node->pn_clev = clevel;
119 	return (node);
120 }
121 
122 /*
123  * Free radix node.
124  */
125 static __inline void
pctrie_node_put(struct pctrie * ptree,struct pctrie_node * node,pctrie_free_t freefn,int8_t last)126 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
127     pctrie_free_t freefn, int8_t last)
128 {
129 #ifdef INVARIANTS
130 	int slot;
131 
132 	KASSERT(node->pn_count == 0,
133 	    ("pctrie_node_put: node %p has %d children", node,
134 	    node->pn_count));
135 	for (slot = 0; slot < PCTRIE_COUNT; slot++) {
136 		if (slot == last)
137 			continue;
138 		KASSERT(smr_unserialized_load(&node->pn_child[slot], true) ==
139 		    NULL, ("pctrie_node_put: node %p has a child", node));
140 	}
141 #endif
142 	node->pn_last = last + 1;
143 	freefn(ptree, node);
144 }
145 
146 /*
147  * Return the position in the array for a given level.
148  */
149 static __inline int
pctrie_slot(uint64_t index,uint16_t level)150 pctrie_slot(uint64_t index, uint16_t level)
151 {
152 
153 	return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
154 }
155 
156 /* Trims the key after the specified level. */
157 static __inline uint64_t
pctrie_trimkey(uint64_t index,uint16_t level)158 pctrie_trimkey(uint64_t index, uint16_t level)
159 {
160 	uint64_t ret;
161 
162 	ret = index;
163 	if (level > 0) {
164 		ret >>= level * PCTRIE_WIDTH;
165 		ret <<= level * PCTRIE_WIDTH;
166 	}
167 	return (ret);
168 }
169 
170 /*
171  * Fetch a node pointer from a slot.
172  */
173 static __inline struct pctrie_node *
pctrie_node_load(smr_pctnode_t * p,smr_t smr,enum pctrie_access access)174 pctrie_node_load(smr_pctnode_t *p, smr_t smr, enum pctrie_access access)
175 {
176 	switch (access) {
177 	case PCTRIE_UNSERIALIZED:
178 		return (smr_unserialized_load(p, true));
179 	case PCTRIE_LOCKED:
180 		return (smr_serialized_load(p, true));
181 	case PCTRIE_SMR:
182 		return (smr_entered_load(p, smr));
183 	}
184 	__assert_unreachable();
185 }
186 
187 static __inline void
pctrie_node_store(smr_pctnode_t * p,void * v,enum pctrie_access access)188 pctrie_node_store(smr_pctnode_t *p, void *v, enum pctrie_access access)
189 {
190 	switch (access) {
191 	case PCTRIE_UNSERIALIZED:
192 		smr_unserialized_store(p, v, true);
193 		break;
194 	case PCTRIE_LOCKED:
195 		smr_serialized_store(p, v, true);
196 		break;
197 	case PCTRIE_SMR:
198 		panic("%s: Not supported in SMR section.", __func__);
199 		break;
200 	default:
201 		__assert_unreachable();
202 		break;
203 	}
204 }
205 
206 /*
207  * Get the root node for a tree.
208  */
209 static __inline struct pctrie_node *
pctrie_root_load(struct pctrie * ptree,smr_t smr,enum pctrie_access access)210 pctrie_root_load(struct pctrie *ptree, smr_t smr, enum pctrie_access access)
211 {
212 	return (pctrie_node_load((smr_pctnode_t *)&ptree->pt_root, smr, access));
213 }
214 
215 /*
216  * Set the root node for a tree.
217  */
218 static __inline void
pctrie_root_store(struct pctrie * ptree,struct pctrie_node * node,enum pctrie_access access)219 pctrie_root_store(struct pctrie *ptree, struct pctrie_node *node,
220     enum pctrie_access access)
221 {
222 	pctrie_node_store((smr_pctnode_t *)&ptree->pt_root, node, access);
223 }
224 
225 /*
226  * Returns TRUE if the specified node is a leaf and FALSE otherwise.
227  */
228 static __inline bool
pctrie_isleaf(struct pctrie_node * node)229 pctrie_isleaf(struct pctrie_node *node)
230 {
231 
232 	return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
233 }
234 
235 /*
236  * Returns the associated val extracted from node.
237  */
238 static __inline uint64_t *
pctrie_toval(struct pctrie_node * node)239 pctrie_toval(struct pctrie_node *node)
240 {
241 
242 	return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
243 }
244 
245 /*
246  * Adds the val as a child of the provided node.
247  */
248 static __inline void
pctrie_addval(struct pctrie_node * node,uint64_t index,uint16_t clev,uint64_t * val,enum pctrie_access access)249 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
250     uint64_t *val, enum pctrie_access access)
251 {
252 	int slot;
253 
254 	slot = pctrie_slot(index, clev);
255 	pctrie_node_store(&node->pn_child[slot],
256 	    (void *)((uintptr_t)val | PCTRIE_ISLEAF), access);
257 }
258 
259 /*
260  * Returns the slot where two keys differ.
261  * It cannot accept 2 equal keys.
262  */
263 static __inline uint16_t
pctrie_keydiff(uint64_t index1,uint64_t index2)264 pctrie_keydiff(uint64_t index1, uint64_t index2)
265 {
266 	uint16_t clev;
267 
268 	KASSERT(index1 != index2, ("%s: passing the same key value %jx",
269 	    __func__, (uintmax_t)index1));
270 
271 	index1 ^= index2;
272 	for (clev = PCTRIE_LIMIT;; clev--)
273 		if (pctrie_slot(index1, clev) != 0)
274 			return (clev);
275 }
276 
277 /*
278  * Returns TRUE if it can be determined that key does not belong to the
279  * specified node.  Otherwise, returns FALSE.
280  */
281 static __inline bool
pctrie_keybarr(struct pctrie_node * node,uint64_t idx)282 pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
283 {
284 
285 	if (node->pn_clev < PCTRIE_LIMIT) {
286 		idx = pctrie_trimkey(idx, node->pn_clev + 1);
287 		return (idx != node->pn_owner);
288 	}
289 	return (false);
290 }
291 
292 /*
293  * Internal helper for pctrie_reclaim_allnodes().
294  * This function is recursive.
295  */
296 static void
pctrie_reclaim_allnodes_int(struct pctrie * ptree,struct pctrie_node * node,pctrie_free_t freefn)297 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
298     pctrie_free_t freefn)
299 {
300 	struct pctrie_node *child;
301 	int slot;
302 
303 	KASSERT(node->pn_count <= PCTRIE_COUNT,
304 	    ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
305 	for (slot = 0; node->pn_count != 0; slot++) {
306 		child = pctrie_node_load(&node->pn_child[slot], NULL,
307 		    PCTRIE_UNSERIALIZED);
308 		if (child == NULL)
309 			continue;
310 		if (!pctrie_isleaf(child))
311 			pctrie_reclaim_allnodes_int(ptree, child, freefn);
312 		pctrie_node_store(&node->pn_child[slot], NULL,
313 		    PCTRIE_UNSERIALIZED);
314 		node->pn_count--;
315 	}
316 	pctrie_node_put(ptree, node, freefn, -1);
317 }
318 
319 /*
320  * pctrie node zone initializer.
321  */
322 int
pctrie_zone_init(void * mem,int size __unused,int flags __unused)323 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
324 {
325 	struct pctrie_node *node;
326 
327 	node = mem;
328 	node->pn_last = 0;
329 	memset(node->pn_child, 0, sizeof(node->pn_child));
330 	return (0);
331 }
332 
333 size_t
pctrie_node_size(void)334 pctrie_node_size(void)
335 {
336 
337 	return (sizeof(struct pctrie_node));
338 }
339 
340 /*
341  * Inserts the key-value pair into the trie.
342  * Panics if the key already exists.
343  */
344 int
pctrie_insert(struct pctrie * ptree,uint64_t * val,pctrie_alloc_t allocfn)345 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
346 {
347 	uint64_t index, newind;
348 	struct pctrie_node *node, *tmp;
349 	smr_pctnode_t *parentp;
350 	uint64_t *m;
351 	int slot;
352 	uint16_t clev;
353 
354 	index = *val;
355 
356 	/*
357 	 * The owner of record for root is not really important because it
358 	 * will never be used.
359 	 */
360 	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
361 	if (node == NULL) {
362 		ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
363 		return (0);
364 	}
365 	parentp = (smr_pctnode_t *)&ptree->pt_root;
366 	for (;;) {
367 		if (pctrie_isleaf(node)) {
368 			m = pctrie_toval(node);
369 			if (*m == index)
370 				panic("%s: key %jx is already present",
371 				    __func__, (uintmax_t)index);
372 			clev = pctrie_keydiff(*m, index);
373 			tmp = pctrie_node_get(ptree, allocfn,
374 			    pctrie_trimkey(index, clev + 1), 2, clev);
375 			if (tmp == NULL)
376 				return (ENOMEM);
377 			/* These writes are not yet visible due to ordering. */
378 			pctrie_addval(tmp, index, clev, val,
379 			    PCTRIE_UNSERIALIZED);
380 			pctrie_addval(tmp, *m, clev, m, PCTRIE_UNSERIALIZED);
381 			/* Synchronize to make leaf visible. */
382 			pctrie_node_store(parentp, tmp, PCTRIE_LOCKED);
383 			return (0);
384 		} else if (pctrie_keybarr(node, index))
385 			break;
386 		slot = pctrie_slot(index, node->pn_clev);
387 		parentp = &node->pn_child[slot];
388 		tmp = pctrie_node_load(parentp, NULL, PCTRIE_LOCKED);
389 		if (tmp == NULL) {
390 			node->pn_count++;
391 			pctrie_addval(node, index, node->pn_clev, val,
392 			    PCTRIE_LOCKED);
393 			return (0);
394 		}
395 		node = tmp;
396 	}
397 
398 	/*
399 	 * A new node is needed because the right insertion level is reached.
400 	 * Setup the new intermediate node and add the 2 children: the
401 	 * new object and the older edge.
402 	 */
403 	newind = node->pn_owner;
404 	clev = pctrie_keydiff(newind, index);
405 	tmp = pctrie_node_get(ptree, allocfn,
406 	    pctrie_trimkey(index, clev + 1), 2, clev);
407 	if (tmp == NULL)
408 		return (ENOMEM);
409 	slot = pctrie_slot(newind, clev);
410 	/* These writes are not yet visible due to ordering. */
411 	pctrie_addval(tmp, index, clev, val, PCTRIE_UNSERIALIZED);
412 	pctrie_node_store(&tmp->pn_child[slot], node, PCTRIE_UNSERIALIZED);
413 	/* Synchronize to make the above visible. */
414 	pctrie_node_store(parentp, tmp, PCTRIE_LOCKED);
415 
416 	return (0);
417 }
418 
419 /*
420  * Returns the value stored at the index.  If the index is not present,
421  * NULL is returned.
422  */
423 static __always_inline uint64_t *
_pctrie_lookup(struct pctrie * ptree,uint64_t index,smr_t smr,enum pctrie_access access)424 _pctrie_lookup(struct pctrie *ptree, uint64_t index, smr_t smr,
425     enum pctrie_access access)
426 {
427 	struct pctrie_node *node;
428 	uint64_t *m;
429 	int slot;
430 
431 	node = pctrie_root_load(ptree, smr, access);
432 	while (node != NULL) {
433 		if (pctrie_isleaf(node)) {
434 			m = pctrie_toval(node);
435 			if (*m == index)
436 				return (m);
437 			break;
438 		}
439 		if (pctrie_keybarr(node, index))
440 			break;
441 		slot = pctrie_slot(index, node->pn_clev);
442 		node = pctrie_node_load(&node->pn_child[slot], smr, access);
443 	}
444 	return (NULL);
445 }
446 
447 /*
448  * Returns the value stored at the index, assuming access is externally
449  * synchronized by a lock.
450  *
451  * If the index is not present, NULL is returned.
452  */
453 uint64_t *
pctrie_lookup(struct pctrie * ptree,uint64_t index)454 pctrie_lookup(struct pctrie *ptree, uint64_t index)
455 {
456 	return (_pctrie_lookup(ptree, index, NULL, PCTRIE_LOCKED));
457 }
458 
459 /*
460  * Returns the value stored at the index without requiring an external lock.
461  *
462  * If the index is not present, NULL is returned.
463  */
464 uint64_t *
pctrie_lookup_unlocked(struct pctrie * ptree,uint64_t index,smr_t smr)465 pctrie_lookup_unlocked(struct pctrie *ptree, uint64_t index, smr_t smr)
466 {
467 	uint64_t *res;
468 
469 	smr_enter(smr);
470 	res = _pctrie_lookup(ptree, index, smr, PCTRIE_SMR);
471 	smr_exit(smr);
472 	return (res);
473 }
474 
475 /*
476  * Look up the nearest entry at a position bigger than or equal to index,
477  * assuming access is externally synchronized by a lock.
478  */
479 uint64_t *
pctrie_lookup_ge(struct pctrie * ptree,uint64_t index)480 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
481 {
482 	struct pctrie_node *stack[PCTRIE_LIMIT];
483 	uint64_t inc;
484 	uint64_t *m;
485 	struct pctrie_node *child, *node;
486 #ifdef INVARIANTS
487 	int loops = 0;
488 #endif
489 	unsigned tos;
490 	int slot;
491 
492 	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
493 	if (node == NULL)
494 		return (NULL);
495 	else if (pctrie_isleaf(node)) {
496 		m = pctrie_toval(node);
497 		if (*m >= index)
498 			return (m);
499 		else
500 			return (NULL);
501 	}
502 	tos = 0;
503 	for (;;) {
504 		/*
505 		 * If the keys differ before the current bisection node,
506 		 * then the search key might rollback to the earliest
507 		 * available bisection node or to the smallest key
508 		 * in the current node (if the owner is greater than the
509 		 * search key).
510 		 */
511 		if (pctrie_keybarr(node, index)) {
512 			if (index > node->pn_owner) {
513 ascend:
514 				KASSERT(++loops < 1000,
515 				    ("pctrie_lookup_ge: too many loops"));
516 
517 				/*
518 				 * Pop nodes from the stack until either the
519 				 * stack is empty or a node that could have a
520 				 * matching descendant is found.
521 				 */
522 				do {
523 					if (tos == 0)
524 						return (NULL);
525 					node = stack[--tos];
526 				} while (pctrie_slot(index,
527 				    node->pn_clev) == (PCTRIE_COUNT - 1));
528 
529 				/*
530 				 * The following computation cannot overflow
531 				 * because index's slot at the current level
532 				 * is less than PCTRIE_COUNT - 1.
533 				 */
534 				index = pctrie_trimkey(index,
535 				    node->pn_clev);
536 				index += PCTRIE_UNITLEVEL(node->pn_clev);
537 			} else
538 				index = node->pn_owner;
539 			KASSERT(!pctrie_keybarr(node, index),
540 			    ("pctrie_lookup_ge: keybarr failed"));
541 		}
542 		slot = pctrie_slot(index, node->pn_clev);
543 		child = pctrie_node_load(&node->pn_child[slot], NULL,
544 		    PCTRIE_LOCKED);
545 		if (pctrie_isleaf(child)) {
546 			m = pctrie_toval(child);
547 			if (*m >= index)
548 				return (m);
549 		} else if (child != NULL)
550 			goto descend;
551 
552 		/*
553 		 * Look for an available edge or val within the current
554 		 * bisection node.
555 		 */
556                 if (slot < (PCTRIE_COUNT - 1)) {
557 			inc = PCTRIE_UNITLEVEL(node->pn_clev);
558 			index = pctrie_trimkey(index, node->pn_clev);
559 			do {
560 				index += inc;
561 				slot++;
562 				child = pctrie_node_load(&node->pn_child[slot],
563 				    NULL, PCTRIE_LOCKED);
564 				if (pctrie_isleaf(child)) {
565 					m = pctrie_toval(child);
566 					if (*m >= index)
567 						return (m);
568 				} else if (child != NULL)
569 					goto descend;
570 			} while (slot < (PCTRIE_COUNT - 1));
571 		}
572 		KASSERT(child == NULL || pctrie_isleaf(child),
573 		    ("pctrie_lookup_ge: child is radix node"));
574 
575 		/*
576 		 * If a value or edge greater than the search slot is not found
577 		 * in the current node, ascend to the next higher-level node.
578 		 */
579 		goto ascend;
580 descend:
581 		KASSERT(node->pn_clev > 0,
582 		    ("pctrie_lookup_ge: pushing leaf's parent"));
583 		KASSERT(tos < PCTRIE_LIMIT,
584 		    ("pctrie_lookup_ge: stack overflow"));
585 		stack[tos++] = node;
586 		node = child;
587 	}
588 }
589 
590 /*
591  * Look up the nearest entry at a position less than or equal to index,
592  * assuming access is externally synchronized by a lock.
593  */
594 uint64_t *
pctrie_lookup_le(struct pctrie * ptree,uint64_t index)595 pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
596 {
597 	struct pctrie_node *stack[PCTRIE_LIMIT];
598 	uint64_t inc;
599 	uint64_t *m;
600 	struct pctrie_node *child, *node;
601 #ifdef INVARIANTS
602 	int loops = 0;
603 #endif
604 	unsigned tos;
605 	int slot;
606 
607 	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
608 	if (node == NULL)
609 		return (NULL);
610 	else if (pctrie_isleaf(node)) {
611 		m = pctrie_toval(node);
612 		if (*m <= index)
613 			return (m);
614 		else
615 			return (NULL);
616 	}
617 	tos = 0;
618 	for (;;) {
619 		/*
620 		 * If the keys differ before the current bisection node,
621 		 * then the search key might rollback to the earliest
622 		 * available bisection node or to the largest key
623 		 * in the current node (if the owner is smaller than the
624 		 * search key).
625 		 */
626 		if (pctrie_keybarr(node, index)) {
627 			if (index > node->pn_owner) {
628 				index = node->pn_owner + PCTRIE_COUNT *
629 				    PCTRIE_UNITLEVEL(node->pn_clev);
630 			} else {
631 ascend:
632 				KASSERT(++loops < 1000,
633 				    ("pctrie_lookup_le: too many loops"));
634 
635 				/*
636 				 * Pop nodes from the stack until either the
637 				 * stack is empty or a node that could have a
638 				 * matching descendant is found.
639 				 */
640 				do {
641 					if (tos == 0)
642 						return (NULL);
643 					node = stack[--tos];
644 				} while (pctrie_slot(index,
645 				    node->pn_clev) == 0);
646 
647 				/*
648 				 * The following computation cannot overflow
649 				 * because index's slot at the current level
650 				 * is greater than 0.
651 				 */
652 				index = pctrie_trimkey(index,
653 				    node->pn_clev);
654 			}
655 			index--;
656 			KASSERT(!pctrie_keybarr(node, index),
657 			    ("pctrie_lookup_le: keybarr failed"));
658 		}
659 		slot = pctrie_slot(index, node->pn_clev);
660 		child = pctrie_node_load(&node->pn_child[slot], NULL,
661 		    PCTRIE_LOCKED);
662 		if (pctrie_isleaf(child)) {
663 			m = pctrie_toval(child);
664 			if (*m <= index)
665 				return (m);
666 		} else if (child != NULL)
667 			goto descend;
668 
669 		/*
670 		 * Look for an available edge or value within the current
671 		 * bisection node.
672 		 */
673 		if (slot > 0) {
674 			inc = PCTRIE_UNITLEVEL(node->pn_clev);
675 			index |= inc - 1;
676 			do {
677 				index -= inc;
678 				slot--;
679 				child = pctrie_node_load(&node->pn_child[slot],
680 				    NULL, PCTRIE_LOCKED);
681 				if (pctrie_isleaf(child)) {
682 					m = pctrie_toval(child);
683 					if (*m <= index)
684 						return (m);
685 				} else if (child != NULL)
686 					goto descend;
687 			} while (slot > 0);
688 		}
689 		KASSERT(child == NULL || pctrie_isleaf(child),
690 		    ("pctrie_lookup_le: child is radix node"));
691 
692 		/*
693 		 * If a value or edge smaller than the search slot is not found
694 		 * in the current node, ascend to the next higher-level node.
695 		 */
696 		goto ascend;
697 descend:
698 		KASSERT(node->pn_clev > 0,
699 		    ("pctrie_lookup_le: pushing leaf's parent"));
700 		KASSERT(tos < PCTRIE_LIMIT,
701 		    ("pctrie_lookup_le: stack overflow"));
702 		stack[tos++] = node;
703 		node = child;
704 	}
705 }
706 
707 /*
708  * Remove the specified index from the tree.
709  * Panics if the key is not present.
710  */
711 void
pctrie_remove(struct pctrie * ptree,uint64_t index,pctrie_free_t freefn)712 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
713 {
714 	struct pctrie_node *node, *parent, *tmp;
715 	uint64_t *m;
716 	int i, slot;
717 
718 	node = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
719 	if (pctrie_isleaf(node)) {
720 		m = pctrie_toval(node);
721 		if (*m != index)
722 			panic("%s: invalid key found", __func__);
723 		pctrie_root_store(ptree, NULL, PCTRIE_LOCKED);
724 		return;
725 	}
726 	parent = NULL;
727 	for (;;) {
728 		if (node == NULL)
729 			panic("pctrie_remove: impossible to locate the key");
730 		slot = pctrie_slot(index, node->pn_clev);
731 		tmp = pctrie_node_load(&node->pn_child[slot], NULL,
732 		    PCTRIE_LOCKED);
733 		if (pctrie_isleaf(tmp)) {
734 			m = pctrie_toval(tmp);
735 			if (*m != index)
736 				panic("%s: invalid key found", __func__);
737 			pctrie_node_store(&node->pn_child[slot], NULL,
738 			    PCTRIE_LOCKED);
739 			node->pn_count--;
740 			if (node->pn_count > 1)
741 				break;
742 			for (i = 0; i < PCTRIE_COUNT; i++) {
743 				tmp = pctrie_node_load(&node->pn_child[i],
744 				    NULL, PCTRIE_LOCKED);
745 				if (tmp != NULL)
746 					break;
747 			}
748 			KASSERT(i != PCTRIE_COUNT,
749 			    ("%s: invalid node configuration", __func__));
750 			if (parent == NULL)
751 				pctrie_root_store(ptree, tmp, PCTRIE_LOCKED);
752 			else {
753 				slot = pctrie_slot(index, parent->pn_clev);
754 				KASSERT(pctrie_node_load(
755 					&parent->pn_child[slot], NULL,
756 					PCTRIE_LOCKED) == node,
757 				    ("%s: invalid child value", __func__));
758 				pctrie_node_store(&parent->pn_child[slot], tmp,
759 				    PCTRIE_LOCKED);
760 			}
761 			/*
762 			 * The child is still valid and we can not zero the
763 			 * pointer until all SMR references are gone.
764 			 */
765 			node->pn_count--;
766 			pctrie_node_put(ptree, node, freefn, i);
767 			break;
768 		}
769 		parent = node;
770 		node = tmp;
771 	}
772 }
773 
774 /*
775  * Remove and free all the nodes from the tree.
776  * This function is recursive but there is a tight control on it as the
777  * maximum depth of the tree is fixed.
778  */
779 void
pctrie_reclaim_allnodes(struct pctrie * ptree,pctrie_free_t freefn)780 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
781 {
782 	struct pctrie_node *root;
783 
784 	root = pctrie_root_load(ptree, NULL, PCTRIE_LOCKED);
785 	if (root == NULL)
786 		return;
787 	pctrie_root_store(ptree, NULL, PCTRIE_UNSERIALIZED);
788 	if (!pctrie_isleaf(root))
789 		pctrie_reclaim_allnodes_int(ptree, root, freefn);
790 }
791 
792 #ifdef DDB
793 /*
794  * Show details about the given node.
795  */
DB_SHOW_COMMAND(pctrienode,db_show_pctrienode)796 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
797 {
798 	struct pctrie_node *node, *tmp;
799 	int i;
800 
801         if (!have_addr)
802                 return;
803 	node = (struct pctrie_node *)addr;
804 	db_printf("node %p, owner %jx, children count %u, level %u:\n",
805 	    (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
806 	    node->pn_clev);
807 	for (i = 0; i < PCTRIE_COUNT; i++) {
808 		tmp = pctrie_node_load(&node->pn_child[i], NULL,
809 		    PCTRIE_UNSERIALIZED);
810 		if (tmp != NULL)
811 			db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
812 			    i, (void *)tmp,
813 			    pctrie_isleaf(tmp) ? pctrie_toval(tmp) : NULL,
814 			    node->pn_clev);
815 	}
816 }
817 #endif /* DDB */
818