xref: /dragonfly/sys/kern/kern_kmalloc.c (revision 11050bbcfb34283a9d27bc1ded58120cc5862897)
1 /*
2  * KERN_KMALLOC.C   - Kernel memory allocator
3  *
4  * Copyright (c) 2021 The DragonFly Project, All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 
37 /*
38  * This module implements the kmalloc_obj allocator.  This is a type-stable
39  * allocator that uses the same base structures (e.g. malloc_type) plus
40  * some extensions to efficiently implement single-type zones.
41  *
42  * All memory management is zone based.  When a zone is destroyed, all of
43  * its memory is returned to the system with no fragmentation.
44  *
45  * A mini-slab allocator hangs directly off the zone structure (malloc_type).
46  * Since the object zones are single-size-only, the slab allocator is very
47  * simple and currently utilizes just two per-zone/per-cpu slabs (active and
48  * alternate) before kicking up to the per-zone cache.  Beyond that we just
49  * have the per-cpu globaldata-based 'free slab' cache to avoid unnecessary
50  * kernel_map mappings and unmappings.
51  *
52  * The advantage of this that zones don't stomp over each other and cause
53  * excessive fragmentation in the slabs.  For example, when you umount a
54  * large tmpfs filesystem, most of its memory (all of its kmalloc_obj memory)
55  * is returned to the system.
56  */
57 
58 #include <sys/param.h>
59 #include <sys/systm.h>
60 #include <sys/kernel.h>
61 #include <sys/slaballoc.h>
62 #include <sys/mbuf.h>
63 #include <sys/vmmeter.h>
64 #include <sys/spinlock.h>
65 #include <sys/lock.h>
66 #include <sys/thread.h>
67 #include <sys/globaldata.h>
68 #include <sys/sysctl.h>
69 #include <sys/ktr.h>
70 #include <sys/malloc.h>
71 
72 #include <vm/vm.h>
73 #include <vm/vm_param.h>
74 #include <vm/vm_kern.h>
75 #include <vm/vm_extern.h>
76 #include <vm/vm_object.h>
77 #include <vm/pmap.h>
78 #include <vm/vm_map.h>
79 #include <vm/vm_page.h>
80 #include <vm/vm_pageout.h>
81 
82 #include <machine/cpu.h>
83 
84 #include <sys/spinlock2.h>
85 #include <sys/thread2.h>
86 #include <sys/exislock2.h>
87 #include <vm/vm_page2.h>
88 
89 #define MEMORY_STRING         "ptr=%p type=%p size=%lu flags=%04x"
90 #define MEMORY_ARGS void *ptr, void *type, unsigned long size, int flags
91 
92 #if !defined(KTR_MEMORY)
93 #define KTR_MEMORY  KTR_ALL
94 #endif
95 KTR_INFO_MASTER(mem_obj);
96 KTR_INFO(KTR_MEMORY, mem_obj, malloc_beg, 0, "kmalloc_obj begin");
97 KTR_INFO(KTR_MEMORY, mem_obj, malloc_end, 1, MEMORY_STRING, MEMORY_ARGS);
98 #if 0
99 KTR_INFO(KTR_MEMORY, mem_obj, free_zero, 2, MEMORY_STRING, MEMORY_ARGS);
100 KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz, 3, MEMORY_STRING, MEMORY_ARGS);
101 KTR_INFO(KTR_MEMORY, mem_obj, free_ovsz_delayed, 4, MEMORY_STRING, MEMORY_ARGS);
102 KTR_INFO(KTR_MEMORY, mem_obj, free_chunk, 5, MEMORY_STRING, MEMORY_ARGS);
103 KTR_INFO(KTR_MEMORY, mem_obj, free_request, 6, MEMORY_STRING, MEMORY_ARGS);
104 KTR_INFO(KTR_MEMORY, mem_obj, free_rem_beg, 7, MEMORY_STRING, MEMORY_ARGS);
105 KTR_INFO(KTR_MEMORY, mem_obj, free_rem_end, 8, MEMORY_STRING, MEMORY_ARGS);
106 #endif
107 KTR_INFO(KTR_MEMORY, mem_obj, free_beg, 9, "kfree_obj begin");
108 KTR_INFO(KTR_MEMORY, mem_obj, free_end, 10, "kfree_obj end");
109 
110 #define logmemory(name, ptr, type, size, flags)                                 \
111           KTR_LOG(mem_obj_ ## name, ptr, type, size, flags)
112 #define logmemory_quick(name)                                                   \
113           KTR_LOG(mem_obj_ ## name)
114 
115 __read_frequently static int KMGDMaxFreeSlabs = KMGD_MAXFREESLABS;
116 SYSCTL_INT(_kern, OID_AUTO, kzone_cache, CTLFLAG_RW, &KMGDMaxFreeSlabs, 0, "");
117 __read_frequently static int kzone_bretire = 4;
118 SYSCTL_INT(_kern, OID_AUTO, kzone_bretire, CTLFLAG_RW, &kzone_bretire, 0, "");
119 __read_frequently static int kzone_debug;
120 SYSCTL_INT(_kern, OID_AUTO, kzone_debug, CTLFLAG_RW, &kzone_debug, 0, "");
121 
122 __read_frequently struct kmalloc_slab kslab_dummy;
123 
124 static void malloc_slab_destroy(struct malloc_type *type,
125                               struct kmalloc_slab **slabp);
126 
127 /*
128  * Cache a chain of slabs onto their respective cpu slab caches.  Any slabs
129  * which we cannot cache will be returned.
130  *
131  * free_slabs            - Current structure may only be accessed by current cpu
132  * remote_free_slabs - Only atomic swap operations are allowed.
133  * free_count            - Only atomic operations are allowed.
134  *
135  * If the count is sufficient to cache the entire list, NULL is returned.
136  * Otherwise the portion that was not cached is returned.
137  */
138 static __noinline
139 struct kmalloc_slab *
gslab_cache(struct kmalloc_slab * slab)140 gslab_cache(struct kmalloc_slab *slab)
141 {
142           struct kmalloc_slab *save;
143           struct kmalloc_slab *next;
144           struct kmalloc_slab *res;
145           struct kmalloc_slab **resp;
146           struct kmalloc_slab **slabp;
147           globaldata_t rgd;
148           size_t count;
149           int cpuid;
150 
151           res = NULL;
152           resp = &res;
153           KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
154 
155           /*
156            * Given the slab list, get the cpuid and clip off as many matching
157            * elements as fits in the cache.
158            */
159           while (slab) {
160                     cpuid = slab->orig_cpuid;
161                     rgd = globaldata_find(cpuid);
162 
163                     KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
164                     /*
165                      * Doesn't fit in cache, put on return list.
166                      */
167                     if (rgd->gd_kmslab.free_count >= KMGDMaxFreeSlabs) {
168                               *resp = slab;
169                               resp = &slab->next;
170                               slab = slab->next;
171                               continue;
172                     }
173 
174                     /*
175                      * Collect.  We aren't required to match-up the original cpu
176                      * with the disposal cpu, but its a good idea to retain
177                      * memory locality.
178                      *
179                      * The slabs we collect are going into the global cache,
180                      * remove the type association.
181                      */
182                     KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
183                     slabp = &slab->next;
184                     count = 1;
185                     slab->type = NULL;
186 
187                     while ((next = *slabp) != NULL &&
188                            next->orig_cpuid == cpuid &&
189                            rgd->gd_kmslab.free_count + count < KMGDMaxFreeSlabs)
190                   {
191                               KKASSERT(((uintptr_t)next & KMALLOC_SLAB_MASK) == 0);
192                               next->type = NULL;
193                               ++count;
194                               slabp = &next->next;
195                     }
196 
197                     /*
198                      * Safety, unhook before next, next is not included in the
199                      * list starting with slab that is being pre-pended
200                      * to remote_free_slabs.
201                      */
202                     *slabp = NULL;
203 
204                     /*
205                      * Now atomically pre-pend slab...*slabp to remote_free_slabs.
206                      * Pump the count first (its ok if the actual chain length
207                      * races the count update).
208                      *
209                      * NOTE: In the loop, (save) is updated by fcmpset.
210                      */
211                     atomic_add_long(&rgd->gd_kmslab.free_count, count);
212                     save = rgd->gd_kmslab.remote_free_slabs;
213                     for (;;) {
214                               KKASSERT(((uintptr_t)save & KMALLOC_SLAB_MASK) == 0);
215                               *slabp = save;      /* end of slab list chain to... */
216                               cpu_ccfence();
217                               if (atomic_fcmpset_ptr(
218                                         &rgd->gd_kmslab.remote_free_slabs,
219                                         &save, slab))
220                               {
221                                         break;
222                               }
223                     }
224 
225                     /*
226                      * Setup for next loop
227                      */
228                     slab = next;
229           }
230 
231           /*
232            * Terminate the result list and return it
233            */
234           *resp = NULL;
235 
236           return res;
237 }
238 
239 /*
240  * May only be called on current cpu.  Pull a free slab from the
241  * pcpu cache.  If we run out, move any slabs that have built-up
242  * from remote cpus.
243  *
244  * We are only allowed to swap the remote_free_slabs head, we cannot
245  * manipulate any next pointers while structures are sitting on that list.
246  */
247 static __inline
248 struct kmalloc_slab *
gslab_alloc(globaldata_t gd)249 gslab_alloc(globaldata_t gd)
250 {
251           struct kmalloc_slab *slab;
252 
253           slab = gd->gd_kmslab.free_slabs;
254           if (slab == NULL) {
255                     slab = atomic_swap_ptr(
256                               (volatile void **)&gd->gd_kmslab.remote_free_slabs,
257                               NULL);
258                     KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
259           }
260           if (slab) {
261                     gd->gd_kmslab.free_slabs = slab->next;
262                     slab->next = NULL;
263                     atomic_add_long(&gd->gd_kmslab.free_count, -1);
264                     KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
265           }
266           return slab;
267 }
268 
269 void
malloc_mgt_init(struct malloc_type * type __unused,struct kmalloc_mgt * mgt,size_t size)270 malloc_mgt_init(struct malloc_type *type __unused,
271                     struct kmalloc_mgt *mgt, size_t size)
272 {
273           size_t offset;
274           size_t count;
275 
276           bzero(mgt, sizeof(*mgt));
277           spin_init(&mgt->spin, "kmmgt");
278 
279           /*
280            * Allows us to avoid a conditional.  The dummy slabs are empty
281            * and have no objects.
282            */
283           mgt->active = &kslab_dummy;
284           mgt->alternate = &kslab_dummy;
285           mgt->empty_tailp = &mgt->empty;
286 
287           /*
288            * Figure out the count by taking into account the size of the fobjs[]
289            * array by adding it to the object size.  This initial calculation
290            * ignores alignment edge-cases that might require the count to be
291            * reduced.
292            */
293           offset = offsetof(struct kmalloc_slab, fobjs[0]);
294           count = (KMALLOC_SLAB_SIZE - offset) / (size + sizeof(void *));
295 
296           /*
297            * Recalculate the offset of the first object, this time including
298            * the required alignment.  (size) should already be aligned.  This
299            * may push the last object beyond the slab so check and loop with
300            * a reduced count as necessary.
301            *
302            * Ok, theoretically the count should not actually change since the
303            * division above rounds-down (that is, any mis-alignment is already
304            * not included in the count calculation).  But I'm not going to take
305            * any chances and check anyway as a safety in case some programmer
306            * changes the code above later.  This is not a time-critical code
307            * path.
308            */
309           offset = offsetof(struct kmalloc_slab, fobjs[count]);
310           offset = __VM_CACHELINE_ALIGN(offset);
311 
312           while (offset + size * count > KMALLOC_SLAB_SIZE) {
313                     --count;
314                     offset = offsetof(struct kmalloc_slab, fobjs[count]);
315                     offset = __VM_CACHELINE_ALIGN(offset);
316                     KKASSERT (offset + size * count <= KMALLOC_SLAB_SIZE);
317           }
318 
319           mgt->slab_offset = offset;
320           mgt->slab_count      = count;
321 }
322 
323 void
malloc_mgt_relocate(struct kmalloc_mgt * src,struct kmalloc_mgt * dst)324 malloc_mgt_relocate(struct kmalloc_mgt *src, struct kmalloc_mgt *dst)
325 {
326           struct kmalloc_slab **slabp;
327 
328           spin_init(&dst->spin, "kmmgt");
329           slabp = &dst->empty;
330 
331           while (*slabp) {
332                     slabp = &(*slabp)->next;
333           }
334           dst->empty_tailp = slabp;
335 }
336 
337 void
malloc_mgt_uninit(struct malloc_type * type,struct kmalloc_mgt * mgt)338 malloc_mgt_uninit(struct malloc_type *type, struct kmalloc_mgt *mgt)
339 {
340           if (mgt->active != &kslab_dummy)
341                     malloc_slab_destroy(type, &mgt->active);
342           mgt->active = NULL;
343 
344           if (mgt->alternate != &kslab_dummy)
345                     malloc_slab_destroy(type, &mgt->alternate);
346           mgt->alternate = NULL;
347 
348           malloc_slab_destroy(type, &mgt->partial);
349           malloc_slab_destroy(type, &mgt->full);
350           malloc_slab_destroy(type, &mgt->empty);
351           mgt->npartial = 0;
352           mgt->nfull = 0;
353           mgt->nempty = 0;
354           mgt->empty_tailp = &mgt->empty;
355 
356           spin_uninit(&mgt->spin);
357 }
358 
359 /*
360  * Destroy a list of slabs.  Attempt to cache the slabs on the specified
361  * (possibly remote) cpu.  This allows slabs that were operating on a
362  * particular cpu to be disposed of back to that same cpu.
363  */
364 static void
malloc_slab_destroy(struct malloc_type * type,struct kmalloc_slab ** slabp)365 malloc_slab_destroy(struct malloc_type *type, struct kmalloc_slab **slabp)
366 {
367           struct kmalloc_slab *slab;
368           struct kmalloc_slab *base;
369           struct kmalloc_slab **basep;
370           size_t delta;
371 
372           if (*slabp == NULL)
373                     return;
374 
375           /*
376            * Collect all slabs that can actually be destroyed, complain
377            * about the rest.
378            */
379           base = NULL;
380           basep = &base;
381           while ((slab = *slabp) != NULL) {
382                     KKASSERT(((uintptr_t)slab & KMALLOC_SLAB_MASK) == 0);
383 
384                     delta = slab->findex - slab->aindex;
385                     if (delta == slab->ncount) {
386                               *slabp = slab->next;          /* unlink */
387                               *basep = slab;                /* link into base list */
388                               basep = &slab->next;
389                     } else {
390                               kprintf("%s: slab %p %zd objects "
391                                         "were still allocated\n",
392                                         type->ks_shortdesc, slab,
393                                         slab->ncount - delta);
394                               /* leave link intact and iterate */
395                               slabp = &slab->next;
396                     }
397           }
398 
399           /*
400            * Terminate the base list of slabs that can be destroyed,
401            * then cache as many of them as possible.
402            */
403           *basep = NULL;
404           if (base == NULL)
405                     return;
406           base = gslab_cache(base);
407 
408           /*
409            * Destroy the remainder
410            */
411           while ((slab = base) != NULL) {
412                     base = slab->next;
413                     slab->next = (void *)(uintptr_t)-1;
414                     kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
415           }
416 }
417 
418 /*
419  * Objects can be freed to an empty slab at any time, causing it to no
420  * longer be empty.  To improve performance, we do not try to pro-actively
421  * move such slabs to the appropriate partial or full list upon kfree_obj().
422  * Instead, a poller comes along and tests the slabs on the empty list
423  * periodically, and moves slabs that are no longer empty to the appropriate
424  * list.
425  *
426  * --
427  *
428  * Poll a limited number of slabs on the empty list and move them
429  * to the appropriate full or partial list.  Slabs left on the empty
430  * list are rotated to the tail.
431  *
432  * If gcache is non-zero this function will try to place full slabs into
433  * the globaldata cache, if it isn't already too full.
434  *
435  * The mgt is spin-locked
436  *
437  * Returns non-zero if the ggm updates possibly made slabs available for
438  * allocation.
439  */
440 static int
malloc_mgt_poll_empty_locked(struct kmalloc_mgt * ggm,int count)441 malloc_mgt_poll_empty_locked(struct kmalloc_mgt *ggm, int count)
442 {
443           struct kmalloc_slab *marker;
444           struct kmalloc_slab *slab;
445           size_t delta;
446           int got_something;
447 
448           if (ggm->empty == NULL)
449                     return 0;
450 
451           got_something = 0;
452           marker = ggm->empty;
453 
454           while (count-- && (slab = ggm->empty) != NULL) {
455                     /*
456                      * Unlink from empty
457                      */
458                     ggm->empty = slab->next;
459                     slab->next = NULL;
460                     --ggm->nempty;
461                     if (ggm->empty_tailp == &slab->next)
462                               ggm->empty_tailp = &ggm->empty;
463 
464                     /*
465                      * Check partial, full, and empty.  We rotate
466                      * empty entries to the end of the empty list.
467                      *
468                      * NOTE: For a fully-freeable slab we also have
469                      *         to check xindex.
470                      */
471                     delta = slab->findex - slab->aindex;
472                     if (delta == slab->ncount) {
473                               /*
474                                * Stuff into the full list.  This requires setting
475                                * the exis sequence number via exis_terminate().
476                                */
477                               KKASSERT(slab->next == NULL);
478                               exis_terminate(&slab->exis);
479                               slab->next = ggm->full;
480                               ggm->full = slab;
481                               got_something = 1;
482                               ++ggm->nfull;
483                     } else if (delta) {
484                               /*
485                                * Partially full
486                                */
487                               KKASSERT(slab->next == NULL);
488                               slab->next = ggm->partial;
489                               ggm->partial = slab;
490                               got_something = 1;
491                               ++ggm->npartial;
492                     } else {
493                               /*
494                                * Empty
495                                */
496                               KKASSERT(slab->next == NULL);
497                               *ggm->empty_tailp = slab;
498                               ggm->empty_tailp = &slab->next;
499                               ++ggm->nempty;
500                               if (ggm->empty == marker)
501                                         break;
502                     }
503           }
504           return got_something;
505 }
506 
507 /*
508  * Called once a second with the zone interlocked against destruction.
509  *
510  * Returns non-zero to tell the caller to iterate to the next type,
511  * else the caller should stay on the current type.
512  */
513 int
malloc_mgt_poll(struct malloc_type * type)514 malloc_mgt_poll(struct malloc_type *type)
515 {
516           struct kmalloc_mgt *ggm;
517           struct kmalloc_slab *slab;
518           struct kmalloc_slab **slabp;
519           struct kmalloc_slab *base;
520           struct kmalloc_slab **basep;
521           size_t delta;
522           int donext;
523           int count;
524           int retired;
525 
526           if ((type->ks_flags & KSF_OBJSIZE) == 0)
527                     return 1;
528 
529           /*
530            * Check the partial, full, and empty lists for full freeable slabs
531            * in excess of desired caching count.
532            */
533           ggm = &type->ks_mgt;
534           spin_lock(&ggm->spin);
535 
536           /*
537            * Move empty slabs to partial or full as appropriate.  We
538            * don't bother checking partial slabs to see if they are full
539            * for now.
540            */
541           malloc_mgt_poll_empty_locked(ggm, 16);
542 
543           /*
544            * Ok, cleanout some of the full mags from the full list
545            */
546           base = NULL;
547           basep = &base;
548           count = ggm->nfull;
549           retired = 0;
550           cpu_ccfence();
551 
552           if (count > KMALLOC_MAXFREEMAGS) {
553                     slabp = &ggm->full;
554                     count -= KMALLOC_MAXFREEMAGS;
555                     if (count > 16)
556                               count = 16;
557 
558                     while (count && (slab = *slabp) != NULL) {
559                               delta = slab->findex - slab->aindex;
560                               if (delta == slab->ncount &&
561                                   slab->xindex == slab->findex &&
562                                   exis_freeable(&slab->exis))
563                               {
564                                         /*
565                                          * (1) No allocated entries in the structure,
566                                          *     this should always be the case from the
567                                          *     full list.
568                                          *
569                                          * (2) kfree_obj() has fully completed.  Just
570                                          *     checking findex is not sufficient since
571                                          *     it is incremented to reserve the slot
572                                          *     before the element is loaded into it.
573                                          *
574                                          * (3) The slab has been on the full list for
575                                          *     a sufficient number of EXIS
576                                          *     pseudo_ticks, for type-safety.
577                                          */
578                                         *slabp = slab->next;
579                                         *basep = slab;
580                                         basep = &slab->next;
581                                         --ggm->nfull;
582                                         ++ggm->gcache_count;
583                                         if (++retired == kzone_bretire)
584                                                   break;
585                               } else {
586                                         slabp = &slab->next;
587                               }
588                               --count;
589                     }
590                     *basep = NULL;      /* terminate the retirement list */
591                     donext = (*slabp == NULL);
592           } else {
593                     donext = 1;
594           }
595           spin_unlock(&ggm->spin);
596 
597           /*
598            * Clean out any slabs that we couldn't stow in the globaldata cache.
599            */
600           if (retired) {
601                     if (kzone_debug) {
602                               kprintf("kmalloc_poll: %s retire %d\n",
603                                         type->ks_shortdesc, retired);
604                     }
605                     base = gslab_cache(base);
606                     while ((slab = base) != NULL) {
607                               base = base->next;
608                               slab->next = NULL;
609                               kmem_slab_free(slab, KMALLOC_SLAB_SIZE);
610                     }
611           }
612 
613           return donext;
614 }
615 
616 /*
617  * Optional bitmap double-free check.  This is typically turned on by
618  * default for safety (sys/_malloc.h)
619  */
620 #ifdef KMALLOC_CHECK_DOUBLE_FREE
621 
622 static __inline void
bmap_set(struct kmalloc_slab * slab,void * obj)623 bmap_set(struct kmalloc_slab *slab, void *obj)
624 {
625           uint64_t *ptr;
626           uint64_t mask;
627           size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
628                        slab->objsize;
629 
630           ptr = &slab->bmap[i >> 6];
631           mask = (uint64_t)1U << (i & 63);
632           KKASSERT(i < slab->ncount && (*ptr & mask) == 0);
633           atomic_set_64(ptr, mask);
634 }
635 
636 static __inline void
bmap_clr(struct kmalloc_slab * slab,void * obj)637 bmap_clr(struct kmalloc_slab *slab, void *obj)
638 {
639           uint64_t *ptr;
640           uint64_t mask;
641           size_t i = (((uintptr_t)obj & KMALLOC_SLAB_MASK) - slab->offset) /
642                        slab->objsize;
643 
644           ptr = &slab->bmap[i >> 6];
645           mask = (uint64_t)1U << (i & 63);
646           KKASSERT(i < slab->ncount && (*ptr & mask) != 0);
647           atomic_clear_64(ptr, mask);
648 }
649 
650 #endif
651 
652 /*
653  * Cleanup a mgt structure.
654  *
655  * Always called from the current cpu, so we can manipulate the various
656  * lists freely.
657  *
658  * WARNING: findex can race, fobjs[n] is updated after findex is incremented,
659  *            and 'full'
660  */
661 #if 0
662 static void
663 mgt_cleanup(struct kmalloc_mgt *mgt)
664 {
665 #if 0
666           struct kmalloc_slab **slabp;
667           struct kmalloc_slab *slab;
668           size_t delta;
669           size_t total;
670 #endif
671 }
672 #endif
673 
674 #ifdef SLAB_DEBUG
675 void *
_kmalloc_obj_debug(unsigned long size,struct malloc_type * type,int flags,const char * file,int line)676 _kmalloc_obj_debug(unsigned long size, struct malloc_type *type, int flags,
677                 const char *file, int line)
678 #else
679 void *
680 _kmalloc_obj(unsigned long size, struct malloc_type *type, int flags)
681 #endif
682 {
683           struct kmalloc_slab *slab;
684           struct kmalloc_use *use;
685           struct kmalloc_mgt *mgt;
686           struct kmalloc_mgt *ggm;
687           globaldata_t gd;
688           void *obj;
689           size_t delta;
690 
691           /*
692            * Check limits
693            */
694           while (__predict_false(type->ks_loosememuse >= type->ks_limit)) {
695                     long ttl;
696                     int n;
697 
698                     for (n = ttl = 0; n < ncpus; ++n)
699                               ttl += type->ks_use[n].memuse;
700                     type->ks_loosememuse = ttl;   /* not MP synchronized */
701                     if ((ssize_t)ttl < 0)                   /* deal with occassional race */
702                               ttl = 0;
703                     if (ttl >= type->ks_limit) {
704                               if (flags & M_NULLOK)
705                                         return(NULL);
706                               panic("%s: malloc limit exceeded", type->ks_shortdesc);
707                     }
708           }
709 
710           /*
711            * Setup
712            */
713           crit_enter();
714           logmemory_quick(malloc_beg);
715           KKASSERT(size == type->ks_objsize);
716           gd = mycpu;
717           use = &type->ks_use[gd->gd_cpuid];
718 
719 retry:
720           /*
721            * Check active
722            *
723            * NOTE: obj can be NULL if racing a _kfree_obj().
724            */
725           mgt = &use->mgt;
726           slab = mgt->active;                     /* Might be dummy */
727           delta = slab->findex - slab->aindex;
728           if (__predict_true(delta != 0)) {       /* Cannot be dummy */
729                     size_t i;
730 
731                     i = slab->aindex % slab->ncount;
732                     obj = slab->fobjs[i];
733                     if (__predict_true(obj != NULL)) {
734                               slab->fobjs[i] = NULL;
735                               ++slab->aindex;
736 #ifdef KMALLOC_CHECK_DOUBLE_FREE
737                               bmap_set(slab, obj);
738 #endif
739                               goto found;
740                     }
741           }
742 
743           /*
744            * Check alternate.  If we find something, swap it with
745            * the active.
746            *
747            * NOTE: It is possible for exhausted slabs to recover entries
748            *         via _kfree_obj(), so we just keep swapping until both
749            *         are empty.
750            *
751            * NOTE: obj can be NULL if racing a _kfree_obj().
752            */
753           slab = mgt->alternate;                            /* Might be dummy */
754           delta = slab->findex - slab->aindex;
755           if (__predict_true(delta != 0)) {       /* Cannot be dummy */
756                     size_t i;
757 
758                     mgt->alternate = mgt->active;
759                     mgt->active = slab;
760                     i = slab->aindex % slab->ncount;
761                     obj = slab->fobjs[i];
762                     if (__predict_true(obj != NULL)) {
763                               slab->fobjs[i] = NULL;
764                               ++slab->aindex;
765 #ifdef KMALLOC_CHECK_DOUBLE_FREE
766                               bmap_set(slab, obj);
767 #endif
768                               goto found;
769                     }
770           }
771 
772           /*
773            * Rotate a slab from the global mgt into the pcpu mgt.
774            *
775            *        G(partial, full) -> active -> alternate -> G(empty)
776            *
777            * We try to exhaust partials first to reduce fragmentation, then
778            * dig into the fulls.
779            */
780           ggm = &type->ks_mgt;
781           spin_lock(&ggm->spin);
782 
783 rerotate:
784           if (ggm->partial) {
785                     slab = mgt->alternate;                  /* Might be dummy */
786                     mgt->alternate = mgt->active; /* Might be dummy */
787                     mgt->active = ggm->partial;
788                     ggm->partial = ggm->partial->next;
789                     mgt->active->next = NULL;
790                     --ggm->npartial;
791                     if (slab != &kslab_dummy) {
792                               KKASSERT(slab->next == NULL);
793                               *ggm->empty_tailp = slab;
794                               ggm->empty_tailp = &slab->next;
795                               ++ggm->nempty;
796                     }
797                     spin_unlock(&ggm->spin);
798                     goto retry;
799           }
800 
801           if (ggm->full) {
802                     slab = mgt->alternate;                  /* Might be dummy */
803                     mgt->alternate = mgt->active; /* Might be dummy */
804                     mgt->active = ggm->full;
805                     ggm->full = ggm->full->next;
806                     mgt->active->next = NULL;
807                     --ggm->nfull;
808                     exis_setlive(&mgt->active->exis);
809                     if (slab != &kslab_dummy) {
810                               KKASSERT(slab->next == NULL);
811                               *ggm->empty_tailp = slab;
812                               ggm->empty_tailp = &slab->next;
813                               ++ggm->nempty;
814                     }
815                     spin_unlock(&ggm->spin);
816                     goto retry;
817           }
818 
819           /*
820            * We couldn't find anything, scan a limited number of empty entries
821            * looking for something with objects.  This will also free excess
822            * full lists that meet requirements.
823            */
824           if (malloc_mgt_poll_empty_locked(ggm, 16))
825                     goto rerotate;
826 
827           /*
828            * Absolutely nothing is available, allocate a new slab and
829            * rotate it in.
830            *
831            * Try to get a slab from the global pcpu slab cache (very cheap).
832            * If that fails, allocate a new slab (very expensive).
833            */
834           spin_unlock(&ggm->spin);
835 
836           if (gd->gd_kmslab.free_count == 0 || (slab = gslab_alloc(gd)) == NULL) {
837                     slab = kmem_slab_alloc(KMALLOC_SLAB_SIZE, KMALLOC_SLAB_SIZE,
838                                                M_WAITOK);
839           }
840 
841           bzero(slab, sizeof(*slab));
842           KKASSERT(offsetof(struct kmalloc_slab, fobjs[use->mgt.slab_count]) <=
843                      use->mgt.slab_offset);
844 
845           obj = (char *)slab + use->mgt.slab_offset;
846           slab->type = type;
847           slab->orig_cpuid = gd->gd_cpuid;
848           slab->ncount = use->mgt.slab_count;
849           slab->offset = use->mgt.slab_offset;
850           slab->objsize = type->ks_objsize;
851           slab->aindex = 0;
852           slab->findex = slab->ncount;
853           slab->xindex = slab->ncount;
854           for (delta = 0; delta < slab->ncount; ++delta) {
855                     slab->fobjs[delta] = obj;
856                     obj = (char *)obj + type->ks_objsize;
857           }
858 
859           /*
860            * Sanity check, assert that the last byte of last object is still
861            * in the slab.
862            */
863 #if 0
864           KKASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
865                       ~KMALLOC_SLAB_MASK) == 0);
866 #endif
867           KASSERT(((((uintptr_t)obj - 1) ^ (uintptr_t)slab) &
868                       ~KMALLOC_SLAB_MASK) == 0, ("SLAB %p ncount %zd objsize %zd obj=%p\n", slab, slab->ncount, slab->objsize, obj));
869           slab->magic = KMALLOC_SLAB_MAGIC;
870           spin_init(&slab->spin, "kmslb");
871 
872           /*
873            * Rotate it in, then retry.
874            *
875            *        (NEW)slab -> active -> alternate -> G(empty)
876            */
877           spin_lock(&ggm->spin);
878           if (mgt->alternate != &kslab_dummy) {
879                     struct kmalloc_slab *slab_tmp;
880 
881                     slab_tmp = mgt->alternate;
882                     slab_tmp->next = NULL;
883                     *ggm->empty_tailp = slab_tmp;
884                     ggm->empty_tailp = &slab_tmp->next;
885                     ++ggm->nempty;
886           }
887           mgt->alternate = mgt->active;           /* Might be dummy */
888           mgt->active = slab;
889           spin_unlock(&ggm->spin);
890 
891           goto retry;
892 
893           /*
894            * Found object, adjust statistics and return
895            */
896 found:
897           ++use->inuse;
898           ++use->calls;
899           use->memuse += size;
900           use->loosememuse += size;
901           if (__predict_false(use->loosememuse >= KMALLOC_LOOSE_SIZE)) {
902               /* not MP synchronized */
903               type->ks_loosememuse += use->loosememuse;
904               use->loosememuse = 0;
905           }
906 
907           /*
908            * Handle remaining flags.  M_ZERO is typically not set because
909            * the inline macro deals with zeroing for constant sizes.
910            */
911           if (__predict_false(flags & M_ZERO))
912               bzero(obj, size);
913 
914           crit_exit();
915           logmemory(malloc_end, NULL, type, size, flags);
916 
917           return(obj);
918 }
919 
920 /*
921  * Free a type-stable object.  We have the base structure and can
922  * calculate the slab, but from this direction we don't know which
923  * mgt structure or list the slab might be on.
924  */
925 void
_kfree_obj(void * obj,struct malloc_type * type)926 _kfree_obj(void *obj, struct malloc_type *type)
927 {
928           struct kmalloc_slab *slab;
929           struct kmalloc_use *use;
930           globaldata_t gd;
931           size_t    delta;
932           size_t    i;
933 
934           logmemory_quick(free_beg);
935           gd = mycpu;
936 
937           /*
938            * Calculate the slab from the pointer
939            */
940           slab = (void *)((uintptr_t)obj & ~KMALLOC_SLAB_MASK);
941           delta = slab->findex - slab->aindex;
942           KKASSERT(slab->magic == KMALLOC_SLAB_MAGIC && delta != slab->ncount);
943 
944           /*
945            * We can only safely adjust the statistics for the current cpu.
946            * Don't try to track down the original cpu.  The statistics will
947            * be collected and fixed up by vmstat -m  (etc).
948            */
949           use = &slab->type->ks_use[gd->gd_cpuid];
950           --use->inuse;
951           use->memuse -= slab->objsize;
952 
953           /*
954            * There MUST be free space in the slab since we are returning
955            * the obj to the same slab it was allocated from.
956            */
957           i = atomic_fetchadd_long(&slab->findex, 1);
958           i = i % slab->ncount;
959           if (slab->fobjs[i] != NULL) {
960                     kprintf("_kfree_obj failure %zd/%zd/%zd\n",
961                               slab->aindex, slab->findex, slab->ncount);
962           }
963 #ifdef KMALLOC_CHECK_DOUBLE_FREE
964           bmap_clr(slab, obj);
965 #endif
966           KKASSERT(slab->fobjs[i] == NULL);
967           slab->fobjs[i] = obj;
968           atomic_add_long(&slab->xindex, 1);      /* synchronizer */
969 
970           logmemory_quick(free_end);
971 }
972