xref: /dragonfly/lib/libc/stdlib/dmalloc.c (revision 171835807871f68c36f75ff84e1d6f7df4683df0)
1 /*
2  * DMALLOC.C        - Dillon's malloc
3  *
4  * Copyright (c) 2011,2017 The DragonFly Project. All rights reserved.
5  *
6  * This code is derived from software contributed to The DragonFly Project
7  * by Matthew Dillon <dillon@backplane.com>.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module implements a modified slab allocator as a drop-in replacement
38  * for the libc malloc().  The slab algorithm has been adjusted to support
39  * dynamic sizing of slabs which effectively allows slabs to be used for
40  * allocations of any size.  Because of this we neither have a small-block
41  * allocator or a big-block allocator and the code paths are simplified.
42  *
43  * To support dynamic slab sizing available user virtual memory is broken
44  * down into ~1024 regions.  Each region has fixed slab size whos value is
45  * set when the region is opened up for use.  The free() path simply applies
46  * a mask based on the region to the pointer to acquire the base of the
47  * governing slab structure.
48  *
49  * Regions[NREGIONS]          (1024)
50  *
51  * Slab management and locking is done on a per-zone basis.
52  *
53  *        Alloc Size          Chunking        Number of zones
54  *        0-127               8                   16
55  *        128-255             16                  8
56  *        256-511             32                  8
57  *        512-1023  64                  8
58  *        1024-2047 128                 8
59  *        2048-4095 256                 8
60  *        4096-8191 512                 8
61  *        8192-16383          1024                8
62  *        16384-32767         2048                8
63  *        32768-65535         4096                8
64  *        ... continues forever ...     4 zones
65  *
66  *        For a 2^63 memory space each doubling >= 64K is broken down into
67  *        4 chunking zones, so we support 88 + (48 * 4) = 280 zones.
68  *
69  *                               API FEATURES AND SIDE EFFECTS
70  *
71  *    + power-of-2 sized allocations up to a page will be power-of-2 aligned.
72  *        Above that power-of-2 sized allocations are page-aligned.  Non
73  *        power-of-2 sized allocations are aligned the same as the chunk
74  *        size for their zone.
75  *    + ability to allocate arbitrarily large chunks of memory
76  *    + realloc will reuse the passed pointer if possible, within the
77  *        limitations of the zone chunking.
78  *
79  * On top of the slab allocator we also implement a 16-entry-per-thread
80  * magazine cache for allocations <= NOMSLABSIZE.
81  *
82  *                                      FUTURE FEATURES
83  *
84  *    + [better] garbage collection
85  *    + better initial sizing.
86  *
87  * TUNING
88  *
89  * The value of the environment variable MALLOC_OPTIONS is a character string
90  * containing various flags to tune nmalloc.  Upper case letters enabled
91  * or increase the feature, lower case disables or decreases the feature.
92  *
93  * U                Enable UTRACE for all operations, observable with ktrace.
94  *                  Diasbled by default.
95  *
96  * Z                Zero out allocations, otherwise allocations (except for
97  *                  calloc) will contain garbage.
98  *                  Disabled by default.
99  *
100  * H                Pass a hint with madvise() about unused pages.
101  *                  Disabled by default.
102  *                  Not currently implemented.
103  *
104  * F                Disable local per-thread caching.
105  *                  Disabled by default.
106  *
107  * C                Increase (decrease) how much excess cache to retain.
108  *                  Set to 4 by default.
109  */
110 
111 /* cc -shared -fPIC -g -O -I/usr/src/lib/libc/include -o dmalloc.so dmalloc.c */
112 
113 #ifndef STANDALONE_DEBUG
114 #include "libc_private.h"
115 #endif
116 
117 #include <sys/param.h>
118 #include <sys/types.h>
119 #include <sys/mman.h>
120 #include <sys/queue.h>
121 #include <sys/ktrace.h>
122 #include <stdio.h>
123 #include <stdint.h>
124 #include <stdlib.h>
125 #include <stdarg.h>
126 #include <stddef.h>
127 #include <unistd.h>
128 #include <string.h>
129 #include <fcntl.h>
130 #include <errno.h>
131 #include <pthread.h>
132 #include <limits.h>
133 
134 #include <machine/atomic.h>
135 #include <machine/cpufunc.h>
136 
137 #ifdef STANDALONE_DEBUG
138 void _nmalloc_thr_init(void);
139 #else
140 #include "spinlock.h"
141 #include "un-namespace.h"
142 #endif
143 
144 #ifndef MAP_SIZEALIGN
145 #define MAP_SIZEALIGN         0
146 #endif
147 
148 #if SSIZE_MAX == 0x7FFFFFFF
149 #define ADDRBITS    32
150 #define UVM_BITS    32        /* worst case */
151 #else
152 #define ADDRBITS    64
153 #define UVM_BITS    48        /* worst case XXX */
154 #endif
155 
156 #if LONG_MAX == 0x7FFFFFFF
157 #define LONG_BITS   32
158 #define LONG_BITS_SHIFT       5
159 #else
160 #define LONG_BITS   64
161 #define LONG_BITS_SHIFT       6
162 #endif
163 
164 #define LOCKEDPTR   ((void *)(intptr_t)-1)
165 
166 /*
167  * Regions[]
168  */
169 #define NREGIONS_BITS         10
170 #define NREGIONS    (1 << NREGIONS_BITS)
171 #define NREGIONS_MASK         (NREGIONS - 1)
172 #define NREGIONS_SHIFT        (UVM_BITS - NREGIONS_BITS)
173 #define NREGIONS_SIZE         (1LU << NREGIONS_SHIFT)
174 
175 typedef struct region *region_t;
176 typedef struct slglobaldata *slglobaldata_t;
177 typedef struct slab *slab_t;
178 
179 struct region {
180           uintptr_t mask;
181           slab_t              slab;     /* conditional out of band slab */
182 };
183 
184 static struct region Regions[NREGIONS];
185 
186 /*
187  * Number of chunking zones available
188  */
189 #define CHUNKFACTOR 8
190 #if ADDRBITS == 32
191 #define NZONES                (16 + 9 * CHUNKFACTOR + 16 * CHUNKFACTOR)
192 #else
193 #define NZONES                (16 + 9 * CHUNKFACTOR + 48 * CHUNKFACTOR)
194 #endif
195 
196 static int MaxChunks[NZONES];
197 
198 #define NDEPOTS               8                   /* must be power of 2 */
199 
200 /*
201  * Maximum number of chunks per slab, governed by the allocation bitmap in
202  * each slab.  The maximum is reduced for large chunk sizes.
203  */
204 #define MAXCHUNKS   (LONG_BITS * LONG_BITS)
205 #define MAXCHUNKS_BITS        (LONG_BITS_SHIFT * LONG_BITS_SHIFT)
206 #define LITSLABSIZE (32 * 1024)
207 #define NOMSLABSIZE (2 * 1024 * 1024)
208 #define BIGSLABSIZE (128 * 1024 * 1024)
209 
210 #define ZALLOC_SLAB_MAGIC     0x736c6162          /* magic sanity */
211 
212 TAILQ_HEAD(slab_list, slab);
213 
214 /*
215  * A slab structure
216  */
217 struct slab {
218           struct slab         *next;              /* slabs with available space */
219           TAILQ_ENTRY(slab) entry;
220           int32_t             magic;              /* magic number for sanity check */
221           u_int               navail;             /* number of free elements available */
222           u_int               nmax;
223           u_int               free_bit; /* free hint bitno */
224           u_int               free_index;         /* free hint index */
225           u_long              bitmap[LONG_BITS]; /* free chunks */
226           size_t              slab_size;          /* size of entire slab */
227           size_t              chunk_size;         /* chunk size for validation */
228           int                 zone_index;
229           enum { UNKNOWN, AVAIL, EMPTY, FULL } state;
230           int                 flags;
231           region_t  region;             /* related region */
232           char                *chunks;  /* chunk base */
233           slglobaldata_t      slgd;               /* localized to thread else NULL */
234 };
235 
236 /*
237  * per-thread data + global depot
238  *
239  * NOTE: The magazine shortcut is only used for per-thread data.
240  */
241 #define NMAGSHORTCUT          16
242 
243 struct slglobaldata {
244           spinlock_t          lock;               /* only used by slglobaldepot */
245           struct zoneinfo {
246                     slab_t    avail_base;
247                     slab_t    empty_base;
248                     int       best_region;
249                     int       mag_index;
250                     int       avail_count;
251                     int       empty_count;
252                     void      *mag_shortcut[NMAGSHORTCUT];
253           } zone[NZONES];
254           struct slab_list full_zones;  /* via entry */
255           int                 masked;
256           int                 biggest_index;
257           size_t              nslabs;
258 };
259 
260 #define SLAB_ZEROD            0x0001
261 
262 /*
263  * Misc constants.  Note that allocations that are exact multiples of
264  * PAGE_SIZE, or exceed the zone limit, fall through to the kmem module.
265  * IN_SAME_PAGE_MASK is used to sanity-check the per-page free lists.
266  */
267 #define MIN_CHUNK_SIZE                  8                   /* in bytes */
268 #define MIN_CHUNK_MASK                  (MIN_CHUNK_SIZE - 1)
269 
270 #define SAFLAG_ZERO 0x00000001
271 
272 /*
273  * The WEIRD_ADDR is used as known text to copy into free objects to
274  * try to create deterministic failure cases if the data is accessed after
275  * free.
276  *
277  * WARNING: A limited number of spinlocks are available, BIGXSIZE should
278  *            not be larger then 64.
279  */
280 #ifdef INVARIANTS
281 #define WEIRD_ADDR      0xdeadc0de
282 #endif
283 
284 /*
285  * Thread control
286  */
287 
288 #define MASSERT(exp)          do { if (__predict_false(!(exp)))       \
289                                         _mpanic("assertion: %s in %s",          \
290                                         #exp, __func__);              \
291                                   } while (0)
292 
293 /*
294  * With this attribute set, do not require a function call for accessing
295  * this variable when the code is compiled -fPIC.
296  *
297  * Must be empty for libc_rtld (similar to __thread)
298  */
299 #if defined(__LIBC_RTLD)
300 #define TLS_ATTRIBUTE
301 #else
302 #define TLS_ATTRIBUTE __attribute__ ((tls_model ("initial-exec")));
303 #endif
304 
305 static __thread struct slglobaldata slglobal TLS_ATTRIBUTE;
306 static pthread_key_t thread_malloc_key;
307 static pthread_once_t thread_malloc_once = PTHREAD_ONCE_INIT;
308 static struct slglobaldata slglobaldepot;
309 
310 static int opt_madvise = 0;
311 static int opt_free = 0;
312 static int opt_cache = 4;
313 static int opt_utrace = 0;
314 static int g_malloc_flags = 0;
315 static int malloc_panic;
316 
317 #ifdef INVARIANTS
318 static const int32_t weirdary[16] = {
319           WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
320           WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
321           WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR,
322           WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR, WEIRD_ADDR
323 };
324 #endif
325 
326 static void *memalloc(size_t size, int flags);
327 static void *memrealloc(void *ptr, size_t size);
328 static void memfree(void *ptr, int);
329 static int memalign(void **memptr, size_t alignment, size_t size);
330 static slab_t slaballoc(int zi, size_t chunking, size_t chunk_size);
331 static void slabfree(slab_t slab);
332 static void slabterm(slglobaldata_t slgd, slab_t slab);
333 static void *_vmem_alloc(int ri, size_t slab_size);
334 static void _vmem_free(void *ptr, size_t slab_size);
335 static void _mpanic(const char *ctl, ...) __printflike(1, 2);
336 #ifndef STANDALONE_DEBUG
337 static void malloc_init(void) __constructor(101);
338 #else
339 static void malloc_init(void) __constructor(101);
340 #endif
341 
342 
343 struct nmalloc_utrace {
344           void *p;
345           size_t s;
346           void *r;
347 };
348 
349 #define UTRACE(a, b, c)                                                         \
350           if (opt_utrace) {                                           \
351                     struct nmalloc_utrace ut = {                      \
352                               .p = (a),                               \
353                               .s = (b),                               \
354                               .r = (c)                                \
355                     };                                                          \
356                     utrace(&ut, sizeof(ut));                          \
357           }
358 
359 #ifdef INVARIANTS
360 /*
361  * If enabled any memory allocated without M_ZERO is initialized to -1.
362  */
363 static int  use_malloc_pattern;
364 #endif
365 
366 static void
malloc_init(void)367 malloc_init(void)
368 {
369           const char *p = NULL;
370 
371           TAILQ_INIT(&slglobal.full_zones);
372 
373           Regions[0].mask = -1; /* disallow activity in lowest region */
374 
375           if (issetugid() == 0)
376                     p = getenv("MALLOC_OPTIONS");
377 
378           for (; p != NULL && *p != '\0'; p++) {
379                     switch(*p) {
380                     case 'u':
381                               opt_utrace = 0;
382                               break;
383                     case 'U':
384                               opt_utrace = 1;
385                               break;
386                     case 'h':
387                               opt_madvise = 0;
388                               break;
389                     case 'H':
390                               opt_madvise = 1;
391                               break;
392                     case 'c':
393                               if (opt_cache > 0)
394                                         --opt_cache;
395                               break;
396                     case 'C':
397                               ++opt_cache;
398                               break;
399                     case 'f':
400                               opt_free = 0;
401                               break;
402                     case 'F':
403                               opt_free = 1;
404                               break;
405                     case 'z':
406                               g_malloc_flags = 0;
407                               break;
408                     case 'Z':
409                               g_malloc_flags = SAFLAG_ZERO;
410                               break;
411                     default:
412                               break;
413                     }
414           }
415 
416           UTRACE((void *) -1, 0, NULL);
417 }
418 
419 /*
420  * We have to install a handler for nmalloc thread teardowns when
421  * the thread is created.  We cannot delay this because destructors in
422  * sophisticated userland programs can call malloc() for the first time
423  * during their thread exit.
424  *
425  * This routine is called directly from pthreads.
426  */
427 static void _nmalloc_thr_init_once(void);
428 static void _nmalloc_thr_destructor(void *thrp);
429 
430 void
_nmalloc_thr_init(void)431 _nmalloc_thr_init(void)
432 {
433           static int did_init;
434 
435           TAILQ_INIT(&slglobal.full_zones);
436 
437           if (slglobal.masked)
438                     return;
439 
440           slglobal.masked = 1;
441           if (did_init == 0) {
442                     did_init = 1;
443                     pthread_once(&thread_malloc_once, _nmalloc_thr_init_once);
444           }
445           pthread_setspecific(thread_malloc_key, &slglobal);
446           slglobal.masked = 0;
447 }
448 
449 void
_nmalloc_thr_prepfork(void)450 _nmalloc_thr_prepfork(void)
451 {
452           if (__isthreaded)
453                     _SPINLOCK(&slglobaldepot.lock);
454 }
455 
456 void
_nmalloc_thr_parentfork(void)457 _nmalloc_thr_parentfork(void)
458 {
459           if (__isthreaded)
460                     _SPINUNLOCK(&slglobaldepot.lock);
461 }
462 
463 void
_nmalloc_thr_childfork(void)464 _nmalloc_thr_childfork(void)
465 {
466           if (__isthreaded)
467                     _SPINUNLOCK(&slglobaldepot.lock);
468 }
469 
470 /*
471  * Called just once
472  */
473 static void
_nmalloc_thr_init_once(void)474 _nmalloc_thr_init_once(void)
475 {
476           /* ignore error from stub if not threaded */
477           pthread_key_create(&thread_malloc_key, _nmalloc_thr_destructor);
478 }
479 
480 /*
481  * Called for each thread undergoing exit
482  *
483  * Move all of the thread's slabs into a depot.
484  */
485 static void
_nmalloc_thr_destructor(void * thrp)486 _nmalloc_thr_destructor(void *thrp)
487 {
488           slglobaldata_t slgd = thrp;
489           struct zoneinfo *zinfo;
490           slab_t slab;
491           void *ptr;
492           int i;
493           int j;
494 
495           slgd->masked = 1;
496 
497           for (i = 0; i <= slgd->biggest_index; i++) {
498                     zinfo = &slgd->zone[i];
499 
500                     while ((j = zinfo->mag_index) > 0) {
501                               --j;
502                               ptr = zinfo->mag_shortcut[j];
503                               zinfo->mag_shortcut[j] = NULL;          /* SAFETY */
504                               zinfo->mag_index = j;
505                               memfree(ptr, 0);
506                     }
507 
508                     while ((slab = zinfo->empty_base) != NULL) {
509                               zinfo->empty_base = slab->next;
510                               --zinfo->empty_count;
511                               slabterm(slgd, slab);
512                     }
513 
514                     while ((slab = zinfo->avail_base) != NULL) {
515                               zinfo->avail_base = slab->next;
516                               --zinfo->avail_count;
517                               slabterm(slgd, slab);
518                     }
519 
520                     while ((slab = TAILQ_FIRST(&slgd->full_zones)) != NULL) {
521                               TAILQ_REMOVE(&slgd->full_zones, slab, entry);
522                               slabterm(slgd, slab);
523                     }
524           }
525 }
526 
527 /*
528  * Calculate the zone index for the allocation request size and set the
529  * allocation request size to that particular zone's chunk size.
530  *
531  * Minimum alignment is 16 bytes for allocations >= 16 bytes to conform
532  * with malloc requirements for intel/amd.
533  */
534 static __inline int
zoneindex(size_t * bytes,size_t * chunking)535 zoneindex(size_t *bytes, size_t *chunking)
536 {
537           size_t n = (size_t)*bytes;
538           size_t x;
539           size_t c;
540           int i;
541 
542           if (n < 128) {
543                     if (n < 16) {
544                               *bytes = n = (n + 7) & ~7;
545                               *chunking = 8;
546                               return(n / 8 - 1);  /* 8 byte chunks, 2 zones */
547                     } else {
548                               *bytes = n = (n + 15) & ~15;
549                               *chunking = 16;
550                               return(n / 16 + 2); /* 16 byte chunks, 8 zones */
551                     }
552           }
553           if (n < 4096) {
554                     x = 256;
555                     c = x / (CHUNKFACTOR * 2);
556                     i = 16;
557           } else {
558                     x = 8192;
559                     c = x / (CHUNKFACTOR * 2);
560                     i = 16 + CHUNKFACTOR * 5;  /* 256->512,1024,2048,4096,8192 */
561           }
562           while (n >= x) {
563                     x <<= 1;
564                     c <<= 1;
565                     i += CHUNKFACTOR;
566                     if (x == 0)
567                               _mpanic("slaballoc: byte value too high");
568           }
569           *bytes = n = roundup2(n, c);
570           *chunking = c;
571           return (i + n / c - CHUNKFACTOR);
572 #if 0
573           *bytes = n = (n + c - 1) & ~(c - 1);
574           *chunking = c;
575           return (n / c + i);
576 
577           if (n < 256) {
578                     *bytes = n = (n + 15) & ~15;
579                     *chunking = 16;
580                     return(n / (CHUNKINGLO*2) + CHUNKINGLO*1 - 1);
581           }
582           if (n < 8192) {
583                     if (n < 512) {
584                               *bytes = n = (n + 31) & ~31;
585                               *chunking = 32;
586                               return(n / (CHUNKINGLO*4) + CHUNKINGLO*2 - 1);
587                     }
588                     if (n < 1024) {
589                               *bytes = n = (n + 63) & ~63;
590                               *chunking = 64;
591                               return(n / (CHUNKINGLO*8) + CHUNKINGLO*3 - 1);
592                     }
593                     if (n < 2048) {
594                               *bytes = n = (n + 127) & ~127;
595                               *chunking = 128;
596                               return(n / (CHUNKINGLO*16) + CHUNKINGLO*4 - 1);
597                     }
598                     if (n < 4096) {
599                               *bytes = n = (n + 255) & ~255;
600                               *chunking = 256;
601                               return(n / (CHUNKINGLO*32) + CHUNKINGLO*5 - 1);
602                     }
603                     *bytes = n = (n + 511) & ~511;
604                     *chunking = 512;
605                     return(n / (CHUNKINGLO*64) + CHUNKINGLO*6 - 1);
606           }
607           if (n < 16384) {
608                     *bytes = n = (n + 1023) & ~1023;
609                     *chunking = 1024;
610                     return(n / (CHUNKINGLO*128) + CHUNKINGLO*7 - 1);
611           }
612           if (n < 32768) {                                  /* 16384-32767 */
613                     *bytes = n = (n + 2047) & ~2047;
614                     *chunking = 2048;
615                     return(n / (CHUNKINGLO*256) + CHUNKINGLO*8 - 1);
616           }
617           if (n < 65536) {
618                     *bytes = n = (n + 4095) & ~4095;        /* 32768-65535 */
619                     *chunking = 4096;
620                     return(n / (CHUNKINGLO*512) + CHUNKINGLO*9 - 1);
621           }
622 
623           x = 131072;
624           c = 8192;
625           i = CHUNKINGLO*10 - 1;
626 
627           while (n >= x) {
628                     x <<= 1;
629                     c <<= 1;
630                     i += CHUNKINGHI;
631                     if (x == 0)
632                               _mpanic("slaballoc: byte value too high");
633           }
634           *bytes = n = (n + c - 1) & ~(c - 1);
635           *chunking = c;
636           return (n / c + i);
637 #endif
638 }
639 
640 /*
641  * malloc() - call internal slab allocator
642  */
643 void *
__malloc(size_t size)644 __malloc(size_t size)
645 {
646           void *ptr;
647 
648           ptr = memalloc(size, 0);
649           if (ptr == NULL)
650                     errno = ENOMEM;
651           else
652                     UTRACE(0, size, ptr);
653           return(ptr);
654 }
655 
656 /*
657  * calloc() - call internal slab allocator
658  */
659 void *
__calloc(size_t number,size_t size)660 __calloc(size_t number, size_t size)
661 {
662           void *ptr;
663 
664           ptr = memalloc(number * size, SAFLAG_ZERO);
665           if (ptr == NULL)
666                     errno = ENOMEM;
667           else
668                     UTRACE(0, number * size, ptr);
669           return(ptr);
670 }
671 
672 /*
673  * realloc() (SLAB ALLOCATOR)
674  *
675  * We do not attempt to optimize this routine beyond reusing the same
676  * pointer if the new size fits within the chunking of the old pointer's
677  * zone.
678  */
679 void *
__realloc(void * ptr,size_t size)680 __realloc(void *ptr, size_t size)
681 {
682           void *ret;
683 
684           if (ptr == NULL)
685                     ret = memalloc(size, 0);
686           else
687                     ret = memrealloc(ptr, size);
688           if (ret == NULL)
689                     errno = ENOMEM;
690           else
691                     UTRACE(ptr, size, ret);
692           return(ret);
693 }
694 
695 /*
696  * aligned_alloc()
697  *
698  * Allocate (size) bytes with a alignment of (alignment).
699  */
700 void *
__aligned_alloc(size_t alignment,size_t size)701 __aligned_alloc(size_t alignment, size_t size)
702 {
703           void *ptr;
704           int rc;
705 
706           ptr = NULL;
707           rc = memalign(&ptr, alignment, size);
708           if (rc)
709                     errno = rc;
710 
711           return (ptr);
712 }
713 
714 /*
715  * posix_memalign()
716  *
717  * Allocate (size) bytes with a alignment of (alignment), where (alignment)
718  * is a power of 2 >= sizeof(void *).
719  */
720 int
__posix_memalign(void ** memptr,size_t alignment,size_t size)721 __posix_memalign(void **memptr, size_t alignment, size_t size)
722 {
723           int rc;
724 
725           /*
726            * OpenGroup spec issue 6 check
727            */
728           if (alignment < sizeof(void *)) {
729                     *memptr = NULL;
730                     return(EINVAL);
731           }
732 
733           rc = memalign(memptr, alignment, size);
734 
735           return (rc);
736 }
737 
738 /*
739  * The slab allocator will allocate on power-of-2 boundaries up to at least
740  * PAGE_SIZE.  Otherwise we use the zoneindex mechanic to find a zone
741  * matching the requirements.
742  */
743 static int
memalign(void ** memptr,size_t alignment,size_t size)744 memalign(void **memptr, size_t alignment, size_t size)
745 {
746 
747           if (alignment < 1) {
748                     *memptr = NULL;
749                     return(EINVAL);
750           }
751 
752           /*
753            * OpenGroup spec issue 6 check
754            */
755           if ((alignment | (alignment - 1)) + 1 != (alignment << 1)) {
756                     *memptr = NULL;
757                     return(EINVAL);
758           }
759 
760           /*
761            * XXX for now just find the nearest power of 2 >= size and also
762            * >= alignment and allocate that.
763            */
764           while (alignment < size) {
765                     alignment <<= 1;
766                     if (alignment == 0)
767                               _mpanic("posix_memalign: byte value too high");
768           }
769           *memptr = memalloc(alignment, 0);
770           return(*memptr ? 0 : ENOMEM);
771 }
772 
773 /*
774  * free() (SLAB ALLOCATOR) - do the obvious
775  */
776 void
__free(void * ptr)777 __free(void *ptr)
778 {
779           if (ptr) {
780                     UTRACE(ptr, 0, 0);
781                     memfree(ptr, 0);
782           }
783 }
784 
785 /*
786  * memalloc()       (SLAB ALLOCATOR)
787  *
788  *        Allocate memory via the slab allocator.
789  */
790 static void *
memalloc(size_t size,int flags)791 memalloc(size_t size, int flags)
792 {
793           slglobaldata_t slgd;
794           struct zoneinfo *zinfo;
795           slab_t slab;
796           size_t chunking;
797           int bmi;
798           int bno;
799           u_long *bmp;
800           int zi;
801 #ifdef INVARIANTS
802           int i;
803 #endif
804           int j;
805           char *obj;
806 
807           /*
808            * If 0 bytes is requested we have to return a unique pointer, allocate
809            * at least one byte.
810            */
811           if (size == 0)
812                     size = 1;
813 
814           /* Capture global flags */
815           flags |= g_malloc_flags;
816 
817           /* Compute allocation zone; zoneindex will panic on excessive sizes */
818           zi = zoneindex(&size, &chunking);
819           MASSERT(zi < NZONES);
820           if (size == 0)
821                     return(NULL);
822 
823           /*
824            * Try magazine shortcut first
825            */
826           slgd = &slglobal;
827           zinfo = &slgd->zone[zi];
828 
829           if ((j = zinfo->mag_index) != 0) {
830                     zinfo->mag_index = --j;
831                     obj = zinfo->mag_shortcut[j];
832                     zinfo->mag_shortcut[j] = NULL;          /* SAFETY */
833                     if (flags & SAFLAG_ZERO)
834                               bzero(obj, size);
835                     return obj;
836           }
837 
838           /*
839            * Locate a slab with available space.  If no slabs are available
840            * back-off to the empty list and if we still come up dry allocate
841            * a new slab (which will try the depot first).
842            */
843 retry:
844           if ((slab = zinfo->avail_base) == NULL) {
845                     if ((slab = zinfo->empty_base) == NULL) {
846                               /*
847                                * Still dry
848                                */
849                               slab = slaballoc(zi, chunking, size);
850                               if (slab == NULL)
851                                         return(NULL);
852                               slab->next = zinfo->avail_base;
853                               zinfo->avail_base = slab;
854                               ++zinfo->avail_count;
855                               slab->state = AVAIL;
856                               if (slgd->biggest_index < zi)
857                                         slgd->biggest_index = zi;
858                               ++slgd->nslabs;
859                     } else {
860                               /*
861                                * Pulled from empty list
862                                */
863                               zinfo->empty_base = slab->next;
864                               slab->next = zinfo->avail_base;
865                               zinfo->avail_base = slab;
866                               ++zinfo->avail_count;
867                               slab->state = AVAIL;
868                               --zinfo->empty_count;
869                     }
870           }
871 
872           /*
873            * Allocate a chunk out of the slab.  HOT PATH
874            *
875            * Only the thread owning the slab can allocate out of it.
876            *
877            * NOTE: The last bit in the bitmap is always marked allocated so
878            *         we cannot overflow here.
879            */
880           bno = slab->free_bit;
881           bmi = slab->free_index;
882           bmp = &slab->bitmap[bmi];
883           if (*bmp & (1LU << bno)) {
884                     atomic_clear_long(bmp, 1LU << bno);
885                     obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) * size;
886                     slab->free_bit = (bno + 1) & (LONG_BITS - 1);
887                     atomic_add_int(&slab->navail, -1);
888                     if (flags & SAFLAG_ZERO)
889                               bzero(obj, size);
890                     return (obj);
891           }
892 
893           /*
894            * Allocate a chunk out of a slab.  COLD PATH
895            */
896           if (slab->navail == 0) {
897                     zinfo->avail_base = slab->next;
898                     --zinfo->avail_count;
899                     slab->state = FULL;
900                     TAILQ_INSERT_TAIL(&slgd->full_zones, slab, entry);
901                     goto retry;
902           }
903 
904           while (bmi < LONG_BITS) {
905                     bmp = &slab->bitmap[bmi];
906                     if (*bmp) {
907                               bno = bsflong(*bmp);
908                               atomic_clear_long(bmp, 1LU << bno);
909                               obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
910                                                        size;
911                               slab->free_index = bmi;
912                               slab->free_bit = (bno + 1) & (LONG_BITS - 1);
913                               atomic_add_int(&slab->navail, -1);
914                               if (flags & SAFLAG_ZERO)
915                                         bzero(obj, size);
916                               return (obj);
917                     }
918                     ++bmi;
919           }
920           bmi = 0;
921           while (bmi < LONG_BITS) {
922                     bmp = &slab->bitmap[bmi];
923                     if (*bmp) {
924                               bno = bsflong(*bmp);
925                               atomic_clear_long(bmp, 1LU << bno);
926                               obj = slab->chunks + ((bmi << LONG_BITS_SHIFT) + bno) *
927                                                        size;
928                               slab->free_index = bmi;
929                               slab->free_bit = (bno + 1) & (LONG_BITS - 1);
930                               atomic_add_int(&slab->navail, -1);
931                               if (flags & SAFLAG_ZERO)
932                                         bzero(obj, size);
933                               return (obj);
934                     }
935                     ++bmi;
936           }
937           _mpanic("slaballoc: corrupted zone: navail %d", slab->navail);
938           /* not reached */
939           return NULL;
940 }
941 
942 /*
943  * Reallocate memory within the chunk
944  */
945 static void *
memrealloc(void * ptr,size_t nsize)946 memrealloc(void *ptr, size_t nsize)
947 {
948           region_t region;
949           slab_t slab;
950           size_t osize;
951           char *obj;
952           int flags = 0;
953 
954           /*
955            * If 0 bytes is requested we have to return a unique pointer, allocate
956            * at least one byte.
957            */
958           if (nsize == 0)
959                     nsize = 1;
960 
961           /* Capture global flags */
962           flags |= g_malloc_flags;
963 
964           /*
965            * Locate the zone by looking up the dynamic slab size mask based
966            * on the memory region the allocation resides in.
967            */
968           region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
969           if ((slab = region->slab) == NULL)
970                     slab = (void *)((uintptr_t)ptr & region->mask);
971           MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
972           osize = slab->chunk_size;
973           if (nsize <= osize) {
974                     if (osize < 32 || nsize >= osize / 2) {
975                               obj = ptr;
976                               if ((flags & SAFLAG_ZERO) && nsize < osize)
977                                         bzero(obj + nsize, osize - nsize);
978                               return(obj);
979                     }
980           }
981 
982           /*
983            * Otherwise resize the object
984            */
985           obj = memalloc(nsize, 0);
986           if (obj) {
987                     if (nsize > osize)
988                               nsize = osize;
989                     bcopy(ptr, obj, nsize);
990                     memfree(ptr, 0);
991           }
992           return (obj);
993 }
994 
995 /*
996  * free (SLAB ALLOCATOR)
997  *
998  * Free a memory block previously allocated by malloc.
999  *
1000  * MPSAFE
1001  */
1002 static void
memfree(void * ptr,int flags)1003 memfree(void *ptr, int flags)
1004 {
1005           region_t region;
1006           slglobaldata_t slgd;
1007           slab_t slab;
1008           slab_t stmp;
1009           slab_t *slabp;
1010           int bmi;
1011           int bno;
1012           int j;
1013           u_long *bmp;
1014 
1015           /*
1016            * Locate the zone by looking up the dynamic slab size mask based
1017            * on the memory region the allocation resides in.
1018            *
1019            * WARNING!  The slab may be owned by another thread!
1020            */
1021           region = &Regions[((uintptr_t)ptr >> NREGIONS_SHIFT) & NREGIONS_MASK];
1022           if ((slab = region->slab) == NULL)
1023                     slab = (void *)((uintptr_t)ptr & region->mask);
1024           MASSERT(slab != NULL);
1025           MASSERT(slab->magic == ZALLOC_SLAB_MAGIC);
1026 
1027 #ifdef INVARIANTS
1028           /*
1029            * Put weird data into the memory to detect modifications after
1030            * freeing, illegal pointer use after freeing (we should fault on
1031            * the odd address), and so forth.
1032            */
1033           if (slab->chunk_size < sizeof(weirdary))
1034                     bcopy(weirdary, ptr, slab->chunk_size);
1035           else
1036                     bcopy(weirdary, ptr, sizeof(weirdary));
1037 #endif
1038           slgd = &slglobal;
1039 
1040           /*
1041            * Use mag_shortcut[] when possible
1042            */
1043           if (slgd->masked == 0 && slab->chunk_size <= NOMSLABSIZE) {
1044                     struct zoneinfo *zinfo;
1045 
1046                     zinfo = &slgd->zone[slab->zone_index];
1047                     j = zinfo->mag_index;
1048                     if (j < NMAGSHORTCUT) {
1049                               zinfo->mag_shortcut[j] = ptr;
1050                               zinfo->mag_index = j + 1;
1051                               return;
1052                     }
1053           }
1054 
1055           /*
1056            * Free to slab and increment navail.  We can delay incrementing
1057            * navail to prevent the slab from being destroyed out from under
1058            * us while we do other optimizations.
1059            */
1060           bno = ((uintptr_t)ptr - (uintptr_t)slab->chunks) / slab->chunk_size;
1061           bmi = bno >> LONG_BITS_SHIFT;
1062           bno &= (LONG_BITS - 1);
1063           bmp = &slab->bitmap[bmi];
1064 
1065           MASSERT(bmi >= 0 && bmi < slab->nmax);
1066           MASSERT((*bmp & (1LU << bno)) == 0);
1067           atomic_set_long(bmp, 1LU << bno);
1068 
1069           if (slab->slgd == slgd) {
1070                     /*
1071                      * We can only do the following if we own the slab.  Note
1072                      * that navail can be incremented by any thread even if
1073                      * we own the slab.
1074                      */
1075                     struct zoneinfo *zinfo;
1076 
1077                     atomic_add_int(&slab->navail, 1);
1078                     if (slab->free_index > bmi) {
1079                               slab->free_index = bmi;
1080                               slab->free_bit = bno;
1081                     } else if (slab->free_index == bmi &&
1082                                  slab->free_bit > bno) {
1083                               slab->free_bit = bno;
1084                     }
1085                     zinfo = &slgd->zone[slab->zone_index];
1086 
1087                     /*
1088                      * Freeing an object from a full slab makes it less than
1089                      * full.  The slab must be moved to the available list.
1090                      *
1091                      * If the available list has too many slabs, release some
1092                      * to the depot.
1093                      */
1094                     if (slab->state == FULL) {
1095                               TAILQ_REMOVE(&slgd->full_zones, slab, entry);
1096                               slab->state = AVAIL;
1097                               stmp = zinfo->avail_base;
1098                               slab->next = stmp;
1099                               zinfo->avail_base = slab;
1100                               ++zinfo->avail_count;
1101                               while (zinfo->avail_count > opt_cache) {
1102                                         slab = zinfo->avail_base;
1103                                         zinfo->avail_base = slab->next;
1104                                         --zinfo->avail_count;
1105                                         slabterm(slgd, slab);
1106                               }
1107                               goto done;
1108                     }
1109 
1110                     /*
1111                      * If the slab becomes completely empty dispose of it in
1112                      * some manner.  By default each thread caches up to 4
1113                      * empty slabs.  Only small slabs are cached.
1114                      */
1115                     if (slab->navail == slab->nmax && slab->state == AVAIL) {
1116                               /*
1117                                * Remove slab from available queue
1118                                */
1119                               slabp = &zinfo->avail_base;
1120                               while ((stmp = *slabp) != slab)
1121                                         slabp = &stmp->next;
1122                               *slabp = slab->next;
1123                               --zinfo->avail_count;
1124 
1125                               if (opt_free || opt_cache == 0) {
1126                                         /*
1127                                          * If local caching is disabled cache the
1128                                          * slab in the depot (or free it).
1129                                          */
1130                                         slabterm(slgd, slab);
1131                               } else if (slab->slab_size > BIGSLABSIZE) {
1132                                         /*
1133                                          * We do not try to retain large slabs
1134                                          * in per-thread caches.
1135                                          */
1136                                         slabterm(slgd, slab);
1137                               } else if (zinfo->empty_count > opt_cache) {
1138                                         /*
1139                                          * We have too many slabs cached, but
1140                                          * instead of freeing this one free
1141                                          * an empty slab that's been idle longer.
1142                                          *
1143                                          * (empty_count does not change)
1144                                          */
1145                                         stmp = zinfo->empty_base;
1146                                         slab->state = EMPTY;
1147                                         slab->next = stmp->next;
1148                                         zinfo->empty_base = slab;
1149                                         slabterm(slgd, stmp);
1150                               } else {
1151                                         /*
1152                                          * Cache the empty slab in our thread local
1153                                          * empty list.
1154                                          */
1155                                         ++zinfo->empty_count;
1156                                         slab->state = EMPTY;
1157                                         slab->next = zinfo->empty_base;
1158                                         zinfo->empty_base = slab;
1159                               }
1160                     }
1161           } else if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1162                     slglobaldata_t sldepot;
1163 
1164                     /*
1165                      * If freeing to a slab owned by the global depot, and
1166                      * the slab becomes completely EMPTY, try to move it to
1167                      * the correct list.
1168                      */
1169                     sldepot = &slglobaldepot;
1170                     if (__isthreaded)
1171                               _SPINLOCK(&sldepot->lock);
1172                     if (slab->slgd == NULL && slab->navail + 1 == slab->nmax) {
1173                               struct zoneinfo *zinfo;
1174 
1175                               /*
1176                                * Move the slab to the empty list
1177                                */
1178                               MASSERT(slab->state == AVAIL);
1179                               atomic_add_int(&slab->navail, 1);
1180                               zinfo = &sldepot->zone[slab->zone_index];
1181                               slabp = &zinfo->avail_base;
1182                               while (slab != *slabp)
1183                                         slabp = &(*slabp)->next;
1184                               *slabp = slab->next;
1185                               --zinfo->avail_count;
1186 
1187                               /*
1188                                * Clean out excessive empty entries from the
1189                                * depot.
1190                                */
1191                               slab->state = EMPTY;
1192                               slab->next = zinfo->empty_base;
1193                               zinfo->empty_base = slab;
1194                               ++zinfo->empty_count;
1195                               while (zinfo->empty_count > opt_cache) {
1196                                         slab = zinfo->empty_base;
1197                                         zinfo->empty_base = slab->next;
1198                                         --zinfo->empty_count;
1199                                         slab->state = UNKNOWN;
1200                                         if (__isthreaded)
1201                                                   _SPINUNLOCK(&sldepot->lock);
1202                                         slabfree(slab);
1203                                         if (__isthreaded)
1204                                                   _SPINLOCK(&sldepot->lock);
1205                               }
1206                     } else {
1207                               atomic_add_int(&slab->navail, 1);
1208                     }
1209                     if (__isthreaded)
1210                               _SPINUNLOCK(&sldepot->lock);
1211           } else {
1212                     /*
1213                      * We can't act on the slab other than by adjusting navail
1214                      * (and the bitmap which we did in the common code at the
1215                      * top).
1216                      */
1217                     atomic_add_int(&slab->navail, 1);
1218           }
1219 done:
1220           ;
1221 }
1222 
1223 /*
1224  * Allocate a new slab holding objects of size chunk_size.
1225  */
1226 static slab_t
slaballoc(int zi,size_t chunking,size_t chunk_size)1227 slaballoc(int zi, size_t chunking, size_t chunk_size)
1228 {
1229           slglobaldata_t slgd;
1230           slglobaldata_t sldepot;
1231           struct zoneinfo *zinfo;
1232           region_t region;
1233           void *save;
1234           slab_t slab;
1235           size_t slab_desire;
1236           size_t slab_size;
1237           size_t region_mask;
1238           uintptr_t chunk_offset;
1239           ssize_t maxchunks;
1240           ssize_t tmpchunks;
1241           int ispower2;
1242           int power;
1243           int ri;
1244           int rx;
1245           int nswath;
1246           int j;
1247 
1248           /*
1249            * First look in the depot.  Any given zone in the depot may be
1250            * locked by being set to -1.  We have to do this instead of simply
1251            * removing the entire chain because removing the entire chain can
1252            * cause racing threads to allocate local slabs for large objects,
1253            * resulting in a large VSZ.
1254            */
1255           slgd = &slglobal;
1256           sldepot = &slglobaldepot;
1257           zinfo = &sldepot->zone[zi];
1258 
1259           if (zinfo->avail_base) {
1260                     if (__isthreaded)
1261                               _SPINLOCK(&sldepot->lock);
1262                     slab = zinfo->avail_base;
1263                     if (slab) {
1264                               MASSERT(slab->slgd == NULL);
1265                               slab->slgd = slgd;
1266                               zinfo->avail_base = slab->next;
1267                               --zinfo->avail_count;
1268                               if (__isthreaded)
1269                                         _SPINUNLOCK(&sldepot->lock);
1270                               return slab;
1271                     }
1272                     if (__isthreaded)
1273                               _SPINUNLOCK(&sldepot->lock);
1274           }
1275 
1276           /*
1277            * Nothing in the depot, allocate a new slab by locating or assigning
1278            * a region and then using the system virtual memory allocator.
1279            */
1280           slab = NULL;
1281 
1282           /*
1283            * Calculate the start of the data chunks relative to the start
1284            * of the slab.  If chunk_size is a power of 2 we guarantee
1285            * power of 2 alignment.  If it is not we guarantee alignment
1286            * to the chunk size.
1287            */
1288           if ((chunk_size ^ (chunk_size - 1)) == (chunk_size << 1) - 1) {
1289                     ispower2 = 1;
1290                     chunk_offset = roundup2(sizeof(*slab), chunk_size);
1291           } else {
1292                     ispower2 = 0;
1293                     chunk_offset = sizeof(*slab) + chunking - 1;
1294                     chunk_offset -= chunk_offset % chunking;
1295           }
1296 
1297           /*
1298            * Calculate a reasonable number of chunks for the slab.
1299            *
1300            * Once initialized the MaxChunks[] array can only ever be
1301            * reinitialized to the same value.
1302            */
1303           maxchunks = MaxChunks[zi];
1304           if (maxchunks == 0) {
1305                     /*
1306                      * First calculate how many chunks would fit in 1/1024
1307                      * available memory.  This is around 2MB on a 32 bit
1308                      * system and 128G on a 64-bit (48-bits available) system.
1309                      */
1310                     maxchunks = (ssize_t)(NREGIONS_SIZE - chunk_offset) /
1311                                   (ssize_t)chunk_size;
1312                     if (maxchunks <= 0)
1313                               maxchunks = 1;
1314 
1315                     /*
1316                      * A slab cannot handle more than MAXCHUNKS chunks, but
1317                      * limit us to approximately MAXCHUNKS / 2 here because
1318                      * we may have to expand maxchunks when we calculate the
1319                      * actual power-of-2 slab.
1320                      */
1321                     if (maxchunks > MAXCHUNKS / 2)
1322                               maxchunks = MAXCHUNKS / 2;
1323 
1324                     /*
1325                      * Try to limit the slabs to BIGSLABSIZE (~128MB).  Larger
1326                      * slabs will be created if the allocation does not fit.
1327                      */
1328                     if (chunk_offset + chunk_size * maxchunks > BIGSLABSIZE) {
1329                               tmpchunks = (ssize_t)(BIGSLABSIZE - chunk_offset) /
1330                                             (ssize_t)chunk_size;
1331                               if (tmpchunks <= 0)
1332                                         tmpchunks = 1;
1333                               if (maxchunks > tmpchunks)
1334                                         maxchunks = tmpchunks;
1335                     }
1336 
1337                     /*
1338                      * If the slab calculates to greater than 2MB see if we
1339                      * can cut it down to ~2MB.  This controls VSZ but has
1340                      * no effect on run-time size or performance.
1341                      *
1342                      * This is very important in case you core dump and also
1343                      * important to reduce unnecessary region allocations.
1344                      */
1345                     if (chunk_offset + chunk_size * maxchunks > NOMSLABSIZE) {
1346                               tmpchunks = (ssize_t)(NOMSLABSIZE - chunk_offset) /
1347                                             (ssize_t)chunk_size;
1348                               if (tmpchunks < 1)
1349                                         tmpchunks = 1;
1350                               if (maxchunks > tmpchunks)
1351                                         maxchunks = tmpchunks;
1352                     }
1353 
1354                     /*
1355                      * If the slab calculates to greater than 128K see if we
1356                      * can cut it down to ~128K while still maintaining a
1357                      * reasonably large number of chunks in each slab.  This
1358                      * controls VSZ but has no effect on run-time size or
1359                      * performance.
1360                      *
1361                      * This is very important in case you core dump and also
1362                      * important to reduce unnecessary region allocations.
1363                      */
1364                     if (chunk_offset + chunk_size * maxchunks > LITSLABSIZE) {
1365                               tmpchunks = (ssize_t)(LITSLABSIZE - chunk_offset) /
1366                                             (ssize_t)chunk_size;
1367                               if (tmpchunks < 32)
1368                                         tmpchunks = 32;
1369                               if (maxchunks > tmpchunks)
1370                                         maxchunks = tmpchunks;
1371                     }
1372 
1373                     MaxChunks[zi] = maxchunks;
1374           }
1375           MASSERT(maxchunks > 0 && maxchunks <= MAXCHUNKS);
1376 
1377           /*
1378            * Calculate the actual slab size.  maxchunks will be recalculated
1379            * a little later.
1380            */
1381           slab_desire = chunk_offset + chunk_size * maxchunks;
1382           slab_size = 8 * MAXCHUNKS;
1383           power = 3 + MAXCHUNKS_BITS;
1384           while (slab_size < slab_desire) {
1385                     slab_size <<= 1;
1386                     ++power;
1387           }
1388 
1389           /*
1390            * Do a quick recalculation based on the actual slab size but not
1391            * yet dealing with whether the slab header is in-band or out-of-band.
1392            * The purpose here is to see if we can reasonably reduce slab_size
1393            * to a power of 4 to allow more chunk sizes to use the same slab
1394            * size.
1395            */
1396           if ((power & 1) && slab_size > 32768) {
1397                     maxchunks = (slab_size - chunk_offset) / chunk_size;
1398                     if (maxchunks >= MAXCHUNKS / 8) {
1399                               slab_size >>= 1;
1400                               --power;
1401                     }
1402           }
1403           if ((power & 2) && slab_size > 32768 * 4) {
1404                     maxchunks = (slab_size - chunk_offset) / chunk_size;
1405                     if (maxchunks >= MAXCHUNKS / 4) {
1406                               slab_size >>= 2;
1407                               power -= 2;
1408                     }
1409           }
1410           /*
1411            * This case occurs when the slab_size is larger than 1/1024 available
1412            * UVM.
1413            */
1414           nswath = slab_size / NREGIONS_SIZE;
1415           if (nswath > NREGIONS)
1416                     return (NULL);
1417 
1418 
1419           /*
1420            * Try to allocate from our current best region for this zi
1421            */
1422           region_mask = ~(slab_size - 1);
1423           ri = slgd->zone[zi].best_region;
1424           if (Regions[ri].mask == region_mask) {
1425                     if ((slab = _vmem_alloc(ri, slab_size)) != NULL)
1426                               goto found;
1427           }
1428 
1429           /*
1430            * Try to find an existing region to allocate from.  The normal
1431            * case will be for allocations that are less than 1/1024 available
1432            * UVM, which fit into a single Regions[] entry.
1433            */
1434           while (slab_size <= NREGIONS_SIZE) {
1435                     rx = -1;
1436                     for (ri = 0; ri < NREGIONS; ++ri) {
1437                               if (rx < 0 && Regions[ri].mask == 0)
1438                                         rx = ri;
1439                               if (Regions[ri].mask == region_mask) {
1440                                         slab = _vmem_alloc(ri, slab_size);
1441                                         if (slab) {
1442                                                   slgd->zone[zi].best_region = ri;
1443                                                   goto found;
1444                                         }
1445                               }
1446                     }
1447 
1448                     if (rx < 0)
1449                               return(NULL);
1450 
1451                     /*
1452                      * This can fail, retry either way
1453                      */
1454                     atomic_cmpset_ptr((void **)&Regions[rx].mask,
1455                                           NULL,
1456                                           (void *)region_mask);
1457           }
1458 
1459           for (;;) {
1460                     rx = -1;
1461                     for (ri = 0; ri < NREGIONS; ri += nswath) {
1462                               if (Regions[ri].mask == region_mask) {
1463                                         slab = _vmem_alloc(ri, slab_size);
1464                                         if (slab) {
1465                                                   slgd->zone[zi].best_region = ri;
1466                                                   goto found;
1467                                         }
1468                               }
1469                               if (rx < 0) {
1470                                         for (j = nswath - 1; j >= 0; --j) {
1471                                                   if (Regions[ri+j].mask != 0)
1472                                                             break;
1473                                         }
1474                                         if (j < 0)
1475                                                   rx = ri;
1476                               }
1477                     }
1478 
1479                     /*
1480                      * We found a candidate, try to allocate it backwards so
1481                      * another thread racing a slaballoc() does not see the
1482                      * mask in the base index position until we are done.
1483                      *
1484                      * We can safely zero-out any partial allocations because
1485                      * the mask is only accessed from the base index.  Any other
1486                      * threads racing us will fail prior to us clearing the mask.
1487                      */
1488                     if (rx < 0)
1489                               return(NULL);
1490                     for (j = nswath - 1; j >= 0; --j) {
1491                               if (!atomic_cmpset_ptr((void **)&Regions[rx+j].mask,
1492                                                          NULL, (void *)region_mask)) {
1493                                         while (++j < nswath)
1494                                                   Regions[rx+j].mask = 0;
1495                                         break;
1496                               }
1497                     }
1498                     /* retry */
1499           }
1500 
1501           /*
1502            * Fill in the new slab in region ri.  If the slab_size completely
1503            * fills one or more region slots we move the slab structure out of
1504            * band which should optimize the chunking (particularly for a power
1505            * of 2).
1506            */
1507 found:
1508           region = &Regions[ri];
1509           MASSERT(region->slab == NULL);
1510           if (slab_size >= NREGIONS_SIZE) {
1511                     save = slab;
1512                     slab = memalloc(sizeof(*slab), 0);
1513                     bzero(slab, sizeof(*slab));
1514                     slab->chunks = save;
1515                     for (j = 0; j < nswath; ++j)
1516                               region[j].slab = slab;
1517                     chunk_offset = 0;
1518           } else {
1519                     bzero(slab, sizeof(*slab));
1520                     slab->chunks = (char *)slab + chunk_offset;
1521           }
1522 
1523           /*
1524            * Calculate the start of the chunks memory and recalculate the
1525            * actual number of chunks the slab can hold.
1526            */
1527           maxchunks = (slab_size - chunk_offset) / chunk_size;
1528           if (maxchunks > MAXCHUNKS)
1529                     maxchunks = MAXCHUNKS;
1530 
1531           /*
1532            * And fill in the rest
1533            */
1534           slab->magic = ZALLOC_SLAB_MAGIC;
1535           slab->navail = maxchunks;
1536           slab->nmax = maxchunks;
1537           slab->slab_size = slab_size;
1538           slab->chunk_size = chunk_size;
1539           slab->zone_index = zi;
1540           slab->slgd = slgd;
1541           slab->state = UNKNOWN;
1542           slab->region = region;
1543 
1544           for (ri = 0; ri < maxchunks; ri += LONG_BITS) {
1545                     if (ri + LONG_BITS <= maxchunks)
1546                               slab->bitmap[ri >> LONG_BITS_SHIFT] = ULONG_MAX;
1547                     else
1548                               slab->bitmap[ri >> LONG_BITS_SHIFT] =
1549                                                             (1LU << (maxchunks - ri)) - 1;
1550           }
1551           return (slab);
1552 }
1553 
1554 /*
1555  * Free a slab.
1556  */
1557 static void
slabfree(slab_t slab)1558 slabfree(slab_t slab)
1559 {
1560           int nswath;
1561           int j;
1562 
1563           if (slab->region->slab == slab) {
1564                     /*
1565                      * Out-of-band slab.
1566                      */
1567                     nswath = slab->slab_size / NREGIONS_SIZE;
1568                     for (j = 0; j < nswath; ++j)
1569                               slab->region[j].slab = NULL;
1570                     slab->magic = 0;
1571                     _vmem_free(slab->chunks, slab->slab_size);
1572                     memfree(slab, 0);
1573           } else {
1574                     /*
1575                      * In-band slab.
1576                      */
1577                     slab->magic = 0;
1578                     _vmem_free(slab, slab->slab_size);
1579           }
1580 }
1581 
1582 /*
1583  * Terminate a slab's use in the current thread.  The slab may still have
1584  * outstanding allocations and thus not be deallocatable.
1585  */
1586 static void
slabterm(slglobaldata_t slgd,slab_t slab)1587 slabterm(slglobaldata_t slgd, slab_t slab)
1588 {
1589           slglobaldata_t sldepot;
1590           struct zoneinfo *zinfo;
1591           int zi = slab->zone_index;
1592 
1593           slab->slgd = NULL;
1594           --slgd->nslabs;
1595           sldepot = &slglobaldepot;
1596           zinfo = &sldepot->zone[zi];
1597 
1598           /*
1599            * Move the slab to the avail list or the empty list.
1600            */
1601           if (__isthreaded)
1602                     _SPINLOCK(&sldepot->lock);
1603           if (slab->navail == slab->nmax) {
1604                     slab->state = EMPTY;
1605                     slab->next = zinfo->empty_base;
1606                     zinfo->empty_base = slab;
1607                     ++zinfo->empty_count;
1608           } else {
1609                     slab->state = AVAIL;
1610                     slab->next = zinfo->avail_base;
1611                     zinfo->avail_base = slab;
1612                     ++zinfo->avail_count;
1613           }
1614 
1615           /*
1616            * Clean extra slabs out of the empty list
1617            */
1618           while (zinfo->empty_count > opt_cache) {
1619                     slab = zinfo->empty_base;
1620                     zinfo->empty_base = slab->next;
1621                     --zinfo->empty_count;
1622                     slab->state = UNKNOWN;
1623                     if (__isthreaded)
1624                               _SPINUNLOCK(&sldepot->lock);
1625                     slabfree(slab);
1626                     if (__isthreaded)
1627                               _SPINLOCK(&sldepot->lock);
1628           }
1629           if (__isthreaded)
1630                     _SPINUNLOCK(&sldepot->lock);
1631 }
1632 
1633 /*
1634  * _vmem_alloc()
1635  *
1636  *        Directly map memory in PAGE_SIZE'd chunks with the specified
1637  *        alignment.
1638  *
1639  *        Alignment must be a multiple of PAGE_SIZE.
1640  *
1641  *        Size must be >= alignment.
1642  */
1643 static void *
_vmem_alloc(int ri,size_t slab_size)1644 _vmem_alloc(int ri, size_t slab_size)
1645 {
1646           char *baddr = (void *)((uintptr_t)ri << NREGIONS_SHIFT);
1647           char *eaddr;
1648           char *addr;
1649           char *save;
1650           uintptr_t excess;
1651 
1652           if (slab_size < NREGIONS_SIZE)
1653                     eaddr = baddr + NREGIONS_SIZE;
1654           else
1655                     eaddr = baddr + slab_size;
1656 
1657           /*
1658            * This usually just works but might not.
1659            */
1660           addr = mmap(baddr, slab_size, PROT_READ|PROT_WRITE,
1661                         MAP_PRIVATE | MAP_ANON | MAP_SIZEALIGN, -1, 0);
1662           if (addr == MAP_FAILED) {
1663                     return (NULL);
1664           }
1665           if (addr < baddr || addr + slab_size > eaddr) {
1666                     munmap(addr, slab_size);
1667                     return (NULL);
1668           }
1669 
1670           /*
1671            * Check alignment.  The misaligned offset is also the excess
1672            * amount.  If misaligned unmap the excess so we have a chance of
1673            * mapping at the next alignment point and recursively try again.
1674            *
1675            * BBBBBBBBBBB BBBBBBBBBBB BBBBBBBBBBB  block alignment
1676            *   aaaaaaaaa aaaaaaaaaaa aa           mis-aligned allocation
1677            *   xxxxxxxxx                                    final excess calculation
1678            *   ^ returned address
1679            */
1680           excess = (uintptr_t)addr & (slab_size - 1);
1681           while (excess) {
1682                     excess = slab_size - excess;
1683                     save = addr;
1684 
1685                     munmap(save + excess, slab_size - excess);
1686                     addr = _vmem_alloc(ri, slab_size);
1687                     munmap(save, excess);
1688                     if (addr == NULL)
1689                               return (NULL);
1690                     if (addr < baddr || addr + slab_size > eaddr) {
1691                               munmap(addr, slab_size);
1692                               return (NULL);
1693                     }
1694                     excess = (uintptr_t)addr & (slab_size - 1);
1695           }
1696           return (addr);
1697 }
1698 
1699 /*
1700  * _vmem_free()
1701  *
1702  *        Free a chunk of memory allocated with _vmem_alloc()
1703  */
1704 static void
_vmem_free(void * ptr,size_t size)1705 _vmem_free(void *ptr, size_t size)
1706 {
1707           munmap(ptr, size);
1708 }
1709 
1710 /*
1711  * Panic on fatal conditions
1712  */
1713 static void
_mpanic(const char * ctl,...)1714 _mpanic(const char *ctl, ...)
1715 {
1716           va_list va;
1717 
1718           if (malloc_panic == 0) {
1719                     malloc_panic = 1;
1720                     va_start(va, ctl);
1721                     vfprintf(stderr, ctl, va);
1722                     fprintf(stderr, "\n");
1723                     fflush(stderr);
1724                     va_end(va);
1725           }
1726           abort();
1727 }
1728 
1729 __weak_reference(__aligned_alloc, aligned_alloc);
1730 __weak_reference(__malloc, malloc);
1731 __weak_reference(__calloc, calloc);
1732 __weak_reference(__posix_memalign, posix_memalign);
1733 __weak_reference(__realloc, realloc);
1734 __weak_reference(__free, free);
1735