1 /******************************************************************************/
2 #ifdef JEMALLOC_H_TYPES
3
/* Forward declarations of the profiling types defined in the structs section. */
typedef struct prof_bt_s	prof_bt_t;	/* Backtrace. */
typedef struct prof_cnt_s	prof_cnt_t;	/* Set of counters. */
typedef struct prof_thr_cnt_s	prof_thr_cnt_t;	/* Per thread counters. */
typedef struct prof_ctx_s	prof_ctx_t;	/* Per backtrace context. */
typedef struct prof_tdata_s	prof_tdata_t;	/* Per thread data. */
9
/* Option defaults. */
#define	PROF_PREFIX_DEFAULT		"jeprof"
#define	LG_PROF_SAMPLE_DEFAULT		19
/*
 * The negative default is parenthesized so that the expansion remains a single
 * operand regardless of the expression it is substituted into.
 */
#define	LG_PROF_INTERVAL_DEFAULT	(-1)
14
/*
 * Hard upper bound on the depth of a recorded stack backtrace.  The
 * prof_backtrace() variant built on __builtin_return_address() necessarily
 * contains a hard-coded number of backtrace frame handlers, and that number
 * should be kept in sync with this value.
 */
#define	PROF_BT_MAX			128

/* Maximum number of backtraces stored in each per thread LRU cache. */
#define	PROF_TCMAX			1024

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of the memory buffer used when writing dump files. */
#define	PROF_DUMP_BUFSIZE		65536

/* Size of the stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE		128

/*
 * Number of mutexes shared among all ctx's.  Over-provisioning is okay, because
 * no space is allocated for the mutexes unless profiling is enabled.
 */
#define	PROF_NCTX_LOCKS			1024
39
/*
 * Sentinel prof_tdata pointer values.  prof_tdata pointers close to NULL do not
 * designate prof_tdata_t objects; instead they encode state information that is
 * used for cleaning up during thread shutdown.  PROF_TDATA_STATE_MAX is the
 * largest such value, so a single "<=" comparison against it recognizes NULL
 * and every sentinel at once.
 */
#define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
#define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
#define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY
47
48 #endif /* JEMALLOC_H_TYPES */
49 /******************************************************************************/
50 #ifdef JEMALLOC_H_STRUCTS
51
/* A backtrace: an array of program counters plus its length. */
struct prof_bt_s {
	/* Program counters; len entries are valid. */
	void		**vec;
	/* Number of program counters stored in vec. */
	unsigned	len;
};
57
#ifdef JEMALLOC_PROF_LIBGCC
/*
 * Argument bundle handed to the callback functions that libgcc's
 * _Unwind_Backtrace() invokes.
 */
typedef struct {
	/* Backtrace associated with the unwind. */
	prof_bt_t	*bt;
	/* NOTE(review): presumably the number of frames to skip -- confirm. */
	unsigned	nignore;
	/* NOTE(review): presumably the frame count limit -- confirm. */
	unsigned	max;
} prof_unwind_data_t;
#endif
66
/* Profiling counters. */
struct prof_cnt_s {
	/*
	 * The cur* counters are signed because they can legitimately go
	 * negative: the allocation and the deallocation of an object may be
	 * accounted in different prof_thr_cnt_t objects that are linked into
	 * the same prof_ctx_t cnts_ql.
	 *
	 * The *bytes counters can in principle overflow/underflow.  A general
	 * solution would require something like 128-bit counters, and this
	 * implementation does not bother to solve that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};
82
83 struct prof_thr_cnt_s {
84 /* Linkage into prof_ctx_t's cnts_ql. */
85 ql_elm(prof_thr_cnt_t) cnts_link;
86
87 /* Linkage into thread's LRU. */
88 ql_elm(prof_thr_cnt_t) lru_link;
89
90 /*
91 * Associated context. If a thread frees an object that it did not
92 * allocate, it is possible that the context is not cached in the
93 * thread's hash table, in which case it must be able to look up the
94 * context, insert a new prof_thr_cnt_t into the thread's hash table,
95 * and link it into the prof_ctx_t's cnts_ql.
96 */
97 prof_ctx_t *ctx;
98
99 /*
100 * Threads use memory barriers to update the counters. Since there is
101 * only ever one writer, the only challenge is for the reader to get a
102 * consistent read of the counters.
103 *
104 * The writer uses this series of operations:
105 *
106 * 1) Increment epoch to an odd number.
107 * 2) Update counters.
108 * 3) Increment epoch to an even number.
109 *
110 * The reader must assure 1) that the epoch is even while it reads the
111 * counters, and 2) that the epoch doesn't change between the time it
112 * starts and finishes reading the counters.
113 */
114 unsigned epoch;
115
116 /* Profiling counters. */
117 prof_cnt_t cnts;
118 };
119
120 struct prof_ctx_s {
121 /* Associated backtrace. */
122 prof_bt_t *bt;
123
124 /* Protects nlimbo, cnt_merged, and cnts_ql. */
125 malloc_mutex_t *lock;
126
127 /*
128 * Number of threads that currently cause this ctx to be in a state of
129 * limbo due to one of:
130 * - Initializing per thread counters associated with this ctx.
131 * - Preparing to destroy this ctx.
132 * - Dumping a heap profile that includes this ctx.
133 * nlimbo must be 1 (single destroyer) in order to safely destroy the
134 * ctx.
135 */
136 unsigned nlimbo;
137
138 /* Temporary storage for summation during dump. */
139 prof_cnt_t cnt_summed;
140
141 /* When threads exit, they merge their stats into cnt_merged. */
142 prof_cnt_t cnt_merged;
143
144 /*
145 * List of profile counters, one for each thread that has allocated in
146 * this context.
147 */
148 ql_head(prof_thr_cnt_t) cnts_ql;
149
150 /* Linkage for list of contexts to be dumped. */
151 ql_elm(prof_ctx_t) dump_link;
152 };
153 typedef ql_head(prof_ctx_t) prof_ctx_list_t;
154
155 struct prof_tdata_s {
156 /*
157 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a
158 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
159 * objects. Other threads may read the prof_thr_cnt_t contents, but no
160 * others will ever write them.
161 *
162 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
163 * counter data into the associated prof_ctx_t objects, and unlink/free
164 * the prof_thr_cnt_t objects.
165 */
166 ckh_t bt2cnt;
167
168 /* LRU for contents of bt2cnt. */
169 ql_head(prof_thr_cnt_t) lru_ql;
170
171 /* Backtrace vector, used for calls to prof_backtrace(). */
172 void **vec;
173
174 /* Sampling state. */
175 uint64_t prng_state;
176 uint64_t threshold;
177 uint64_t accum;
178
179 /* State used to avoid dumping while operating on prof internals. */
180 bool enq;
181 bool enq_idump;
182 bool enq_gdump;
183 };
184
185 #endif /* JEMALLOC_H_STRUCTS */
186 /******************************************************************************/
187 #ifdef JEMALLOC_H_EXTERNS
188
extern bool	opt_prof;
/*
 * Sampling can be temporarily disabled, even while opt_prof is true, by
 * setting opt_prof_active to false.  opt_prof_active is updated without any
 * locking, so there are no guarantees regarding how long it will take for all
 * threads to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;   /* lg(mean bytes between samples). */
extern ssize_t	opt_lg_prof_interval; /* lg(prof_interval). */
extern bool	opt_prof_gdump;       /* High-water memory dumping. */
extern bool	opt_prof_final;       /* Final profile dumping. */
extern bool	opt_prof_leak;        /* Dump leak summary at exit. */
extern bool	opt_prof_accum;       /* Report cumulative bytes. */
extern char	opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];
209
/*
 * Profile dump interval, measured in bytes allocated.  A profile dump is
 * triggered by each arena when it reaches this threshold.  As a result, the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, with a maximum of
 * approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;

/*
 * If true, small sampled objects are promoted to large objects, because small
 * run headers do not have embedded profile context pointers.
 */
extern bool	prof_promote;
224
225 void bt_init(prof_bt_t *bt, void **vec);
226 void prof_backtrace(prof_bt_t *bt, unsigned nignore);
227 prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
228 #ifdef JEMALLOC_JET
229 size_t prof_bt_count(void);
230 typedef int (prof_dump_open_t)(bool, const char *);
231 extern prof_dump_open_t *prof_dump_open;
232 #endif
233 void prof_idump(void);
234 bool prof_mdump(const char *filename);
235 void prof_gdump(void);
236 prof_tdata_t *prof_tdata_init(void);
237 void prof_tdata_cleanup(void *arg);
238 void prof_boot0(void);
239 void prof_boot1(void);
240 bool prof_boot2(void);
241 void prof_prefork(void);
242 void prof_postfork_parent(void);
243 void prof_postfork_child(void);
244
245 #endif /* JEMALLOC_H_EXTERNS */
246 /******************************************************************************/
247 #ifdef JEMALLOC_H_INLINES
248
/*
 * Decide, before an allocation of usable size "size" takes place, whether the
 * allocation is to be sampled, and store the decision in "ret":
 *
 *   - NULL if no prof_tdata exists for the thread, or whatever prof_lookup()
 *     returns (possibly NULL) when a backtrace is captured.
 *   - (prof_thr_cnt_t *)1 if the allocation is not to be sampled.
 *   - Otherwise the prof_thr_cnt_t that prof_lookup() found for the captured
 *     backtrace.
 *
 * nignore is passed through to prof_backtrace().  "size" is evaluated more
 * than once and has its address taken, so it must be an lvalue without side
 * effects.
 */
#define	PROF_ALLOC_PREP(nignore, size, ret) do { \
	prof_tdata_t *prof_tdata; \
	prof_bt_t bt; \
 \
	assert(size == s2u(size)); \
 \
	prof_tdata = prof_tdata_get(true); \
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { \
		/* NULL stays NULL; a state sentinel means "not sampled". */ \
		ret = (prof_tdata == NULL) ? NULL : \
		    (prof_thr_cnt_t *)(uintptr_t)1U; \
		break; \
	} \
 \
	if (!opt_prof_active) { \
		/* Sampling is currently inactive, so avoid sampling. */ \
		ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
	} else if (opt_lg_prof_sample == 0) { \
		/* The sampling interval is 1, so every allocation is */ \
		/* sampled and the sampling logic can be bypassed. */ \
		bt_init(&bt, prof_tdata->vec); \
		prof_backtrace(&bt, nignore); \
		ret = prof_lookup(&bt); \
	} else { \
		if (prof_tdata->threshold == 0) { \
			/* First use: seed the prng differently for each */ \
			/* thread, then compute the initial threshold. */ \
			prof_tdata->prng_state = \
			    (uint64_t)(uintptr_t)&size; \
			prof_sample_threshold_update(prof_tdata); \
		} \
 \
		/* Capture a backtrace only if size is enough for accum */ \
		/* to reach threshold.  The sampling state itself is not */ \
		/* updated until prof_{m,re}alloc(), because it is not */ \
		/* yet known whether the allocation will succeed. */ \
		/* */ \
		/* The comparison subtracts rather than adds in order to */ \
		/* avoid potential integer overflow. */ \
		if (size >= prof_tdata->threshold - \
		    prof_tdata->accum) { \
			bt_init(&bt, prof_tdata->vec); \
			prof_backtrace(&bt, nignore); \
			ret = prof_lookup(&bt); \
		} else \
			ret = (prof_thr_cnt_t *)(uintptr_t)1U; \
	} \
} while (0)
300
301 #ifndef JEMALLOC_ENABLE_INLINE
302 malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)
303
304 prof_tdata_t *prof_tdata_get(bool create);
305 void prof_sample_threshold_update(prof_tdata_t *prof_tdata);
306 prof_ctx_t *prof_ctx_get(const void *ptr);
307 void prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
308 bool prof_sample_accum_update(size_t size);
309 void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
310 void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
311 size_t old_usize, prof_ctx_t *old_ctx);
312 void prof_free(const void *ptr, size_t size);
313 #endif
314
315 #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
316 /* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata,prof_tdata_t *)317 malloc_tsd_externs(prof_tdata, prof_tdata_t *)
318 malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
319 prof_tdata_cleanup)
320
321 JEMALLOC_INLINE prof_tdata_t *
322 prof_tdata_get(bool create)
323 {
324 prof_tdata_t *prof_tdata;
325
326 cassert(config_prof);
327
328 prof_tdata = *prof_tdata_tsd_get();
329 if (create && prof_tdata == NULL)
330 prof_tdata = prof_tdata_init();
331
332 return (prof_tdata);
333 }
334
335 JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t * prof_tdata)336 prof_sample_threshold_update(prof_tdata_t *prof_tdata)
337 {
338 /*
339 * The body of this function is compiled out unless heap profiling is
340 * enabled, so that it is possible to compile jemalloc with floating
341 * point support completely disabled. Avoiding floating point code is
342 * important on memory-constrained systems, but it also enables a
343 * workaround for versions of glibc that don't properly save/restore
344 * floating point registers during dynamic lazy symbol loading (which
345 * internally calls into whatever malloc implementation happens to be
346 * integrated into the application). Note that some compilers (e.g.
347 * gcc 4.8) may use floating point registers for fast memory moves, so
348 * jemalloc must be compiled with such optimizations disabled (e.g.
349 * -mno-sse) in order for the workaround to be complete.
350 */
351 #ifdef JEMALLOC_PROF
352 uint64_t r;
353 double u;
354
355 cassert(config_prof);
356
357 /*
358 * Compute sample threshold as a geometrically distributed random
359 * variable with mean (2^opt_lg_prof_sample).
360 *
361 * __ __
362 * | log(u) | 1
363 * prof_tdata->threshold = | -------- |, where p = -------------------
364 * | log(1-p) | opt_lg_prof_sample
365 * 2
366 *
367 * For more information on the math, see:
368 *
369 * Non-Uniform Random Variate Generation
370 * Luc Devroye
371 * Springer-Verlag, New York, 1986
372 * pp 500
373 * (http://luc.devroye.org/rnbookindex.html)
374 */
375 prng64(r, 53, prof_tdata->prng_state,
376 UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
377 u = (double)r * (1.0/9007199254740992.0L);
378 prof_tdata->threshold = (uint64_t)(log(u) /
379 log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
380 + (uint64_t)1U;
381 #endif
382 }
383
384 JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void * ptr)385 prof_ctx_get(const void *ptr)
386 {
387 prof_ctx_t *ret;
388 arena_chunk_t *chunk;
389
390 cassert(config_prof);
391 assert(ptr != NULL);
392
393 chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
394 if (chunk != ptr) {
395 /* Region. */
396 ret = arena_prof_ctx_get(ptr);
397 } else
398 ret = huge_prof_ctx_get(ptr);
399
400 return (ret);
401 }
402
403 JEMALLOC_INLINE void
prof_ctx_set(const void * ptr,size_t usize,prof_ctx_t * ctx)404 prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
405 {
406 arena_chunk_t *chunk;
407
408 cassert(config_prof);
409 assert(ptr != NULL);
410
411 chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
412 if (chunk != ptr) {
413 /* Region. */
414 arena_prof_ctx_set(ptr, usize, ctx);
415 } else
416 huge_prof_ctx_set(ptr, ctx);
417 }
418
419 JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)420 prof_sample_accum_update(size_t size)
421 {
422 prof_tdata_t *prof_tdata;
423
424 cassert(config_prof);
425 /* Sampling logic is unnecessary if the interval is 1. */
426 assert(opt_lg_prof_sample != 0);
427
428 prof_tdata = prof_tdata_get(false);
429 if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
430 return (true);
431
432 /* Take care to avoid integer overflow. */
433 if (size >= prof_tdata->threshold - prof_tdata->accum) {
434 prof_tdata->accum -= (prof_tdata->threshold - size);
435 /* Compute new sample threshold. */
436 prof_sample_threshold_update(prof_tdata);
437 while (prof_tdata->accum >= prof_tdata->threshold) {
438 prof_tdata->accum -= prof_tdata->threshold;
439 prof_sample_threshold_update(prof_tdata);
440 }
441 return (false);
442 } else {
443 prof_tdata->accum += size;
444 return (true);
445 }
446 }
447
448 JEMALLOC_INLINE void
prof_malloc(const void * ptr,size_t usize,prof_thr_cnt_t * cnt)449 prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
450 {
451
452 cassert(config_prof);
453 assert(ptr != NULL);
454 assert(usize == isalloc(ptr, true));
455
456 if (opt_lg_prof_sample != 0) {
457 if (prof_sample_accum_update(usize)) {
458 /*
459 * Don't sample. For malloc()-like allocation, it is
460 * always possible to tell in advance how large an
461 * object's usable size will be, so there should never
462 * be a difference between the usize passed to
463 * PROF_ALLOC_PREP() and prof_malloc().
464 */
465 assert((uintptr_t)cnt == (uintptr_t)1U);
466 }
467 }
468
469 if ((uintptr_t)cnt > (uintptr_t)1U) {
470 prof_ctx_set(ptr, usize, cnt->ctx);
471
472 cnt->epoch++;
473 /*********/
474 mb_write();
475 /*********/
476 cnt->cnts.curobjs++;
477 cnt->cnts.curbytes += usize;
478 if (opt_prof_accum) {
479 cnt->cnts.accumobjs++;
480 cnt->cnts.accumbytes += usize;
481 }
482 /*********/
483 mb_write();
484 /*********/
485 cnt->epoch++;
486 /*********/
487 mb_write();
488 /*********/
489 } else
490 prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
491 }
492
493 JEMALLOC_INLINE void
prof_realloc(const void * ptr,size_t usize,prof_thr_cnt_t * cnt,size_t old_usize,prof_ctx_t * old_ctx)494 prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
495 size_t old_usize, prof_ctx_t *old_ctx)
496 {
497 prof_thr_cnt_t *told_cnt;
498
499 cassert(config_prof);
500 assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
501
502 if (ptr != NULL) {
503 assert(usize == isalloc(ptr, true));
504 if (opt_lg_prof_sample != 0) {
505 if (prof_sample_accum_update(usize)) {
506 /*
507 * Don't sample. The usize passed to
508 * PROF_ALLOC_PREP() was larger than what
509 * actually got allocated, so a backtrace was
510 * captured for this allocation, even though
511 * its actual usize was insufficient to cross
512 * the sample threshold.
513 */
514 cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
515 }
516 }
517 }
518
519 if ((uintptr_t)old_ctx > (uintptr_t)1U) {
520 told_cnt = prof_lookup(old_ctx->bt);
521 if (told_cnt == NULL) {
522 /*
523 * It's too late to propagate OOM for this realloc(),
524 * so operate directly on old_cnt->ctx->cnt_merged.
525 */
526 malloc_mutex_lock(old_ctx->lock);
527 old_ctx->cnt_merged.curobjs--;
528 old_ctx->cnt_merged.curbytes -= old_usize;
529 malloc_mutex_unlock(old_ctx->lock);
530 told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
531 }
532 } else
533 told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
534
535 if ((uintptr_t)told_cnt > (uintptr_t)1U)
536 told_cnt->epoch++;
537 if ((uintptr_t)cnt > (uintptr_t)1U) {
538 prof_ctx_set(ptr, usize, cnt->ctx);
539 cnt->epoch++;
540 } else if (ptr != NULL)
541 prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
542 /*********/
543 mb_write();
544 /*********/
545 if ((uintptr_t)told_cnt > (uintptr_t)1U) {
546 told_cnt->cnts.curobjs--;
547 told_cnt->cnts.curbytes -= old_usize;
548 }
549 if ((uintptr_t)cnt > (uintptr_t)1U) {
550 cnt->cnts.curobjs++;
551 cnt->cnts.curbytes += usize;
552 if (opt_prof_accum) {
553 cnt->cnts.accumobjs++;
554 cnt->cnts.accumbytes += usize;
555 }
556 }
557 /*********/
558 mb_write();
559 /*********/
560 if ((uintptr_t)told_cnt > (uintptr_t)1U)
561 told_cnt->epoch++;
562 if ((uintptr_t)cnt > (uintptr_t)1U)
563 cnt->epoch++;
564 /*********/
565 mb_write(); /* Not strictly necessary. */
566 }
567
568 JEMALLOC_INLINE void
prof_free(const void * ptr,size_t size)569 prof_free(const void *ptr, size_t size)
570 {
571 prof_ctx_t *ctx = prof_ctx_get(ptr);
572
573 cassert(config_prof);
574
575 if ((uintptr_t)ctx > (uintptr_t)1) {
576 prof_thr_cnt_t *tcnt;
577 assert(size == isalloc(ptr, true));
578 tcnt = prof_lookup(ctx->bt);
579
580 if (tcnt != NULL) {
581 tcnt->epoch++;
582 /*********/
583 mb_write();
584 /*********/
585 tcnt->cnts.curobjs--;
586 tcnt->cnts.curbytes -= size;
587 /*********/
588 mb_write();
589 /*********/
590 tcnt->epoch++;
591 /*********/
592 mb_write();
593 /*********/
594 } else {
595 /*
596 * OOM during free() cannot be propagated, so operate
597 * directly on cnt->ctx->cnt_merged.
598 */
599 malloc_mutex_lock(ctx->lock);
600 ctx->cnt_merged.curobjs--;
601 ctx->cnt_merged.curbytes -= size;
602 malloc_mutex_unlock(ctx->lock);
603 }
604 }
605 }
606 #endif
607
608 #endif /* JEMALLOC_H_INLINES */
609 /******************************************************************************/
610