xref: /trueos/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 8943816bb4812ac55b5f3738b955ac07db05a3b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
25  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26  */
27 
28 /*
29  * DVA-based Adjustable Replacement Cache
30  *
31  * While much of the theory of operation used here is
32  * based on the self-tuning, low overhead replacement cache
33  * presented by Megiddo and Modha at FAST 2003, there are some
34  * significant differences:
35  *
36  * 1. The Megiddo and Modha model assumes any page is evictable.
37  * Pages in its cache cannot be "locked" into memory.  This makes
38  * the eviction algorithm simple: evict the last page in the list.
39  * This also makes the performance characteristics easy to reason
40  * about.  Our cache is not so simple.  At any given moment, some
41  * subset of the blocks in the cache are un-evictable because we
42  * have handed out a reference to them.  Blocks are only evictable
43  * when there are no external references active.  This makes
44  * eviction far more problematic:  we choose to evict the evictable
45  * blocks that are the "lowest" in the list.
46  *
47  * There are times when it is not possible to evict the requested
48  * space.  In these circumstances we are unable to adjust the cache
49  * size.  To prevent the cache growing unbounded at these times we
50  * implement a "cache throttle" that slows the flow of new data
51  * into the cache until we can make space available.
52  *
53  * 2. The Megiddo and Modha model assumes a fixed cache size.
54  * Pages are evicted when the cache is full and there is a cache
55  * miss.  Our model has a variable sized cache.  It grows with
56  * high use, but also tries to react to memory pressure from the
57  * operating system: decreasing its size when system memory is
58  * tight.
59  *
60  * 3. The Megiddo and Modha model assumes a fixed page size. All
61  * elements of the cache are therefore exactly the same size.  So
62  * when adjusting the cache size following a cache miss, it's simply
63  * a matter of choosing a single page to evict.  In our model, we
64  * have variable sized cache blocks (ranging from 512 bytes to
65  * 128K bytes).  We therefore choose a set of blocks to evict to make
66  * space for a cache miss that approximates as closely as possible
67  * the space used by the new block.
68  *
69  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70  * by N. Megiddo & D. Modha, FAST 2003
71  */
72 
73 /*
74  * The locking model:
75  *
76  * A new reference to a cache buffer can be obtained in two
77  * ways: 1) via a hash table lookup using the DVA as a key,
78  * or 2) via one of the ARC lists.  The arc_read() interface
79  * uses method 1, while the internal arc algorithms for
80  * adjusting the cache use method 2.  We therefore provide two
81  * types of locks: 1) the hash table lock array, and 2) the
82  * arc list locks.
83  *
84  * Buffers do not have their own mutexes, rather they rely on the
85  * hash table mutexes for the bulk of their protection (i.e. most
86  * fields in the arc_buf_hdr_t are protected by these mutexes).
87  *
88  * buf_hash_find() returns the appropriate mutex (held) when it
89  * locates the requested buffer in the hash table.  It returns
90  * NULL for the mutex if the buffer was not in the table.
91  *
92  * buf_hash_remove() expects the appropriate hash mutex to be
93  * already held before it is invoked.
94  *
95  * Each arc state also has a mutex which is used to protect the
96  * buffer list associated with the state.  When attempting to
97  * obtain a hash table lock while holding an arc list lock you
98  * must use: mutex_tryenter() to avoid deadlock.  Also note that
99  * the active state mutex must be held before the ghost state mutex.
100  *
101  * Arc buffers may have an associated eviction callback function.
102  * This function will be invoked prior to removing the buffer (e.g.
103  * in arc_do_user_evicts()).  Note however that the data associated
104  * with the buffer may be evicted prior to the callback.  The callback
105  * must be made with *no locks held* (to prevent deadlock).  Additionally,
106  * the users of callbacks must ensure that their private data is
107  * protected from simultaneous callbacks from arc_clear_callback()
108  * and arc_do_user_evicts().
109  *
110  * Note that the majority of the performance stats are manipulated
111  * with atomic operations.
112  *
113  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114  *
115  *	- L2ARC buflist creation
116  *	- L2ARC buflist eviction
117  *	- L2ARC write completion, which walks L2ARC buflists
118  *	- ARC header destruction, as it removes from L2ARC buflists
119  *	- ARC header release, as it removes from L2ARC buflists
120  */
121 
122 #include <sys/spa.h>
123 #include <sys/zio.h>
124 #include <sys/zio_compress.h>
125 #include <sys/zfs_context.h>
126 #include <sys/arc.h>
127 #include <sys/refcount.h>
128 #include <sys/vdev.h>
129 #include <sys/vdev_impl.h>
130 #include <sys/dsl_pool.h>
131 #ifdef _KERNEL
132 #include <sys/dnlc.h>
133 #endif
134 #include <sys/callb.h>
135 #include <sys/kstat.h>
136 #include <sys/trim_map.h>
137 #include <zfs_fletcher.h>
138 #include <sys/sdt.h>
139 
140 #include <vm/vm_pageout.h>
141 #include <machine/vmparam.h>
142 
#if defined(illumos) && !defined(_KERNEL)
/*
 * Userland-only debugging aid: ZFS_DEBUG=watch turns on watchpoints for
 * frozen buffers.
 */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif /* illumos && !_KERNEL */
150 
151 static kmutex_t		arc_reclaim_thr_lock;
152 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
153 static uint8_t		arc_thread_exit;
154 
155 #define	ARC_REDUCE_DNLC_PERCENT	3
156 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
157 
158 typedef enum arc_reclaim_strategy {
159 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
160 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
161 } arc_reclaim_strategy_t;
162 
163 /*
164  * The number of iterations through arc_evict_*() before we
165  * drop & reacquire the lock.
166  */
167 int arc_evict_iterations = 100;
168 
169 /* number of seconds before growing cache again */
170 static int		arc_grow_retry = 60;
171 
172 /* shift of arc_c for calculating both min and max arc_p */
173 static int		arc_p_min_shift = 4;
174 
175 /* log2(fraction of arc to reclaim) */
176 static int		arc_shrink_shift = 5;
177 
178 /*
179  * minimum lifespan of a prefetch block in clock ticks
180  * (initialized in arc_init())
181  */
182 static int		arc_min_prefetch_lifespan;
183 
184 /*
185  * If this percent of memory is free, don't throttle.
186  */
187 int arc_lotsfree_percent = 10;
188 
189 static int arc_dead;
190 extern int zfs_prefetch_disable;
191 
192 /*
193  * The arc has filled available memory and has now warmed up.
194  */
195 static boolean_t arc_warm;
196 
197 uint64_t zfs_arc_max;
198 uint64_t zfs_arc_min;
199 uint64_t zfs_arc_meta_limit = 0;
200 int zfs_arc_grow_retry = 0;
201 int zfs_arc_shrink_shift = 0;
202 int zfs_arc_p_min_shift = 0;
203 int zfs_disable_dup_eviction = 0;
204 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
205 u_int zfs_arc_free_target = 0;
206 
207 static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
208 static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
209 
210 #ifdef _KERNEL
211 static void
arc_free_target_init(void * unused __unused)212 arc_free_target_init(void *unused __unused)
213 {
214 
215 	zfs_arc_free_target = vm_pageout_wakeup_thresh;
216 }
217 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
218     arc_free_target_init, NULL);
219 
220 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
221 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
222 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
223 TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize);
224 TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
225 SYSCTL_DECL(_vfs_zfs);
226 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
227     "Maximum ARC size");
228 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
229     "Minimum ARC size");
230 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
231     &zfs_arc_average_blocksize, 0,
232     "ARC average blocksize");
233 SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
234     &arc_shrink_shift, 0,
235     "log2(fraction of arc to reclaim)");
236 
237 /*
238  * We don't have a tunable for arc_free_target due to the dependency on
239  * pagedaemon initialisation.
240  */
241 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
242     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
243     sysctl_vfs_zfs_arc_free_target, "IU",
244     "Desired number of free pages below which ARC triggers reclaim");
245 
246 static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)247 sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
248 {
249 	u_int val;
250 	int err;
251 
252 	val = zfs_arc_free_target;
253 	err = sysctl_handle_int(oidp, &val, 0, req);
254 	if (err != 0 || req->newptr == NULL)
255 		return (err);
256 
257 	if (val < minfree)
258 		return (EINVAL);
259 	if (val > cnt.v_page_count)
260 		return (EINVAL);
261 
262 	zfs_arc_free_target = val;
263 
264 	return (0);
265 }
266 
267 /*
268  * Must be declared here, before the definition of corresponding kstat
269  * macro which uses the same names will confuse the compiler.
270  */
271 SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
272     CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
273     sysctl_vfs_zfs_arc_meta_limit, "QU",
274     "ARC metadata limit");
275 #endif
276 
277 /*
278  * Note that buffers can be in one of 6 states:
279  *	ARC_anon	- anonymous (discussed below)
280  *	ARC_mru		- recently used, currently cached
281  *	ARC_mru_ghost	- recently used, no longer in cache
282  *	ARC_mfu		- frequently used, currently cached
283  *	ARC_mfu_ghost	- frequently used, no longer in cache
284  *	ARC_l2c_only	- exists in L2ARC but not other states
285  * When there are no active references to the buffer, they
286  * are linked onto a list in one of these arc states.  These are
287  * the only buffers that can be evicted or deleted.  Within each
288  * state there are multiple lists, one for meta-data and one for
289  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
290  * etc.) is tracked separately so that it can be managed more
291  * explicitly: favored over data, limited explicitly.
292  *
293  * Anonymous buffers are buffers that are not associated with
294  * a DVA.  These are buffers that hold dirty block copies
295  * before they are written to stable storage.  By definition,
296  * they are "ref'd" and are considered part of arc_mru
297  * that cannot be freed.  Generally, they will acquire a DVA
298  * as they are written and migrate onto the arc_mru list.
299  *
300  * The ARC_l2c_only state is for buffers that are in the second
301  * level ARC but no longer in any of the ARC_m* lists.  The second
302  * level ARC itself may also contain buffers that are in any of
303  * the ARC_m* states - meaning that a buffer can exist in two
304  * places.  The reason for the ARC_l2c_only state is to keep the
305  * buffer header in the hash table, so that reads that hit the
306  * second level ARC benefit from these fast lookups.
307  */
308 
309 #define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
310 struct arcs_lock {
311 	kmutex_t	arcs_lock;
312 #ifdef _KERNEL
313 	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
314 #endif
315 };
316 
317 /*
318  * must be power of two for mask use to work
319  *
320  */
321 #define ARC_BUFC_NUMDATALISTS		16
322 #define ARC_BUFC_NUMMETADATALISTS	16
323 #define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
324 
325 typedef struct arc_state {
326 	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
327 	uint64_t arcs_size;	/* total amount of data in this state */
328 	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
329 	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
330 } arc_state_t;
331 
332 #define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
333 
334 /* The 6 states: */
335 static arc_state_t ARC_anon;
336 static arc_state_t ARC_mru;
337 static arc_state_t ARC_mru_ghost;
338 static arc_state_t ARC_mfu;
339 static arc_state_t ARC_mfu_ghost;
340 static arc_state_t ARC_l2c_only;
341 
342 typedef struct arc_stats {
343 	kstat_named_t arcstat_hits;
344 	kstat_named_t arcstat_misses;
345 	kstat_named_t arcstat_demand_data_hits;
346 	kstat_named_t arcstat_demand_data_misses;
347 	kstat_named_t arcstat_demand_metadata_hits;
348 	kstat_named_t arcstat_demand_metadata_misses;
349 	kstat_named_t arcstat_prefetch_data_hits;
350 	kstat_named_t arcstat_prefetch_data_misses;
351 	kstat_named_t arcstat_prefetch_metadata_hits;
352 	kstat_named_t arcstat_prefetch_metadata_misses;
353 	kstat_named_t arcstat_mru_hits;
354 	kstat_named_t arcstat_mru_ghost_hits;
355 	kstat_named_t arcstat_mfu_hits;
356 	kstat_named_t arcstat_mfu_ghost_hits;
357 	kstat_named_t arcstat_allocated;
358 	kstat_named_t arcstat_deleted;
359 	kstat_named_t arcstat_stolen;
360 	kstat_named_t arcstat_recycle_miss;
361 	/*
362 	 * Number of buffers that could not be evicted because the hash lock
363 	 * was held by another thread.  The lock may not necessarily be held
364 	 * by something using the same buffer, since hash locks are shared
365 	 * by multiple buffers.
366 	 */
367 	kstat_named_t arcstat_mutex_miss;
368 	/*
369 	 * Number of buffers skipped because they have I/O in progress, are
370 	 * indrect prefetch buffers that have not lived long enough, or are
371 	 * not from the spa we're trying to evict from.
372 	 */
373 	kstat_named_t arcstat_evict_skip;
374 	kstat_named_t arcstat_evict_l2_cached;
375 	kstat_named_t arcstat_evict_l2_eligible;
376 	kstat_named_t arcstat_evict_l2_ineligible;
377 	kstat_named_t arcstat_hash_elements;
378 	kstat_named_t arcstat_hash_elements_max;
379 	kstat_named_t arcstat_hash_collisions;
380 	kstat_named_t arcstat_hash_chains;
381 	kstat_named_t arcstat_hash_chain_max;
382 	kstat_named_t arcstat_p;
383 	kstat_named_t arcstat_c;
384 	kstat_named_t arcstat_c_min;
385 	kstat_named_t arcstat_c_max;
386 	kstat_named_t arcstat_size;
387 	kstat_named_t arcstat_hdr_size;
388 	kstat_named_t arcstat_data_size;
389 	kstat_named_t arcstat_other_size;
390 	kstat_named_t arcstat_l2_hits;
391 	kstat_named_t arcstat_l2_misses;
392 	kstat_named_t arcstat_l2_feeds;
393 	kstat_named_t arcstat_l2_rw_clash;
394 	kstat_named_t arcstat_l2_read_bytes;
395 	kstat_named_t arcstat_l2_write_bytes;
396 	kstat_named_t arcstat_l2_writes_sent;
397 	kstat_named_t arcstat_l2_writes_done;
398 	kstat_named_t arcstat_l2_writes_error;
399 	kstat_named_t arcstat_l2_writes_hdr_miss;
400 	kstat_named_t arcstat_l2_evict_lock_retry;
401 	kstat_named_t arcstat_l2_evict_reading;
402 	kstat_named_t arcstat_l2_free_on_write;
403 	kstat_named_t arcstat_l2_cdata_free_on_write;
404 	kstat_named_t arcstat_l2_abort_lowmem;
405 	kstat_named_t arcstat_l2_cksum_bad;
406 	kstat_named_t arcstat_l2_io_error;
407 	kstat_named_t arcstat_l2_size;
408 	kstat_named_t arcstat_l2_asize;
409 	kstat_named_t arcstat_l2_hdr_size;
410 	kstat_named_t arcstat_l2_compress_successes;
411 	kstat_named_t arcstat_l2_compress_zeros;
412 	kstat_named_t arcstat_l2_compress_failures;
413 	kstat_named_t arcstat_l2_write_trylock_fail;
414 	kstat_named_t arcstat_l2_write_passed_headroom;
415 	kstat_named_t arcstat_l2_write_spa_mismatch;
416 	kstat_named_t arcstat_l2_write_in_l2;
417 	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
418 	kstat_named_t arcstat_l2_write_not_cacheable;
419 	kstat_named_t arcstat_l2_write_full;
420 	kstat_named_t arcstat_l2_write_buffer_iter;
421 	kstat_named_t arcstat_l2_write_pios;
422 	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
423 	kstat_named_t arcstat_l2_write_buffer_list_iter;
424 	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
425 	kstat_named_t arcstat_memory_throttle_count;
426 	kstat_named_t arcstat_duplicate_buffers;
427 	kstat_named_t arcstat_duplicate_buffers_size;
428 	kstat_named_t arcstat_duplicate_reads;
429 	kstat_named_t arcstat_meta_used;
430 	kstat_named_t arcstat_meta_limit;
431 	kstat_named_t arcstat_meta_max;
432 } arc_stats_t;
433 
434 static arc_stats_t arc_stats = {
435 	{ "hits",			KSTAT_DATA_UINT64 },
436 	{ "misses",			KSTAT_DATA_UINT64 },
437 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
438 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
439 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
440 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
441 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
442 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
443 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
444 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
445 	{ "mru_hits",			KSTAT_DATA_UINT64 },
446 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
447 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
448 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
449 	{ "allocated",			KSTAT_DATA_UINT64 },
450 	{ "deleted",			KSTAT_DATA_UINT64 },
451 	{ "stolen",			KSTAT_DATA_UINT64 },
452 	{ "recycle_miss",		KSTAT_DATA_UINT64 },
453 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
454 	{ "evict_skip",			KSTAT_DATA_UINT64 },
455 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
456 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
457 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
458 	{ "hash_elements",		KSTAT_DATA_UINT64 },
459 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
460 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
461 	{ "hash_chains",		KSTAT_DATA_UINT64 },
462 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
463 	{ "p",				KSTAT_DATA_UINT64 },
464 	{ "c",				KSTAT_DATA_UINT64 },
465 	{ "c_min",			KSTAT_DATA_UINT64 },
466 	{ "c_max",			KSTAT_DATA_UINT64 },
467 	{ "size",			KSTAT_DATA_UINT64 },
468 	{ "hdr_size",			KSTAT_DATA_UINT64 },
469 	{ "data_size",			KSTAT_DATA_UINT64 },
470 	{ "other_size",			KSTAT_DATA_UINT64 },
471 	{ "l2_hits",			KSTAT_DATA_UINT64 },
472 	{ "l2_misses",			KSTAT_DATA_UINT64 },
473 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
474 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
475 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
476 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
477 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
478 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
479 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
480 	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
481 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
482 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
483 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
484 	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
485 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
486 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
487 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
488 	{ "l2_size",			KSTAT_DATA_UINT64 },
489 	{ "l2_asize",			KSTAT_DATA_UINT64 },
490 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
491 	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
492 	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
493 	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
494 	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
495 	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
496 	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
497 	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
498 	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
499 	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
500 	{ "l2_write_full",		KSTAT_DATA_UINT64 },
501 	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
502 	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
503 	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
504 	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
505 	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
506 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
507 	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
508 	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
509 	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
510 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
511 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
512 	{ "arc_meta_max",		KSTAT_DATA_UINT64 }
513 };
514 
/* The 64-bit value of one ARC statistic, usable as an lvalue. */
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

/* Add val to a statistic using atomic_add_64(). */
#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

/* Step a statistic up or down by one. */
#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

/*
 * Raise a statistic to val when val is larger.  The compare-and-swap is
 * retried until it succeeds or the stored value is no longer below val.
 * val is evaluated more than once.
 */
#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

/* Fold the current value of stat into its companion stat_max. */
#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * Bump one of four statistics selected by two independent conditions.
 * The statistic name is assembled as arcstat_<a>_<b>_<stat>, where <a> is
 * stat1 or notstat1 depending on cond1 and <b> is stat2 or notstat2
 * depending on cond2.  Used for both hits and misses, this gives eight
 * statistics in total.
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
552 
553 kstat_t			*arc_ksp;
554 static arc_state_t	*arc_anon;
555 static arc_state_t	*arc_mru;
556 static arc_state_t	*arc_mru_ghost;
557 static arc_state_t	*arc_mfu;
558 static arc_state_t	*arc_mfu_ghost;
559 static arc_state_t	*arc_l2c_only;
560 
561 /*
562  * There are several ARC variables that are critical to export as kstats --
563  * but we don't want to have to grovel around in the kstat whenever we wish to
564  * manipulate them.  For these variables, we therefore define them to be in
565  * terms of the statistic variable.  This assures that we are not introducing
566  * the possibility of inconsistency by having shadow copies of the variables,
567  * while still allowing the code to be readable.
568  */
569 #define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
570 #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
571 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
572 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
573 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
574 #define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
575 #define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
576 #define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
577 
578 #define	L2ARC_IS_VALID_COMPRESS(_c_) \
579 	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
580 
581 static int		arc_no_grow;	/* Don't try to grow cache size */
582 static uint64_t		arc_tempreserve;
583 static uint64_t		arc_loaned_bytes;
584 
585 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
586 
587 typedef struct arc_callback arc_callback_t;
588 
589 struct arc_callback {
590 	void			*acb_private;
591 	arc_done_func_t		*acb_done;
592 	arc_buf_t		*acb_buf;
593 	zio_t			*acb_zio_dummy;
594 	arc_callback_t		*acb_next;
595 };
596 
597 typedef struct arc_write_callback arc_write_callback_t;
598 
599 struct arc_write_callback {
600 	void		*awcb_private;
601 	arc_done_func_t	*awcb_ready;
602 	arc_done_func_t	*awcb_physdone;
603 	arc_done_func_t	*awcb_done;
604 	arc_buf_t	*awcb_buf;
605 };
606 
607 struct arc_buf_hdr {
608 	/* protected by hash lock */
609 	dva_t			b_dva;
610 	uint64_t		b_birth;
611 	uint64_t		b_cksum0;
612 
613 	kmutex_t		b_freeze_lock;
614 	zio_cksum_t		*b_freeze_cksum;
615 	void			*b_thawed;
616 
617 	arc_buf_hdr_t		*b_hash_next;
618 	arc_buf_t		*b_buf;
619 	arc_flags_t		b_flags;
620 	uint32_t		b_datacnt;
621 
622 	arc_callback_t		*b_acb;
623 	kcondvar_t		b_cv;
624 
625 	/* immutable */
626 	arc_buf_contents_t	b_type;
627 	uint64_t		b_size;
628 	uint64_t		b_spa;
629 
630 	/* protected by arc state mutex */
631 	arc_state_t		*b_state;
632 	list_node_t		b_arc_node;
633 
634 	/* updated atomically */
635 	clock_t			b_arc_access;
636 
637 	/* self protecting */
638 	refcount_t		b_refcnt;
639 
640 	l2arc_buf_hdr_t		*b_l2hdr;
641 	list_node_t		b_l2node;
642 };
643 
#ifdef _KERNEL
/*
 * Sysctl handler for vfs.zfs.arc_meta_limit.
 *
 * Hands the current arc_meta_limit to sysctl_handle_64().  When the
 * request carries a new value, it is stored only if it is non-zero and
 * does not exceed arc_c_max; otherwise EINVAL is returned and the limit
 * is left unchanged.
 *
 * Returns 0 on success, EINVAL for an out-of-range value, or the error
 * reported by sysctl_handle_64().
 */
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = arc_meta_limit;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	/* val is unsigned, so zero is the only value below the valid range. */
	if (val == 0 || val > arc_c_max)
		return (EINVAL);

	arc_meta_limit = val;
	return (0);
}
#endif
663 
664 static arc_buf_t *arc_eviction_list;
665 static kmutex_t arc_eviction_mtx;
666 static arc_buf_hdr_t arc_eviction_hdr;
667 
668 #define	GHOST_STATE(state)	\
669 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
670 	(state) == arc_l2c_only)
671 
672 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
673 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
674 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
675 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
676 #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
677 #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
678 #define	HDR_FREE_IN_PROGRESS(hdr)	\
679 	((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS)
680 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
681 #define	HDR_L2_READING(hdr)	\
682 	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS &&	\
683 	    (hdr)->b_l2hdr != NULL)
684 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
685 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
686 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
687 
688 /*
689  * Other sizes
690  */
691 
692 #define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
693 #define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
694 
695 /*
696  * Hash table routines
697  */
698 
699 #define	HT_LOCK_PAD	CACHE_LINE_SIZE
700 
701 struct ht_lock {
702 	kmutex_t	ht_lock;
703 #ifdef _KERNEL
704 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
705 #endif
706 };
707 
708 #define	BUF_LOCKS 256
709 typedef struct buf_hash_table {
710 	uint64_t ht_mask;
711 	arc_buf_hdr_t **ht_table;
712 	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
713 } buf_hash_table_t;
714 
715 static buf_hash_table_t buf_hash_table;
716 
/* Index of the hash bucket for (spa, dva, birth). */
#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
/*
 * Lock entry for bucket idx.  idx is parenthesized so that an argument
 * containing a lower-precedence operator (e.g. `a | b`) is masked as a
 * whole instead of being split by the `&`.
 */
#define	BUF_HASH_LOCK_NTRY(idx) \
	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
/* Address of the mutex for bucket idx. */
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
/* Address of the hash mutex covering hdr; hdr may be any pointer expression. */
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
	    (hdr)->b_birth)))

uint64_t zfs_crc64_table[256];
725 
726 /*
727  * Level 2 ARC
728  */
729 
730 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
731 #define	L2ARC_HEADROOM		2			/* num of writes */
732 /*
733  * If we discover during ARC scan any buffers to be compressed, we boost
734  * our headroom for the next scanning cycle by this percentage multiple.
735  */
736 #define	L2ARC_HEADROOM_BOOST	200
737 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
738 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
739 
740 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
741 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
742 
743 /* L2ARC Performance Tunables */
744 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
745 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
746 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
747 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
748 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
749 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
750 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
751 boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
752 boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
753 
754 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
755     &l2arc_write_max, 0, "max write size");
756 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
757     &l2arc_write_boost, 0, "extra write during warmup");
758 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
759     &l2arc_headroom, 0, "number of dev writes");
760 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
761     &l2arc_feed_secs, 0, "interval seconds");
762 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
763     &l2arc_feed_min_ms, 0, "min interval milliseconds");
764 
765 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
766     &l2arc_noprefetch, 0, "don't cache prefetch bufs");
767 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
768     &l2arc_feed_again, 0, "turbo warmup");
769 SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
770     &l2arc_norw, 0, "no reads during writes");
771 
772 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
773     &ARC_anon.arcs_size, 0, "size of anonymous state");
774 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
775     &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
776 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
777     &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
778 
779 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
780     &ARC_mru.arcs_size, 0, "size of mru state");
781 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
782     &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
783 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
784     &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
785 
786 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
787     &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
788 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
789     &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
790     "size of metadata in mru ghost state");
791 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
792     &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
793     "size of data in mru ghost state");
794 
795 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
796     &ARC_mfu.arcs_size, 0, "size of mfu state");
797 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
798     &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
799 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
800     &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
801 
802 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
803     &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
804 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
805     &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
806     "size of metadata in mfu ghost state");
807 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
808     &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
809     "size of data in mfu ghost state");
810 
811 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
812     &ARC_l2c_only.arcs_size, 0, "size of mru state");
813 
814 /*
815  * L2ARC Internals
816  */
817 typedef struct l2arc_dev {
818 	vdev_t			*l2ad_vdev;	/* vdev */
819 	spa_t			*l2ad_spa;	/* spa */
820 	uint64_t		l2ad_hand;	/* next write location */
821 	uint64_t		l2ad_start;	/* first addr on device */
822 	uint64_t		l2ad_end;	/* last addr on device */
823 	uint64_t		l2ad_evict;	/* last addr eviction reached */
824 	boolean_t		l2ad_first;	/* first sweep through */
825 	boolean_t		l2ad_writing;	/* currently writing */
826 	list_t			*l2ad_buflist;	/* buffer list */
827 	list_node_t		l2ad_node;	/* device list node */
828 } l2arc_dev_t;
829 
830 static list_t L2ARC_dev_list;			/* device list */
831 static list_t *l2arc_dev_list;			/* device list pointer */
832 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
833 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
834 static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
835 static list_t L2ARC_free_on_write;		/* free after write buf list */
836 static list_t *l2arc_free_on_write;		/* free after write list ptr */
837 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
838 static uint64_t l2arc_ndev;			/* number of devices */
839 
840 typedef struct l2arc_read_callback {
841 	arc_buf_t		*l2rcb_buf;		/* read buffer */
842 	spa_t			*l2rcb_spa;		/* spa */
843 	blkptr_t		l2rcb_bp;		/* original blkptr */
844 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
845 	int			l2rcb_flags;		/* original flags */
846 	enum zio_compress	l2rcb_compress;		/* applied compress */
847 } l2arc_read_callback_t;
848 
849 typedef struct l2arc_write_callback {
850 	l2arc_dev_t	*l2wcb_dev;		/* device info */
851 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
852 } l2arc_write_callback_t;
853 
854 struct l2arc_buf_hdr {
855 	/* protected by arc_buf_hdr  mutex */
856 	l2arc_dev_t		*b_dev;		/* L2ARC device */
857 	uint64_t		b_daddr;	/* disk address, offset byte */
858 	/* compression applied to buffer data */
859 	enum zio_compress	b_compress;
860 	/* real alloc'd buffer size depending on b_compress applied */
861 	int			b_asize;
862 	/* temporary buffer holder for in-flight compressed data */
863 	void			*b_tmp_cdata;
864 };
865 
866 typedef struct l2arc_data_free {
867 	/* protected by l2arc_free_on_write_mtx */
868 	void		*l2df_data;
869 	size_t		l2df_size;
870 	void		(*l2df_func)(void *, size_t);
871 	list_node_t	l2df_list_node;
872 } l2arc_data_free_t;
873 
874 static kmutex_t l2arc_feed_thr_lock;
875 static kcondvar_t l2arc_feed_thr_cv;
876 static uint8_t l2arc_thread_exit;
877 
878 static void arc_get_data_buf(arc_buf_t *);
879 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
880 static int arc_evict_needed(arc_buf_contents_t);
881 static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
882 static void arc_buf_watch(arc_buf_t *);
883 
884 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
885 static void l2arc_read_done(zio_t *);
886 static void l2arc_hdr_stat_add(void);
887 static void l2arc_hdr_stat_remove(void);
888 
889 static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *);
890 static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
891 static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
892 
893 static uint64_t
buf_hash(uint64_t spa,const dva_t * dva,uint64_t birth)894 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
895 {
896 	uint8_t *vdva = (uint8_t *)dva;
897 	uint64_t crc = -1ULL;
898 	int i;
899 
900 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
901 
902 	for (i = 0; i < sizeof (dva_t); i++)
903 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
904 
905 	crc ^= (spa>>8) ^ birth;
906 
907 	return (crc);
908 }
909 
/*
 * True when the header has no identity: both DVA words and b_cksum0
 * are zero.
 */
#define	BUF_EMPTY(buf)						\
	(!((buf)->b_dva.dva_word[0] != 0 ||			\
	(buf)->b_dva.dva_word[1] != 0 ||			\
	(buf)->b_cksum0 != 0))
914 
/*
 * True when the header's identity (both DVA words, birth and spa) matches
 * the given spa/dva/birth triple.
 *
 * The whole expansion and every argument are parenthesized so the macro
 * behaves as a single expression under operators such as '!' and with
 * arguments that are themselves expressions.
 */
#define	BUF_EQUAL(spa, dva, birth, buf)				\
	(((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == (birth)) && ((buf)->b_spa == (spa)))
919 
920 static void
buf_discard_identity(arc_buf_hdr_t * hdr)921 buf_discard_identity(arc_buf_hdr_t *hdr)
922 {
923 	hdr->b_dva.dva_word[0] = 0;
924 	hdr->b_dva.dva_word[1] = 0;
925 	hdr->b_birth = 0;
926 	hdr->b_cksum0 = 0;
927 }
928 
929 static arc_buf_hdr_t *
buf_hash_find(uint64_t spa,const blkptr_t * bp,kmutex_t ** lockp)930 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
931 {
932 	const dva_t *dva = BP_IDENTITY(bp);
933 	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
934 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
935 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
936 	arc_buf_hdr_t *hdr;
937 
938 	mutex_enter(hash_lock);
939 	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
940 	    hdr = hdr->b_hash_next) {
941 		if (BUF_EQUAL(spa, dva, birth, hdr)) {
942 			*lockp = hash_lock;
943 			return (hdr);
944 		}
945 	}
946 	mutex_exit(hash_lock);
947 	*lockp = NULL;
948 	return (NULL);
949 }
950 
951 /*
952  * Insert an entry into the hash table.  If there is already an element
953  * equal to elem in the hash table, then the already existing element
954  * will be returned and the new element will not be inserted.
955  * Otherwise returns NULL.
956  */
957 static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t * hdr,kmutex_t ** lockp)958 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
959 {
960 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
961 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
962 	arc_buf_hdr_t *fhdr;
963 	uint32_t i;
964 
965 	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
966 	ASSERT(hdr->b_birth != 0);
967 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
968 	*lockp = hash_lock;
969 	mutex_enter(hash_lock);
970 	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
971 	    fhdr = fhdr->b_hash_next, i++) {
972 		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
973 			return (fhdr);
974 	}
975 
976 	hdr->b_hash_next = buf_hash_table.ht_table[idx];
977 	buf_hash_table.ht_table[idx] = hdr;
978 	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
979 
980 	/* collect some hash table performance data */
981 	if (i > 0) {
982 		ARCSTAT_BUMP(arcstat_hash_collisions);
983 		if (i == 1)
984 			ARCSTAT_BUMP(arcstat_hash_chains);
985 
986 		ARCSTAT_MAX(arcstat_hash_chain_max, i);
987 	}
988 
989 	ARCSTAT_BUMP(arcstat_hash_elements);
990 	ARCSTAT_MAXSTAT(arcstat_hash_elements);
991 
992 	return (NULL);
993 }
994 
995 static void
buf_hash_remove(arc_buf_hdr_t * hdr)996 buf_hash_remove(arc_buf_hdr_t *hdr)
997 {
998 	arc_buf_hdr_t *fhdr, **hdrp;
999 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1000 
1001 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1002 	ASSERT(HDR_IN_HASH_TABLE(hdr));
1003 
1004 	hdrp = &buf_hash_table.ht_table[idx];
1005 	while ((fhdr = *hdrp) != hdr) {
1006 		ASSERT(fhdr != NULL);
1007 		hdrp = &fhdr->b_hash_next;
1008 	}
1009 	*hdrp = hdr->b_hash_next;
1010 	hdr->b_hash_next = NULL;
1011 	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
1012 
1013 	/* collect some hash table performance data */
1014 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1015 
1016 	if (buf_hash_table.ht_table[idx] &&
1017 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1018 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1019 }
1020 
1021 /*
1022  * Global data structures and functions for the buf kmem cache.
1023  */
1024 static kmem_cache_t *hdr_cache;
1025 static kmem_cache_t *buf_cache;
1026 
1027 static void
buf_fini(void)1028 buf_fini(void)
1029 {
1030 	int i;
1031 
1032 	kmem_free(buf_hash_table.ht_table,
1033 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1034 	for (i = 0; i < BUF_LOCKS; i++)
1035 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1036 	kmem_cache_destroy(hdr_cache);
1037 	kmem_cache_destroy(buf_cache);
1038 }
1039 
1040 /*
1041  * Constructor callback - called when the cache is empty
1042  * and a new buf is requested.
1043  */
1044 /* ARGSUSED */
1045 static int
hdr_cons(void * vbuf,void * unused,int kmflag)1046 hdr_cons(void *vbuf, void *unused, int kmflag)
1047 {
1048 	arc_buf_hdr_t *hdr = vbuf;
1049 
1050 	bzero(hdr, sizeof (arc_buf_hdr_t));
1051 	refcount_create(&hdr->b_refcnt);
1052 	cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL);
1053 	mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1054 	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1055 
1056 	return (0);
1057 }
1058 
1059 /* ARGSUSED */
1060 static int
buf_cons(void * vbuf,void * unused,int kmflag)1061 buf_cons(void *vbuf, void *unused, int kmflag)
1062 {
1063 	arc_buf_t *buf = vbuf;
1064 
1065 	bzero(buf, sizeof (arc_buf_t));
1066 	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1067 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1068 
1069 	return (0);
1070 }
1071 
1072 /*
1073  * Destructor callback - called when a cached buf is
1074  * no longer required.
1075  */
1076 /* ARGSUSED */
1077 static void
hdr_dest(void * vbuf,void * unused)1078 hdr_dest(void *vbuf, void *unused)
1079 {
1080 	arc_buf_hdr_t *hdr = vbuf;
1081 
1082 	ASSERT(BUF_EMPTY(hdr));
1083 	refcount_destroy(&hdr->b_refcnt);
1084 	cv_destroy(&hdr->b_cv);
1085 	mutex_destroy(&hdr->b_freeze_lock);
1086 	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1087 }
1088 
1089 /* ARGSUSED */
1090 static void
buf_dest(void * vbuf,void * unused)1091 buf_dest(void *vbuf, void *unused)
1092 {
1093 	arc_buf_t *buf = vbuf;
1094 
1095 	mutex_destroy(&buf->b_evict_lock);
1096 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1097 }
1098 
1099 /*
1100  * Reclaim callback -- invoked when memory is low.
1101  */
1102 /* ARGSUSED */
1103 static void
hdr_recl(void * unused)1104 hdr_recl(void *unused)
1105 {
1106 	dprintf("hdr_recl called\n");
1107 	/*
1108 	 * umem calls the reclaim func when we destroy the buf cache,
1109 	 * which is after we do arc_fini().
1110 	 */
1111 	if (!arc_dead)
1112 		cv_signal(&arc_reclaim_thr_cv);
1113 }
1114 
1115 static void
buf_init(void)1116 buf_init(void)
1117 {
1118 	uint64_t *ct;
1119 	uint64_t hsize = 1ULL << 12;
1120 	int i, j;
1121 
1122 	/*
1123 	 * The hash table is big enough to fill all of physical memory
1124 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1125 	 * By default, the table will take up
1126 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1127 	 */
1128 	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1129 		hsize <<= 1;
1130 retry:
1131 	buf_hash_table.ht_mask = hsize - 1;
1132 	buf_hash_table.ht_table =
1133 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1134 	if (buf_hash_table.ht_table == NULL) {
1135 		ASSERT(hsize > (1ULL << 8));
1136 		hsize >>= 1;
1137 		goto retry;
1138 	}
1139 
1140 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1141 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1142 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1143 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1144 
1145 	for (i = 0; i < 256; i++)
1146 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1147 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1148 
1149 	for (i = 0; i < BUF_LOCKS; i++) {
1150 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1151 		    NULL, MUTEX_DEFAULT, NULL);
1152 	}
1153 }
1154 
/* One sixteenth of hz; the original annotation gives this as 62 ms. */
#define	ARC_MINTIME	(hz >> 4)
1156 
1157 static void
arc_cksum_verify(arc_buf_t * buf)1158 arc_cksum_verify(arc_buf_t *buf)
1159 {
1160 	zio_cksum_t zc;
1161 
1162 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1163 		return;
1164 
1165 	mutex_enter(&buf->b_hdr->b_freeze_lock);
1166 	if (buf->b_hdr->b_freeze_cksum == NULL ||
1167 	    (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) {
1168 		mutex_exit(&buf->b_hdr->b_freeze_lock);
1169 		return;
1170 	}
1171 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1172 	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1173 		panic("buffer modified while frozen!");
1174 	mutex_exit(&buf->b_hdr->b_freeze_lock);
1175 }
1176 
1177 static int
arc_cksum_equal(arc_buf_t * buf)1178 arc_cksum_equal(arc_buf_t *buf)
1179 {
1180 	zio_cksum_t zc;
1181 	int equal;
1182 
1183 	mutex_enter(&buf->b_hdr->b_freeze_lock);
1184 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1185 	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1186 	mutex_exit(&buf->b_hdr->b_freeze_lock);
1187 
1188 	return (equal);
1189 }
1190 
1191 static void
arc_cksum_compute(arc_buf_t * buf,boolean_t force)1192 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1193 {
1194 	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1195 		return;
1196 
1197 	mutex_enter(&buf->b_hdr->b_freeze_lock);
1198 	if (buf->b_hdr->b_freeze_cksum != NULL) {
1199 		mutex_exit(&buf->b_hdr->b_freeze_lock);
1200 		return;
1201 	}
1202 	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1203 	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1204 	    buf->b_hdr->b_freeze_cksum);
1205 	mutex_exit(&buf->b_hdr->b_freeze_lock);
1206 #ifdef illumos
1207 	arc_buf_watch(buf);
1208 #endif /* illumos */
1209 }
1210 
1211 #ifdef illumos
1212 #ifndef _KERNEL
1213 typedef struct procctl {
1214 	long cmd;
1215 	prwatch_t prwatch;
1216 } procctl_t;
1217 #endif
1218 
1219 /* ARGSUSED */
1220 static void
arc_buf_unwatch(arc_buf_t * buf)1221 arc_buf_unwatch(arc_buf_t *buf)
1222 {
1223 #ifndef _KERNEL
1224 	if (arc_watch) {
1225 		int result;
1226 		procctl_t ctl;
1227 		ctl.cmd = PCWATCH;
1228 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1229 		ctl.prwatch.pr_size = 0;
1230 		ctl.prwatch.pr_wflags = 0;
1231 		result = write(arc_procfd, &ctl, sizeof (ctl));
1232 		ASSERT3U(result, ==, sizeof (ctl));
1233 	}
1234 #endif
1235 }
1236 
1237 /* ARGSUSED */
1238 static void
arc_buf_watch(arc_buf_t * buf)1239 arc_buf_watch(arc_buf_t *buf)
1240 {
1241 #ifndef _KERNEL
1242 	if (arc_watch) {
1243 		int result;
1244 		procctl_t ctl;
1245 		ctl.cmd = PCWATCH;
1246 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1247 		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1248 		ctl.prwatch.pr_wflags = WA_WRITE;
1249 		result = write(arc_procfd, &ctl, sizeof (ctl));
1250 		ASSERT3U(result, ==, sizeof (ctl));
1251 	}
1252 #endif
1253 }
1254 #endif /* illumos */
1255 
1256 void
arc_buf_thaw(arc_buf_t * buf)1257 arc_buf_thaw(arc_buf_t *buf)
1258 {
1259 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1260 		if (buf->b_hdr->b_state != arc_anon)
1261 			panic("modifying non-anon buffer!");
1262 		if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS)
1263 			panic("modifying buffer while i/o in progress!");
1264 		arc_cksum_verify(buf);
1265 	}
1266 
1267 	mutex_enter(&buf->b_hdr->b_freeze_lock);
1268 	if (buf->b_hdr->b_freeze_cksum != NULL) {
1269 		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1270 		buf->b_hdr->b_freeze_cksum = NULL;
1271 	}
1272 
1273 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1274 		if (buf->b_hdr->b_thawed)
1275 			kmem_free(buf->b_hdr->b_thawed, 1);
1276 		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1277 	}
1278 
1279 	mutex_exit(&buf->b_hdr->b_freeze_lock);
1280 
1281 #ifdef illumos
1282 	arc_buf_unwatch(buf);
1283 #endif /* illumos */
1284 }
1285 
1286 void
arc_buf_freeze(arc_buf_t * buf)1287 arc_buf_freeze(arc_buf_t *buf)
1288 {
1289 	kmutex_t *hash_lock;
1290 
1291 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1292 		return;
1293 
1294 	hash_lock = HDR_LOCK(buf->b_hdr);
1295 	mutex_enter(hash_lock);
1296 
1297 	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1298 	    buf->b_hdr->b_state == arc_anon);
1299 	arc_cksum_compute(buf, B_FALSE);
1300 	mutex_exit(hash_lock);
1301 
1302 }
1303 
1304 static void
get_buf_info(arc_buf_hdr_t * hdr,arc_state_t * state,list_t ** list,kmutex_t ** lock)1305 get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock)
1306 {
1307 	uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1308 
1309 	if (hdr->b_type == ARC_BUFC_METADATA)
1310 		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1311 	else {
1312 		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1313 		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1314 	}
1315 
1316 	*list = &state->arcs_lists[buf_hashid];
1317 	*lock = ARCS_LOCK(state, buf_hashid);
1318 }
1319 
1320 
1321 static void
add_reference(arc_buf_hdr_t * hdr,kmutex_t * hash_lock,void * tag)1322 add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1323 {
1324 	ASSERT(MUTEX_HELD(hash_lock));
1325 
1326 	if ((refcount_add(&hdr->b_refcnt, tag) == 1) &&
1327 	    (hdr->b_state != arc_anon)) {
1328 		uint64_t delta = hdr->b_size * hdr->b_datacnt;
1329 		uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
1330 		list_t *list;
1331 		kmutex_t *lock;
1332 
1333 		get_buf_info(hdr, hdr->b_state, &list, &lock);
1334 		ASSERT(!MUTEX_HELD(lock));
1335 		mutex_enter(lock);
1336 		ASSERT(list_link_active(&hdr->b_arc_node));
1337 		list_remove(list, hdr);
1338 		if (GHOST_STATE(hdr->b_state)) {
1339 			ASSERT0(hdr->b_datacnt);
1340 			ASSERT3P(hdr->b_buf, ==, NULL);
1341 			delta = hdr->b_size;
1342 		}
1343 		ASSERT(delta > 0);
1344 		ASSERT3U(*size, >=, delta);
1345 		atomic_add_64(size, -delta);
1346 		mutex_exit(lock);
1347 		/* remove the prefetch flag if we get a reference */
1348 		if (hdr->b_flags & ARC_FLAG_PREFETCH)
1349 			hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1350 	}
1351 }
1352 
1353 static int
remove_reference(arc_buf_hdr_t * hdr,kmutex_t * hash_lock,void * tag)1354 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1355 {
1356 	int cnt;
1357 	arc_state_t *state = hdr->b_state;
1358 
1359 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1360 	ASSERT(!GHOST_STATE(state));
1361 
1362 	if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) &&
1363 	    (state != arc_anon)) {
1364 		uint64_t *size = &state->arcs_lsize[hdr->b_type];
1365 		list_t *list;
1366 		kmutex_t *lock;
1367 
1368 		get_buf_info(hdr, state, &list, &lock);
1369 		ASSERT(!MUTEX_HELD(lock));
1370 		mutex_enter(lock);
1371 		ASSERT(!list_link_active(&hdr->b_arc_node));
1372 		list_insert_head(list, hdr);
1373 		ASSERT(hdr->b_datacnt > 0);
1374 		atomic_add_64(size, hdr->b_size * hdr->b_datacnt);
1375 		mutex_exit(lock);
1376 	}
1377 	return (cnt);
1378 }
1379 
1380 /*
1381  * Move the supplied buffer to the indicated state.  The mutex
1382  * for the buffer must be held by the caller.
1383  */
1384 static void
arc_change_state(arc_state_t * new_state,arc_buf_hdr_t * hdr,kmutex_t * hash_lock)1385 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1386     kmutex_t *hash_lock)
1387 {
1388 	arc_state_t *old_state = hdr->b_state;
1389 	int64_t refcnt = refcount_count(&hdr->b_refcnt);
1390 	uint64_t from_delta, to_delta;
1391 	list_t *list;
1392 	kmutex_t *lock;
1393 
1394 	ASSERT(MUTEX_HELD(hash_lock));
1395 	ASSERT3P(new_state, !=, old_state);
1396 	ASSERT(refcnt == 0 || hdr->b_datacnt > 0);
1397 	ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state));
1398 	ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon);
1399 
1400 	from_delta = to_delta = hdr->b_datacnt * hdr->b_size;
1401 
1402 	/*
1403 	 * If this buffer is evictable, transfer it from the
1404 	 * old state list to the new state list.
1405 	 */
1406 	if (refcnt == 0) {
1407 		if (old_state != arc_anon) {
1408 			int use_mutex;
1409 			uint64_t *size = &old_state->arcs_lsize[hdr->b_type];
1410 
1411 			get_buf_info(hdr, old_state, &list, &lock);
1412 			use_mutex = !MUTEX_HELD(lock);
1413 			if (use_mutex)
1414 				mutex_enter(lock);
1415 
1416 			ASSERT(list_link_active(&hdr->b_arc_node));
1417 			list_remove(list, hdr);
1418 
1419 			/*
1420 			 * If prefetching out of the ghost cache,
1421 			 * we will have a non-zero datacnt.
1422 			 */
1423 			if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) {
1424 				/* ghost elements have a ghost size */
1425 				ASSERT(hdr->b_buf == NULL);
1426 				from_delta = hdr->b_size;
1427 			}
1428 			ASSERT3U(*size, >=, from_delta);
1429 			atomic_add_64(size, -from_delta);
1430 
1431 			if (use_mutex)
1432 				mutex_exit(lock);
1433 		}
1434 		if (new_state != arc_anon) {
1435 			int use_mutex;
1436 			uint64_t *size = &new_state->arcs_lsize[hdr->b_type];
1437 
1438 			get_buf_info(hdr, new_state, &list, &lock);
1439 			use_mutex = !MUTEX_HELD(lock);
1440 			if (use_mutex)
1441 				mutex_enter(lock);
1442 
1443 			list_insert_head(list, hdr);
1444 
1445 			/* ghost elements have a ghost size */
1446 			if (GHOST_STATE(new_state)) {
1447 				ASSERT(hdr->b_datacnt == 0);
1448 				ASSERT(hdr->b_buf == NULL);
1449 				to_delta = hdr->b_size;
1450 			}
1451 			atomic_add_64(size, to_delta);
1452 
1453 			if (use_mutex)
1454 				mutex_exit(lock);
1455 		}
1456 	}
1457 
1458 	ASSERT(!BUF_EMPTY(hdr));
1459 	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1460 		buf_hash_remove(hdr);
1461 
1462 	/* adjust state sizes */
1463 	if (to_delta)
1464 		atomic_add_64(&new_state->arcs_size, to_delta);
1465 	if (from_delta) {
1466 		ASSERT3U(old_state->arcs_size, >=, from_delta);
1467 		atomic_add_64(&old_state->arcs_size, -from_delta);
1468 	}
1469 	hdr->b_state = new_state;
1470 
1471 	/* adjust l2arc hdr stats */
1472 	if (new_state == arc_l2c_only)
1473 		l2arc_hdr_stat_add();
1474 	else if (old_state == arc_l2c_only)
1475 		l2arc_hdr_stat_remove();
1476 }
1477 
1478 void
arc_space_consume(uint64_t space,arc_space_type_t type)1479 arc_space_consume(uint64_t space, arc_space_type_t type)
1480 {
1481 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1482 
1483 	switch (type) {
1484 	case ARC_SPACE_DATA:
1485 		ARCSTAT_INCR(arcstat_data_size, space);
1486 		break;
1487 	case ARC_SPACE_OTHER:
1488 		ARCSTAT_INCR(arcstat_other_size, space);
1489 		break;
1490 	case ARC_SPACE_HDRS:
1491 		ARCSTAT_INCR(arcstat_hdr_size, space);
1492 		break;
1493 	case ARC_SPACE_L2HDRS:
1494 		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1495 		break;
1496 	}
1497 
1498 	ARCSTAT_INCR(arcstat_meta_used, space);
1499 	atomic_add_64(&arc_size, space);
1500 }
1501 
1502 void
arc_space_return(uint64_t space,arc_space_type_t type)1503 arc_space_return(uint64_t space, arc_space_type_t type)
1504 {
1505 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1506 
1507 	switch (type) {
1508 	case ARC_SPACE_DATA:
1509 		ARCSTAT_INCR(arcstat_data_size, -space);
1510 		break;
1511 	case ARC_SPACE_OTHER:
1512 		ARCSTAT_INCR(arcstat_other_size, -space);
1513 		break;
1514 	case ARC_SPACE_HDRS:
1515 		ARCSTAT_INCR(arcstat_hdr_size, -space);
1516 		break;
1517 	case ARC_SPACE_L2HDRS:
1518 		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1519 		break;
1520 	}
1521 
1522 	ASSERT(arc_meta_used >= space);
1523 	if (arc_meta_max < arc_meta_used)
1524 		arc_meta_max = arc_meta_used;
1525 	ARCSTAT_INCR(arcstat_meta_used, -space);
1526 	ASSERT(arc_size >= space);
1527 	atomic_add_64(&arc_size, -space);
1528 }
1529 
1530 arc_buf_t *
arc_buf_alloc(spa_t * spa,int size,void * tag,arc_buf_contents_t type)1531 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1532 {
1533 	arc_buf_hdr_t *hdr;
1534 	arc_buf_t *buf;
1535 
1536 	ASSERT3U(size, >, 0);
1537 	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1538 	ASSERT(BUF_EMPTY(hdr));
1539 	hdr->b_size = size;
1540 	hdr->b_type = type;
1541 	hdr->b_spa = spa_load_guid(spa);
1542 	hdr->b_state = arc_anon;
1543 	hdr->b_arc_access = 0;
1544 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1545 	buf->b_hdr = hdr;
1546 	buf->b_data = NULL;
1547 	buf->b_efunc = NULL;
1548 	buf->b_private = NULL;
1549 	buf->b_next = NULL;
1550 	hdr->b_buf = buf;
1551 	arc_get_data_buf(buf);
1552 	hdr->b_datacnt = 1;
1553 	hdr->b_flags = 0;
1554 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1555 	(void) refcount_add(&hdr->b_refcnt, tag);
1556 
1557 	return (buf);
1558 }
1559 
1560 static char *arc_onloan_tag = "onloan";
1561 
1562 /*
1563  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1564  * flight data by arc_tempreserve_space() until they are "returned". Loaned
1565  * buffers must be returned to the arc before they can be used by the DMU or
1566  * freed.
1567  */
1568 arc_buf_t *
arc_loan_buf(spa_t * spa,int size)1569 arc_loan_buf(spa_t *spa, int size)
1570 {
1571 	arc_buf_t *buf;
1572 
1573 	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1574 
1575 	atomic_add_64(&arc_loaned_bytes, size);
1576 	return (buf);
1577 }
1578 
1579 /*
1580  * Return a loaned arc buffer to the arc.
1581  */
1582 void
arc_return_buf(arc_buf_t * buf,void * tag)1583 arc_return_buf(arc_buf_t *buf, void *tag)
1584 {
1585 	arc_buf_hdr_t *hdr = buf->b_hdr;
1586 
1587 	ASSERT(buf->b_data != NULL);
1588 	(void) refcount_add(&hdr->b_refcnt, tag);
1589 	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1590 
1591 	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1592 }
1593 
1594 /* Detach an arc_buf from a dbuf (tag) */
1595 void
arc_loan_inuse_buf(arc_buf_t * buf,void * tag)1596 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1597 {
1598 	arc_buf_hdr_t *hdr;
1599 
1600 	ASSERT(buf->b_data != NULL);
1601 	hdr = buf->b_hdr;
1602 	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1603 	(void) refcount_remove(&hdr->b_refcnt, tag);
1604 	buf->b_efunc = NULL;
1605 	buf->b_private = NULL;
1606 
1607 	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1608 }
1609 
1610 static arc_buf_t *
arc_buf_clone(arc_buf_t * from)1611 arc_buf_clone(arc_buf_t *from)
1612 {
1613 	arc_buf_t *buf;
1614 	arc_buf_hdr_t *hdr = from->b_hdr;
1615 	uint64_t size = hdr->b_size;
1616 
1617 	ASSERT(hdr->b_state != arc_anon);
1618 
1619 	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1620 	buf->b_hdr = hdr;
1621 	buf->b_data = NULL;
1622 	buf->b_efunc = NULL;
1623 	buf->b_private = NULL;
1624 	buf->b_next = hdr->b_buf;
1625 	hdr->b_buf = buf;
1626 	arc_get_data_buf(buf);
1627 	bcopy(from->b_data, buf->b_data, size);
1628 
1629 	/*
1630 	 * This buffer already exists in the arc so create a duplicate
1631 	 * copy for the caller.  If the buffer is associated with user data
1632 	 * then track the size and number of duplicates.  These stats will be
1633 	 * updated as duplicate buffers are created and destroyed.
1634 	 */
1635 	if (hdr->b_type == ARC_BUFC_DATA) {
1636 		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1637 		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1638 	}
1639 	hdr->b_datacnt += 1;
1640 	return (buf);
1641 }
1642 
1643 void
arc_buf_add_ref(arc_buf_t * buf,void * tag)1644 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1645 {
1646 	arc_buf_hdr_t *hdr;
1647 	kmutex_t *hash_lock;
1648 
1649 	/*
1650 	 * Check to see if this buffer is evicted.  Callers
1651 	 * must verify b_data != NULL to know if the add_ref
1652 	 * was successful.
1653 	 */
1654 	mutex_enter(&buf->b_evict_lock);
1655 	if (buf->b_data == NULL) {
1656 		mutex_exit(&buf->b_evict_lock);
1657 		return;
1658 	}
1659 	hash_lock = HDR_LOCK(buf->b_hdr);
1660 	mutex_enter(hash_lock);
1661 	hdr = buf->b_hdr;
1662 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1663 	mutex_exit(&buf->b_evict_lock);
1664 
1665 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1666 	add_reference(hdr, hash_lock, tag);
1667 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1668 	arc_access(hdr, hash_lock);
1669 	mutex_exit(hash_lock);
1670 	ARCSTAT_BUMP(arcstat_hits);
1671 	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
1672 	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1673 	    data, metadata, hits);
1674 }
1675 
1676 static void
arc_buf_free_on_write(void * data,size_t size,void (* free_func)(void *,size_t))1677 arc_buf_free_on_write(void *data, size_t size,
1678     void (*free_func)(void *, size_t))
1679 {
1680 	l2arc_data_free_t *df;
1681 
1682 	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1683 	df->l2df_data = data;
1684 	df->l2df_size = size;
1685 	df->l2df_func = free_func;
1686 	mutex_enter(&l2arc_free_on_write_mtx);
1687 	list_insert_head(l2arc_free_on_write, df);
1688 	mutex_exit(&l2arc_free_on_write_mtx);
1689 }
1690 
1691 /*
1692  * Free the arc data buffer.  If it is an l2arc write in progress,
1693  * the buffer is placed on l2arc_free_on_write to be freed later.
1694  */
1695 static void
arc_buf_data_free(arc_buf_t * buf,void (* free_func)(void *,size_t))1696 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1697 {
1698 	arc_buf_hdr_t *hdr = buf->b_hdr;
1699 
1700 	if (HDR_L2_WRITING(hdr)) {
1701 		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1702 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1703 	} else {
1704 		free_func(buf->b_data, hdr->b_size);
1705 	}
1706 }
1707 
1708 /*
1709  * Free up buf->b_data and if 'remove' is set, then pull the
1710  * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
1711  */
1712 static void
arc_buf_l2_cdata_free(arc_buf_hdr_t * hdr)1713 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1714 {
1715 	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1716 
1717 	ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
1718 
1719 	if (l2hdr->b_tmp_cdata == NULL)
1720 		return;
1721 
1722 	ASSERT(HDR_L2_WRITING(hdr));
1723 	arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
1724 	    zio_data_buf_free);
1725 	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1726 	l2hdr->b_tmp_cdata = NULL;
1727 }
1728 
1729 static void
arc_buf_destroy(arc_buf_t * buf,boolean_t recycle,boolean_t remove)1730 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1731 {
1732 	arc_buf_t **bufp;
1733 
1734 	/* free up data associated with the buf */
1735 	if (buf->b_data) {
1736 		arc_state_t *state = buf->b_hdr->b_state;
1737 		uint64_t size = buf->b_hdr->b_size;
1738 		arc_buf_contents_t type = buf->b_hdr->b_type;
1739 
1740 		arc_cksum_verify(buf);
1741 #ifdef illumos
1742 		arc_buf_unwatch(buf);
1743 #endif /* illumos */
1744 
1745 		if (!recycle) {
1746 			if (type == ARC_BUFC_METADATA) {
1747 				arc_buf_data_free(buf, zio_buf_free);
1748 				arc_space_return(size, ARC_SPACE_DATA);
1749 			} else {
1750 				ASSERT(type == ARC_BUFC_DATA);
1751 				arc_buf_data_free(buf, zio_data_buf_free);
1752 				ARCSTAT_INCR(arcstat_data_size, -size);
1753 				atomic_add_64(&arc_size, -size);
1754 			}
1755 		}
1756 		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1757 			uint64_t *cnt = &state->arcs_lsize[type];
1758 
1759 			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1760 			ASSERT(state != arc_anon);
1761 
1762 			ASSERT3U(*cnt, >=, size);
1763 			atomic_add_64(cnt, -size);
1764 		}
1765 		ASSERT3U(state->arcs_size, >=, size);
1766 		atomic_add_64(&state->arcs_size, -size);
1767 		buf->b_data = NULL;
1768 
1769 		/*
1770 		 * If we're destroying a duplicate buffer make sure
1771 		 * that the appropriate statistics are updated.
1772 		 */
1773 		if (buf->b_hdr->b_datacnt > 1 &&
1774 		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1775 			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1776 			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1777 		}
1778 		ASSERT(buf->b_hdr->b_datacnt > 0);
1779 		buf->b_hdr->b_datacnt -= 1;
1780 	}
1781 
1782 	/* only remove the buf if requested */
1783 	if (!remove)
1784 		return;
1785 
1786 	/* remove the buf from the hdr list */
1787 	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1788 		continue;
1789 	*bufp = buf->b_next;
1790 	buf->b_next = NULL;
1791 
1792 	ASSERT(buf->b_efunc == NULL);
1793 
1794 	/* clean up the buf */
1795 	buf->b_hdr = NULL;
1796 	kmem_cache_free(buf_cache, buf);
1797 }
1798 
1799 static void
arc_hdr_destroy(arc_buf_hdr_t * hdr)1800 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1801 {
1802 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1803 	ASSERT3P(hdr->b_state, ==, arc_anon);
1804 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1805 	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1806 
1807 	if (l2hdr != NULL) {
1808 		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1809 		/*
1810 		 * To prevent arc_free() and l2arc_evict() from
1811 		 * attempting to free the same buffer at the same time,
1812 		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1813 		 * give it priority.  l2arc_evict() can't destroy this
1814 		 * header while we are waiting on l2arc_buflist_mtx.
1815 		 *
1816 		 * The hdr may be removed from l2ad_buflist before we
1817 		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1818 		 */
1819 		if (!buflist_held) {
1820 			mutex_enter(&l2arc_buflist_mtx);
1821 			l2hdr = hdr->b_l2hdr;
1822 		}
1823 
1824 		if (l2hdr != NULL) {
1825 			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1826 			    hdr->b_size, 0);
1827 			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1828 			arc_buf_l2_cdata_free(hdr);
1829 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1830 			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1831 			vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1832 			    -l2hdr->b_asize, 0, 0);
1833 			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1834 			if (hdr->b_state == arc_l2c_only)
1835 				l2arc_hdr_stat_remove();
1836 			hdr->b_l2hdr = NULL;
1837 		}
1838 
1839 		if (!buflist_held)
1840 			mutex_exit(&l2arc_buflist_mtx);
1841 	}
1842 
1843 	if (!BUF_EMPTY(hdr)) {
1844 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1845 		buf_discard_identity(hdr);
1846 	}
1847 	while (hdr->b_buf) {
1848 		arc_buf_t *buf = hdr->b_buf;
1849 
1850 		if (buf->b_efunc) {
1851 			mutex_enter(&arc_eviction_mtx);
1852 			mutex_enter(&buf->b_evict_lock);
1853 			ASSERT(buf->b_hdr != NULL);
1854 			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1855 			hdr->b_buf = buf->b_next;
1856 			buf->b_hdr = &arc_eviction_hdr;
1857 			buf->b_next = arc_eviction_list;
1858 			arc_eviction_list = buf;
1859 			mutex_exit(&buf->b_evict_lock);
1860 			mutex_exit(&arc_eviction_mtx);
1861 		} else {
1862 			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1863 		}
1864 	}
1865 	if (hdr->b_freeze_cksum != NULL) {
1866 		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1867 		hdr->b_freeze_cksum = NULL;
1868 	}
1869 	if (hdr->b_thawed) {
1870 		kmem_free(hdr->b_thawed, 1);
1871 		hdr->b_thawed = NULL;
1872 	}
1873 
1874 	ASSERT(!list_link_active(&hdr->b_arc_node));
1875 	ASSERT3P(hdr->b_hash_next, ==, NULL);
1876 	ASSERT3P(hdr->b_acb, ==, NULL);
1877 	kmem_cache_free(hdr_cache, hdr);
1878 }
1879 
1880 void
arc_buf_free(arc_buf_t * buf,void * tag)1881 arc_buf_free(arc_buf_t *buf, void *tag)
1882 {
1883 	arc_buf_hdr_t *hdr = buf->b_hdr;
1884 	int hashed = hdr->b_state != arc_anon;
1885 
1886 	ASSERT(buf->b_efunc == NULL);
1887 	ASSERT(buf->b_data != NULL);
1888 
1889 	if (hashed) {
1890 		kmutex_t *hash_lock = HDR_LOCK(hdr);
1891 
1892 		mutex_enter(hash_lock);
1893 		hdr = buf->b_hdr;
1894 		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1895 
1896 		(void) remove_reference(hdr, hash_lock, tag);
1897 		if (hdr->b_datacnt > 1) {
1898 			arc_buf_destroy(buf, FALSE, TRUE);
1899 		} else {
1900 			ASSERT(buf == hdr->b_buf);
1901 			ASSERT(buf->b_efunc == NULL);
1902 			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
1903 		}
1904 		mutex_exit(hash_lock);
1905 	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1906 		int destroy_hdr;
1907 		/*
1908 		 * We are in the middle of an async write.  Don't destroy
1909 		 * this buffer unless the write completes before we finish
1910 		 * decrementing the reference count.
1911 		 */
1912 		mutex_enter(&arc_eviction_mtx);
1913 		(void) remove_reference(hdr, NULL, tag);
1914 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1915 		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1916 		mutex_exit(&arc_eviction_mtx);
1917 		if (destroy_hdr)
1918 			arc_hdr_destroy(hdr);
1919 	} else {
1920 		if (remove_reference(hdr, NULL, tag) > 0)
1921 			arc_buf_destroy(buf, FALSE, TRUE);
1922 		else
1923 			arc_hdr_destroy(hdr);
1924 	}
1925 }
1926 
1927 boolean_t
arc_buf_remove_ref(arc_buf_t * buf,void * tag)1928 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1929 {
1930 	arc_buf_hdr_t *hdr = buf->b_hdr;
1931 	kmutex_t *hash_lock = HDR_LOCK(hdr);
1932 	boolean_t no_callback = (buf->b_efunc == NULL);
1933 
1934 	if (hdr->b_state == arc_anon) {
1935 		ASSERT(hdr->b_datacnt == 1);
1936 		arc_buf_free(buf, tag);
1937 		return (no_callback);
1938 	}
1939 
1940 	mutex_enter(hash_lock);
1941 	hdr = buf->b_hdr;
1942 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1943 	ASSERT(hdr->b_state != arc_anon);
1944 	ASSERT(buf->b_data != NULL);
1945 
1946 	(void) remove_reference(hdr, hash_lock, tag);
1947 	if (hdr->b_datacnt > 1) {
1948 		if (no_callback)
1949 			arc_buf_destroy(buf, FALSE, TRUE);
1950 	} else if (no_callback) {
1951 		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1952 		ASSERT(buf->b_efunc == NULL);
1953 		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
1954 	}
1955 	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1956 	    refcount_is_zero(&hdr->b_refcnt));
1957 	mutex_exit(hash_lock);
1958 	return (no_callback);
1959 }
1960 
1961 int
arc_buf_size(arc_buf_t * buf)1962 arc_buf_size(arc_buf_t *buf)
1963 {
1964 	return (buf->b_hdr->b_size);
1965 }
1966 
1967 /*
1968  * Called from the DMU to determine if the current buffer should be
1969  * evicted. In order to ensure proper locking, the eviction must be initiated
1970  * from the DMU. Return true if the buffer is associated with user data and
1971  * duplicate buffers still exist.
1972  */
1973 boolean_t
arc_buf_eviction_needed(arc_buf_t * buf)1974 arc_buf_eviction_needed(arc_buf_t *buf)
1975 {
1976 	arc_buf_hdr_t *hdr;
1977 	boolean_t evict_needed = B_FALSE;
1978 
1979 	if (zfs_disable_dup_eviction)
1980 		return (B_FALSE);
1981 
1982 	mutex_enter(&buf->b_evict_lock);
1983 	hdr = buf->b_hdr;
1984 	if (hdr == NULL) {
1985 		/*
1986 		 * We are in arc_do_user_evicts(); let that function
1987 		 * perform the eviction.
1988 		 */
1989 		ASSERT(buf->b_data == NULL);
1990 		mutex_exit(&buf->b_evict_lock);
1991 		return (B_FALSE);
1992 	} else if (buf->b_data == NULL) {
1993 		/*
1994 		 * We have already been added to the arc eviction list;
1995 		 * recommend eviction.
1996 		 */
1997 		ASSERT3P(hdr, ==, &arc_eviction_hdr);
1998 		mutex_exit(&buf->b_evict_lock);
1999 		return (B_TRUE);
2000 	}
2001 
2002 	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
2003 		evict_needed = B_TRUE;
2004 
2005 	mutex_exit(&buf->b_evict_lock);
2006 	return (evict_needed);
2007 }
2008 
2009 /*
2010  * Evict buffers from list until we've removed the specified number of
2011  * bytes.  Move the removed buffers to the appropriate evict state.
2012  * If the recycle flag is set, then attempt to "recycle" a buffer:
2013  * - look for a buffer to evict that is `bytes' long.
2014  * - return the data block from this buffer rather than freeing it.
2015  * This flag is used by callers that are trying to make space for a
2016  * new buffer in a full arc cache.
2017  *
2018  * This function makes a "best effort".  It skips over any buffers
2019  * it can't get a hash_lock on, and so may not catch all candidates.
2020  * It may also return without evicting as much space as requested.
2021  */
2022 static void *
arc_evict(arc_state_t * state,uint64_t spa,int64_t bytes,boolean_t recycle,arc_buf_contents_t type)2023 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2024     arc_buf_contents_t type)
2025 {
2026 	arc_state_t *evicted_state;
2027 	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2028 	int64_t bytes_remaining;
2029 	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
2030 	list_t *evicted_list, *list, *evicted_list_start, *list_start;
2031 	kmutex_t *lock, *evicted_lock;
2032 	kmutex_t *hash_lock;
2033 	boolean_t have_lock;
2034 	void *stolen = NULL;
2035 	arc_buf_hdr_t marker = { 0 };
2036 	int count = 0;
2037 	static int evict_metadata_offset, evict_data_offset;
2038 	int i, idx, offset, list_count, lists;
2039 
2040 	ASSERT(state == arc_mru || state == arc_mfu);
2041 
2042 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2043 
2044 	if (type == ARC_BUFC_METADATA) {
2045 		offset = 0;
2046 		list_count = ARC_BUFC_NUMMETADATALISTS;
2047 		list_start = &state->arcs_lists[0];
2048 		evicted_list_start = &evicted_state->arcs_lists[0];
2049 		idx = evict_metadata_offset;
2050 	} else {
2051 		offset = ARC_BUFC_NUMMETADATALISTS;
2052 		list_start = &state->arcs_lists[offset];
2053 		evicted_list_start = &evicted_state->arcs_lists[offset];
2054 		list_count = ARC_BUFC_NUMDATALISTS;
2055 		idx = evict_data_offset;
2056 	}
2057 	bytes_remaining = evicted_state->arcs_lsize[type];
2058 	lists = 0;
2059 
2060 evict_start:
2061 	list = &list_start[idx];
2062 	evicted_list = &evicted_list_start[idx];
2063 	lock = ARCS_LOCK(state, (offset + idx));
2064 	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2065 
2066 	mutex_enter(lock);
2067 	mutex_enter(evicted_lock);
2068 
2069 	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2070 		hdr_prev = list_prev(list, hdr);
2071 		bytes_remaining -= (hdr->b_size * hdr->b_datacnt);
2072 		/* prefetch buffers have a minimum lifespan */
2073 		if (HDR_IO_IN_PROGRESS(hdr) ||
2074 		    (spa && hdr->b_spa != spa) ||
2075 		    (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) &&
2076 		    ddi_get_lbolt() - hdr->b_arc_access <
2077 		    arc_min_prefetch_lifespan)) {
2078 			skipped++;
2079 			continue;
2080 		}
2081 		/* "lookahead" for better eviction candidate */
2082 		if (recycle && hdr->b_size != bytes &&
2083 		    hdr_prev && hdr_prev->b_size == bytes)
2084 			continue;
2085 
2086 		/* ignore markers */
2087 		if (hdr->b_spa == 0)
2088 			continue;
2089 
2090 		/*
2091 		 * It may take a long time to evict all the bufs requested.
2092 		 * To avoid blocking all arc activity, periodically drop
2093 		 * the arcs_mtx and give other threads a chance to run
2094 		 * before reacquiring the lock.
2095 		 *
2096 		 * If we are looking for a buffer to recycle, we are in
2097 		 * the hot code path, so don't sleep.
2098 		 */
2099 		if (!recycle && count++ > arc_evict_iterations) {
2100 			list_insert_after(list, hdr, &marker);
2101 			mutex_exit(evicted_lock);
2102 			mutex_exit(lock);
2103 			kpreempt(KPREEMPT_SYNC);
2104 			mutex_enter(lock);
2105 			mutex_enter(evicted_lock);
2106 			hdr_prev = list_prev(list, &marker);
2107 			list_remove(list, &marker);
2108 			count = 0;
2109 			continue;
2110 		}
2111 
2112 		hash_lock = HDR_LOCK(hdr);
2113 		have_lock = MUTEX_HELD(hash_lock);
2114 		if (have_lock || mutex_tryenter(hash_lock)) {
2115 			ASSERT0(refcount_count(&hdr->b_refcnt));
2116 			ASSERT(hdr->b_datacnt > 0);
2117 			while (hdr->b_buf) {
2118 				arc_buf_t *buf = hdr->b_buf;
2119 				if (!mutex_tryenter(&buf->b_evict_lock)) {
2120 					missed += 1;
2121 					break;
2122 				}
2123 				if (buf->b_data) {
2124 					bytes_evicted += hdr->b_size;
2125 					if (recycle && hdr->b_type == type &&
2126 					    hdr->b_size == bytes &&
2127 					    !HDR_L2_WRITING(hdr)) {
2128 						stolen = buf->b_data;
2129 						recycle = FALSE;
2130 					}
2131 				}
2132 				if (buf->b_efunc) {
2133 					mutex_enter(&arc_eviction_mtx);
2134 					arc_buf_destroy(buf,
2135 					    buf->b_data == stolen, FALSE);
2136 					hdr->b_buf = buf->b_next;
2137 					buf->b_hdr = &arc_eviction_hdr;
2138 					buf->b_next = arc_eviction_list;
2139 					arc_eviction_list = buf;
2140 					mutex_exit(&arc_eviction_mtx);
2141 					mutex_exit(&buf->b_evict_lock);
2142 				} else {
2143 					mutex_exit(&buf->b_evict_lock);
2144 					arc_buf_destroy(buf,
2145 					    buf->b_data == stolen, TRUE);
2146 				}
2147 			}
2148 
2149 			if (hdr->b_l2hdr) {
2150 				ARCSTAT_INCR(arcstat_evict_l2_cached,
2151 				    hdr->b_size);
2152 			} else {
2153 				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
2154 					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2155 					    hdr->b_size);
2156 				} else {
2157 					ARCSTAT_INCR(
2158 					    arcstat_evict_l2_ineligible,
2159 					    hdr->b_size);
2160 				}
2161 			}
2162 
2163 			if (hdr->b_datacnt == 0) {
2164 				arc_change_state(evicted_state, hdr, hash_lock);
2165 				ASSERT(HDR_IN_HASH_TABLE(hdr));
2166 				hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2167 				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2168 				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2169 			}
2170 			if (!have_lock)
2171 				mutex_exit(hash_lock);
2172 			if (bytes >= 0 && bytes_evicted >= bytes)
2173 				break;
2174 			if (bytes_remaining > 0) {
2175 				mutex_exit(evicted_lock);
2176 				mutex_exit(lock);
2177 				idx  = ((idx + 1) & (list_count - 1));
2178 				lists++;
2179 				goto evict_start;
2180 			}
2181 		} else {
2182 			missed += 1;
2183 		}
2184 	}
2185 
2186 	mutex_exit(evicted_lock);
2187 	mutex_exit(lock);
2188 
2189 	idx  = ((idx + 1) & (list_count - 1));
2190 	lists++;
2191 
2192 	if (bytes_evicted < bytes) {
2193 		if (lists < list_count)
2194 			goto evict_start;
2195 		else
2196 			dprintf("only evicted %lld bytes from %x",
2197 			    (longlong_t)bytes_evicted, state);
2198 	}
2199 	if (type == ARC_BUFC_METADATA)
2200 		evict_metadata_offset = idx;
2201 	else
2202 		evict_data_offset = idx;
2203 
2204 	if (skipped)
2205 		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2206 
2207 	if (missed)
2208 		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2209 
2210 	/*
2211 	 * Note: we have just evicted some data into the ghost state,
2212 	 * potentially putting the ghost size over the desired size.  Rather
2213 	 * that evicting from the ghost list in this hot code path, leave
2214 	 * this chore to the arc_reclaim_thread().
2215 	 */
2216 
2217 	if (stolen)
2218 		ARCSTAT_BUMP(arcstat_stolen);
2219 	return (stolen);
2220 }
2221 
2222 /*
2223  * Remove buffers from list until we've removed the specified number of
2224  * bytes.  Destroy the buffers that are removed.
2225  */
2226 static void
arc_evict_ghost(arc_state_t * state,uint64_t spa,int64_t bytes)2227 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2228 {
2229 	arc_buf_hdr_t *hdr, *hdr_prev;
2230 	arc_buf_hdr_t marker = { 0 };
2231 	list_t *list, *list_start;
2232 	kmutex_t *hash_lock, *lock;
2233 	uint64_t bytes_deleted = 0;
2234 	uint64_t bufs_skipped = 0;
2235 	int count = 0;
2236 	static int evict_offset;
2237 	int list_count, idx = evict_offset;
2238 	int offset, lists = 0;
2239 
2240 	ASSERT(GHOST_STATE(state));
2241 
2242 	/*
2243 	 * data lists come after metadata lists
2244 	 */
2245 	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2246 	list_count = ARC_BUFC_NUMDATALISTS;
2247 	offset = ARC_BUFC_NUMMETADATALISTS;
2248 
2249 evict_start:
2250 	list = &list_start[idx];
2251 	lock = ARCS_LOCK(state, idx + offset);
2252 
2253 	mutex_enter(lock);
2254 	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2255 		hdr_prev = list_prev(list, hdr);
2256 		if (hdr->b_type > ARC_BUFC_NUMTYPES)
2257 			panic("invalid hdr=%p", (void *)hdr);
2258 		if (spa && hdr->b_spa != spa)
2259 			continue;
2260 
2261 		/* ignore markers */
2262 		if (hdr->b_spa == 0)
2263 			continue;
2264 
2265 		hash_lock = HDR_LOCK(hdr);
2266 		/* caller may be trying to modify this buffer, skip it */
2267 		if (MUTEX_HELD(hash_lock))
2268 			continue;
2269 
2270 		/*
2271 		 * It may take a long time to evict all the bufs requested.
2272 		 * To avoid blocking all arc activity, periodically drop
2273 		 * the arcs_mtx and give other threads a chance to run
2274 		 * before reacquiring the lock.
2275 		 */
2276 		if (count++ > arc_evict_iterations) {
2277 			list_insert_after(list, hdr, &marker);
2278 			mutex_exit(lock);
2279 			kpreempt(KPREEMPT_SYNC);
2280 			mutex_enter(lock);
2281 			hdr_prev = list_prev(list, &marker);
2282 			list_remove(list, &marker);
2283 			count = 0;
2284 			continue;
2285 		}
2286 		if (mutex_tryenter(hash_lock)) {
2287 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2288 			ASSERT(hdr->b_buf == NULL);
2289 			ARCSTAT_BUMP(arcstat_deleted);
2290 			bytes_deleted += hdr->b_size;
2291 
2292 			if (hdr->b_l2hdr != NULL) {
2293 				/*
2294 				 * This buffer is cached on the 2nd Level ARC;
2295 				 * don't destroy the header.
2296 				 */
2297 				arc_change_state(arc_l2c_only, hdr, hash_lock);
2298 				mutex_exit(hash_lock);
2299 			} else {
2300 				arc_change_state(arc_anon, hdr, hash_lock);
2301 				mutex_exit(hash_lock);
2302 				arc_hdr_destroy(hdr);
2303 			}
2304 
2305 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2306 			if (bytes >= 0 && bytes_deleted >= bytes)
2307 				break;
2308 		} else if (bytes < 0) {
2309 			/*
2310 			 * Insert a list marker and then wait for the
2311 			 * hash lock to become available. Once its
2312 			 * available, restart from where we left off.
2313 			 */
2314 			list_insert_after(list, hdr, &marker);
2315 			mutex_exit(lock);
2316 			mutex_enter(hash_lock);
2317 			mutex_exit(hash_lock);
2318 			mutex_enter(lock);
2319 			hdr_prev = list_prev(list, &marker);
2320 			list_remove(list, &marker);
2321 		} else {
2322 			bufs_skipped += 1;
2323 		}
2324 
2325 	}
2326 	mutex_exit(lock);
2327 	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2328 	lists++;
2329 
2330 	if (lists < list_count)
2331 		goto evict_start;
2332 
2333 	evict_offset = idx;
2334 	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2335 	    (bytes < 0 || bytes_deleted < bytes)) {
2336 		list_start = &state->arcs_lists[0];
2337 		list_count = ARC_BUFC_NUMMETADATALISTS;
2338 		offset = lists = 0;
2339 		goto evict_start;
2340 	}
2341 
2342 	if (bufs_skipped) {
2343 		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2344 		ASSERT(bytes >= 0);
2345 	}
2346 
2347 	if (bytes_deleted < bytes)
2348 		dprintf("only deleted %lld bytes from %p",
2349 		    (longlong_t)bytes_deleted, state);
2350 }
2351 
2352 static void
arc_adjust(void)2353 arc_adjust(void)
2354 {
2355 	int64_t adjustment, delta;
2356 
2357 	/*
2358 	 * Adjust MRU size
2359 	 */
2360 
2361 	adjustment = MIN((int64_t)(arc_size - arc_c),
2362 	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2363 	    arc_p));
2364 
2365 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2366 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2367 		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2368 		adjustment -= delta;
2369 	}
2370 
2371 	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2372 		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2373 		(void) arc_evict(arc_mru, 0, delta, FALSE,
2374 		    ARC_BUFC_METADATA);
2375 	}
2376 
2377 	/*
2378 	 * Adjust MFU size
2379 	 */
2380 
2381 	adjustment = arc_size - arc_c;
2382 
2383 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2384 		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2385 		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2386 		adjustment -= delta;
2387 	}
2388 
2389 	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2390 		int64_t delta = MIN(adjustment,
2391 		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2392 		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2393 		    ARC_BUFC_METADATA);
2394 	}
2395 
2396 	/*
2397 	 * Adjust ghost lists
2398 	 */
2399 
2400 	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2401 
2402 	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2403 		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2404 		arc_evict_ghost(arc_mru_ghost, 0, delta);
2405 	}
2406 
2407 	adjustment =
2408 	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2409 
2410 	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2411 		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2412 		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2413 	}
2414 }
2415 
2416 static void
arc_do_user_evicts(void)2417 arc_do_user_evicts(void)
2418 {
2419 	static arc_buf_t *tmp_arc_eviction_list;
2420 
2421 	/*
2422 	 * Move list over to avoid LOR
2423 	 */
2424 restart:
2425 	mutex_enter(&arc_eviction_mtx);
2426 	tmp_arc_eviction_list = arc_eviction_list;
2427 	arc_eviction_list = NULL;
2428 	mutex_exit(&arc_eviction_mtx);
2429 
2430 	while (tmp_arc_eviction_list != NULL) {
2431 		arc_buf_t *buf = tmp_arc_eviction_list;
2432 		tmp_arc_eviction_list = buf->b_next;
2433 		mutex_enter(&buf->b_evict_lock);
2434 		buf->b_hdr = NULL;
2435 		mutex_exit(&buf->b_evict_lock);
2436 
2437 		if (buf->b_efunc != NULL)
2438 			VERIFY0(buf->b_efunc(buf->b_private));
2439 
2440 		buf->b_efunc = NULL;
2441 		buf->b_private = NULL;
2442 		kmem_cache_free(buf_cache, buf);
2443 	}
2444 
2445 	if (arc_eviction_list != NULL)
2446 		goto restart;
2447 }
2448 
2449 /*
2450  * Flush all *evictable* data from the cache for the given spa.
2451  * NOTE: this will not touch "active" (i.e. referenced) data.
2452  */
2453 void
arc_flush(spa_t * spa)2454 arc_flush(spa_t *spa)
2455 {
2456 	uint64_t guid = 0;
2457 
2458 	if (spa)
2459 		guid = spa_load_guid(spa);
2460 
2461 	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2462 		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2463 		if (spa)
2464 			break;
2465 	}
2466 	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2467 		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2468 		if (spa)
2469 			break;
2470 	}
2471 	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2472 		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2473 		if (spa)
2474 			break;
2475 	}
2476 	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2477 		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2478 		if (spa)
2479 			break;
2480 	}
2481 
2482 	arc_evict_ghost(arc_mru_ghost, guid, -1);
2483 	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2484 
2485 	mutex_enter(&arc_reclaim_thr_lock);
2486 	arc_do_user_evicts();
2487 	mutex_exit(&arc_reclaim_thr_lock);
2488 	ASSERT(spa || arc_eviction_list == NULL);
2489 }
2490 
2491 void
arc_shrink(void)2492 arc_shrink(void)
2493 {
2494 
2495 	if (arc_c > arc_c_min) {
2496 		uint64_t to_free;
2497 
2498 		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
2499 			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
2500 #ifdef _KERNEL
2501 		to_free = arc_c >> arc_shrink_shift;
2502 #else
2503 		to_free = arc_c >> arc_shrink_shift;
2504 #endif
2505 		if (arc_c > arc_c_min + to_free)
2506 			atomic_add_64(&arc_c, -to_free);
2507 		else
2508 			arc_c = arc_c_min;
2509 
2510 		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2511 		if (arc_c > arc_size)
2512 			arc_c = MAX(arc_size, arc_c_min);
2513 		if (arc_p > arc_c)
2514 			arc_p = (arc_c >> 1);
2515 
2516 		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
2517 			arc_p);
2518 
2519 		ASSERT(arc_c >= arc_c_min);
2520 		ASSERT((int64_t)arc_p >= 0);
2521 	}
2522 
2523 	if (arc_size > arc_c) {
2524 		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
2525 			uint64_t, arc_c);
2526 		arc_adjust();
2527 	}
2528 }
2529 
2530 static int needfree = 0;
2531 
2532 static int
arc_reclaim_needed(void)2533 arc_reclaim_needed(void)
2534 {
2535 
2536 #ifdef _KERNEL
2537 
2538 	if (needfree) {
2539 		DTRACE_PROBE(arc__reclaim_needfree);
2540 		return (1);
2541 	}
2542 
2543 	/*
2544 	 * Cooperate with pagedaemon when it's time for it to scan
2545 	 * and reclaim some pages.
2546 	 */
2547 	if (freemem < zfs_arc_free_target) {
2548 		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
2549 		    freemem, uint64_t, zfs_arc_free_target);
2550 		return (1);
2551 	}
2552 
2553 #ifdef sun
2554 	/*
2555 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2556 	 */
2557 	extra = desfree;
2558 
2559 	/*
2560 	 * check that we're out of range of the pageout scanner.  It starts to
2561 	 * schedule paging if freemem is less than lotsfree and needfree.
2562 	 * lotsfree is the high-water mark for pageout, and needfree is the
2563 	 * number of needed free pages.  We add extra pages here to make sure
2564 	 * the scanner doesn't start up while we're freeing memory.
2565 	 */
2566 	if (freemem < lotsfree + needfree + extra)
2567 		return (1);
2568 
2569 	/*
2570 	 * check to make sure that swapfs has enough space so that anon
2571 	 * reservations can still succeed. anon_resvmem() checks that the
2572 	 * availrmem is greater than swapfs_minfree, and the number of reserved
2573 	 * swap pages.  We also add a bit of extra here just to prevent
2574 	 * circumstances from getting really dire.
2575 	 */
2576 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2577 		return (1);
2578 
2579 	/*
2580 	 * Check that we have enough availrmem that memory locking (e.g., via
2581 	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
2582 	 * stores the number of pages that cannot be locked; when availrmem
2583 	 * drops below pages_pp_maximum, page locking mechanisms such as
2584 	 * page_pp_lock() will fail.)
2585 	 */
2586 	if (availrmem <= pages_pp_maximum)
2587 		return (1);
2588 
2589 #endif	/* sun */
2590 #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
2591 	/*
2592 	 * If we're on an i386 platform, it's possible that we'll exhaust the
2593 	 * kernel heap space before we ever run out of available physical
2594 	 * memory.  Most checks of the size of the heap_area compare against
2595 	 * tune.t_minarmem, which is the minimum available real memory that we
2596 	 * can have in the system.  However, this is generally fixed at 25 pages
2597 	 * which is so low that it's useless.  In this comparison, we seek to
2598 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2599 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2600 	 * free)
2601 	 */
2602 	if (vmem_size(heap_arena, VMEM_FREE) <
2603 	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
2604 		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
2605 		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
2606 		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
2607 		return (1);
2608 	}
2609 #define	zio_arena	NULL
2610 #else
2611 #define	zio_arena	heap_arena
2612 #endif
2613 
2614 	/*
2615 	 * If zio data pages are being allocated out of a separate heap segment,
2616 	 * then enforce that the size of available vmem for this arena remains
2617 	 * above about 1/16th free.
2618 	 *
2619 	 * Note: The 1/16th arena free requirement was put in place
2620 	 * to aggressively evict memory from the arc in order to avoid
2621 	 * memory fragmentation issues.
2622 	 */
2623 	if (zio_arena != NULL &&
2624 	    vmem_size(zio_arena, VMEM_FREE) <
2625 	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2626 		return (1);
2627 
2628 	/*
2629 	 * Above limits know nothing about real level of KVA fragmentation.
2630 	 * Start aggressive reclamation if too little sequential KVA left.
2631 	 */
2632 	if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize)
2633 		return (1);
2634 
2635 #else	/* _KERNEL */
2636 	if (spa_get_random(100) == 0)
2637 		return (1);
2638 #endif	/* _KERNEL */
2639 	DTRACE_PROBE(arc__reclaim_no);
2640 
2641 	return (0);
2642 }
2643 
2644 extern kmem_cache_t	*zio_buf_cache[];
2645 extern kmem_cache_t	*zio_data_buf_cache[];
2646 extern kmem_cache_t	*range_seg_cache;
2647 
2648 static void __noinline
arc_kmem_reap_now(arc_reclaim_strategy_t strat)2649 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2650 {
2651 	size_t			i;
2652 	kmem_cache_t		*prev_cache = NULL;
2653 	kmem_cache_t		*prev_data_cache = NULL;
2654 
2655 	DTRACE_PROBE(arc__kmem_reap_start);
2656 #ifdef _KERNEL
2657 	if (arc_meta_used >= arc_meta_limit) {
2658 		/*
2659 		 * We are exceeding our meta-data cache limit.
2660 		 * Purge some DNLC entries to release holds on meta-data.
2661 		 */
2662 		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2663 	}
2664 #if defined(__i386)
2665 	/*
2666 	 * Reclaim unused memory from all kmem caches.
2667 	 */
2668 	kmem_reap();
2669 #endif
2670 #endif
2671 
2672 	/*
2673 	 * An aggressive reclamation will shrink the cache size as well as
2674 	 * reap free buffers from the arc kmem caches.
2675 	 */
2676 	if (strat == ARC_RECLAIM_AGGR)
2677 		arc_shrink();
2678 
2679 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2680 		if (zio_buf_cache[i] != prev_cache) {
2681 			prev_cache = zio_buf_cache[i];
2682 			kmem_cache_reap_now(zio_buf_cache[i]);
2683 		}
2684 		if (zio_data_buf_cache[i] != prev_data_cache) {
2685 			prev_data_cache = zio_data_buf_cache[i];
2686 			kmem_cache_reap_now(zio_data_buf_cache[i]);
2687 		}
2688 	}
2689 	kmem_cache_reap_now(buf_cache);
2690 	kmem_cache_reap_now(hdr_cache);
2691 	kmem_cache_reap_now(range_seg_cache);
2692 
2693 #ifdef sun
2694 	/*
2695 	 * Ask the vmem arena to reclaim unused memory from its
2696 	 * quantum caches.
2697 	 */
2698 	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2699 		vmem_qcache_reap(zio_arena);
2700 #endif
2701 	DTRACE_PROBE(arc__kmem_reap_end);
2702 }
2703 
2704 static void
arc_reclaim_thread(void * dummy __unused)2705 arc_reclaim_thread(void *dummy __unused)
2706 {
2707 	clock_t			growtime = 0;
2708 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2709 	callb_cpr_t		cpr;
2710 
2711 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2712 
2713 	mutex_enter(&arc_reclaim_thr_lock);
2714 	while (arc_thread_exit == 0) {
2715 		if (arc_reclaim_needed()) {
2716 
2717 			if (arc_no_grow) {
2718 				if (last_reclaim == ARC_RECLAIM_CONS) {
2719 					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
2720 					last_reclaim = ARC_RECLAIM_AGGR;
2721 				} else {
2722 					last_reclaim = ARC_RECLAIM_CONS;
2723 				}
2724 			} else {
2725 				arc_no_grow = TRUE;
2726 				last_reclaim = ARC_RECLAIM_AGGR;
2727 				DTRACE_PROBE(arc__reclaim_aggr);
2728 				membar_producer();
2729 			}
2730 
2731 			/* reset the growth delay for every reclaim */
2732 			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2733 
2734 			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2735 				/*
2736 				 * If needfree is TRUE our vm_lowmem hook
2737 				 * was called and in that case we must free some
2738 				 * memory, so switch to aggressive mode.
2739 				 */
2740 				arc_no_grow = TRUE;
2741 				last_reclaim = ARC_RECLAIM_AGGR;
2742 			}
2743 			arc_kmem_reap_now(last_reclaim);
2744 			arc_warm = B_TRUE;
2745 
2746 		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2747 			arc_no_grow = FALSE;
2748 		}
2749 
2750 		arc_adjust();
2751 
2752 		if (arc_eviction_list != NULL)
2753 			arc_do_user_evicts();
2754 
2755 #ifdef _KERNEL
2756 		if (needfree) {
2757 			needfree = 0;
2758 			wakeup(&needfree);
2759 		}
2760 #endif
2761 
2762 		/* block until needed, or one second, whichever is shorter */
2763 		CALLB_CPR_SAFE_BEGIN(&cpr);
2764 		(void) cv_timedwait(&arc_reclaim_thr_cv,
2765 		    &arc_reclaim_thr_lock, hz);
2766 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2767 	}
2768 
2769 	arc_thread_exit = 0;
2770 	cv_broadcast(&arc_reclaim_thr_cv);
2771 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2772 	thread_exit();
2773 }
2774 
2775 /*
2776  * Adapt arc info given the number of bytes we are trying to add and
2777  * the state that we are comming from.  This function is only called
2778  * when we are adding new content to the cache.
2779  */
2780 static void
arc_adapt(int bytes,arc_state_t * state)2781 arc_adapt(int bytes, arc_state_t *state)
2782 {
2783 	int mult;
2784 	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2785 
2786 	if (state == arc_l2c_only)
2787 		return;
2788 
2789 	ASSERT(bytes > 0);
2790 	/*
2791 	 * Adapt the target size of the MRU list:
2792 	 *	- if we just hit in the MRU ghost list, then increase
2793 	 *	  the target size of the MRU list.
2794 	 *	- if we just hit in the MFU ghost list, then increase
2795 	 *	  the target size of the MFU list by decreasing the
2796 	 *	  target size of the MRU list.
2797 	 */
2798 	if (state == arc_mru_ghost) {
2799 		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2800 		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2801 		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2802 
2803 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2804 	} else if (state == arc_mfu_ghost) {
2805 		uint64_t delta;
2806 
2807 		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2808 		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2809 		mult = MIN(mult, 10);
2810 
2811 		delta = MIN(bytes * mult, arc_p);
2812 		arc_p = MAX(arc_p_min, arc_p - delta);
2813 	}
2814 	ASSERT((int64_t)arc_p >= 0);
2815 
2816 	if (arc_reclaim_needed()) {
2817 		cv_signal(&arc_reclaim_thr_cv);
2818 		return;
2819 	}
2820 
2821 	if (arc_no_grow)
2822 		return;
2823 
2824 	if (arc_c >= arc_c_max)
2825 		return;
2826 
2827 	/*
2828 	 * If we're within (2 * maxblocksize) bytes of the target
2829 	 * cache size, increment the target cache size
2830 	 */
2831 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2832 		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
2833 		atomic_add_64(&arc_c, (int64_t)bytes);
2834 		if (arc_c > arc_c_max)
2835 			arc_c = arc_c_max;
2836 		else if (state == arc_anon)
2837 			atomic_add_64(&arc_p, (int64_t)bytes);
2838 		if (arc_p > arc_c)
2839 			arc_p = arc_c;
2840 	}
2841 	ASSERT((int64_t)arc_p >= 0);
2842 }
2843 
2844 /*
2845  * Check if the cache has reached its limits and eviction is required
2846  * prior to insert.
2847  */
2848 static int
arc_evict_needed(arc_buf_contents_t type)2849 arc_evict_needed(arc_buf_contents_t type)
2850 {
2851 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2852 		return (1);
2853 
2854 	if (arc_reclaim_needed())
2855 		return (1);
2856 
2857 	return (arc_size > arc_c);
2858 }
2859 
2860 /*
2861  * The buffer, supplied as the first argument, needs a data block.
2862  * So, if we are at cache max, determine which cache should be victimized.
2863  * We have the following cases:
2864  *
2865  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2866  * In this situation if we're out of space, but the resident size of the MFU is
2867  * under the limit, victimize the MFU cache to satisfy this insertion request.
2868  *
2869  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2870  * Here, we've used up all of the available space for the MRU, so we need to
2871  * evict from our own cache instead.  Evict from the set of resident MRU
2872  * entries.
2873  *
2874  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2875  * c minus p represents the MFU space in the cache, since p is the size of the
2876  * cache that is dedicated to the MRU.  In this situation there's still space on
2877  * the MFU side, so the MRU side needs to be victimized.
2878  *
2879  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2880  * MFU's resident set is consuming more space than it has been allotted.  In
2881  * this situation, we must victimize our own cache, the MFU, for this insertion.
2882  */
2883 static void
arc_get_data_buf(arc_buf_t * buf)2884 arc_get_data_buf(arc_buf_t *buf)
2885 {
2886 	arc_state_t		*state = buf->b_hdr->b_state;
2887 	uint64_t		size = buf->b_hdr->b_size;
2888 	arc_buf_contents_t	type = buf->b_hdr->b_type;
2889 
2890 	arc_adapt(size, state);
2891 
2892 	/*
2893 	 * We have not yet reached cache maximum size,
2894 	 * just allocate a new buffer.
2895 	 */
2896 	if (!arc_evict_needed(type)) {
2897 		if (type == ARC_BUFC_METADATA) {
2898 			buf->b_data = zio_buf_alloc(size);
2899 			arc_space_consume(size, ARC_SPACE_DATA);
2900 		} else {
2901 			ASSERT(type == ARC_BUFC_DATA);
2902 			buf->b_data = zio_data_buf_alloc(size);
2903 			ARCSTAT_INCR(arcstat_data_size, size);
2904 			atomic_add_64(&arc_size, size);
2905 		}
2906 		goto out;
2907 	}
2908 
2909 	/*
2910 	 * If we are prefetching from the mfu ghost list, this buffer
2911 	 * will end up on the mru list; so steal space from there.
2912 	 */
2913 	if (state == arc_mfu_ghost)
2914 		state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ?
2915 		    arc_mru : arc_mfu;
2916 	else if (state == arc_mru_ghost)
2917 		state = arc_mru;
2918 
2919 	if (state == arc_mru || state == arc_anon) {
2920 		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2921 		state = (arc_mfu->arcs_lsize[type] >= size &&
2922 		    arc_p > mru_used) ? arc_mfu : arc_mru;
2923 	} else {
2924 		/* MFU cases */
2925 		uint64_t mfu_space = arc_c - arc_p;
2926 		state =  (arc_mru->arcs_lsize[type] >= size &&
2927 		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2928 	}
2929 	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2930 		if (type == ARC_BUFC_METADATA) {
2931 			buf->b_data = zio_buf_alloc(size);
2932 			arc_space_consume(size, ARC_SPACE_DATA);
2933 		} else {
2934 			ASSERT(type == ARC_BUFC_DATA);
2935 			buf->b_data = zio_data_buf_alloc(size);
2936 			ARCSTAT_INCR(arcstat_data_size, size);
2937 			atomic_add_64(&arc_size, size);
2938 		}
2939 		ARCSTAT_BUMP(arcstat_recycle_miss);
2940 	}
2941 	ASSERT(buf->b_data != NULL);
2942 out:
2943 	/*
2944 	 * Update the state size.  Note that ghost states have a
2945 	 * "ghost size" and so don't need to be updated.
2946 	 */
2947 	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2948 		arc_buf_hdr_t *hdr = buf->b_hdr;
2949 
2950 		atomic_add_64(&hdr->b_state->arcs_size, size);
2951 		if (list_link_active(&hdr->b_arc_node)) {
2952 			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2953 			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2954 		}
2955 		/*
2956 		 * If we are growing the cache, and we are adding anonymous
2957 		 * data, and we have outgrown arc_p, update arc_p
2958 		 */
2959 		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2960 		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2961 			arc_p = MIN(arc_c, arc_p + size);
2962 	}
2963 	ARCSTAT_BUMP(arcstat_allocated);
2964 }
2965 
2966 /*
2967  * This routine is called whenever a buffer is accessed.
2968  * NOTE: the hash lock is dropped in this function.
2969  */
2970 static void
arc_access(arc_buf_hdr_t * hdr,kmutex_t * hash_lock)2971 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
2972 {
2973 	clock_t now;
2974 
2975 	ASSERT(MUTEX_HELD(hash_lock));
2976 
2977 	if (hdr->b_state == arc_anon) {
2978 		/*
2979 		 * This buffer is not in the cache, and does not
2980 		 * appear in our "ghost" list.  Add the new buffer
2981 		 * to the MRU state.
2982 		 */
2983 
2984 		ASSERT(hdr->b_arc_access == 0);
2985 		hdr->b_arc_access = ddi_get_lbolt();
2986 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
2987 		arc_change_state(arc_mru, hdr, hash_lock);
2988 
2989 	} else if (hdr->b_state == arc_mru) {
2990 		now = ddi_get_lbolt();
2991 
2992 		/*
2993 		 * If this buffer is here because of a prefetch, then either:
2994 		 * - clear the flag if this is a "referencing" read
2995 		 *   (any subsequent access will bump this into the MFU state).
2996 		 * or
2997 		 * - move the buffer to the head of the list if this is
2998 		 *   another prefetch (to make it less likely to be evicted).
2999 		 */
3000 		if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) {
3001 			if (refcount_count(&hdr->b_refcnt) == 0) {
3002 				ASSERT(list_link_active(&hdr->b_arc_node));
3003 			} else {
3004 				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3005 				ARCSTAT_BUMP(arcstat_mru_hits);
3006 			}
3007 			hdr->b_arc_access = now;
3008 			return;
3009 		}
3010 
3011 		/*
3012 		 * This buffer has been "accessed" only once so far,
3013 		 * but it is still in the cache. Move it to the MFU
3014 		 * state.
3015 		 */
3016 		if (now > hdr->b_arc_access + ARC_MINTIME) {
3017 			/*
3018 			 * More than 125ms have passed since we
3019 			 * instantiated this buffer.  Move it to the
3020 			 * most frequently used state.
3021 			 */
3022 			hdr->b_arc_access = now;
3023 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3024 			arc_change_state(arc_mfu, hdr, hash_lock);
3025 		}
3026 		ARCSTAT_BUMP(arcstat_mru_hits);
3027 	} else if (hdr->b_state == arc_mru_ghost) {
3028 		arc_state_t	*new_state;
3029 		/*
3030 		 * This buffer has been "accessed" recently, but
3031 		 * was evicted from the cache.  Move it to the
3032 		 * MFU state.
3033 		 */
3034 
3035 		if (hdr->b_flags & ARC_FLAG_PREFETCH) {
3036 			new_state = arc_mru;
3037 			if (refcount_count(&hdr->b_refcnt) > 0)
3038 				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3039 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3040 		} else {
3041 			new_state = arc_mfu;
3042 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3043 		}
3044 
3045 		hdr->b_arc_access = ddi_get_lbolt();
3046 		arc_change_state(new_state, hdr, hash_lock);
3047 
3048 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3049 	} else if (hdr->b_state == arc_mfu) {
3050 		/*
3051 		 * This buffer has been accessed more than once and is
3052 		 * still in the cache.  Keep it in the MFU state.
3053 		 *
3054 		 * NOTE: an add_reference() that occurred when we did
3055 		 * the arc_read() will have kicked this off the list.
3056 		 * If it was a prefetch, we will explicitly move it to
3057 		 * the head of the list now.
3058 		 */
3059 		if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) {
3060 			ASSERT(refcount_count(&hdr->b_refcnt) == 0);
3061 			ASSERT(list_link_active(&hdr->b_arc_node));
3062 		}
3063 		ARCSTAT_BUMP(arcstat_mfu_hits);
3064 		hdr->b_arc_access = ddi_get_lbolt();
3065 	} else if (hdr->b_state == arc_mfu_ghost) {
3066 		arc_state_t	*new_state = arc_mfu;
3067 		/*
3068 		 * This buffer has been accessed more than once but has
3069 		 * been evicted from the cache.  Move it back to the
3070 		 * MFU state.
3071 		 */
3072 
3073 		if (hdr->b_flags & ARC_FLAG_PREFETCH) {
3074 			/*
3075 			 * This is a prefetch access...
3076 			 * move this block back to the MRU state.
3077 			 */
3078 			ASSERT0(refcount_count(&hdr->b_refcnt));
3079 			new_state = arc_mru;
3080 		}
3081 
3082 		hdr->b_arc_access = ddi_get_lbolt();
3083 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3084 		arc_change_state(new_state, hdr, hash_lock);
3085 
3086 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3087 	} else if (hdr->b_state == arc_l2c_only) {
3088 		/*
3089 		 * This buffer is on the 2nd Level ARC.
3090 		 */
3091 
3092 		hdr->b_arc_access = ddi_get_lbolt();
3093 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3094 		arc_change_state(arc_mfu, hdr, hash_lock);
3095 	} else {
3096 		ASSERT(!"invalid arc state");
3097 	}
3098 }
3099 
3100 /* a generic arc_done_func_t which you can use */
3101 /* ARGSUSED */
3102 void
arc_bcopy_func(zio_t * zio,arc_buf_t * buf,void * arg)3103 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3104 {
3105 	if (zio == NULL || zio->io_error == 0)
3106 		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3107 	VERIFY(arc_buf_remove_ref(buf, arg));
3108 }
3109 
3110 /* a generic arc_done_func_t */
3111 void
arc_getbuf_func(zio_t * zio,arc_buf_t * buf,void * arg)3112 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3113 {
3114 	arc_buf_t **bufp = arg;
3115 	if (zio && zio->io_error) {
3116 		VERIFY(arc_buf_remove_ref(buf, arg));
3117 		*bufp = NULL;
3118 	} else {
3119 		*bufp = buf;
3120 		ASSERT(buf->b_data);
3121 	}
3122 }
3123 
/*
 * Completion callback for the read that arc_read() issues through
 * zio_read(), which is given this function as its done callback and the
 * arc_buf_t as zio->io_private.
 *
 * In order, this function:
 *	- looks the header up in the hash table (taking its hash lock) if
 *	  the header is still hashed;
 *	- byteswaps the data when the block pointer requires it and the
 *	  read succeeded, then computes the buffer checksum;
 *	- calls arc_access() for a successfully read header that is still
 *	  in the arc_anon state;
 *	- assigns a buffer to every queued callback that has a done
 *	  function (the original buffer to the first one, clones to the
 *	  rest);
 *	- clears ARC_FLAG_IO_IN_PROGRESS and, on error, marks the header
 *	  ARC_FLAG_IO_ERROR, moves it to arc_anon and unhashes it;
 *	- broadcasts hdr->b_cv, drops the hash lock, then runs and frees
 *	  the callbacks;
 *	- destroys the header if it ended up unreferenced after an error
 *	  or after having been freed during the read.
 */
3124 static void
arc_read_done(zio_t * zio)3125 arc_read_done(zio_t *zio)
3126 {
3127 	arc_buf_hdr_t	*hdr;
3128 	arc_buf_t	*buf;
3129 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3130 	kmutex_t	*hash_lock = NULL;
3131 	arc_callback_t	*callback_list, *acb;
3132 	int		freeable = FALSE;
3133 
3134 	buf = zio->io_private;
3135 	hdr = buf->b_hdr;
3136 
3137 	/*
3138 	 * The hdr was inserted into hash-table and removed from lists
3139 	 * prior to starting I/O.  We should find this header, since
3140 	 * it's in the hash table, and it should be legit since it's
3141 	 * not possible to evict it during the I/O.  The only possible
3142 	 * reason for it not to be found is if we were freed during the
3143 	 * read.
3144 	 */
3145 	if (HDR_IN_HASH_TABLE(hdr)) {
3146 		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3147 		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3148 		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3149 		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3150 		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3151 
		/*
		 * hash_lock is filled in by buf_hash_find(); the assertion
		 * below expects it to stay NULL when nothing is found.
		 */
3152 		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3153 		    &hash_lock);
3154 
3155 		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3156 		    hash_lock == NULL) ||
3157 		    (found == hdr &&
3158 		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3159 		    (found == hdr && HDR_L2_READING(hdr)));
3160 	}
3161 
3162 	hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
	/* with l2arc_noprefetch set, prefetched headers lose ARC_FLAG_L2CACHE */
3163 	if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH))
3164 		hdr->b_flags &= ~ARC_FLAG_L2CACHE;
3165 
3166 	/* byteswap if necessary */
3167 	callback_list = hdr->b_acb;
3168 	ASSERT(callback_list != NULL);
3169 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
		/*
		 * Blocks above level 0 are swapped as arrays of uint64_t;
		 * level-0 blocks use the byteswap function registered for
		 * their object type.
		 */
3170 		dmu_object_byteswap_t bswap =
3171 		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3172 		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3173 		    byteswap_uint64_array :
3174 		    dmu_ot_byteswap[bswap].ob_func;
3175 		func(buf->b_data, hdr->b_size);
3176 	}
3177 
3178 	arc_cksum_compute(buf, B_FALSE);
3179 #ifdef illumos
3180 	arc_buf_watch(buf);
3181 #endif /* illumos */
3182 
3183 	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3184 		/*
3185 		 * Only call arc_access on anonymous buffers.  This is because
3186 		 * if we've issued an I/O for an evicted buffer, we've already
3187 		 * called arc_access (to prevent any simultaneous readers from
3188 		 * getting confused).
3189 		 */
3190 		arc_access(hdr, hash_lock);
3191 	}
3192 
3193 	/* create copies of the data buffer for the callers */
	/*
	 * The first callback with a done function receives `buf' itself;
	 * each later one receives a clone.  abuf is reset to NULL after
	 * every hand-out, so it still equals buf afterwards only if no
	 * callback had a done function.
	 */
3194 	abuf = buf;
3195 	for (acb = callback_list; acb; acb = acb->acb_next) {
3196 		if (acb->acb_done) {
3197 			if (abuf == NULL) {
3198 				ARCSTAT_BUMP(arcstat_duplicate_reads);
3199 				abuf = arc_buf_clone(buf);
3200 			}
3201 			acb->acb_buf = abuf;
3202 			abuf = NULL;
3203 		}
3204 	}
3205 	hdr->b_acb = NULL;
3206 	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3207 	ASSERT(!HDR_BUF_AVAILABLE(hdr));
	/* nobody took the original buffer: mark it as available */
3208 	if (abuf == buf) {
3209 		ASSERT(buf->b_efunc == NULL);
3210 		ASSERT(hdr->b_datacnt == 1);
3211 		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3212 	}
3213 
3214 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3215 
	/*
	 * Failed read: flag the header, make it anonymous and remove it
	 * from the hash table.  It can be destroyed at the end of this
	 * function if nothing references it.
	 */
3216 	if (zio->io_error != 0) {
3217 		hdr->b_flags |= ARC_FLAG_IO_ERROR;
3218 		if (hdr->b_state != arc_anon)
3219 			arc_change_state(arc_anon, hdr, hash_lock);
3220 		if (HDR_IN_HASH_TABLE(hdr))
3221 			buf_hash_remove(hdr);
3222 		freeable = refcount_is_zero(&hdr->b_refcnt);
3223 	}
3224 
3225 	/*
3226 	 * Broadcast before we drop the hash_lock to avoid the possibility
3227 	 * that the hdr (and hence the cv) might be freed before we get to
3228 	 * the cv_broadcast().
3229 	 */
3230 	cv_broadcast(&hdr->b_cv);
3231 
3232 	if (hash_lock) {
3233 		mutex_exit(hash_lock);
3234 	} else {
3235 		/*
3236 		 * This block was freed while we waited for the read to
3237 		 * complete.  It has been removed from the hash table and
3238 		 * moved to the anonymous state (so that it won't show up
3239 		 * in the cache).
3240 		 */
3241 		ASSERT3P(hdr->b_state, ==, arc_anon);
3242 		freeable = refcount_is_zero(&hdr->b_refcnt);
3243 	}
3244 
3245 	/* execute each callback and free its structure */
	/*
	 * This runs after the hash lock has been dropped.  A callback's
	 * dummy zio, if any, inherits this read's error before being
	 * issued.
	 */
3246 	while ((acb = callback_list) != NULL) {
3247 		if (acb->acb_done)
3248 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3249 
3250 		if (acb->acb_zio_dummy != NULL) {
3251 			acb->acb_zio_dummy->io_error = zio->io_error;
3252 			zio_nowait(acb->acb_zio_dummy);
3253 		}
3254 
3255 		callback_list = acb->acb_next;
3256 		kmem_free(acb, sizeof (arc_callback_t));
3257 	}
3258 
3259 	if (freeable)
3260 		arc_hdr_destroy(hdr);
3261 }
3262 
3263 /*
3264  * "Read" the block at the specified DVA (in bp) via the
3265  * cache.  If the block is found in the cache, invoke the provided
3266  * callback immediately and return.  Note that the `zio' parameter
3267  * in the callback will be NULL in this case, since no IO was
3268  * required.  If the block is not in the cache pass the read request
3269  * on to the spa with a substitute callback function, so that the
3270  * requested block will be added to the cache.
3271  *
3272  * If a read request arrives for a block that has a read in-progress,
3273  * either wait for the in-progress read to complete (and return the
3274  * results); or, if this is a read with a "done" func, add a record
3275  * to the read to invoke the "done" func when the read completes,
3276  * and return; or just return.
3277  *
3278  * arc_read_done() will invoke all the requested "done" functions
3279  * for readers of this block.
 *
 * Parameters as used below:
 *	pio		parent zio handed to the zios created here
 *			(may be NULL on the in-progress path)
 *	done, private	completion callback and its argument; `private'
 *			is also the tag for the reference taken on
 *			behalf of the reader
 *	priority,	passed through to the read zios
 *	zio_flags
 *	arc_flags	in/out: ARC_FLAG_WAIT or ARC_FLAG_NOWAIT selects
 *			blocking vs. asynchronous behaviour;
 *			ARC_FLAG_PREFETCH, ARC_FLAG_L2CACHE and
 *			ARC_FLAG_L2COMPRESS are copied onto the header;
 *			ARC_FLAG_CACHED is set in *arc_flags on a hit
 *	zb		bookmark, dereferenced on the L2ARC path and
 *			passed to zio_read()
 *
 * Returns 0, except that an ARC_FLAG_WAIT read issued to the pool with
 * zio_read() returns the result of zio_wait().
3280  */
3281 int
arc_read(zio_t * pio,spa_t * spa,const blkptr_t * bp,arc_done_func_t * done,void * private,zio_priority_t priority,int zio_flags,arc_flags_t * arc_flags,const zbookmark_phys_t * zb)3282 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3283     void *private, zio_priority_t priority, int zio_flags,
3284     arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
3285 {
3286 	arc_buf_hdr_t *hdr = NULL;
3287 	arc_buf_t *buf = NULL;
3288 	kmutex_t *hash_lock = NULL;
3289 	zio_t *rzio;
3290 	uint64_t guid = spa_load_guid(spa);
3291 
3292 	ASSERT(!BP_IS_EMBEDDED(bp) ||
3293 	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3294 
3295 top:
3296 	if (!BP_IS_EMBEDDED(bp)) {
3297 		/*
3298 		 * Embedded BP's have no DVA and require no I/O to "read".
3299 		 * Create an anonymous arc buf to back it.
		 * Hence only non-embedded BP's are looked up in the hash
		 * table here; for an embedded BP hdr stays NULL and the
		 * miss path below allocates the buffer.
3300 		 */
3301 		hdr = buf_hash_find(guid, bp, &hash_lock);
3302 	}
3303 
	/*
	 * Cache hit: a header was found and has at least one data buffer
	 * attached.
	 */
3304 	if (hdr != NULL && hdr->b_datacnt > 0) {
3305 
3306 		*arc_flags |= ARC_FLAG_CACHED;
3307 
3308 		if (HDR_IO_IN_PROGRESS(hdr)) {
3309 
			/*
			 * A read of this block is already in flight.  A
			 * blocking reader sleeps on the header's cv (which
			 * arc_read_done() broadcasts) and then retries the
			 * lookup from the top.
			 */
3310 			if (*arc_flags & ARC_FLAG_WAIT) {
3311 				cv_wait(&hdr->b_cv, hash_lock);
3312 				mutex_exit(hash_lock);
3313 				goto top;
3314 			}
3315 			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3316 
			/*
			 * Asynchronous reader with a done function: queue a
			 * callback on the in-flight read and take a
			 * reference tagged with `private'.
			 */
3317 			if (done) {
3318 				arc_callback_t	*acb = NULL;
3319 
3320 				acb = kmem_zalloc(sizeof (arc_callback_t),
3321 				    KM_SLEEP);
3322 				acb->acb_done = done;
3323 				acb->acb_private = private;
3324 				if (pio != NULL)
3325 					acb->acb_zio_dummy = zio_null(pio,
3326 					    spa, NULL, NULL, NULL, zio_flags);
3327 
3328 				ASSERT(acb->acb_done != NULL);
3329 				acb->acb_next = hdr->b_acb;
3330 				hdr->b_acb = acb;
3331 				add_reference(hdr, hash_lock, private);
3332 				mutex_exit(hash_lock);
3333 				return (0);
3334 			}
			/* no done function: nothing to register, just return */
3335 			mutex_exit(hash_lock);
3336 			return (0);
3337 		}
3338 
3339 		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3340 
3341 		if (done) {
3342 			add_reference(hdr, hash_lock, private);
3343 			/*
3344 			 * If this block is already in use, create a new
3345 			 * copy of the data so that we will be guaranteed
3346 			 * that arc_release() will always succeed.
3347 			 */
3348 			buf = hdr->b_buf;
3349 			ASSERT(buf);
3350 			ASSERT(buf->b_data);
3351 			if (HDR_BUF_AVAILABLE(hdr)) {
3352 				ASSERT(buf->b_efunc == NULL);
3353 				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3354 			} else {
3355 				buf = arc_buf_clone(buf);
3356 			}
3357 
3358 		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
3359 		    refcount_count(&hdr->b_refcnt) == 0) {
			/* prefetch of an unreferenced header: tag it as such */
3360 			hdr->b_flags |= ARC_FLAG_PREFETCH;
3361 		}
3362 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3363 		arc_access(hdr, hash_lock);
3364 		if (*arc_flags & ARC_FLAG_L2CACHE)
3365 			hdr->b_flags |= ARC_FLAG_L2CACHE;
3366 		if (*arc_flags & ARC_FLAG_L2COMPRESS)
3367 			hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3368 		mutex_exit(hash_lock);
3369 		ARCSTAT_BUMP(arcstat_hits);
3370 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
3371 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3372 		    data, metadata, hits);
3373 
		/* the callback runs without the hash lock and with a NULL zio */
3374 		if (done)
3375 			done(NULL, buf, private);
3376 	} else {
		/*
		 * Cache miss: either no header exists, or the header found
		 * has no data attached (asserted below to be in a ghost
		 * state).
		 */
3377 		uint64_t size = BP_GET_LSIZE(bp);
3378 		arc_callback_t *acb;
3379 		vdev_t *vd = NULL;
3380 		uint64_t addr = 0;
3381 		boolean_t devw = B_FALSE;
3382 		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3383 		uint64_t b_asize = 0;
3384 
3385 		if (hdr == NULL) {
3386 			/* this block is not in the cache */
3387 			arc_buf_hdr_t *exists = NULL;
3388 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3389 			buf = arc_buf_alloc(spa, size, private, type);
3390 			hdr = buf->b_hdr;
			/*
			 * Only non-embedded BP's get an identity and a hash
			 * table entry; for an embedded BP hash_lock stays
			 * NULL.
			 */
3391 			if (!BP_IS_EMBEDDED(bp)) {
3392 				hdr->b_dva = *BP_IDENTITY(bp);
3393 				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3394 				hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3395 				exists = buf_hash_insert(hdr, &hash_lock);
3396 			}
3397 			if (exists != NULL) {
3398 				/* somebody beat us to the hash insert */
3399 				mutex_exit(hash_lock);
3400 				buf_discard_identity(hdr);
3401 				(void) arc_buf_remove_ref(buf, private);
3402 				goto top; /* restart the IO request */
3403 			}
3404 
3405 			/* if this is a prefetch, we don't have a reference */
3406 			if (*arc_flags & ARC_FLAG_PREFETCH) {
3407 				(void) remove_reference(hdr, hash_lock,
3408 				    private);
3409 				hdr->b_flags |= ARC_FLAG_PREFETCH;
3410 			}
3411 			if (*arc_flags & ARC_FLAG_L2CACHE)
3412 				hdr->b_flags |= ARC_FLAG_L2CACHE;
3413 			if (*arc_flags & ARC_FLAG_L2COMPRESS)
3414 				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3415 			if (BP_GET_LEVEL(bp) > 0)
3416 				hdr->b_flags |= ARC_FLAG_INDIRECT;
3417 		} else {
3418 			/* this block is in the ghost cache */
3419 			ASSERT(GHOST_STATE(hdr->b_state));
3420 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3421 			ASSERT0(refcount_count(&hdr->b_refcnt));
3422 			ASSERT(hdr->b_buf == NULL);
3423 
3424 			/* if this is a prefetch, we don't have a reference */
3425 			if (*arc_flags & ARC_FLAG_PREFETCH)
3426 				hdr->b_flags |= ARC_FLAG_PREFETCH;
3427 			else
3428 				add_reference(hdr, hash_lock, private);
3429 			if (*arc_flags & ARC_FLAG_L2CACHE)
3430 				hdr->b_flags |= ARC_FLAG_L2CACHE;
3431 			if (*arc_flags & ARC_FLAG_L2COMPRESS)
3432 				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
			/*
			 * Attach a fresh arc_buf_t to the existing header,
			 * obtain data memory for it, then let arc_access()
			 * update the header's state (asserted just below
			 * to be non-ghost afterwards).
			 */
3433 			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3434 			buf->b_hdr = hdr;
3435 			buf->b_data = NULL;
3436 			buf->b_efunc = NULL;
3437 			buf->b_private = NULL;
3438 			buf->b_next = NULL;
3439 			hdr->b_buf = buf;
3440 			ASSERT(hdr->b_datacnt == 0);
3441 			hdr->b_datacnt = 1;
3442 			arc_get_data_buf(buf);
3443 			arc_access(hdr, hash_lock);
3444 		}
3445 
3446 		ASSERT(!GHOST_STATE(hdr->b_state));
3447 
		/*
		 * Register the callback for this read and mark the header
		 * as having I/O in progress.
		 */
3448 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3449 		acb->acb_done = done;
3450 		acb->acb_private = private;
3451 
3452 		ASSERT(hdr->b_acb == NULL);
3453 		hdr->b_acb = acb;
3454 		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
3455 
		/*
		 * Copy the L2ARC location into locals before the hash lock
		 * is dropped below.
		 */
3456 		if (hdr->b_l2hdr != NULL &&
3457 		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3458 			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3459 			addr = hdr->b_l2hdr->b_daddr;
3460 			b_compress = hdr->b_l2hdr->b_compress;
3461 			b_asize = hdr->b_l2hdr->b_asize;
3462 			/*
3463 			 * Lock out device removal.
3464 			 */
3465 			if (vdev_is_dead(vd) ||
3466 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3467 				vd = NULL;
3468 		}
3469 
3470 		if (hash_lock != NULL)
3471 			mutex_exit(hash_lock);
3472 
3473 		/*
3474 		 * At this point, we have a level 1 cache miss.  Try again in
3475 		 * L2ARC if possible.
3476 		 */
3477 		ASSERT3U(hdr->b_size, ==, size);
3478 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3479 		    uint64_t, size, zbookmark_phys_t *, zb);
3480 		ARCSTAT_BUMP(arcstat_misses);
3481 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
3482 		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3483 		    data, metadata, misses);
3484 #ifdef _KERNEL
3485 		curthread->td_ru.ru_inblock++;
3486 #endif
3487 
		/*
		 * vd != NULL here means the SCL_L2ARC config lock is held;
		 * every branch below that does not issue an L2ARC read
		 * releases it with spa_config_exit().
		 */
3488 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3489 			/*
3490 			 * Read from the L2ARC if the following are true:
3491 			 * 1. The L2ARC vdev was previously cached.
3492 			 * 2. This buffer still has L2ARC metadata.
3493 			 * 3. This buffer isn't currently writing to the L2ARC.
3494 			 * 4. The L2ARC entry wasn't evicted, which may
3495 			 *    also have invalidated the vdev.
3496 			 * 5. It is not the case that this is a prefetch
			 *    while l2arc_noprefetch is set.
3497 			 */
3498 			if (hdr->b_l2hdr != NULL &&
3499 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3500 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3501 				l2arc_read_callback_t *cb;
3502 
3503 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3504 				ARCSTAT_BUMP(arcstat_l2_hits);
3505 
3506 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3507 				    KM_SLEEP);
3508 				cb->l2rcb_buf = buf;
3509 				cb->l2rcb_spa = spa;
3510 				cb->l2rcb_bp = *bp;
3511 				cb->l2rcb_zb = *zb;
3512 				cb->l2rcb_flags = zio_flags;
3513 				cb->l2rcb_compress = b_compress;
3514 
3515 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3516 				    addr + size < vd->vdev_psize -
3517 				    VDEV_LABEL_END_SIZE);
3518 
3519 				/*
3520 				 * l2arc read.  The SCL_L2ARC lock will be
3521 				 * released by l2arc_read_done().
3522 				 * Issue a null zio if the underlying buffer
3523 				 * was squashed to zero size by compression.
3524 				 */
3525 				if (b_compress == ZIO_COMPRESS_EMPTY) {
3526 					rzio = zio_null(pio, spa, vd,
3527 					    l2arc_read_done, cb,
3528 					    zio_flags | ZIO_FLAG_DONT_CACHE |
3529 					    ZIO_FLAG_CANFAIL |
3530 					    ZIO_FLAG_DONT_PROPAGATE |
3531 					    ZIO_FLAG_DONT_RETRY);
3532 				} else {
3533 					rzio = zio_read_phys(pio, vd, addr,
3534 					    b_asize, buf->b_data,
3535 					    ZIO_CHECKSUM_OFF,
3536 					    l2arc_read_done, cb, priority,
3537 					    zio_flags | ZIO_FLAG_DONT_CACHE |
3538 					    ZIO_FLAG_CANFAIL |
3539 					    ZIO_FLAG_DONT_PROPAGATE |
3540 					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3541 				}
3542 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3543 				    zio_t *, rzio);
3544 				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3545 
3546 				if (*arc_flags & ARC_FLAG_NOWAIT) {
3547 					zio_nowait(rzio);
3548 					return (0);
3549 				}
3550 
3551 				ASSERT(*arc_flags & ARC_FLAG_WAIT);
3552 				if (zio_wait(rzio) == 0)
3553 					return (0);
3554 
3555 				/* l2arc read error; goto zio_read() */
3556 			} else {
3557 				DTRACE_PROBE1(l2arc__miss,
3558 				    arc_buf_hdr_t *, hdr);
3559 				ARCSTAT_BUMP(arcstat_l2_misses);
3560 				if (HDR_L2_WRITING(hdr))
3561 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3562 				spa_config_exit(spa, SCL_L2ARC, vd);
3563 			}
3564 		} else {
3565 			if (vd != NULL)
3566 				spa_config_exit(spa, SCL_L2ARC, vd);
3567 			if (l2arc_ndev != 0) {
3568 				DTRACE_PROBE1(l2arc__miss,
3569 				    arc_buf_hdr_t *, hdr);
3570 				ARCSTAT_BUMP(arcstat_l2_misses);
3571 			}
3572 		}
3573 
		/*
		 * Read from the main pool; arc_read_done() is the completion
		 * callback and receives buf as the zio's private data.
		 */
3574 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3575 		    arc_read_done, buf, priority, zio_flags, zb);
3576 
3577 		if (*arc_flags & ARC_FLAG_WAIT)
3578 			return (zio_wait(rzio));
3579 
3580 		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3581 		zio_nowait(rzio);
3582 	}
3583 	return (0);
3584 }
3585 
3586 void
arc_set_callback(arc_buf_t * buf,arc_evict_func_t * func,void * private)3587 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3588 {
3589 	ASSERT(buf->b_hdr != NULL);
3590 	ASSERT(buf->b_hdr->b_state != arc_anon);
3591 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3592 	ASSERT(buf->b_efunc == NULL);
3593 	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3594 
3595 	buf->b_efunc = func;
3596 	buf->b_private = private;
3597 }
3598 
3599 /*
3600  * Notify the arc that a block was freed, and thus will never be used again.
3601  */
3602 void
arc_freed(spa_t * spa,const blkptr_t * bp)3603 arc_freed(spa_t *spa, const blkptr_t *bp)
3604 {
3605 	arc_buf_hdr_t *hdr;
3606 	kmutex_t *hash_lock;
3607 	uint64_t guid = spa_load_guid(spa);
3608 
3609 	ASSERT(!BP_IS_EMBEDDED(bp));
3610 
3611 	hdr = buf_hash_find(guid, bp, &hash_lock);
3612 	if (hdr == NULL)
3613 		return;
3614 	if (HDR_BUF_AVAILABLE(hdr)) {
3615 		arc_buf_t *buf = hdr->b_buf;
3616 		add_reference(hdr, hash_lock, FTAG);
3617 		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3618 		mutex_exit(hash_lock);
3619 
3620 		arc_release(buf, FTAG);
3621 		(void) arc_buf_remove_ref(buf, FTAG);
3622 	} else {
3623 		mutex_exit(hash_lock);
3624 	}
3625 
3626 }
3627 
3628 /*
3629  * Clear the user eviction callback set by arc_set_callback(), first calling
3630  * it if it exists.  Because the presence of a callback keeps an arc_buf cached
3631  * clearing the callback may result in the arc_buf being destroyed.  However,
3632  * it will not result in the *last* arc_buf being destroyed, hence the data
3633  * will remain cached in the ARC. We make a copy of the arc buffer here so
3634  * that we can process the callback without holding any locks.
3635  *
3636  * It's possible that the callback is already in the process of being cleared
3637  * by another thread.  In this case we can not clear the callback.
3638  *
3639  * Returns B_TRUE if the callback was successfully called and cleared.
3640  */
3641 boolean_t
arc_clear_callback(arc_buf_t * buf)3642 arc_clear_callback(arc_buf_t *buf)
3643 {
3644 	arc_buf_hdr_t *hdr;
3645 	kmutex_t *hash_lock;
3646 	arc_evict_func_t *efunc = buf->b_efunc;
3647 	void *private = buf->b_private;
3648 	list_t *list, *evicted_list;
3649 	kmutex_t *lock, *evicted_lock;
3650 
3651 	mutex_enter(&buf->b_evict_lock);
3652 	hdr = buf->b_hdr;
3653 	if (hdr == NULL) {
3654 		/*
3655 		 * We are in arc_do_user_evicts().
3656 		 */
3657 		ASSERT(buf->b_data == NULL);
3658 		mutex_exit(&buf->b_evict_lock);
3659 		return (B_FALSE);
3660 	} else if (buf->b_data == NULL) {
3661 		/*
3662 		 * We are on the eviction list; process this buffer now
3663 		 * but let arc_do_user_evicts() do the reaping.
3664 		 */
3665 		buf->b_efunc = NULL;
3666 		mutex_exit(&buf->b_evict_lock);
3667 		VERIFY0(efunc(private));
3668 		return (B_TRUE);
3669 	}
3670 	hash_lock = HDR_LOCK(hdr);
3671 	mutex_enter(hash_lock);
3672 	hdr = buf->b_hdr;
3673 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3674 
3675 	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3676 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3677 
3678 	buf->b_efunc = NULL;
3679 	buf->b_private = NULL;
3680 
3681 	if (hdr->b_datacnt > 1) {
3682 		mutex_exit(&buf->b_evict_lock);
3683 		arc_buf_destroy(buf, FALSE, TRUE);
3684 	} else {
3685 		ASSERT(buf == hdr->b_buf);
3686 		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3687 		mutex_exit(&buf->b_evict_lock);
3688 	}
3689 
3690 	mutex_exit(hash_lock);
3691 	VERIFY0(efunc(private));
3692 	return (B_TRUE);
3693 }
3694 
3695 /*
3696  * Release this buffer from the cache, making it an anonymous buffer.  This
3697  * must be done after a read and prior to modifying the buffer contents.
3698  * If the buffer has more than one reference, we must make
3699  * a new hdr for the buffer.
3700  */
3701 void
arc_release(arc_buf_t * buf,void * tag)3702 arc_release(arc_buf_t *buf, void *tag)
3703 {
3704 	arc_buf_hdr_t *hdr;
3705 	kmutex_t *hash_lock = NULL;
3706 	l2arc_buf_hdr_t *l2hdr;
3707 	uint64_t buf_size;
3708 
3709 	/*
3710 	 * It would be nice to assert that if it's DMU metadata (level >
3711 	 * 0 || it's the dnode file), then it must be syncing context.
3712 	 * But we don't know that information at this level.
3713 	 */
3714 
3715 	mutex_enter(&buf->b_evict_lock);
3716 	hdr = buf->b_hdr;
3717 
3718 	/* this buffer is not on any list */
3719 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3720 
3721 	if (hdr->b_state == arc_anon) {
3722 		/* this buffer is already released */
3723 		ASSERT(buf->b_efunc == NULL);
3724 	} else {
3725 		hash_lock = HDR_LOCK(hdr);
3726 		mutex_enter(hash_lock);
3727 		hdr = buf->b_hdr;
3728 		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3729 	}
3730 
3731 	l2hdr = hdr->b_l2hdr;
3732 	if (l2hdr) {
3733 		mutex_enter(&l2arc_buflist_mtx);
3734 		arc_buf_l2_cdata_free(hdr);
3735 		hdr->b_l2hdr = NULL;
3736 		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3737 	}
3738 	buf_size = hdr->b_size;
3739 
3740 	/*
3741 	 * Do we have more than one buf?
3742 	 */
3743 	if (hdr->b_datacnt > 1) {
3744 		arc_buf_hdr_t *nhdr;
3745 		arc_buf_t **bufp;
3746 		uint64_t blksz = hdr->b_size;
3747 		uint64_t spa = hdr->b_spa;
3748 		arc_buf_contents_t type = hdr->b_type;
3749 		uint32_t flags = hdr->b_flags;
3750 
3751 		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3752 		/*
3753 		 * Pull the data off of this hdr and attach it to
3754 		 * a new anonymous hdr.
3755 		 */
3756 		(void) remove_reference(hdr, hash_lock, tag);
3757 		bufp = &hdr->b_buf;
3758 		while (*bufp != buf)
3759 			bufp = &(*bufp)->b_next;
3760 		*bufp = buf->b_next;
3761 		buf->b_next = NULL;
3762 
3763 		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3764 		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3765 		if (refcount_is_zero(&hdr->b_refcnt)) {
3766 			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3767 			ASSERT3U(*size, >=, hdr->b_size);
3768 			atomic_add_64(size, -hdr->b_size);
3769 		}
3770 
3771 		/*
3772 		 * We're releasing a duplicate user data buffer, update
3773 		 * our statistics accordingly.
3774 		 */
3775 		if (hdr->b_type == ARC_BUFC_DATA) {
3776 			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3777 			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3778 			    -hdr->b_size);
3779 		}
3780 		hdr->b_datacnt -= 1;
3781 		arc_cksum_verify(buf);
3782 #ifdef illumos
3783 		arc_buf_unwatch(buf);
3784 #endif /* illumos */
3785 
3786 		mutex_exit(hash_lock);
3787 
3788 		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3789 		nhdr->b_size = blksz;
3790 		nhdr->b_spa = spa;
3791 		nhdr->b_type = type;
3792 		nhdr->b_buf = buf;
3793 		nhdr->b_state = arc_anon;
3794 		nhdr->b_arc_access = 0;
3795 		nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
3796 		nhdr->b_l2hdr = NULL;
3797 		nhdr->b_datacnt = 1;
3798 		nhdr->b_freeze_cksum = NULL;
3799 		(void) refcount_add(&nhdr->b_refcnt, tag);
3800 		buf->b_hdr = nhdr;
3801 		mutex_exit(&buf->b_evict_lock);
3802 		atomic_add_64(&arc_anon->arcs_size, blksz);
3803 	} else {
3804 		mutex_exit(&buf->b_evict_lock);
3805 		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3806 		ASSERT(!list_link_active(&hdr->b_arc_node));
3807 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3808 		if (hdr->b_state != arc_anon)
3809 			arc_change_state(arc_anon, hdr, hash_lock);
3810 		hdr->b_arc_access = 0;
3811 		if (hash_lock)
3812 			mutex_exit(hash_lock);
3813 
3814 		buf_discard_identity(hdr);
3815 		arc_buf_thaw(buf);
3816 	}
3817 	buf->b_efunc = NULL;
3818 	buf->b_private = NULL;
3819 
3820 	if (l2hdr) {
3821 		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3822 		vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3823 		    -l2hdr->b_asize, 0, 0);
3824 		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3825 		    hdr->b_size, 0);
3826 		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3827 		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3828 		mutex_exit(&l2arc_buflist_mtx);
3829 	}
3830 }
3831 
3832 int
arc_released(arc_buf_t * buf)3833 arc_released(arc_buf_t *buf)
3834 {
3835 	int released;
3836 
3837 	mutex_enter(&buf->b_evict_lock);
3838 	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3839 	mutex_exit(&buf->b_evict_lock);
3840 	return (released);
3841 }
3842 
#ifdef ZFS_DEBUG
/*
 * Debug-only helper: return the reference count of buf's hdr, sampled
 * while holding the buf's b_evict_lock.  Non-zero means referenced.
 */
int
arc_referenced(arc_buf_t *buf)
{
	int refs;

	mutex_enter(&buf->b_evict_lock);
	refs = refcount_count(&buf->b_hdr->b_refcnt);
	mutex_exit(&buf->b_evict_lock);

	return (refs);
}
#endif
3855 
3856 static void
arc_write_ready(zio_t * zio)3857 arc_write_ready(zio_t *zio)
3858 {
3859 	arc_write_callback_t *callback = zio->io_private;
3860 	arc_buf_t *buf = callback->awcb_buf;
3861 	arc_buf_hdr_t *hdr = buf->b_hdr;
3862 
3863 	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3864 	callback->awcb_ready(zio, buf, callback->awcb_private);
3865 
3866 	/*
3867 	 * If the IO is already in progress, then this is a re-write
3868 	 * attempt, so we need to thaw and re-compute the cksum.
3869 	 * It is the responsibility of the callback to handle the
3870 	 * accounting for any re-write attempt.
3871 	 */
3872 	if (HDR_IO_IN_PROGRESS(hdr)) {
3873 		mutex_enter(&hdr->b_freeze_lock);
3874 		if (hdr->b_freeze_cksum != NULL) {
3875 			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3876 			hdr->b_freeze_cksum = NULL;
3877 		}
3878 		mutex_exit(&hdr->b_freeze_lock);
3879 	}
3880 	arc_cksum_compute(buf, B_FALSE);
3881 	hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
3882 }
3883 
3884 /*
3885  * The SPA calls this callback for each physical write that happens on behalf
3886  * of a logical write.  See the comment in dbuf_write_physdone() for details.
3887  */
3888 static void
arc_write_physdone(zio_t * zio)3889 arc_write_physdone(zio_t *zio)
3890 {
3891 	arc_write_callback_t *cb = zio->io_private;
3892 	if (cb->awcb_physdone != NULL)
3893 		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3894 }
3895 
3896 static void
arc_write_done(zio_t * zio)3897 arc_write_done(zio_t *zio)
3898 {
3899 	arc_write_callback_t *callback = zio->io_private;
3900 	arc_buf_t *buf = callback->awcb_buf;
3901 	arc_buf_hdr_t *hdr = buf->b_hdr;
3902 
3903 	ASSERT(hdr->b_acb == NULL);
3904 
3905 	if (zio->io_error == 0) {
3906 		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3907 			buf_discard_identity(hdr);
3908 		} else {
3909 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3910 			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3911 			hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3912 		}
3913 	} else {
3914 		ASSERT(BUF_EMPTY(hdr));
3915 	}
3916 
3917 	/*
3918 	 * If the block to be written was all-zero or compressed enough to be
3919 	 * embedded in the BP, no write was performed so there will be no
3920 	 * dva/birth/checksum.  The buffer must therefore remain anonymous
3921 	 * (and uncached).
3922 	 */
3923 	if (!BUF_EMPTY(hdr)) {
3924 		arc_buf_hdr_t *exists;
3925 		kmutex_t *hash_lock;
3926 
3927 		ASSERT(zio->io_error == 0);
3928 
3929 		arc_cksum_verify(buf);
3930 
3931 		exists = buf_hash_insert(hdr, &hash_lock);
3932 		if (exists) {
3933 			/*
3934 			 * This can only happen if we overwrite for
3935 			 * sync-to-convergence, because we remove
3936 			 * buffers from the hash table when we arc_free().
3937 			 */
3938 			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3939 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3940 					panic("bad overwrite, hdr=%p exists=%p",
3941 					    (void *)hdr, (void *)exists);
3942 				ASSERT(refcount_is_zero(&exists->b_refcnt));
3943 				arc_change_state(arc_anon, exists, hash_lock);
3944 				mutex_exit(hash_lock);
3945 				arc_hdr_destroy(exists);
3946 				exists = buf_hash_insert(hdr, &hash_lock);
3947 				ASSERT3P(exists, ==, NULL);
3948 			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3949 				/* nopwrite */
3950 				ASSERT(zio->io_prop.zp_nopwrite);
3951 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3952 					panic("bad nopwrite, hdr=%p exists=%p",
3953 					    (void *)hdr, (void *)exists);
3954 			} else {
3955 				/* Dedup */
3956 				ASSERT(hdr->b_datacnt == 1);
3957 				ASSERT(hdr->b_state == arc_anon);
3958 				ASSERT(BP_GET_DEDUP(zio->io_bp));
3959 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3960 			}
3961 		}
3962 		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3963 		/* if it's not anon, we are doing a scrub */
3964 		if (!exists && hdr->b_state == arc_anon)
3965 			arc_access(hdr, hash_lock);
3966 		mutex_exit(hash_lock);
3967 	} else {
3968 		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3969 	}
3970 
3971 	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3972 	callback->awcb_done(zio, buf, callback->awcb_private);
3973 
3974 	kmem_free(callback, sizeof (arc_write_callback_t));
3975 }
3976 
3977 zio_t *
arc_write(zio_t * pio,spa_t * spa,uint64_t txg,blkptr_t * bp,arc_buf_t * buf,boolean_t l2arc,boolean_t l2arc_compress,const zio_prop_t * zp,arc_done_func_t * ready,arc_done_func_t * physdone,arc_done_func_t * done,void * private,zio_priority_t priority,int zio_flags,const zbookmark_phys_t * zb)3978 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3979     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3980     const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3981     arc_done_func_t *done, void *private, zio_priority_t priority,
3982     int zio_flags, const zbookmark_phys_t *zb)
3983 {
3984 	arc_buf_hdr_t *hdr = buf->b_hdr;
3985 	arc_write_callback_t *callback;
3986 	zio_t *zio;
3987 
3988 	ASSERT(ready != NULL);
3989 	ASSERT(done != NULL);
3990 	ASSERT(!HDR_IO_ERROR(hdr));
3991 	ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0);
3992 	ASSERT(hdr->b_acb == NULL);
3993 	if (l2arc)
3994 		hdr->b_flags |= ARC_FLAG_L2CACHE;
3995 	if (l2arc_compress)
3996 		hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3997 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3998 	callback->awcb_ready = ready;
3999 	callback->awcb_physdone = physdone;
4000 	callback->awcb_done = done;
4001 	callback->awcb_private = private;
4002 	callback->awcb_buf = buf;
4003 
4004 	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4005 	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
4006 	    priority, zio_flags, zb);
4007 
4008 	return (zio);
4009 }
4010 
/*
 * Decide whether a write reservation of "reserve" bytes for "txg" must be
 * throttled because memory is tight.
 *
 * Returns 0 when the write may proceed, ERESTART when the pageout process
 * has already queued too much page load for this txg, and EAGAIN when
 * page load is pending and the ARC needs to reclaim.  Outside the kernel
 * this always returns 0.
 */
static int
arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
	static uint64_t page_load = 0;
	static uint64_t last_txg = 0;
	uint64_t available_memory = ptob(freemem);

#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
	available_memory =
	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
#endif

	/* Enough free memory: nothing to throttle. */
	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
		return (0);

	/* The page load budget is tracked per txg. */
	if (txg > last_txg) {
		last_txg = txg;
		page_load = 0;
	}

	/*
	 * If we are in pageout, we know that memory is already tight,
	 * the arc is already going to be evicting, so we just want to
	 * continue to let page writes occur as quickly as possible.
	 */
	if (curproc == pageproc) {
		if (page_load > MAX(ptob(minfree), available_memory) / 4)
			return (SET_ERROR(ERESTART));
		/* Note: reserve is inflated, so we deflate */
		page_load += reserve / 8;
		return (0);
	}

	if (page_load > 0 && arc_reclaim_needed()) {
		/* memory is low, delay before restarting */
		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
		return (SET_ERROR(EAGAIN));
	}

	page_load = 0;
#endif
	return (0);
}
4051 
4052 void
arc_tempreserve_clear(uint64_t reserve)4053 arc_tempreserve_clear(uint64_t reserve)
4054 {
4055 	atomic_add_64(&arc_tempreserve, -reserve);
4056 	ASSERT((int64_t)arc_tempreserve >= 0);
4057 }
4058 
4059 int
arc_tempreserve_space(uint64_t reserve,uint64_t txg)4060 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4061 {
4062 	int error;
4063 	uint64_t anon_size;
4064 
4065 	if (reserve > arc_c/4 && !arc_no_grow) {
4066 		arc_c = MIN(arc_c_max, reserve * 4);
4067 		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4068 	}
4069 	if (reserve > arc_c)
4070 		return (SET_ERROR(ENOMEM));
4071 
4072 	/*
4073 	 * Don't count loaned bufs as in flight dirty data to prevent long
4074 	 * network delays from blocking transactions that are ready to be
4075 	 * assigned to a txg.
4076 	 */
4077 	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4078 
4079 	/*
4080 	 * Writes will, almost always, require additional memory allocations
4081 	 * in order to compress/encrypt/etc the data.  We therefore need to
4082 	 * make sure that there is sufficient available memory for this.
4083 	 */
4084 	error = arc_memory_throttle(reserve, txg);
4085 	if (error != 0)
4086 		return (error);
4087 
4088 	/*
4089 	 * Throttle writes when the amount of dirty data in the cache
4090 	 * gets too large.  We try to keep the cache less than half full
4091 	 * of dirty blocks so that our sync times don't grow too large.
4092 	 * Note: if two requests come in concurrently, we might let them
4093 	 * both succeed, when one of them should fail.  Not a huge deal.
4094 	 */
4095 
4096 	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4097 	    anon_size > arc_c / 4) {
4098 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4099 		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4100 		    arc_tempreserve>>10,
4101 		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4102 		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4103 		    reserve>>10, arc_c>>10);
4104 		return (SET_ERROR(ERESTART));
4105 	}
4106 	atomic_add_64(&arc_tempreserve, reserve);
4107 	return (0);
4108 }
4109 
4110 static kmutex_t arc_lowmem_lock;
4111 #ifdef _KERNEL
4112 static eventhandler_tag arc_event_lowmem = NULL;
4113 
4114 static void
arc_lowmem(void * arg __unused,int howto __unused)4115 arc_lowmem(void *arg __unused, int howto __unused)
4116 {
4117 
4118 	/* Serialize access via arc_lowmem_lock. */
4119 	mutex_enter(&arc_lowmem_lock);
4120 	mutex_enter(&arc_reclaim_thr_lock);
4121 	needfree = 1;
4122 	DTRACE_PROBE(arc__needfree);
4123 	cv_signal(&arc_reclaim_thr_cv);
4124 
4125 	/*
4126 	 * It is unsafe to block here in arbitrary threads, because we can come
4127 	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4128 	 * with ARC reclaim thread.
4129 	 */
4130 	if (curproc == pageproc) {
4131 		while (needfree)
4132 			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4133 	}
4134 	mutex_exit(&arc_reclaim_thr_lock);
4135 	mutex_exit(&arc_lowmem_lock);
4136 }
4137 #endif
4138 
4139 void
arc_init(void)4140 arc_init(void)
4141 {
4142 	int i, prefetch_tunable_set = 0;
4143 
4144 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4145 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4146 	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4147 
4148 	/* Convert seconds to clock ticks */
4149 	arc_min_prefetch_lifespan = 1 * hz;
4150 
4151 	/* Start out with 1/8 of all memory */
4152 	arc_c = kmem_size() / 8;
4153 
4154 #ifdef sun
4155 #ifdef _KERNEL
4156 	/*
4157 	 * On architectures where the physical memory can be larger
4158 	 * than the addressable space (intel in 32-bit mode), we may
4159 	 * need to limit the cache to 1/8 of VM size.
4160 	 */
4161 	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4162 #endif
4163 #endif	/* sun */
4164 	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4165 	arc_c_min = MAX(arc_c / 4, 16 << 20);
4166 	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
4167 	if (arc_c * 8 >= 1 << 30)
4168 		arc_c_max = (arc_c * 8) - (1 << 30);
4169 	else
4170 		arc_c_max = arc_c_min;
4171 	arc_c_max = MAX(arc_c * 5, arc_c_max);
4172 
4173 #ifdef _KERNEL
4174 	/*
4175 	 * Allow the tunables to override our calculations if they are
4176 	 * reasonable (ie. over 16MB)
4177 	 */
4178 	if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size())
4179 		arc_c_max = zfs_arc_max;
4180 	if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max)
4181 		arc_c_min = zfs_arc_min;
4182 #endif
4183 
4184 	arc_c = arc_c_max;
4185 	arc_p = (arc_c >> 1);
4186 
4187 	/* limit meta-data to 1/4 of the arc capacity */
4188 	arc_meta_limit = arc_c_max / 4;
4189 
4190 	/* Allow the tunable to override if it is reasonable */
4191 	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4192 		arc_meta_limit = zfs_arc_meta_limit;
4193 
4194 	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4195 		arc_c_min = arc_meta_limit / 2;
4196 
4197 	if (zfs_arc_grow_retry > 0)
4198 		arc_grow_retry = zfs_arc_grow_retry;
4199 
4200 	if (zfs_arc_shrink_shift > 0)
4201 		arc_shrink_shift = zfs_arc_shrink_shift;
4202 
4203 	if (zfs_arc_p_min_shift > 0)
4204 		arc_p_min_shift = zfs_arc_p_min_shift;
4205 
4206 	/* if kmem_flags are set, lets try to use less memory */
4207 	if (kmem_debugging())
4208 		arc_c = arc_c / 2;
4209 	if (arc_c < arc_c_min)
4210 		arc_c = arc_c_min;
4211 
4212 	zfs_arc_min = arc_c_min;
4213 	zfs_arc_max = arc_c_max;
4214 
4215 	arc_anon = &ARC_anon;
4216 	arc_mru = &ARC_mru;
4217 	arc_mru_ghost = &ARC_mru_ghost;
4218 	arc_mfu = &ARC_mfu;
4219 	arc_mfu_ghost = &ARC_mfu_ghost;
4220 	arc_l2c_only = &ARC_l2c_only;
4221 	arc_size = 0;
4222 
4223 	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4224 		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4225 		    NULL, MUTEX_DEFAULT, NULL);
4226 		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4227 		    NULL, MUTEX_DEFAULT, NULL);
4228 		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4229 		    NULL, MUTEX_DEFAULT, NULL);
4230 		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4231 		    NULL, MUTEX_DEFAULT, NULL);
4232 		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4233 		    NULL, MUTEX_DEFAULT, NULL);
4234 		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4235 		    NULL, MUTEX_DEFAULT, NULL);
4236 
4237 		list_create(&arc_mru->arcs_lists[i],
4238 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4239 		list_create(&arc_mru_ghost->arcs_lists[i],
4240 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4241 		list_create(&arc_mfu->arcs_lists[i],
4242 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4243 		list_create(&arc_mfu_ghost->arcs_lists[i],
4244 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4245 		list_create(&arc_mfu_ghost->arcs_lists[i],
4246 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4247 		list_create(&arc_l2c_only->arcs_lists[i],
4248 		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4249 	}
4250 
4251 	buf_init();
4252 
4253 	arc_thread_exit = 0;
4254 	arc_eviction_list = NULL;
4255 	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4256 	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4257 
4258 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4259 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4260 
4261 	if (arc_ksp != NULL) {
4262 		arc_ksp->ks_data = &arc_stats;
4263 		kstat_install(arc_ksp);
4264 	}
4265 
4266 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4267 	    TS_RUN, minclsyspri);
4268 
4269 #ifdef _KERNEL
4270 	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4271 	    EVENTHANDLER_PRI_FIRST);
4272 #endif
4273 
4274 	arc_dead = FALSE;
4275 	arc_warm = B_FALSE;
4276 
4277 	/*
4278 	 * Calculate maximum amount of dirty data per pool.
4279 	 *
4280 	 * If it has been set by /etc/system, take that.
4281 	 * Otherwise, use a percentage of physical memory defined by
4282 	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4283 	 * zfs_dirty_data_max_max (default 4GB).
4284 	 */
4285 	if (zfs_dirty_data_max == 0) {
4286 		zfs_dirty_data_max = ptob(physmem) *
4287 		    zfs_dirty_data_max_percent / 100;
4288 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4289 		    zfs_dirty_data_max_max);
4290 	}
4291 
4292 #ifdef _KERNEL
4293 	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4294 		prefetch_tunable_set = 1;
4295 
4296 #ifdef __i386__
4297 	if (prefetch_tunable_set == 0) {
4298 		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4299 		    "-- to enable,\n");
4300 		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4301 		    "to /boot/loader.conf.\n");
4302 		zfs_prefetch_disable = 1;
4303 	}
4304 #else
4305 	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4306 	    prefetch_tunable_set == 0) {
4307 		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4308 		    "than 4GB of RAM is present;\n"
4309 		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4310 		    "to /boot/loader.conf.\n");
4311 		zfs_prefetch_disable = 1;
4312 	}
4313 #endif
4314 	/* Warn about ZFS memory and address space requirements. */
4315 	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4316 		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4317 		    "expect unstable behavior.\n");
4318 	}
4319 	if (kmem_size() < 512 * (1 << 20)) {
4320 		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4321 		    "expect unstable behavior.\n");
4322 		printf("             Consider tuning vm.kmem_size and "
4323 		    "vm.kmem_size_max\n");
4324 		printf("             in /boot/loader.conf.\n");
4325 	}
4326 #endif
4327 }
4328 
4329 void
arc_fini(void)4330 arc_fini(void)
4331 {
4332 	int i;
4333 
4334 	mutex_enter(&arc_reclaim_thr_lock);
4335 	arc_thread_exit = 1;
4336 	cv_signal(&arc_reclaim_thr_cv);
4337 	while (arc_thread_exit != 0)
4338 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4339 	mutex_exit(&arc_reclaim_thr_lock);
4340 
4341 	arc_flush(NULL);
4342 
4343 	arc_dead = TRUE;
4344 
4345 	if (arc_ksp != NULL) {
4346 		kstat_delete(arc_ksp);
4347 		arc_ksp = NULL;
4348 	}
4349 
4350 	mutex_destroy(&arc_eviction_mtx);
4351 	mutex_destroy(&arc_reclaim_thr_lock);
4352 	cv_destroy(&arc_reclaim_thr_cv);
4353 
4354 	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4355 		list_destroy(&arc_mru->arcs_lists[i]);
4356 		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4357 		list_destroy(&arc_mfu->arcs_lists[i]);
4358 		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4359 		list_destroy(&arc_l2c_only->arcs_lists[i]);
4360 
4361 		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4362 		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4363 		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4364 		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4365 		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4366 		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4367 	}
4368 
4369 	buf_fini();
4370 
4371 	ASSERT(arc_loaned_bytes == 0);
4372 
4373 	mutex_destroy(&arc_lowmem_lock);
4374 #ifdef _KERNEL
4375 	if (arc_event_lowmem != NULL)
4376 		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4377 #endif
4378 }
4379 
4380 /*
4381  * Level 2 ARC
4382  *
4383  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4384  * It uses dedicated storage devices to hold cached data, which are populated
4385  * using large infrequent writes.  The main role of this cache is to boost
4386  * the performance of random read workloads.  The intended L2ARC devices
4387  * include short-stroked disks, solid state disks, and other media with
4388  * substantially faster read latency than disk.
4389  *
4390  *                 +-----------------------+
4391  *                 |         ARC           |
4392  *                 +-----------------------+
4393  *                    |         ^     ^
4394  *                    |         |     |
4395  *      l2arc_feed_thread()    arc_read()
4396  *                    |         |     |
4397  *                    |  l2arc read   |
4398  *                    V         |     |
4399  *               +---------------+    |
4400  *               |     L2ARC     |    |
4401  *               +---------------+    |
4402  *                   |    ^           |
4403  *          l2arc_write() |           |
4404  *                   |    |           |
4405  *                   V    |           |
4406  *                 +-------+      +-------+
4407  *                 | vdev  |      | vdev  |
4408  *                 | cache |      | cache |
4409  *                 +-------+      +-------+
4410  *                 +=========+     .-----.
4411  *                 :  L2ARC  :    |-_____-|
4412  *                 : devices :    | Disks |
4413  *                 +=========+    `-_____-'
4414  *
4415  * Read requests are satisfied from the following sources, in order:
4416  *
4417  *	1) ARC
4418  *	2) vdev cache of L2ARC devices
4419  *	3) L2ARC devices
4420  *	4) vdev cache of disks
4421  *	5) disks
4422  *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this, there are some significant differences between
 * the L2ARC and traditional cache design:
4426  *
4427  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4428  * the ARC behave as usual, freeing buffers and placing headers on ghost
4429  * lists.  The ARC does not send buffers to the L2ARC during eviction as
4430  * this would add inflated write latencies for all ARC memory pressure.
4431  *
4432  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4433  * It does this by periodically scanning buffers from the eviction-end of
4434  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4435  * not already there. It scans until a headroom of buffers is satisfied,
4436  * which itself is a buffer for ARC eviction. If a compressible buffer is
4437  * found during scanning and selected for writing to an L2ARC device, we
4438  * temporarily boost scanning headroom during the next scan cycle to make
4439  * sure we adapt to compression effects (which might significantly reduce
4440  * the data volume we write to L2ARC). The thread that does this is
4441  * l2arc_feed_thread(), illustrated below; example sizes are included to
4442  * provide a better sense of ratio than this diagram:
4443  *
4444  *	       head -->                        tail
4445  *	        +---------------------+----------+
4446  *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4447  *	        +---------------------+----------+   |   o L2ARC eligible
4448  *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4449  *	        +---------------------+----------+   |
4450  *	             15.9 Gbytes      ^ 32 Mbytes    |
4451  *	                           headroom          |
4452  *	                                      l2arc_feed_thread()
4453  *	                                             |
4454  *	                 l2arc write hand <--[oooo]--'
4455  *	                         |           8 Mbyte
4456  *	                         |          write max
4457  *	                         V
4458  *		  +==============================+
4459  *	L2ARC dev |####|#|###|###|    |####| ... |
4460  *	          +==============================+
4461  *	                     32 Gbytes
4462  *
4463  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4464  * evicted, then the L2ARC has cached a buffer much sooner than it probably
4465  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4466  * safe to say that this is an uncommon case, since buffers at the end of
4467  * the ARC lists have moved there due to inactivity.
4468  *
4469  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4470  * then the L2ARC simply misses copying some buffers.  This serves as a
4471  * pressure valve to prevent heavy read workloads from both stalling the ARC
4472  * with waits and clogging the L2ARC with writes.  This also helps prevent
4473  * the potential for the L2ARC to churn if it attempts to cache content too
4474  * quickly, such as during backups of the entire pool.
4475  *
4476  * 5. After system boot and before the ARC has filled main memory, there are
4477  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4478  * lists can remain mostly static.  Instead of searching from tail of these
4479  * lists as pictured, the l2arc_feed_thread() will search from the list heads
4480  * for eligible buffers, greatly increasing its chance of finding them.
4481  *
4482  * The L2ARC device write speed is also boosted during this time so that
4483  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4484  * there are no L2ARC reads, and no fear of degrading read performance
4485  * through increased writes.
4486  *
4487  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4488  * the vdev queue can aggregate them into larger and fewer writes.  Each
4489  * device is written to in a rotor fashion, sweeping writes through
4490  * available space then repeating.
4491  *
4492  * 7. The L2ARC does not store dirty content.  It never needs to flush
4493  * write buffers back to disk based storage.
4494  *
4495  * 8. If an ARC buffer is written (and dirtied) which also exists in the
4496  * L2ARC, the now stale L2ARC buffer is immediately dropped.
4497  *
4498  * The performance of the L2ARC can be tweaked by a number of tunables, which
4499  * may be necessary for different workloads:
4500  *
4501  *	l2arc_write_max		max write bytes per interval
4502  *	l2arc_write_boost	extra write bytes during device warmup
4503  *	l2arc_noprefetch	skip caching prefetched buffers
4504  *	l2arc_headroom		number of max device writes to precache
4505  *	l2arc_headroom_boost	when we find compressed buffers during ARC
4506  *				scanning, we multiply headroom by this
4507  *				percentage factor for the next scan cycle,
4508  *				since more compressed buffers are likely to
4509  *				be present
4510  *	l2arc_feed_secs		seconds between L2ARC writing
4511  *
4512  * Tunables may be removed or added as future performance improvements are
4513  * integrated, and also may become zpool properties.
4514  *
4515  * There are three key functions that control how the L2ARC warms up:
4516  *
4517  *	l2arc_write_eligible()	check if a buffer is eligible to cache
4518  *	l2arc_write_size()	calculate how much to write
4519  *	l2arc_write_interval()	calculate sleep delay between writes
4520  *
4521  * These three functions determine what to write, how much, and how quickly
4522  * to send writes.
4523  */
4524 
4525 static boolean_t
l2arc_write_eligible(uint64_t spa_guid,arc_buf_hdr_t * hdr)4526 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
4527 {
4528 	/*
4529 	 * A buffer is *not* eligible for the L2ARC if it:
4530 	 * 1. belongs to a different spa.
4531 	 * 2. is already cached on the L2ARC.
4532 	 * 3. has an I/O in progress (it may be an incomplete read).
4533 	 * 4. is flagged not eligible (zfs property).
4534 	 */
4535 	if (hdr->b_spa != spa_guid) {
4536 		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4537 		return (B_FALSE);
4538 	}
4539 	if (hdr->b_l2hdr != NULL) {
4540 		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4541 		return (B_FALSE);
4542 	}
4543 	if (HDR_IO_IN_PROGRESS(hdr)) {
4544 		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4545 		return (B_FALSE);
4546 	}
4547 	if (!HDR_L2CACHE(hdr)) {
4548 		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4549 		return (B_FALSE);
4550 	}
4551 
4552 	return (B_TRUE);
4553 }
4554 
4555 static uint64_t
l2arc_write_size(void)4556 l2arc_write_size(void)
4557 {
4558 	uint64_t size;
4559 
4560 	/*
4561 	 * Make sure our globals have meaningful values in case the user
4562 	 * altered them.
4563 	 */
4564 	size = l2arc_write_max;
4565 	if (size == 0) {
4566 		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4567 		    "be greater than zero, resetting it to the default (%d)",
4568 		    L2ARC_WRITE_SIZE);
4569 		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4570 	}
4571 
4572 	if (arc_warm == B_FALSE)
4573 		size += l2arc_write_boost;
4574 
4575 	return (size);
4576 
4577 }
4578 
4579 static clock_t
l2arc_write_interval(clock_t began,uint64_t wanted,uint64_t wrote)4580 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4581 {
4582 	clock_t interval, next, now;
4583 
4584 	/*
4585 	 * If the ARC lists are busy, increase our write rate; if the
4586 	 * lists are stale, idle back.  This is achieved by checking
4587 	 * how much we previously wrote - if it was more than half of
4588 	 * what we wanted, schedule the next write much sooner.
4589 	 */
4590 	if (l2arc_feed_again && wrote > (wanted / 2))
4591 		interval = (hz * l2arc_feed_min_ms) / 1000;
4592 	else
4593 		interval = hz * l2arc_feed_secs;
4594 
4595 	now = ddi_get_lbolt();
4596 	next = MAX(now, MIN(now + interval, began + interval));
4597 
4598 	return (next);
4599 }
4600 
4601 static void
l2arc_hdr_stat_add(void)4602 l2arc_hdr_stat_add(void)
4603 {
4604 	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4605 	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4606 }
4607 
4608 static void
l2arc_hdr_stat_remove(void)4609 l2arc_hdr_stat_remove(void)
4610 {
4611 	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4612 	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4613 }
4614 
4615 /*
4616  * Cycle through L2ARC devices.  This is how L2ARC load balances.
4617  * If a device is returned, this also returns holding the spa config lock.
4618  */
4619 static l2arc_dev_t *
l2arc_dev_get_next(void)4620 l2arc_dev_get_next(void)
4621 {
4622 	l2arc_dev_t *first, *next = NULL;
4623 
4624 	/*
4625 	 * Lock out the removal of spas (spa_namespace_lock), then removal
4626 	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4627 	 * both locks will be dropped and a spa config lock held instead.
4628 	 */
4629 	mutex_enter(&spa_namespace_lock);
4630 	mutex_enter(&l2arc_dev_mtx);
4631 
4632 	/* if there are no vdevs, there is nothing to do */
4633 	if (l2arc_ndev == 0)
4634 		goto out;
4635 
4636 	first = NULL;
4637 	next = l2arc_dev_last;
4638 	do {
4639 		/* loop around the list looking for a non-faulted vdev */
4640 		if (next == NULL) {
4641 			next = list_head(l2arc_dev_list);
4642 		} else {
4643 			next = list_next(l2arc_dev_list, next);
4644 			if (next == NULL)
4645 				next = list_head(l2arc_dev_list);
4646 		}
4647 
4648 		/* if we have come back to the start, bail out */
4649 		if (first == NULL)
4650 			first = next;
4651 		else if (next == first)
4652 			break;
4653 
4654 	} while (vdev_is_dead(next->l2ad_vdev));
4655 
4656 	/* if we were unable to find any usable vdevs, return NULL */
4657 	if (vdev_is_dead(next->l2ad_vdev))
4658 		next = NULL;
4659 
4660 	l2arc_dev_last = next;
4661 
4662 out:
4663 	mutex_exit(&l2arc_dev_mtx);
4664 
4665 	/*
4666 	 * Grab the config lock to prevent the 'next' device from being
4667 	 * removed while we are writing to it.
4668 	 */
4669 	if (next != NULL)
4670 		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4671 	mutex_exit(&spa_namespace_lock);
4672 
4673 	return (next);
4674 }
4675 
4676 /*
4677  * Free buffers that were tagged for destruction.
4678  */
4679 static void
l2arc_do_free_on_write()4680 l2arc_do_free_on_write()
4681 {
4682 	list_t *buflist;
4683 	l2arc_data_free_t *df, *df_prev;
4684 
4685 	mutex_enter(&l2arc_free_on_write_mtx);
4686 	buflist = l2arc_free_on_write;
4687 
4688 	for (df = list_tail(buflist); df; df = df_prev) {
4689 		df_prev = list_prev(buflist, df);
4690 		ASSERT(df->l2df_data != NULL);
4691 		ASSERT(df->l2df_func != NULL);
4692 		df->l2df_func(df->l2df_data, df->l2df_size);
4693 		list_remove(buflist, df);
4694 		kmem_free(df, sizeof (l2arc_data_free_t));
4695 	}
4696 
4697 	mutex_exit(&l2arc_free_on_write_mtx);
4698 }
4699 
4700 /*
4701  * A write to a cache device has completed.  Update all headers to allow
4702  * reads from these buffers to begin.
4703  */
4704 static void
l2arc_write_done(zio_t * zio)4705 l2arc_write_done(zio_t *zio)
4706 {
4707 	l2arc_write_callback_t *cb;
4708 	l2arc_dev_t *dev;
4709 	list_t *buflist;
4710 	arc_buf_hdr_t *head, *hdr, *hdr_prev;
4711 	l2arc_buf_hdr_t *abl2;
4712 	kmutex_t *hash_lock;
4713 	int64_t bytes_dropped = 0;
4714 
4715 	cb = zio->io_private;
4716 	ASSERT(cb != NULL);
4717 	dev = cb->l2wcb_dev;
4718 	ASSERT(dev != NULL);
4719 	head = cb->l2wcb_head;
4720 	ASSERT(head != NULL);
4721 	buflist = dev->l2ad_buflist;
4722 	ASSERT(buflist != NULL);
4723 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4724 	    l2arc_write_callback_t *, cb);
4725 
4726 	if (zio->io_error != 0)
4727 		ARCSTAT_BUMP(arcstat_l2_writes_error);
4728 
4729 	mutex_enter(&l2arc_buflist_mtx);
4730 
4731 	/*
4732 	 * All writes completed, or an error was hit.
4733 	 */
4734 	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
4735 		hdr_prev = list_prev(buflist, hdr);
4736 		abl2 = hdr->b_l2hdr;
4737 
4738 		/*
4739 		 * Release the temporary compressed buffer as soon as possible.
4740 		 */
4741 		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4742 			l2arc_release_cdata_buf(hdr);
4743 
4744 		hash_lock = HDR_LOCK(hdr);
4745 		if (!mutex_tryenter(hash_lock)) {
4746 			/*
4747 			 * This buffer misses out.  It may be in a stage
4748 			 * of eviction.  Its ARC_L2_WRITING flag will be
4749 			 * left set, denying reads to this buffer.
4750 			 */
4751 			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4752 			continue;
4753 		}
4754 
4755 		if (zio->io_error != 0) {
4756 			/*
4757 			 * Error - drop L2ARC entry.
4758 			 */
4759 			list_remove(buflist, hdr);
4760 			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4761 			bytes_dropped += abl2->b_asize;
4762 			hdr->b_l2hdr = NULL;
4763 			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4764 			    hdr->b_size, 0);
4765 			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4766 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
4767 		}
4768 
4769 		/*
4770 		 * Allow ARC to begin reads to this L2ARC entry.
4771 		 */
4772 		hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
4773 
4774 		mutex_exit(hash_lock);
4775 	}
4776 
4777 	atomic_inc_64(&l2arc_writes_done);
4778 	list_remove(buflist, head);
4779 	kmem_cache_free(hdr_cache, head);
4780 	mutex_exit(&l2arc_buflist_mtx);
4781 
4782 	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4783 
4784 	l2arc_do_free_on_write();
4785 
4786 	kmem_free(cb, sizeof (l2arc_write_callback_t));
4787 }
4788 
4789 /*
4790  * A read to a cache device completed.  Validate buffer contents before
4791  * handing over to the regular ARC routines.
4792  */
4793 static void
l2arc_read_done(zio_t * zio)4794 l2arc_read_done(zio_t *zio)
4795 {
4796 	l2arc_read_callback_t *cb;
4797 	arc_buf_hdr_t *hdr;
4798 	arc_buf_t *buf;
4799 	kmutex_t *hash_lock;
4800 	int equal;
4801 
4802 	ASSERT(zio->io_vd != NULL);
4803 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4804 
4805 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4806 
4807 	cb = zio->io_private;
4808 	ASSERT(cb != NULL);
4809 	buf = cb->l2rcb_buf;
4810 	ASSERT(buf != NULL);
4811 
4812 	hash_lock = HDR_LOCK(buf->b_hdr);
4813 	mutex_enter(hash_lock);
4814 	hdr = buf->b_hdr;
4815 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4816 
4817 	/*
4818 	 * If the buffer was compressed, decompress it first.
4819 	 */
4820 	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4821 		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4822 	ASSERT(zio->io_data != NULL);
4823 
4824 	/*
4825 	 * Check this survived the L2ARC journey.
4826 	 */
4827 	equal = arc_cksum_equal(buf);
4828 	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4829 		mutex_exit(hash_lock);
4830 		zio->io_private = buf;
4831 		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4832 		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4833 		arc_read_done(zio);
4834 	} else {
4835 		mutex_exit(hash_lock);
4836 		/*
4837 		 * Buffer didn't survive caching.  Increment stats and
4838 		 * reissue to the original storage device.
4839 		 */
4840 		if (zio->io_error != 0) {
4841 			ARCSTAT_BUMP(arcstat_l2_io_error);
4842 		} else {
4843 			zio->io_error = SET_ERROR(EIO);
4844 		}
4845 		if (!equal)
4846 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4847 
4848 		/*
4849 		 * If there's no waiter, issue an async i/o to the primary
4850 		 * storage now.  If there *is* a waiter, the caller must
4851 		 * issue the i/o in a context where it's OK to block.
4852 		 */
4853 		if (zio->io_waiter == NULL) {
4854 			zio_t *pio = zio_unique_parent(zio);
4855 
4856 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4857 
4858 			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4859 			    buf->b_data, zio->io_size, arc_read_done, buf,
4860 			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4861 		}
4862 	}
4863 
4864 	kmem_free(cb, sizeof (l2arc_read_callback_t));
4865 }
4866 
4867 /*
4868  * This is the list priority from which the L2ARC will search for pages to
4869  * cache.  This is used within loops (0..3) to cycle through lists in the
4870  * desired order.  This order can have a significant effect on cache
4871  * performance.
4872  *
4873  * Currently the metadata lists are hit first, MFU then MRU, followed by
4874  * the data lists.  This function returns a locked list, and also returns
4875  * the lock pointer.
4876  */
4877 static list_t *
l2arc_list_locked(int list_num,kmutex_t ** lock)4878 l2arc_list_locked(int list_num, kmutex_t **lock)
4879 {
4880 	list_t *list = NULL;
4881 	int idx;
4882 
4883 	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4884 
4885 	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4886 		idx = list_num;
4887 		list = &arc_mfu->arcs_lists[idx];
4888 		*lock = ARCS_LOCK(arc_mfu, idx);
4889 	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4890 		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4891 		list = &arc_mru->arcs_lists[idx];
4892 		*lock = ARCS_LOCK(arc_mru, idx);
4893 	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4894 		ARC_BUFC_NUMDATALISTS)) {
4895 		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4896 		list = &arc_mfu->arcs_lists[idx];
4897 		*lock = ARCS_LOCK(arc_mfu, idx);
4898 	} else {
4899 		idx = list_num - ARC_BUFC_NUMLISTS;
4900 		list = &arc_mru->arcs_lists[idx];
4901 		*lock = ARCS_LOCK(arc_mru, idx);
4902 	}
4903 
4904 	ASSERT(!(MUTEX_HELD(*lock)));
4905 	mutex_enter(*lock);
4906 	return (list);
4907 }
4908 
4909 /*
4910  * Evict buffers from the device write hand to the distance specified in
4911  * bytes.  This distance may span populated buffers, it may span nothing.
4912  * This is clearing a region on the L2ARC device ready for writing.
4913  * If the 'all' boolean is set, every buffer is evicted.
4914  */
4915 static void
l2arc_evict(l2arc_dev_t * dev,uint64_t distance,boolean_t all)4916 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4917 {
4918 	list_t *buflist;
4919 	l2arc_buf_hdr_t *abl2;
4920 	arc_buf_hdr_t *hdr, *hdr_prev;
4921 	kmutex_t *hash_lock;
4922 	uint64_t taddr;
4923 	int64_t bytes_evicted = 0;
4924 
4925 	buflist = dev->l2ad_buflist;
4926 
4927 	if (buflist == NULL)
4928 		return;
4929 
4930 	if (!all && dev->l2ad_first) {
4931 		/*
4932 		 * This is the first sweep through the device.  There is
4933 		 * nothing to evict.
4934 		 */
4935 		return;
4936 	}
4937 
4938 	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4939 		/*
4940 		 * When nearing the end of the device, evict to the end
4941 		 * before the device write hand jumps to the start.
4942 		 */
4943 		taddr = dev->l2ad_end;
4944 	} else {
4945 		taddr = dev->l2ad_hand + distance;
4946 	}
4947 	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4948 	    uint64_t, taddr, boolean_t, all);
4949 
4950 top:
4951 	mutex_enter(&l2arc_buflist_mtx);
4952 	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
4953 		hdr_prev = list_prev(buflist, hdr);
4954 
4955 		hash_lock = HDR_LOCK(hdr);
4956 		if (!mutex_tryenter(hash_lock)) {
4957 			/*
4958 			 * Missed the hash lock.  Retry.
4959 			 */
4960 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4961 			mutex_exit(&l2arc_buflist_mtx);
4962 			mutex_enter(hash_lock);
4963 			mutex_exit(hash_lock);
4964 			goto top;
4965 		}
4966 
4967 		if (HDR_L2_WRITE_HEAD(hdr)) {
4968 			/*
4969 			 * We hit a write head node.  Leave it for
4970 			 * l2arc_write_done().
4971 			 */
4972 			list_remove(buflist, hdr);
4973 			mutex_exit(hash_lock);
4974 			continue;
4975 		}
4976 
4977 		if (!all && hdr->b_l2hdr != NULL &&
4978 		    (hdr->b_l2hdr->b_daddr > taddr ||
4979 		    hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4980 			/*
4981 			 * We've evicted to the target address,
4982 			 * or the end of the device.
4983 			 */
4984 			mutex_exit(hash_lock);
4985 			break;
4986 		}
4987 
4988 		if (HDR_FREE_IN_PROGRESS(hdr)) {
4989 			/*
4990 			 * Already on the path to destruction.
4991 			 */
4992 			mutex_exit(hash_lock);
4993 			continue;
4994 		}
4995 
4996 		if (hdr->b_state == arc_l2c_only) {
4997 			ASSERT(!HDR_L2_READING(hdr));
4998 			/*
4999 			 * This doesn't exist in the ARC.  Destroy.
5000 			 * arc_hdr_destroy() will call list_remove()
5001 			 * and decrement arcstat_l2_size.
5002 			 */
5003 			arc_change_state(arc_anon, hdr, hash_lock);
5004 			arc_hdr_destroy(hdr);
5005 		} else {
5006 			/*
5007 			 * Invalidate issued or about to be issued
5008 			 * reads, since we may be about to write
5009 			 * over this location.
5010 			 */
5011 			if (HDR_L2_READING(hdr)) {
5012 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
5013 				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
5014 			}
5015 
5016 			/*
5017 			 * Tell ARC this no longer exists in L2ARC.
5018 			 */
5019 			if (hdr->b_l2hdr != NULL) {
5020 				abl2 = hdr->b_l2hdr;
5021 				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
5022 				bytes_evicted += abl2->b_asize;
5023 				hdr->b_l2hdr = NULL;
5024 				/*
5025 				 * We are destroying l2hdr, so ensure that
5026 				 * its compressed buffer, if any, is not leaked.
5027 				 */
5028 				ASSERT(abl2->b_tmp_cdata == NULL);
5029 				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
5030 				ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5031 			}
5032 			list_remove(buflist, hdr);
5033 
5034 			/*
5035 			 * This may have been leftover after a
5036 			 * failed write.
5037 			 */
5038 			hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5039 		}
5040 		mutex_exit(hash_lock);
5041 	}
5042 	mutex_exit(&l2arc_buflist_mtx);
5043 
5044 	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
5045 	dev->l2ad_evict = taddr;
5046 }
5047 
5048 /*
5049  * Find and write ARC buffers to the L2ARC device.
5050  *
5051  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5052  * for reading until they have completed writing.
5053  * The headroom_boost is an in-out parameter used to maintain headroom boost
5054  * state between calls to this function.
5055  *
5056  * Returns the number of bytes actually written (which may be smaller than
5057  * the delta by which the device hand has changed due to alignment).
5058  */
5059 static uint64_t
l2arc_write_buffers(spa_t * spa,l2arc_dev_t * dev,uint64_t target_sz,boolean_t * headroom_boost)5060 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5061     boolean_t *headroom_boost)
5062 {
5063 	arc_buf_hdr_t *hdr, *hdr_prev, *head;
5064 	list_t *list;
5065 	uint64_t write_asize, write_psize, write_sz, headroom,
5066 	    buf_compress_minsz;
5067 	void *buf_data;
5068 	kmutex_t *list_lock;
5069 	boolean_t full;
5070 	l2arc_write_callback_t *cb;
5071 	zio_t *pio, *wzio;
5072 	uint64_t guid = spa_load_guid(spa);
5073 	const boolean_t do_headroom_boost = *headroom_boost;
5074 	int try;
5075 
5076 	ASSERT(dev->l2ad_vdev != NULL);
5077 
5078 	/* Lower the flag now, we might want to raise it again later. */
5079 	*headroom_boost = B_FALSE;
5080 
5081 	pio = NULL;
5082 	write_sz = write_asize = write_psize = 0;
5083 	full = B_FALSE;
5084 	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5085 	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5086 
5087 	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5088 	/*
5089 	 * We will want to try to compress buffers that are at least 2x the
5090 	 * device sector size.
5091 	 */
5092 	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5093 
5094 	/*
5095 	 * Copy buffers for L2ARC writing.
5096 	 */
5097 	mutex_enter(&l2arc_buflist_mtx);
5098 	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5099 		uint64_t passed_sz = 0;
5100 
5101 		list = l2arc_list_locked(try, &list_lock);
5102 		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5103 
5104 		/*
5105 		 * L2ARC fast warmup.
5106 		 *
5107 		 * Until the ARC is warm and starts to evict, read from the
5108 		 * head of the ARC lists rather than the tail.
5109 		 */
5110 		if (arc_warm == B_FALSE)
5111 			hdr = list_head(list);
5112 		else
5113 			hdr = list_tail(list);
5114 		if (hdr == NULL)
5115 			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5116 
5117 		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5118 		if (do_headroom_boost)
5119 			headroom = (headroom * l2arc_headroom_boost) / 100;
5120 
5121 		for (; hdr; hdr = hdr_prev) {
5122 			l2arc_buf_hdr_t *l2hdr;
5123 			kmutex_t *hash_lock;
5124 			uint64_t buf_sz;
5125 
5126 			if (arc_warm == B_FALSE)
5127 				hdr_prev = list_next(list, hdr);
5128 			else
5129 				hdr_prev = list_prev(list, hdr);
5130 			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);
5131 
5132 			hash_lock = HDR_LOCK(hdr);
5133 			if (!mutex_tryenter(hash_lock)) {
5134 				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5135 				/*
5136 				 * Skip this buffer rather than waiting.
5137 				 */
5138 				continue;
5139 			}
5140 
5141 			passed_sz += hdr->b_size;
5142 			if (passed_sz > headroom) {
5143 				/*
5144 				 * Searched too far.
5145 				 */
5146 				mutex_exit(hash_lock);
5147 				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5148 				break;
5149 			}
5150 
5151 			if (!l2arc_write_eligible(guid, hdr)) {
5152 				mutex_exit(hash_lock);
5153 				continue;
5154 			}
5155 
5156 			if ((write_sz + hdr->b_size) > target_sz) {
5157 				full = B_TRUE;
5158 				mutex_exit(hash_lock);
5159 				ARCSTAT_BUMP(arcstat_l2_write_full);
5160 				break;
5161 			}
5162 
5163 			if (pio == NULL) {
5164 				/*
5165 				 * Insert a dummy header on the buflist so
5166 				 * l2arc_write_done() can find where the
5167 				 * write buffers begin without searching.
5168 				 */
5169 				list_insert_head(dev->l2ad_buflist, head);
5170 
5171 				cb = kmem_alloc(
5172 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5173 				cb->l2wcb_dev = dev;
5174 				cb->l2wcb_head = head;
5175 				pio = zio_root(spa, l2arc_write_done, cb,
5176 				    ZIO_FLAG_CANFAIL);
5177 				ARCSTAT_BUMP(arcstat_l2_write_pios);
5178 			}
5179 
5180 			/*
5181 			 * Create and add a new L2ARC header.
5182 			 */
5183 			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5184 			l2hdr->b_dev = dev;
5185 			hdr->b_flags |= ARC_FLAG_L2_WRITING;
5186 
5187 			/*
5188 			 * Temporarily stash the data buffer in b_tmp_cdata.
5189 			 * The subsequent write step will pick it up from
5190 			 * there. This is because can't access hdr->b_buf
5191 			 * without holding the hash_lock, which we in turn
5192 			 * can't access without holding the ARC list locks
5193 			 * (which we want to avoid during compression/writing).
5194 			 */
5195 			l2hdr->b_compress = ZIO_COMPRESS_OFF;
5196 			l2hdr->b_asize = hdr->b_size;
5197 			l2hdr->b_tmp_cdata = hdr->b_buf->b_data;
5198 
5199 			buf_sz = hdr->b_size;
5200 			hdr->b_l2hdr = l2hdr;
5201 
5202 			list_insert_head(dev->l2ad_buflist, hdr);
5203 
5204 			/*
5205 			 * Compute and store the buffer cksum before
5206 			 * writing.  On debug the cksum is verified first.
5207 			 */
5208 			arc_cksum_verify(hdr->b_buf);
5209 			arc_cksum_compute(hdr->b_buf, B_TRUE);
5210 
5211 			mutex_exit(hash_lock);
5212 
5213 			write_sz += buf_sz;
5214 		}
5215 
5216 		mutex_exit(list_lock);
5217 
5218 		if (full == B_TRUE)
5219 			break;
5220 	}
5221 
5222 	/* No buffers selected for writing? */
5223 	if (pio == NULL) {
5224 		ASSERT0(write_sz);
5225 		mutex_exit(&l2arc_buflist_mtx);
5226 		kmem_cache_free(hdr_cache, head);
5227 		return (0);
5228 	}
5229 
5230 	/*
5231 	 * Now start writing the buffers. We're starting at the write head
5232 	 * and work backwards, retracing the course of the buffer selector
5233 	 * loop above.
5234 	 */
5235 	for (hdr = list_prev(dev->l2ad_buflist, head); hdr;
5236 	    hdr = list_prev(dev->l2ad_buflist, hdr)) {
5237 		l2arc_buf_hdr_t *l2hdr;
5238 		uint64_t buf_sz;
5239 
5240 		/*
5241 		 * We shouldn't need to lock the buffer here, since we flagged
5242 		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
5243 		 * take care to only access its L2 cache parameters. In
5244 		 * particular, hdr->b_buf may be invalid by now due to
5245 		 * ARC eviction.
5246 		 */
5247 		l2hdr = hdr->b_l2hdr;
5248 		l2hdr->b_daddr = dev->l2ad_hand;
5249 
5250 		if ((hdr->b_flags & ARC_FLAG_L2COMPRESS) &&
5251 		    l2hdr->b_asize >= buf_compress_minsz) {
5252 			if (l2arc_compress_buf(l2hdr)) {
5253 				/*
5254 				 * If compression succeeded, enable headroom
5255 				 * boost on the next scan cycle.
5256 				 */
5257 				*headroom_boost = B_TRUE;
5258 			}
5259 		}
5260 
5261 		/*
5262 		 * Pick up the buffer data we had previously stashed away
5263 		 * (and now potentially also compressed).
5264 		 */
5265 		buf_data = l2hdr->b_tmp_cdata;
5266 		buf_sz = l2hdr->b_asize;
5267 
5268 		/*
5269 		 * If the data has not been compressed, then clear b_tmp_cdata
5270 		 * to make sure that it points only to a temporary compression
5271 		 * buffer.
5272 		 */
5273 		if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress))
5274 			l2hdr->b_tmp_cdata = NULL;
5275 
5276 		/* Compression may have squashed the buffer to zero length. */
5277 		if (buf_sz != 0) {
5278 			uint64_t buf_p_sz;
5279 
5280 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5281 			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5282 			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5283 			    ZIO_FLAG_CANFAIL, B_FALSE);
5284 
5285 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5286 			    zio_t *, wzio);
5287 			(void) zio_nowait(wzio);
5288 
5289 			write_asize += buf_sz;
5290 			/*
5291 			 * Keep the clock hand suitably device-aligned.
5292 			 */
5293 			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5294 			write_psize += buf_p_sz;
5295 			dev->l2ad_hand += buf_p_sz;
5296 		}
5297 	}
5298 
5299 	mutex_exit(&l2arc_buflist_mtx);
5300 
5301 	ASSERT3U(write_asize, <=, target_sz);
5302 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5303 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5304 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5305 	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5306 	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5307 
5308 	/*
5309 	 * Bump device hand to the device start if it is approaching the end.
5310 	 * l2arc_evict() will already have evicted ahead for this case.
5311 	 */
5312 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5313 		dev->l2ad_hand = dev->l2ad_start;
5314 		dev->l2ad_evict = dev->l2ad_start;
5315 		dev->l2ad_first = B_FALSE;
5316 	}
5317 
5318 	dev->l2ad_writing = B_TRUE;
5319 	(void) zio_wait(pio);
5320 	dev->l2ad_writing = B_FALSE;
5321 
5322 	return (write_asize);
5323 }
5324 
5325 /*
5326  * Compresses an L2ARC buffer.
5327  * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5328  * size in l2hdr->b_asize. This routine tries to compress the data and
5329  * depending on the compression result there are three possible outcomes:
5330  * *) The buffer was incompressible. The original l2hdr contents were left
5331  *    untouched and are ready for writing to an L2 device.
5332  * *) The buffer was all-zeros, so there is no need to write it to an L2
5333  *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5334  *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5335  * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5336  *    data buffer which holds the compressed data to be written, and b_asize
5337  *    tells us how much data there is. b_compress is set to the appropriate
5338  *    compression algorithm. Once writing is done, invoke
5339  *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5340  *
5341  * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5342  * buffer was incompressible).
5343  */
5344 static boolean_t
l2arc_compress_buf(l2arc_buf_hdr_t * l2hdr)5345 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5346 {
5347 	void *cdata;
5348 	size_t csize, len, rounded;
5349 
5350 	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5351 	ASSERT(l2hdr->b_tmp_cdata != NULL);
5352 
5353 	len = l2hdr->b_asize;
5354 	cdata = zio_data_buf_alloc(len);
5355 	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5356 	    cdata, l2hdr->b_asize);
5357 
5358 	if (csize == 0) {
5359 		/* zero block, indicate that there's nothing to write */
5360 		zio_data_buf_free(cdata, len);
5361 		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5362 		l2hdr->b_asize = 0;
5363 		l2hdr->b_tmp_cdata = NULL;
5364 		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5365 		return (B_TRUE);
5366 	}
5367 
5368 	rounded = P2ROUNDUP(csize,
5369 	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
5370 	if (rounded < len) {
5371 		/*
5372 		 * Compression succeeded, we'll keep the cdata around for
5373 		 * writing and release it afterwards.
5374 		 */
5375 		if (rounded > csize) {
5376 			bzero((char *)cdata + csize, rounded - csize);
5377 			csize = rounded;
5378 		}
5379 		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5380 		l2hdr->b_asize = csize;
5381 		l2hdr->b_tmp_cdata = cdata;
5382 		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5383 		return (B_TRUE);
5384 	} else {
5385 		/*
5386 		 * Compression failed, release the compressed buffer.
5387 		 * l2hdr will be left unmodified.
5388 		 */
5389 		zio_data_buf_free(cdata, len);
5390 		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5391 		return (B_FALSE);
5392 	}
5393 }
5394 
5395 /*
5396  * Decompresses a zio read back from an l2arc device. On success, the
5397  * underlying zio's io_data buffer is overwritten by the uncompressed
5398  * version. On decompression error (corrupt compressed stream), the
5399  * zio->io_error value is set to signal an I/O error.
5400  *
5401  * Please note that the compressed data stream is not checksummed, so
5402  * if the underlying device is experiencing data corruption, we may feed
5403  * corrupt data to the decompressor, so the decompressor needs to be
5404  * able to handle this situation (LZ4 does).
5405  */
5406 static void
l2arc_decompress_zio(zio_t * zio,arc_buf_hdr_t * hdr,enum zio_compress c)5407 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5408 {
5409 	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5410 
5411 	if (zio->io_error != 0) {
5412 		/*
5413 		 * An io error has occured, just restore the original io
5414 		 * size in preparation for a main pool read.
5415 		 */
5416 		zio->io_orig_size = zio->io_size = hdr->b_size;
5417 		return;
5418 	}
5419 
5420 	if (c == ZIO_COMPRESS_EMPTY) {
5421 		/*
5422 		 * An empty buffer results in a null zio, which means we
5423 		 * need to fill its io_data after we're done restoring the
5424 		 * buffer's contents.
5425 		 */
5426 		ASSERT(hdr->b_buf != NULL);
5427 		bzero(hdr->b_buf->b_data, hdr->b_size);
5428 		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5429 	} else {
5430 		ASSERT(zio->io_data != NULL);
5431 		/*
5432 		 * We copy the compressed data from the start of the arc buffer
5433 		 * (the zio_read will have pulled in only what we need, the
5434 		 * rest is garbage which we will overwrite at decompression)
5435 		 * and then decompress back to the ARC data buffer. This way we
5436 		 * can minimize copying by simply decompressing back over the
5437 		 * original compressed data (rather than decompressing to an
5438 		 * aux buffer and then copying back the uncompressed buffer,
5439 		 * which is likely to be much larger).
5440 		 */
5441 		uint64_t csize;
5442 		void *cdata;
5443 
5444 		csize = zio->io_size;
5445 		cdata = zio_data_buf_alloc(csize);
5446 		bcopy(zio->io_data, cdata, csize);
5447 		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5448 		    hdr->b_size) != 0)
5449 			zio->io_error = EIO;
5450 		zio_data_buf_free(cdata, csize);
5451 	}
5452 
5453 	/* Restore the expected uncompressed IO size. */
5454 	zio->io_orig_size = zio->io_size = hdr->b_size;
5455 }
5456 
5457 /*
5458  * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5459  * This buffer serves as a temporary holder of compressed data while
5460  * the buffer entry is being written to an l2arc device. Once that is
5461  * done, we can dispose of it.
5462  */
5463 static void
l2arc_release_cdata_buf(arc_buf_hdr_t * hdr)5464 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
5465 {
5466 	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
5467 
5468 	ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress));
5469 	if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) {
5470 		/*
5471 		 * If the data was compressed, then we've allocated a
5472 		 * temporary buffer for it, so now we need to release it.
5473 		 */
5474 		ASSERT(l2hdr->b_tmp_cdata != NULL);
5475 		zio_data_buf_free(l2hdr->b_tmp_cdata, hdr->b_size);
5476 		l2hdr->b_tmp_cdata = NULL;
5477 	} else {
5478 		ASSERT(l2hdr->b_tmp_cdata == NULL);
5479 	}
5480 }
5481 
5482 /*
5483  * This thread feeds the L2ARC at regular intervals.  This is the beating
5484  * heart of the L2ARC.
5485  */
5486 static void
l2arc_feed_thread(void * dummy __unused)5487 l2arc_feed_thread(void *dummy __unused)
5488 {
5489 	callb_cpr_t cpr;
5490 	l2arc_dev_t *dev;
5491 	spa_t *spa;
5492 	uint64_t size, wrote;
5493 	clock_t begin, next = ddi_get_lbolt();
5494 	boolean_t headroom_boost = B_FALSE;
5495 
5496 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5497 
5498 	mutex_enter(&l2arc_feed_thr_lock);
5499 
5500 	while (l2arc_thread_exit == 0) {
5501 		CALLB_CPR_SAFE_BEGIN(&cpr);
5502 		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5503 		    next - ddi_get_lbolt());
5504 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5505 		next = ddi_get_lbolt() + hz;
5506 
5507 		/*
5508 		 * Quick check for L2ARC devices.
5509 		 */
5510 		mutex_enter(&l2arc_dev_mtx);
5511 		if (l2arc_ndev == 0) {
5512 			mutex_exit(&l2arc_dev_mtx);
5513 			continue;
5514 		}
5515 		mutex_exit(&l2arc_dev_mtx);
5516 		begin = ddi_get_lbolt();
5517 
5518 		/*
5519 		 * This selects the next l2arc device to write to, and in
5520 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
5521 		 * will return NULL if there are now no l2arc devices or if
5522 		 * they are all faulted.
5523 		 *
5524 		 * If a device is returned, its spa's config lock is also
5525 		 * held to prevent device removal.  l2arc_dev_get_next()
5526 		 * will grab and release l2arc_dev_mtx.
5527 		 */
5528 		if ((dev = l2arc_dev_get_next()) == NULL)
5529 			continue;
5530 
5531 		spa = dev->l2ad_spa;
5532 		ASSERT(spa != NULL);
5533 
5534 		/*
5535 		 * If the pool is read-only then force the feed thread to
5536 		 * sleep a little longer.
5537 		 */
5538 		if (!spa_writeable(spa)) {
5539 			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5540 			spa_config_exit(spa, SCL_L2ARC, dev);
5541 			continue;
5542 		}
5543 
5544 		/*
5545 		 * Avoid contributing to memory pressure.
5546 		 */
5547 		if (arc_reclaim_needed()) {
5548 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5549 			spa_config_exit(spa, SCL_L2ARC, dev);
5550 			continue;
5551 		}
5552 
5553 		ARCSTAT_BUMP(arcstat_l2_feeds);
5554 
5555 		size = l2arc_write_size();
5556 
5557 		/*
5558 		 * Evict L2ARC buffers that will be overwritten.
5559 		 */
5560 		l2arc_evict(dev, size, B_FALSE);
5561 
5562 		/*
5563 		 * Write ARC buffers.
5564 		 */
5565 		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5566 
5567 		/*
5568 		 * Calculate interval between writes.
5569 		 */
5570 		next = l2arc_write_interval(begin, size, wrote);
5571 		spa_config_exit(spa, SCL_L2ARC, dev);
5572 	}
5573 
5574 	l2arc_thread_exit = 0;
5575 	cv_broadcast(&l2arc_feed_thr_cv);
5576 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5577 	thread_exit();
5578 }
5579 
5580 boolean_t
l2arc_vdev_present(vdev_t * vd)5581 l2arc_vdev_present(vdev_t *vd)
5582 {
5583 	l2arc_dev_t *dev;
5584 
5585 	mutex_enter(&l2arc_dev_mtx);
5586 	for (dev = list_head(l2arc_dev_list); dev != NULL;
5587 	    dev = list_next(l2arc_dev_list, dev)) {
5588 		if (dev->l2ad_vdev == vd)
5589 			break;
5590 	}
5591 	mutex_exit(&l2arc_dev_mtx);
5592 
5593 	return (dev != NULL);
5594 }
5595 
5596 /*
5597  * Add a vdev for use by the L2ARC.  By this point the spa has already
5598  * validated the vdev and opened it.
5599  */
5600 void
l2arc_add_vdev(spa_t * spa,vdev_t * vd)5601 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5602 {
5603 	l2arc_dev_t *adddev;
5604 
5605 	ASSERT(!l2arc_vdev_present(vd));
5606 
5607 	vdev_ashift_optimize(vd);
5608 
5609 	/*
5610 	 * Create a new l2arc device entry.
5611 	 */
5612 	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5613 	adddev->l2ad_spa = spa;
5614 	adddev->l2ad_vdev = vd;
5615 	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5616 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5617 	adddev->l2ad_hand = adddev->l2ad_start;
5618 	adddev->l2ad_evict = adddev->l2ad_start;
5619 	adddev->l2ad_first = B_TRUE;
5620 	adddev->l2ad_writing = B_FALSE;
5621 
5622 	/*
5623 	 * This is a list of all ARC buffers that are still valid on the
5624 	 * device.
5625 	 */
5626 	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5627 	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5628 	    offsetof(arc_buf_hdr_t, b_l2node));
5629 
5630 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5631 
5632 	/*
5633 	 * Add device to global list
5634 	 */
5635 	mutex_enter(&l2arc_dev_mtx);
5636 	list_insert_head(l2arc_dev_list, adddev);
5637 	atomic_inc_64(&l2arc_ndev);
5638 	mutex_exit(&l2arc_dev_mtx);
5639 }
5640 
5641 /*
5642  * Remove a vdev from the L2ARC.
5643  */
5644 void
l2arc_remove_vdev(vdev_t * vd)5645 l2arc_remove_vdev(vdev_t *vd)
5646 {
5647 	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5648 
5649 	/*
5650 	 * Find the device by vdev
5651 	 */
5652 	mutex_enter(&l2arc_dev_mtx);
5653 	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5654 		nextdev = list_next(l2arc_dev_list, dev);
5655 		if (vd == dev->l2ad_vdev) {
5656 			remdev = dev;
5657 			break;
5658 		}
5659 	}
5660 	ASSERT(remdev != NULL);
5661 
5662 	/*
5663 	 * Remove device from global list
5664 	 */
5665 	list_remove(l2arc_dev_list, remdev);
5666 	l2arc_dev_last = NULL;		/* may have been invalidated */
5667 	atomic_dec_64(&l2arc_ndev);
5668 	mutex_exit(&l2arc_dev_mtx);
5669 
5670 	/*
5671 	 * Clear all buflists and ARC references.  L2ARC device flush.
5672 	 */
5673 	l2arc_evict(remdev, 0, B_TRUE);
5674 	list_destroy(remdev->l2ad_buflist);
5675 	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5676 	kmem_free(remdev, sizeof (l2arc_dev_t));
5677 }
5678 
5679 void
l2arc_init(void)5680 l2arc_init(void)
5681 {
5682 	l2arc_thread_exit = 0;
5683 	l2arc_ndev = 0;
5684 	l2arc_writes_sent = 0;
5685 	l2arc_writes_done = 0;
5686 
5687 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5688 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5689 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5690 	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5691 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5692 
5693 	l2arc_dev_list = &L2ARC_dev_list;
5694 	l2arc_free_on_write = &L2ARC_free_on_write;
5695 	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5696 	    offsetof(l2arc_dev_t, l2ad_node));
5697 	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5698 	    offsetof(l2arc_data_free_t, l2df_list_node));
5699 }
5700 
5701 void
l2arc_fini(void)5702 l2arc_fini(void)
5703 {
5704 	/*
5705 	 * This is called from dmu_fini(), which is called from spa_fini();
5706 	 * Because of this, we can assume that all l2arc devices have
5707 	 * already been removed when the pools themselves were removed.
5708 	 */
5709 
5710 	l2arc_do_free_on_write();
5711 
5712 	mutex_destroy(&l2arc_feed_thr_lock);
5713 	cv_destroy(&l2arc_feed_thr_cv);
5714 	mutex_destroy(&l2arc_dev_mtx);
5715 	mutex_destroy(&l2arc_buflist_mtx);
5716 	mutex_destroy(&l2arc_free_on_write_mtx);
5717 
5718 	list_destroy(l2arc_dev_list);
5719 	list_destroy(l2arc_free_on_write);
5720 }
5721 
5722 void
l2arc_start(void)5723 l2arc_start(void)
5724 {
5725 	if (!(spa_mode_global & FWRITE))
5726 		return;
5727 
5728 	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5729 	    TS_RUN, minclsyspri);
5730 }
5731 
5732 void
l2arc_stop(void)5733 l2arc_stop(void)
5734 {
5735 	if (!(spa_mode_global & FWRITE))
5736 		return;
5737 
5738 	mutex_enter(&l2arc_feed_thr_lock);
5739 	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5740 	l2arc_thread_exit = 1;
5741 	while (l2arc_thread_exit != 0)
5742 		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5743 	mutex_exit(&l2arc_feed_thr_lock);
5744 }
5745