xref: /trueos/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c (revision afa027f3f568c9647fd596026f6fb803d3d2887c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2013 by Delphix. All rights reserved.
28  */
29 
30 #include <sys/zfs_context.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dmu_zfetch.h>
34 #include <sys/dmu.h>
35 #include <sys/dbuf.h>
36 #include <sys/kstat.h>
37 
38 /*
39  * I'm against tune-ables, but these should probably exist as tweakable globals
40  * until we can get this working the way we want it to.
41  */
42 
43 int zfs_prefetch_disable = 0;
44 
45 /* max # of streams per zfetch */
46 uint32_t	zfetch_max_streams = 8;
47 /* min time before stream reclaim */
48 uint32_t	zfetch_min_sec_reap = 2;
49 /* max number of blocks to fetch at a time */
50 uint32_t	zfetch_block_cap = 256;
51 /* number of bytes in a array_read at which we stop prefetching (1Mb) */
52 uint64_t	zfetch_array_rd_sz = 1024 * 1024;
53 
54 SYSCTL_DECL(_vfs_zfs);
55 SYSCTL_INT(_vfs_zfs, OID_AUTO, prefetch_disable, CTLFLAG_RW,
56     &zfs_prefetch_disable, 0, "Disable prefetch");
57 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH");
58 TUNABLE_INT("vfs.zfs.zfetch.max_streams", &zfetch_max_streams);
59 SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RW,
60     &zfetch_max_streams, 0, "Max # of streams per zfetch");
61 TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap);
62 SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN,
63     &zfetch_min_sec_reap, 0, "Min time before stream reclaim");
64 TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap);
65 SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN,
66     &zfetch_block_cap, 0, "Max number of blocks to fetch at a time");
67 TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz);
68 SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN,
69     &zfetch_array_rd_sz, 0,
70     "Number of bytes in a array_read at which we stop prefetching");
71 
72 /* forward decls for static routines */
73 static boolean_t	dmu_zfetch_colinear(zfetch_t *, zstream_t *);
74 static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
75 static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
76 static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
77 static boolean_t	dmu_zfetch_find(zfetch_t *, zstream_t *, int);
78 static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
79 static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
80 static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
81 static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
82 
83 typedef struct zfetch_stats {
84 	kstat_named_t zfetchstat_hits;
85 	kstat_named_t zfetchstat_misses;
86 	kstat_named_t zfetchstat_colinear_hits;
87 	kstat_named_t zfetchstat_colinear_misses;
88 	kstat_named_t zfetchstat_stride_hits;
89 	kstat_named_t zfetchstat_stride_misses;
90 	kstat_named_t zfetchstat_reclaim_successes;
91 	kstat_named_t zfetchstat_reclaim_failures;
92 	kstat_named_t zfetchstat_stream_resets;
93 	kstat_named_t zfetchstat_stream_noresets;
94 	kstat_named_t zfetchstat_bogus_streams;
95 } zfetch_stats_t;
96 
97 static zfetch_stats_t zfetch_stats = {
98 	{ "hits",			KSTAT_DATA_UINT64 },
99 	{ "misses",			KSTAT_DATA_UINT64 },
100 	{ "colinear_hits",		KSTAT_DATA_UINT64 },
101 	{ "colinear_misses",		KSTAT_DATA_UINT64 },
102 	{ "stride_hits",		KSTAT_DATA_UINT64 },
103 	{ "stride_misses",		KSTAT_DATA_UINT64 },
104 	{ "reclaim_successes",		KSTAT_DATA_UINT64 },
105 	{ "reclaim_failures",		KSTAT_DATA_UINT64 },
106 	{ "streams_resets",		KSTAT_DATA_UINT64 },
107 	{ "streams_noresets",		KSTAT_DATA_UINT64 },
108 	{ "bogus_streams",		KSTAT_DATA_UINT64 },
109 };
110 
111 #define	ZFETCHSTAT_INCR(stat, val) \
112 	atomic_add_64(&zfetch_stats.stat.value.ui64, (val));
113 
114 #define	ZFETCHSTAT_BUMP(stat)		ZFETCHSTAT_INCR(stat, 1);
115 
116 kstat_t		*zfetch_ksp;
117 
118 /*
119  * Given a zfetch structure and a zstream structure, determine whether the
120  * blocks to be read are part of a co-linear pair of existing prefetch
121  * streams.  If a set is found, coalesce the streams, removing one, and
122  * configure the prefetch so it looks for a strided access pattern.
123  *
124  * In other words: if we find two sequential access streams that are
125  * the same length and distance N appart, and this read is N from the
126  * last stream, then we are probably in a strided access pattern.  So
127  * combine the two sequential streams into a single strided stream.
128  *
129  * Returns whether co-linear streams were found.
130  */
131 static boolean_t
dmu_zfetch_colinear(zfetch_t * zf,zstream_t * zh)132 dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
133 {
134 	zstream_t	*z_walk;
135 	zstream_t	*z_comp;
136 
137 	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
138 		return (0);
139 
140 	if (zh == NULL) {
141 		rw_exit(&zf->zf_rwlock);
142 		return (0);
143 	}
144 
145 	for (z_walk = list_head(&zf->zf_stream); z_walk;
146 	    z_walk = list_next(&zf->zf_stream, z_walk)) {
147 		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
148 		    z_comp = list_next(&zf->zf_stream, z_comp)) {
149 			int64_t		diff;
150 
151 			if (z_walk->zst_len != z_walk->zst_stride ||
152 			    z_comp->zst_len != z_comp->zst_stride) {
153 				continue;
154 			}
155 
156 			diff = z_comp->zst_offset - z_walk->zst_offset;
157 			if (z_comp->zst_offset + diff == zh->zst_offset) {
158 				z_walk->zst_offset = zh->zst_offset;
159 				z_walk->zst_direction = diff < 0 ? -1 : 1;
160 				z_walk->zst_stride =
161 				    diff * z_walk->zst_direction;
162 				z_walk->zst_ph_offset =
163 				    zh->zst_offset + z_walk->zst_stride;
164 				dmu_zfetch_stream_remove(zf, z_comp);
165 				mutex_destroy(&z_comp->zst_lock);
166 				kmem_free(z_comp, sizeof (zstream_t));
167 
168 				dmu_zfetch_dofetch(zf, z_walk);
169 
170 				rw_exit(&zf->zf_rwlock);
171 				return (1);
172 			}
173 
174 			diff = z_walk->zst_offset - z_comp->zst_offset;
175 			if (z_walk->zst_offset + diff == zh->zst_offset) {
176 				z_walk->zst_offset = zh->zst_offset;
177 				z_walk->zst_direction = diff < 0 ? -1 : 1;
178 				z_walk->zst_stride =
179 				    diff * z_walk->zst_direction;
180 				z_walk->zst_ph_offset =
181 				    zh->zst_offset + z_walk->zst_stride;
182 				dmu_zfetch_stream_remove(zf, z_comp);
183 				mutex_destroy(&z_comp->zst_lock);
184 				kmem_free(z_comp, sizeof (zstream_t));
185 
186 				dmu_zfetch_dofetch(zf, z_walk);
187 
188 				rw_exit(&zf->zf_rwlock);
189 				return (1);
190 			}
191 		}
192 	}
193 
194 	rw_exit(&zf->zf_rwlock);
195 	return (0);
196 }
197 
198 /*
199  * Given a zstream_t, determine the bounds of the prefetch.  Then call the
200  * routine that actually prefetches the individual blocks.
201  */
202 static void
dmu_zfetch_dofetch(zfetch_t * zf,zstream_t * zs)203 dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
204 {
205 	uint64_t	prefetch_tail;
206 	uint64_t	prefetch_limit;
207 	uint64_t	prefetch_ofst;
208 	uint64_t	prefetch_len;
209 	uint64_t	blocks_fetched;
210 
211 	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
212 	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
213 
214 	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
215 	    (int64_t)(zs->zst_offset + zs->zst_stride));
216 	/*
217 	 * XXX: use a faster division method?
218 	 */
219 	prefetch_limit = zs->zst_offset + zs->zst_len +
220 	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;
221 
222 	while (prefetch_tail < prefetch_limit) {
223 		prefetch_ofst = zs->zst_offset + zs->zst_direction *
224 		    (prefetch_tail - zs->zst_offset);
225 
226 		prefetch_len = zs->zst_len;
227 
228 		/*
229 		 * Don't prefetch beyond the end of the file, if working
230 		 * backwards.
231 		 */
232 		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
233 		    (prefetch_ofst > prefetch_tail)) {
234 			prefetch_len += prefetch_ofst;
235 			prefetch_ofst = 0;
236 		}
237 
238 		/* don't prefetch more than we're supposed to */
239 		if (prefetch_len > zs->zst_len)
240 			break;
241 
242 		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
243 		    prefetch_ofst, zs->zst_len);
244 
245 		prefetch_tail += zs->zst_stride;
246 		/* stop if we've run out of stuff to prefetch */
247 		if (blocks_fetched < zs->zst_len)
248 			break;
249 	}
250 	zs->zst_ph_offset = prefetch_tail;
251 	zs->zst_last = ddi_get_lbolt();
252 }
253 
254 void
zfetch_init(void)255 zfetch_init(void)
256 {
257 
258 	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
259 	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
260 	    KSTAT_FLAG_VIRTUAL);
261 
262 	if (zfetch_ksp != NULL) {
263 		zfetch_ksp->ks_data = &zfetch_stats;
264 		kstat_install(zfetch_ksp);
265 	}
266 }
267 
268 void
zfetch_fini(void)269 zfetch_fini(void)
270 {
271 	if (zfetch_ksp != NULL) {
272 		kstat_delete(zfetch_ksp);
273 		zfetch_ksp = NULL;
274 	}
275 }
276 
277 /*
278  * This takes a pointer to a zfetch structure and a dnode.  It performs the
279  * necessary setup for the zfetch structure, grokking data from the
280  * associated dnode.
281  */
282 void
dmu_zfetch_init(zfetch_t * zf,dnode_t * dno)283 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
284 {
285 	if (zf == NULL) {
286 		return;
287 	}
288 
289 	zf->zf_dnode = dno;
290 	zf->zf_stream_cnt = 0;
291 	zf->zf_alloc_fail = 0;
292 
293 	list_create(&zf->zf_stream, sizeof (zstream_t),
294 	    offsetof(zstream_t, zst_node));
295 
296 	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
297 }
298 
299 /*
300  * This function computes the actual size, in blocks, that can be prefetched,
301  * and fetches it.
302  */
303 static uint64_t
dmu_zfetch_fetch(dnode_t * dn,uint64_t blkid,uint64_t nblks)304 dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
305 {
306 	uint64_t	fetchsz;
307 	uint64_t	i;
308 
309 	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
310 
311 	for (i = 0; i < fetchsz; i++) {
312 		dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
313 	}
314 
315 	return (fetchsz);
316 }
317 
318 /*
319  * this function returns the number of blocks that would be prefetched, based
320  * upon the supplied dnode, blockid, and nblks.  This is used so that we can
321  * update streams in place, and then prefetch with their old value after the
322  * fact.  This way, we can delay the prefetch, but subsequent accesses to the
323  * stream won't result in the same data being prefetched multiple times.
324  */
325 static uint64_t
dmu_zfetch_fetchsz(dnode_t * dn,uint64_t blkid,uint64_t nblks)326 dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
327 {
328 	uint64_t	fetchsz;
329 
330 	if (blkid > dn->dn_maxblkid) {
331 		return (0);
332 	}
333 
334 	/* compute fetch size */
335 	if (blkid + nblks + 1 > dn->dn_maxblkid) {
336 		fetchsz = (dn->dn_maxblkid - blkid) + 1;
337 		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
338 	} else {
339 		fetchsz = nblks;
340 	}
341 
342 
343 	return (fetchsz);
344 }
345 
346 /*
347  * given a zfetch and a zstream structure, see if there is an associated zstream
348  * for this block read.  If so, it starts a prefetch for the stream it
349  * located and returns true, otherwise it returns false
350  */
351 static boolean_t
dmu_zfetch_find(zfetch_t * zf,zstream_t * zh,int prefetched)352 dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
353 {
354 	zstream_t	*zs;
355 	int64_t		diff;
356 	int		reset = !prefetched;
357 	int		rc = 0;
358 
359 	if (zh == NULL)
360 		return (0);
361 
362 	/*
363 	 * XXX: This locking strategy is a bit coarse; however, it's impact has
364 	 * yet to be tested.  If this turns out to be an issue, it can be
365 	 * modified in a number of different ways.
366 	 */
367 
368 	rw_enter(&zf->zf_rwlock, RW_READER);
369 top:
370 
371 	for (zs = list_head(&zf->zf_stream); zs;
372 	    zs = list_next(&zf->zf_stream, zs)) {
373 
374 		/*
375 		 * XXX - should this be an assert?
376 		 */
377 		if (zs->zst_len == 0) {
378 			/* bogus stream */
379 			ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
380 			continue;
381 		}
382 
383 		/*
384 		 * We hit this case when we are in a strided prefetch stream:
385 		 * we will read "len" blocks before "striding".
386 		 */
387 		if (zh->zst_offset >= zs->zst_offset &&
388 		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
389 			if (prefetched) {
390 				/* already fetched */
391 				ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
392 				rc = 1;
393 				goto out;
394 			} else {
395 				ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
396 			}
397 		}
398 
399 		/*
400 		 * This is the forward sequential read case: we increment
401 		 * len by one each time we hit here, so we will enter this
402 		 * case on every read.
403 		 */
404 		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
405 
406 			reset = !prefetched && zs->zst_len > 1;
407 
408 			if (mutex_tryenter(&zs->zst_lock) == 0) {
409 				rc = 1;
410 				goto out;
411 			}
412 
413 			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
414 				mutex_exit(&zs->zst_lock);
415 				goto top;
416 			}
417 			zs->zst_len += zh->zst_len;
418 			diff = zs->zst_len - zfetch_block_cap;
419 			if (diff > 0) {
420 				zs->zst_offset += diff;
421 				zs->zst_len = zs->zst_len > diff ?
422 				    zs->zst_len - diff : 0;
423 			}
424 			zs->zst_direction = ZFETCH_FORWARD;
425 
426 			break;
427 
428 		/*
429 		 * Same as above, but reading backwards through the file.
430 		 */
431 		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
432 			/* backwards sequential access */
433 
434 			reset = !prefetched && zs->zst_len > 1;
435 
436 			if (mutex_tryenter(&zs->zst_lock) == 0) {
437 				rc = 1;
438 				goto out;
439 			}
440 
441 			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
442 				mutex_exit(&zs->zst_lock);
443 				goto top;
444 			}
445 
446 			zs->zst_offset = zs->zst_offset > zh->zst_len ?
447 			    zs->zst_offset - zh->zst_len : 0;
448 			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
449 			    zs->zst_ph_offset - zh->zst_len : 0;
450 			zs->zst_len += zh->zst_len;
451 
452 			diff = zs->zst_len - zfetch_block_cap;
453 			if (diff > 0) {
454 				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
455 				    zs->zst_ph_offset - diff : 0;
456 				zs->zst_len = zs->zst_len > diff ?
457 				    zs->zst_len - diff : zs->zst_len;
458 			}
459 			zs->zst_direction = ZFETCH_BACKWARD;
460 
461 			break;
462 
463 		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
464 		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
465 			/* strided forward access */
466 
467 			if (mutex_tryenter(&zs->zst_lock) == 0) {
468 				rc = 1;
469 				goto out;
470 			}
471 
472 			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
473 			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
474 				mutex_exit(&zs->zst_lock);
475 				goto top;
476 			}
477 
478 			zs->zst_offset += zs->zst_stride;
479 			zs->zst_direction = ZFETCH_FORWARD;
480 
481 			break;
482 
483 		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
484 		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
485 			/* strided reverse access */
486 
487 			if (mutex_tryenter(&zs->zst_lock) == 0) {
488 				rc = 1;
489 				goto out;
490 			}
491 
492 			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
493 			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
494 				mutex_exit(&zs->zst_lock);
495 				goto top;
496 			}
497 
498 			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
499 			    zs->zst_offset - zs->zst_stride : 0;
500 			zs->zst_ph_offset = (zs->zst_ph_offset >
501 			    (2 * zs->zst_stride)) ?
502 			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
503 			zs->zst_direction = ZFETCH_BACKWARD;
504 
505 			break;
506 		}
507 	}
508 
509 	if (zs) {
510 		if (reset) {
511 			zstream_t *remove = zs;
512 
513 			ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
514 			rc = 0;
515 			mutex_exit(&zs->zst_lock);
516 			rw_exit(&zf->zf_rwlock);
517 			rw_enter(&zf->zf_rwlock, RW_WRITER);
518 			/*
519 			 * Relocate the stream, in case someone removes
520 			 * it while we were acquiring the WRITER lock.
521 			 */
522 			for (zs = list_head(&zf->zf_stream); zs;
523 			    zs = list_next(&zf->zf_stream, zs)) {
524 				if (zs == remove) {
525 					dmu_zfetch_stream_remove(zf, zs);
526 					mutex_destroy(&zs->zst_lock);
527 					kmem_free(zs, sizeof (zstream_t));
528 					break;
529 				}
530 			}
531 		} else {
532 			ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
533 			rc = 1;
534 			dmu_zfetch_dofetch(zf, zs);
535 			mutex_exit(&zs->zst_lock);
536 		}
537 	}
538 out:
539 	rw_exit(&zf->zf_rwlock);
540 	return (rc);
541 }
542 
543 /*
544  * Clean-up state associated with a zfetch structure.  This frees allocated
545  * structure members, empties the zf_stream tree, and generally makes things
546  * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
547  */
548 void
dmu_zfetch_rele(zfetch_t * zf)549 dmu_zfetch_rele(zfetch_t *zf)
550 {
551 	zstream_t	*zs;
552 	zstream_t	*zs_next;
553 
554 	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
555 
556 	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
557 		zs_next = list_next(&zf->zf_stream, zs);
558 
559 		list_remove(&zf->zf_stream, zs);
560 		mutex_destroy(&zs->zst_lock);
561 		kmem_free(zs, sizeof (zstream_t));
562 	}
563 	list_destroy(&zf->zf_stream);
564 	rw_destroy(&zf->zf_rwlock);
565 
566 	zf->zf_dnode = NULL;
567 }
568 
569 /*
570  * Given a zfetch and zstream structure, insert the zstream structure into the
571  * AVL tree contained within the zfetch structure.  Peform the appropriate
572  * book-keeping.  It is possible that another thread has inserted a stream which
573  * matches one that we are about to insert, so we must be sure to check for this
574  * case.  If one is found, return failure, and let the caller cleanup the
575  * duplicates.
576  */
577 static int
dmu_zfetch_stream_insert(zfetch_t * zf,zstream_t * zs)578 dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
579 {
580 	zstream_t	*zs_walk;
581 	zstream_t	*zs_next;
582 
583 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
584 
585 	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
586 		zs_next = list_next(&zf->zf_stream, zs_walk);
587 
588 		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
589 			return (0);
590 		}
591 	}
592 
593 	list_insert_head(&zf->zf_stream, zs);
594 	zf->zf_stream_cnt++;
595 	return (1);
596 }
597 
598 
599 /*
600  * Walk the list of zstreams in the given zfetch, find an old one (by time), and
601  * reclaim it for use by the caller.
602  */
603 static zstream_t *
dmu_zfetch_stream_reclaim(zfetch_t * zf)604 dmu_zfetch_stream_reclaim(zfetch_t *zf)
605 {
606 	zstream_t	*zs;
607 	clock_t		ticks;
608 
609 	ticks = zfetch_min_sec_reap * hz;
610 	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
611 		return (0);
612 
613 	for (zs = list_head(&zf->zf_stream); zs;
614 	    zs = list_next(&zf->zf_stream, zs)) {
615 
616 		if (ddi_get_lbolt() - zs->zst_last > ticks)
617 			break;
618 	}
619 
620 	if (zs) {
621 		dmu_zfetch_stream_remove(zf, zs);
622 		mutex_destroy(&zs->zst_lock);
623 		bzero(zs, sizeof (zstream_t));
624 	} else {
625 		zf->zf_alloc_fail++;
626 	}
627 	rw_exit(&zf->zf_rwlock);
628 
629 	return (zs);
630 }
631 
632 /*
633  * Given a zfetch and zstream structure, remove the zstream structure from its
634  * container in the zfetch structure.  Perform the appropriate book-keeping.
635  */
636 static void
dmu_zfetch_stream_remove(zfetch_t * zf,zstream_t * zs)637 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
638 {
639 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
640 
641 	list_remove(&zf->zf_stream, zs);
642 	zf->zf_stream_cnt--;
643 }
644 
645 static int
dmu_zfetch_streams_equal(zstream_t * zs1,zstream_t * zs2)646 dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
647 {
648 	if (zs1->zst_offset != zs2->zst_offset)
649 		return (0);
650 
651 	if (zs1->zst_len != zs2->zst_len)
652 		return (0);
653 
654 	if (zs1->zst_stride != zs2->zst_stride)
655 		return (0);
656 
657 	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
658 		return (0);
659 
660 	if (zs1->zst_cap != zs2->zst_cap)
661 		return (0);
662 
663 	if (zs1->zst_direction != zs2->zst_direction)
664 		return (0);
665 
666 	return (1);
667 }
668 
669 /*
670  * This is the prefetch entry point.  It calls all of the other dmu_zfetch
671  * routines to create, delete, find, or operate upon prefetch streams.
672  */
673 void
dmu_zfetch(zfetch_t * zf,uint64_t offset,uint64_t size,int prefetched)674 dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
675 {
676 	zstream_t	zst;
677 	zstream_t	*newstream;
678 	boolean_t	fetched;
679 	int		inserted;
680 	unsigned int	blkshft;
681 	uint64_t	blksz;
682 
683 	if (zfs_prefetch_disable)
684 		return;
685 
686 	/* files that aren't ln2 blocksz are only one block -- nothing to do */
687 	if (!zf->zf_dnode->dn_datablkshift)
688 		return;
689 
690 	/* convert offset and size, into blockid and nblocks */
691 	blkshft = zf->zf_dnode->dn_datablkshift;
692 	blksz = (1 << blkshft);
693 
694 	bzero(&zst, sizeof (zstream_t));
695 	zst.zst_offset = offset >> blkshft;
696 	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
697 	    P2ALIGN(offset, blksz)) >> blkshft;
698 
699 	fetched = dmu_zfetch_find(zf, &zst, prefetched);
700 	if (fetched) {
701 		ZFETCHSTAT_BUMP(zfetchstat_hits);
702 	} else {
703 		ZFETCHSTAT_BUMP(zfetchstat_misses);
704 		fetched = dmu_zfetch_colinear(zf, &zst);
705 		if (fetched) {
706 			ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
707 		} else {
708 			ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
709 		}
710 	}
711 
712 	if (!fetched) {
713 		newstream = dmu_zfetch_stream_reclaim(zf);
714 
715 		/*
716 		 * we still couldn't find a stream, drop the lock, and allocate
717 		 * one if possible.  Otherwise, give up and go home.
718 		 */
719 		if (newstream) {
720 			ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
721 		} else {
722 			uint64_t	maxblocks;
723 			uint32_t	max_streams;
724 			uint32_t	cur_streams;
725 
726 			ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
727 			cur_streams = zf->zf_stream_cnt;
728 			maxblocks = zf->zf_dnode->dn_maxblkid;
729 
730 			max_streams = MIN(zfetch_max_streams,
731 			    (maxblocks / zfetch_block_cap));
732 			if (max_streams == 0) {
733 				max_streams++;
734 			}
735 
736 			if (cur_streams >= max_streams) {
737 				return;
738 			}
739 			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
740 		}
741 
742 		newstream->zst_offset = zst.zst_offset;
743 		newstream->zst_len = zst.zst_len;
744 		newstream->zst_stride = zst.zst_len;
745 		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
746 		newstream->zst_cap = zst.zst_len;
747 		newstream->zst_direction = ZFETCH_FORWARD;
748 		newstream->zst_last = ddi_get_lbolt();
749 
750 		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
751 
752 		rw_enter(&zf->zf_rwlock, RW_WRITER);
753 		inserted = dmu_zfetch_stream_insert(zf, newstream);
754 		rw_exit(&zf->zf_rwlock);
755 
756 		if (!inserted) {
757 			mutex_destroy(&newstream->zst_lock);
758 			kmem_free(newstream, sizeof (zstream_t));
759 		}
760 	}
761 }
762