xref: /freebsd-11-stable/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 21e28abad816a794e991d91c2f58d30f5b141217)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
24  */
25 /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
26 /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
27 /* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */
28 
29 #include <sys/dmu.h>
30 #include <sys/dmu_impl.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dbuf.h>
33 #include <sys/dnode.h>
34 #include <sys/zfs_context.h>
35 #include <sys/dmu_objset.h>
36 #include <sys/dmu_traverse.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/dsl_dir.h>
39 #include <sys/dsl_pool.h>
40 #include <sys/dsl_synctask.h>
41 #include <sys/dsl_prop.h>
42 #include <sys/dmu_zfetch.h>
43 #include <sys/zfs_ioctl.h>
44 #include <sys/zap.h>
45 #include <sys/zio_checksum.h>
46 #include <sys/zio_compress.h>
47 #include <sys/sa.h>
48 #include <sys/zfeature.h>
49 #include <sys/abd.h>
50 #ifdef _KERNEL
51 #include <sys/racct.h>
52 #include <sys/vm.h>
53 #include <sys/zfs_znode.h>
54 #endif
55 
56 /*
57  * Enable/disable nopwrite feature.
58  */
59 int zfs_nopwrite_enabled = 1;
/*
 * Published as "nopwrite_enabled" under the _vfs_zfs sysctl node.
 * CTLFLAG_RDTUN: read-only through sysctl, settable as a boot-time tunable.
 */
60 SYSCTL_DECL(_vfs_zfs);
61 SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN,
62     &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
63 
64 /*
65  * Tunable to control percentage of dirtied blocks from frees in one TXG.
66  * After this threshold is crossed, additional dirty blocks from frees
67  * wait until the next TXG.
68  * A value of zero will disable this throttle.
69  */
70 uint32_t zfs_per_txg_dirty_frees_percent = 30;
/*
 * Published as "per_txg_dirty_frees_percent" under the _vfs_zfs sysctl node.
 * CTLFLAG_RWTUN: writable at runtime and settable as a boot-time tunable.
 * NOTE(review): the variable is uint32_t but is registered with SYSCTL_INT;
 * confirm whether SYSCTL_UINT was intended.
 */
71 SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
72 	&zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg");
73 
74 /*
75  * This can be used for testing, to ensure that certain actions happen
76  * while in the middle of a remap (which might otherwise complete too
77  * quickly).
78  */
/*
 * Defaults to 0.  NOTE(review): the consumer is outside this excerpt;
 * presumably the value is a delay in ticks and 0 means no delay -- confirm.
 */
79 int zfs_object_remap_one_indirect_delay_ticks = 0;
80 
/*
 * Static attributes of every DMU object type; the array is sized by
 * DMU_OT_NUMTYPES, so rows are positional.  Each row holds, in order:
 * a DMU_BSWAP_* byteswap selector, two boolean flags, and a printable name.
 * NOTE(review): the meaning of the two boolean columns is defined by
 * dmu_object_type_info_t (not visible here) -- presumably "is metadata"
 * and "keep in dbuf metadata cache"; confirm against the header.
 * Rows must stay in the same order as the object type enumeration.
 */
81 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
82 	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "unallocated"		},
83 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "object directory"		},
84 	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "object array"		},
85 	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "packed nvlist"		},
86 	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "packed nvlist size"		},
87 	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj"			},
88 	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj header"		},
89 	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map header"	},
90 	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map"		},
91 	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "ZIL intent log"		},
92 	{ DMU_BSWAP_DNODE,  TRUE,  FALSE,  "DMU dnode"			},
93 	{ DMU_BSWAP_OBJSET, TRUE,  TRUE,   "DMU objset"			},
94 	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL directory"		},
95 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL directory child map"	},
96 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset snap map"	},
97 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL props"			},
98 	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL dataset"		},
99 	{ DMU_BSWAP_ZNODE,  TRUE,  FALSE,  "ZFS znode"			},
100 	{ DMU_BSWAP_OLDACL, TRUE,  FALSE,  "ZFS V0 ACL"			},
101 	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "ZFS plain file"		},
102 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS directory"		},
103 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS master node"		},
104 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS delete queue"		},
105 	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "zvol object"		},
106 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "zvol prop"			},
107 	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "other uint8[]"		},
108 	{ DMU_BSWAP_UINT64, FALSE, FALSE,  "other uint64[]"		},
109 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "other ZAP"			},
110 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "persistent error log"	},
111 	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "SPA history"		},
112 	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA history offsets"	},
113 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "Pool properties"		},
114 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL permissions"		},
115 	{ DMU_BSWAP_ACL,    TRUE,  FALSE,  "ZFS ACL"			},
116 	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "ZFS SYSACL"			},
117 	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "FUID table"			},
118 	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "FUID table size"		},
119 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset next clones"	},
120 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan work queue"		},
121 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group used"	},
122 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group quota"	},
123 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "snapshot refcount tags"	},
124 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT ZAP algorithm"		},
125 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT statistics"		},
126 	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "System attributes"		},
127 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA master node"		},
128 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr registration"	},
129 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr layouts"		},
130 	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan translations"		},
131 	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "deduplicated block"		},
132 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL deadlist map"		},
133 	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL deadlist map hdr"	},
134 	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dir clones"		},
135 	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj subobj"		}
136 };
137 
/*
 * Byteswap routine and printable name for each byteswap class; the array
 * is sized by DMU_BSWAP_NUMFUNCS, so rows are positional.
 * NOTE(review): rows are presumably ordered to match the DMU_BSWAP_*
 * selectors used in dmu_ot[] -- confirm against the enum definition.
 */
138 const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
139 	{	byteswap_uint8_array,	"uint8"		},
140 	{	byteswap_uint16_array,	"uint16"	},
141 	{	byteswap_uint32_array,	"uint32"	},
142 	{	byteswap_uint64_array,	"uint64"	},
143 	{	zap_byteswap,		"zap"		},
144 	{	dnode_buf_byteswap,	"dnode"		},
145 	{	dmu_objset_byteswap,	"objset"	},
146 	{	zfs_znode_byteswap,	"znode"		},
147 	{	zfs_oldacl_byteswap,	"oldacl"	},
148 	{	zfs_acl_byteswap,	"acl"		}
149 };
150 
151 int
dmu_buf_hold_noread_by_dnode(dnode_t * dn,uint64_t offset,void * tag,dmu_buf_t ** dbp)152 dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
153     void *tag, dmu_buf_t **dbp)
154 {
155 	uint64_t blkid;
156 	dmu_buf_impl_t *db;
157 
158 	blkid = dbuf_whichblock(dn, 0, offset);
159 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
160 	db = dbuf_hold(dn, blkid, tag);
161 	rw_exit(&dn->dn_struct_rwlock);
162 
163 	if (db == NULL) {
164 		*dbp = NULL;
165 		return (SET_ERROR(EIO));
166 	}
167 
168 	*dbp = &db->db;
169 	return (0);
170 }
171 int
dmu_buf_hold_noread(objset_t * os,uint64_t object,uint64_t offset,void * tag,dmu_buf_t ** dbp)172 dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
173     void *tag, dmu_buf_t **dbp)
174 {
175 	dnode_t *dn;
176 	uint64_t blkid;
177 	dmu_buf_impl_t *db;
178 	int err;
179 
180 	err = dnode_hold(os, object, FTAG, &dn);
181 	if (err)
182 		return (err);
183 	blkid = dbuf_whichblock(dn, 0, offset);
184 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
185 	db = dbuf_hold(dn, blkid, tag);
186 	rw_exit(&dn->dn_struct_rwlock);
187 	dnode_rele(dn, FTAG);
188 
189 	if (db == NULL) {
190 		*dbp = NULL;
191 		return (SET_ERROR(EIO));
192 	}
193 
194 	*dbp = &db->db;
195 	return (err);
196 }
197 
198 int
dmu_buf_hold_by_dnode(dnode_t * dn,uint64_t offset,void * tag,dmu_buf_t ** dbp,int flags)199 dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
200     void *tag, dmu_buf_t **dbp, int flags)
201 {
202 	int err;
203 	int db_flags = DB_RF_CANFAIL;
204 
205 	if (flags & DMU_READ_NO_PREFETCH)
206 		db_flags |= DB_RF_NOPREFETCH;
207 
208 	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
209 	if (err == 0) {
210 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
211 		err = dbuf_read(db, NULL, db_flags);
212 		if (err != 0) {
213 			dbuf_rele(db, tag);
214 			*dbp = NULL;
215 		}
216 	}
217 
218 	return (err);
219 }
220 
221 int
dmu_buf_hold(objset_t * os,uint64_t object,uint64_t offset,void * tag,dmu_buf_t ** dbp,int flags)222 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
223     void *tag, dmu_buf_t **dbp, int flags)
224 {
225 	int err;
226 	int db_flags = DB_RF_CANFAIL;
227 
228 	if (flags & DMU_READ_NO_PREFETCH)
229 		db_flags |= DB_RF_NOPREFETCH;
230 
231 	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
232 	if (err == 0) {
233 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
234 		err = dbuf_read(db, NULL, db_flags);
235 		if (err != 0) {
236 			dbuf_rele(db, tag);
237 			*dbp = NULL;
238 		}
239 	}
240 
241 	return (err);
242 }
243 
244 int
dmu_bonus_max(void)245 dmu_bonus_max(void)
246 {
247 	return (DN_MAX_BONUSLEN);
248 }
249 
250 int
dmu_set_bonus(dmu_buf_t * db_fake,int newsize,dmu_tx_t * tx)251 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
252 {
253 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
254 	dnode_t *dn;
255 	int error;
256 
257 	DB_DNODE_ENTER(db);
258 	dn = DB_DNODE(db);
259 
260 	if (dn->dn_bonus != db) {
261 		error = SET_ERROR(EINVAL);
262 	} else if (newsize < 0 || newsize > db_fake->db_size) {
263 		error = SET_ERROR(EINVAL);
264 	} else {
265 		dnode_setbonuslen(dn, newsize, tx);
266 		error = 0;
267 	}
268 
269 	DB_DNODE_EXIT(db);
270 	return (error);
271 }
272 
273 int
dmu_set_bonustype(dmu_buf_t * db_fake,dmu_object_type_t type,dmu_tx_t * tx)274 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
275 {
276 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
277 	dnode_t *dn;
278 	int error;
279 
280 	DB_DNODE_ENTER(db);
281 	dn = DB_DNODE(db);
282 
283 	if (!DMU_OT_IS_VALID(type)) {
284 		error = SET_ERROR(EINVAL);
285 	} else if (dn->dn_bonus != db) {
286 		error = SET_ERROR(EINVAL);
287 	} else {
288 		dnode_setbonus_type(dn, type, tx);
289 		error = 0;
290 	}
291 
292 	DB_DNODE_EXIT(db);
293 	return (error);
294 }
295 
296 dmu_object_type_t
dmu_get_bonustype(dmu_buf_t * db_fake)297 dmu_get_bonustype(dmu_buf_t *db_fake)
298 {
299 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
300 	dnode_t *dn;
301 	dmu_object_type_t type;
302 
303 	DB_DNODE_ENTER(db);
304 	dn = DB_DNODE(db);
305 	type = dn->dn_bonustype;
306 	DB_DNODE_EXIT(db);
307 
308 	return (type);
309 }
310 
311 int
dmu_rm_spill(objset_t * os,uint64_t object,dmu_tx_t * tx)312 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
313 {
314 	dnode_t *dn;
315 	int error;
316 
317 	error = dnode_hold(os, object, FTAG, &dn);
318 	dbuf_rm_spill(dn, tx);
319 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
320 	dnode_rm_spill(dn, tx);
321 	rw_exit(&dn->dn_struct_rwlock);
322 	dnode_rele(dn, FTAG);
323 	return (error);
324 }
325 
326 /*
327  * returns ENOENT, EIO, or 0.
328  */
329 int
dmu_bonus_hold(objset_t * os,uint64_t object,void * tag,dmu_buf_t ** dbp)330 dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
331 {
332 	dnode_t *dn;
333 	dmu_buf_impl_t *db;
334 	int error;
335 
336 	error = dnode_hold(os, object, FTAG, &dn);
337 	if (error)
338 		return (error);
339 
340 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
341 	if (dn->dn_bonus == NULL) {
342 		rw_exit(&dn->dn_struct_rwlock);
343 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
344 		if (dn->dn_bonus == NULL)
345 			dbuf_create_bonus(dn);
346 	}
347 	db = dn->dn_bonus;
348 
349 	/* as long as the bonus buf is held, the dnode will be held */
350 	if (refcount_add(&db->db_holds, tag) == 1) {
351 		VERIFY(dnode_add_ref(dn, db));
352 		atomic_inc_32(&dn->dn_dbufs_count);
353 	}
354 
355 	/*
356 	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
357 	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
358 	 * a dnode hold for every dbuf.
359 	 */
360 	rw_exit(&dn->dn_struct_rwlock);
361 
362 	dnode_rele(dn, FTAG);
363 
364 	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
365 
366 	*dbp = &db->db;
367 	return (0);
368 }
369 
370 /*
371  * returns ENOENT, EIO, or 0.
372  *
373  * This interface will allocate a blank spill dbuf when a spill blk
374  * doesn't already exist on the dnode.
375  *
376  * if you only want to find an already existing spill db, then
377  * dmu_spill_hold_existing() should be used.
378  */
379 int
dmu_spill_hold_by_dnode(dnode_t * dn,uint32_t flags,void * tag,dmu_buf_t ** dbp)380 dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
381 {
382 	dmu_buf_impl_t *db = NULL;
383 	int err;
384 
385 	if ((flags & DB_RF_HAVESTRUCT) == 0)
386 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
387 
388 	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
389 
390 	if ((flags & DB_RF_HAVESTRUCT) == 0)
391 		rw_exit(&dn->dn_struct_rwlock);
392 
393 	ASSERT(db != NULL);
394 	err = dbuf_read(db, NULL, flags);
395 	if (err == 0)
396 		*dbp = &db->db;
397 	else
398 		dbuf_rele(db, tag);
399 	return (err);
400 }
401 
402 int
dmu_spill_hold_existing(dmu_buf_t * bonus,void * tag,dmu_buf_t ** dbp)403 dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
404 {
405 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
406 	dnode_t *dn;
407 	int err;
408 
409 	DB_DNODE_ENTER(db);
410 	dn = DB_DNODE(db);
411 
412 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
413 		err = SET_ERROR(EINVAL);
414 	} else {
415 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
416 
417 		if (!dn->dn_have_spill) {
418 			err = SET_ERROR(ENOENT);
419 		} else {
420 			err = dmu_spill_hold_by_dnode(dn,
421 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
422 		}
423 
424 		rw_exit(&dn->dn_struct_rwlock);
425 	}
426 
427 	DB_DNODE_EXIT(db);
428 	return (err);
429 }
430 
431 int
dmu_spill_hold_by_bonus(dmu_buf_t * bonus,void * tag,dmu_buf_t ** dbp)432 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
433 {
434 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
435 	dnode_t *dn;
436 	int err;
437 
438 	DB_DNODE_ENTER(db);
439 	dn = DB_DNODE(db);
440 	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
441 	DB_DNODE_EXIT(db);
442 
443 	return (err);
444 }
445 
446 /*
447  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
448  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
449  * and can induce severe lock contention when writing to several files
450  * whose dnodes are in the same block.
451  */
452 int
dmu_buf_hold_array_by_dnode(dnode_t * dn,uint64_t offset,uint64_t length,boolean_t read,void * tag,int * numbufsp,dmu_buf_t *** dbpp,uint32_t flags)453 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
454     boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
455 {
456 	dmu_buf_t **dbp;
457 	uint64_t blkid, nblks, i;
458 	uint32_t dbuf_flags;
459 	int err;
460 	zio_t *zio;
461 
462 	ASSERT(length <= DMU_MAX_ACCESS);
463 
464 	/*
465 	 * Note: We directly notify the prefetch code of this read, so that
466 	 * we can tell it about the multi-block read.  dbuf_read() only knows
467 	 * about the one block it is accessing.
468 	 */
469 	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
470 	    DB_RF_NOPREFETCH;
471 
472 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
473 	if (dn->dn_datablkshift) {
474 		int blkshift = dn->dn_datablkshift;
475 		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
476 		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
477 	} else {
478 		if (offset + length > dn->dn_datablksz) {
479 			zfs_panic_recover("zfs: accessing past end of object "
480 			    "%llx/%llx (size=%u access=%llu+%llu)",
481 			    (longlong_t)dn->dn_objset->
482 			    os_dsl_dataset->ds_object,
483 			    (longlong_t)dn->dn_object, dn->dn_datablksz,
484 			    (longlong_t)offset, (longlong_t)length);
485 			rw_exit(&dn->dn_struct_rwlock);
486 			return (SET_ERROR(EIO));
487 		}
488 		nblks = 1;
489 	}
490 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
491 
492 #if defined(_KERNEL) && defined(RACCT)
493 	if (racct_enable && !read) {
494 		PROC_LOCK(curproc);
495 		racct_add_force(curproc, RACCT_WRITEBPS, length);
496 		racct_add_force(curproc, RACCT_WRITEIOPS, nblks);
497 		PROC_UNLOCK(curproc);
498 	}
499 #endif
500 
501 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
502 	blkid = dbuf_whichblock(dn, 0, offset);
503 	for (i = 0; i < nblks; i++) {
504 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
505 		if (db == NULL) {
506 			rw_exit(&dn->dn_struct_rwlock);
507 			dmu_buf_rele_array(dbp, nblks, tag);
508 			zio_nowait(zio);
509 			return (SET_ERROR(EIO));
510 		}
511 
512 		/* initiate async i/o */
513 		if (read)
514 			(void) dbuf_read(db, zio, dbuf_flags);
515 #ifdef _KERNEL
516 		else
517 			curthread->td_ru.ru_oublock++;
518 #endif
519 		dbp[i] = &db->db;
520 	}
521 
522 	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
523 	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
524 		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
525 		    read && DNODE_IS_CACHEABLE(dn));
526 	}
527 	rw_exit(&dn->dn_struct_rwlock);
528 
529 	/* wait for async i/o */
530 	err = zio_wait(zio);
531 	if (err) {
532 		dmu_buf_rele_array(dbp, nblks, tag);
533 		return (err);
534 	}
535 
536 	/* wait for other io to complete */
537 	if (read) {
538 		for (i = 0; i < nblks; i++) {
539 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
540 			mutex_enter(&db->db_mtx);
541 			while (db->db_state == DB_READ ||
542 			    db->db_state == DB_FILL)
543 				cv_wait(&db->db_changed, &db->db_mtx);
544 			if (db->db_state == DB_UNCACHED)
545 				err = SET_ERROR(EIO);
546 			mutex_exit(&db->db_mtx);
547 			if (err) {
548 				dmu_buf_rele_array(dbp, nblks, tag);
549 				return (err);
550 			}
551 		}
552 	}
553 
554 	*numbufsp = nblks;
555 	*dbpp = dbp;
556 	return (0);
557 }
558 
559 static int
dmu_buf_hold_array(objset_t * os,uint64_t object,uint64_t offset,uint64_t length,int read,void * tag,int * numbufsp,dmu_buf_t *** dbpp)560 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
561     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
562 {
563 	dnode_t *dn;
564 	int err;
565 
566 	err = dnode_hold(os, object, FTAG, &dn);
567 	if (err)
568 		return (err);
569 
570 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
571 	    numbufsp, dbpp, DMU_READ_PREFETCH);
572 
573 	dnode_rele(dn, FTAG);
574 
575 	return (err);
576 }
577 
578 int
dmu_buf_hold_array_by_bonus(dmu_buf_t * db_fake,uint64_t offset,uint64_t length,boolean_t read,void * tag,int * numbufsp,dmu_buf_t *** dbpp)579 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
580     uint64_t length, boolean_t read, void *tag, int *numbufsp,
581     dmu_buf_t ***dbpp)
582 {
583 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
584 	dnode_t *dn;
585 	int err;
586 
587 	DB_DNODE_ENTER(db);
588 	dn = DB_DNODE(db);
589 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
590 	    numbufsp, dbpp, DMU_READ_PREFETCH);
591 	DB_DNODE_EXIT(db);
592 
593 	return (err);
594 }
595 
596 void
dmu_buf_rele_array(dmu_buf_t ** dbp_fake,int numbufs,void * tag)597 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
598 {
599 	int i;
600 	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
601 
602 	if (numbufs == 0)
603 		return;
604 
605 	for (i = 0; i < numbufs; i++) {
606 		if (dbp[i])
607 			dbuf_rele(dbp[i], tag);
608 	}
609 
610 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
611 }
612 
613 /*
614  * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
615  * indirect blocks prefetched will be those that point to the blocks containing
616  * the data starting at offset, and continuing to offset + len.
617  *
618  * Note that if the indirect blocks above the blocks being prefetched are not in
619  * cache, they will be asynchronously read in.
620  */
621 void
dmu_prefetch(objset_t * os,uint64_t object,int64_t level,uint64_t offset,uint64_t len,zio_priority_t pri)622 dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
623     uint64_t len, zio_priority_t pri)
624 {
625 	dnode_t *dn;
626 	uint64_t blkid;
627 	int nblks, err;
628 
629 	if (len == 0) {  /* they're interested in the bonus buffer */
630 		dn = DMU_META_DNODE(os);
631 
632 		if (object == 0 || object >= DN_MAX_OBJECT)
633 			return;
634 
635 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
636 		blkid = dbuf_whichblock(dn, level,
637 		    object * sizeof (dnode_phys_t));
638 		dbuf_prefetch(dn, level, blkid, pri, 0);
639 		rw_exit(&dn->dn_struct_rwlock);
640 		return;
641 	}
642 
643 	/*
644 	 * XXX - Note, if the dnode for the requested object is not
645 	 * already cached, we will do a *synchronous* read in the
646 	 * dnode_hold() call.  The same is true for any indirects.
647 	 */
648 	err = dnode_hold(os, object, FTAG, &dn);
649 	if (err != 0)
650 		return;
651 
652 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
653 	/*
654 	 * offset + len - 1 is the last byte we want to prefetch for, and offset
655 	 * is the first.  Then dbuf_whichblk(dn, level, off + len - 1) is the
656 	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
657 	 * offset)  is the first.  Then the number we need to prefetch is the
658 	 * last - first + 1.
659 	 */
660 	if (level > 0 || dn->dn_datablkshift != 0) {
661 		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
662 		    dbuf_whichblock(dn, level, offset) + 1;
663 	} else {
664 		nblks = (offset < dn->dn_datablksz);
665 	}
666 
667 	if (nblks != 0) {
668 		blkid = dbuf_whichblock(dn, level, offset);
669 		for (int i = 0; i < nblks; i++)
670 			dbuf_prefetch(dn, level, blkid + i, pri, 0);
671 	}
672 
673 	rw_exit(&dn->dn_struct_rwlock);
674 
675 	dnode_rele(dn, FTAG);
676 }
677 
678 /*
679  * Get the next "chunk" of file data to free.  We traverse the file from
680  * the end so that the file gets shorter over time (if we crash in the
681  * middle, this will leave us in a better state).  We find allocated file
682  * data by simply searching the allocated level 1 indirects.
683  *
684  * On input, *start should be the first offset that does not need to be
685  * freed (e.g. "offset + length").  On return, *start will be the first
686  * offset that should be freed.
687  */
688 static int
get_next_chunk(dnode_t * dn,uint64_t * start,uint64_t minimum)689 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
690 {
691 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
692 	/* bytes of data covered by a level-1 indirect block */
693 	uint64_t iblkrange =
694 	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
695 
696 	ASSERT3U(minimum, <=, *start);
697 
698 	if (*start - minimum <= iblkrange * maxblks) {
699 		*start = minimum;
700 		return (0);
701 	}
702 	ASSERT(ISP2(iblkrange));
703 
704 	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
705 		int err;
706 
707 		/*
708 		 * dnode_next_offset(BACKWARDS) will find an allocated L1
709 		 * indirect block at or before the input offset.  We must
710 		 * decrement *start so that it is at the end of the region
711 		 * to search.
712 		 */
713 		(*start)--;
714 		err = dnode_next_offset(dn,
715 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
716 
717 		/* if there are no indirect blocks before start, we are done */
718 		if (err == ESRCH) {
719 			*start = minimum;
720 			break;
721 		} else if (err != 0) {
722 			return (err);
723 		}
724 
725 		/* set start to the beginning of this L1 indirect */
726 		*start = P2ALIGN(*start, iblkrange);
727 	}
728 	if (*start < minimum)
729 		*start = minimum;
730 	return (0);
731 }
732 
733 /*
734  * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
735  * otherwise return false.
736  * Used below in dmu_free_long_range_impl() to enable abort when unmounting
737  */
738 /*ARGSUSED*/
739 static boolean_t
dmu_objset_zfs_unmounting(objset_t * os)740 dmu_objset_zfs_unmounting(objset_t *os)
741 {
742 #ifdef _KERNEL
743 	if (dmu_objset_type(os) == DMU_OST_ZFS)
744 		return (zfs_get_vfs_flag_unmounted(os));
745 #endif
746 	return (B_FALSE);
747 }
748 
749 static int
dmu_free_long_range_impl(objset_t * os,dnode_t * dn,uint64_t offset,uint64_t length)750 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
751     uint64_t length)
752 {
753 	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
754 	int err;
755 	uint64_t dirty_frees_threshold;
756 	dsl_pool_t *dp = dmu_objset_pool(os);
757 
758 	if (offset >= object_size)
759 		return (0);
760 
761 	if (zfs_per_txg_dirty_frees_percent <= 100)
762 		dirty_frees_threshold =
763 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
764 	else
765 		dirty_frees_threshold = zfs_dirty_data_max / 4;
766 
767 	if (length == DMU_OBJECT_END || offset + length > object_size)
768 		length = object_size - offset;
769 
770 	while (length != 0) {
771 		uint64_t chunk_end, chunk_begin, chunk_len;
772 		uint64_t long_free_dirty_all_txgs = 0;
773 		dmu_tx_t *tx;
774 
775 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
776 			return (SET_ERROR(EINTR));
777 
778 		chunk_end = chunk_begin = offset + length;
779 
780 		/* move chunk_begin backwards to the beginning of this chunk */
781 		err = get_next_chunk(dn, &chunk_begin, offset);
782 		if (err)
783 			return (err);
784 		ASSERT3U(chunk_begin, >=, offset);
785 		ASSERT3U(chunk_begin, <=, chunk_end);
786 
787 		chunk_len = chunk_end - chunk_begin;
788 
789 		mutex_enter(&dp->dp_lock);
790 		for (int t = 0; t < TXG_SIZE; t++) {
791 			long_free_dirty_all_txgs +=
792 			    dp->dp_long_free_dirty_pertxg[t];
793 		}
794 		mutex_exit(&dp->dp_lock);
795 
796 		/*
797 		 * To avoid filling up a TXG with just frees wait for
798 		 * the next TXG to open before freeing more chunks if
799 		 * we have reached the threshold of frees
800 		 */
801 		if (dirty_frees_threshold != 0 &&
802 		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
803 			txg_wait_open(dp, 0);
804 			continue;
805 		}
806 
807 		tx = dmu_tx_create(os);
808 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
809 
810 		/*
811 		 * Mark this transaction as typically resulting in a net
812 		 * reduction in space used.
813 		 */
814 		dmu_tx_mark_netfree(tx);
815 		err = dmu_tx_assign(tx, TXG_WAIT);
816 		if (err) {
817 			dmu_tx_abort(tx);
818 			return (err);
819 		}
820 
821 		mutex_enter(&dp->dp_lock);
822 		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
823 		    chunk_len;
824 		mutex_exit(&dp->dp_lock);
825 		DTRACE_PROBE3(free__long__range,
826 		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
827 		    uint64_t, dmu_tx_get_txg(tx));
828 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
829 		dmu_tx_commit(tx);
830 
831 		length -= chunk_len;
832 	}
833 	return (0);
834 }
835 
836 int
dmu_free_long_range(objset_t * os,uint64_t object,uint64_t offset,uint64_t length)837 dmu_free_long_range(objset_t *os, uint64_t object,
838     uint64_t offset, uint64_t length)
839 {
840 	dnode_t *dn;
841 	int err;
842 
843 	err = dnode_hold(os, object, FTAG, &dn);
844 	if (err != 0)
845 		return (err);
846 	err = dmu_free_long_range_impl(os, dn, offset, length);
847 
848 	/*
849 	 * It is important to zero out the maxblkid when freeing the entire
850 	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
851 	 * will take the fast path, and (b) dnode_reallocate() can verify
852 	 * that the entire file has been freed.
853 	 */
854 	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
855 		dn->dn_maxblkid = 0;
856 
857 	dnode_rele(dn, FTAG);
858 	return (err);
859 }
860 
861 int
dmu_free_long_object(objset_t * os,uint64_t object)862 dmu_free_long_object(objset_t *os, uint64_t object)
863 {
864 	dmu_tx_t *tx;
865 	int err;
866 
867 	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
868 	if (err != 0)
869 		return (err);
870 
871 	tx = dmu_tx_create(os);
872 	dmu_tx_hold_bonus(tx, object);
873 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
874 	dmu_tx_mark_netfree(tx);
875 	err = dmu_tx_assign(tx, TXG_WAIT);
876 	if (err == 0) {
877 		err = dmu_object_free(os, object, tx);
878 		dmu_tx_commit(tx);
879 	} else {
880 		dmu_tx_abort(tx);
881 	}
882 
883 	return (err);
884 }
885 
886 int
dmu_free_range(objset_t * os,uint64_t object,uint64_t offset,uint64_t size,dmu_tx_t * tx)887 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
888     uint64_t size, dmu_tx_t *tx)
889 {
890 	dnode_t *dn;
891 	int err = dnode_hold(os, object, FTAG, &dn);
892 	if (err)
893 		return (err);
894 	ASSERT(offset < UINT64_MAX);
895 	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
896 	dnode_free_range(dn, offset, size, tx);
897 	dnode_rele(dn, FTAG);
898 	return (0);
899 }
900 
901 static int
dmu_read_impl(dnode_t * dn,uint64_t offset,uint64_t size,void * buf,uint32_t flags)902 dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
903     void *buf, uint32_t flags)
904 {
905 	dmu_buf_t **dbp;
906 	int numbufs, err = 0;
907 
908 	/*
909 	 * Deal with odd block sizes, where there can't be data past the first
910 	 * block.  If we ever do the tail block optimization, we will need to
911 	 * handle that here as well.
912 	 */
913 	if (dn->dn_maxblkid == 0) {
914 		int newsz = offset > dn->dn_datablksz ? 0 :
915 		    MIN(size, dn->dn_datablksz - offset);
916 		bzero((char *)buf + newsz, size - newsz);
917 		size = newsz;
918 	}
919 
920 	while (size > 0) {
921 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
922 		int i;
923 
924 		/*
925 		 * NB: we could do this block-at-a-time, but it's nice
926 		 * to be reading in parallel.
927 		 */
928 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
929 		    TRUE, FTAG, &numbufs, &dbp, flags);
930 		if (err)
931 			break;
932 
933 		for (i = 0; i < numbufs; i++) {
934 			int tocpy;
935 			int bufoff;
936 			dmu_buf_t *db = dbp[i];
937 
938 			ASSERT(size > 0);
939 
940 			bufoff = offset - db->db_offset;
941 			tocpy = (int)MIN(db->db_size - bufoff, size);
942 
943 			bcopy((char *)db->db_data + bufoff, buf, tocpy);
944 
945 			offset += tocpy;
946 			size -= tocpy;
947 			buf = (char *)buf + tocpy;
948 		}
949 		dmu_buf_rele_array(dbp, numbufs, FTAG);
950 	}
951 	return (err);
952 }
953 
954 int
dmu_read(objset_t * os,uint64_t object,uint64_t offset,uint64_t size,void * buf,uint32_t flags)955 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
956     void *buf, uint32_t flags)
957 {
958 	dnode_t *dn;
959 	int err;
960 
961 	err = dnode_hold(os, object, FTAG, &dn);
962 	if (err != 0)
963 		return (err);
964 
965 	err = dmu_read_impl(dn, offset, size, buf, flags);
966 	dnode_rele(dn, FTAG);
967 	return (err);
968 }
969 
970 int
dmu_read_by_dnode(dnode_t * dn,uint64_t offset,uint64_t size,void * buf,uint32_t flags)971 dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
972     uint32_t flags)
973 {
974 	return (dmu_read_impl(dn, offset, size, buf, flags));
975 }
976 
977 static void
dmu_write_impl(dmu_buf_t ** dbp,int numbufs,uint64_t offset,uint64_t size,const void * buf,dmu_tx_t * tx)978 dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
979     const void *buf, dmu_tx_t *tx)
980 {
981 	int i;
982 
983 	for (i = 0; i < numbufs; i++) {
984 		int tocpy;
985 		int bufoff;
986 		dmu_buf_t *db = dbp[i];
987 
988 		ASSERT(size > 0);
989 
990 		bufoff = offset - db->db_offset;
991 		tocpy = (int)MIN(db->db_size - bufoff, size);
992 
993 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
994 
995 		if (tocpy == db->db_size)
996 			dmu_buf_will_fill(db, tx);
997 		else
998 			dmu_buf_will_dirty(db, tx);
999 
1000 		bcopy(buf, (char *)db->db_data + bufoff, tocpy);
1001 
1002 		if (tocpy == db->db_size)
1003 			dmu_buf_fill_done(db, tx);
1004 
1005 		offset += tocpy;
1006 		size -= tocpy;
1007 		buf = (char *)buf + tocpy;
1008 	}
1009 }
1010 
1011 void
dmu_write(objset_t * os,uint64_t object,uint64_t offset,uint64_t size,const void * buf,dmu_tx_t * tx)1012 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1013     const void *buf, dmu_tx_t *tx)
1014 {
1015 	dmu_buf_t **dbp;
1016 	int numbufs;
1017 
1018 	if (size == 0)
1019 		return;
1020 
1021 	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
1022 	    FALSE, FTAG, &numbufs, &dbp));
1023 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
1024 	dmu_buf_rele_array(dbp, numbufs, FTAG);
1025 }
1026 
1027 void
dmu_write_by_dnode(dnode_t * dn,uint64_t offset,uint64_t size,const void * buf,dmu_tx_t * tx)1028 dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
1029     const void *buf, dmu_tx_t *tx)
1030 {
1031 	dmu_buf_t **dbp;
1032 	int numbufs;
1033 
1034 	if (size == 0)
1035 		return;
1036 
1037 	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
1038 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
1039 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
1040 	dmu_buf_rele_array(dbp, numbufs, FTAG);
1041 }
1042 
1043 static int
dmu_object_remap_one_indirect(objset_t * os,dnode_t * dn,uint64_t last_removal_txg,uint64_t offset)1044 dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
1045     uint64_t last_removal_txg, uint64_t offset)
1046 {
1047 	uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
1048 	int err = 0;
1049 
1050 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
1051 	dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
1052 	ASSERT3P(dbuf, !=, NULL);
1053 
1054 	/*
1055 	 * If the block hasn't been written yet, this default will ensure
1056 	 * we don't try to remap it.
1057 	 */
1058 	uint64_t birth = UINT64_MAX;
1059 	ASSERT3U(last_removal_txg, !=, UINT64_MAX);
1060 	if (dbuf->db_blkptr != NULL)
1061 		birth = dbuf->db_blkptr->blk_birth;
1062 	rw_exit(&dn->dn_struct_rwlock);
1063 
1064 	/*
1065 	 * If this L1 was already written after the last removal, then we've
1066 	 * already tried to remap it.
1067 	 */
1068 	if (birth <= last_removal_txg &&
1069 	    dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
1070 	    dbuf_can_remap(dbuf)) {
1071 		dmu_tx_t *tx = dmu_tx_create(os);
1072 		dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
1073 		err = dmu_tx_assign(tx, TXG_WAIT);
1074 		if (err == 0) {
1075 			(void) dbuf_dirty(dbuf, tx);
1076 			dmu_tx_commit(tx);
1077 		} else {
1078 			dmu_tx_abort(tx);
1079 		}
1080 	}
1081 
1082 	dbuf_rele(dbuf, FTAG);
1083 
1084 	delay(zfs_object_remap_one_indirect_delay_ticks);
1085 
1086 	return (err);
1087 }
1088 
1089 /*
1090  * Remap all blockpointers in the object, if possible, so that they reference
1091  * only concrete vdevs.
1092  *
1093  * To do this, iterate over the L0 blockpointers and remap any that reference
1094  * an indirect vdev. Note that we only examine L0 blockpointers; since we
1095  * cannot guarantee that we can remap all blockpointer anyways (due to split
1096  * blocks), we do not want to make the code unnecessarily complicated to
1097  * catch the unlikely case that there is an L1 block on an indirect vdev that
1098  * contains no indirect blockpointers.
1099  */
1100 int
dmu_object_remap_indirects(objset_t * os,uint64_t object,uint64_t last_removal_txg)1101 dmu_object_remap_indirects(objset_t *os, uint64_t object,
1102     uint64_t last_removal_txg)
1103 {
1104 	uint64_t offset, l1span;
1105 	int err;
1106 	dnode_t *dn;
1107 
1108 	err = dnode_hold(os, object, FTAG, &dn);
1109 	if (err != 0) {
1110 		return (err);
1111 	}
1112 
1113 	if (dn->dn_nlevels <= 1) {
1114 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
1115 			err = SET_ERROR(EINTR);
1116 		}
1117 
1118 		/*
1119 		 * If the dnode has no indirect blocks, we cannot dirty them.
1120 		 * We still want to remap the blkptr(s) in the dnode if
1121 		 * appropriate, so mark it as dirty.
1122 		 */
1123 		if (err == 0 && dnode_needs_remap(dn)) {
1124 			dmu_tx_t *tx = dmu_tx_create(os);
1125 			dmu_tx_hold_bonus(tx, dn->dn_object);
1126 			if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
1127 				dnode_setdirty(dn, tx);
1128 				dmu_tx_commit(tx);
1129 			} else {
1130 				dmu_tx_abort(tx);
1131 			}
1132 		}
1133 
1134 		dnode_rele(dn, FTAG);
1135 		return (err);
1136 	}
1137 
1138 	offset = 0;
1139 	l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
1140 	    dn->dn_datablkshift);
1141 	/*
1142 	 * Find the next L1 indirect that is not a hole.
1143 	 */
1144 	while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
1145 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
1146 			err = SET_ERROR(EINTR);
1147 			break;
1148 		}
1149 		if ((err = dmu_object_remap_one_indirect(os, dn,
1150 		    last_removal_txg, offset)) != 0) {
1151 			break;
1152 		}
1153 		offset += l1span;
1154 	}
1155 
1156 	dnode_rele(dn, FTAG);
1157 	return (err);
1158 }
1159 
1160 void
dmu_prealloc(objset_t * os,uint64_t object,uint64_t offset,uint64_t size,dmu_tx_t * tx)1161 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1162     dmu_tx_t *tx)
1163 {
1164 	dmu_buf_t **dbp;
1165 	int numbufs, i;
1166 
1167 	if (size == 0)
1168 		return;
1169 
1170 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
1171 	    FALSE, FTAG, &numbufs, &dbp));
1172 
1173 	for (i = 0; i < numbufs; i++) {
1174 		dmu_buf_t *db = dbp[i];
1175 
1176 		dmu_buf_will_not_fill(db, tx);
1177 	}
1178 	dmu_buf_rele_array(dbp, numbufs, FTAG);
1179 }
1180 
1181 void
dmu_write_embedded(objset_t * os,uint64_t object,uint64_t offset,void * data,uint8_t etype,uint8_t comp,int uncompressed_size,int compressed_size,int byteorder,dmu_tx_t * tx)1182 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
1183     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
1184     int compressed_size, int byteorder, dmu_tx_t *tx)
1185 {
1186 	dmu_buf_t *db;
1187 
1188 	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
1189 	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
1190 	VERIFY0(dmu_buf_hold_noread(os, object, offset,
1191 	    FTAG, &db));
1192 
1193 	dmu_buf_write_embedded(db,
1194 	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
1195 	    uncompressed_size, compressed_size, byteorder, tx);
1196 
1197 	dmu_buf_rele(db, FTAG);
1198 }
1199 
1200 /*
1201  * DMU support for xuio
1202  */
1203 kstat_t *xuio_ksp = NULL;
1204 
1205 int
dmu_xuio_init(xuio_t * xuio,int nblk)1206 dmu_xuio_init(xuio_t *xuio, int nblk)
1207 {
1208 	dmu_xuio_t *priv;
1209 	uio_t *uio = &xuio->xu_uio;
1210 
1211 	uio->uio_iovcnt = nblk;
1212 	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
1213 
1214 	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
1215 	priv->cnt = nblk;
1216 	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
1217 	priv->iovp = uio->uio_iov;
1218 	XUIO_XUZC_PRIV(xuio) = priv;
1219 
1220 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
1221 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
1222 	else
1223 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
1224 
1225 	return (0);
1226 }
1227 
1228 void
dmu_xuio_fini(xuio_t * xuio)1229 dmu_xuio_fini(xuio_t *xuio)
1230 {
1231 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1232 	int nblk = priv->cnt;
1233 
1234 	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
1235 	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
1236 	kmem_free(priv, sizeof (dmu_xuio_t));
1237 
1238 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
1239 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
1240 	else
1241 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
1242 }
1243 
1244 /*
1245  * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
1246  * and increase priv->next by 1.
1247  */
1248 int
dmu_xuio_add(xuio_t * xuio,arc_buf_t * abuf,offset_t off,size_t n)1249 dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
1250 {
1251 	struct iovec *iov;
1252 	uio_t *uio = &xuio->xu_uio;
1253 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1254 	int i = priv->next++;
1255 
1256 	ASSERT(i < priv->cnt);
1257 	ASSERT(off + n <= arc_buf_lsize(abuf));
1258 	iov = uio->uio_iov + i;
1259 	iov->iov_base = (char *)abuf->b_data + off;
1260 	iov->iov_len = n;
1261 	priv->bufs[i] = abuf;
1262 	return (0);
1263 }
1264 
1265 int
dmu_xuio_cnt(xuio_t * xuio)1266 dmu_xuio_cnt(xuio_t *xuio)
1267 {
1268 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1269 	return (priv->cnt);
1270 }
1271 
1272 arc_buf_t *
dmu_xuio_arcbuf(xuio_t * xuio,int i)1273 dmu_xuio_arcbuf(xuio_t *xuio, int i)
1274 {
1275 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1276 
1277 	ASSERT(i < priv->cnt);
1278 	return (priv->bufs[i]);
1279 }
1280 
1281 void
dmu_xuio_clear(xuio_t * xuio,int i)1282 dmu_xuio_clear(xuio_t *xuio, int i)
1283 {
1284 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1285 
1286 	ASSERT(i < priv->cnt);
1287 	priv->bufs[i] = NULL;
1288 }
1289 
1290 static void
xuio_stat_init(void)1291 xuio_stat_init(void)
1292 {
1293 	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
1294 	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
1295 	    KSTAT_FLAG_VIRTUAL);
1296 	if (xuio_ksp != NULL) {
1297 		xuio_ksp->ks_data = &xuio_stats;
1298 		kstat_install(xuio_ksp);
1299 	}
1300 }
1301 
1302 static void
xuio_stat_fini(void)1303 xuio_stat_fini(void)
1304 {
1305 	if (xuio_ksp != NULL) {
1306 		kstat_delete(xuio_ksp);
1307 		xuio_ksp = NULL;
1308 	}
1309 }
1310 
1311 void
xuio_stat_wbuf_copied(void)1312 xuio_stat_wbuf_copied(void)
1313 {
1314 	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1315 }
1316 
1317 void
xuio_stat_wbuf_nocopy(void)1318 xuio_stat_wbuf_nocopy(void)
1319 {
1320 	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
1321 }
1322 
1323 #ifdef _KERNEL
1324 int
dmu_read_uio_dnode(dnode_t * dn,uio_t * uio,uint64_t size)1325 dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
1326 {
1327 	dmu_buf_t **dbp;
1328 	int numbufs, i, err;
1329 	xuio_t *xuio = NULL;
1330 
1331 	/*
1332 	 * NB: we could do this block-at-a-time, but it's nice
1333 	 * to be reading in parallel.
1334 	 */
1335 	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1336 	    TRUE, FTAG, &numbufs, &dbp, 0);
1337 	if (err)
1338 		return (err);
1339 
1340 #ifdef UIO_XUIO
1341 	if (uio->uio_extflg == UIO_XUIO)
1342 		xuio = (xuio_t *)uio;
1343 #endif
1344 
1345 	for (i = 0; i < numbufs; i++) {
1346 		int tocpy;
1347 		int bufoff;
1348 		dmu_buf_t *db = dbp[i];
1349 
1350 		ASSERT(size > 0);
1351 
1352 		bufoff = uio->uio_loffset - db->db_offset;
1353 		tocpy = (int)MIN(db->db_size - bufoff, size);
1354 
1355 		if (xuio) {
1356 			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
1357 			arc_buf_t *dbuf_abuf = dbi->db_buf;
1358 			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
1359 			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
1360 			if (!err) {
1361 				uio->uio_resid -= tocpy;
1362 				uio->uio_loffset += tocpy;
1363 			}
1364 
1365 			if (abuf == dbuf_abuf)
1366 				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
1367 			else
1368 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
1369 		} else {
1370 #ifdef illumos
1371 			err = uiomove((char *)db->db_data + bufoff, tocpy,
1372 			    UIO_READ, uio);
1373 #else
1374 			err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
1375 			    tocpy, uio);
1376 #endif
1377 		}
1378 		if (err)
1379 			break;
1380 
1381 		size -= tocpy;
1382 	}
1383 	dmu_buf_rele_array(dbp, numbufs, FTAG);
1384 
1385 	return (err);
1386 }
1387 
1388 /*
1389  * Read 'size' bytes into the uio buffer.
1390  * From object zdb->db_object.
1391  * Starting at offset uio->uio_loffset.
1392  *
1393  * If the caller already has a dbuf in the target object
1394  * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
1395  * because we don't have to find the dnode_t for the object.
1396  */
1397 int
dmu_read_uio_dbuf(dmu_buf_t * zdb,uio_t * uio,uint64_t size)1398 dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
1399 {
1400 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1401 	dnode_t *dn;
1402 	int err;
1403 
1404 	if (size == 0)
1405 		return (0);
1406 
1407 	DB_DNODE_ENTER(db);
1408 	dn = DB_DNODE(db);
1409 	err = dmu_read_uio_dnode(dn, uio, size);
1410 	DB_DNODE_EXIT(db);
1411 
1412 	return (err);
1413 }
1414 
1415 /*
1416  * Read 'size' bytes into the uio buffer.
1417  * From the specified object
1418  * Starting at offset uio->uio_loffset.
1419  */
1420 int
dmu_read_uio(objset_t * os,uint64_t object,uio_t * uio,uint64_t size)1421 dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
1422 {
1423 	dnode_t *dn;
1424 	int err;
1425 
1426 	if (size == 0)
1427 		return (0);
1428 
1429 	err = dnode_hold(os, object, FTAG, &dn);
1430 	if (err)
1431 		return (err);
1432 
1433 	err = dmu_read_uio_dnode(dn, uio, size);
1434 
1435 	dnode_rele(dn, FTAG);
1436 
1437 	return (err);
1438 }
1439 
1440 int
dmu_write_uio_dnode(dnode_t * dn,uio_t * uio,uint64_t size,dmu_tx_t * tx)1441 dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
1442 {
1443 	dmu_buf_t **dbp;
1444 	int numbufs;
1445 	int err = 0;
1446 	int i;
1447 
1448 	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1449 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
1450 	if (err)
1451 		return (err);
1452 
1453 	for (i = 0; i < numbufs; i++) {
1454 		int tocpy;
1455 		int bufoff;
1456 		dmu_buf_t *db = dbp[i];
1457 
1458 		ASSERT(size > 0);
1459 
1460 		bufoff = uio->uio_loffset - db->db_offset;
1461 		tocpy = (int)MIN(db->db_size - bufoff, size);
1462 
1463 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1464 
1465 		if (tocpy == db->db_size)
1466 			dmu_buf_will_fill(db, tx);
1467 		else
1468 			dmu_buf_will_dirty(db, tx);
1469 
1470 #ifdef illumos
1471 		/*
1472 		 * XXX uiomove could block forever (eg. nfs-backed
1473 		 * pages).  There needs to be a uiolockdown() function
1474 		 * to lock the pages in memory, so that uiomove won't
1475 		 * block.
1476 		 */
1477 		err = uiomove((char *)db->db_data + bufoff, tocpy,
1478 		    UIO_WRITE, uio);
1479 #else
1480 		err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
1481 		    uio);
1482 #endif
1483 
1484 		if (tocpy == db->db_size)
1485 			dmu_buf_fill_done(db, tx);
1486 
1487 		if (err)
1488 			break;
1489 
1490 		size -= tocpy;
1491 	}
1492 
1493 	dmu_buf_rele_array(dbp, numbufs, FTAG);
1494 	return (err);
1495 }
1496 
1497 /*
1498  * Write 'size' bytes from the uio buffer.
1499  * To object zdb->db_object.
1500  * Starting at offset uio->uio_loffset.
1501  *
1502  * If the caller already has a dbuf in the target object
1503  * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
1504  * because we don't have to find the dnode_t for the object.
1505  */
1506 int
dmu_write_uio_dbuf(dmu_buf_t * zdb,uio_t * uio,uint64_t size,dmu_tx_t * tx)1507 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
1508     dmu_tx_t *tx)
1509 {
1510 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1511 	dnode_t *dn;
1512 	int err;
1513 
1514 	if (size == 0)
1515 		return (0);
1516 
1517 	DB_DNODE_ENTER(db);
1518 	dn = DB_DNODE(db);
1519 	err = dmu_write_uio_dnode(dn, uio, size, tx);
1520 	DB_DNODE_EXIT(db);
1521 
1522 	return (err);
1523 }
1524 
1525 /*
1526  * Write 'size' bytes from the uio buffer.
1527  * To the specified object.
1528  * Starting at offset uio->uio_loffset.
1529  */
1530 int
dmu_write_uio(objset_t * os,uint64_t object,uio_t * uio,uint64_t size,dmu_tx_t * tx)1531 dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
1532     dmu_tx_t *tx)
1533 {
1534 	dnode_t *dn;
1535 	int err;
1536 
1537 	if (size == 0)
1538 		return (0);
1539 
1540 	err = dnode_hold(os, object, FTAG, &dn);
1541 	if (err)
1542 		return (err);
1543 
1544 	err = dmu_write_uio_dnode(dn, uio, size, tx);
1545 
1546 	dnode_rele(dn, FTAG);
1547 
1548 	return (err);
1549 }
1550 
1551 #ifdef illumos
1552 int
dmu_write_pages(objset_t * os,uint64_t object,uint64_t offset,uint64_t size,page_t * pp,dmu_tx_t * tx)1553 dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1554     page_t *pp, dmu_tx_t *tx)
1555 {
1556 	dmu_buf_t **dbp;
1557 	int numbufs, i;
1558 	int err;
1559 
1560 	if (size == 0)
1561 		return (0);
1562 
1563 	err = dmu_buf_hold_array(os, object, offset, size,
1564 	    FALSE, FTAG, &numbufs, &dbp);
1565 	if (err)
1566 		return (err);
1567 
1568 	for (i = 0; i < numbufs; i++) {
1569 		int tocpy, copied, thiscpy;
1570 		int bufoff;
1571 		dmu_buf_t *db = dbp[i];
1572 		caddr_t va;
1573 
1574 		ASSERT(size > 0);
1575 		ASSERT3U(db->db_size, >=, PAGESIZE);
1576 
1577 		bufoff = offset - db->db_offset;
1578 		tocpy = (int)MIN(db->db_size - bufoff, size);
1579 
1580 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1581 
1582 		if (tocpy == db->db_size)
1583 			dmu_buf_will_fill(db, tx);
1584 		else
1585 			dmu_buf_will_dirty(db, tx);
1586 
1587 		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
1588 			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
1589 			thiscpy = MIN(PAGESIZE, tocpy - copied);
1590 			va = zfs_map_page(pp, S_READ);
1591 			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
1592 			zfs_unmap_page(pp, va);
1593 			pp = pp->p_next;
1594 			bufoff += PAGESIZE;
1595 		}
1596 
1597 		if (tocpy == db->db_size)
1598 			dmu_buf_fill_done(db, tx);
1599 
1600 		offset += tocpy;
1601 		size -= tocpy;
1602 	}
1603 	dmu_buf_rele_array(dbp, numbufs, FTAG);
1604 	return (err);
1605 }
1606 
1607 #else	/* !illumos */
1608 
1609 int
dmu_write_pages(objset_t * os,uint64_t object,uint64_t offset,uint64_t size,vm_page_t * ma,dmu_tx_t * tx)1610 dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1611     vm_page_t *ma, dmu_tx_t *tx)
1612 {
1613 	dmu_buf_t **dbp;
1614 	struct sf_buf *sf;
1615 	int numbufs, i;
1616 	int err;
1617 
1618 	if (size == 0)
1619 		return (0);
1620 
1621 	err = dmu_buf_hold_array(os, object, offset, size,
1622 	    FALSE, FTAG, &numbufs, &dbp);
1623 	if (err)
1624 		return (err);
1625 
1626 	for (i = 0; i < numbufs; i++) {
1627 		int tocpy, copied, thiscpy;
1628 		int bufoff;
1629 		dmu_buf_t *db = dbp[i];
1630 		caddr_t va;
1631 
1632 		ASSERT(size > 0);
1633 		ASSERT3U(db->db_size, >=, PAGESIZE);
1634 
1635 		bufoff = offset - db->db_offset;
1636 		tocpy = (int)MIN(db->db_size - bufoff, size);
1637 
1638 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1639 
1640 		if (tocpy == db->db_size)
1641 			dmu_buf_will_fill(db, tx);
1642 		else
1643 			dmu_buf_will_dirty(db, tx);
1644 
1645 		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
1646 			ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
1647 			thiscpy = MIN(PAGESIZE, tocpy - copied);
1648 			va = zfs_map_page(*ma, &sf);
1649 			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
1650 			zfs_unmap_page(sf);
1651 			ma += 1;
1652 			bufoff += PAGESIZE;
1653 		}
1654 
1655 		if (tocpy == db->db_size)
1656 			dmu_buf_fill_done(db, tx);
1657 
1658 		offset += tocpy;
1659 		size -= tocpy;
1660 	}
1661 	dmu_buf_rele_array(dbp, numbufs, FTAG);
1662 	return (err);
1663 }
1664 
/*
 * Fill the 'count' consecutive pages ma[0..count-1] with data from 'object'.
 *
 * The buffers covering the requested pages are held (and read) first;
 * 'last_size' is the number of bytes wanted in the final page.  With the
 * pages' VM object locked, the function then:
 *   1. tries to fill up to *rbehind pages preceding ma[0] from the first
 *      buffer,
 *   2. copies buffer data into the requested pages, zero-filling the rest
 *      of a final page that the buffers do not cover completely,
 *   3. tries to fill up to *rahead pages following ma[count - 1] from the
 *      buffer that was current when step 2 ended.
 *
 * On return *rbehind and *rahead hold the number of extra pages actually
 * filled.  Returns 0, or the error from dmu_buf_hold_array().
 */
1665 int
dmu_read_pages(objset_t * os,uint64_t object,vm_page_t * ma,int count,int * rbehind,int * rahead,int last_size)1666 dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
1667     int *rbehind, int *rahead, int last_size)
1668 {
1669 	struct sf_buf *sf;
1670 	vm_object_t vmobj;
1671 	vm_page_t m;
1672 	dmu_buf_t **dbp;
1673 	dmu_buf_t *db;
1674 	caddr_t va;
1675 	int numbufs, i;
1676 	int bufoff, pgoff, tocpy;
1677 	int mi, di;
1678 	int err;
1679 
1680 	ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
1681 	ASSERT(last_size <= PAGE_SIZE);
1682 
	/* Hold the buffers spanning the first page through last_size bytes of the last. */
1683 	err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
1684 	    IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
1685 	if (err != 0)
1686 		return (err);
1687 
1688 #ifdef DEBUG
1689 	IMPLY(last_size < PAGE_SIZE, *rahead == 0);
1690 	if (dbp[0]->db_offset != 0 || numbufs > 1) {
1691 		for (i = 0; i < numbufs; i++) {
1692 			ASSERT(ISP2(dbp[i]->db_size));
1693 			ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
1694 			ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
1695 		}
1696 	}
1697 #endif
1698 
1699 	vmobj = ma[0]->object;
1700 	zfs_vmobject_wlock(vmobj);
1701 
	/*
	 * Read-behind: walk backwards from the page before ma[0], stopping at
	 * the first page that cannot be grabbed or is already valid.
	 */
1702 	db = dbp[0];
1703 	for (i = 0; i < *rbehind; i++) {
1704 		m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i,
1705 		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
1706 		if (m == NULL)
1707 			break;
1708 		if (m->valid != 0) {
1709 			ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
1710 			break;
1711 		}
1712 		ASSERT(m->dirty == 0);
1713 		ASSERT(!pmap_page_is_mapped(m));
1714 
1715 		ASSERT(db->db_size > PAGE_SIZE);
1716 		bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
1717 		va = zfs_map_page(m, &sf);
1718 		bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
1719 		zfs_unmap_page(sf);
1720 		m->valid = VM_PAGE_BITS_ALL;
1721 		vm_page_lock(m);
		/* Activate the page if its busy lock has waiters, otherwise deactivate it. */
1722 		if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
1723 			vm_page_activate(m);
1724 		else
1725 			vm_page_deactivate(m);
1726 		vm_page_unlock(m);
1727 	}
	/* Report how many read-behind pages were filled. */
1728 	*rbehind = i;
1729 
	/*
	 * Main copy: 'mi'/'pgoff' track the current page and the offset
	 * within it, 'di'/'bufoff' the current buffer and the offset within
	 * it.  Each pass copies as much as fits in both.
	 */
1730 	bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
1731 	pgoff = 0;
1732 	for (mi = 0, di = 0; mi < count && di < numbufs; ) {
1733 		if (pgoff == 0) {
1734 			m = ma[mi];
1735 			vm_page_assert_xbusied(m);
1736 			ASSERT(m->valid == 0);
1737 			ASSERT(m->dirty == 0);
1738 			ASSERT(!pmap_page_is_mapped(m));
1739 			va = zfs_map_page(m, &sf);
1740 		}
1741 		if (bufoff == 0)
1742 			db = dbp[di];
1743 
1744 		ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
1745 		    db->db_offset + bufoff);
1746 
1747 		/*
1748 		 * We do not need to clamp the copy size by the file
1749 		 * size as the last block is zero-filled beyond the
1750 		 * end of file anyway.
1751 		 */
1752 		tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
1753 		bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
1754 
1755 		pgoff += tocpy;
1756 		ASSERT(pgoff <= PAGESIZE);
		/* Page is full: unmap it, mark it valid and move to the next page. */
1757 		if (pgoff == PAGESIZE) {
1758 			zfs_unmap_page(sf);
1759 			m->valid = VM_PAGE_BITS_ALL;
1760 			ASSERT(mi < count);
1761 			mi++;
1762 			pgoff = 0;
1763 		}
1764 
1765 		bufoff += tocpy;
1766 		ASSERT(bufoff <= db->db_size);
		/* Buffer is exhausted: move to the next buffer. */
1767 		if (bufoff == db->db_size) {
1768 			ASSERT(di < numbufs);
1769 			di++;
1770 			bufoff = 0;
1771 		}
1772 	}
1773 
1774 #ifdef DEBUG
1775 	/*
1776 	 * Three possibilities:
1777 	 * - last requested page ends at a buffer boundary and, thus,
1778 	 *   all pages and buffers have been iterated;
1779 	 * - all requested pages are filled, but the last buffer
1780 	 *   has not been exhausted;
1781 	 *   the read-ahead is possible only in this case;
1782 	 * - all buffers have been read, but the last page has not been
1783 	 *   fully filled;
1784 	 *   this is only possible if the file has only a single buffer
1785 	 *   with a size that is not a multiple of the page size.
1786 	 */
1787 	if (mi == count) {
1788 		ASSERT(di >= numbufs - 1);
1789 		IMPLY(*rahead != 0, di == numbufs - 1);
1790 		IMPLY(*rahead != 0, bufoff != 0);
1791 		ASSERT(pgoff == 0);
1792 	}
1793 	if (di == numbufs) {
1794 		ASSERT(mi >= count - 1);
1795 		ASSERT(*rahead == 0);
1796 		IMPLY(pgoff == 0, mi == count);
1797 		if (pgoff != 0) {
1798 			ASSERT(mi == count - 1);
1799 			ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
1800 		}
1801 	}
1802 #endif
	/* A partly filled final page is still mapped: zero its tail and finish it. */
1803 	if (pgoff != 0) {
1804 		bzero(va + pgoff, PAGESIZE - pgoff);
1805 		zfs_unmap_page(sf);
1806 		m->valid = VM_PAGE_BITS_ALL;
1807 	}
1808 
	/*
	 * Read-ahead: walk forwards from the page after ma[count - 1], with
	 * the same stop conditions as read-behind.
	 */
1809 	for (i = 0; i < *rahead; i++) {
1810 		m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i,
1811 		    VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_NOBUSY);
1812 		if (m == NULL)
1813 			break;
1814 		if (m->valid != 0) {
1815 			ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
1816 			break;
1817 		}
1818 		ASSERT(m->dirty == 0);
1819 		ASSERT(!pmap_page_is_mapped(m));
1820 
1821 		ASSERT(db->db_size > PAGE_SIZE);
1822 		bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
1823 		tocpy = MIN(db->db_size - bufoff, PAGESIZE);
1824 		va = zfs_map_page(m, &sf);
1825 		bcopy((char *)db->db_data + bufoff, va, tocpy);
		/* The buffer ended inside this page: zero the remainder. */
1826 		if (tocpy < PAGESIZE) {
1827 			ASSERT(i == *rahead - 1);
1828 			ASSERT((db->db_size & PAGE_MASK) != 0);
1829 			bzero(va + tocpy, PAGESIZE - tocpy);
1830 		}
1831 		zfs_unmap_page(sf);
1832 		m->valid = VM_PAGE_BITS_ALL;
1833 		vm_page_lock(m);
1834 		if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
1835 			vm_page_activate(m);
1836 		else
1837 			vm_page_deactivate(m);
1838 		vm_page_unlock(m);
1839 	}
	/* Report how many read-ahead pages were filled. */
1840 	*rahead = i;
1841 	zfs_vmobject_wunlock(vmobj);
1842 
1843 	dmu_buf_rele_array(dbp, numbufs, FTAG);
1844 	return (0);
1845 }
1846 #endif	/* illumos */
1847 #endif	/* _KERNEL */
1848 
1849 /*
1850  * Allocate a loaned anonymous arc buffer.
1851  */
1852 arc_buf_t *
dmu_request_arcbuf(dmu_buf_t * handle,int size)1853 dmu_request_arcbuf(dmu_buf_t *handle, int size)
1854 {
1855 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
1856 
1857 	return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
1858 }
1859 
1860 /*
1861  * Free a loaned arc buffer.
1862  */
1863 void
dmu_return_arcbuf(arc_buf_t * buf)1864 dmu_return_arcbuf(arc_buf_t *buf)
1865 {
1866 	arc_return_buf(buf, FTAG);
1867 	arc_buf_destroy(buf, FTAG);
1868 }
1869 
1870 /*
1871  * When possible directly assign passed loaned arc buffer to a dbuf.
1872  * If this is not possible copy the contents of passed arc buf via
1873  * dmu_write().
1874  */
1875 void
dmu_assign_arcbuf_dnode(dnode_t * dn,uint64_t offset,arc_buf_t * buf,dmu_tx_t * tx)1876 dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
1877     dmu_tx_t *tx)
1878 {
1879 	dmu_buf_impl_t *db;
1880 	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
1881 	uint64_t blkid;
1882 
1883 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
1884 	blkid = dbuf_whichblock(dn, 0, offset);
1885 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
1886 	rw_exit(&dn->dn_struct_rwlock);
1887 
1888 	/*
1889 	 * We can only assign if the offset is aligned, the arc buf is the
1890 	 * same size as the dbuf, and the dbuf is not metadata.
1891 	 */
1892 	if (offset == db->db.db_offset && blksz == db->db.db_size) {
1893 #ifdef _KERNEL
1894 		curthread->td_ru.ru_oublock++;
1895 #ifdef RACCT
1896 		if (racct_enable) {
1897 			PROC_LOCK(curproc);
1898 			racct_add_force(curproc, RACCT_WRITEBPS, blksz);
1899 			racct_add_force(curproc, RACCT_WRITEIOPS, 1);
1900 			PROC_UNLOCK(curproc);
1901 		}
1902 #endif /* RACCT */
1903 #endif /* _KERNEL */
1904 		dbuf_assign_arcbuf(db, buf, tx);
1905 		dbuf_rele(db, FTAG);
1906 	} else {
1907 		objset_t *os;
1908 		uint64_t object;
1909 
1910 		/* compressed bufs must always be assignable to their dbuf */
1911 		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
1912 		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
1913 
1914 		os = dn->dn_objset;
1915 		object = dn->dn_object;
1916 
1917 		dbuf_rele(db, FTAG);
1918 		dmu_write(os, object, offset, blksz, buf->b_data, tx);
1919 		dmu_return_arcbuf(buf);
1920 		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1921 	}
1922 }
1923 
1924 void
dmu_assign_arcbuf(dmu_buf_t * handle,uint64_t offset,arc_buf_t * buf,dmu_tx_t * tx)1925 dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
1926     dmu_tx_t *tx)
1927 {
1928 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
1929 
1930 	DB_DNODE_ENTER(dbuf);
1931 	dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx);
1932 	DB_DNODE_EXIT(dbuf);
1933 }
1934 
1935 typedef struct {
1936 	dbuf_dirty_record_t	*dsa_dr;
1937 	dmu_sync_cb_t		*dsa_done;
1938 	zgd_t			*dsa_zgd;
1939 	dmu_tx_t		*dsa_tx;
1940 } dmu_sync_arg_t;
1941 
1942 /* ARGSUSED */
1943 static void
dmu_sync_ready(zio_t * zio,arc_buf_t * buf,void * varg)1944 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
1945 {
1946 	dmu_sync_arg_t *dsa = varg;
1947 	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
1948 	blkptr_t *bp = zio->io_bp;
1949 
1950 	if (zio->io_error == 0) {
1951 		if (BP_IS_HOLE(bp)) {
1952 			/*
1953 			 * A block of zeros may compress to a hole, but the
1954 			 * block size still needs to be known for replay.
1955 			 */
1956 			BP_SET_LSIZE(bp, db->db_size);
1957 		} else if (!BP_IS_EMBEDDED(bp)) {
1958 			ASSERT(BP_GET_LEVEL(bp) == 0);
1959 			bp->blk_fill = 1;
1960 		}
1961 	}
1962 }
1963 
1964 static void
dmu_sync_late_arrival_ready(zio_t * zio)1965 dmu_sync_late_arrival_ready(zio_t *zio)
1966 {
1967 	dmu_sync_ready(zio, NULL, zio->io_private);
1968 }
1969 
1970 /* ARGSUSED */
1971 static void
dmu_sync_done(zio_t * zio,arc_buf_t * buf,void * varg)1972 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
1973 {
1974 	dmu_sync_arg_t *dsa = varg;
1975 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
1976 	dmu_buf_impl_t *db = dr->dr_dbuf;
1977 
1978 	mutex_enter(&db->db_mtx);
1979 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1980 	if (zio->io_error == 0) {
1981 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
1982 		if (dr->dt.dl.dr_nopwrite) {
1983 			blkptr_t *bp = zio->io_bp;
1984 			blkptr_t *bp_orig = &zio->io_bp_orig;
1985 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
1986 
1987 			ASSERT(BP_EQUAL(bp, bp_orig));
1988 			VERIFY(BP_EQUAL(bp, db->db_blkptr));
1989 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
1990 			ASSERT(zio_checksum_table[chksum].ci_flags &
1991 			    ZCHECKSUM_FLAG_NOPWRITE);
1992 		}
1993 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
1994 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1995 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1996 
1997 		/*
1998 		 * Old style holes are filled with all zeros, whereas
1999 		 * new-style holes maintain their lsize, type, level,
2000 		 * and birth time (see zio_write_compress). While we
2001 		 * need to reset the BP_SET_LSIZE() call that happened
2002 		 * in dmu_sync_ready for old style holes, we do *not*
2003 		 * want to wipe out the information contained in new
2004 		 * style holes. Thus, only zero out the block pointer if
2005 		 * it's an old style hole.
2006 		 */
2007 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
2008 		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
2009 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
2010 	} else {
2011 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2012 	}
2013 	cv_broadcast(&db->db_changed);
2014 	mutex_exit(&db->db_mtx);
2015 
2016 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
2017 
2018 	kmem_free(dsa, sizeof (*dsa));
2019 }
2020 
2021 static void
dmu_sync_late_arrival_done(zio_t * zio)2022 dmu_sync_late_arrival_done(zio_t *zio)
2023 {
2024 	blkptr_t *bp = zio->io_bp;
2025 	dmu_sync_arg_t *dsa = zio->io_private;
2026 	blkptr_t *bp_orig = &zio->io_bp_orig;
2027 
2028 	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
2029 		ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
2030 		ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
2031 		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
2032 		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
2033 		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
2034 	}
2035 
2036 	dmu_tx_commit(dsa->dsa_tx);
2037 
2038 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
2039 
2040 	abd_put(zio->io_abd);
2041 	kmem_free(dsa, sizeof (*dsa));
2042 }
2043 
2044 static int
dmu_sync_late_arrival(zio_t * pio,objset_t * os,dmu_sync_cb_t * done,zgd_t * zgd,zio_prop_t * zp,zbookmark_phys_t * zb)2045 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
2046     zio_prop_t *zp, zbookmark_phys_t *zb)
2047 {
2048 	dmu_sync_arg_t *dsa;
2049 	dmu_tx_t *tx;
2050 
2051 	tx = dmu_tx_create(os);
2052 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
2053 	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
2054 		dmu_tx_abort(tx);
2055 		/* Make zl_get_data do txg_waited_synced() */
2056 		return (SET_ERROR(EIO));
2057 	}
2058 
2059 	/*
2060 	 * In order to prevent the zgd's lwb from being free'd prior to
2061 	 * dmu_sync_late_arrival_done() being called, we have to ensure
2062 	 * the lwb's "max txg" takes this tx's txg into account.
2063 	 */
2064 	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
2065 
2066 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
2067 	dsa->dsa_dr = NULL;
2068 	dsa->dsa_done = done;
2069 	dsa->dsa_zgd = zgd;
2070 	dsa->dsa_tx = tx;
2071 
2072 	/*
2073 	 * Since we are currently syncing this txg, it's nontrivial to
2074 	 * determine what BP to nopwrite against, so we disable nopwrite.
2075 	 *
2076 	 * When syncing, the db_blkptr is initially the BP of the previous
2077 	 * txg.  We can not nopwrite against it because it will be changed
2078 	 * (this is similar to the non-late-arrival case where the dbuf is
2079 	 * dirty in a future txg).
2080 	 *
2081 	 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
2082 	 * We can not nopwrite against it because although the BP will not
2083 	 * (typically) be changed, the data has not yet been persisted to this
2084 	 * location.
2085 	 *
2086 	 * Finally, when dbuf_write_done() is called, it is theoretically
2087 	 * possible to always nopwrite, because the data that was written in
2088 	 * this txg is the same data that we are trying to write.  However we
2089 	 * would need to check that this dbuf is not dirty in any future
2090 	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
2091 	 * don't nopwrite in this case.
2092 	 */
2093 	zp->zp_nopwrite = B_FALSE;
2094 
2095 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
2096 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
2097 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
2098 	    dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
2099 	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
2100 
2101 	return (0);
2102 }
2103 
2104 /*
2105  * Intent log support: sync the block associated with db to disk.
2106  * N.B. and XXX: the caller is responsible for making sure that the
2107  * data isn't changing while dmu_sync() is writing it.
2108  *
2109  * Return values:
2110  *
2111  *	EEXIST: this txg has already been synced, so there's nothing to do.
2112  *		The caller should not log the write.
2113  *
2114  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
2115  *		The caller should not log the write.
2116  *
2117  *	EALREADY: this block is already in the process of being synced.
2118  *		The caller should track its progress (somehow).
2119  *
2120  *	EIO: could not do the I/O.
2121  *		The caller should do a txg_wait_synced().
2122  *
2123  *	0: the I/O has been initiated.
2124  *		The caller should log this blkptr in the done callback.
2125  *		It is possible that the I/O will fail, in which case
2126  *		the error will be reported to the done callback and
2127  *		propagated to pio from zio_done().
2128  */
2129 int
dmu_sync(zio_t * pio,uint64_t txg,dmu_sync_cb_t * done,zgd_t * zgd)2130 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
2131 {
2132 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
2133 	objset_t *os = db->db_objset;
2134 	dsl_dataset_t *ds = os->os_dsl_dataset;
2135 	dbuf_dirty_record_t *dr;
2136 	dmu_sync_arg_t *dsa;
2137 	zbookmark_phys_t zb;
2138 	zio_prop_t zp;
2139 	dnode_t *dn;
2140 
2141 	ASSERT(pio != NULL);
2142 	ASSERT(txg != 0);
2143 
2144 	SET_BOOKMARK(&zb, ds->ds_object,
2145 	    db->db.db_object, db->db_level, db->db_blkid);
2146 
2147 	DB_DNODE_ENTER(db);
2148 	dn = DB_DNODE(db);
2149 	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
2150 	DB_DNODE_EXIT(db);
2151 
2152 	/*
2153 	 * If we're frozen (running ziltest), we always need to generate a bp.
2154 	 */
2155 	if (txg > spa_freeze_txg(os->os_spa))
2156 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
2157 
2158 	/*
2159 	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
2160 	 * and us.  If we determine that this txg is not yet syncing,
2161 	 * but it begins to sync a moment later, that's OK because the
2162 	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
2163 	 */
2164 	mutex_enter(&db->db_mtx);
2165 
2166 	if (txg <= spa_last_synced_txg(os->os_spa)) {
2167 		/*
2168 		 * This txg has already synced.  There's nothing to do.
2169 		 */
2170 		mutex_exit(&db->db_mtx);
2171 		return (SET_ERROR(EEXIST));
2172 	}
2173 
2174 	if (txg <= spa_syncing_txg(os->os_spa)) {
2175 		/*
2176 		 * This txg is currently syncing, so we can't mess with
2177 		 * the dirty record anymore; just write a new log block.
2178 		 */
2179 		mutex_exit(&db->db_mtx);
2180 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
2181 	}
2182 
2183 	dr = db->db_last_dirty;
2184 	while (dr && dr->dr_txg != txg)
2185 		dr = dr->dr_next;
2186 
2187 	if (dr == NULL) {
2188 		/*
2189 		 * There's no dr for this dbuf, so it must have been freed.
2190 		 * There's no need to log writes to freed blocks, so we're done.
2191 		 */
2192 		mutex_exit(&db->db_mtx);
2193 		return (SET_ERROR(ENOENT));
2194 	}
2195 
2196 	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
2197 
2198 	if (db->db_blkptr != NULL) {
2199 		/*
2200 		 * We need to fill in zgd_bp with the current blkptr so that
2201 		 * the nopwrite code can check if we're writing the same
2202 		 * data that's already on disk.  We can only nopwrite if we
2203 		 * are sure that after making the copy, db_blkptr will not
2204 		 * change until our i/o completes.  We ensure this by
2205 		 * holding the db_mtx, and only allowing nopwrite if the
2206 		 * block is not already dirty (see below).  This is verified
2207 		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
2208 		 * not changed.
2209 		 */
2210 		*zgd->zgd_bp = *db->db_blkptr;
2211 	}
2212 
2213 	/*
2214 	 * Assume the on-disk data is X, the current syncing data (in
2215 	 * txg - 1) is Y, and the current in-memory data is Z (currently
2216 	 * in dmu_sync).
2217 	 *
2218 	 * We usually want to perform a nopwrite if X and Z are the
2219 	 * same.  However, if Y is different (i.e. the BP is going to
2220 	 * change before this write takes effect), then a nopwrite will
2221 	 * be incorrect - we would override with X, which could have
2222 	 * been freed when Y was written.
2223 	 *
2224 	 * (Note that this is not a concern when we are nop-writing from
2225 	 * syncing context, because X and Y must be identical, because
2226 	 * all previous txgs have been synced.)
2227 	 *
2228 	 * Therefore, we disable nopwrite if the current BP could change
2229 	 * before this TXG.  There are two ways it could change: by
2230 	 * being dirty (dr_next is non-NULL), or by being freed
2231 	 * (dnode_block_freed()).  This behavior is verified by
2232 	 * zio_done(), which VERIFYs that the override BP is identical
2233 	 * to the on-disk BP.
2234 	 */
2235 	DB_DNODE_ENTER(db);
2236 	dn = DB_DNODE(db);
2237 	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
2238 		zp.zp_nopwrite = B_FALSE;
2239 	DB_DNODE_EXIT(db);
2240 
2241 	ASSERT(dr->dr_txg == txg);
2242 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
2243 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2244 		/*
2245 		 * We have already issued a sync write for this buffer,
2246 		 * or this buffer has already been synced.  It could not
2247 		 * have been dirtied since, or we would have cleared the state.
2248 		 */
2249 		mutex_exit(&db->db_mtx);
2250 		return (SET_ERROR(EALREADY));
2251 	}
2252 
2253 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2254 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
2255 	mutex_exit(&db->db_mtx);
2256 
2257 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
2258 	dsa->dsa_dr = dr;
2259 	dsa->dsa_done = done;
2260 	dsa->dsa_zgd = zgd;
2261 	dsa->dsa_tx = NULL;
2262 
2263 	zio_nowait(arc_write(pio, os->os_spa, txg,
2264 	    zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
2265 	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
2266 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
2267 
2268 	return (0);
2269 }
2270 
2271 int
dmu_object_set_blocksize(objset_t * os,uint64_t object,uint64_t size,int ibs,dmu_tx_t * tx)2272 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
2273     dmu_tx_t *tx)
2274 {
2275 	dnode_t *dn;
2276 	int err;
2277 
2278 	err = dnode_hold(os, object, FTAG, &dn);
2279 	if (err)
2280 		return (err);
2281 	err = dnode_set_blksz(dn, size, ibs, tx);
2282 	dnode_rele(dn, FTAG);
2283 	return (err);
2284 }
2285 
2286 void
dmu_object_set_checksum(objset_t * os,uint64_t object,uint8_t checksum,dmu_tx_t * tx)2287 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
2288     dmu_tx_t *tx)
2289 {
2290 	dnode_t *dn;
2291 
2292 	/*
2293 	 * Send streams include each object's checksum function.  This
2294 	 * check ensures that the receiving system can understand the
2295 	 * checksum function transmitted.
2296 	 */
2297 	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
2298 
2299 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
2300 	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
2301 	dn->dn_checksum = checksum;
2302 	dnode_setdirty(dn, tx);
2303 	dnode_rele(dn, FTAG);
2304 }
2305 
2306 void
dmu_object_set_compress(objset_t * os,uint64_t object,uint8_t compress,dmu_tx_t * tx)2307 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
2308     dmu_tx_t *tx)
2309 {
2310 	dnode_t *dn;
2311 
2312 	/*
2313 	 * Send streams include each object's compression function.  This
2314 	 * check ensures that the receiving system can understand the
2315 	 * compression function transmitted.
2316 	 */
2317 	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
2318 
2319 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
2320 	dn->dn_compress = compress;
2321 	dnode_setdirty(dn, tx);
2322 	dnode_rele(dn, FTAG);
2323 }
2324 
2325 int zfs_mdcomp_disable = 0;
2326 SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN,
2327     &zfs_mdcomp_disable, 0, "Disable metadata compression");
2328 
/*
 * With the "redundant_metadata" property set to "most", an indirect block
 * receives an additional ditto block only if its level is at least this
 * value (dmu_write_policy() compares the block level against it).
 */
int zfs_redundant_metadata_most_ditto_level = 2;
2334 
2335 void
dmu_write_policy(objset_t * os,dnode_t * dn,int level,int wp,zio_prop_t * zp)2336 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
2337 {
2338 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
2339 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
2340 	    (wp & WP_SPILL));
2341 	enum zio_checksum checksum = os->os_checksum;
2342 	enum zio_compress compress = os->os_compress;
2343 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
2344 	boolean_t dedup = B_FALSE;
2345 	boolean_t nopwrite = B_FALSE;
2346 	boolean_t dedup_verify = os->os_dedup_verify;
2347 	int copies = os->os_copies;
2348 
2349 	/*
2350 	 * We maintain different write policies for each of the following
2351 	 * types of data:
2352 	 *	 1. metadata
2353 	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
2354 	 *	 3. all other level 0 blocks
2355 	 */
2356 	if (ismd) {
2357 		if (zfs_mdcomp_disable) {
2358 			compress = ZIO_COMPRESS_EMPTY;
2359 		} else {
2360 			/*
2361 			 * XXX -- we should design a compression algorithm
2362 			 * that specializes in arrays of bps.
2363 			 */
2364 			compress = zio_compress_select(os->os_spa,
2365 			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
2366 		}
2367 
2368 		/*
2369 		 * Metadata always gets checksummed.  If the data
2370 		 * checksum is multi-bit correctable, and it's not a
2371 		 * ZBT-style checksum, then it's suitable for metadata
2372 		 * as well.  Otherwise, the metadata checksum defaults
2373 		 * to fletcher4.
2374 		 */
2375 		if (!(zio_checksum_table[checksum].ci_flags &
2376 		    ZCHECKSUM_FLAG_METADATA) ||
2377 		    (zio_checksum_table[checksum].ci_flags &
2378 		    ZCHECKSUM_FLAG_EMBEDDED))
2379 			checksum = ZIO_CHECKSUM_FLETCHER_4;
2380 
2381 		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
2382 		    (os->os_redundant_metadata ==
2383 		    ZFS_REDUNDANT_METADATA_MOST &&
2384 		    (level >= zfs_redundant_metadata_most_ditto_level ||
2385 		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
2386 			copies++;
2387 	} else if (wp & WP_NOFILL) {
2388 		ASSERT(level == 0);
2389 
2390 		/*
2391 		 * If we're writing preallocated blocks, we aren't actually
2392 		 * writing them so don't set any policy properties.  These
2393 		 * blocks are currently only used by an external subsystem
2394 		 * outside of zfs (i.e. dump) and not written by the zio
2395 		 * pipeline.
2396 		 */
2397 		compress = ZIO_COMPRESS_OFF;
2398 		checksum = ZIO_CHECKSUM_NOPARITY;
2399 	} else {
2400 		compress = zio_compress_select(os->os_spa, dn->dn_compress,
2401 		    compress);
2402 
2403 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
2404 		    zio_checksum_select(dn->dn_checksum, checksum) :
2405 		    dedup_checksum;
2406 
2407 		/*
2408 		 * Determine dedup setting.  If we are in dmu_sync(),
2409 		 * we won't actually dedup now because that's all
2410 		 * done in syncing context; but we do want to use the
2411 		 * dedup checkum.  If the checksum is not strong
2412 		 * enough to ensure unique signatures, force
2413 		 * dedup_verify.
2414 		 */
2415 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
2416 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
2417 			if (!(zio_checksum_table[checksum].ci_flags &
2418 			    ZCHECKSUM_FLAG_DEDUP))
2419 				dedup_verify = B_TRUE;
2420 		}
2421 
2422 		/*
2423 		 * Enable nopwrite if we have secure enough checksum
2424 		 * algorithm (see comment in zio_nop_write) and
2425 		 * compression is enabled.  We don't enable nopwrite if
2426 		 * dedup is enabled as the two features are mutually
2427 		 * exclusive.
2428 		 */
2429 		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
2430 		    ZCHECKSUM_FLAG_NOPWRITE) &&
2431 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
2432 	}
2433 
2434 	zp->zp_checksum = checksum;
2435 	zp->zp_compress = compress;
2436 	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
2437 
2438 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
2439 	zp->zp_level = level;
2440 	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
2441 	zp->zp_dedup = dedup;
2442 	zp->zp_dedup_verify = dedup && dedup_verify;
2443 	zp->zp_nopwrite = nopwrite;
2444 }
2445 
2446 int
dmu_offset_next(objset_t * os,uint64_t object,boolean_t hole,uint64_t * off)2447 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
2448 {
2449 	dnode_t *dn;
2450 	int err;
2451 
2452 	/*
2453 	 * Sync any current changes before
2454 	 * we go trundling through the block pointers.
2455 	 */
2456 	err = dmu_object_wait_synced(os, object);
2457 	if (err) {
2458 		return (err);
2459 	}
2460 
2461 	err = dnode_hold(os, object, FTAG, &dn);
2462 	if (err) {
2463 		return (err);
2464 	}
2465 
2466 	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
2467 	dnode_rele(dn, FTAG);
2468 
2469 	return (err);
2470 }
2471 
2472 /*
2473  * Given the ZFS object, if it contains any dirty nodes
2474  * this function flushes all dirty blocks to disk. This
2475  * ensures the DMU object info is updated. A more efficient
2476  * future version might just find the TXG with the maximum
2477  * ID and wait for that to be synced.
2478  */
2479 int
dmu_object_wait_synced(objset_t * os,uint64_t object)2480 dmu_object_wait_synced(objset_t *os, uint64_t object)
2481 {
2482 	dnode_t *dn;
2483 	int error, i;
2484 
2485 	error = dnode_hold(os, object, FTAG, &dn);
2486 	if (error) {
2487 		return (error);
2488 	}
2489 
2490 	for (i = 0; i < TXG_SIZE; i++) {
2491 		if (list_link_active(&dn->dn_dirty_link[i])) {
2492 			break;
2493 		}
2494 	}
2495 	dnode_rele(dn, FTAG);
2496 	if (i != TXG_SIZE) {
2497 		txg_wait_synced(dmu_objset_pool(os), 0);
2498 	}
2499 
2500 	return (0);
2501 }
2502 
2503 void
dmu_object_info_from_dnode(dnode_t * dn,dmu_object_info_t * doi)2504 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
2505 {
2506 	dnode_phys_t *dnp;
2507 
2508 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
2509 	mutex_enter(&dn->dn_mtx);
2510 
2511 	dnp = dn->dn_phys;
2512 
2513 	doi->doi_data_block_size = dn->dn_datablksz;
2514 	doi->doi_metadata_block_size = dn->dn_indblkshift ?
2515 	    1ULL << dn->dn_indblkshift : 0;
2516 	doi->doi_type = dn->dn_type;
2517 	doi->doi_bonus_type = dn->dn_bonustype;
2518 	doi->doi_bonus_size = dn->dn_bonuslen;
2519 	doi->doi_indirection = dn->dn_nlevels;
2520 	doi->doi_checksum = dn->dn_checksum;
2521 	doi->doi_compress = dn->dn_compress;
2522 	doi->doi_nblkptr = dn->dn_nblkptr;
2523 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
2524 	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
2525 	doi->doi_fill_count = 0;
2526 	for (int i = 0; i < dnp->dn_nblkptr; i++)
2527 		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
2528 
2529 	mutex_exit(&dn->dn_mtx);
2530 	rw_exit(&dn->dn_struct_rwlock);
2531 }
2532 
2533 /*
2534  * Get information on a DMU object.
2535  * If doi is NULL, just indicates whether the object exists.
2536  */
2537 int
dmu_object_info(objset_t * os,uint64_t object,dmu_object_info_t * doi)2538 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
2539 {
2540 	dnode_t *dn;
2541 	int err = dnode_hold(os, object, FTAG, &dn);
2542 
2543 	if (err)
2544 		return (err);
2545 
2546 	if (doi != NULL)
2547 		dmu_object_info_from_dnode(dn, doi);
2548 
2549 	dnode_rele(dn, FTAG);
2550 	return (0);
2551 }
2552 
2553 /*
2554  * As above, but faster; can be used when you have a held dbuf in hand.
2555  */
2556 void
dmu_object_info_from_db(dmu_buf_t * db_fake,dmu_object_info_t * doi)2557 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
2558 {
2559 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2560 
2561 	DB_DNODE_ENTER(db);
2562 	dmu_object_info_from_dnode(DB_DNODE(db), doi);
2563 	DB_DNODE_EXIT(db);
2564 }
2565 
2566 /*
2567  * Faster still when you only care about the size.
2568  * This is specifically optimized for zfs_getattr().
2569  */
2570 void
dmu_object_size_from_db(dmu_buf_t * db_fake,uint32_t * blksize,u_longlong_t * nblk512)2571 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
2572     u_longlong_t *nblk512)
2573 {
2574 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2575 	dnode_t *dn;
2576 
2577 	DB_DNODE_ENTER(db);
2578 	dn = DB_DNODE(db);
2579 
2580 	*blksize = dn->dn_datablksz;
2581 	/* add 1 for dnode space */
2582 	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
2583 	    SPA_MINBLOCKSHIFT) + 1;
2584 	DB_DNODE_EXIT(db);
2585 }
2586 
/*
 * Byte-swap, in place, every 64-bit word of the buffer at vbuf.
 * 'size' is the buffer length in bytes and must be a multiple of 8
 * (asserted).
 */
void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;

	ASSERT((size & 7) == 0);

	/*
	 * The index has the same type as the element count, so the loop
	 * condition compares like with like and cannot overflow a signed
	 * int on very large buffers.
	 */
	for (size_t i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
2599 
/*
 * Byte-swap, in place, every 32-bit word of the buffer at vbuf.
 * 'size' is the buffer length in bytes and must be a multiple of 4
 * (asserted).
 */
void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;

	ASSERT((size & 3) == 0);

	/*
	 * The index has the same type as the element count, so the loop
	 * condition compares like with like and cannot overflow a signed
	 * int on very large buffers.
	 */
	for (size_t i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}
2612 
/*
 * Byte-swap, in place, every 16-bit word of the buffer at vbuf.
 * 'size' is the buffer length in bytes and must be a multiple of 2
 * (asserted).
 */
void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;

	ASSERT((size & 1) == 0);

	/*
	 * The index has the same type as the element count, so the loop
	 * condition compares like with like and cannot overflow a signed
	 * int on very large buffers.
	 */
	for (size_t i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}
2625 
/*
 * Single bytes have no byte order, so this member of the byteswap family
 * leaves the buffer untouched.
 */
/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
	(void) vbuf;
	(void) size;
}
2631 
/*
 * Bring up the subsystems the DMU relies on by calling their init
 * routines one after another.  dmu_fini() calls the matching fini
 * routines.
 */
void
dmu_init(void)
{
	/* Buffer, debug-message, SA cache and xuio statistics support. */
	abd_init();
	zfs_dbgmsg_init();
	sa_cache_init();
	xuio_stat_init();

	/* Object set, dnode and prefetch layers. */
	dmu_objset_init();
	dnode_init();
	zfetch_init();

	/* Compression, then the caches: l2arc, arc and finally dbuf. */
	zio_compress_init();
	l2arc_init();
	arc_init();
	dbuf_init();
}
2647 
/*
 * Tear down everything dmu_init() set up by calling the matching fini
 * routines.
 */
void
dmu_fini(void)
{
	/* arc has to be shut down ahead of l2arc: arc depends on l2arc. */
	arc_fini();
	l2arc_fini();

	zfetch_fini();
	zio_compress_fini();
	dbuf_fini();
	dnode_fini();
	dmu_objset_fini();

	xuio_stat_fini();
	sa_cache_fini();
	zfs_dbgmsg_fini();
	abd_fini();
}
2662 }
2663