1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 */
26
27 #include <sys/dmu.h>
28 #include <sys/dmu_impl.h>
29 #include <sys/dbuf.h>
30 #include <sys/dmu_tx.h>
31 #include <sys/dmu_objset.h>
32 #include <sys/dsl_dataset.h>
33 #include <sys/dsl_dir.h>
34 #include <sys/dsl_pool.h>
35 #include <sys/zap_impl.h>
36 #include <sys/spa.h>
37 #include <sys/sa.h>
38 #include <sys/sa_impl.h>
39 #include <sys/zfs_context.h>
40 #include <sys/trace_zfs.h>
41
/*
 * Function-pointer type for a per-hold routine: it receives the
 * transaction, a dnode, and two 64-bit arguments, and returns nothing.
 */
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);
44
/*
 * Named kstat counters for DMU transaction events.  Every entry pairs a
 * counter name with the KSTAT_DATA_UINT64 data type.
 * NOTE(review): the entry order presumably has to match the member order
 * of dmu_tx_stats_t (declared elsewhere) -- confirm before reordering.
 */
dmu_tx_stats_t dmu_tx_stats = {
	{ "dmu_tx_assigned", KSTAT_DATA_UINT64 },
	{ "dmu_tx_delay", KSTAT_DATA_UINT64 },
	{ "dmu_tx_error", KSTAT_DATA_UINT64 },
	{ "dmu_tx_suspended", KSTAT_DATA_UINT64 },
	{ "dmu_tx_group", KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
	{ "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 },
	{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
};
60
/*
 * kstat handle.  NOTE(review): not referenced in the visible part of this
 * file; presumably it publishes dmu_tx_stats -- confirm against its users.
 */
static kstat_t *dmu_tx_ksp;
62
63 dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t * dd)64 dmu_tx_create_dd(dsl_dir_t *dd)
65 {
66 dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
67 tx->tx_dir = dd;
68 if (dd != NULL)
69 tx->tx_pool = dd->dd_pool;
70 list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
71 offsetof(dmu_tx_hold_t, txh_node));
72 list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
73 offsetof(dmu_tx_callback_t, dcb_node));
74 tx->tx_start = gethrtime();
75 return (tx);
76 }
77
78 dmu_tx_t *
dmu_tx_create(objset_t * os)79 dmu_tx_create(objset_t *os)
80 {
81 dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
82 tx->tx_objset = os;
83 return (tx);
84 }
85
86 dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool * dp,uint64_t txg)87 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
88 {
89 dmu_tx_t *tx = dmu_tx_create_dd(NULL);
90
91 TXG_VERIFY(dp->dp_spa, txg);
92 tx->tx_pool = dp;
93 tx->tx_txg = txg;
94 tx->tx_anyobj = TRUE;
95
96 return (tx);
97 }
98
99 int
dmu_tx_is_syncing(dmu_tx_t * tx)100 dmu_tx_is_syncing(dmu_tx_t *tx)
101 {
102 return (tx->tx_anyobj);
103 }
104
105 int
dmu_tx_private_ok(dmu_tx_t * tx)106 dmu_tx_private_ok(dmu_tx_t *tx)
107 {
108 return (tx->tx_anyobj);
109 }
110
111 static dmu_tx_hold_t *
dmu_tx_hold_dnode_impl(dmu_tx_t * tx,dnode_t * dn,enum dmu_tx_hold_type type,uint64_t arg1,uint64_t arg2)112 dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
113 uint64_t arg1, uint64_t arg2)
114 {
115 dmu_tx_hold_t *txh;
116
117 if (dn != NULL) {
118 (void) zfs_refcount_add(&dn->dn_holds, tx);
119 if (tx->tx_txg != 0) {
120 mutex_enter(&dn->dn_mtx);
121 /*
122 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
123 * problem, but there's no way for it to happen (for
124 * now, at least).
125 */
126 ASSERT(dn->dn_assigned_txg == 0);
127 dn->dn_assigned_txg = tx->tx_txg;
128 (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
129 mutex_exit(&dn->dn_mtx);
130 }
131 }
132
133 txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
134 txh->txh_tx = tx;
135 txh->txh_dnode = dn;
136 zfs_refcount_create(&txh->txh_space_towrite);
137 zfs_refcount_create(&txh->txh_memory_tohold);
138 txh->txh_type = type;
139 txh->txh_arg1 = arg1;
140 txh->txh_arg2 = arg2;
141 list_insert_tail(&tx->tx_holds, txh);
142
143 return (txh);
144 }
145
146 static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t * tx,objset_t * os,uint64_t object,enum dmu_tx_hold_type type,uint64_t arg1,uint64_t arg2)147 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
148 enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
149 {
150 dnode_t *dn = NULL;
151 dmu_tx_hold_t *txh;
152 int err;
153
154 if (object != DMU_NEW_OBJECT) {
155 err = dnode_hold(os, object, FTAG, &dn);
156 if (err != 0) {
157 tx->tx_err = err;
158 return (NULL);
159 }
160 }
161 txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
162 if (dn != NULL)
163 dnode_rele(dn, FTAG);
164 return (txh);
165 }
166
167 void
dmu_tx_add_new_object(dmu_tx_t * tx,dnode_t * dn)168 dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
169 {
170 /*
171 * If we're syncing, they can manipulate any object anyhow, and
172 * the hold on the dnode_t can cause problems.
173 */
174 if (!dmu_tx_is_syncing(tx))
175 (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
176 }
177
178 /*
179 * This function reads specified data from disk. The specified data will
 * be needed to perform the transaction -- i.e., it will be read after
181 * we do dmu_tx_assign(). There are two reasons that we read the data now
182 * (before dmu_tx_assign()):
183 *
184 * 1. Reading it now has potentially better performance. The transaction
185 * has not yet been assigned, so the TXG is not held open, and also the
186 * caller typically has less locks held when calling dmu_tx_hold_*() than
187 * after the transaction has been assigned. This reduces the lock (and txg)
188 * hold times, thus reducing lock contention.
189 *
190 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
191 * that are detected before they start making changes to the DMU state
192 * (i.e. now). Once the transaction has been assigned, and some DMU
193 * state has been changed, it can be difficult to recover from an i/o
194 * error (e.g. to undo the changes already made in memory at the DMU
195 * layer). Typically code to do so does not exist in the caller -- it
196 * assumes that the data has already been cached and thus i/o errors are
197 * not possible.
198 *
199 * It has been observed that the i/o initiated here can be a performance
200 * problem, and it appears to be optional, because we don't look at the
201 * data which is read. However, removing this read would only serve to
202 * move the work elsewhere (after the dmu_tx_assign()), where it may
203 * have a greater impact on performance (in addition to the impact on
204 * fault tolerance noted above).
205 */
206 static int
dmu_tx_check_ioerr(zio_t * zio,dnode_t * dn,int level,uint64_t blkid)207 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
208 {
209 int err;
210 dmu_buf_impl_t *db;
211
212 rw_enter(&dn->dn_struct_rwlock, RW_READER);
213 db = dbuf_hold_level(dn, level, blkid, FTAG);
214 rw_exit(&dn->dn_struct_rwlock);
215 if (db == NULL)
216 return (SET_ERROR(EIO));
217 err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
218 dbuf_rele(db, FTAG);
219 return (err);
220 }
221
222 /* ARGSUSED */
223 static void
dmu_tx_count_write(dmu_tx_hold_t * txh,uint64_t off,uint64_t len)224 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
225 {
226 dnode_t *dn = txh->txh_dnode;
227 int err = 0;
228
229 if (len == 0)
230 return;
231
232 (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
233
234 if (dn == NULL)
235 return;
236
237 /*
238 * For i/o error checking, read the blocks that will be needed
239 * to perform the write: the first and last level-0 blocks (if
240 * they are not aligned, i.e. if they are partial-block writes),
241 * and all the level-1 blocks.
242 */
243 if (dn->dn_maxblkid == 0) {
244 if (off < dn->dn_datablksz &&
245 (off > 0 || len < dn->dn_datablksz)) {
246 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
247 if (err != 0) {
248 txh->txh_tx->tx_err = err;
249 }
250 }
251 } else {
252 zio_t *zio = zio_root(dn->dn_objset->os_spa,
253 NULL, NULL, ZIO_FLAG_CANFAIL);
254
255 /* first level-0 block */
256 uint64_t start = off >> dn->dn_datablkshift;
257 if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
258 err = dmu_tx_check_ioerr(zio, dn, 0, start);
259 if (err != 0) {
260 txh->txh_tx->tx_err = err;
261 }
262 }
263
264 /* last level-0 block */
265 uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
266 if (end != start && end <= dn->dn_maxblkid &&
267 P2PHASE(off + len, dn->dn_datablksz)) {
268 err = dmu_tx_check_ioerr(zio, dn, 0, end);
269 if (err != 0) {
270 txh->txh_tx->tx_err = err;
271 }
272 }
273
274 /* level-1 blocks */
275 if (dn->dn_nlevels > 1) {
276 int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
277 for (uint64_t i = (start >> shft) + 1;
278 i < end >> shft; i++) {
279 err = dmu_tx_check_ioerr(zio, dn, 1, i);
280 if (err != 0) {
281 txh->txh_tx->tx_err = err;
282 }
283 }
284 }
285
286 err = zio_wait(zio);
287 if (err != 0) {
288 txh->txh_tx->tx_err = err;
289 }
290 }
291 }
292
293 static void
dmu_tx_count_append(dmu_tx_hold_t * txh,uint64_t off,uint64_t len)294 dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
295 {
296 dnode_t *dn = txh->txh_dnode;
297 int err = 0;
298
299 if (len == 0)
300 return;
301
302 (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
303
304 if (dn == NULL)
305 return;
306
307 /*
308 * For i/o error checking, read the blocks that will be needed
309 * to perform the append; first level-0 block (if not aligned, i.e.
310 * if they are partial-block writes), no additional blocks are read.
311 */
312 if (dn->dn_maxblkid == 0) {
313 if (off < dn->dn_datablksz &&
314 (off > 0 || len < dn->dn_datablksz)) {
315 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
316 if (err != 0) {
317 txh->txh_tx->tx_err = err;
318 }
319 }
320 } else {
321 zio_t *zio = zio_root(dn->dn_objset->os_spa,
322 NULL, NULL, ZIO_FLAG_CANFAIL);
323
324 /* first level-0 block */
325 uint64_t start = off >> dn->dn_datablkshift;
326 if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
327 err = dmu_tx_check_ioerr(zio, dn, 0, start);
328 if (err != 0) {
329 txh->txh_tx->tx_err = err;
330 }
331 }
332
333 err = zio_wait(zio);
334 if (err != 0) {
335 txh->txh_tx->tx_err = err;
336 }
337 }
338 }
339
340 static void
dmu_tx_count_dnode(dmu_tx_hold_t * txh)341 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
342 {
343 (void) zfs_refcount_add_many(&txh->txh_space_towrite,
344 DNODE_MIN_SIZE, FTAG);
345 }
346
347 void
dmu_tx_hold_write(dmu_tx_t * tx,uint64_t object,uint64_t off,int len)348 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
349 {
350 dmu_tx_hold_t *txh;
351
352 ASSERT0(tx->tx_txg);
353 ASSERT3U(len, <=, DMU_MAX_ACCESS);
354 ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
355
356 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
357 object, THT_WRITE, off, len);
358 if (txh != NULL) {
359 dmu_tx_count_write(txh, off, len);
360 dmu_tx_count_dnode(txh);
361 }
362 }
363
364 void
dmu_tx_hold_write_by_dnode(dmu_tx_t * tx,dnode_t * dn,uint64_t off,int len)365 dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
366 {
367 dmu_tx_hold_t *txh;
368
369 ASSERT0(tx->tx_txg);
370 ASSERT3U(len, <=, DMU_MAX_ACCESS);
371 ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
372
373 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
374 if (txh != NULL) {
375 dmu_tx_count_write(txh, off, len);
376 dmu_tx_count_dnode(txh);
377 }
378 }
379
380 /*
381 * Should be used when appending to an object and the exact offset is unknown.
382 * The write must occur at or beyond the specified offset. Only the L0 block
383 * at provided offset will be prefetched.
384 */
385 void
dmu_tx_hold_append(dmu_tx_t * tx,uint64_t object,uint64_t off,int len)386 dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
387 {
388 dmu_tx_hold_t *txh;
389
390 ASSERT0(tx->tx_txg);
391 ASSERT3U(len, <=, DMU_MAX_ACCESS);
392
393 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
394 object, THT_APPEND, off, DMU_OBJECT_END);
395 if (txh != NULL) {
396 dmu_tx_count_append(txh, off, len);
397 dmu_tx_count_dnode(txh);
398 }
399 }
400
401 void
dmu_tx_hold_append_by_dnode(dmu_tx_t * tx,dnode_t * dn,uint64_t off,int len)402 dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
403 {
404 dmu_tx_hold_t *txh;
405
406 ASSERT0(tx->tx_txg);
407 ASSERT3U(len, <=, DMU_MAX_ACCESS);
408
409 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END);
410 if (txh != NULL) {
411 dmu_tx_count_append(txh, off, len);
412 dmu_tx_count_dnode(txh);
413 }
414 }
415
416 /*
417 * This function marks the transaction as being a "net free". The end
418 * result is that refquotas will be disabled for this transaction, and
419 * this transaction will be able to use half of the pool space overhead
420 * (see dsl_pool_adjustedsize()). Therefore this function should only
421 * be called for transactions that we expect will not cause a net increase
422 * in the amount of space used (but it's OK if that is occasionally not true).
423 */
424 void
dmu_tx_mark_netfree(dmu_tx_t * tx)425 dmu_tx_mark_netfree(dmu_tx_t *tx)
426 {
427 tx->tx_netfree = B_TRUE;
428 }
429
/*
 * Account for freeing the byte range [off, off + len) of the dnode
 * attached to this hold.  'len' may be DMU_OBJECT_END, meaning "through
 * the end of the object".  The dnode is dereferenced unconditionally, so
 * the hold must carry a non-NULL txh_dnode.  Errors found while probing
 * the affected blocks are stored in tx->tx_err.
 */
static void
dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dmu_tx_t *tx = txh->txh_tx;
	dnode_t *dn = txh->txh_dnode;
	int err;

	ASSERT(tx->tx_txg == 0);

	dmu_tx_count_dnode(txh);

	/* Nothing more to account for if the range starts past the end. */
	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
		return;
	/* Clamp an open-ended free to the current end of the object. */
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;

	/*
	 * NOTE(review): the dnode is counted a second time here for frees
	 * that overlap the object -- confirm that this is deliberate.
	 */
	dmu_tx_count_dnode(txh);

	/*
	 * For i/o error checking, we read the first and last level-0
	 * blocks if they are not aligned, and all the level-1 blocks.
	 *
	 * Note: dbuf_free_range() assumes that we have not instantiated
	 * any level-0 dbufs that will be completely freed. Therefore we must
	 * exercise care to not read or count the first and last blocks
	 * if they are blocksize-aligned.
	 */
	if (dn->dn_datablkshift == 0) {
		/* No power-of-two block shift: treat block 0 as a whole. */
		if (off != 0 || len < dn->dn_datablksz)
			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
	} else {
		/* first block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off, 1);
		/* last block will be modified if it is not aligned */
		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
			dmu_tx_count_write(txh, off + len, 1);
	}

	/*
	 * Check level-1 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		/*
		 * Presumably log2 of the number of bytes mapped by one
		 * level-1 indirect block; 'start' and 'end' are the
		 * level-1 block ids containing the ends of the range.
		 */
		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		uint64_t start = off >> shift;
		uint64_t end = (off + len) >> shift;

		ASSERT(dn->dn_indblkshift != 0);

		/*
		 * dnode_reallocate() can result in an object with indirect
		 * blocks having an odd data block size. In this case,
		 * just check the single block.
		 */
		if (dn->dn_datablkshift == 0)
			start = end = 0;

		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (uint64_t i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			/*
			 * Presumably skips ahead to the next level-1 region
			 * that actually exists.  'i' is re-derived from the
			 * updated offset, so the loop may jump forward.
			 */
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			/* ESRCH or moving past 'end' ends the scan quietly. */
			if (err == ESRCH || i > end)
				break;
			if (err != 0) {
				/*
				 * Record the error, then wait on the root
				 * zio (ignoring its status) before leaving.
				 */
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}

			/* Charge one indirect block's worth of memory. */
			(void) zfs_refcount_add_many(&txh->txh_memory_tohold,
			    1 << dn->dn_indblkshift, FTAG);

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err != 0) {
				tx->tx_err = err;
				(void) zio_wait(zio);
				return;
			}
		}
		/* Collect the outcome of the reads issued under 'zio'. */
		err = zio_wait(zio);
		if (err != 0) {
			tx->tx_err = err;
			return;
		}
	}
}
519
520 void
dmu_tx_hold_free(dmu_tx_t * tx,uint64_t object,uint64_t off,uint64_t len)521 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
522 {
523 dmu_tx_hold_t *txh;
524
525 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
526 object, THT_FREE, off, len);
527 if (txh != NULL)
528 (void) dmu_tx_hold_free_impl(txh, off, len);
529 }
530
531 void
dmu_tx_hold_free_by_dnode(dmu_tx_t * tx,dnode_t * dn,uint64_t off,uint64_t len)532 dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
533 {
534 dmu_tx_hold_t *txh;
535
536 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
537 if (txh != NULL)
538 (void) dmu_tx_hold_free_impl(txh, off, len);
539 }
540
541 static void
dmu_tx_hold_zap_impl(dmu_tx_hold_t * txh,const char * name)542 dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
543 {
544 dmu_tx_t *tx = txh->txh_tx;
545 dnode_t *dn = txh->txh_dnode;
546 int err;
547
548 ASSERT(tx->tx_txg == 0);
549
550 dmu_tx_count_dnode(txh);
551
552 /*
553 * Modifying a almost-full microzap is around the worst case (128KB)
554 *
555 * If it is a fat zap, the worst case would be 7*16KB=112KB:
556 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
557 * - 4 new blocks written if adding:
558 * - 2 blocks for possibly split leaves,
559 * - 2 grown ptrtbl blocks
560 */
561 (void) zfs_refcount_add_many(&txh->txh_space_towrite,
562 MZAP_MAX_BLKSZ, FTAG);
563
564 if (dn == NULL)
565 return;
566
567 ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
568
569 if (dn->dn_maxblkid == 0 || name == NULL) {
570 /*
571 * This is a microzap (only one block), or we don't know
572 * the name. Check the first block for i/o errors.
573 */
574 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
575 if (err != 0) {
576 tx->tx_err = err;
577 }
578 } else {
579 /*
580 * Access the name so that we'll check for i/o errors to
581 * the leaf blocks, etc. We ignore ENOENT, as this name
582 * may not yet exist.
583 */
584 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
585 if (err == EIO || err == ECKSUM || err == ENXIO) {
586 tx->tx_err = err;
587 }
588 }
589 }
590
591 void
dmu_tx_hold_zap(dmu_tx_t * tx,uint64_t object,int add,const char * name)592 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
593 {
594 dmu_tx_hold_t *txh;
595
596 ASSERT0(tx->tx_txg);
597
598 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
599 object, THT_ZAP, add, (uintptr_t)name);
600 if (txh != NULL)
601 dmu_tx_hold_zap_impl(txh, name);
602 }
603
604 void
dmu_tx_hold_zap_by_dnode(dmu_tx_t * tx,dnode_t * dn,int add,const char * name)605 dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
606 {
607 dmu_tx_hold_t *txh;
608
609 ASSERT0(tx->tx_txg);
610 ASSERT(dn != NULL);
611
612 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
613 if (txh != NULL)
614 dmu_tx_hold_zap_impl(txh, name);
615 }
616
617 void
dmu_tx_hold_bonus(dmu_tx_t * tx,uint64_t object)618 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
619 {
620 dmu_tx_hold_t *txh;
621
622 ASSERT(tx->tx_txg == 0);
623
624 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
625 object, THT_BONUS, 0, 0);
626 if (txh)
627 dmu_tx_count_dnode(txh);
628 }
629
630 void
dmu_tx_hold_bonus_by_dnode(dmu_tx_t * tx,dnode_t * dn)631 dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
632 {
633 dmu_tx_hold_t *txh;
634
635 ASSERT0(tx->tx_txg);
636
637 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
638 if (txh)
639 dmu_tx_count_dnode(txh);
640 }
641
642 void
dmu_tx_hold_space(dmu_tx_t * tx,uint64_t space)643 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
644 {
645 dmu_tx_hold_t *txh;
646
647 ASSERT(tx->tx_txg == 0);
648
649 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
650 DMU_NEW_OBJECT, THT_SPACE, space, 0);
651 if (txh) {
652 (void) zfs_refcount_add_many(
653 &txh->txh_space_towrite, space, FTAG);
654 }
655 }
656
657 #ifdef ZFS_DEBUG
/*
 * Debug-only check (compiled under ZFS_DEBUG): verify that the dbuf 'db'
 * being dirtied is covered by one of the holds recorded on 'tx'.  Returns
 * quietly when it is; panics when no hold matches.  Transactions with
 * tx_anyobj set, and dbufs of the meta dnode, are not checked.
 */
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	/*
	 * These flags accumulate over the whole hold list; they are not
	 * reset between holds.
	 */
	boolean_t match_object = B_FALSE;
	boolean_t match_offset = B_FALSE;

	/* Every return path below must pair this with DB_DNODE_EXIT(). */
	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		/* Holds without a dnode are also considered for the offset. */
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			/*
			 * Convert the hold's byte range (txh_arg1 is the
			 * offset, txh_arg2 the length) into a block-id range
			 * at this dbuf's level.  A shift of 64 or more
			 * collapses the range to block 0.
			 */
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, (u_longlong_t)beginblk,
			    (u_longlong_t)endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_APPEND:
				/* An open-ended hold has no upper bound. */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;

				/*
				 * THT_WRITE used for bonus and spill blocks.
				 */
				ASSERT(blkid != DMU_BONUS_BLKID &&
				    blkid != DMU_SPILL_BLKID);

				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				/* A ZAP hold covers every block of the object. */
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				cmn_err(CE_PANIC, "bad txh_type %d",
				    txh->txh_type);
			}
		}
		/* Covered once both the object and the offset have matched. */
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	/* No hold covers this dbuf: this is a caller bug. */
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
783 #endif
784
785 /*
786 * If we can't do 10 iops, something is wrong. Let us go ahead
787 * and hit zfs_dirty_data_max.
788 */
/*
 * Upper bound, in nanoseconds, on the per-transaction delay computed by
 * dmu_tx_delay().
 * NOTE(review): the "100 milliseconds" figure assumes MICROSEC is
 * 1,000,000 (defined outside this file) -- confirm.
 */
hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably the timer resolution used when sleeping -- confirm against
 * its users.
 */
int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
791
792 /*
793 * We delay transactions when we've determined that the backend storage
794 * isn't able to accommodate the rate of incoming writes.
795 *
796 * If there is already a transaction waiting, we delay relative to when
797 * that transaction finishes waiting. This way the calculated min_time
798 * is independent of the number of threads concurrently executing
799 * transactions.
800 *
801 * If we are the only waiter, wait relative to when the transaction
802 * started, rather than the current time. This credits the transaction for
803 * "time already served", e.g. reading indirect blocks.
804 *
805 * The minimum time for a transaction to take is calculated as:
806 * min_time = scale * (dirty - min) / (max - dirty)
807 * min_time is then capped at zfs_delay_max_ns.
808 *
809 * The delay has two degrees of freedom that can be adjusted via tunables.
810 * The percentage of dirty data at which we start to delay is defined by
811 * zfs_delay_min_dirty_percent. This should typically be at or above
812 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
813 * delay after writing at full speed has failed to keep up with the incoming
814 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
815 * speaking, this variable determines the amount of delay at the midpoint of
816 * the curve.
817 *
818 * delay
819 * 10ms +-------------------------------------------------------------*+
820 * | *|
821 * 9ms + *+
822 * | *|
823 * 8ms + *+
824 * | * |
825 * 7ms + * +
826 * | * |
827 * 6ms + * +
828 * | * |
829 * 5ms + * +
830 * | * |
831 * 4ms + * +
832 * | * |
833 * 3ms + * +
834 * | * |
835 * 2ms + (midpoint) * +
836 * | | ** |
837 * 1ms + v *** +
838 * | zfs_delay_scale ----------> ******** |
839 * 0 +-------------------------------------*********----------------+
840 * 0% <- zfs_dirty_data_max -> 100%
841 *
842 * Note that since the delay is added to the outstanding time remaining on the
843 * most recent transaction, the delay is effectively the inverse of IOPS.
844 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
845 * was chosen such that small changes in the amount of accumulated dirty data
846 * in the first 3/4 of the curve yield relatively small differences in the
847 * amount of delay.
848 *
849 * The effects can be easier to understand when the amount of delay is
850 * represented on a log scale:
851 *
852 * delay
853 * 100ms +-------------------------------------------------------------++
854 * + +
855 * | |
856 * + *+
857 * 10ms + *+
858 * + ** +
859 * | (midpoint) ** |
860 * + | ** +
861 * 1ms + v **** +
862 * + zfs_delay_scale ----------> ***** +
863 * | **** |
864 * + **** +
865 * 100us + ** +
866 * + * +
867 * | * |
868 * + * +
869 * 10us + * +
870 * + +
871 * | |
872 * + +
873 * +--------------------------------------------------------------+
874 * 0% <- zfs_dirty_data_max -> 100%
875 *
876 * Note here that only as the amount of dirty data approaches its limit does
877 * the delay start to increase rapidly. The goal of a properly tuned system
878 * should be to keep the amount of dirty data out of that range by first
879 * ensuring that the appropriate limits are set for the I/O scheduler to reach
880 * optimal throughput on the backend storage, and then by changing the value
881 * of zfs_delay_scale to increase the steepness of the curve.
882 */
883 static void
dmu_tx_delay(dmu_tx_t * tx,uint64_t dirty)884 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
885 {
886 dsl_pool_t *dp = tx->tx_pool;
887 uint64_t delay_min_bytes, wrlog;
888 hrtime_t wakeup, tx_time = 0, now;
889
890 /* Calculate minimum transaction time for the dirty data amount. */
891 delay_min_bytes =
892 zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
893 if (dirty > delay_min_bytes) {
894 /*
895 * The caller has already waited until we are under the max.
896 * We make them pass us the amount of dirty data so we don't
897 * have to handle the case of it being >= the max, which
898 * could cause a divide-by-zero if it's == the max.
899 */
900 ASSERT3U(dirty, <, zfs_dirty_data_max);
901
902 tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
903 (zfs_dirty_data_max - dirty);
904 }
905
906 /* Calculate minimum transaction time for the TX_WRITE log size. */
907 wrlog = aggsum_upper_bound(&dp->dp_wrlog_total);
908 delay_min_bytes =
909 zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
910 if (wrlog >= zfs_wrlog_data_max) {
911 tx_time = zfs_delay_max_ns;
912 } else if (wrlog > delay_min_bytes) {
913 tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) /
914 (zfs_wrlog_data_max - wrlog), tx_time);
915 }
916
917 if (tx_time == 0)
918 return;
919
920 tx_time = MIN(tx_time, zfs_delay_max_ns);
921 now = gethrtime();
922 if (now > tx->tx_start + tx_time)
923 return;
924
925 DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
926 uint64_t, tx_time);
927
928 mutex_enter(&dp->dp_lock);
929 wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time);
930 dp->dp_last_wakeup = wakeup;
931 mutex_exit(&dp->dp_lock);
932
933 zfs_sleep_until(wakeup);
934 }
935
936 /*
937 * This routine attempts to assign the transaction to a transaction group.
938 * To do so, we must determine if there is sufficient free space on disk.
939 *
940 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
941 * on it), then it is assumed that there is sufficient free space,
942 * unless there's insufficient slop space in the pool (see the comment
943 * above spa_slop_shift in spa_misc.c).
944 *
945 * If it is not a "netfree" transaction, then if the data already on disk
946 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
947 * ENOSPC. Otherwise, if the current rough estimate of pending changes,
948 * plus the rough estimate of this transaction's changes, may exceed the
949 * allowed usage, then this will fail with ERESTART, which will cause the
950 * caller to wait for the pending changes to be written to disk (by waiting
951 * for the next TXG to open), and then check the space usage again.
952 *
953 * The rough estimate of pending changes is comprised of the sum of:
954 *
955 * - this transaction's holds' txh_space_towrite
956 *
957 * - dd_tempreserved[], which is the sum of in-flight transactions'
958 * holds' txh_space_towrite (i.e. those transactions that have called
959 * dmu_tx_assign() but not yet called dmu_tx_commit()).
960 *
961 * - dd_space_towrite[], which is the amount of dirtied dbufs.
962 *
963 * Note that all of these values are inflated by spa_get_worst_case_asize(),
964 * which means that we may get ERESTART well before we are actually in danger
965 * of running out of space, but this also mitigates any small inaccuracies
966 * in the rough estimate (e.g. txh_space_towrite doesn't take into account
967 * indirect blocks, and dd_space_towrite[] doesn't take into account changes
968 * to the MOS).
969 *
970 * Note that due to this algorithm, it is possible to exceed the allowed
971 * usage by one transaction. Also, as we approach the allowed usage,
972 * we will allow a very limited amount of changes into each TXG, thus
973 * decreasing performance.
974 */
975 static int
dmu_tx_try_assign(dmu_tx_t * tx,uint64_t txg_how)976 dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
977 {
978 spa_t *spa = tx->tx_pool->dp_spa;
979
980 ASSERT0(tx->tx_txg);
981
982 if (tx->tx_err) {
983 DMU_TX_STAT_BUMP(dmu_tx_error);
984 return (tx->tx_err);
985 }
986
987 if (spa_suspended(spa)) {
988 DMU_TX_STAT_BUMP(dmu_tx_suspended);
989
990 /*
991 * If the user has indicated a blocking failure mode
992 * then return ERESTART which will block in dmu_tx_wait().
993 * Otherwise, return EIO so that an error can get
994 * propagated back to the VOP calls.
995 *
996 * Note that we always honor the txg_how flag regardless
997 * of the failuremode setting.
998 */
999 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
1000 !(txg_how & TXG_WAIT))
1001 return (SET_ERROR(EIO));
1002
1003 return (SET_ERROR(ERESTART));
1004 }
1005
1006 if (!tx->tx_dirty_delayed &&
1007 dsl_pool_need_wrlog_delay(tx->tx_pool)) {
1008 tx->tx_wait_dirty = B_TRUE;
1009 DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay);
1010 return (SET_ERROR(ERESTART));
1011 }
1012
1013 if (!tx->tx_dirty_delayed &&
1014 dsl_pool_need_dirty_delay(tx->tx_pool)) {
1015 tx->tx_wait_dirty = B_TRUE;
1016 DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
1017 return (SET_ERROR(ERESTART));
1018 }
1019
1020 tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
1021 tx->tx_needassign_txh = NULL;
1022
1023 /*
1024 * NB: No error returns are allowed after txg_hold_open, but
1025 * before processing the dnode holds, due to the
1026 * dmu_tx_unassign() logic.
1027 */
1028
1029 uint64_t towrite = 0;
1030 uint64_t tohold = 0;
1031 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
1032 txh = list_next(&tx->tx_holds, txh)) {
1033 dnode_t *dn = txh->txh_dnode;
1034 if (dn != NULL) {
1035 /*
1036 * This thread can't hold the dn_struct_rwlock
1037 * while assigning the tx, because this can lead to
1038 * deadlock. Specifically, if this dnode is already
1039 * assigned to an earlier txg, this thread may need
1040 * to wait for that txg to sync (the ERESTART case
1041 * below). The other thread that has assigned this
1042 * dnode to an earlier txg prevents this txg from
1043 * syncing until its tx can complete (calling
1044 * dmu_tx_commit()), but it may need to acquire the
1045 * dn_struct_rwlock to do so (e.g. via
1046 * dmu_buf_hold*()).
1047 *
1048 * Note that this thread can't hold the lock for
1049 * read either, but the rwlock doesn't record
1050 * enough information to make that assertion.
1051 */
1052 ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock));
1053
1054 mutex_enter(&dn->dn_mtx);
1055 if (dn->dn_assigned_txg == tx->tx_txg - 1) {
1056 mutex_exit(&dn->dn_mtx);
1057 tx->tx_needassign_txh = txh;
1058 DMU_TX_STAT_BUMP(dmu_tx_group);
1059 return (SET_ERROR(ERESTART));
1060 }
1061 if (dn->dn_assigned_txg == 0)
1062 dn->dn_assigned_txg = tx->tx_txg;
1063 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1064 (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
1065 mutex_exit(&dn->dn_mtx);
1066 }
1067 towrite += zfs_refcount_count(&txh->txh_space_towrite);
1068 tohold += zfs_refcount_count(&txh->txh_memory_tohold);
1069 }
1070
1071 /* needed allocation: worst-case estimate of write space */
1072 uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
1073 /* calculate memory footprint estimate */
1074 uint64_t memory = towrite + tohold;
1075
1076 if (tx->tx_dir != NULL && asize != 0) {
1077 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1078 asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
1079 if (err != 0)
1080 return (err);
1081 }
1082
1083 DMU_TX_STAT_BUMP(dmu_tx_assigned);
1084
1085 return (0);
1086 }
1087
1088 static void
dmu_tx_unassign(dmu_tx_t * tx)1089 dmu_tx_unassign(dmu_tx_t *tx)
1090 {
1091 if (tx->tx_txg == 0)
1092 return;
1093
1094 txg_rele_to_quiesce(&tx->tx_txgh);
1095
1096 /*
1097 * Walk the transaction's hold list, removing the hold on the
1098 * associated dnode, and notifying waiters if the refcount drops to 0.
1099 */
1100 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
1101 txh && txh != tx->tx_needassign_txh;
1102 txh = list_next(&tx->tx_holds, txh)) {
1103 dnode_t *dn = txh->txh_dnode;
1104
1105 if (dn == NULL)
1106 continue;
1107 mutex_enter(&dn->dn_mtx);
1108 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1109
1110 if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1111 dn->dn_assigned_txg = 0;
1112 cv_broadcast(&dn->dn_notxholds);
1113 }
1114 mutex_exit(&dn->dn_mtx);
1115 }
1116
1117 txg_rele_to_sync(&tx->tx_txgh);
1118
1119 tx->tx_lasttried_txg = tx->tx_txg;
1120 tx->tx_txg = 0;
1121 }
1122
1123 /*
1124 * Assign tx to a transaction group; txg_how is a bitmask:
1125 *
1126 * If TXG_WAIT is set and the currently open txg is full, this function
1127 * will wait until there's a new txg. This should be used when no locks
1128 * are being held. With this bit set, this function will only fail if
1129 * we're truly out of space (or over quota).
1130 *
1131 * If TXG_WAIT is *not* set and we can't assign into the currently open
1132 * txg without blocking, this function will return immediately with
1133 * ERESTART. This should be used whenever locks are being held. On an
1134 * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
1135 * and try again.
1136 *
 * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
 * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
1139 * details on the throttle). This is used by the VFS operations, after
1140 * they have already called dmu_tx_wait() (though most likely on a
1141 * different tx).
1142 *
1143 * It is guaranteed that subsequent successful calls to dmu_tx_assign()
1144 * will assign the tx to monotonically increasing txgs. Of course this is
1145 * not strong monotonicity, because the same txg can be returned multiple
1146 * times in a row. This guarantee holds both for subsequent calls from
1147 * one thread and for multiple threads. For example, it is impossible to
1148 * observe the following sequence of events:
1149 *
1150 * Thread 1 Thread 2
1151 *
1152 * dmu_tx_assign(T1, ...)
1153 * 1 <- dmu_tx_get_txg(T1)
1154 * dmu_tx_assign(T2, ...)
1155 * 2 <- dmu_tx_get_txg(T2)
1156 * dmu_tx_assign(T3, ...)
1157 * 1 <- dmu_tx_get_txg(T3)
1158 */
1159 int
dmu_tx_assign(dmu_tx_t * tx,uint64_t txg_how)1160 dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
1161 {
1162 int err;
1163
1164 ASSERT(tx->tx_txg == 0);
1165 ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
1166 ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1167
1168 /* If we might wait, we must not hold the config lock. */
1169 IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
1170
1171 if ((txg_how & TXG_NOTHROTTLE))
1172 tx->tx_dirty_delayed = B_TRUE;
1173
1174 while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1175 dmu_tx_unassign(tx);
1176
1177 if (err != ERESTART || !(txg_how & TXG_WAIT))
1178 return (err);
1179
1180 dmu_tx_wait(tx);
1181 }
1182
1183 txg_rele_to_quiesce(&tx->tx_txgh);
1184
1185 return (0);
1186 }
1187
1188 void
dmu_tx_wait(dmu_tx_t * tx)1189 dmu_tx_wait(dmu_tx_t *tx)
1190 {
1191 spa_t *spa = tx->tx_pool->dp_spa;
1192 dsl_pool_t *dp = tx->tx_pool;
1193 hrtime_t before;
1194
1195 ASSERT(tx->tx_txg == 0);
1196 ASSERT(!dsl_pool_config_held(tx->tx_pool));
1197
1198 before = gethrtime();
1199
1200 if (tx->tx_wait_dirty) {
1201 uint64_t dirty;
1202
1203 /*
1204 * dmu_tx_try_assign() has determined that we need to wait
1205 * because we've consumed much or all of the dirty buffer
1206 * space.
1207 */
1208 mutex_enter(&dp->dp_lock);
1209 if (dp->dp_dirty_total >= zfs_dirty_data_max)
1210 DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
1211 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1212 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1213 dirty = dp->dp_dirty_total;
1214 mutex_exit(&dp->dp_lock);
1215
1216 dmu_tx_delay(tx, dirty);
1217
1218 tx->tx_wait_dirty = B_FALSE;
1219
1220 /*
1221 * Note: setting tx_dirty_delayed only has effect if the
1222 * caller used TX_WAIT. Otherwise they are going to
1223 * destroy this tx and try again. The common case,
1224 * zfs_write(), uses TX_WAIT.
1225 */
1226 tx->tx_dirty_delayed = B_TRUE;
1227 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1228 /*
1229 * If the pool is suspended we need to wait until it
1230 * is resumed. Note that it's possible that the pool
1231 * has become active after this thread has tried to
1232 * obtain a tx. If that's the case then tx_lasttried_txg
1233 * would not have been set.
1234 */
1235 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1236 } else if (tx->tx_needassign_txh) {
1237 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1238
1239 mutex_enter(&dn->dn_mtx);
1240 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1241 cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1242 mutex_exit(&dn->dn_mtx);
1243 tx->tx_needassign_txh = NULL;
1244 } else {
1245 /*
1246 * If we have a lot of dirty data just wait until we sync
1247 * out a TXG at which point we'll hopefully have synced
1248 * a portion of the changes.
1249 */
1250 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1251 }
1252
1253 spa_tx_assign_add_nsecs(spa, gethrtime() - before);
1254 }
1255
1256 static void
dmu_tx_destroy(dmu_tx_t * tx)1257 dmu_tx_destroy(dmu_tx_t *tx)
1258 {
1259 dmu_tx_hold_t *txh;
1260
1261 while ((txh = list_head(&tx->tx_holds)) != NULL) {
1262 dnode_t *dn = txh->txh_dnode;
1263
1264 list_remove(&tx->tx_holds, txh);
1265 zfs_refcount_destroy_many(&txh->txh_space_towrite,
1266 zfs_refcount_count(&txh->txh_space_towrite));
1267 zfs_refcount_destroy_many(&txh->txh_memory_tohold,
1268 zfs_refcount_count(&txh->txh_memory_tohold));
1269 kmem_free(txh, sizeof (dmu_tx_hold_t));
1270 if (dn != NULL)
1271 dnode_rele(dn, tx);
1272 }
1273
1274 list_destroy(&tx->tx_callbacks);
1275 list_destroy(&tx->tx_holds);
1276 kmem_free(tx, sizeof (dmu_tx_t));
1277 }
1278
1279 void
dmu_tx_commit(dmu_tx_t * tx)1280 dmu_tx_commit(dmu_tx_t *tx)
1281 {
1282 ASSERT(tx->tx_txg != 0);
1283
1284 /*
1285 * Go through the transaction's hold list and remove holds on
1286 * associated dnodes, notifying waiters if no holds remain.
1287 */
1288 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
1289 txh = list_next(&tx->tx_holds, txh)) {
1290 dnode_t *dn = txh->txh_dnode;
1291
1292 if (dn == NULL)
1293 continue;
1294
1295 mutex_enter(&dn->dn_mtx);
1296 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1297
1298 if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1299 dn->dn_assigned_txg = 0;
1300 cv_broadcast(&dn->dn_notxholds);
1301 }
1302 mutex_exit(&dn->dn_mtx);
1303 }
1304
1305 if (tx->tx_tempreserve_cookie)
1306 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1307
1308 if (!list_is_empty(&tx->tx_callbacks))
1309 txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1310
1311 if (tx->tx_anyobj == FALSE)
1312 txg_rele_to_sync(&tx->tx_txgh);
1313
1314 dmu_tx_destroy(tx);
1315 }
1316
1317 void
dmu_tx_abort(dmu_tx_t * tx)1318 dmu_tx_abort(dmu_tx_t *tx)
1319 {
1320 ASSERT(tx->tx_txg == 0);
1321
1322 /*
1323 * Call any registered callbacks with an error code.
1324 */
1325 if (!list_is_empty(&tx->tx_callbacks))
1326 dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED));
1327
1328 dmu_tx_destroy(tx);
1329 }
1330
1331 uint64_t
dmu_tx_get_txg(dmu_tx_t * tx)1332 dmu_tx_get_txg(dmu_tx_t *tx)
1333 {
1334 ASSERT(tx->tx_txg != 0);
1335 return (tx->tx_txg);
1336 }
1337
1338 dsl_pool_t *
dmu_tx_pool(dmu_tx_t * tx)1339 dmu_tx_pool(dmu_tx_t *tx)
1340 {
1341 ASSERT(tx->tx_pool != NULL);
1342 return (tx->tx_pool);
1343 }
1344
1345 void
dmu_tx_callback_register(dmu_tx_t * tx,dmu_tx_callback_func_t * func,void * data)1346 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1347 {
1348 dmu_tx_callback_t *dcb;
1349
1350 dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1351
1352 dcb->dcb_func = func;
1353 dcb->dcb_data = data;
1354
1355 list_insert_tail(&tx->tx_callbacks, dcb);
1356 }
1357
1358 /*
1359 * Call all the commit callbacks on a list, with a given error code.
1360 */
1361 void
dmu_tx_do_callbacks(list_t * cb_list,int error)1362 dmu_tx_do_callbacks(list_t *cb_list, int error)
1363 {
1364 dmu_tx_callback_t *dcb;
1365
1366 while ((dcb = list_tail(cb_list)) != NULL) {
1367 list_remove(cb_list, dcb);
1368 dcb->dcb_func(dcb->dcb_data, error);
1369 kmem_free(dcb, sizeof (dmu_tx_callback_t));
1370 }
1371 }
1372
/*
 * Interface to hold a bunch of attributes; used for creating new files.
 * attrsize is the total size of all attributes to be added during
 * object creation.
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */
1381
1382 /*
1383 * hold necessary attribute name for attribute registration.
1384 * should be a very rare case where this is needed. If it does
1385 * happen it would only happen on the first write to the file system.
1386 */
1387 static void
dmu_tx_sa_registration_hold(sa_os_t * sa,dmu_tx_t * tx)1388 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1389 {
1390 if (!sa->sa_need_attr_registration)
1391 return;
1392
1393 for (int i = 0; i != sa->sa_num_attrs; i++) {
1394 if (!sa->sa_attr_table[i].sa_registered) {
1395 if (sa->sa_reg_attr_obj)
1396 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1397 B_TRUE, sa->sa_attr_table[i].sa_name);
1398 else
1399 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1400 B_TRUE, sa->sa_attr_table[i].sa_name);
1401 }
1402 }
1403 }
1404
1405 void
dmu_tx_hold_spill(dmu_tx_t * tx,uint64_t object)1406 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1407 {
1408 dmu_tx_hold_t *txh;
1409
1410 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1411 THT_SPILL, 0, 0);
1412 if (txh != NULL)
1413 (void) zfs_refcount_add_many(&txh->txh_space_towrite,
1414 SPA_OLD_MAXBLOCKSIZE, FTAG);
1415 }
1416
1417 void
dmu_tx_hold_sa_create(dmu_tx_t * tx,int attrsize)1418 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1419 {
1420 sa_os_t *sa = tx->tx_objset->os_sa;
1421
1422 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1423
1424 if (tx->tx_objset->os_sa->sa_master_obj == 0)
1425 return;
1426
1427 if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
1428 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1429 } else {
1430 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1431 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1432 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1433 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1434 }
1435
1436 dmu_tx_sa_registration_hold(sa, tx);
1437
1438 if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
1439 return;
1440
1441 (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1442 THT_SPILL, 0, 0);
1443 }
1444
1445 /*
1446 * Hold SA attribute
1447 *
1448 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1449 *
1450 * variable_size is the total size of all variable sized attributes
1451 * passed to this function. It is not the total size of all
1452 * variable size attributes that *may* exist on this object.
1453 */
1454 void
dmu_tx_hold_sa(dmu_tx_t * tx,sa_handle_t * hdl,boolean_t may_grow)1455 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1456 {
1457 uint64_t object;
1458 sa_os_t *sa = tx->tx_objset->os_sa;
1459
1460 ASSERT(hdl != NULL);
1461
1462 object = sa_handle_object(hdl);
1463
1464 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1465 DB_DNODE_ENTER(db);
1466 dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db));
1467 DB_DNODE_EXIT(db);
1468
1469 if (tx->tx_objset->os_sa->sa_master_obj == 0)
1470 return;
1471
1472 if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1473 tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1474 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1475 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1476 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1477 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1478 }
1479
1480 dmu_tx_sa_registration_hold(sa, tx);
1481
1482 if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1483 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1484
1485 if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1486 ASSERT(tx->tx_txg == 0);
1487 dmu_tx_hold_spill(tx, object);
1488 } else {
1489 dnode_t *dn;
1490
1491 DB_DNODE_ENTER(db);
1492 dn = DB_DNODE(db);
1493 if (dn->dn_have_spill) {
1494 ASSERT(tx->tx_txg == 0);
1495 dmu_tx_hold_spill(tx, object);
1496 }
1497 DB_DNODE_EXIT(db);
1498 }
1499 }
1500
1501 void
dmu_tx_init(void)1502 dmu_tx_init(void)
1503 {
1504 dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
1505 KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
1506 KSTAT_FLAG_VIRTUAL);
1507
1508 if (dmu_tx_ksp != NULL) {
1509 dmu_tx_ksp->ks_data = &dmu_tx_stats;
1510 kstat_install(dmu_tx_ksp);
1511 }
1512 }
1513
1514 void
dmu_tx_fini(void)1515 dmu_tx_fini(void)
1516 {
1517 if (dmu_tx_ksp != NULL) {
1518 kstat_delete(dmu_tx_ksp);
1519 dmu_tx_ksp = NULL;
1520 }
1521 }
1522
/* Symbols exported for kernel builds. */
#ifdef _KERNEL
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_append);
EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_zap);
EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_bonus);
EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
EXPORT_SYMBOL(dmu_tx_abort);
EXPORT_SYMBOL(dmu_tx_assign);
EXPORT_SYMBOL(dmu_tx_wait);
EXPORT_SYMBOL(dmu_tx_commit);
EXPORT_SYMBOL(dmu_tx_mark_netfree);
EXPORT_SYMBOL(dmu_tx_get_txg);
EXPORT_SYMBOL(dmu_tx_callback_register);
EXPORT_SYMBOL(dmu_tx_do_callbacks);
EXPORT_SYMBOL(dmu_tx_hold_spill);
EXPORT_SYMBOL(dmu_tx_hold_sa_create);
EXPORT_SYMBOL(dmu_tx_hold_sa);
#endif /* _KERNEL */
1547