1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24 * All rights reserved.
25 * Copyright (c) 2013 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Joyent, Inc. All rights reserved.
27 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
28 */
29
30 #include <sys/dmu.h>
31 #include <sys/dmu_objset.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/dsl_dataset.h>
34 #include <sys/dsl_dir.h>
35 #include <sys/dsl_prop.h>
36 #include <sys/dsl_synctask.h>
37 #include <sys/dsl_deleg.h>
38 #include <sys/dmu_impl.h>
39 #include <sys/spa.h>
40 #include <sys/metaslab.h>
41 #include <sys/zap.h>
42 #include <sys/zio.h>
43 #include <sys/arc.h>
44 #include <sys/sunddi.h>
45 #include <sys/zvol.h>
46 #ifdef _KERNEL
47 #include <sys/zfs_vfsops.h>
48 #endif
49 #include <sys/zfeature.h>
50 #include <sys/policy.h>
51 #include <sys/zfs_znode.h>
52 #include "zfs_namecheck.h"
53 #include "zfs_prop.h"
54
55 /*
56 * Filesystem and Snapshot Limits
57 * ------------------------------
58 *
59 * These limits are used to restrict the number of filesystems and/or snapshots
60 * that can be created at a given level in the tree or below. A typical
61 * use-case is with a delegated dataset where the administrator wants to ensure
62 * that a user within the zone is not creating too many additional filesystems
63 * or snapshots, even though they're not exceeding their space quota.
64 *
65 * The filesystem and snapshot counts are stored as extensible properties. This
66 * capability is controlled by a feature flag and must be enabled to be used.
67 * Once enabled, the feature is not active until the first limit is set. At
68 * that point, future operations to create/destroy filesystems or snapshots
69 * will validate and update the counts.
70 *
71 * Because the count properties will not exist before the feature is active,
72 * the counts are updated when a limit is first set on an uninitialized
73 * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
74 * all of the nested filesystems/snapshots. Thus, a new leaf node has a
75 * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
76 * snapshot count properties on a node indicate uninitialized counts on that
77 * node.) When first setting a limit on an uninitialized node, the code starts
78 * at the filesystem with the new limit and descends into all sub-filesystems
79 * to add the count properties.
80 *
81 * In practice this is lightweight since a limit is typically set when the
82 * filesystem is created and thus has no children. Once valid, changing the
83 * limit value won't require a re-traversal since the counts are already valid.
84 * When recursively fixing the counts, if a node with a limit is encountered
85 * during the descent, the counts are known to be valid and there is no need to
86 * descend into that filesystem's children. The counts on filesystems above the
87 * one with the new limit will still be uninitialized, unless a limit is
88 * eventually set on one of those filesystems. The counts are always recursively
89 * updated when a limit is set on a dataset, unless there is already a limit.
90 * When a new limit value is set on a filesystem with an existing limit, it is
91 * possible for the new limit to be less than the current count at that level
92 * since a user who can change the limit is also allowed to exceed the limit.
93 *
94 * Once the feature is active, then whenever a filesystem or snapshot is
95 * created, the code recurses up the tree, validating the new count against the
96 * limit at each initialized level. In practice, most levels will not have a
97 * limit set. If there is a limit at any initialized level up the tree, the
98 * check must pass or the creation will fail. Likewise, when a filesystem or
99 * snapshot is destroyed, the counts are recursively adjusted all the way up
 * the initialized nodes in the tree. Renaming a filesystem into a different
 * point in the tree will first validate, then update the counts on each branch
 * up to the common ancestor. A receive will also validate the counts and then
 * update them.
104 *
105 * An exception to the above behavior is that the limit is not enforced if the
106 * user has permission to modify the limit. This is primarily so that
107 * recursive snapshots in the global zone always work. We want to prevent a
108 * denial-of-service in which a lower level delegated dataset could max out its
109 * limit and thus block recursive snapshots from being taken in the global zone.
110 * Because of this, it is possible for the snapshot count to be over the limit
111 * and snapshots taken in the global zone could cause a lower level dataset to
112 * hit or exceed its limit. The administrator taking the global zone recursive
113 * snapshot should be aware of this side-effect and behave accordingly.
114 * For consistency, the filesystem limit is also not enforced if the user can
115 * modify the limit.
116 *
117 * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
118 * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
119 * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
120 * dsl_dir_init_fs_ss_count().
121 *
122 * There is a special case when we receive a filesystem that already exists. In
123 * this case a temporary clone name of %X is created (see dmu_recv_begin). We
124 * never update the filesystem counts for temporary clones.
125 *
126 * Likewise, we do not update the snapshot counts for temporary snapshots,
127 * such as those created by zfs diff.
128 */
129
130 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
131
132 /* ARGSUSED */
133 static void
dsl_dir_evict(dmu_buf_t * db,void * arg)134 dsl_dir_evict(dmu_buf_t *db, void *arg)
135 {
136 dsl_dir_t *dd = arg;
137 dsl_pool_t *dp = dd->dd_pool;
138 int t;
139
140 for (t = 0; t < TXG_SIZE; t++) {
141 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
142 ASSERT(dd->dd_tempreserved[t] == 0);
143 ASSERT(dd->dd_space_towrite[t] == 0);
144 }
145
146 if (dd->dd_parent)
147 dsl_dir_rele(dd->dd_parent, dd);
148
149 spa_close(dd->dd_pool->dp_spa, dd);
150
151 /*
152 * The props callback list should have been cleaned up by
153 * objset_evict().
154 */
155 list_destroy(&dd->dd_prop_cbs);
156 mutex_destroy(&dd->dd_lock);
157 kmem_free(dd, sizeof (dsl_dir_t));
158 }
159
160 int
dsl_dir_hold_obj(dsl_pool_t * dp,uint64_t ddobj,const char * tail,void * tag,dsl_dir_t ** ddp)161 dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
162 const char *tail, void *tag, dsl_dir_t **ddp)
163 {
164 dmu_buf_t *dbuf;
165 dsl_dir_t *dd;
166 int err;
167
168 ASSERT(dsl_pool_config_held(dp));
169
170 err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
171 if (err != 0)
172 return (err);
173 dd = dmu_buf_get_user(dbuf);
174 #ifdef ZFS_DEBUG
175 {
176 dmu_object_info_t doi;
177 dmu_object_info_from_db(dbuf, &doi);
178 ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
179 ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
180 }
181 #endif
182 if (dd == NULL) {
183 dsl_dir_t *winner;
184
185 dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
186 dd->dd_object = ddobj;
187 dd->dd_dbuf = dbuf;
188 dd->dd_pool = dp;
189 dd->dd_phys = dbuf->db_data;
190 mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
191
192 list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
193 offsetof(dsl_prop_cb_record_t, cbr_node));
194
195 dsl_dir_snap_cmtime_update(dd);
196
197 if (dd->dd_phys->dd_parent_obj) {
198 err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
199 NULL, dd, &dd->dd_parent);
200 if (err != 0)
201 goto errout;
202 if (tail) {
203 #ifdef ZFS_DEBUG
204 uint64_t foundobj;
205
206 err = zap_lookup(dp->dp_meta_objset,
207 dd->dd_parent->dd_phys->dd_child_dir_zapobj,
208 tail, sizeof (foundobj), 1, &foundobj);
209 ASSERT(err || foundobj == ddobj);
210 #endif
211 (void) strcpy(dd->dd_myname, tail);
212 } else {
213 err = zap_value_search(dp->dp_meta_objset,
214 dd->dd_parent->dd_phys->dd_child_dir_zapobj,
215 ddobj, 0, dd->dd_myname);
216 }
217 if (err != 0)
218 goto errout;
219 } else {
220 (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
221 }
222
223 if (dsl_dir_is_clone(dd)) {
224 dmu_buf_t *origin_bonus;
225 dsl_dataset_phys_t *origin_phys;
226
227 /*
228 * We can't open the origin dataset, because
229 * that would require opening this dsl_dir.
230 * Just look at its phys directly instead.
231 */
232 err = dmu_bonus_hold(dp->dp_meta_objset,
233 dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
234 if (err != 0)
235 goto errout;
236 origin_phys = origin_bonus->db_data;
237 dd->dd_origin_txg =
238 origin_phys->ds_creation_txg;
239 dmu_buf_rele(origin_bonus, FTAG);
240 }
241
242 winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
243 dsl_dir_evict);
244 if (winner) {
245 if (dd->dd_parent)
246 dsl_dir_rele(dd->dd_parent, dd);
247 mutex_destroy(&dd->dd_lock);
248 kmem_free(dd, sizeof (dsl_dir_t));
249 dd = winner;
250 } else {
251 spa_open_ref(dp->dp_spa, dd);
252 }
253 }
254
255 /*
256 * The dsl_dir_t has both open-to-close and instantiate-to-evict
257 * holds on the spa. We need the open-to-close holds because
258 * otherwise the spa_refcnt wouldn't change when we open a
259 * dir which the spa also has open, so we could incorrectly
260 * think it was OK to unload/export/destroy the pool. We need
261 * the instantiate-to-evict hold because the dsl_dir_t has a
262 * pointer to the dd_pool, which has a pointer to the spa_t.
263 */
264 spa_open_ref(dp->dp_spa, tag);
265 ASSERT3P(dd->dd_pool, ==, dp);
266 ASSERT3U(dd->dd_object, ==, ddobj);
267 ASSERT3P(dd->dd_dbuf, ==, dbuf);
268 *ddp = dd;
269 return (0);
270
271 errout:
272 if (dd->dd_parent)
273 dsl_dir_rele(dd->dd_parent, dd);
274 mutex_destroy(&dd->dd_lock);
275 kmem_free(dd, sizeof (dsl_dir_t));
276 dmu_buf_rele(dbuf, tag);
277 return (err);
278 }
279
280 void
dsl_dir_rele(dsl_dir_t * dd,void * tag)281 dsl_dir_rele(dsl_dir_t *dd, void *tag)
282 {
283 dprintf_dd(dd, "%s\n", "");
284 spa_close(dd->dd_pool->dp_spa, tag);
285 dmu_buf_rele(dd->dd_dbuf, tag);
286 }
287
288 /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
289 void
dsl_dir_name(dsl_dir_t * dd,char * buf)290 dsl_dir_name(dsl_dir_t *dd, char *buf)
291 {
292 if (dd->dd_parent) {
293 dsl_dir_name(dd->dd_parent, buf);
294 (void) strcat(buf, "/");
295 } else {
296 buf[0] = '\0';
297 }
298 if (!MUTEX_HELD(&dd->dd_lock)) {
299 /*
300 * recursive mutex so that we can use
301 * dprintf_dd() with dd_lock held
302 */
303 mutex_enter(&dd->dd_lock);
304 (void) strcat(buf, dd->dd_myname);
305 mutex_exit(&dd->dd_lock);
306 } else {
307 (void) strcat(buf, dd->dd_myname);
308 }
309 }
310
311 /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
312 int
dsl_dir_namelen(dsl_dir_t * dd)313 dsl_dir_namelen(dsl_dir_t *dd)
314 {
315 int result = 0;
316
317 if (dd->dd_parent) {
318 /* parent's name + 1 for the "/" */
319 result = dsl_dir_namelen(dd->dd_parent) + 1;
320 }
321
322 if (!MUTEX_HELD(&dd->dd_lock)) {
323 /* see dsl_dir_name */
324 mutex_enter(&dd->dd_lock);
325 result += strlen(dd->dd_myname);
326 mutex_exit(&dd->dd_lock);
327 } else {
328 result += strlen(dd->dd_myname);
329 }
330
331 return (result);
332 }
333
334 static int
getcomponent(const char * path,char * component,const char ** nextp)335 getcomponent(const char *path, char *component, const char **nextp)
336 {
337 char *p;
338
339 if ((path == NULL) || (path[0] == '\0'))
340 return (SET_ERROR(ENOENT));
341 /* This would be a good place to reserve some namespace... */
342 p = strpbrk(path, "/@");
343 if (p && (p[1] == '/' || p[1] == '@')) {
344 /* two separators in a row */
345 return (SET_ERROR(EINVAL));
346 }
347 if (p == NULL || p == path) {
348 /*
349 * if the first thing is an @ or /, it had better be an
350 * @ and it had better not have any more ats or slashes,
351 * and it had better have something after the @.
352 */
353 if (p != NULL &&
354 (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
355 return (SET_ERROR(EINVAL));
356 if (strlen(path) >= MAXNAMELEN)
357 return (SET_ERROR(ENAMETOOLONG));
358 (void) strcpy(component, path);
359 p = NULL;
360 } else if (p[0] == '/') {
361 if (p - path >= MAXNAMELEN)
362 return (SET_ERROR(ENAMETOOLONG));
363 (void) strncpy(component, path, p - path);
364 component[p - path] = '\0';
365 p++;
366 } else if (p[0] == '@') {
367 /*
368 * if the next separator is an @, there better not be
369 * any more slashes.
370 */
371 if (strchr(path, '/'))
372 return (SET_ERROR(EINVAL));
373 if (p - path >= MAXNAMELEN)
374 return (SET_ERROR(ENAMETOOLONG));
375 (void) strncpy(component, path, p - path);
376 component[p - path] = '\0';
377 } else {
378 panic("invalid p=%p", (void *)p);
379 }
380 *nextp = p;
381 return (0);
382 }
383
384 /*
385 * Return the dsl_dir_t, and possibly the last component which couldn't
386 * be found in *tail. The name must be in the specified dsl_pool_t. This
387 * thread must hold the dp_config_rwlock for the pool. Returns NULL if the
388 * path is bogus, or if tail==NULL and we couldn't parse the whole name.
389 * (*tail)[0] == '@' means that the last component is a snapshot.
390 */
391 int
dsl_dir_hold(dsl_pool_t * dp,const char * name,void * tag,dsl_dir_t ** ddp,const char ** tailp)392 dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
393 dsl_dir_t **ddp, const char **tailp)
394 {
395 char buf[MAXNAMELEN];
396 const char *spaname, *next, *nextnext = NULL;
397 int err;
398 dsl_dir_t *dd;
399 uint64_t ddobj;
400
401 err = getcomponent(name, buf, &next);
402 if (err != 0)
403 return (err);
404
405 /* Make sure the name is in the specified pool. */
406 spaname = spa_name(dp->dp_spa);
407 if (strcmp(buf, spaname) != 0)
408 return (SET_ERROR(EXDEV));
409
410 ASSERT(dsl_pool_config_held(dp));
411
412 err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
413 if (err != 0) {
414 return (err);
415 }
416
417 while (next != NULL) {
418 dsl_dir_t *child_ds;
419 err = getcomponent(next, buf, &nextnext);
420 if (err != 0)
421 break;
422 ASSERT(next[0] != '\0');
423 if (next[0] == '@')
424 break;
425 dprintf("looking up %s in obj%lld\n",
426 buf, dd->dd_phys->dd_child_dir_zapobj);
427
428 err = zap_lookup(dp->dp_meta_objset,
429 dd->dd_phys->dd_child_dir_zapobj,
430 buf, sizeof (ddobj), 1, &ddobj);
431 if (err != 0) {
432 if (err == ENOENT)
433 err = 0;
434 break;
435 }
436
437 err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
438 if (err != 0)
439 break;
440 dsl_dir_rele(dd, tag);
441 dd = child_ds;
442 next = nextnext;
443 }
444
445 if (err != 0) {
446 dsl_dir_rele(dd, tag);
447 return (err);
448 }
449
450 /*
451 * It's an error if there's more than one component left, or
452 * tailp==NULL and there's any component left.
453 */
454 if (next != NULL &&
455 (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
456 /* bad path name */
457 dsl_dir_rele(dd, tag);
458 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
459 err = SET_ERROR(ENOENT);
460 }
461 if (tailp != NULL)
462 *tailp = next;
463 *ddp = dd;
464 return (err);
465 }
466
467 /*
468 * If the counts are already initialized for this filesystem and its
469 * descendants then do nothing, otherwise initialize the counts.
470 *
471 * The counts on this filesystem, and those below, may be uninitialized due to
472 * either the use of a pre-existing pool which did not support the
473 * filesystem/snapshot limit feature, or one in which the feature had not yet
474 * been enabled.
475 *
476 * Recursively descend the filesystem tree and update the filesystem/snapshot
477 * counts on each filesystem below, then update the cumulative count on the
478 * current filesystem. If the filesystem already has a count set on it,
479 * then we know that its counts, and the counts on the filesystems below it,
480 * are already correct, so we don't have to update this filesystem.
481 */
482 static void
dsl_dir_init_fs_ss_count(dsl_dir_t * dd,dmu_tx_t * tx)483 dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
484 {
485 uint64_t my_fs_cnt = 0;
486 uint64_t my_ss_cnt = 0;
487 dsl_pool_t *dp = dd->dd_pool;
488 objset_t *os = dp->dp_meta_objset;
489 zap_cursor_t *zc;
490 zap_attribute_t *za;
491 dsl_dataset_t *ds;
492
493 ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
494 ASSERT(dsl_pool_config_held(dp));
495 ASSERT(dmu_tx_is_syncing(tx));
496
497 dsl_dir_zapify(dd, tx);
498
499 /*
500 * If the filesystem count has already been initialized then we
501 * don't need to recurse down any further.
502 */
503 if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
504 return;
505
506 zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
507 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
508
509 /* Iterate my child dirs */
510 for (zap_cursor_init(zc, os, dd->dd_phys->dd_child_dir_zapobj);
511 zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
512 dsl_dir_t *chld_dd;
513 uint64_t count;
514
515 VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
516 &chld_dd));
517
518 /*
519 * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
520 * temporary datasets.
521 */
522 if (chld_dd->dd_myname[0] == '$' ||
523 chld_dd->dd_myname[0] == '%') {
524 dsl_dir_rele(chld_dd, FTAG);
525 continue;
526 }
527
528 my_fs_cnt++; /* count this child */
529
530 dsl_dir_init_fs_ss_count(chld_dd, tx);
531
532 VERIFY0(zap_lookup(os, chld_dd->dd_object,
533 DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
534 my_fs_cnt += count;
535 VERIFY0(zap_lookup(os, chld_dd->dd_object,
536 DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
537 my_ss_cnt += count;
538
539 dsl_dir_rele(chld_dd, FTAG);
540 }
541 zap_cursor_fini(zc);
542 /* Count my snapshots (we counted children's snapshots above) */
543 VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
544 dd->dd_phys->dd_head_dataset_obj, FTAG, &ds));
545
546 for (zap_cursor_init(zc, os, ds->ds_phys->ds_snapnames_zapobj);
547 zap_cursor_retrieve(zc, za) == 0;
548 zap_cursor_advance(zc)) {
549 /* Don't count temporary snapshots */
550 if (za->za_name[0] != '%')
551 my_ss_cnt++;
552 }
553 zap_cursor_fini(zc);
554
555 dsl_dataset_rele(ds, FTAG);
556
557 kmem_free(zc, sizeof (zap_cursor_t));
558 kmem_free(za, sizeof (zap_attribute_t));
559
560 /* we're in a sync task, update counts */
561 dmu_buf_will_dirty(dd->dd_dbuf, tx);
562 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
563 sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
564 VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
565 sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
566 }
567
568 static int
dsl_dir_actv_fs_ss_limit_check(void * arg,dmu_tx_t * tx)569 dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
570 {
571 char *ddname = (char *)arg;
572 dsl_pool_t *dp = dmu_tx_pool(tx);
573 dsl_dataset_t *ds;
574 dsl_dir_t *dd;
575 int error;
576
577 error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
578 if (error != 0)
579 return (error);
580
581 if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
582 dsl_dataset_rele(ds, FTAG);
583 return (SET_ERROR(ENOTSUP));
584 }
585
586 dd = ds->ds_dir;
587 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
588 dsl_dir_is_zapified(dd) &&
589 zap_contains(dp->dp_meta_objset, dd->dd_object,
590 DD_FIELD_FILESYSTEM_COUNT) == 0) {
591 dsl_dataset_rele(ds, FTAG);
592 return (SET_ERROR(EALREADY));
593 }
594
595 dsl_dataset_rele(ds, FTAG);
596 return (0);
597 }
598
599 static void
dsl_dir_actv_fs_ss_limit_sync(void * arg,dmu_tx_t * tx)600 dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
601 {
602 char *ddname = (char *)arg;
603 dsl_pool_t *dp = dmu_tx_pool(tx);
604 dsl_dataset_t *ds;
605 spa_t *spa;
606
607 VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
608
609 spa = dsl_dataset_get_spa(ds);
610
611 if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
612 /*
613 * Since the feature was not active and we're now setting a
614 * limit, increment the feature-active counter so that the
615 * feature becomes active for the first time.
616 *
617 * We are already in a sync task so we can update the MOS.
618 */
619 spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
620 }
621
622 /*
623 * Since we are now setting a non-UINT64_MAX limit on the filesystem,
624 * we need to ensure the counts are correct. Descend down the tree from
625 * this point and update all of the counts to be accurate.
626 */
627 dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
628
629 dsl_dataset_rele(ds, FTAG);
630 }
631
632 /*
633 * Make sure the feature is enabled and activate it if necessary.
634 * Since we're setting a limit, ensure the on-disk counts are valid.
635 * This is only called by the ioctl path when setting a limit value.
636 *
637 * We do not need to validate the new limit, since users who can change the
638 * limit are also allowed to exceed the limit.
639 */
640 int
dsl_dir_activate_fs_ss_limit(const char * ddname)641 dsl_dir_activate_fs_ss_limit(const char *ddname)
642 {
643 int error;
644
645 error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
646 dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0);
647
648 if (error == EALREADY)
649 error = 0;
650
651 return (error);
652 }
653
654 /*
655 * Used to determine if the filesystem_limit or snapshot_limit should be
656 * enforced. We allow the limit to be exceeded if the user has permission to
657 * write the property value. We pass in the creds that we got in the open
658 * context since we will always be the GZ root in syncing context. We also have
659 * to handle the case where we are allowed to change the limit on the current
660 * dataset, but there may be another limit in the tree above.
661 *
662 * We can never modify these two properties within a non-global zone. In
663 * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
664 * can't use that function since we are already holding the dp_config_rwlock.
665 * In addition, we already have the dd and dealing with snapshots is simplified
666 * in this code.
667 */
668
669 typedef enum {
670 ENFORCE_ALWAYS,
671 ENFORCE_NEVER,
672 ENFORCE_ABOVE
673 } enforce_res_t;
674
675 static enforce_res_t
dsl_enforce_ds_ss_limits(dsl_dir_t * dd,zfs_prop_t prop,cred_t * cr)676 dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
677 {
678 enforce_res_t enforce = ENFORCE_ALWAYS;
679 uint64_t obj;
680 dsl_dataset_t *ds;
681 uint64_t zoned;
682
683 ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
684 prop == ZFS_PROP_SNAPSHOT_LIMIT);
685
686 #ifdef _KERNEL
687 #ifdef __FreeBSD__
688 if (jailed(cr))
689 #else
690 if (crgetzoneid(cr) != GLOBAL_ZONEID)
691 #endif
692 return (ENFORCE_ALWAYS);
693
694 if (secpolicy_zfs(cr) == 0)
695 return (ENFORCE_NEVER);
696 #endif
697
698 if ((obj = dd->dd_phys->dd_head_dataset_obj) == 0)
699 return (ENFORCE_ALWAYS);
700
701 ASSERT(dsl_pool_config_held(dd->dd_pool));
702
703 if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
704 return (ENFORCE_ALWAYS);
705
706 if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
707 /* Only root can access zoned fs's from the GZ */
708 enforce = ENFORCE_ALWAYS;
709 } else {
710 if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
711 enforce = ENFORCE_ABOVE;
712 }
713
714 dsl_dataset_rele(ds, FTAG);
715 return (enforce);
716 }
717
718 /*
719 * Check if adding additional child filesystem(s) would exceed any filesystem
720 * limits or adding additional snapshot(s) would exceed any snapshot limits.
721 * The prop argument indicates which limit to check.
722 *
723 * Note that all filesystem limits up to the root (or the highest
724 * initialized) filesystem or the given ancestor must be satisfied.
725 */
726 int
dsl_fs_ss_limit_check(dsl_dir_t * dd,uint64_t delta,zfs_prop_t prop,dsl_dir_t * ancestor,cred_t * cr)727 dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
728 dsl_dir_t *ancestor, cred_t *cr)
729 {
730 objset_t *os = dd->dd_pool->dp_meta_objset;
731 uint64_t limit, count;
732 char *count_prop;
733 enforce_res_t enforce;
734 int err = 0;
735
736 ASSERT(dsl_pool_config_held(dd->dd_pool));
737 ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
738 prop == ZFS_PROP_SNAPSHOT_LIMIT);
739
740 /*
741 * If we're allowed to change the limit, don't enforce the limit
742 * e.g. this can happen if a snapshot is taken by an administrative
743 * user in the global zone (i.e. a recursive snapshot by root).
744 * However, we must handle the case of delegated permissions where we
745 * are allowed to change the limit on the current dataset, but there
746 * is another limit in the tree above.
747 */
748 enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
749 if (enforce == ENFORCE_NEVER)
750 return (0);
751
752 /*
753 * e.g. if renaming a dataset with no snapshots, count adjustment
754 * is 0.
755 */
756 if (delta == 0)
757 return (0);
758
759 if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
760 /*
761 * We don't enforce the limit for temporary snapshots. This is
762 * indicated by a NULL cred_t argument.
763 */
764 if (cr == NULL)
765 return (0);
766
767 count_prop = DD_FIELD_SNAPSHOT_COUNT;
768 } else {
769 count_prop = DD_FIELD_FILESYSTEM_COUNT;
770 }
771
772 /*
773 * If an ancestor has been provided, stop checking the limit once we
774 * hit that dir. We need this during rename so that we don't overcount
775 * the check once we recurse up to the common ancestor.
776 */
777 if (ancestor == dd)
778 return (0);
779
780 /*
781 * If we hit an uninitialized node while recursing up the tree, we can
782 * stop since we know there is no limit here (or above). The counts are
783 * not valid on this node and we know we won't touch this node's counts.
784 */
785 if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
786 count_prop, sizeof (count), 1, &count) == ENOENT)
787 return (0);
788
789 err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
790 B_FALSE);
791 if (err != 0)
792 return (err);
793
794 /* Is there a limit which we've hit? */
795 if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
796 return (SET_ERROR(EDQUOT));
797
798 if (dd->dd_parent != NULL)
799 err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
800 ancestor, cr);
801
802 return (err);
803 }
804
805 /*
806 * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
807 * parents. When a new filesystem/snapshot is created, increment the count on
808 * all parents, and when a filesystem/snapshot is destroyed, decrement the
809 * count.
810 */
811 void
dsl_fs_ss_count_adjust(dsl_dir_t * dd,int64_t delta,const char * prop,dmu_tx_t * tx)812 dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
813 dmu_tx_t *tx)
814 {
815 int err;
816 objset_t *os = dd->dd_pool->dp_meta_objset;
817 uint64_t count;
818
819 ASSERT(dsl_pool_config_held(dd->dd_pool));
820 ASSERT(dmu_tx_is_syncing(tx));
821 ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
822 strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
823
824 /*
825 * When we receive an incremental stream into a filesystem that already
826 * exists, a temporary clone is created. We don't count this temporary
827 * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
828 * $MOS & $ORIGIN) objsets.
829 */
830 if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
831 strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
832 return;
833
834 /*
835 * e.g. if renaming a dataset with no snapshots, count adjustment is 0
836 */
837 if (delta == 0)
838 return;
839
840 /*
841 * If we hit an uninitialized node while recursing up the tree, we can
842 * stop since we know the counts are not valid on this node and we
843 * know we shouldn't touch this node's counts. An uninitialized count
844 * on the node indicates that either the feature has not yet been
845 * activated or there are no limits on this part of the tree.
846 */
847 if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
848 prop, sizeof (count), 1, &count)) == ENOENT)
849 return;
850 VERIFY0(err);
851
852 count += delta;
853 /* Use a signed verify to make sure we're not neg. */
854 VERIFY3S(count, >=, 0);
855
856 VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
857 tx));
858
859 /* Roll up this additional count into our ancestors */
860 if (dd->dd_parent != NULL)
861 dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
862 }
863
864 uint64_t
dsl_dir_create_sync(dsl_pool_t * dp,dsl_dir_t * pds,const char * name,dmu_tx_t * tx)865 dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
866 dmu_tx_t *tx)
867 {
868 objset_t *mos = dp->dp_meta_objset;
869 uint64_t ddobj;
870 dsl_dir_phys_t *ddphys;
871 dmu_buf_t *dbuf;
872
873 ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
874 DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
875 if (pds) {
876 VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
877 name, sizeof (uint64_t), 1, &ddobj, tx));
878 } else {
879 /* it's the root dir */
880 VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
881 DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
882 }
883 VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
884 dmu_buf_will_dirty(dbuf, tx);
885 ddphys = dbuf->db_data;
886
887 ddphys->dd_creation_time = gethrestime_sec();
888 if (pds) {
889 ddphys->dd_parent_obj = pds->dd_object;
890
891 /* update the filesystem counts */
892 dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
893 }
894 ddphys->dd_props_zapobj = zap_create(mos,
895 DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
896 ddphys->dd_child_dir_zapobj = zap_create(mos,
897 DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
898 if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
899 ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
900 dmu_buf_rele(dbuf, FTAG);
901
902 return (ddobj);
903 }
904
905 boolean_t
dsl_dir_is_clone(dsl_dir_t * dd)906 dsl_dir_is_clone(dsl_dir_t *dd)
907 {
908 return (dd->dd_phys->dd_origin_obj &&
909 (dd->dd_pool->dp_origin_snap == NULL ||
910 dd->dd_phys->dd_origin_obj !=
911 dd->dd_pool->dp_origin_snap->ds_object));
912 }
913
914 void
dsl_dir_stats(dsl_dir_t * dd,nvlist_t * nv)915 dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
916 {
917 mutex_enter(&dd->dd_lock);
918 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
919 dd->dd_phys->dd_used_bytes);
920 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
921 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
922 dd->dd_phys->dd_reserved);
923 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
924 dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
925 (dd->dd_phys->dd_uncompressed_bytes * 100 /
926 dd->dd_phys->dd_compressed_bytes));
927 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
928 dd->dd_phys->dd_uncompressed_bytes);
929 if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
930 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
931 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
932 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
933 dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
934 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
935 dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
936 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
937 dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
938 dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
939 }
940 mutex_exit(&dd->dd_lock);
941
942 if (dsl_dir_is_zapified(dd)) {
943 uint64_t count;
944 objset_t *os = dd->dd_pool->dp_meta_objset;
945
946 if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
947 sizeof (count), 1, &count) == 0) {
948 dsl_prop_nvlist_add_uint64(nv,
949 ZFS_PROP_FILESYSTEM_COUNT, count);
950 }
951 if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
952 sizeof (count), 1, &count) == 0) {
953 dsl_prop_nvlist_add_uint64(nv,
954 ZFS_PROP_SNAPSHOT_COUNT, count);
955 }
956 }
957
958 if (dsl_dir_is_clone(dd)) {
959 dsl_dataset_t *ds;
960 char buf[MAXNAMELEN];
961
962 VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
963 dd->dd_phys->dd_origin_obj, FTAG, &ds));
964 dsl_dataset_name(ds, buf);
965 dsl_dataset_rele(ds, FTAG);
966 dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
967 }
968 }
969
970 void
dsl_dir_dirty(dsl_dir_t * dd,dmu_tx_t * tx)971 dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
972 {
973 dsl_pool_t *dp = dd->dd_pool;
974
975 ASSERT(dd->dd_phys);
976
977 if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
978 /* up the hold count until we can be written out */
979 dmu_buf_add_ref(dd->dd_dbuf, dd);
980 }
981 }
982
983 static int64_t
parent_delta(dsl_dir_t * dd,uint64_t used,int64_t delta)984 parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
985 {
986 uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
987 uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
988 return (new_accounted - old_accounted);
989 }
990
991 void
dsl_dir_sync(dsl_dir_t * dd,dmu_tx_t * tx)992 dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
993 {
994 ASSERT(dmu_tx_is_syncing(tx));
995
996 mutex_enter(&dd->dd_lock);
997 ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
998 dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
999 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
1000 dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
1001 mutex_exit(&dd->dd_lock);
1002
1003 /* release the hold from dsl_dir_dirty */
1004 dmu_buf_rele(dd->dd_dbuf, dd);
1005 }
1006
1007 static uint64_t
dsl_dir_space_towrite(dsl_dir_t * dd)1008 dsl_dir_space_towrite(dsl_dir_t *dd)
1009 {
1010 uint64_t space = 0;
1011 int i;
1012
1013 ASSERT(MUTEX_HELD(&dd->dd_lock));
1014
1015 for (i = 0; i < TXG_SIZE; i++) {
1016 space += dd->dd_space_towrite[i&TXG_MASK];
1017 ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
1018 }
1019 return (space);
1020 }
1021
1022 /*
1023 * How much space would dd have available if ancestor had delta applied
1024 * to it? If ondiskonly is set, we're only interested in what's
1025 * on-disk, not estimated pending changes.
1026 */
1027 uint64_t
dsl_dir_space_available(dsl_dir_t * dd,dsl_dir_t * ancestor,int64_t delta,int ondiskonly)1028 dsl_dir_space_available(dsl_dir_t *dd,
1029 dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
1030 {
1031 uint64_t parentspace, myspace, quota, used;
1032
1033 /*
1034 * If there are no restrictions otherwise, assume we have
1035 * unlimited space available.
1036 */
1037 quota = UINT64_MAX;
1038 parentspace = UINT64_MAX;
1039
1040 if (dd->dd_parent != NULL) {
1041 parentspace = dsl_dir_space_available(dd->dd_parent,
1042 ancestor, delta, ondiskonly);
1043 }
1044
1045 mutex_enter(&dd->dd_lock);
1046 if (dd->dd_phys->dd_quota != 0)
1047 quota = dd->dd_phys->dd_quota;
1048 used = dd->dd_phys->dd_used_bytes;
1049 if (!ondiskonly)
1050 used += dsl_dir_space_towrite(dd);
1051
1052 if (dd->dd_parent == NULL) {
1053 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
1054 quota = MIN(quota, poolsize);
1055 }
1056
1057 if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
1058 /*
1059 * We have some space reserved, in addition to what our
1060 * parent gave us.
1061 */
1062 parentspace += dd->dd_phys->dd_reserved - used;
1063 }
1064
1065 if (dd == ancestor) {
1066 ASSERT(delta <= 0);
1067 ASSERT(used >= -delta);
1068 used += delta;
1069 if (parentspace != UINT64_MAX)
1070 parentspace -= delta;
1071 }
1072
1073 if (used > quota) {
1074 /* over quota */
1075 myspace = 0;
1076 } else {
1077 /*
1078 * the lesser of the space provided by our parent and
1079 * the space left in our quota
1080 */
1081 myspace = MIN(parentspace, quota - used);
1082 }
1083
1084 mutex_exit(&dd->dd_lock);
1085
1086 return (myspace);
1087 }
1088
/*
 * One entry of the list handed back as the cookie by
 * dsl_dir_tempreserve_space() and undone by dsl_dir_tempreserve_clear().
 */
struct tempreserve {
	list_node_t tr_node;	/* linkage on the reservation list */
	dsl_dir_t *tr_ds;	/* dir charged; NULL for the ARC entry */
	uint64_t tr_size;	/* bytes to give back when cleared */
};
1094
1095 static int
dsl_dir_tempreserve_impl(dsl_dir_t * dd,uint64_t asize,boolean_t netfree,boolean_t ignorequota,boolean_t checkrefquota,list_t * tr_list,dmu_tx_t * tx,boolean_t first)1096 dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
1097 boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
1098 dmu_tx_t *tx, boolean_t first)
1099 {
1100 uint64_t txg = tx->tx_txg;
1101 uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
1102 uint64_t deferred = 0;
1103 struct tempreserve *tr;
1104 int retval = EDQUOT;
1105 int txgidx = txg & TXG_MASK;
1106 int i;
1107 uint64_t ref_rsrv = 0;
1108
1109 ASSERT3U(txg, !=, 0);
1110 ASSERT3S(asize, >, 0);
1111
1112 mutex_enter(&dd->dd_lock);
1113
1114 /*
1115 * Check against the dsl_dir's quota. We don't add in the delta
1116 * when checking for over-quota because they get one free hit.
1117 */
1118 est_inflight = dsl_dir_space_towrite(dd);
1119 for (i = 0; i < TXG_SIZE; i++)
1120 est_inflight += dd->dd_tempreserved[i];
1121 used_on_disk = dd->dd_phys->dd_used_bytes;
1122
1123 /*
1124 * On the first iteration, fetch the dataset's used-on-disk and
1125 * refreservation values. Also, if checkrefquota is set, test if
1126 * allocating this space would exceed the dataset's refquota.
1127 */
1128 if (first && tx->tx_objset) {
1129 int error;
1130 dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
1131
1132 error = dsl_dataset_check_quota(ds, checkrefquota,
1133 asize, est_inflight, &used_on_disk, &ref_rsrv);
1134 if (error) {
1135 mutex_exit(&dd->dd_lock);
1136 return (error);
1137 }
1138 }
1139
1140 /*
1141 * If this transaction will result in a net free of space,
1142 * we want to let it through.
1143 */
1144 if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
1145 quota = UINT64_MAX;
1146 else
1147 quota = dd->dd_phys->dd_quota;
1148
1149 /*
1150 * Adjust the quota against the actual pool size at the root
1151 * minus any outstanding deferred frees.
1152 * To ensure that it's possible to remove files from a full
1153 * pool without inducing transient overcommits, we throttle
1154 * netfree transactions against a quota that is slightly larger,
1155 * but still within the pool's allocation slop. In cases where
1156 * we're very close to full, this will allow a steady trickle of
1157 * removes to get through.
1158 */
1159 if (dd->dd_parent == NULL) {
1160 spa_t *spa = dd->dd_pool->dp_spa;
1161 uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
1162 deferred = metaslab_class_get_deferred(spa_normal_class(spa));
1163 if (poolsize - deferred < quota) {
1164 quota = poolsize - deferred;
1165 retval = ENOSPC;
1166 }
1167 }
1168
1169 /*
1170 * If they are requesting more space, and our current estimate
1171 * is over quota, they get to try again unless the actual
1172 * on-disk is over quota and there are no pending changes (which
1173 * may free up space for us).
1174 */
1175 if (used_on_disk + est_inflight >= quota) {
1176 if (est_inflight > 0 || used_on_disk < quota ||
1177 (retval == ENOSPC && used_on_disk < quota + deferred))
1178 retval = ERESTART;
1179 dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
1180 "quota=%lluK tr=%lluK err=%d\n",
1181 used_on_disk>>10, est_inflight>>10,
1182 quota>>10, asize>>10, retval);
1183 mutex_exit(&dd->dd_lock);
1184 return (SET_ERROR(retval));
1185 }
1186
1187 /* We need to up our estimated delta before dropping dd_lock */
1188 dd->dd_tempreserved[txgidx] += asize;
1189
1190 parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
1191 asize - ref_rsrv);
1192 mutex_exit(&dd->dd_lock);
1193
1194 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
1195 tr->tr_ds = dd;
1196 tr->tr_size = asize;
1197 list_insert_tail(tr_list, tr);
1198
1199 /* see if it's OK with our parent */
1200 if (dd->dd_parent && parent_rsrv) {
1201 boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
1202
1203 return (dsl_dir_tempreserve_impl(dd->dd_parent,
1204 parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
1205 } else {
1206 return (0);
1207 }
1208 }
1209
1210 /*
1211 * Reserve space in this dsl_dir, to be used in this tx's txg.
1212 * After the space has been dirtied (and dsl_dir_willuse_space()
1213 * has been called), the reservation should be canceled, using
1214 * dsl_dir_tempreserve_clear().
1215 */
1216 int
dsl_dir_tempreserve_space(dsl_dir_t * dd,uint64_t lsize,uint64_t asize,uint64_t fsize,uint64_t usize,void ** tr_cookiep,dmu_tx_t * tx)1217 dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
1218 uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
1219 {
1220 int err;
1221 list_t *tr_list;
1222
1223 if (asize == 0) {
1224 *tr_cookiep = NULL;
1225 return (0);
1226 }
1227
1228 tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
1229 list_create(tr_list, sizeof (struct tempreserve),
1230 offsetof(struct tempreserve, tr_node));
1231 ASSERT3S(asize, >, 0);
1232 ASSERT3S(fsize, >=, 0);
1233
1234 err = arc_tempreserve_space(lsize, tx->tx_txg);
1235 if (err == 0) {
1236 struct tempreserve *tr;
1237
1238 tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
1239 tr->tr_size = lsize;
1240 list_insert_tail(tr_list, tr);
1241 } else {
1242 if (err == EAGAIN) {
1243 /*
1244 * If arc_memory_throttle() detected that pageout
1245 * is running and we are low on memory, we delay new
1246 * non-pageout transactions to give pageout an
1247 * advantage.
1248 *
1249 * It is unfortunate to be delaying while the caller's
1250 * locks are held.
1251 */
1252 txg_delay(dd->dd_pool, tx->tx_txg,
1253 MSEC2NSEC(10), MSEC2NSEC(10));
1254 err = SET_ERROR(ERESTART);
1255 }
1256 }
1257
1258 if (err == 0) {
1259 err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
1260 FALSE, asize > usize, tr_list, tx, TRUE);
1261 }
1262
1263 if (err != 0)
1264 dsl_dir_tempreserve_clear(tr_list, tx);
1265 else
1266 *tr_cookiep = tr_list;
1267
1268 return (err);
1269 }
1270
1271 /*
1272 * Clear a temporary reservation that we previously made with
1273 * dsl_dir_tempreserve_space().
1274 */
1275 void
dsl_dir_tempreserve_clear(void * tr_cookie,dmu_tx_t * tx)1276 dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
1277 {
1278 int txgidx = tx->tx_txg & TXG_MASK;
1279 list_t *tr_list = tr_cookie;
1280 struct tempreserve *tr;
1281
1282 ASSERT3U(tx->tx_txg, !=, 0);
1283
1284 if (tr_cookie == NULL)
1285 return;
1286
1287 while ((tr = list_head(tr_list)) != NULL) {
1288 if (tr->tr_ds) {
1289 mutex_enter(&tr->tr_ds->dd_lock);
1290 ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
1291 tr->tr_size);
1292 tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
1293 mutex_exit(&tr->tr_ds->dd_lock);
1294 } else {
1295 arc_tempreserve_clear(tr->tr_size);
1296 }
1297 list_remove(tr_list, tr);
1298 kmem_free(tr, sizeof (struct tempreserve));
1299 }
1300
1301 kmem_free(tr_list, sizeof (list_t));
1302 }
1303
1304 /*
1305 * This should be called from open context when we think we're going to write
1306 * or free space, for example when dirtying data. Be conservative; it's okay
1307 * to write less space or free more, but we don't want to write more or free
1308 * less than the amount specified.
1309 */
1310 void
dsl_dir_willuse_space(dsl_dir_t * dd,int64_t space,dmu_tx_t * tx)1311 dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
1312 {
1313 int64_t parent_space;
1314 uint64_t est_used;
1315
1316 mutex_enter(&dd->dd_lock);
1317 if (space > 0)
1318 dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
1319
1320 est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
1321 parent_space = parent_delta(dd, est_used, space);
1322 mutex_exit(&dd->dd_lock);
1323
1324 /* Make sure that we clean up dd_space_to* */
1325 dsl_dir_dirty(dd, tx);
1326
1327 /* XXX this is potentially expensive and unnecessary... */
1328 if (parent_space && dd->dd_parent)
1329 dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
1330 }
1331
1332 /* call from syncing context when we actually write/free space for this dd */
1333 void
dsl_dir_diduse_space(dsl_dir_t * dd,dd_used_t type,int64_t used,int64_t compressed,int64_t uncompressed,dmu_tx_t * tx)1334 dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
1335 int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
1336 {
1337 int64_t accounted_delta;
1338
1339 /*
1340 * dsl_dataset_set_refreservation_sync_impl() calls this with
1341 * dd_lock held, so that it can atomically update
1342 * ds->ds_reserved and the dsl_dir accounting, so that
1343 * dsl_dataset_check_quota() can see dataset and dir accounting
1344 * consistently.
1345 */
1346 boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
1347
1348 ASSERT(dmu_tx_is_syncing(tx));
1349 ASSERT(type < DD_USED_NUM);
1350
1351 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1352
1353 if (needlock)
1354 mutex_enter(&dd->dd_lock);
1355 accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
1356 ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
1357 ASSERT(compressed >= 0 ||
1358 dd->dd_phys->dd_compressed_bytes >= -compressed);
1359 ASSERT(uncompressed >= 0 ||
1360 dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
1361 dd->dd_phys->dd_used_bytes += used;
1362 dd->dd_phys->dd_uncompressed_bytes += uncompressed;
1363 dd->dd_phys->dd_compressed_bytes += compressed;
1364
1365 if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
1366 ASSERT(used > 0 ||
1367 dd->dd_phys->dd_used_breakdown[type] >= -used);
1368 dd->dd_phys->dd_used_breakdown[type] += used;
1369 #ifdef DEBUG
1370 dd_used_t t;
1371 uint64_t u = 0;
1372 for (t = 0; t < DD_USED_NUM; t++)
1373 u += dd->dd_phys->dd_used_breakdown[t];
1374 ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
1375 #endif
1376 }
1377 if (needlock)
1378 mutex_exit(&dd->dd_lock);
1379
1380 if (dd->dd_parent != NULL) {
1381 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1382 accounted_delta, compressed, uncompressed, tx);
1383 dsl_dir_transfer_space(dd->dd_parent,
1384 used - accounted_delta,
1385 DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL);
1386 }
1387 }
1388
1389 void
dsl_dir_transfer_space(dsl_dir_t * dd,int64_t delta,dd_used_t oldtype,dd_used_t newtype,dmu_tx_t * tx)1390 dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
1391 dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
1392 {
1393 ASSERT(tx == NULL || dmu_tx_is_syncing(tx));
1394 ASSERT(oldtype < DD_USED_NUM);
1395 ASSERT(newtype < DD_USED_NUM);
1396
1397 if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
1398 return;
1399
1400 if (tx != NULL)
1401 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1402 mutex_enter(&dd->dd_lock);
1403 ASSERT(delta > 0 ?
1404 dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
1405 dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
1406 ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
1407 dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
1408 dd->dd_phys->dd_used_breakdown[newtype] += delta;
1409 mutex_exit(&dd->dd_lock);
1410 }
1411
/*
 * Argument block shared by the quota and reservation sync tasks
 * (dsl_dir_set_quota() and dsl_dir_set_reservation()).
 */
typedef struct dsl_dir_set_qr_arg {
	const char *ddsqra_name;	/* name passed to dsl_dataset_hold() */
	zprop_source_t ddsqra_source;	/* source of the property setting */
	uint64_t ddsqra_value;		/* requested quota/reservation */
} dsl_dir_set_qr_arg_t;
1417
1418 static int
dsl_dir_set_quota_check(void * arg,dmu_tx_t * tx)1419 dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
1420 {
1421 dsl_dir_set_qr_arg_t *ddsqra = arg;
1422 dsl_pool_t *dp = dmu_tx_pool(tx);
1423 dsl_dataset_t *ds;
1424 int error;
1425 uint64_t towrite, newval;
1426
1427 error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1428 if (error != 0)
1429 return (error);
1430
1431 error = dsl_prop_predict(ds->ds_dir, "quota",
1432 ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1433 if (error != 0) {
1434 dsl_dataset_rele(ds, FTAG);
1435 return (error);
1436 }
1437
1438 if (newval == 0) {
1439 dsl_dataset_rele(ds, FTAG);
1440 return (0);
1441 }
1442
1443 mutex_enter(&ds->ds_dir->dd_lock);
1444 /*
1445 * If we are doing the preliminary check in open context, and
1446 * there are pending changes, then don't fail it, since the
1447 * pending changes could under-estimate the amount of space to be
1448 * freed up.
1449 */
1450 towrite = dsl_dir_space_towrite(ds->ds_dir);
1451 if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
1452 (newval < ds->ds_dir->dd_phys->dd_reserved ||
1453 newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) {
1454 error = SET_ERROR(ENOSPC);
1455 }
1456 mutex_exit(&ds->ds_dir->dd_lock);
1457 dsl_dataset_rele(ds, FTAG);
1458 return (error);
1459 }
1460
1461 static void
dsl_dir_set_quota_sync(void * arg,dmu_tx_t * tx)1462 dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
1463 {
1464 dsl_dir_set_qr_arg_t *ddsqra = arg;
1465 dsl_pool_t *dp = dmu_tx_pool(tx);
1466 dsl_dataset_t *ds;
1467 uint64_t newval;
1468
1469 VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1470
1471 if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1472 dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
1473 ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1474 &ddsqra->ddsqra_value, tx);
1475
1476 VERIFY0(dsl_prop_get_int_ds(ds,
1477 zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
1478 } else {
1479 newval = ddsqra->ddsqra_value;
1480 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1481 zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
1482 }
1483
1484 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
1485 mutex_enter(&ds->ds_dir->dd_lock);
1486 ds->ds_dir->dd_phys->dd_quota = newval;
1487 mutex_exit(&ds->ds_dir->dd_lock);
1488 dsl_dataset_rele(ds, FTAG);
1489 }
1490
1491 int
dsl_dir_set_quota(const char * ddname,zprop_source_t source,uint64_t quota)1492 dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
1493 {
1494 dsl_dir_set_qr_arg_t ddsqra;
1495
1496 ddsqra.ddsqra_name = ddname;
1497 ddsqra.ddsqra_source = source;
1498 ddsqra.ddsqra_value = quota;
1499
1500 return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
1501 dsl_dir_set_quota_sync, &ddsqra, 0));
1502 }
1503
1504 int
dsl_dir_set_reservation_check(void * arg,dmu_tx_t * tx)1505 dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
1506 {
1507 dsl_dir_set_qr_arg_t *ddsqra = arg;
1508 dsl_pool_t *dp = dmu_tx_pool(tx);
1509 dsl_dataset_t *ds;
1510 dsl_dir_t *dd;
1511 uint64_t newval, used, avail;
1512 int error;
1513
1514 error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1515 if (error != 0)
1516 return (error);
1517 dd = ds->ds_dir;
1518
1519 /*
1520 * If we are doing the preliminary check in open context, the
1521 * space estimates may be inaccurate.
1522 */
1523 if (!dmu_tx_is_syncing(tx)) {
1524 dsl_dataset_rele(ds, FTAG);
1525 return (0);
1526 }
1527
1528 error = dsl_prop_predict(ds->ds_dir,
1529 zfs_prop_to_name(ZFS_PROP_RESERVATION),
1530 ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1531 if (error != 0) {
1532 dsl_dataset_rele(ds, FTAG);
1533 return (error);
1534 }
1535
1536 mutex_enter(&dd->dd_lock);
1537 used = dd->dd_phys->dd_used_bytes;
1538 mutex_exit(&dd->dd_lock);
1539
1540 if (dd->dd_parent) {
1541 avail = dsl_dir_space_available(dd->dd_parent,
1542 NULL, 0, FALSE);
1543 } else {
1544 avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
1545 }
1546
1547 if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) {
1548 uint64_t delta = MAX(used, newval) -
1549 MAX(used, dd->dd_phys->dd_reserved);
1550
1551 if (delta > avail ||
1552 (dd->dd_phys->dd_quota > 0 &&
1553 newval > dd->dd_phys->dd_quota))
1554 error = SET_ERROR(ENOSPC);
1555 }
1556
1557 dsl_dataset_rele(ds, FTAG);
1558 return (error);
1559 }
1560
1561 void
dsl_dir_set_reservation_sync_impl(dsl_dir_t * dd,uint64_t value,dmu_tx_t * tx)1562 dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1563 {
1564 uint64_t used;
1565 int64_t delta;
1566
1567 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1568
1569 mutex_enter(&dd->dd_lock);
1570 used = dd->dd_phys->dd_used_bytes;
1571 delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
1572 dd->dd_phys->dd_reserved = value;
1573
1574 if (dd->dd_parent != NULL) {
1575 /* Roll up this additional usage into our ancestors */
1576 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1577 delta, 0, 0, tx);
1578 }
1579 mutex_exit(&dd->dd_lock);
1580 }
1581
1582 static void
dsl_dir_set_reservation_sync(void * arg,dmu_tx_t * tx)1583 dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
1584 {
1585 dsl_dir_set_qr_arg_t *ddsqra = arg;
1586 dsl_pool_t *dp = dmu_tx_pool(tx);
1587 dsl_dataset_t *ds;
1588 uint64_t newval;
1589
1590 VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1591
1592 if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1593 dsl_prop_set_sync_impl(ds,
1594 zfs_prop_to_name(ZFS_PROP_RESERVATION),
1595 ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1596 &ddsqra->ddsqra_value, tx);
1597
1598 VERIFY0(dsl_prop_get_int_ds(ds,
1599 zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
1600 } else {
1601 newval = ddsqra->ddsqra_value;
1602 spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1603 zfs_prop_to_name(ZFS_PROP_RESERVATION),
1604 (longlong_t)newval);
1605 }
1606
1607 dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
1608 dsl_dataset_rele(ds, FTAG);
1609 }
1610
1611 int
dsl_dir_set_reservation(const char * ddname,zprop_source_t source,uint64_t reservation)1612 dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1613 uint64_t reservation)
1614 {
1615 dsl_dir_set_qr_arg_t ddsqra;
1616
1617 ddsqra.ddsqra_name = ddname;
1618 ddsqra.ddsqra_source = source;
1619 ddsqra.ddsqra_value = reservation;
1620
1621 return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
1622 dsl_dir_set_reservation_sync, &ddsqra, 0));
1623 }
1624
1625 static dsl_dir_t *
closest_common_ancestor(dsl_dir_t * ds1,dsl_dir_t * ds2)1626 closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1627 {
1628 for (; ds1; ds1 = ds1->dd_parent) {
1629 dsl_dir_t *dd;
1630 for (dd = ds2; dd; dd = dd->dd_parent) {
1631 if (ds1 == dd)
1632 return (dd);
1633 }
1634 }
1635 return (NULL);
1636 }
1637
1638 /*
1639 * If delta is applied to dd, how much of that delta would be applied to
1640 * ancestor? Syncing context only.
1641 */
1642 static int64_t
would_change(dsl_dir_t * dd,int64_t delta,dsl_dir_t * ancestor)1643 would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1644 {
1645 if (dd == ancestor)
1646 return (delta);
1647
1648 mutex_enter(&dd->dd_lock);
1649 delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
1650 mutex_exit(&dd->dd_lock);
1651 return (would_change(dd->dd_parent, delta, ancestor));
1652 }
1653
/* Argument block for the rename sync task (see dsl_dir_rename()). */
typedef struct dsl_dir_rename_arg {
	const char *ddra_oldname;	/* current name of the dir */
	const char *ddra_newname;	/* requested new name */
	cred_t *ddra_cred;		/* caller's creds, for limit checks */
} dsl_dir_rename_arg_t;
1659
1660 /* ARGSUSED */
1661 static int
dsl_valid_rename(dsl_pool_t * dp,dsl_dataset_t * ds,void * arg)1662 dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1663 {
1664 int *deltap = arg;
1665 char namebuf[MAXNAMELEN];
1666
1667 dsl_dataset_name(ds, namebuf);
1668
1669 if (strlen(namebuf) + *deltap >= MAXNAMELEN)
1670 return (SET_ERROR(ENAMETOOLONG));
1671 return (0);
1672 }
1673
1674 static int
dsl_dir_rename_check(void * arg,dmu_tx_t * tx)1675 dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
1676 {
1677 dsl_dir_rename_arg_t *ddra = arg;
1678 dsl_pool_t *dp = dmu_tx_pool(tx);
1679 dsl_dir_t *dd, *newparent;
1680 const char *mynewname;
1681 int error;
1682 int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
1683
1684 /* target dir should exist */
1685 error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
1686 if (error != 0)
1687 return (error);
1688
1689 /* new parent should exist */
1690 error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
1691 &newparent, &mynewname);
1692 if (error != 0) {
1693 dsl_dir_rele(dd, FTAG);
1694 return (error);
1695 }
1696
1697 /* can't rename to different pool */
1698 if (dd->dd_pool != newparent->dd_pool) {
1699 dsl_dir_rele(newparent, FTAG);
1700 dsl_dir_rele(dd, FTAG);
1701 return (SET_ERROR(EXDEV));
1702 }
1703
1704 /* new name should not already exist */
1705 if (mynewname == NULL) {
1706 dsl_dir_rele(newparent, FTAG);
1707 dsl_dir_rele(dd, FTAG);
1708 return (SET_ERROR(EEXIST));
1709 }
1710
1711 /* if the name length is growing, validate child name lengths */
1712 if (delta > 0) {
1713 error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
1714 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
1715 if (error != 0) {
1716 dsl_dir_rele(newparent, FTAG);
1717 dsl_dir_rele(dd, FTAG);
1718 return (error);
1719 }
1720 }
1721
1722 if (dmu_tx_is_syncing(tx)) {
1723 if (spa_feature_is_active(dp->dp_spa,
1724 SPA_FEATURE_FS_SS_LIMIT)) {
1725 /*
1726 * Although this is the check function and we don't
1727 * normally make on-disk changes in check functions,
1728 * we need to do that here.
1729 *
1730 * Ensure this portion of the tree's counts have been
1731 * initialized in case the new parent has limits set.
1732 */
1733 dsl_dir_init_fs_ss_count(dd, tx);
1734 }
1735 }
1736
1737 if (newparent != dd->dd_parent) {
1738 /* is there enough space? */
1739 uint64_t myspace =
1740 MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1741 objset_t *os = dd->dd_pool->dp_meta_objset;
1742 uint64_t fs_cnt = 0;
1743 uint64_t ss_cnt = 0;
1744
1745 if (dsl_dir_is_zapified(dd)) {
1746 int err;
1747
1748 err = zap_lookup(os, dd->dd_object,
1749 DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
1750 &fs_cnt);
1751 if (err != ENOENT && err != 0) {
1752 dsl_dir_rele(newparent, FTAG);
1753 dsl_dir_rele(dd, FTAG);
1754 return (err);
1755 }
1756
1757 /*
1758 * have to add 1 for the filesystem itself that we're
1759 * moving
1760 */
1761 fs_cnt++;
1762
1763 err = zap_lookup(os, dd->dd_object,
1764 DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
1765 &ss_cnt);
1766 if (err != ENOENT && err != 0) {
1767 dsl_dir_rele(newparent, FTAG);
1768 dsl_dir_rele(dd, FTAG);
1769 return (err);
1770 }
1771 }
1772
1773 /* no rename into our descendant */
1774 if (closest_common_ancestor(dd, newparent) == dd) {
1775 dsl_dir_rele(newparent, FTAG);
1776 dsl_dir_rele(dd, FTAG);
1777 return (SET_ERROR(EINVAL));
1778 }
1779
1780 error = dsl_dir_transfer_possible(dd->dd_parent,
1781 newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
1782 if (error != 0) {
1783 dsl_dir_rele(newparent, FTAG);
1784 dsl_dir_rele(dd, FTAG);
1785 return (error);
1786 }
1787 }
1788
1789 dsl_dir_rele(newparent, FTAG);
1790 dsl_dir_rele(dd, FTAG);
1791 return (0);
1792 }
1793
1794 static void
dsl_dir_rename_sync(void * arg,dmu_tx_t * tx)1795 dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
1796 {
1797 dsl_dir_rename_arg_t *ddra = arg;
1798 dsl_pool_t *dp = dmu_tx_pool(tx);
1799 dsl_dir_t *dd, *newparent;
1800 const char *mynewname;
1801 int error;
1802 objset_t *mos = dp->dp_meta_objset;
1803
1804 VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
1805 VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
1806 &mynewname));
1807
1808 /* Log this before we change the name. */
1809 spa_history_log_internal_dd(dd, "rename", tx,
1810 "-> %s", ddra->ddra_newname);
1811
1812 if (newparent != dd->dd_parent) {
1813 objset_t *os = dd->dd_pool->dp_meta_objset;
1814 uint64_t fs_cnt = 0;
1815 uint64_t ss_cnt = 0;
1816
1817 /*
1818 * We already made sure the dd counts were initialized in the
1819 * check function.
1820 */
1821 if (spa_feature_is_active(dp->dp_spa,
1822 SPA_FEATURE_FS_SS_LIMIT)) {
1823 VERIFY0(zap_lookup(os, dd->dd_object,
1824 DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
1825 &fs_cnt));
1826 /* add 1 for the filesystem itself that we're moving */
1827 fs_cnt++;
1828
1829 VERIFY0(zap_lookup(os, dd->dd_object,
1830 DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
1831 &ss_cnt));
1832 }
1833
1834 dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
1835 DD_FIELD_FILESYSTEM_COUNT, tx);
1836 dsl_fs_ss_count_adjust(newparent, fs_cnt,
1837 DD_FIELD_FILESYSTEM_COUNT, tx);
1838
1839 dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
1840 DD_FIELD_SNAPSHOT_COUNT, tx);
1841 dsl_fs_ss_count_adjust(newparent, ss_cnt,
1842 DD_FIELD_SNAPSHOT_COUNT, tx);
1843
1844 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1845 -dd->dd_phys->dd_used_bytes,
1846 -dd->dd_phys->dd_compressed_bytes,
1847 -dd->dd_phys->dd_uncompressed_bytes, tx);
1848 dsl_dir_diduse_space(newparent, DD_USED_CHILD,
1849 dd->dd_phys->dd_used_bytes,
1850 dd->dd_phys->dd_compressed_bytes,
1851 dd->dd_phys->dd_uncompressed_bytes, tx);
1852
1853 if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
1854 uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
1855 dd->dd_phys->dd_used_bytes;
1856
1857 dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1858 -unused_rsrv, 0, 0, tx);
1859 dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
1860 unused_rsrv, 0, 0, tx);
1861 }
1862 }
1863
1864 dmu_buf_will_dirty(dd->dd_dbuf, tx);
1865
1866 /* remove from old parent zapobj */
1867 error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
1868 dd->dd_myname, tx);
1869 ASSERT0(error);
1870
1871 (void) strcpy(dd->dd_myname, mynewname);
1872 dsl_dir_rele(dd->dd_parent, dd);
1873 dd->dd_phys->dd_parent_obj = newparent->dd_object;
1874 VERIFY0(dsl_dir_hold_obj(dp,
1875 newparent->dd_object, NULL, dd, &dd->dd_parent));
1876
1877 /* add to new parent zapobj */
1878 VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj,
1879 dd->dd_myname, 8, 1, &dd->dd_object, tx));
1880
1881 #ifdef __FreeBSD__
1882 #ifdef _KERNEL
1883 zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
1884 zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname);
1885 #endif
1886 #endif
1887
1888 dsl_prop_notify_all(dd);
1889
1890 dsl_dir_rele(newparent, FTAG);
1891 dsl_dir_rele(dd, FTAG);
1892 }
1893
1894 int
dsl_dir_rename(const char * oldname,const char * newname)1895 dsl_dir_rename(const char *oldname, const char *newname)
1896 {
1897 dsl_dir_rename_arg_t ddra;
1898
1899 ddra.ddra_oldname = oldname;
1900 ddra.ddra_newname = newname;
1901 ddra.ddra_cred = CRED();
1902
1903 return (dsl_sync_task(oldname,
1904 dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3));
1905 }
1906
1907 int
dsl_dir_transfer_possible(dsl_dir_t * sdd,dsl_dir_t * tdd,uint64_t fs_cnt,uint64_t ss_cnt,uint64_t space,cred_t * cr)1908 dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
1909 uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
1910 {
1911 dsl_dir_t *ancestor;
1912 int64_t adelta;
1913 uint64_t avail;
1914 int err;
1915
1916 ancestor = closest_common_ancestor(sdd, tdd);
1917 adelta = would_change(sdd, -space, ancestor);
1918 avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1919 if (avail < space)
1920 return (SET_ERROR(ENOSPC));
1921
1922 err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
1923 ancestor, cr);
1924 if (err != 0)
1925 return (err);
1926 err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
1927 ancestor, cr);
1928 if (err != 0)
1929 return (err);
1930
1931 return (0);
1932 }
1933
1934 timestruc_t
dsl_dir_snap_cmtime(dsl_dir_t * dd)1935 dsl_dir_snap_cmtime(dsl_dir_t *dd)
1936 {
1937 timestruc_t t;
1938
1939 mutex_enter(&dd->dd_lock);
1940 t = dd->dd_snap_cmtime;
1941 mutex_exit(&dd->dd_lock);
1942
1943 return (t);
1944 }
1945
1946 void
dsl_dir_snap_cmtime_update(dsl_dir_t * dd)1947 dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1948 {
1949 timestruc_t t;
1950
1951 gethrestime(&t);
1952 mutex_enter(&dd->dd_lock);
1953 dd->dd_snap_cmtime = t;
1954 mutex_exit(&dd->dd_lock);
1955 }
1956
1957 void
dsl_dir_zapify(dsl_dir_t * dd,dmu_tx_t * tx)1958 dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
1959 {
1960 objset_t *mos = dd->dd_pool->dp_meta_objset;
1961 dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
1962 }
1963
1964 boolean_t
dsl_dir_is_zapified(dsl_dir_t * dd)1965 dsl_dir_is_zapified(dsl_dir_t *dd)
1966 {
1967 dmu_object_info_t doi;
1968
1969 dmu_object_info_from_db(dd->dd_dbuf, &doi);
1970 return (doi.doi_type == DMU_OTN_ZAP_METADATA);
1971 }
1972