1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24  * All rights reserved.
25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 /* Portions Copyright 2010 Robert Milkowski */
30 
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/sysmacros.h>
36 #include <sys/kmem.h>
37 #include <sys/acl.h>
38 #include <sys/vnode.h>
39 #include <sys/vfs.h>
40 #include <sys/mntent.h>
41 #include <sys/mount.h>
42 #include <sys/cmn_err.h>
43 #include <sys/zfs_znode.h>
44 #include <sys/zfs_dir.h>
45 #include <sys/zil.h>
46 #include <sys/fs/zfs.h>
47 #include <sys/dmu.h>
48 #include <sys/dsl_prop.h>
49 #include <sys/dsl_dataset.h>
50 #include <sys/dsl_deleg.h>
51 #include <sys/spa.h>
52 #include <sys/zap.h>
53 #include <sys/sa.h>
54 #include <sys/sa_impl.h>
55 #include <sys/varargs.h>
56 #include <sys/policy.h>
57 #include <sys/atomic.h>
58 #include <sys/zfs_ioctl.h>
59 #include <sys/zfs_ctldir.h>
60 #include <sys/zfs_fuid.h>
61 #include <sys/sunddi.h>
62 #include <sys/dnlc.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/spa_boot.h>
65 #include <sys/jail.h>
66 #include "zfs_comutil.h"
67 
/* Default (MTX_DEF) mutex named "zfs_debug", initialized via MTX_SYSINIT. */
struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

/* Root of the vfs.zfs sysctl tree. */
SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

/*
 * vfs.zfs.super_owner (read/write); see the description string below.
 * The code that consults this flag is outside this part of the file.
 */
int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
    "File system owner can perform privileged operation on his file systems");

/* vfs.zfs.debug: debug level, exposed both as a tunable and as a sysctl. */
int zfs_debug_level;
TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
    "Debug level");

/*
 * Read-only vfs.zfs.version.{acl,spa,zpl} nodes exporting the compile-time
 * ZFS_ACL_VERSION, SPA_VERSION and ZPL_VERSION constants.
 */
SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
    "ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
    "SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
    "ZPL_VERSION");
92 
/* Forward declarations of the static VFS entry points and helpers. */
static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_objset_close(zfsvfs_t *zfsvfs);
static void zfs_freevfs(vfs_t *vfsp);

/* VFS operations vector for ZFS; each slot points at a function above. */
static struct vfsops zfs_vfsops = {
	.vfs_mount =		zfs_mount,
	.vfs_unmount =		zfs_umount,
	.vfs_root =		zfs_root,
	.vfs_statfs =		zfs_statfs,
	.vfs_vget =		zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_checkexp =		zfs_checkexp,
	.vfs_fhtovp =		zfs_fhtovp,
};

/* Register the vector as "zfs" with the VFCF_JAIL and VFCF_DELEGADMIN flags. */
VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
117 
/*
 * Count of active ZFS file systems.
 *
 * The count is needed to prevent the module from being unloaded after a
 * forced unmount (umount -f).
 */
static uint32_t	zfs_active_fs_count = 0;
124 
125 /*ARGSUSED*/
126 static int
zfs_sync(vfs_t * vfsp,int waitfor)127 zfs_sync(vfs_t *vfsp, int waitfor)
128 {
129 
130 	/*
131 	 * Data integrity is job one.  We don't want a compromised kernel
132 	 * writing to the storage pool, so we never sync during panic.
133 	 */
134 	if (panicstr)
135 		return (0);
136 
137 	/*
138 	 * Ignore the system syncher.  ZFS already commits async data
139 	 * at zfs_txg_timeout intervals.
140 	 */
141 	if (waitfor == MNT_LAZY)
142 		return (0);
143 
144 	if (vfsp != NULL) {
145 		/*
146 		 * Sync a specific filesystem.
147 		 */
148 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
149 		dsl_pool_t *dp;
150 		int error;
151 
152 		error = vfs_stdsync(vfsp, waitfor);
153 		if (error != 0)
154 			return (error);
155 
156 		ZFS_ENTER(zfsvfs);
157 		dp = dmu_objset_pool(zfsvfs->z_os);
158 
159 		/*
160 		 * If the system is shutting down, then skip any
161 		 * filesystems which may exist on a suspended pool.
162 		 */
163 		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
164 			ZFS_EXIT(zfsvfs);
165 			return (0);
166 		}
167 
168 		if (zfsvfs->z_log != NULL)
169 			zil_commit(zfsvfs->z_log, 0);
170 
171 		ZFS_EXIT(zfsvfs);
172 	} else {
173 		/*
174 		 * Sync all ZFS filesystems.  This is what happens when you
175 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
176 		 * request by waiting for all pools to commit all dirty data.
177 		 */
178 		spa_sync_allpools();
179 	}
180 
181 	return (0);
182 }
183 
184 #ifndef __FreeBSD_kernel__
185 static int
zfs_create_unique_device(dev_t * dev)186 zfs_create_unique_device(dev_t *dev)
187 {
188 	major_t new_major;
189 
190 	do {
191 		ASSERT3U(zfs_minor, <=, MAXMIN32);
192 		minor_t start = zfs_minor;
193 		do {
194 			mutex_enter(&zfs_dev_mtx);
195 			if (zfs_minor >= MAXMIN32) {
196 				/*
197 				 * If we're still using the real major
198 				 * keep out of /dev/zfs and /dev/zvol minor
199 				 * number space.  If we're using a getudev()'ed
200 				 * major number, we can use all of its minors.
201 				 */
202 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
203 					zfs_minor = ZFS_MIN_MINOR;
204 				else
205 					zfs_minor = 0;
206 			} else {
207 				zfs_minor++;
208 			}
209 			*dev = makedevice(zfs_major, zfs_minor);
210 			mutex_exit(&zfs_dev_mtx);
211 		} while (vfs_devismounted(*dev) && zfs_minor != start);
212 		if (zfs_minor == start) {
213 			/*
214 			 * We are using all ~262,000 minor numbers for the
215 			 * current major number.  Create a new major number.
216 			 */
217 			if ((new_major = getudev()) == (major_t)-1) {
218 				cmn_err(CE_WARN,
219 				    "zfs_mount: Can't get unique major "
220 				    "device number.");
221 				return (-1);
222 			}
223 			mutex_enter(&zfs_dev_mtx);
224 			zfs_major = new_major;
225 			zfs_minor = 0;
226 
227 			mutex_exit(&zfs_dev_mtx);
228 		} else {
229 			break;
230 		}
231 		/* CONSTANTCONDITION */
232 	} while (1);
233 
234 	return (0);
235 }
236 #endif	/* !__FreeBSD_kernel__ */
237 
238 static void
atime_changed_cb(void * arg,uint64_t newval)239 atime_changed_cb(void *arg, uint64_t newval)
240 {
241 	zfsvfs_t *zfsvfs = arg;
242 
243 	if (newval == TRUE) {
244 		zfsvfs->z_atime = TRUE;
245 		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
246 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
247 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
248 	} else {
249 		zfsvfs->z_atime = FALSE;
250 		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
251 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
252 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
253 	}
254 }
255 
256 static void
xattr_changed_cb(void * arg,uint64_t newval)257 xattr_changed_cb(void *arg, uint64_t newval)
258 {
259 	zfsvfs_t *zfsvfs = arg;
260 
261 	if (newval == TRUE) {
262 		/* XXX locking on vfs_flag? */
263 #ifdef TODO
264 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
265 #endif
266 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
267 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
268 	} else {
269 		/* XXX locking on vfs_flag? */
270 #ifdef TODO
271 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
272 #endif
273 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
274 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
275 	}
276 }
277 
278 static void
blksz_changed_cb(void * arg,uint64_t newval)279 blksz_changed_cb(void *arg, uint64_t newval)
280 {
281 	zfsvfs_t *zfsvfs = arg;
282 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
283 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
284 	ASSERT(ISP2(newval));
285 
286 	zfsvfs->z_max_blksz = newval;
287 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
288 }
289 
290 static void
readonly_changed_cb(void * arg,uint64_t newval)291 readonly_changed_cb(void *arg, uint64_t newval)
292 {
293 	zfsvfs_t *zfsvfs = arg;
294 
295 	if (newval) {
296 		/* XXX locking on vfs_flag? */
297 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
298 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
299 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
300 	} else {
301 		/* XXX locking on vfs_flag? */
302 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
303 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
304 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
305 	}
306 }
307 
308 static void
setuid_changed_cb(void * arg,uint64_t newval)309 setuid_changed_cb(void *arg, uint64_t newval)
310 {
311 	zfsvfs_t *zfsvfs = arg;
312 
313 	if (newval == FALSE) {
314 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
315 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
316 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
317 	} else {
318 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
319 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
320 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
321 	}
322 }
323 
324 static void
exec_changed_cb(void * arg,uint64_t newval)325 exec_changed_cb(void *arg, uint64_t newval)
326 {
327 	zfsvfs_t *zfsvfs = arg;
328 
329 	if (newval == FALSE) {
330 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
331 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
332 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
333 	} else {
334 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
335 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
336 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
337 	}
338 }
339 
340 /*
341  * The nbmand mount option can be changed at mount time.
342  * We can't allow it to be toggled on live file systems or incorrect
343  * behavior may be seen from cifs clients
344  *
345  * This property isn't registered via dsl_prop_register(), but this callback
346  * will be called when a file system is first mounted
347  */
348 static void
nbmand_changed_cb(void * arg,uint64_t newval)349 nbmand_changed_cb(void *arg, uint64_t newval)
350 {
351 	zfsvfs_t *zfsvfs = arg;
352 	if (newval == FALSE) {
353 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
354 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
355 	} else {
356 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
357 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
358 	}
359 }
360 
361 static void
snapdir_changed_cb(void * arg,uint64_t newval)362 snapdir_changed_cb(void *arg, uint64_t newval)
363 {
364 	zfsvfs_t *zfsvfs = arg;
365 
366 	zfsvfs->z_show_ctldir = newval;
367 }
368 
369 static void
vscan_changed_cb(void * arg,uint64_t newval)370 vscan_changed_cb(void *arg, uint64_t newval)
371 {
372 	zfsvfs_t *zfsvfs = arg;
373 
374 	zfsvfs->z_vscan = newval;
375 }
376 
377 static void
acl_mode_changed_cb(void * arg,uint64_t newval)378 acl_mode_changed_cb(void *arg, uint64_t newval)
379 {
380 	zfsvfs_t *zfsvfs = arg;
381 
382 	zfsvfs->z_acl_mode = newval;
383 }
384 
385 static void
acl_inherit_changed_cb(void * arg,uint64_t newval)386 acl_inherit_changed_cb(void *arg, uint64_t newval)
387 {
388 	zfsvfs_t *zfsvfs = arg;
389 
390 	zfsvfs->z_acl_inherit = newval;
391 }
392 
393 static int
zfs_register_callbacks(vfs_t * vfsp)394 zfs_register_callbacks(vfs_t *vfsp)
395 {
396 	struct dsl_dataset *ds = NULL;
397 	objset_t *os = NULL;
398 	zfsvfs_t *zfsvfs = NULL;
399 	uint64_t nbmand;
400 	boolean_t readonly = B_FALSE;
401 	boolean_t do_readonly = B_FALSE;
402 	boolean_t setuid = B_FALSE;
403 	boolean_t do_setuid = B_FALSE;
404 	boolean_t exec = B_FALSE;
405 	boolean_t do_exec = B_FALSE;
406 #ifdef illumos
407 	boolean_t devices = B_FALSE;
408 	boolean_t do_devices = B_FALSE;
409 #endif
410 	boolean_t xattr = B_FALSE;
411 	boolean_t do_xattr = B_FALSE;
412 	boolean_t atime = B_FALSE;
413 	boolean_t do_atime = B_FALSE;
414 	int error = 0;
415 
416 	ASSERT(vfsp);
417 	zfsvfs = vfsp->vfs_data;
418 	ASSERT(zfsvfs);
419 	os = zfsvfs->z_os;
420 
421 	/*
422 	 * This function can be called for a snapshot when we update snapshot's
423 	 * mount point, which isn't really supported.
424 	 */
425 	if (dmu_objset_is_snapshot(os))
426 		return (EOPNOTSUPP);
427 
428 	/*
429 	 * The act of registering our callbacks will destroy any mount
430 	 * options we may have.  In order to enable temporary overrides
431 	 * of mount options, we stash away the current values and
432 	 * restore them after we register the callbacks.
433 	 */
434 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
435 	    !spa_writeable(dmu_objset_spa(os))) {
436 		readonly = B_TRUE;
437 		do_readonly = B_TRUE;
438 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
439 		readonly = B_FALSE;
440 		do_readonly = B_TRUE;
441 	}
442 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
443 		setuid = B_FALSE;
444 		do_setuid = B_TRUE;
445 	} else {
446 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
447 			setuid = B_FALSE;
448 			do_setuid = B_TRUE;
449 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
450 			setuid = B_TRUE;
451 			do_setuid = B_TRUE;
452 		}
453 	}
454 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
455 		exec = B_FALSE;
456 		do_exec = B_TRUE;
457 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
458 		exec = B_TRUE;
459 		do_exec = B_TRUE;
460 	}
461 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
462 		xattr = B_FALSE;
463 		do_xattr = B_TRUE;
464 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
465 		xattr = B_TRUE;
466 		do_xattr = B_TRUE;
467 	}
468 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
469 		atime = B_FALSE;
470 		do_atime = B_TRUE;
471 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
472 		atime = B_TRUE;
473 		do_atime = B_TRUE;
474 	}
475 
476 	/*
477 	 * We need to enter pool configuration here, so that we can use
478 	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
479 	 * dsl_prop_get_integer() can not be used, because it has to acquire
480 	 * spa_namespace_lock and we can not do that because we already hold
481 	 * z_teardown_lock.  The problem is that spa_config_sync() is called
482 	 * with spa_namespace_lock held and the function calls ZFS vnode
483 	 * operations to write the cache file and thus z_teardown_lock is
484 	 * acquired after spa_namespace_lock.
485 	 */
486 	ds = dmu_objset_ds(os);
487 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
488 
489 	/*
490 	 * nbmand is a special property.  It can only be changed at
491 	 * mount time.
492 	 *
493 	 * This is weird, but it is documented to only be changeable
494 	 * at mount time.
495 	 */
496 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
497 		nbmand = B_FALSE;
498 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
499 		nbmand = B_TRUE;
500 	} else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
501 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
502 		return (error);
503 	}
504 
505 	/*
506 	 * Register property callbacks.
507 	 *
508 	 * It would probably be fine to just check for i/o error from
509 	 * the first prop_register(), but I guess I like to go
510 	 * overboard...
511 	 */
512 	error = dsl_prop_register(ds,
513 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
514 	error = error ? error : dsl_prop_register(ds,
515 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
516 	error = error ? error : dsl_prop_register(ds,
517 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
518 	error = error ? error : dsl_prop_register(ds,
519 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
520 #ifdef illumos
521 	error = error ? error : dsl_prop_register(ds,
522 	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
523 #endif
524 	error = error ? error : dsl_prop_register(ds,
525 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
526 	error = error ? error : dsl_prop_register(ds,
527 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
528 	error = error ? error : dsl_prop_register(ds,
529 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
530 	error = error ? error : dsl_prop_register(ds,
531 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
532 	error = error ? error : dsl_prop_register(ds,
533 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
534 	    zfsvfs);
535 	error = error ? error : dsl_prop_register(ds,
536 	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
537 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
538 	if (error)
539 		goto unregister;
540 
541 	/*
542 	 * Invoke our callbacks to restore temporary mount options.
543 	 */
544 	if (do_readonly)
545 		readonly_changed_cb(zfsvfs, readonly);
546 	if (do_setuid)
547 		setuid_changed_cb(zfsvfs, setuid);
548 	if (do_exec)
549 		exec_changed_cb(zfsvfs, exec);
550 	if (do_xattr)
551 		xattr_changed_cb(zfsvfs, xattr);
552 	if (do_atime)
553 		atime_changed_cb(zfsvfs, atime);
554 
555 	nbmand_changed_cb(zfsvfs, nbmand);
556 
557 	return (0);
558 
559 unregister:
560 	dsl_prop_unregister_all(ds, zfsvfs);
561 	return (error);
562 }
563 
564 static int
zfs_space_delta_cb(dmu_object_type_t bonustype,void * data,uint64_t * userp,uint64_t * groupp)565 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
566     uint64_t *userp, uint64_t *groupp)
567 {
568 	/*
569 	 * Is it a valid type of object to track?
570 	 */
571 	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
572 		return (SET_ERROR(ENOENT));
573 
574 	/*
575 	 * If we have a NULL data pointer
576 	 * then assume the id's aren't changing and
577 	 * return EEXIST to the dmu to let it know to
578 	 * use the same ids
579 	 */
580 	if (data == NULL)
581 		return (SET_ERROR(EEXIST));
582 
583 	if (bonustype == DMU_OT_ZNODE) {
584 		znode_phys_t *znp = data;
585 		*userp = znp->zp_uid;
586 		*groupp = znp->zp_gid;
587 	} else {
588 		int hdrsize;
589 		sa_hdr_phys_t *sap = data;
590 		sa_hdr_phys_t sa = *sap;
591 		boolean_t swap = B_FALSE;
592 
593 		ASSERT(bonustype == DMU_OT_SA);
594 
595 		if (sa.sa_magic == 0) {
596 			/*
597 			 * This should only happen for newly created
598 			 * files that haven't had the znode data filled
599 			 * in yet.
600 			 */
601 			*userp = 0;
602 			*groupp = 0;
603 			return (0);
604 		}
605 		if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
606 			sa.sa_magic = SA_MAGIC;
607 			sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
608 			swap = B_TRUE;
609 		} else {
610 			VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
611 		}
612 
613 		hdrsize = sa_hdrsize(&sa);
614 		VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
615 		*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
616 		    SA_UID_OFFSET));
617 		*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
618 		    SA_GID_OFFSET));
619 		if (swap) {
620 			*userp = BSWAP_64(*userp);
621 			*groupp = BSWAP_64(*groupp);
622 		}
623 	}
624 	return (0);
625 }
626 
627 static void
fuidstr_to_sid(zfsvfs_t * zfsvfs,const char * fuidstr,char * domainbuf,int buflen,uid_t * ridp)628 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
629     char *domainbuf, int buflen, uid_t *ridp)
630 {
631 	uint64_t fuid;
632 	const char *domain;
633 
634 	fuid = strtonum(fuidstr, NULL);
635 
636 	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
637 	if (domain)
638 		(void) strlcpy(domainbuf, domain, buflen);
639 	else
640 		domainbuf[0] = '\0';
641 	*ridp = FUID_RID(fuid);
642 }
643 
644 static uint64_t
zfs_userquota_prop_to_obj(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type)645 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
646 {
647 	switch (type) {
648 	case ZFS_PROP_USERUSED:
649 		return (DMU_USERUSED_OBJECT);
650 	case ZFS_PROP_GROUPUSED:
651 		return (DMU_GROUPUSED_OBJECT);
652 	case ZFS_PROP_USERQUOTA:
653 		return (zfsvfs->z_userquota_obj);
654 	case ZFS_PROP_GROUPQUOTA:
655 		return (zfsvfs->z_groupquota_obj);
656 	}
657 	return (0);
658 }
659 
660 int
zfs_userspace_many(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type,uint64_t * cookiep,void * vbuf,uint64_t * bufsizep)661 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
662     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
663 {
664 	int error;
665 	zap_cursor_t zc;
666 	zap_attribute_t za;
667 	zfs_useracct_t *buf = vbuf;
668 	uint64_t obj;
669 
670 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
671 		return (SET_ERROR(ENOTSUP));
672 
673 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
674 	if (obj == 0) {
675 		*bufsizep = 0;
676 		return (0);
677 	}
678 
679 	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
680 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
681 	    zap_cursor_advance(&zc)) {
682 		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
683 		    *bufsizep)
684 			break;
685 
686 		fuidstr_to_sid(zfsvfs, za.za_name,
687 		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
688 
689 		buf->zu_space = za.za_first_integer;
690 		buf++;
691 	}
692 	if (error == ENOENT)
693 		error = 0;
694 
695 	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
696 	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
697 	*cookiep = zap_cursor_serialize(&zc);
698 	zap_cursor_fini(&zc);
699 	return (error);
700 }
701 
702 /*
703  * buf must be big enough (eg, 32 bytes)
704  */
705 static int
id_to_fuidstr(zfsvfs_t * zfsvfs,const char * domain,uid_t rid,char * buf,boolean_t addok)706 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
707     char *buf, boolean_t addok)
708 {
709 	uint64_t fuid;
710 	int domainid = 0;
711 
712 	if (domain && domain[0]) {
713 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
714 		if (domainid == -1)
715 			return (SET_ERROR(ENOENT));
716 	}
717 	fuid = FUID_ENCODE(domainid, rid);
718 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
719 	return (0);
720 }
721 
722 int
zfs_userspace_one(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type,const char * domain,uint64_t rid,uint64_t * valp)723 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
724     const char *domain, uint64_t rid, uint64_t *valp)
725 {
726 	char buf[32];
727 	int err;
728 	uint64_t obj;
729 
730 	*valp = 0;
731 
732 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
733 		return (SET_ERROR(ENOTSUP));
734 
735 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
736 	if (obj == 0)
737 		return (0);
738 
739 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
740 	if (err)
741 		return (err);
742 
743 	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
744 	if (err == ENOENT)
745 		err = 0;
746 	return (err);
747 }
748 
749 int
zfs_set_userquota(zfsvfs_t * zfsvfs,zfs_userquota_prop_t type,const char * domain,uint64_t rid,uint64_t quota)750 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
751     const char *domain, uint64_t rid, uint64_t quota)
752 {
753 	char buf[32];
754 	int err;
755 	dmu_tx_t *tx;
756 	uint64_t *objp;
757 	boolean_t fuid_dirtied;
758 
759 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
760 		return (SET_ERROR(EINVAL));
761 
762 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
763 		return (SET_ERROR(ENOTSUP));
764 
765 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
766 	    &zfsvfs->z_groupquota_obj;
767 
768 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
769 	if (err)
770 		return (err);
771 	fuid_dirtied = zfsvfs->z_fuid_dirty;
772 
773 	tx = dmu_tx_create(zfsvfs->z_os);
774 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
775 	if (*objp == 0) {
776 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
777 		    zfs_userquota_prop_prefixes[type]);
778 	}
779 	if (fuid_dirtied)
780 		zfs_fuid_txhold(zfsvfs, tx);
781 	err = dmu_tx_assign(tx, TXG_WAIT);
782 	if (err) {
783 		dmu_tx_abort(tx);
784 		return (err);
785 	}
786 
787 	mutex_enter(&zfsvfs->z_lock);
788 	if (*objp == 0) {
789 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
790 		    DMU_OT_NONE, 0, tx);
791 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
792 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
793 	}
794 	mutex_exit(&zfsvfs->z_lock);
795 
796 	if (quota == 0) {
797 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
798 		if (err == ENOENT)
799 			err = 0;
800 	} else {
801 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
802 	}
803 	ASSERT(err == 0);
804 	if (fuid_dirtied)
805 		zfs_fuid_sync(zfsvfs, tx);
806 	dmu_tx_commit(tx);
807 	return (err);
808 }
809 
810 boolean_t
zfs_fuid_overquota(zfsvfs_t * zfsvfs,boolean_t isgroup,uint64_t fuid)811 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
812 {
813 	char buf[32];
814 	uint64_t used, quota, usedobj, quotaobj;
815 	int err;
816 
817 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
818 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
819 
820 	if (quotaobj == 0 || zfsvfs->z_replay)
821 		return (B_FALSE);
822 
823 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
824 	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
825 	if (err != 0)
826 		return (B_FALSE);
827 
828 	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
829 	if (err != 0)
830 		return (B_FALSE);
831 	return (used >= quota);
832 }
833 
834 boolean_t
zfs_owner_overquota(zfsvfs_t * zfsvfs,znode_t * zp,boolean_t isgroup)835 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
836 {
837 	uint64_t fuid;
838 	uint64_t quotaobj;
839 
840 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
841 
842 	fuid = isgroup ? zp->z_gid : zp->z_uid;
843 
844 	if (quotaobj == 0 || zfsvfs->z_replay)
845 		return (B_FALSE);
846 
847 	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
848 }
849 
850 /*
851  * Associate this zfsvfs with the given objset, which must be owned.
852  * This will cache a bunch of on-disk state from the objset in the
853  * zfsvfs.
854  */
855 static int
zfsvfs_init(zfsvfs_t * zfsvfs,objset_t * os)856 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
857 {
858 	int error;
859 	uint64_t val;
860 
861 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
862 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
863 	zfsvfs->z_os = os;
864 
865 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
866 	if (error != 0)
867 		return (error);
868 	if (zfsvfs->z_version >
869 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
870 		(void) printf("Can't mount a version %lld file system "
871 		    "on a version %lld pool\n. Pool must be upgraded to mount "
872 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
873 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
874 		return (SET_ERROR(ENOTSUP));
875 	}
876 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
877 	if (error != 0)
878 		return (error);
879 	zfsvfs->z_norm = (int)val;
880 
881 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
882 	if (error != 0)
883 		return (error);
884 	zfsvfs->z_utf8 = (val != 0);
885 
886 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
887 	if (error != 0)
888 		return (error);
889 	zfsvfs->z_case = (uint_t)val;
890 
891 	/*
892 	 * Fold case on file systems that are always or sometimes case
893 	 * insensitive.
894 	 */
895 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
896 	    zfsvfs->z_case == ZFS_CASE_MIXED)
897 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
898 
899 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
900 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
901 
902 	uint64_t sa_obj = 0;
903 	if (zfsvfs->z_use_sa) {
904 		/* should either have both of these objects or none */
905 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
906 		    &sa_obj);
907 		if (error != 0)
908 			return (error);
909 	}
910 
911 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
912 	    &zfsvfs->z_attr_table);
913 	if (error != 0)
914 		return (error);
915 
916 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
917 		sa_register_update_callback(os, zfs_sa_upgrade);
918 
919 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
920 	    &zfsvfs->z_root);
921 	if (error != 0)
922 		return (error);
923 	ASSERT(zfsvfs->z_root != 0);
924 
925 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
926 	    &zfsvfs->z_unlinkedobj);
927 	if (error != 0)
928 		return (error);
929 
930 	error = zap_lookup(os, MASTER_NODE_OBJ,
931 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
932 	    8, 1, &zfsvfs->z_userquota_obj);
933 	if (error == ENOENT)
934 		zfsvfs->z_userquota_obj = 0;
935 	else if (error != 0)
936 		return (error);
937 
938 	error = zap_lookup(os, MASTER_NODE_OBJ,
939 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
940 	    8, 1, &zfsvfs->z_groupquota_obj);
941 	if (error == ENOENT)
942 		zfsvfs->z_groupquota_obj = 0;
943 	else if (error != 0)
944 		return (error);
945 
946 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
947 	    &zfsvfs->z_fuid_obj);
948 	if (error == ENOENT)
949 		zfsvfs->z_fuid_obj = 0;
950 	else if (error != 0)
951 		return (error);
952 
953 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
954 	    &zfsvfs->z_shares_dir);
955 	if (error == ENOENT)
956 		zfsvfs->z_shares_dir = 0;
957 	else if (error != 0)
958 		return (error);
959 
960 	/*
961 	 * Only use the name cache if we are looking for a
962 	 * name on a file system that does not require normalization
963 	 * or case folding.  We can also look there if we happen to be
964 	 * on a non-normalizing, mixed sensitivity file system IF we
965 	 * are looking for the exact name (which is always the case on
966 	 * FreeBSD).
967 	 */
968 	zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
969 	    ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
970 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
971 
972 	return (0);
973 }
974 
975 int
zfsvfs_create(const char * osname,zfsvfs_t ** zfvp)976 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
977 {
978 	objset_t *os;
979 	zfsvfs_t *zfsvfs;
980 	int error;
981 
982 	/*
983 	 * XXX: Fix struct statfs so this isn't necessary!
984 	 *
985 	 * The 'osname' is used as the filesystem's special node, which means
986 	 * it must fit in statfs.f_mntfromname, or else it can't be
987 	 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
988 	 * 'zfs unmount' to think it's not mounted when it is.
989 	 */
990 	if (strlen(osname) >= MNAMELEN)
991 		return (SET_ERROR(ENAMETOOLONG));
992 
993 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
994 
995 	/*
996 	 * We claim to always be readonly so we can open snapshots;
997 	 * other ZPL code will prevent us from writing to snapshots.
998 	 */
999 	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
1000 	if (error) {
1001 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
1002 		return (error);
1003 	}
1004 
1005 	zfsvfs->z_vfs = NULL;
1006 	zfsvfs->z_parent = zfsvfs;
1007 
1008 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1009 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1010 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1011 	    offsetof(znode_t, z_link_node));
1012 #ifdef DIAGNOSTIC
1013 	rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
1014 #else
1015 	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
1016 #endif
1017 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
1018 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1019 	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1020 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1021 
1022 	error = zfsvfs_init(zfsvfs, os);
1023 	if (error != 0) {
1024 		dmu_objset_disown(os, zfsvfs);
1025 		*zfvp = NULL;
1026 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
1027 		return (error);
1028 	}
1029 
1030 	*zfvp = zfsvfs;
1031 	return (0);
1032 }
1033 
1034 static int
zfsvfs_setup(zfsvfs_t * zfsvfs,boolean_t mounting)1035 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1036 {
1037 	int error;
1038 
1039 	error = zfs_register_callbacks(zfsvfs->z_vfs);
1040 	if (error)
1041 		return (error);
1042 
1043 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
1044 
1045 	/*
1046 	 * If we are not mounting (ie: online recv), then we don't
1047 	 * have to worry about replaying the log as we blocked all
1048 	 * operations out since we closed the ZIL.
1049 	 */
1050 	if (mounting) {
1051 		boolean_t readonly;
1052 
1053 		/*
1054 		 * During replay we remove the read only flag to
1055 		 * allow replays to succeed.
1056 		 */
1057 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1058 		if (readonly != 0)
1059 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1060 		else
1061 			zfs_unlinked_drain(zfsvfs);
1062 
1063 		/*
1064 		 * Parse and replay the intent log.
1065 		 *
1066 		 * Because of ziltest, this must be done after
1067 		 * zfs_unlinked_drain().  (Further note: ziltest
1068 		 * doesn't use readonly mounts, where
1069 		 * zfs_unlinked_drain() isn't called.)  This is because
1070 		 * ziltest causes spa_sync() to think it's committed,
1071 		 * but actually it is not, so the intent log contains
1072 		 * many txg's worth of changes.
1073 		 *
1074 		 * In particular, if object N is in the unlinked set in
1075 		 * the last txg to actually sync, then it could be
1076 		 * actually freed in a later txg and then reallocated
1077 		 * in a yet later txg.  This would write a "create
1078 		 * object N" record to the intent log.  Normally, this
1079 		 * would be fine because the spa_sync() would have
1080 		 * written out the fact that object N is free, before
1081 		 * we could write the "create object N" intent log
1082 		 * record.
1083 		 *
1084 		 * But when we are in ziltest mode, we advance the "open
1085 		 * txg" without actually spa_sync()-ing the changes to
1086 		 * disk.  So we would see that object N is still
1087 		 * allocated and in the unlinked set, and there is an
1088 		 * intent log record saying to allocate it.
1089 		 */
1090 		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1091 			if (zil_replay_disable) {
1092 				zil_destroy(zfsvfs->z_log, B_FALSE);
1093 			} else {
1094 				zfsvfs->z_replay = B_TRUE;
1095 				zil_replay(zfsvfs->z_os, zfsvfs,
1096 				    zfs_replay_vector);
1097 				zfsvfs->z_replay = B_FALSE;
1098 			}
1099 		}
1100 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
1101 	}
1102 
1103 	/*
1104 	 * Set the objset user_ptr to track its zfsvfs.
1105 	 */
1106 	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1107 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1108 	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1109 
1110 	return (0);
1111 }
1112 
1113 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1114 
1115 void
zfsvfs_free(zfsvfs_t * zfsvfs)1116 zfsvfs_free(zfsvfs_t *zfsvfs)
1117 {
1118 	int i;
1119 
1120 	/*
1121 	 * This is a barrier to prevent the filesystem from going away in
1122 	 * zfs_znode_move() until we can safely ensure that the filesystem is
1123 	 * not unmounted. We consider the filesystem valid before the barrier
1124 	 * and invalid after the barrier.
1125 	 */
1126 	rw_enter(&zfsvfs_lock, RW_READER);
1127 	rw_exit(&zfsvfs_lock);
1128 
1129 	zfs_fuid_destroy(zfsvfs);
1130 
1131 	mutex_destroy(&zfsvfs->z_znodes_lock);
1132 	mutex_destroy(&zfsvfs->z_lock);
1133 	list_destroy(&zfsvfs->z_all_znodes);
1134 	rrm_destroy(&zfsvfs->z_teardown_lock);
1135 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1136 	rw_destroy(&zfsvfs->z_fuid_lock);
1137 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1138 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1139 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1140 }
1141 
1142 static void
zfs_set_fuid_feature(zfsvfs_t * zfsvfs)1143 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1144 {
1145 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1146 	if (zfsvfs->z_vfs) {
1147 		if (zfsvfs->z_use_fuids) {
1148 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1149 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1150 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1151 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1152 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1153 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1154 		} else {
1155 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1156 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1157 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1158 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1159 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1160 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1161 		}
1162 	}
1163 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1164 }
1165 
1166 static int
zfs_domount(vfs_t * vfsp,char * osname)1167 zfs_domount(vfs_t *vfsp, char *osname)
1168 {
1169 	uint64_t recordsize, fsid_guid;
1170 	int error = 0;
1171 	zfsvfs_t *zfsvfs;
1172 	vnode_t *vp;
1173 
1174 	ASSERT(vfsp);
1175 	ASSERT(osname);
1176 
1177 	error = zfsvfs_create(osname, &zfsvfs);
1178 	if (error)
1179 		return (error);
1180 	zfsvfs->z_vfs = vfsp;
1181 
1182 #ifdef illumos
1183 	/* Initialize the generic filesystem structure. */
1184 	vfsp->vfs_bcount = 0;
1185 	vfsp->vfs_data = NULL;
1186 
1187 	if (zfs_create_unique_device(&mount_dev) == -1) {
1188 		error = SET_ERROR(ENODEV);
1189 		goto out;
1190 	}
1191 	ASSERT(vfs_devismounted(mount_dev) == 0);
1192 #endif
1193 
1194 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1195 	    NULL))
1196 		goto out;
1197 	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
1198 	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
1199 
1200 	vfsp->vfs_data = zfsvfs;
1201 	vfsp->mnt_flag |= MNT_LOCAL;
1202 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
1203 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
1204 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
1205 	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
1206 
1207 	/*
1208 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
1209 	 * separates our fsid from any other filesystem types, and a
1210 	 * 56-bit objset unique ID.  The objset unique ID is unique to
1211 	 * all objsets open on this system, provided by unique_create().
1212 	 * The 8-bit fs type must be put in the low bits of fsid[1]
1213 	 * because that's where other Solaris filesystems put it.
1214 	 */
1215 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1216 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1217 	vfsp->vfs_fsid.val[0] = fsid_guid;
1218 	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1219 	    vfsp->mnt_vfc->vfc_typenum & 0xFF;
1220 
1221 	/*
1222 	 * Set features for file system.
1223 	 */
1224 	zfs_set_fuid_feature(zfsvfs);
1225 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1226 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1227 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1228 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1229 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1230 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1231 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1232 	}
1233 	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1234 
1235 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1236 		uint64_t pval;
1237 
1238 		atime_changed_cb(zfsvfs, B_FALSE);
1239 		readonly_changed_cb(zfsvfs, B_TRUE);
1240 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1241 			goto out;
1242 		xattr_changed_cb(zfsvfs, pval);
1243 		zfsvfs->z_issnap = B_TRUE;
1244 		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1245 
1246 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1247 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1248 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1249 	} else {
1250 		error = zfsvfs_setup(zfsvfs, B_TRUE);
1251 	}
1252 
1253 	vfs_mountedfrom(vfsp, osname);
1254 
1255 	if (!zfsvfs->z_issnap)
1256 		zfsctl_create(zfsvfs);
1257 out:
1258 	if (error) {
1259 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
1260 		zfsvfs_free(zfsvfs);
1261 	} else {
1262 		atomic_inc_32(&zfs_active_fs_count);
1263 	}
1264 
1265 	return (error);
1266 }
1267 
1268 void
zfs_unregister_callbacks(zfsvfs_t * zfsvfs)1269 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1270 {
1271 	objset_t *os = zfsvfs->z_os;
1272 
1273 	if (!dmu_objset_is_snapshot(os))
1274 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1275 }
1276 
1277 #ifdef SECLABEL
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * Returns 0 and stores the value in *objnum on success.  Returns EINVAL,
 * leaving *objnum untouched, if 'str' contains a character that is not a
 * decimal digit or if the value does not fit in 64 bits.  An empty string
 * converts to 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	const uint64_t max = ~(uint64_t)0;
	uint64_t num = 0;

	while (*str) {
		uint64_t digit;

		if (*str < '0' || *str > '9')
			return (SET_ERROR(EINVAL));
		digit = *str++ - '0';

		/*
		 * num * 10 + digit would exceed the 64-bit range; reject
		 * the string rather than silently wrapping around to an
		 * unrelated number.
		 */
		if (num > (max - digit) / 10)
			return (SET_ERROR(EINVAL));

		num = num * 10 + digit;
	}

	*objnum = num;
	return (0);
}
1296 
/*
 * The boot path handed over by the boot loader has the form
 * "rootpool-name/root-filesystem-object-number".  Convert it to a dataset
 * name of the form "rootpool-name/root-filesystem-name" in 'outpath'.
 *
 * 'outpath' always starts out as a copy of 'bpath'; it is only replaced
 * by the result of dsl_dsobj_to_dsname() when the part after the first
 * '/' is a decimal number.  'bpath' is modified temporarily but restored
 * before returning.
 *
 * Returns EINVAL for an empty path or one starting with '/', otherwise 0
 * or the error from dsl_dsobj_to_dsname().
 */
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
	char *sep;
	uint64_t objnum;
	int error;

	if (bpath[0] == '\0' || bpath[0] == '/')
		return (SET_ERROR(EINVAL));

	(void) strcpy(outpath, bpath);

	/*
	 * With no '/' the path is just the pool name, and when what follows
	 * the '/' is not a number it is already a dataset name.  Either way
	 * the copy made above is the answer.
	 */
	sep = strchr(bpath, '/');
	if (sep == NULL || str_to_uint64(sep + 1, &objnum) != 0)
		return (0);

	/* Split off the pool name just long enough to do the lookup. */
	*sep = '\0';
	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
	*sep = '/';

	return (error);
}
1332 
1333 /*
1334  * Check that the hex label string is appropriate for the dataset being
1335  * mounted into the global_zone proper.
1336  *
1337  * Return an error if the hex label string is not default or
1338  * admin_low/admin_high.  For admin_low labels, the corresponding
1339  * dataset must be readonly.
1340  */
1341 int
zfs_check_global_label(const char * dsname,const char * hexsl)1342 zfs_check_global_label(const char *dsname, const char *hexsl)
1343 {
1344 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1345 		return (0);
1346 	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1347 		return (0);
1348 	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1349 		/* must be readonly */
1350 		uint64_t rdonly;
1351 
1352 		if (dsl_prop_get_integer(dsname,
1353 		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1354 			return (SET_ERROR(EACCES));
1355 		return (rdonly ? 0 : EACCES);
1356 	}
1357 	return (SET_ERROR(EACCES));
1358 }
1359 
1360 /*
1361  * Determine whether the mount is allowed according to MAC check.
1362  * by comparing (where appropriate) label of the dataset against
1363  * the label of the zone being mounted into.  If the dataset has
1364  * no label, create one.
1365  *
1366  * Returns 0 if access allowed, error otherwise (e.g. EACCES)
1367  */
1368 static int
zfs_mount_label_policy(vfs_t * vfsp,char * osname)1369 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
1370 {
1371 	int		error, retv;
1372 	zone_t		*mntzone = NULL;
1373 	ts_label_t	*mnt_tsl;
1374 	bslabel_t	*mnt_sl;
1375 	bslabel_t	ds_sl;
1376 	char		ds_hexsl[MAXNAMELEN];
1377 
1378 	retv = EACCES;				/* assume the worst */
1379 
1380 	/*
1381 	 * Start by getting the dataset label if it exists.
1382 	 */
1383 	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1384 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
1385 	if (error)
1386 		return (SET_ERROR(EACCES));
1387 
1388 	/*
1389 	 * If labeling is NOT enabled, then disallow the mount of datasets
1390 	 * which have a non-default label already.  No other label checks
1391 	 * are needed.
1392 	 */
1393 	if (!is_system_labeled()) {
1394 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1395 			return (0);
1396 		return (SET_ERROR(EACCES));
1397 	}
1398 
1399 	/*
1400 	 * Get the label of the mountpoint.  If mounting into the global
1401 	 * zone (i.e. mountpoint is not within an active zone and the
1402 	 * zoned property is off), the label must be default or
1403 	 * admin_low/admin_high only; no other checks are needed.
1404 	 */
1405 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
1406 	if (mntzone->zone_id == GLOBAL_ZONEID) {
1407 		uint64_t zoned;
1408 
1409 		zone_rele(mntzone);
1410 
1411 		if (dsl_prop_get_integer(osname,
1412 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
1413 			return (SET_ERROR(EACCES));
1414 		if (!zoned)
1415 			return (zfs_check_global_label(osname, ds_hexsl));
1416 		else
1417 			/*
1418 			 * This is the case of a zone dataset being mounted
1419 			 * initially, before the zone has been fully created;
1420 			 * allow this mount into global zone.
1421 			 */
1422 			return (0);
1423 	}
1424 
1425 	mnt_tsl = mntzone->zone_slabel;
1426 	ASSERT(mnt_tsl != NULL);
1427 	label_hold(mnt_tsl);
1428 	mnt_sl = label2bslabel(mnt_tsl);
1429 
1430 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
1431 		/*
1432 		 * The dataset doesn't have a real label, so fabricate one.
1433 		 */
1434 		char *str = NULL;
1435 
1436 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
1437 		    dsl_prop_set_string(osname,
1438 		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1439 		    ZPROP_SRC_LOCAL, str) == 0)
1440 			retv = 0;
1441 		if (str != NULL)
1442 			kmem_free(str, strlen(str) + 1);
1443 	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
1444 		/*
1445 		 * Now compare labels to complete the MAC check.  If the
1446 		 * labels are equal then allow access.  If the mountpoint
1447 		 * label dominates the dataset label, allow readonly access.
1448 		 * Otherwise, access is denied.
1449 		 */
1450 		if (blequal(mnt_sl, &ds_sl))
1451 			retv = 0;
1452 		else if (bldominates(mnt_sl, &ds_sl)) {
1453 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1454 			retv = 0;
1455 		}
1456 	}
1457 
1458 	label_rele(mnt_tsl);
1459 	zone_rele(mntzone);
1460 	return (retv);
1461 }
1462 #endif	/* SECLABEL */
1463 
1464 #ifdef OPENSOLARIS_MOUNTROOT
1465 static int
zfs_mountroot(vfs_t * vfsp,enum whymountroot why)1466 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
1467 {
1468 	int error = 0;
1469 	static int zfsrootdone = 0;
1470 	zfsvfs_t *zfsvfs = NULL;
1471 	znode_t *zp = NULL;
1472 	vnode_t *vp = NULL;
1473 	char *zfs_bootfs;
1474 	char *zfs_devid;
1475 
1476 	ASSERT(vfsp);
1477 
1478 	/*
1479 	 * The filesystem that we mount as root is defined in the
1480 	 * boot property "zfs-bootfs" with a format of
1481 	 * "poolname/root-dataset-objnum".
1482 	 */
1483 	if (why == ROOT_INIT) {
1484 		if (zfsrootdone++)
1485 			return (SET_ERROR(EBUSY));
1486 		/*
1487 		 * the process of doing a spa_load will require the
1488 		 * clock to be set before we could (for example) do
1489 		 * something better by looking at the timestamp on
1490 		 * an uberblock, so just set it to -1.
1491 		 */
1492 		clkset(-1);
1493 
1494 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
1495 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
1496 			    "bootfs name");
1497 			return (SET_ERROR(EINVAL));
1498 		}
1499 		zfs_devid = spa_get_bootprop("diskdevid");
1500 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
1501 		if (zfs_devid)
1502 			spa_free_bootprop(zfs_devid);
1503 		if (error) {
1504 			spa_free_bootprop(zfs_bootfs);
1505 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
1506 			    error);
1507 			return (error);
1508 		}
1509 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1510 			spa_free_bootprop(zfs_bootfs);
1511 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1512 			    error);
1513 			return (error);
1514 		}
1515 
1516 		spa_free_bootprop(zfs_bootfs);
1517 
1518 		if (error = vfs_lock(vfsp))
1519 			return (error);
1520 
1521 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1522 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1523 			goto out;
1524 		}
1525 
1526 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1527 		ASSERT(zfsvfs);
1528 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1529 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1530 			goto out;
1531 		}
1532 
1533 		vp = ZTOV(zp);
1534 		mutex_enter(&vp->v_lock);
1535 		vp->v_flag |= VROOT;
1536 		mutex_exit(&vp->v_lock);
1537 		rootvp = vp;
1538 
1539 		/*
1540 		 * Leave rootvp held.  The root file system is never unmounted.
1541 		 */
1542 
1543 		vfs_add((struct vnode *)0, vfsp,
1544 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1545 out:
1546 		vfs_unlock(vfsp);
1547 		return (error);
1548 	} else if (why == ROOT_REMOUNT) {
1549 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1550 		vfsp->vfs_flag |= VFS_REMOUNT;
1551 
1552 		/* refresh mount options */
1553 		zfs_unregister_callbacks(vfsp->vfs_data);
1554 		return (zfs_register_callbacks(vfsp));
1555 
1556 	} else if (why == ROOT_UNMOUNT) {
1557 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1558 		(void) zfs_sync(vfsp, 0, 0);
1559 		return (0);
1560 	}
1561 
1562 	/*
1563 	 * if "why" is equal to anything else other than ROOT_INIT,
1564 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
1565 	 */
1566 	return (SET_ERROR(ENOTSUP));
1567 }
1568 #endif	/* OPENSOLARIS_MOUNTROOT */
1569 
1570 static int
getpoolname(const char * osname,char * poolname)1571 getpoolname(const char *osname, char *poolname)
1572 {
1573 	char *p;
1574 
1575 	p = strchr(osname, '/');
1576 	if (p == NULL) {
1577 		if (strlen(osname) >= MAXNAMELEN)
1578 			return (ENAMETOOLONG);
1579 		(void) strcpy(poolname, osname);
1580 	} else {
1581 		if (p - osname >= MAXNAMELEN)
1582 			return (ENAMETOOLONG);
1583 		(void) strncpy(poolname, osname, p - osname);
1584 		poolname[p - osname] = '\0';
1585 	}
1586 	return (0);
1587 }
1588 
1589 /*ARGSUSED*/
1590 static int
zfs_mount(vfs_t * vfsp)1591 zfs_mount(vfs_t *vfsp)
1592 {
1593 	kthread_t	*td = curthread;
1594 	vnode_t		*mvp = vfsp->mnt_vnodecovered;
1595 	cred_t		*cr = td->td_ucred;
1596 	char		*osname;
1597 	int		error = 0;
1598 	int		canwrite;
1599 
1600 #ifdef illumos
1601 	if (mvp->v_type != VDIR)
1602 		return (SET_ERROR(ENOTDIR));
1603 
1604 	mutex_enter(&mvp->v_lock);
1605 	if ((uap->flags & MS_REMOUNT) == 0 &&
1606 	    (uap->flags & MS_OVERLAY) == 0 &&
1607 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
1608 		mutex_exit(&mvp->v_lock);
1609 		return (SET_ERROR(EBUSY));
1610 	}
1611 	mutex_exit(&mvp->v_lock);
1612 
1613 	/*
1614 	 * ZFS does not support passing unparsed data in via MS_DATA.
1615 	 * Users should use the MS_OPTIONSTR interface; this means
1616 	 * that all option parsing is already done and the options struct
1617 	 * can be interrogated.
1618 	 */
1619 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
1620 		return (SET_ERROR(EINVAL));
1621 
1622 	/*
1623 	 * Get the objset name (the "special" mount argument).
1624 	 */
1625 	if (error = pn_get(uap->spec, fromspace, &spn))
1626 		return (error);
1627 
1628 	osname = spn.pn_path;
1629 #else	/* !illumos */
1630 	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
1631 		return (SET_ERROR(EPERM));
1632 
1633 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
1634 		return (SET_ERROR(EINVAL));
1635 
1636 	/*
1637 	 * If full-owner-access is enabled and delegated administration is
1638 	 * turned on, we must set nosuid.
1639 	 */
1640 	if (zfs_super_owner &&
1641 	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
1642 		secpolicy_fs_mount_clearopts(cr, vfsp);
1643 	}
1644 #endif	/* illumos */
1645 
1646 	/*
1647 	 * Check for mount privilege?
1648 	 *
1649 	 * If we don't have privilege then see if
1650 	 * we have local permission to allow it
1651 	 */
1652 	error = secpolicy_fs_mount(cr, mvp, vfsp);
1653 	if (error) {
1654 		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
1655 			goto out;
1656 
1657 		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
1658 			vattr_t		vattr;
1659 
1660 			/*
1661 			 * Make sure user is the owner of the mount point
1662 			 * or has sufficient privileges.
1663 			 */
1664 
1665 			vattr.va_mask = AT_UID;
1666 
1667 			vn_lock(mvp, LK_SHARED | LK_RETRY);
1668 			if (VOP_GETATTR(mvp, &vattr, cr)) {
1669 				VOP_UNLOCK(mvp, 0);
1670 				goto out;
1671 			}
1672 
1673 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
1674 			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
1675 				VOP_UNLOCK(mvp, 0);
1676 				goto out;
1677 			}
1678 			VOP_UNLOCK(mvp, 0);
1679 		}
1680 
1681 		secpolicy_fs_mount_clearopts(cr, vfsp);
1682 	}
1683 
1684 	/*
1685 	 * Refuse to mount a filesystem if we are in a local zone and the
1686 	 * dataset is not visible.
1687 	 */
1688 	if (!INGLOBALZONE(curthread) &&
1689 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1690 		error = SET_ERROR(EPERM);
1691 		goto out;
1692 	}
1693 
1694 #ifdef SECLABEL
1695 	error = zfs_mount_label_policy(vfsp, osname);
1696 	if (error)
1697 		goto out;
1698 #endif
1699 
1700 	vfsp->vfs_flag |= MNT_NFS4ACLS;
1701 
1702 	/*
1703 	 * When doing a remount, we simply refresh our temporary properties
1704 	 * according to those options set in the current VFS options.
1705 	 */
1706 	if (vfsp->vfs_flag & MS_REMOUNT) {
1707 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
1708 
1709 		/*
1710 		 * Refresh mount options with z_teardown_lock blocking I/O while
1711 		 * the filesystem is in an inconsistent state.
1712 		 * The lock also serializes this code with filesystem
1713 		 * manipulations between entry to zfs_suspend_fs() and return
1714 		 * from zfs_resume_fs().
1715 		 */
1716 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1717 		zfs_unregister_callbacks(zfsvfs);
1718 		error = zfs_register_callbacks(vfsp);
1719 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1720 		goto out;
1721 	}
1722 
1723 	/* Initial root mount: try hard to import the requested root pool. */
1724 	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
1725 	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
1726 		char pname[MAXNAMELEN];
1727 
1728 		error = getpoolname(osname, pname);
1729 		if (error == 0)
1730 			error = spa_import_rootpool(pname);
1731 		if (error)
1732 			goto out;
1733 	}
1734 	DROP_GIANT();
1735 	error = zfs_domount(vfsp, osname);
1736 	PICKUP_GIANT();
1737 
1738 #ifdef illumos
1739 	/*
1740 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
1741 	 * disappear due to a forced unmount.
1742 	 */
1743 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1744 		VFS_HOLD(mvp->v_vfsp);
1745 #endif
1746 
1747 out:
1748 	return (error);
1749 }
1750 
1751 static int
zfs_statfs(vfs_t * vfsp,struct statfs * statp)1752 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1753 {
1754 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1755 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
1756 
1757 	statp->f_version = STATFS_VERSION;
1758 
1759 	ZFS_ENTER(zfsvfs);
1760 
1761 	dmu_objset_space(zfsvfs->z_os,
1762 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
1763 
1764 	/*
1765 	 * The underlying storage pool actually uses multiple block sizes.
1766 	 * We report the fragsize as the smallest block size we support,
1767 	 * and we report our blocksize as the filesystem's maximum blocksize.
1768 	 */
1769 	statp->f_bsize = SPA_MINBLOCKSIZE;
1770 	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1771 
1772 	/*
1773 	 * The following report "total" blocks of various kinds in the
1774 	 * file system, but reported in terms of f_frsize - the
1775 	 * "fragment" size.
1776 	 */
1777 
1778 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1779 	statp->f_bfree = availbytes / statp->f_bsize;
1780 	statp->f_bavail = statp->f_bfree; /* no root reservation */
1781 
1782 	/*
1783 	 * statvfs() should really be called statufs(), because it assumes
1784 	 * static metadata.  ZFS doesn't preallocate files, so the best
1785 	 * we can do is report the max that could possibly fit in f_files,
1786 	 * and that minus the number actually used in f_ffree.
1787 	 * For f_ffree, report the smaller of the number of object available
1788 	 * and the number of blocks (each object will take at least a block).
1789 	 */
1790 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
1791 	statp->f_files = statp->f_ffree + usedobjs;
1792 
1793 	/*
1794 	 * We're a zfs filesystem.
1795 	 */
1796 	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
1797 
1798 	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1799 	    sizeof(statp->f_mntfromname));
1800 	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1801 	    sizeof(statp->f_mntonname));
1802 
1803 	statp->f_namemax = MAXNAMELEN - 1;
1804 
1805 	ZFS_EXIT(zfsvfs);
1806 	return (0);
1807 }
1808 
1809 static int
zfs_root(vfs_t * vfsp,int flags,vnode_t ** vpp)1810 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1811 {
1812 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1813 	znode_t *rootzp;
1814 	int error;
1815 
1816 	ZFS_ENTER(zfsvfs);
1817 
1818 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1819 	if (error == 0)
1820 		*vpp = ZTOV(rootzp);
1821 
1822 	ZFS_EXIT(zfsvfs);
1823 
1824 	if (error == 0) {
1825 		error = vn_lock(*vpp, flags);
1826 		if (error != 0) {
1827 			VN_RELE(*vpp);
1828 			*vpp = NULL;
1829 		}
1830 	}
1831 	return (error);
1832 }
1833 
1834 /*
1835  * Teardown the zfsvfs::z_os.
1836  *
1837  * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
1838  * and 'z_teardown_inactive_lock' held.
1839  */
1840 static int
zfsvfs_teardown(zfsvfs_t * zfsvfs,boolean_t unmounting)1841 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1842 {
1843 	znode_t	*zp;
1844 
1845 	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1846 
1847 	if (!unmounting) {
1848 		/*
1849 		 * We purge the parent filesystem's vfsp as the parent
1850 		 * filesystem and all of its snapshots have their vnode's
1851 		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
1852 		 * 'z_parent' is self referential for non-snapshots.
1853 		 */
1854 		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1855 #ifdef FREEBSD_NAMECACHE
1856 		cache_purgevfs(zfsvfs->z_parent->z_vfs);
1857 #endif
1858 	}
1859 
1860 	/*
1861 	 * Close the zil. NB: Can't close the zil while zfs_inactive
1862 	 * threads are blocked as zil_close can call zfs_inactive.
1863 	 */
1864 	if (zfsvfs->z_log) {
1865 		zil_close(zfsvfs->z_log);
1866 		zfsvfs->z_log = NULL;
1867 	}
1868 
1869 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
1870 
1871 	/*
1872 	 * If we are not unmounting (ie: online recv) and someone already
1873 	 * unmounted this file system while we were doing the switcheroo,
1874 	 * or a reopen of z_os failed then just bail out now.
1875 	 */
1876 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1877 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
1878 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1879 		return (SET_ERROR(EIO));
1880 	}
1881 
1882 	/*
1883 	 * At this point there are no vops active, and any new vops will
1884 	 * fail with EIO since we have z_teardown_lock for writer (only
1885 	 * relavent for forced unmount).
1886 	 *
1887 	 * Release all holds on dbufs.
1888 	 */
1889 	mutex_enter(&zfsvfs->z_znodes_lock);
1890 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1891 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
1892 		if (zp->z_sa_hdl) {
1893 			ASSERT(ZTOV(zp)->v_count >= 0);
1894 			zfs_znode_dmu_fini(zp);
1895 		}
1896 	mutex_exit(&zfsvfs->z_znodes_lock);
1897 
1898 	/*
1899 	 * If we are unmounting, set the unmounted flag and let new vops
1900 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
1901 	 * other vops will fail with EIO.
1902 	 */
1903 	if (unmounting) {
1904 		zfsvfs->z_unmounted = B_TRUE;
1905 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1906 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
1907 	}
1908 
1909 	/*
1910 	 * z_os will be NULL if there was an error in attempting to reopen
1911 	 * zfsvfs, so just return as the properties had already been
1912 	 * unregistered and cached data had been evicted before.
1913 	 */
1914 	if (zfsvfs->z_os == NULL)
1915 		return (0);
1916 
1917 	/*
1918 	 * Unregister properties.
1919 	 */
1920 	zfs_unregister_callbacks(zfsvfs);
1921 
1922 	/*
1923 	 * Evict cached data
1924 	 */
1925 	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
1926 	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
1927 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1928 	dmu_objset_evict_dbufs(zfsvfs->z_os);
1929 
1930 	return (0);
1931 }
1932 
1933 /*ARGSUSED*/
1934 static int
zfs_umount(vfs_t * vfsp,int fflag)1935 zfs_umount(vfs_t *vfsp, int fflag)
1936 {
1937 	kthread_t *td = curthread;
1938 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1939 	objset_t *os;
1940 	cred_t *cr = td->td_ucred;
1941 	int ret;
1942 
1943 	ret = secpolicy_fs_unmount(cr, vfsp);
1944 	if (ret) {
1945 		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1946 		    ZFS_DELEG_PERM_MOUNT, cr))
1947 			return (ret);
1948 	}
1949 
1950 	/*
1951 	 * We purge the parent filesystem's vfsp as the parent filesystem
1952 	 * and all of its snapshots have their vnode's v_vfsp set to the
1953 	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
1954 	 * referential for non-snapshots.
1955 	 */
1956 	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1957 
1958 	/*
1959 	 * Unmount any snapshots mounted under .zfs before unmounting the
1960 	 * dataset itself.
1961 	 */
1962 	if (zfsvfs->z_ctldir != NULL) {
1963 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
1964 			return (ret);
1965 	}
1966 
1967 	if (fflag & MS_FORCE) {
1968 		/*
1969 		 * Mark file system as unmounted before calling
1970 		 * vflush(FORCECLOSE). This way we ensure no future vnops
1971 		 * will be called and risk operating on DOOMED vnodes.
1972 		 */
1973 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1974 		zfsvfs->z_unmounted = B_TRUE;
1975 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
1976 	}
1977 
1978 	/*
1979 	 * Flush all the files.
1980 	 */
1981 	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
1982 	if (ret != 0)
1983 		return (ret);
1984 
1985 #ifdef illumos
1986 	if (!(fflag & MS_FORCE)) {
1987 		/*
1988 		 * Check the number of active vnodes in the file system.
1989 		 * Our count is maintained in the vfs structure, but the
1990 		 * number is off by 1 to indicate a hold on the vfs
1991 		 * structure itself.
1992 		 *
1993 		 * The '.zfs' directory maintains a reference of its
1994 		 * own, and any active references underneath are
1995 		 * reflected in the vnode count.
1996 		 */
1997 		if (zfsvfs->z_ctldir == NULL) {
1998 			if (vfsp->vfs_count > 1)
1999 				return (SET_ERROR(EBUSY));
2000 		} else {
2001 			if (vfsp->vfs_count > 2 ||
2002 			    zfsvfs->z_ctldir->v_count > 1)
2003 				return (SET_ERROR(EBUSY));
2004 		}
2005 	}
2006 #endif
2007 
2008 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
2009 	os = zfsvfs->z_os;
2010 
2011 	/*
2012 	 * z_os will be NULL if there was an error in
2013 	 * attempting to reopen zfsvfs.
2014 	 */
2015 	if (os != NULL) {
2016 		/*
2017 		 * Unset the objset user_ptr.
2018 		 */
2019 		mutex_enter(&os->os_user_ptr_lock);
2020 		dmu_objset_set_user(os, NULL);
2021 		mutex_exit(&os->os_user_ptr_lock);
2022 
2023 		/*
2024 		 * Finally release the objset
2025 		 */
2026 		dmu_objset_disown(os, zfsvfs);
2027 	}
2028 
2029 	/*
2030 	 * We can now safely destroy the '.zfs' directory node.
2031 	 */
2032 	if (zfsvfs->z_ctldir != NULL)
2033 		zfsctl_destroy(zfsvfs);
2034 	zfs_freevfs(vfsp);
2035 
2036 	return (0);
2037 }
2038 
2039 static int
zfs_vget(vfs_t * vfsp,ino_t ino,int flags,vnode_t ** vpp)2040 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
2041 {
2042 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
2043 	znode_t		*zp;
2044 	int 		err;
2045 
2046 	/*
2047 	 * zfs_zget() can't operate on virtual entries like .zfs/ or
2048 	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
2049 	 * This will make NFS to switch to LOOKUP instead of using VGET.
2050 	 */
2051 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
2052 	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
2053 		return (EOPNOTSUPP);
2054 
2055 	ZFS_ENTER(zfsvfs);
2056 	err = zfs_zget(zfsvfs, ino, &zp);
2057 	if (err == 0 && zp->z_unlinked) {
2058 		vrele(ZTOV(zp));
2059 		err = EINVAL;
2060 	}
2061 	if (err == 0)
2062 		*vpp = ZTOV(zp);
2063 	ZFS_EXIT(zfsvfs);
2064 	if (err == 0)
2065 		err = vn_lock(*vpp, flags);
2066 	if (err != 0)
2067 		*vpp = NULL;
2068 	return (err);
2069 }
2070 
2071 static int
zfs_checkexp(vfs_t * vfsp,struct sockaddr * nam,int * extflagsp,struct ucred ** credanonp,int * numsecflavors,int ** secflavors)2072 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
2073     struct ucred **credanonp, int *numsecflavors, int **secflavors)
2074 {
2075 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2076 
2077 	/*
2078 	 * If this is regular file system vfsp is the same as
2079 	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
2080 	 * zfsvfs->z_parent->z_vfs represents parent file system
2081 	 * which we have to use here, because only this file system
2082 	 * has mnt_export configured.
2083 	 */
2084 	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
2085 	    credanonp, numsecflavors, secflavors));
2086 }
2087 
2088 CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
2089 CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
2090 
2091 static int
zfs_fhtovp(vfs_t * vfsp,fid_t * fidp,int flags,vnode_t ** vpp)2092 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
2093 {
2094 	struct componentname cn;
2095 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
2096 	znode_t		*zp;
2097 	vnode_t		*dvp;
2098 	uint64_t	object = 0;
2099 	uint64_t	fid_gen = 0;
2100 	uint64_t	gen_mask;
2101 	uint64_t	zp_gen;
2102 	int 		i, err;
2103 
2104 	*vpp = NULL;
2105 
2106 	ZFS_ENTER(zfsvfs);
2107 
2108 	/*
2109 	 * On FreeBSD we can get snapshot's mount point or its parent file
2110 	 * system mount point depending if snapshot is already mounted or not.
2111 	 */
2112 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
2113 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
2114 		uint64_t	objsetid = 0;
2115 		uint64_t	setgen = 0;
2116 
2117 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
2118 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
2119 
2120 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
2121 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
2122 
2123 		ZFS_EXIT(zfsvfs);
2124 
2125 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
2126 		if (err)
2127 			return (SET_ERROR(EINVAL));
2128 		ZFS_ENTER(zfsvfs);
2129 	}
2130 
2131 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
2132 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
2133 
2134 		for (i = 0; i < sizeof (zfid->zf_object); i++)
2135 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
2136 
2137 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
2138 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
2139 	} else {
2140 		ZFS_EXIT(zfsvfs);
2141 		return (SET_ERROR(EINVAL));
2142 	}
2143 
2144 	/*
2145 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
2146 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
2147 	 * we are in the .zfs/shares directory tree.
2148 	 */
2149 	if ((fid_gen == 0 &&
2150 	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
2151 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
2152 		ZFS_EXIT(zfsvfs);
2153 		VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
2154 		if (object == ZFSCTL_INO_SNAPDIR) {
2155 			cn.cn_nameptr = "snapshot";
2156 			cn.cn_namelen = strlen(cn.cn_nameptr);
2157 			cn.cn_nameiop = LOOKUP;
2158 			cn.cn_flags = ISLASTCN | LOCKLEAF;
2159 			cn.cn_lkflags = flags;
2160 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
2161 			vput(dvp);
2162 		} else if (object == zfsvfs->z_shares_dir) {
2163 			/*
2164 			 * XXX This branch must not be taken,
2165 			 * if it is, then the lookup below will
2166 			 * explode.
2167 			 */
2168 			cn.cn_nameptr = "shares";
2169 			cn.cn_namelen = strlen(cn.cn_nameptr);
2170 			cn.cn_nameiop = LOOKUP;
2171 			cn.cn_flags = ISLASTCN;
2172 			cn.cn_lkflags = flags;
2173 			VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
2174 			vput(dvp);
2175 		} else {
2176 			*vpp = dvp;
2177 		}
2178 		return (err);
2179 	}
2180 
2181 	gen_mask = -1ULL >> (64 - 8 * i);
2182 
2183 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
2184 	if (err = zfs_zget(zfsvfs, object, &zp)) {
2185 		ZFS_EXIT(zfsvfs);
2186 		return (err);
2187 	}
2188 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
2189 	    sizeof (uint64_t));
2190 	zp_gen = zp_gen & gen_mask;
2191 	if (zp_gen == 0)
2192 		zp_gen = 1;
2193 	if (zp->z_unlinked || zp_gen != fid_gen) {
2194 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
2195 		vrele(ZTOV(zp));
2196 		ZFS_EXIT(zfsvfs);
2197 		return (SET_ERROR(EINVAL));
2198 	}
2199 
2200 	*vpp = ZTOV(zp);
2201 	ZFS_EXIT(zfsvfs);
2202 	err = vn_lock(*vpp, flags);
2203 	if (err == 0)
2204 		vnode_create_vobject(*vpp, zp->z_size, curthread);
2205 	else
2206 		*vpp = NULL;
2207 	return (err);
2208 }
2209 
2210 /*
2211  * Block out VOPs and close zfsvfs_t::z_os
2212  *
2213  * Note, if successful, then we return with the 'z_teardown_lock' and
2214  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
2215  * dataset and objset intact so that they can be atomically handed off during
2216  * a subsequent rollback or recv operation and the resume thereafter.
2217  */
2218 int
zfs_suspend_fs(zfsvfs_t * zfsvfs)2219 zfs_suspend_fs(zfsvfs_t *zfsvfs)
2220 {
2221 	int error;
2222 
2223 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2224 		return (error);
2225 
2226 	return (0);
2227 }
2228 
2229 /*
2230  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
2231  * is an invariant across any of the operations that can be performed while the
2232  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
2233  * are the same: the relevant objset and associated dataset are owned by
2234  * zfsvfs, held, and long held on entry.
2235  */
2236 int
zfs_resume_fs(zfsvfs_t * zfsvfs,dsl_dataset_t * ds)2237 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2238 {
2239 	int err;
2240 	znode_t *zp;
2241 
2242 	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
2243 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
2244 
2245 	/*
2246 	 * We already own this, so just update the objset_t, as the one we
2247 	 * had before may have been evicted.
2248 	 */
2249 	objset_t *os;
2250 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
2251 	VERIFY(dsl_dataset_long_held(ds));
2252 	VERIFY0(dmu_objset_from_ds(ds, &os));
2253 
2254 	err = zfsvfs_init(zfsvfs, os);
2255 	if (err != 0)
2256 		goto bail;
2257 
2258 	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
2259 
2260 	zfs_set_fuid_feature(zfsvfs);
2261 
2262 	/*
2263 	 * Attempt to re-establish all the active znodes with
2264 	 * their dbufs.  If a zfs_rezget() fails, then we'll let
2265 	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
2266 	 * when they try to use their znode.
2267 	 */
2268 	mutex_enter(&zfsvfs->z_znodes_lock);
2269 	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2270 	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2271 		(void) zfs_rezget(zp);
2272 	}
2273 	mutex_exit(&zfsvfs->z_znodes_lock);
2274 
2275 bail:
2276 	/* release the VOPs */
2277 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
2278 	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
2279 
2280 	if (err) {
2281 		/*
2282 		 * Since we couldn't setup the sa framework, try to force
2283 		 * unmount this file system.
2284 		 */
2285 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
2286 			vfs_ref(zfsvfs->z_vfs);
2287 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
2288 		}
2289 	}
2290 	return (err);
2291 }
2292 
2293 static void
zfs_freevfs(vfs_t * vfsp)2294 zfs_freevfs(vfs_t *vfsp)
2295 {
2296 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2297 
2298 #ifdef illumos
2299 	/*
2300 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
2301 	 * from zfs_mount().  Release it here.  If we came through
2302 	 * zfs_mountroot() instead, we didn't grab an extra hold, so
2303 	 * skip the VFS_RELE for rootvfs.
2304 	 */
2305 	if (zfsvfs->z_issnap && (vfsp != rootvfs))
2306 		VFS_RELE(zfsvfs->z_parent->z_vfs);
2307 #endif
2308 
2309 	zfsvfs_free(zfsvfs);
2310 
2311 	atomic_dec_32(&zfs_active_fs_count);
2312 }
2313 
#ifdef __i386__
/* Value of desiredvnodes saved by zfs_vnodes_adjust() for later restore. */
static int desiredvnodes_backup;
#endif

/*
 * On i386 only: remember the current desiredvnodes and, if it still has its
 * automatically computed value, lower it to three quarters of that value.
 * A no-op on every other architecture.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int autotuned;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * Recompute the value the same way vntblinit() does.  A mismatch
	 * with desiredvnodes means the administrator tuned it by hand, in
	 * which case we leave it alone.
	 */
	autotuned = min(maxproc + cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
	    sizeof(struct vnode))));
	if (autotuned != desiredvnodes)
		return;

	desiredvnodes = (3 * autotuned) / 4;
#endif
}
2338 
/*
 * Undo zfs_vnodes_adjust(): on i386 put back the desiredvnodes value that
 * was saved there.  A no-op on every other architecture.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2347 
2348 void
zfs_init(void)2349 zfs_init(void)
2350 {
2351 
2352 	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
2353 
2354 	/*
2355 	 * Initialize .zfs directory structures
2356 	 */
2357 	zfsctl_init();
2358 
2359 	/*
2360 	 * Initialize znode cache, vnode ops, etc...
2361 	 */
2362 	zfs_znode_init();
2363 
2364 	/*
2365 	 * Reduce number of vnodes. Originally number of vnodes is calculated
2366 	 * with UFS inode in mind. We reduce it here, because it's too big for
2367 	 * ZFS/i386.
2368 	 */
2369 	zfs_vnodes_adjust();
2370 
2371 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2372 }
2373 
/*
 * Tear down what zfs_init() set up: the .zfs control directory structures,
 * the znode subsystem, and the vnode limit adjustment.
 */
void
zfs_fini(void)
{

	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}
2381 
2382 int
zfs_busy(void)2383 zfs_busy(void)
2384 {
2385 	return (zfs_active_fs_count != 0);
2386 }
2387 
2388 int
zfs_set_version(zfsvfs_t * zfsvfs,uint64_t newvers)2389 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2390 {
2391 	int error;
2392 	objset_t *os = zfsvfs->z_os;
2393 	dmu_tx_t *tx;
2394 
2395 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2396 		return (SET_ERROR(EINVAL));
2397 
2398 	if (newvers < zfsvfs->z_version)
2399 		return (SET_ERROR(EINVAL));
2400 
2401 	if (zfs_spa_version_map(newvers) >
2402 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
2403 		return (SET_ERROR(ENOTSUP));
2404 
2405 	tx = dmu_tx_create(os);
2406 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2407 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2408 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2409 		    ZFS_SA_ATTRS);
2410 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2411 	}
2412 	error = dmu_tx_assign(tx, TXG_WAIT);
2413 	if (error) {
2414 		dmu_tx_abort(tx);
2415 		return (error);
2416 	}
2417 
2418 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2419 	    8, 1, &newvers, tx);
2420 
2421 	if (error) {
2422 		dmu_tx_commit(tx);
2423 		return (error);
2424 	}
2425 
2426 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2427 		uint64_t sa_obj;
2428 
2429 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2430 		    SPA_VERSION_SA);
2431 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2432 		    DMU_OT_NONE, 0, tx);
2433 
2434 		error = zap_add(os, MASTER_NODE_OBJ,
2435 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2436 		ASSERT0(error);
2437 
2438 		VERIFY(0 == sa_set_sa_object(os, sa_obj));
2439 		sa_register_update_callback(os, zfs_sa_upgrade);
2440 	}
2441 
2442 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2443 	    "from %llu to %llu", zfsvfs->z_version, newvers);
2444 
2445 	dmu_tx_commit(tx);
2446 
2447 	zfsvfs->z_version = newvers;
2448 
2449 	zfs_set_fuid_feature(zfsvfs);
2450 
2451 	return (0);
2452 }
2453 
2454 /*
2455  * Read a property stored within the master node.
2456  */
2457 int
zfs_get_zplprop(objset_t * os,zfs_prop_t prop,uint64_t * value)2458 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2459 {
2460 	const char *pname;
2461 	int error = ENOENT;
2462 
2463 	/*
2464 	 * Look up the file system's value for the property.  For the
2465 	 * version property, we look up a slightly different string.
2466 	 */
2467 	if (prop == ZFS_PROP_VERSION)
2468 		pname = ZPL_VERSION_STR;
2469 	else
2470 		pname = zfs_prop_to_name(prop);
2471 
2472 	if (os != NULL)
2473 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2474 
2475 	if (error == ENOENT) {
2476 		/* No value set, use the default value */
2477 		switch (prop) {
2478 		case ZFS_PROP_VERSION:
2479 			*value = ZPL_VERSION;
2480 			break;
2481 		case ZFS_PROP_NORMALIZE:
2482 		case ZFS_PROP_UTF8ONLY:
2483 			*value = 0;
2484 			break;
2485 		case ZFS_PROP_CASE:
2486 			*value = ZFS_CASE_SENSITIVE;
2487 			break;
2488 		default:
2489 			return (error);
2490 		}
2491 		error = 0;
2492 	}
2493 	return (error);
2494 }
2495 
#ifdef _KERNEL
/*
 * Rewrite f_mntfromname of every mount on the mount list after a rename
 * from 'oldname' to 'newname'.
 *
 * An entry equal to 'oldname' becomes 'newname'.  An entry that starts with
 * 'oldname' followed by '/' or '@' has that prefix replaced by 'newname'.
 * All other entries are left untouched.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char renamed[MAXPATHLEN];
	struct mount *mp;
	char *from;
	size_t prefixlen;

	prefixlen = strlen(oldname);

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		from = mp->mnt_stat.f_mntfromname;
		if (strcmp(from, oldname) == 0) {
			/* Exact match. */
			(void)strlcpy(from, newname,
			    sizeof(mp->mnt_stat.f_mntfromname));
		} else if (strncmp(from, oldname, prefixlen) == 0 &&
		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
			/*
			 * Prefix match.  Build the new name in a scratch
			 * buffer first, since its tail comes from the very
			 * string we are about to overwrite.
			 */
			(void)snprintf(renamed, sizeof(renamed), "%s%s",
			    newname, from + prefixlen);
			(void)strlcpy(from, renamed,
			    sizeof(mp->mnt_stat.f_mntfromname));
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
2527