zfs_vfsops.c - OpenGrok cross reference for /netbsd/src/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
 * All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/acl.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/mntent.h>
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/varargs.h>
#include <sys/policy.h>
#include <sys/atomic.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/sunddi.h>
#include <sys/dnlc.h>
#include <sys/dmu_objset.h>
#include <sys/spa_boot.h>
#include "zfs_comutil.h"

#ifdef __FreeBSD_kernel__

#include <sys/jail.h>

struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

int zfs_super_owner;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
    "File system owner can perform privileged operation on his file systems");

int zfs_debug_level;
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
    "Debug level");

SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
    "ZFS_ACL_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
    "SPA_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
    "ZPL_VERSION");

static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
static void zfs_objset_close(zfsvfs_t *zfsvfs);
static void zfs_freevfs(vfs_t *vfsp);

struct vfsops zfs_vfsops = {
          .vfs_mount =                  zfs_mount,
          .vfs_unmount =                zfs_umount,
          .vfs_root =                   zfs_root,
          .vfs_statfs =                 zfs_statfs,
          .vfs_vget =                   zfs_vget,
          .vfs_sync =                   zfs_sync,
          .vfs_checkexp =               zfs_checkexp,
          .vfs_fhtovp =                 zfs_fhtovp,
};

VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);

#endif /* __FreeBSD_kernel__ */

#ifdef __NetBSD__

#include <sys/fstrans.h>
#include <sys/mkdev.h>
#include <miscfs/genfs/genfs.h>

int zfs_debug_level;
kmutex_t zfs_debug_mtx;

#define   DROP_GIANT()        /* nothing */
#define PICKUP_GIANT()        /* nothing */
#define vfs_stdsync(a, b) 0

static int zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs *statp);
static int zfs_netbsd_vptofh(vnode_t *vp, fid_t *fidp, size_t *fh_size);
static int zfs_netbsd_fhtovp(vfs_t *vfsp, fid_t *fidp, int lktype, vnode_t **vpp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int lktype, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr);
static void zfs_freevfs(vfs_t *vfsp);

void zfs_init(void);
void zfs_fini(void);

extern const struct vnodeopv_desc zfs_vnodeop_opv_desc;
extern const struct vnodeopv_desc zfs_specop_opv_desc;
extern const struct vnodeopv_desc zfs_fifoop_opv_desc;
extern const struct vnodeopv_desc zfs_sfsop_opv_desc;

static const struct vnodeopv_desc * const zfs_vnodeop_descs[] = {
          &zfs_vnodeop_opv_desc,
          &zfs_specop_opv_desc,
          &zfs_fifoop_opv_desc,
          &zfs_sfsop_opv_desc,
          NULL,
};

struct vfsops zfs_vfsops = {
          .vfs_name = MOUNT_ZFS,
          .vfs_min_mount_data = sizeof(struct zfs_args),
          .vfs_opv_descs = zfs_vnodeop_descs,
          .vfs_mount = zfs_mount,
          .vfs_unmount = zfs_umount,
          .vfs_root = zfs_root,
          .vfs_statvfs = zfs_statvfs,
          .vfs_sync = zfs_netbsd_sync,
          .vfs_vget = zfs_vget,
          .vfs_loadvnode = zfs_loadvnode,
          .vfs_newvnode = zfs_newvnode,
          .vfs_init = zfs_init,
          .vfs_done = zfs_fini,
          .vfs_start = (void *)nullop,
          .vfs_renamelock_enter = genfs_renamelock_enter,
          .vfs_renamelock_exit = genfs_renamelock_exit,
          .vfs_reinit = (void *)nullop,
          .vfs_vptofh = zfs_netbsd_vptofh,
          .vfs_fhtovp = zfs_netbsd_fhtovp,
          .vfs_quotactl = (void *)eopnotsupp,
          .vfs_extattrctl = (void *)eopnotsupp,
          .vfs_suspendctl = genfs_suspendctl,
          .vfs_snapshot = (void *)eopnotsupp,
          .vfs_fsync = (void *)eopnotsupp,
};

static bool
zfs_sync_selector(void *cl, struct vnode *vp)
{
          znode_t *zp;

          /*
           * Skip the vnode/inode if inaccessible, is control node or if the
           * atime is clean.
           */
          if (zfsctl_is_node(vp))
                    return false;
          zp = VTOZ(vp);
          return zp != NULL && vp->v_type != VNON && zp->z_atime_dirty != 0
              && !zp->z_unlinked;
}

static int
zfs_netbsd_sync(vfs_t *vfsp, int waitfor, cred_t *cr)
{
          struct vnode_iterator *marker;
          zfsvfs_t *zfsvfs = vfsp->vfs_data;
          vnode_t *vp;

          /*
           * On NetBSD, we need to push out atime updates.  Solaris does
           * this during VOP_INACTIVE, but that does not work well with the
           * BSD VFS, so we do it in batch here.
           */
          vfs_vnode_iterator_init(vfsp, &marker);
          while ((vp = vfs_vnode_iterator_next(marker, zfs_sync_selector, NULL)))
          {
                    znode_t *zp;
                    dmu_buf_t *dbp;
                    dmu_tx_t *tx;
                    int error;

                    error = vn_lock(vp, LK_EXCLUSIVE);
                    if (error) {
                              VN_RELE(vp);
                              continue;
                    }
                    ZFS_ENTER(zfsvfs);
                    zp = VTOZ(vp);
                    tx = dmu_tx_create(zfsvfs->z_os);
                    dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
                    zfs_sa_upgrade_txholds(tx, zp);
                    error = dmu_tx_assign(tx, TXG_WAIT);
                    if (error) {
                              dmu_tx_abort(tx);
                    } else {
                              (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
                                  (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
                              zp->z_atime_dirty = 0;
                              dmu_tx_commit(tx);
                    }
                    ZFS_EXIT(zfsvfs);
                    vput(vp);
          }
          vfs_vnode_iterator_destroy(marker);

          /*
           * Then do the regular ZFS stuff.
           */
          return zfs_sync(vfsp, waitfor);
}

static int
zfs_netbsd_vptofh(vnode_t *vp, fid_t *fidp, size_t *fh_size)
{
          znode_t             *zp;
          zfsvfs_t  *zfsvfs;
          uint32_t  gen;
          uint64_t  gen64;
          uint64_t  object;
          zfid_short_t        *zfid;
          int                 size, i, error;

          if (zfsctl_is_node(vp))
                    return zfsctl_vptofh(vp, fidp, fh_size);

          zp = VTOZ(vp);
          zfsvfs = zp->z_zfsvfs;
          object = zp->z_id;

          ZFS_ENTER(zfsvfs);
          ZFS_VERIFY_ZP(zp);

          if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
              &gen64, sizeof (uint64_t))) != 0) {
                    ZFS_EXIT(zfsvfs);
                    return (error);
          }

          gen = (uint32_t)gen64;

          size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;

          if (*fh_size < size) {
                    ZFS_EXIT(zfsvfs);
                    *fh_size = size;
                    return SET_ERROR(E2BIG);
          }
          *fh_size = size;

          zfid = (zfid_short_t *)fidp;

          zfid->zf_len = size;

          for (i = 0; i < sizeof (zfid->zf_object); i++)
                    zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

          /* Must have a non-zero generation number to distinguish from .zfs */
          if (gen == 0)
                    gen = 1;
          for (i = 0; i < sizeof (zfid->zf_gen); i++)
                    zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

          if (size == LONG_FID_LEN) {
                    uint64_t  objsetid = dmu_objset_id(zfsvfs->z_os);
                    zfid_long_t         *zlfid;

                    zlfid = (zfid_long_t *)fidp;

                    for (i = 0; i < sizeof (zlfid->zf_setid); i++)
                              zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

                    /* XXX - this should be the generation number for the objset */
                    for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
                              zlfid->zf_setgen[i] = 0;
          }

          ZFS_EXIT(zfsvfs);
          return 0;
}

static int
zfs_netbsd_fhtovp(vfs_t *vfsp, fid_t *fidp, int lktype, vnode_t **vpp)
{
          zfsvfs_t  *zfsvfs = vfsp->vfs_data;
          znode_t             *zp;
          vnode_t             *dvp;
          uint64_t  object = 0;
          uint64_t  fid_gen = 0;
          uint64_t  gen_mask;
          uint64_t  zp_gen;
          int                 i, err;

          *vpp = NULL;

          ZFS_ENTER(zfsvfs);

          if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
                    zfid_long_t         *zlfid = (zfid_long_t *)fidp;
                    uint64_t  objsetid = 0;
                    uint64_t  setgen = 0;

                    for (i = 0; i < sizeof (zlfid->zf_setid); i++)
                              objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

                    for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
                              setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

                    ZFS_EXIT(zfsvfs);

                    err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
                    if (err)
                              return (SET_ERROR(EINVAL));
                    ZFS_ENTER(zfsvfs);
          }

          if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
                    zfid_short_t        *zfid = (zfid_short_t *)fidp;

                    for (i = 0; i < sizeof (zfid->zf_object); i++)
                              object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

                    for (i = 0; i < sizeof (zfid->zf_gen); i++)
                              fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
          } else {
                    ZFS_EXIT(zfsvfs);
                    return (SET_ERROR(EINVAL));
          }

          /* A zero fid_gen means we are in the .zfs control directories */
          if (fid_gen == 0 &&
               (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
                    ZFS_EXIT(zfsvfs);
                    if (object == ZFSCTL_INO_ROOT)
                              err = zfsctl_root(zfsvfs, vpp);
                    else
                              err = zfsctl_snapshot(zfsvfs, vpp);
                    if (err)
                              return err;
                    err = vn_lock(*vpp, LK_EXCLUSIVE);
                    if (err) {
                              vrele(*vpp);
                              *vpp = NULL;
                              return err;
                    }
                    return 0;
          }

          gen_mask = -1ULL >> (64 - 8 * i);

          dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
          if (err = zfs_zget(zfsvfs, object, &zp)) {
                    ZFS_EXIT(zfsvfs);
                    return SET_ERROR(ESTALE);
          }
          (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
              sizeof (uint64_t));
          zp_gen = zp_gen & gen_mask;
          if (zp_gen == 0)
                    zp_gen = 1;
          if (zp->z_unlinked || zp_gen != fid_gen) {
                    dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
                    VN_RELE(ZTOV(zp));
                    ZFS_EXIT(zfsvfs);
                    return SET_ERROR(ESTALE);
          }

          *vpp = ZTOV(zp);
          ZFS_EXIT(zfsvfs);
          err = vn_lock(*vpp, lktype);
          if (err) {
                    vrele(*vpp);
                    *vpp = NULL;
                    return err;
          }
          return 0;
}
#endif /* __NetBSD__ */

/*
 * We need to keep a count of active fs's.
 * This is necessary to prevent our module
 * from being unloaded after a umount -f
 */
static uint32_t     zfs_active_fs_count = 0;

/*ARGSUSED*/
static int
zfs_sync(vfs_t *vfsp, int waitfor)
{
        /*
           * Data integrity is job one.  We don't want a compromised kernel
           * writing to the storage pool, so we never sync during panic.
           */
          if (panicstr)
                    return (0);

          /*
           * Ignore the system syncher.  ZFS already commits async data
           * at zfs_txg_timeout intervals.
           */
          if (waitfor == MNT_LAZY)
                    return (0);

          if (vfsp != NULL) {
                    /*
                     * Sync a specific filesystem.
                     */
                    zfsvfs_t *zfsvfs = vfsp->vfs_data;
                    dsl_pool_t *dp;
                    int error;

                    error = vfs_stdsync(vfsp, waitfor);
                    if (error != 0)
                              return (error);

                    ZFS_ENTER(zfsvfs);
                    dp = dmu_objset_pool(zfsvfs->z_os);

                    /*
                     * If the system is shutting down, then skip any
                     * filesystems which may exist on a suspended pool.
                     */
                    if (sys_shutdown && spa_suspended(dp->dp_spa)) {
                              ZFS_EXIT(zfsvfs);
                              return (0);
                    }

                    if (zfsvfs->z_log != NULL)
                              zil_commit(zfsvfs->z_log, 0);

                    ZFS_EXIT(zfsvfs);
          } else {
                    /*
                     * Sync all ZFS filesystems.  This is what happens when you
                     * run sync(1M).  Unlike other filesystems, ZFS honors the
                     * request by waiting for all pools to commit all dirty data.
                     */
                    spa_sync_allpools();
          }

          return (0);
}

#ifdef illumos
static int
zfs_create_unique_device(dev_t *dev)
{
          major_t new_major;

          do {
                    ASSERT3U(zfs_minor, <=, MAXMIN32);
                    minor_t start = zfs_minor;
                    do {
                              mutex_enter(&zfs_dev_mtx);
                              if (zfs_minor >= MAXMIN32) {
                                        /*
                                         * If we're still using the real major
                                         * keep out of /dev/zfs and /dev/zvol minor
                                         * number space.  If we're using a getudev()'ed
                                         * major number, we can use all of its minors.
                                         */
                                        if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
                                                  zfs_minor = ZFS_MIN_MINOR;
                                        else
                                                  zfs_minor = 0;
                              } else {
                                        zfs_minor++;
                              }
                              *dev = makedevice(zfs_major, zfs_minor);
                              mutex_exit(&zfs_dev_mtx);
                    } while (vfs_devismounted(*dev) && zfs_minor != start);
#ifdef illumos
                    if (zfs_minor == start) {
                              /*
                               * We are using all ~262,000 minor numbers for the
                               * current major number.  Create a new major number.
                               */
                              if ((new_major = getudev()) == (major_t)-1) {
                                        cmn_err(CE_WARN,
                                            "zfs_mount: Can't get unique major "
                                            "device number.");
                                        return (-1);
                              }
                              mutex_enter(&zfs_dev_mtx);
                              zfs_major = new_major;
                              zfs_minor = 0;

                              mutex_exit(&zfs_dev_mtx);
                    } else {
                              break;
                    }
                    /* CONSTANTCONDITION */
#endif
          } while (1);

          return (0);
}
#endif    /* illumos */


static void
atime_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;

          if (newval == TRUE) {
                    zfsvfs->z_atime = TRUE;
                    zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
          } else {
                    zfsvfs->z_atime = FALSE;
                    zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
          }
}

static void
xattr_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;

          if (newval == TRUE) {
                    /* XXX locking on vfs_flag? */
#ifdef TODO
                    zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
#endif
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
          } else {
                    /* XXX locking on vfs_flag? */
#ifdef TODO
                    zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
#endif
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
          }
}

static void
blksz_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;
          ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
          ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
          ASSERT(ISP2(newval));

          zfsvfs->z_max_blksz = newval;
          zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
}

static void
readonly_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;

          if (newval) {
                    /* XXX locking on vfs_flag? */
                    zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
          } else {
                    /* XXX locking on vfs_flag? */
                    zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
          }
}

static void
setuid_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;

          if (newval == FALSE) {
                    zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
          } else {
                    zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
          }
}

static void
exec_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;

          if (newval == FALSE) {
                    zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
          } else {
                    zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
          }
}

/*
 * The nbmand mount option can be changed at mount time.
 * We can't allow it to be toggled on live file systems or incorrect
 * behavior may be seen from cifs clients
 *
 * This property isn't registered via dsl_prop_register(), but this callback
 * will be called when a file system is first mounted
 */
static void
nbmand_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;
          if (newval == FALSE) {
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
          } else {
                    vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
                    vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
          }
}

static void
snapdir_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;

          zfsvfs->z_show_ctldir = newval;
}

static void
vscan_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;

          zfsvfs->z_vscan = newval;
}

static void
acl_mode_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;

          zfsvfs->z_acl_mode = newval;
}

static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
          zfsvfs_t *zfsvfs = arg;

          zfsvfs->z_acl_inherit = newval;
}

static int
zfs_register_callbacks(vfs_t *vfsp)
{
          struct dsl_dataset *ds = NULL;
          objset_t *os = NULL;
          zfsvfs_t *zfsvfs = NULL;
          uint64_t nbmand;
          boolean_t readonly = B_FALSE;
          boolean_t do_readonly = B_FALSE;
          boolean_t setuid = B_FALSE;
          boolean_t do_setuid = B_FALSE;
          boolean_t exec = B_FALSE;
          boolean_t do_exec = B_FALSE;
#ifdef illumos
          boolean_t devices = B_FALSE;
          boolean_t do_devices = B_FALSE;
#endif
          boolean_t xattr = B_FALSE;
          boolean_t do_xattr = B_FALSE;
          boolean_t atime = B_FALSE;
          boolean_t do_atime = B_FALSE;
          int error = 0;

          ASSERT(vfsp);
          zfsvfs = vfsp->vfs_data;
          ASSERT(zfsvfs);
          os = zfsvfs->z_os;

          /*
           * This function can be called for a snapshot when we update snapshot's
           * mount point, which isn't really supported.
           */
          if (dmu_objset_is_snapshot(os))
                    return (EOPNOTSUPP);

          /*
           * The act of registering our callbacks will destroy any mount
           * options we may have.  In order to enable temporary overrides
           * of mount options, we stash away the current values and
           * restore them after we register the callbacks.
           */
          if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
              !spa_writeable(dmu_objset_spa(os))) {
                    readonly = B_TRUE;
                    do_readonly = B_TRUE;
          } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
                    readonly = B_FALSE;
                    do_readonly = B_TRUE;
          }
          if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
                    setuid = B_FALSE;
                    do_setuid = B_TRUE;
          } else {
                    if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
                              setuid = B_FALSE;
                              do_setuid = B_TRUE;
                    } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
                              setuid = B_TRUE;
                              do_setuid = B_TRUE;
                    }
          }
          if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
                    exec = B_FALSE;
                    do_exec = B_TRUE;
          } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
                    exec = B_TRUE;
                    do_exec = B_TRUE;
          }
          if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
                    xattr = B_FALSE;
                    do_xattr = B_TRUE;
          } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
                    xattr = B_TRUE;
                    do_xattr = B_TRUE;
          }
          if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
                    atime = B_FALSE;
                    do_atime = B_TRUE;
          } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
                    atime = B_TRUE;
                    do_atime = B_TRUE;
          }

          /*
           * We need to enter pool configuration here, so that we can use
           * dsl_prop_get_int_ds() to handle the special nbmand property below.
           * dsl_prop_get_integer() can not be used, because it has to acquire
           * spa_namespace_lock and we can not do that because we already hold
           * z_teardown_lock.  The problem is that spa_config_sync() is called
           * with spa_namespace_lock held and the function calls ZFS vnode
           * operations to write the cache file and thus z_teardown_lock is
           * acquired after spa_namespace_lock.
           */
          ds = dmu_objset_ds(os);
          dsl_pool_config_enter(dmu_objset_pool(os), FTAG);

          /*
           * nbmand is a special property.  It can only be changed at
           * mount time.
           *
           * This is weird, but it is documented to only be changeable
           * at mount time.
           */
          if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
                    nbmand = B_FALSE;
          } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
                    nbmand = B_TRUE;
          } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
                    dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
                    return (error);
          }

          /*
           * Register property callbacks.
           *
           * It would probably be fine to just check for i/o error from
           * the first prop_register(), but I guess I like to go
           * overboard...
           */
          error = dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
#ifdef illumos
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
#endif
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
              zfsvfs);
          error = error ? error : dsl_prop_register(ds,
              zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
          dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
          if (error)
                    goto unregister;

          /*
           * Invoke our callbacks to restore temporary mount options.
           */
          if (do_readonly)
                    readonly_changed_cb(zfsvfs, readonly);
          if (do_setuid)
                    setuid_changed_cb(zfsvfs, setuid);
          if (do_exec)
                    exec_changed_cb(zfsvfs, exec);
          if (do_xattr)
                    xattr_changed_cb(zfsvfs, xattr);
          if (do_atime)
                    atime_changed_cb(zfsvfs, atime);

          nbmand_changed_cb(zfsvfs, nbmand);

          return (0);

unregister:
          dsl_prop_unregister_all(ds, zfsvfs);
          return (error);
}

static int
zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
    uint64_t *userp, uint64_t *groupp)
{
          /*
           * Is it a valid type of object to track?
           */
          if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
                    return (SET_ERROR(ENOENT));

          /*
           * If we have a NULL data pointer
           * then assume the id's aren't changing and
           * return EEXIST to the dmu to let it know to
           * use the same ids
           */
          if (data == NULL)
                    return (SET_ERROR(EEXIST));

          if (bonustype == DMU_OT_ZNODE) {
                    znode_phys_t *znp = data;
                    *userp = znp->zp_uid;
                    *groupp = znp->zp_gid;
          } else {
                    int hdrsize;
                    sa_hdr_phys_t *sap = data;
                    sa_hdr_phys_t sa = *sap;
                    boolean_t swap = B_FALSE;

                    ASSERT(bonustype == DMU_OT_SA);

                    if (sa.sa_magic == 0) {
                              /*
                               * This should only happen for newly created
                               * files that haven't had the znode data filled
                               * in yet.
                               */
                              *userp = 0;
                              *groupp = 0;
                              return (0);
                    }
                    if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
                              sa.sa_magic = SA_MAGIC;
                              sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
                              swap = B_TRUE;
                    } else {
                              VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
                    }

                    hdrsize = sa_hdrsize(&sa);
                    VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
                    *userp = *((uint64_t *)((uintptr_t)data + hdrsize +
                        SA_UID_OFFSET));
                    *groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
                        SA_GID_OFFSET));
                    if (swap) {
                              *userp = BSWAP_64(*userp);
                              *groupp = BSWAP_64(*groupp);
                    }
          }
          return (0);
}

static void
fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
    char *domainbuf, int buflen, uid_t *ridp)
{
          uint64_t fuid;
          const char *domain;

          fuid = strtonum(fuidstr, NULL);

          domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
          if (domain)
                    (void) strlcpy(domainbuf, domain, buflen);
          else
                    domainbuf[0] = '\0';
          *ridp = FUID_RID(fuid);
}

static uint64_t
zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
{
          switch (type) {
          case ZFS_PROP_USERUSED:
                    return (DMU_USERUSED_OBJECT);
          case ZFS_PROP_GROUPUSED:
                    return (DMU_GROUPUSED_OBJECT);
          case ZFS_PROP_USERQUOTA:
                    return (zfsvfs->z_userquota_obj);
          case ZFS_PROP_GROUPQUOTA:
                    return (zfsvfs->z_groupquota_obj);
          }
          return (0);
}

int
zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
{
          int error;
          zap_cursor_t zc;
          zap_attribute_t za;
          zfs_useracct_t *buf = vbuf;
          uint64_t obj;

          if (!dmu_objset_userspace_present(zfsvfs->z_os))
                    return (SET_ERROR(ENOTSUP));

          obj = zfs_userquota_prop_to_obj(zfsvfs, type);
          if (obj == 0) {
                    *bufsizep = 0;
                    return (0);
          }

          for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
              (error = zap_cursor_retrieve(&zc, &za)) == 0;
              zap_cursor_advance(&zc)) {
                    if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
                        *bufsizep)
                              break;

                    fuidstr_to_sid(zfsvfs, za.za_name,
                        buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);

                    buf->zu_space = za.za_first_integer;
                    buf++;
          }
          if (error == ENOENT)
                    error = 0;

          ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
          *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
          *cookiep = zap_cursor_serialize(&zc);
          zap_cursor_fini(&zc);
          return (error);
}

/*
 * buf must be big enough (eg, 32 bytes)
 */
static int
id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
    char *buf, boolean_t addok)
{
          uint64_t fuid;
          int domainid = 0;

          if (domain && domain[0]) {
                    domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
                    if (domainid == -1)
                              return (SET_ERROR(ENOENT));
          }
          fuid = FUID_ENCODE(domainid, rid);
          (void) sprintf(buf, "%llx", (longlong_t)fuid);
          return (0);
}

int
zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    const char *domain, uint64_t rid, uint64_t *valp)
{
          char buf[32];
          int err;
          uint64_t obj;

          *valp = 0;

          if (!dmu_objset_userspace_present(zfsvfs->z_os))
                    return (SET_ERROR(ENOTSUP));

          obj = zfs_userquota_prop_to_obj(zfsvfs, type);
          if (obj == 0)
                    return (0);

          err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
          if (err)
                    return (err);

          err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
          if (err == ENOENT)
                    err = 0;
          return (err);
}

int
zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    const char *domain, uint64_t rid, uint64_t quota)
{
          char buf[32];
          int err;
          dmu_tx_t *tx;
          uint64_t *objp;
          boolean_t fuid_dirtied;

          if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
                    return (SET_ERROR(EINVAL));

          if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
                    return (SET_ERROR(ENOTSUP));

          objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
              &zfsvfs->z_groupquota_obj;

          err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
          if (err)
                    return (err);
          fuid_dirtied = zfsvfs->z_fuid_dirty;

          tx = dmu_tx_create(zfsvfs->z_os);
          dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
          if (*objp == 0) {
                    dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
                        zfs_userquota_prop_prefixes[type]);
          }
          if (fuid_dirtied)
                    zfs_fuid_txhold(zfsvfs, tx);
          err = dmu_tx_assign(tx, TXG_WAIT);
          if (err) {
                    dmu_tx_abort(tx);
                    return (err);
          }

          mutex_enter(&zfsvfs->z_lock);
          if (*objp == 0) {
                    *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
                        DMU_OT_NONE, 0, tx);
                    VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
                        zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
          }
          mutex_exit(&zfsvfs->z_lock);

          if (quota == 0) {
                    err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
                    if (err == ENOENT)
                              err = 0;
          } else {
                    err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
          }
          ASSERT(err == 0);
          if (fuid_dirtied)
                    zfs_fuid_sync(zfsvfs, tx);
          dmu_tx_commit(tx);
          return (err);
}

boolean_t
zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
{
          char buf[32];
          uint64_t used, quota, usedobj, quotaobj;
          int err;

          usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
          quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

          if (quotaobj == 0 || zfsvfs->z_replay)
                    return (B_FALSE);

          (void) sprintf(buf, "%llx", (longlong_t)fuid);
          err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
          if (err != 0)
                    return (B_FALSE);

          err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
          if (err != 0)
                    return (B_FALSE);
          return (used >= quota);
}

boolean_t
zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
{
          uint64_t fuid;
          uint64_t quotaobj;

          quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;

          fuid = isgroup ? zp->z_gid : zp->z_uid;

          if (quotaobj == 0 || zfsvfs->z_replay)
                    return (B_FALSE);

          return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
}

/*
 * Associate this zfsvfs with the given objset, which must be owned.
 * This will cache a bunch of on-disk state from the objset in the
 * zfsvfs.
 */
static int
zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
{
          int error;
          uint64_t val;

          zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
          zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
          zfsvfs->z_os = os;

          error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
          if (error != 0)
                    return (error);
          if (zfsvfs->z_version >
              zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
                    (void) printf("Can't mount a version %lld file system "
                        "on a version %lld pool\n. Pool must be upgraded to mount "
                        "this file system.", (u_longlong_t)zfsvfs->z_version,
                        (u_longlong_t)spa_version(dmu_objset_spa(os)));
                    return (SET_ERROR(ENOTSUP));
          }
          error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
          if (error != 0)
                    return (error);
          zfsvfs->z_norm = (int)val;

          error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
          if (error != 0)
                    return (error);
          zfsvfs->z_utf8 = (val != 0);

          error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
          if (error != 0)
                    return (error);
          zfsvfs->z_case = (uint_t)val;

          /*
           * Fold case on file systems that are always or sometimes case
           * insensitive.
           */
          if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
              zfsvfs->z_case == ZFS_CASE_MIXED)
                    zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

          zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
          zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);

          uint64_t sa_obj = 0;
          if (zfsvfs->z_use_sa) {
                    /* should either have both of these objects or none */
                    error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
                        &sa_obj);
                    if (error != 0)
                              return (error);
          }

          error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
              &zfsvfs->z_attr_table);
          if (error != 0)
                    return (error);

          if (zfsvfs->z_version >= ZPL_VERSION_SA)
                    sa_register_update_callback(os, zfs_sa_upgrade);

          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
              &zfsvfs->z_root);
          if (error != 0)
                    return (error);
          ASSERT(zfsvfs->z_root != 0);

          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
              &zfsvfs->z_unlinkedobj);
          if (error != 0)
                    return (error);

          error = zap_lookup(os, MASTER_NODE_OBJ,
              zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
              8, 1, &zfsvfs->z_userquota_obj);
          if (error == ENOENT)
                    zfsvfs->z_userquota_obj = 0;
          else if (error != 0)
                    return (error);

          error = zap_lookup(os, MASTER_NODE_OBJ,
              zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
              8, 1, &zfsvfs->z_groupquota_obj);
          if (error == ENOENT)
                    zfsvfs->z_groupquota_obj = 0;
          else if (error != 0)
                    return (error);

          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
              &zfsvfs->z_fuid_obj);
          if (error == ENOENT)
                    zfsvfs->z_fuid_obj = 0;
          else if (error != 0)
                    return (error);

          error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
              &zfsvfs->z_shares_dir);
          if (error == ENOENT)
                    zfsvfs->z_shares_dir = 0;
          else if (error != 0)
                    return (error);

          /*
           * Only use the name cache if we are looking for a
           * name on a file system that does not require normalization
           * or case folding.  We can also look there if we happen to be
           * on a non-normalizing, mixed sensitivity file system IF we
           * are looking for the exact name (which is always the case on
           * FreeBSD).
           */
          zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
              ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
              !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));

          return (0);
}

int
zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
{
          objset_t *os;
          zfsvfs_t *zfsvfs;
          int error;

          /*
           * XXX: Fix struct statfs so this isn't necessary!
           *
           * The 'osname' is used as the filesystem's special node, which means
           * it must fit in statfs.f_mntfromname, or else it can't be
           * enumerated, so libzfs_mnttab_find() returns NULL, which causes
           * 'zfs unmount' to think it's not mounted when it is.
           */
          if (strlen(osname) >= MNAMELEN)
                    return (SET_ERROR(ENAMETOOLONG));

          zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);

          /*
           * We claim to always be readonly so we can open snapshots;
           * other ZPL code will prevent us from writing to snapshots.
           */
          error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
          if (error) {
                    kmem_free(zfsvfs, sizeof (zfsvfs_t));
                    return (error);
          }

          zfsvfs->z_vfs = NULL;
          zfsvfs->z_parent = zfsvfs;

          mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
          mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
          list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
              offsetof(znode_t, z_link_node));
#ifdef DIAGNOSTIC
          rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
#else
          rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
#endif
          rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
          rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
          for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
                    mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);

          error = zfsvfs_init(zfsvfs, os);
          if (error != 0) {
                    dmu_objset_disown(os, zfsvfs);
                    *zfvp = NULL;
                    kmem_free(zfsvfs, sizeof (zfsvfs_t));
                    return (error);
          }

          *zfvp = zfsvfs;
          return (0);
}

static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
          int error;

          error = zfs_register_callbacks(zfsvfs->z_vfs);
          if (error)
                    return (error);

          zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);

          /*
           * If we are not mounting (ie: online recv), then we don't
           * have to worry about replaying the log as we blocked all
           * operations out since we closed the ZIL.
           */
          if (mounting) {
                    boolean_t readonly;

                    /*
                     * During replay we remove the read only flag to
                     * allow replays to succeed.
                     */
                    readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
                    if (readonly != 0)
                              zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
                    else
                              zfs_unlinked_drain(zfsvfs);

                    /*
                     * Parse and replay the intent log.
                     *
                     * Because of ziltest, this must be done after
                     * zfs_unlinked_drain().  (Further note: ziltest
                     * doesn't use readonly mounts, where
                     * zfs_unlinked_drain() isn't called.)  This is because
                     * ziltest causes spa_sync() to think it's committed,
                     * but actually it is not, so the intent log contains
                     * many txg's worth of changes.
                     *
                     * In particular, if object N is in the unlinked set in
                     * the last txg to actually sync, then it could be
                     * actually freed in a later txg and then reallocated
                     * in a yet later txg.  This would write a "create
                     * object N" record to the intent log.  Normally, this
                     * would be fine because the spa_sync() would have
                     * written out the fact that object N is free, before
                     * we could write the "create object N" intent log
                     * record.
                     *
                     * But when we are in ziltest mode, we advance the "open
                     * txg" without actually spa_sync()-ing the changes to
                     * disk.  So we would see that object N is still
                     * allocated and in the unlinked set, and there is an
                     * intent log record saying to allocate it.
                     */
                    if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
                              if (zil_replay_disable) {
                                        zil_destroy(zfsvfs->z_log, B_FALSE);
                              } else {
                                        zfsvfs->z_replay = B_TRUE;
                                        zil_replay(zfsvfs->z_os, zfsvfs,
                                            zfs_replay_vector);
                                        zfsvfs->z_replay = B_FALSE;
                              }
                    }
                    zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
          }

          /*
           * Set the objset user_ptr to track its zfsvfs.
           */
          mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
          dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
          mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

          return (0);
}

extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */

void
zfsvfs_free(zfsvfs_t *zfsvfs)
{
          int i;

          /*
           * This is a barrier to prevent the filesystem from going away in
           * zfs_znode_move() until we can safely ensure that the filesystem is
           * not unmounted. We consider the filesystem valid before the barrier
           * and invalid after the barrier.
           */
          rw_enter(&zfsvfs_lock, RW_READER);
          rw_exit(&zfsvfs_lock);

          zfs_fuid_destroy(zfsvfs);

          mutex_destroy(&zfsvfs->z_znodes_lock);
          mutex_destroy(&zfsvfs->z_lock);
          list_destroy(&zfsvfs->z_all_znodes);
          rrm_destroy(&zfsvfs->z_teardown_lock);
          rw_destroy(&zfsvfs->z_teardown_inactive_lock);
          rw_destroy(&zfsvfs->z_fuid_lock);
          for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
                    mutex_destroy(&zfsvfs->z_hold_mtx[i]);
          kmem_free(zfsvfs, sizeof (zfsvfs_t));
}

static void
zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
{
          zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
          if (zfsvfs->z_vfs) {
                    if (zfsvfs->z_use_fuids) {
                              vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
                              vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
                              vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
                              vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
                              vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
                              vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
                    } else {
                              vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
                              vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
                              vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
                              vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
                              vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
                              vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
                    }
          }
          zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
}

#ifdef __NetBSD__
int
#else
static int
#endif
zfs_domount(vfs_t *vfsp, char *osname)
{
          uint64_t recordsize, fsid_guid;
          int error = 0;
          zfsvfs_t *zfsvfs;
          vnode_t *vp;

          ASSERT(vfsp);
          ASSERT(osname);

          error = zfsvfs_create(osname, &zfsvfs);
          if (error)
                    return (error);
          zfsvfs->z_vfs = vfsp;

#ifdef illumos
          /* Initialize the generic filesystem structure. */
          vfsp->vfs_bcount = 0;
          vfsp->vfs_data = NULL;

          if (zfs_create_unique_device(&mount_dev) == -1) {
                    error = SET_ERROR(ENODEV);
                    goto out;
          }
          ASSERT(vfs_devismounted(mount_dev) == 0);
#endif

          if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
              NULL))
                    goto out;
          zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
          zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

          vfsp->vfs_data = zfsvfs;
#ifdef __FreeBSD_kernel__
          vfsp->mnt_flag |= MNT_LOCAL;
          vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
          vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
          vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
          vfsp->mnt_kern_flag |= MNTK_NO_IOPF;    /* vn_io_fault can be used */
#endif
#ifdef __NetBSD__
          vfsp->mnt_flag |= MNT_LOCAL;
          vfsp->mnt_iflag |= IMNT_MPSAFE | IMNT_NCLOOKUP;
#endif

          /*
           * The fsid is 64 bits, composed of an 8-bit fs type, which
           * separates our fsid from any other filesystem types, and a
           * 56-bit objset unique ID.  The objset unique ID is unique to
           * all objsets open on this system, provided by unique_create().
           * The 8-bit fs type must be put in the low bits of fsid[1]
           * because that's where other Solaris filesystems put it.
           */
          fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
          ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
#ifdef __FreeBSD_kernel__
          vfsp->vfs_fsid.val[0] = fsid_guid;
          vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
              vfsp->mnt_vfc->vfc_typenum & 0xFF;
#endif
#ifdef __NetBSD__
          vfsp->mnt_stat.f_fsidx.__fsid_val[0] = fsid_guid;
          vfsp->mnt_stat.f_fsidx.__fsid_val[1] = ((fsid_guid>>32) << 8) |
              makefstype(vfsp->mnt_op->vfs_name) & 0xFF;
          vfsp->mnt_stat.f_fsid = fsid_guid;
#endif

          /*
           * Set features for file system.
           */
          zfs_set_fuid_feature(zfsvfs);
          if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
                    vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
                    vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
                    vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
          } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
                    vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
                    vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
          }
          vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);

          if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
                    uint64_t pval;

                    atime_changed_cb(zfsvfs, B_FALSE);
                    readonly_changed_cb(zfsvfs, B_TRUE);
                    if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
                              goto out;
                    xattr_changed_cb(zfsvfs, pval);
                    zfsvfs->z_issnap = B_TRUE;
                    zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

                    mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
                    dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
                    mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
          } else {
                    error = zfsvfs_setup(zfsvfs, B_TRUE);
          }

#ifdef __FreeBSD_kernel__
          vfs_mountedfrom(vfsp, osname);
#endif
#ifdef __NetBSD__
          set_statvfs_info("on-name", UIO_SYSSPACE, osname, UIO_SYSSPACE, "zfs", vfsp, curlwp);
#endif

          if (!zfsvfs->z_issnap)
                    zfsctl_create(zfsvfs);
out:
          if (error) {
                    dmu_objset_disown(zfsvfs->z_os, zfsvfs);
                    zfsvfs_free(zfsvfs);
          } else {
                    atomic_inc_32(&zfs_active_fs_count);
          }

          return (error);
}

void
zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
{
          objset_t *os = zfsvfs->z_os;

          if (!dmu_objset_is_snapshot(os))
                    dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
}

#ifdef SECLABEL
/*
 * Convert a decimal digit string to a uint64_t integer.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
          uint64_t num = 0;

          while (*str) {
                    if (*str < '0' || *str > '9')
                              return (SET_ERROR(EINVAL));

                    num = num*10 + *str++ - '0';
          }

          *objnum = num;
          return (0);
}

/*
 * The boot path passed from the boot loader is in the form of
 * "rootpool-name/root-filesystem-object-number'. Convert this
 * string to a dataset name: "rootpool-name/root-filesystem-name".
 */
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
          char *slashp;
          uint64_t objnum;
          int error;

          if (*bpath == 0 || *bpath == '/')
                    return (SET_ERROR(EINVAL));

          (void) strcpy(outpath, bpath);

          slashp = strchr(bpath, '/');

          /* if no '/', just return the pool name */
          if (slashp == NULL) {
                    return (0);
          }

          /* if not a number, just return the root dataset name */
          if (str_to_uint64(slashp+1, &objnum)) {
                    return (0);
          }

          *slashp = '\0';
          error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
          *slashp = '/';

          return (error);
}

/*
 * Check that the hex label string is appropriate for the dataset being
 * mounted into the global_zone proper.
 *
 * Return an error if the hex label string is not default or
 * admin_low/admin_high.  For admin_low labels, the corresponding
 * dataset must be readonly.
 */
int
zfs_check_global_label(const char *dsname, const char *hexsl)
{
          if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
                    return (0);
          if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
                    return (0);
          if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
                    /* must be readonly */
                    uint64_t rdonly;

                    if (dsl_prop_get_integer(dsname,
                        zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
                              return (SET_ERROR(EACCES));
                    return (rdonly ? 0 : EACCES);
          }
          return (SET_ERROR(EACCES));
}

/*
 * Determine whether the mount is allowed according to MAC check.
 * by comparing (where appropriate) label of the dataset against
 * the label of the zone being mounted into.  If the dataset has
 * no label, create one.
 *
 * Returns 0 if access allowed, error otherwise (e.g. EACCES)
 */
static int
zfs_mount_label_policy(vfs_t *vfsp, char *osname)
{
          int                 error, retv;
          zone_t              *mntzone = NULL;
          ts_label_t          *mnt_tsl;
          bslabel_t *mnt_sl;
          bslabel_t ds_sl;
          char                ds_hexsl[MAXNAMELEN];

          retv = EACCES;                                    /* assume the worst */

          /*
           * Start by getting the dataset label if it exists.
           */
          error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
              1, sizeof (ds_hexsl), &ds_hexsl, NULL);
          if (error)
                    return (SET_ERROR(EACCES));

          /*
           * If labeling is NOT enabled, then disallow the mount of datasets
           * which have a non-default label already.  No other label checks
           * are needed.
           */
          if (!is_system_labeled()) {
                    if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
                              return (0);
                    return (SET_ERROR(EACCES));
          }

          /*
           * Get the label of the mountpoint.  If mounting into the global
           * zone (i.e. mountpoint is not within an active zone and the
           * zoned property is off), the label must be default or
           * admin_low/admin_high only; no other checks are needed.
           */
          mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
          if (mntzone->zone_id == GLOBAL_ZONEID) {
                    uint64_t zoned;

                    zone_rele(mntzone);

                    if (dsl_prop_get_integer(osname,
                        zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
                              return (SET_ERROR(EACCES));
                    if (!zoned)
                              return (zfs_check_global_label(osname, ds_hexsl));
                    else
                              /*
                               * This is the case of a zone dataset being mounted
                               * initially, before the zone has been fully created;
                               * allow this mount into global zone.
                               */
                              return (0);
          }

          mnt_tsl = mntzone->zone_slabel;
          ASSERT(mnt_tsl != NULL);
          label_hold(mnt_tsl);
          mnt_sl = label2bslabel(mnt_tsl);

          if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
                    /*
                     * The dataset doesn't have a real label, so fabricate one.
                     */
                    char *str = NULL;

                    if (l_to_str_internal(mnt_sl, &str) == 0 &&
                        dsl_prop_set_string(osname,
                        zfs_prop_to_name(ZFS_PROP_MLSLABEL),
                        ZPROP_SRC_LOCAL, str) == 0)
                              retv = 0;
                    if (str != NULL)
                              kmem_free(str, strlen(str) + 1);
          } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
                    /*
                     * Now compare labels to complete the MAC check.  If the
                     * labels are equal then allow access.  If the mountpoint
                     * label dominates the dataset label, allow readonly access.
                     * Otherwise, access is denied.
                     */
                    if (blequal(mnt_sl, &ds_sl))
                              retv = 0;
                    else if (bldominates(mnt_sl, &ds_sl)) {
                              vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
                              retv = 0;
                    }
          }

          label_rele(mnt_tsl);
          zone_rele(mntzone);
          return (retv);
}
#endif    /* SECLABEL */

#ifdef OPENSOLARIS_MOUNTROOT
static int
zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
{
          int error = 0;
          static int zfsrootdone = 0;
          zfsvfs_t *zfsvfs = NULL;
          znode_t *zp = NULL;
          vnode_t *vp = NULL;
          char *zfs_bootfs;
          char *zfs_devid;

          ASSERT(vfsp);

          /*
           * The filesystem that we mount as root is defined in the
           * boot property "zfs-bootfs" with a format of
           * "poolname/root-dataset-objnum".
           */
          if (why == ROOT_INIT) {
                    if (zfsrootdone++)
                              return (SET_ERROR(EBUSY));
                    /*
                     * the process of doing a spa_load will require the
                     * clock to be set before we could (for example) do
                     * something better by looking at the timestamp on
                     * an uberblock, so just set it to -1.
                     */
                    clkset(-1);

                    if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
                              cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
                                  "bootfs name");
                              return (SET_ERROR(EINVAL));
                    }
                    zfs_devid = spa_get_bootprop("diskdevid");
                    error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
                    if (zfs_devid)
                              spa_free_bootprop(zfs_devid);
                    if (error) {
                              spa_free_bootprop(zfs_bootfs);
                              cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
                                  error);
                              return (error);
                    }
                    if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
                              spa_free_bootprop(zfs_bootfs);
                              cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
                                  error);
                              return (error);
                    }

                    spa_free_bootprop(zfs_bootfs);

                    if (error = vfs_lock(vfsp))
                              return (error);

                    if (error = zfs_domount(vfsp, rootfs.bo_name)) {
                              cmn_err(CE_NOTE, "zfs_domount: error %d", error);
                              goto out;
                    }

                    zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
                    ASSERT(zfsvfs);
                    if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
                              cmn_err(CE_NOTE, "zfs_zget: error %d", error);
                              goto out;
                    }

                    vp = ZTOV(zp);
                    mutex_enter(&vp->v_lock);
                    vp->v_flag |= VROOT;
                    mutex_exit(&vp->v_lock);
                    rootvp = vp;

                    /*
                     * Leave rootvp held.  The root file system is never unmounted.
                     */

                    vfs_add((struct vnode *)0, vfsp,
                        (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
out:
                    vfs_unlock(vfsp);
                    return (error);
          } else if (why == ROOT_REMOUNT) {
                    readonly_changed_cb(vfsp->vfs_data, B_FALSE);
                    vfsp->vfs_flag |= VFS_REMOUNT;

                    /* refresh mount options */
                    zfs_unregister_callbacks(vfsp->vfs_data);
                    return (zfs_register_callbacks(vfsp));

          } else if (why == ROOT_UNMOUNT) {
                    zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
                    (void) zfs_sync(vfsp, 0, 0);
                    return (0);
          }

          /*
           * if "why" is equal to anything else other than ROOT_INIT,
           * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
           */
          return (SET_ERROR(ENOTSUP));
}
#endif    /* OPENSOLARIS_MOUNTROOT */

static int
getpoolname(const char *osname, char *poolname)
{
          char *p;

          p = strchr(osname, '/');
          if (p == NULL) {
                    if (strlen(osname) >= MAXNAMELEN)
                              return (ENAMETOOLONG);
                    (void) strcpy(poolname, osname);
          } else {
                    if (p - osname >= MAXNAMELEN)
                              return (ENAMETOOLONG);
                    (void) strncpy(poolname, osname, p - osname);
                    poolname[p - osname] = '\0';
          }
          return (0);
}

/*ARGSUSED*/
#ifdef illumos
static int
zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
#endif
#ifdef __FreeBSD_kernel__
static int
zfs_mount(vfs_t *vfsp)
#endif
#ifdef __NetBSD__
static int
zfs_mount(vfs_t *vfsp, const char *path, void *data, size_t *data_len)
#endif
{
          vnode_t             *mvp = vfsp->mnt_vnodecovered;
          char                *osname;
          int                 error = 0;
          int                 canwrite;

#ifdef illumos
          if (mvp->v_type != VDIR)
                    return (SET_ERROR(ENOTDIR));

          mutex_enter(&mvp->v_lock);
          if ((uap->flags & MS_REMOUNT) == 0 &&
              (uap->flags & MS_OVERLAY) == 0 &&
              (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
                    mutex_exit(&mvp->v_lock);
                    return (SET_ERROR(EBUSY));
          }
          mutex_exit(&mvp->v_lock);

          /*
           * ZFS does not support passing unparsed data in via MS_DATA.
           * Users should use the MS_OPTIONSTR interface; this means
           * that all option parsing is already done and the options struct
           * can be interrogated.
           */
          if ((uap->flags & MS_DATA) && uap->datalen > 0)
                    return (SET_ERROR(EINVAL));
#endif /* illumos */

#ifdef __FreeBSD_kernel__
          kthread_t *td = curthread;
          cred_t              *cr = td->td_ucred;

          if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
                    return (SET_ERROR(EPERM));

          if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
                    return (SET_ERROR(EINVAL));

          /*
           * If full-owner-access is enabled and delegated administration is
           * turned on, we must set nosuid.
           */
          if (zfs_super_owner &&
              dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
                    secpolicy_fs_mount_clearopts(cr, vfsp);
          }

#endif /* __FreeBSD_kernel__ */

#ifdef __NetBSD__
          cred_t              *cr = CRED();
          struct mounta       *uap = data;

          if (uap == NULL)
                    return (SET_ERROR(EINVAL));

          if (*data_len < sizeof *uap)
                    return (SET_ERROR(EINVAL));

          if (mvp->v_type != VDIR)
                    return (SET_ERROR(ENOTDIR));

          mutex_enter(mvp->v_interlock);
          if ((uap->flags & MS_REMOUNT) == 0 &&
              (uap->flags & MS_OVERLAY) == 0 &&
              (vrefcnt(mvp) != 1 || (mvp->v_flag & VROOT))) {
                    mutex_exit(mvp->v_interlock);
                    return (SET_ERROR(EBUSY));
          }
          mutex_exit(mvp->v_interlock);

          osname = PNBUF_GET();
          strlcpy(osname, uap->fspec, strlen(uap->fspec) + 1);
#endif /* __NetBSD__ */

          /*
           * Check for mount privilege?
           *
           * If we don't have privilege then see if
           * we have local permission to allow it
           */
          error = secpolicy_fs_mount(cr, mvp, vfsp);
          if (error) {
                    if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
                              goto out;

                    if (!(vfsp->vfs_flag & MS_REMOUNT)) {
                              vattr_t             vattr;

                              /*
                               * Make sure user is the owner of the mount point
                               * or has sufficient privileges.
                               */

                              vattr.va_mask = AT_UID;

#ifdef __FreeBSD_kernel__
                              vn_lock(mvp, LK_SHARED | LK_RETRY);
                              if (VOP_GETATTR(mvp, &vattr, cr)) {
                                        VOP_UNLOCK(mvp, 0);
                                        goto out;
                              }

                              if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
                                  VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
                                        VOP_UNLOCK(mvp, 0);
                                        goto out;
                              }
                              VOP_UNLOCK(mvp, 0);
#endif
#ifdef __NetBSD__
                              vn_lock(mvp, LK_SHARED | LK_RETRY);
                              if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
                                        VOP_UNLOCK(mvp, 0);
                                        goto out;
                              }

                              if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
                                  VOP_ACCESS(mvp, VWRITE, cr) != 0) {
                                        VOP_UNLOCK(mvp, 0);
                                        goto out;
                              }
                              VOP_UNLOCK(mvp, 0);
#endif
                    }

                    secpolicy_fs_mount_clearopts(cr, vfsp);
          }

          /*
           * Refuse to mount a filesystem if we are in a local zone and the
           * dataset is not visible.
           */
          if (!INGLOBALZONE(curthread) &&
              (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
                    error = SET_ERROR(EPERM);
                    goto out;
          }

#ifdef SECLABEL
          error = zfs_mount_label_policy(vfsp, osname);
          if (error)
                    goto out;
#endif

#ifdef __FreeBSD_kernel__
          vfsp->vfs_flag |= MNT_NFS4ACLS;
#endif
#ifdef __NetBSD__
          vfsp->mnt_iflag |= IMNT_MPSAFE | IMNT_NCLOOKUP;
#endif

          /*
           * When doing a remount, we simply refresh our temporary properties
           * according to those options set in the current VFS options.
           */
          if (vfsp->vfs_flag & MS_REMOUNT) {
                    zfsvfs_t *zfsvfs = vfsp->vfs_data;

                    /*
                     * Refresh mount options with z_teardown_lock blocking I/O while
                     * the filesystem is in an inconsistent state.
                     * The lock also serializes this code with filesystem
                     * manipulations between entry to zfs_suspend_fs() and return
                     * from zfs_resume_fs().
                     */
                    rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
                    zfs_unregister_callbacks(zfsvfs);
                    error = zfs_register_callbacks(vfsp);
                    rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
                    goto out;
          }

#ifdef __FreeBSD_kernel__
          /* Initial root mount: try hard to import the requested root pool. */
          if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
              (vfsp->vfs_flag & MNT_UPDATE) == 0) {
                    char pname[MAXNAMELEN];

                    error = getpoolname(osname, pname);
                    if (error == 0)
                              error = spa_import_rootpool(pname);
                    if (error)
                              goto out;
          }
#endif

          DROP_GIANT();
          error = zfs_domount(vfsp, osname);
          PICKUP_GIANT();

#ifdef illumos
          /*
           * Add an extra VFS_HOLD on our parent vfs so that it can't
           * disappear due to a forced unmount.
           */
          if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
                    VFS_HOLD(mvp->v_vfsp);
#endif

#ifdef __NetBSD__
          /* setup zfs mount info */
          strlcpy(vfsp->mnt_stat.f_mntfromname, osname,
              sizeof(vfsp->mnt_stat.f_mntfromname));
          set_statvfs_info(path, UIO_USERSPACE, vfsp->mnt_stat.f_mntfromname,
              UIO_SYSSPACE, vfsp->mnt_op->vfs_name, vfsp, curlwp);
#endif

out:
          return (error);
}

#ifdef __FreeBSD_kernel__
static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
#endif
#ifdef __NetBSD__
static int
zfs_statvfs(vfs_t *vfsp, struct statvfs *statp)
#endif
{
          zfsvfs_t *zfsvfs = vfsp->vfs_data;
          uint64_t refdbytes, availbytes, usedobjs, availobjs;

#ifdef __FreeBSD_kernel__
          statp->f_version = STATFS_VERSION;
#endif

          ZFS_ENTER(zfsvfs);

          dmu_objset_space(zfsvfs->z_os,
              &refdbytes, &availbytes, &usedobjs, &availobjs);

          /*
           * The underlying storage pool actually uses multiple block sizes.
           * We report the fragsize as the smallest block size we support,
           * and we report our blocksize as the filesystem's maximum blocksize.
           */
          statp->f_bsize = SPA_MINBLOCKSIZE;
#ifdef __NetBSD__
          statp->f_frsize = SPA_MINBLOCKSIZE;
#endif
          statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;

          /*
           * The following report "total" blocks of various kinds in the
           * file system, but reported in terms of f_frsize - the
           * "fragment" size.
           */

          statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
          statp->f_bfree = availbytes / statp->f_bsize;
          statp->f_bavail = statp->f_bfree; /* no root reservation */
          statp->f_bresvd = 0;

          /*
           * statvfs() should really be called statufs(), because it assumes
           * static metadata.  ZFS doesn't preallocate files, so the best
           * we can do is report the max that could possibly fit in f_files,
           * and that minus the number actually used in f_ffree.
           * For f_ffree, report the smaller of the number of object available
           * and the number of blocks (each object will take at least a block).
           */
          statp->f_ffree = MIN(availobjs, statp->f_bfree);
#ifndef __FreeBSD__
          statp->f_favail = statp->f_ffree;       /* no "root reservation" */
#endif
          statp->f_files = statp->f_ffree + usedobjs;
          statp->f_fresvd = 0;

#ifdef __FreeBSD__
          (void) cmpldev(&d32, vfsp->vfs_dev);
          statp->f_fsid = d32;
#endif
#ifdef __NetBSD__
          statp->f_fsid = vfsp->mnt_stat.f_fsid;
          statp->f_fsidx = vfsp->mnt_stat.f_fsidx;
#endif

          /*
           * We're a zfs filesystem.
           */
          (void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));

          strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
              sizeof(statp->f_mntfromname));
          strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
              sizeof(statp->f_mntonname));

#ifdef __FreeBSD_kernel__
          statp->f_namemax = MAXNAMELEN - 1;
#endif
#ifdef __NetBSD__
          statp->f_namemax = ZFS_MAXNAMELEN;
#endif

          ZFS_EXIT(zfsvfs);
          return (0);
}

static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
          zfsvfs_t *zfsvfs = vfsp->vfs_data;
          znode_t *rootzp;
          int error;

          ZFS_ENTER(zfsvfs);

          error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
          if (error == 0)
                    *vpp = ZTOV(rootzp);

          ZFS_EXIT(zfsvfs);

          if (error == 0) {
                    error = vn_lock(*vpp, flags);
                    if (error != 0) {
                              VN_RELE(*vpp);
                              *vpp = NULL;
                    }
          }
          return (error);
}

/*
 * Teardown the zfsvfs::z_os.
 *
 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
          znode_t   *zp;

          rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

          if (!unmounting) {
                    /*
                     * We purge the parent filesystem's vfsp as the parent
                     * filesystem and all of its snapshots have their vnode's
                     * v_vfsp set to the parent's filesystem's vfsp.  Note,
                     * 'z_parent' is self referential for non-snapshots.
                     */
                    (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
#ifdef FREEBSD_NAMECACHE
                    cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
          }

          /*
           * Close the zil. NB: Can't close the zil while zfs_inactive
           * threads are blocked as zil_close can call zfs_inactive.
           */
          if (zfsvfs->z_log) {
                    zil_close(zfsvfs->z_log);
                    zfsvfs->z_log = NULL;
          }

          rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);

          /*
           * If we are not unmounting (ie: online recv) and someone already
           * unmounted this file system while we were doing the switcheroo,
           * or a reopen of z_os failed then just bail out now.
           */
          if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
                    rw_exit(&zfsvfs->z_teardown_inactive_lock);
                    rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
                    return (SET_ERROR(EIO));
          }

          /*
           * At this point there are no vops active, and any new vops will
           * fail with EIO since we have z_teardown_lock for writer (only
           * relavent for forced unmount).
           *
           * Release all holds on dbufs.
           */
          mutex_enter(&zfsvfs->z_znodes_lock);
          for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
              zp = list_next(&zfsvfs->z_all_znodes, zp))
                    if (zp->z_sa_hdl) {
#ifdef __NetBSD__
                              ASSERT(vrefcnt(ZTOV(zp)) >= 0);
#else
                              ASSERT(ZTOV(zp)->v_count >= 0);
#endif
                              zfs_znode_dmu_fini(zp);
                    }
          mutex_exit(&zfsvfs->z_znodes_lock);

          /*
           * If we are unmounting, set the unmounted flag and let new vops
           * unblock.  zfs_inactive will have the unmounted behavior, and all
           * other vops will fail with EIO.
           */
          if (unmounting) {
                    zfsvfs->z_unmounted = B_TRUE;
                    rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
                    rw_exit(&zfsvfs->z_teardown_inactive_lock);
          }

          /*
           * z_os will be NULL if there was an error in attempting to reopen
           * zfsvfs, so just return as the properties had already been
           * unregistered and cached data had been evicted before.
           */
          if (zfsvfs->z_os == NULL)
                    return (0);

          /*
           * Unregister properties.
           */
          zfs_unregister_callbacks(zfsvfs);

          /*
           * Evict cached data
           */
          if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
              !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
                    txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
          dmu_objset_evict_dbufs(zfsvfs->z_os);

          return (0);
}

/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
          zfsvfs_t *zfsvfs = vfsp->vfs_data;
          objset_t *os;
          int ret;
#ifdef __FreeBSD_kernel__
          kthread_t *td = curthread;
          cred_t *cr = td->td_ucred;
#endif
#ifdef __NetBSD__
          cred_t *cr = CRED();
#endif

          ret = secpolicy_fs_unmount(cr, vfsp);
          if (ret) {
                    if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
                        ZFS_DELEG_PERM_MOUNT, cr))
                              return (ret);
          }

          /*
           * We purge the parent filesystem's vfsp as the parent filesystem
           * and all of its snapshots have their vnode's v_vfsp set to the
           * parent's filesystem's vfsp.  Note, 'z_parent' is self
           * referential for non-snapshots.
           */
          (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);

          /*
           * Unmount any snapshots mounted under .zfs before unmounting the
           * dataset itself.
           */
          if (zfsvfs->z_ctldir != NULL) {
                    if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
                              return (ret);
          }

          if (fflag & MS_FORCE) {
                    /*
                     * Mark file system as unmounted before calling
                     * vflush(FORCECLOSE). This way we ensure no future vnops
                     * will be called and risk operating on DOOMED vnodes.
                     */
                    rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
                    zfsvfs->z_unmounted = B_TRUE;
                    rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
          }

          /*
           * Flush all the files.
           */
#ifdef __FreeBSD_kernel__
          ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
#endif
#ifdef __NetBSD__
          ret = vflush(vfsp, NULL, (fflag & MS_FORCE) ? FORCECLOSE : 0);
#endif
          if (ret != 0)
                    return (ret);

#ifdef illumos
          if (!(fflag & MS_FORCE)) {
                    /*
                     * Check the number of active vnodes in the file system.
                     * Our count is maintained in the vfs structure, but the
                     * number is off by 1 to indicate a hold on the vfs
                     * structure itself.
                     *
                     * The '.zfs' directory maintains a reference of its
                     * own, and any active references underneath are
                     * reflected in the vnode count.
                     */
                    if (zfsvfs->z_ctldir == NULL) {
                              if (vfsp->vfs_count > 1)
                                        return (SET_ERROR(EBUSY));
                    } else {
                              if (vfsp->vfs_count > 2 ||
                                  zfsvfs->z_ctldir->v_count > 1)
                                        return (SET_ERROR(EBUSY));
                    }
          }
#endif

          VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
          os = zfsvfs->z_os;

          /*
           * z_os will be NULL if there was an error in
           * attempting to reopen zfsvfs.
           */
          if (os != NULL) {
                    /*
                     * Unset the objset user_ptr.
                     */
                    mutex_enter(&os->os_user_ptr_lock);
                    dmu_objset_set_user(os, NULL);
                    mutex_exit(&os->os_user_ptr_lock);

                    /*
                     * Finally release the objset
                     */
                    dmu_objset_disown(os, zfsvfs);
          }

          /*
           * We can now safely destroy the '.zfs' directory node.
           */
          if (zfsvfs->z_ctldir != NULL)
                    zfsctl_destroy(zfsvfs);
          zfs_freevfs(vfsp);

          return (0);
}

static int
zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
{
          zfsvfs_t  *zfsvfs = vfsp->vfs_data;
          znode_t             *zp;
          int                 err;

          /*
           * zfs_zget() can't operate on virtual entries like .zfs/ or
           * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
           * This will make NFS to switch to LOOKUP instead of using VGET.
           */
          if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
              (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
                    return (EOPNOTSUPP);

          ZFS_ENTER(zfsvfs);
          err = zfs_zget(zfsvfs, ino, &zp);
          if (err == 0 && zp->z_unlinked) {
                    VN_RELE(ZTOV(zp));
                    err = EINVAL;
          }
          if (err == 0)
                    *vpp = ZTOV(zp);
          ZFS_EXIT(zfsvfs);
          if (err == 0)
                    err = vn_lock(*vpp, flags);
          if (err != 0)
                    *vpp = NULL;

          return (err);
}

#ifdef __FreeBSD_kernel__
static int
zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
    struct ucred **credanonp, int *numsecflavors, int **secflavors)
{
          zfsvfs_t *zfsvfs = vfsp->vfs_data;

          /*
           * If this is regular file system vfsp is the same as
           * zfsvfs->z_parent->z_vfs, but if it is snapshot,
           * zfsvfs->z_parent->z_vfs represents parent file system
           * which we have to use here, because only this file system
           * has mnt_export configured.
           */
          return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
              credanonp, numsecflavors, secflavors));
}

CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
#endif

#ifdef __FreeBSD_kernel__
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
          struct componentname cn;
          zfsvfs_t  *zfsvfs = vfsp->vfs_data;
          znode_t             *zp;
          vnode_t             *dvp;
          uint64_t  object = 0;
          uint64_t  fid_gen = 0;
          uint64_t  gen_mask;
          uint64_t  zp_gen;
          int                 i, err;

          *vpp = NULL;

          ZFS_ENTER(zfsvfs);

          /*
           * On FreeBSD we can get snapshot's mount point or its parent file
           * system mount point depending if snapshot is already mounted or not.
           */
          if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
                    zfid_long_t         *zlfid = (zfid_long_t *)fidp;
                    uint64_t  objsetid = 0;
                    uint64_t  setgen = 0;

                    for (i = 0; i < sizeof (zlfid->zf_setid); i++)
                              objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

                    for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
                              setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

                    ZFS_EXIT(zfsvfs);

                    err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
                    if (err)
                              return (SET_ERROR(EINVAL));
                    ZFS_ENTER(zfsvfs);
          }

          if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
                    zfid_short_t        *zfid = (zfid_short_t *)fidp;

                    for (i = 0; i < sizeof (zfid->zf_object); i++)
                              object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

                    for (i = 0; i < sizeof (zfid->zf_gen); i++)
                              fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
          } else {
                    ZFS_EXIT(zfsvfs);
                    return (SET_ERROR(EINVAL));
          }

          /*
           * A zero fid_gen means we are in .zfs or the .zfs/snapshot
           * directory tree. If the object == zfsvfs->z_shares_dir, then
           * we are in the .zfs/shares directory tree.
           */
          if ((fid_gen == 0 &&
               (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
              (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
                    ZFS_EXIT(zfsvfs);
                    VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
                    if (object == ZFSCTL_INO_SNAPDIR) {
                              cn.cn_nameptr = "snapshot";
                              cn.cn_namelen = strlen(cn.cn_nameptr);
                              cn.cn_nameiop = LOOKUP;
                              cn.cn_flags = ISLASTCN | LOCKLEAF;
                              cn.cn_lkflags = flags;
                              VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
                              vput(dvp);
                    } else if (object == zfsvfs->z_shares_dir) {
                              /*
                               * XXX This branch must not be taken,
                               * if it is, then the lookup below will
                               * explode.
                               */
                              cn.cn_nameptr = "shares";
                              cn.cn_namelen = strlen(cn.cn_nameptr);
                              cn.cn_nameiop = LOOKUP;
                              cn.cn_flags = ISLASTCN;
                              cn.cn_lkflags = flags;
                              VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
                              vput(dvp);
                    } else {
                              *vpp = dvp;
                    }
                    return (err);
          }

          gen_mask = -1ULL >> (64 - 8 * i);

          dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
          if (err = zfs_zget(zfsvfs, object, &zp)) {
                    ZFS_EXIT(zfsvfs);
                    return (err);
          }
          (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
              sizeof (uint64_t));
          zp_gen = zp_gen & gen_mask;
          if (zp_gen == 0)
                    zp_gen = 1;
          if (zp->z_unlinked || zp_gen != fid_gen) {
                    dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
                    VN_RELE(ZTOV(zp));
                    ZFS_EXIT(zfsvfs);
                    return (SET_ERROR(EINVAL));
          }

          *vpp = ZTOV(zp);
          ZFS_EXIT(zfsvfs);
          err = vn_lock(*vpp, flags);
          if (err == 0)
                    vnode_create_vobject(*vpp, zp->z_size, curthread);
          else
                    *vpp = NULL;
          return (err);
}
#endif /* __FreeBSD_kernel__ */

/*
 * Block out VOPs and close zfsvfs_t::z_os
 *
 * Note, if successful, then we return with the 'z_teardown_lock' and
 * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
 * dataset and objset intact so that they can be atomically handed off during
 * a subsequent rollback or recv operation and the resume thereafter.
 */
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
{
          int error;

#ifdef __NetBSD__
          if ((error = vfs_suspend(zfsvfs->z_vfs, 0)) != 0)
                    return error;
          if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) {
                    vfs_resume(zfsvfs->z_vfs);
                    return (error);
          }
#else
          if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
                    return (error);
#endif

          return (0);
}

/*
 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 */
#ifdef __NetBSD__
static bool
zfs_resume_selector(void *cl, struct vnode *vp)
{

          if (zfsctl_is_node(vp))
                    return false;
          return (VTOZ(vp)->z_sa_hdl == NULL);
}
#endif
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
          int err;
          znode_t *zp;

          ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
          ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));

          /*
           * We already own this, so just update the objset_t, as the one we
           * had before may have been evicted.
           */
          objset_t *os;
          VERIFY3P(ds->ds_owner, ==, zfsvfs);
          VERIFY(dsl_dataset_long_held(ds));
          VERIFY0(dmu_objset_from_ds(ds, &os));

          err = zfsvfs_init(zfsvfs, os);
          if (err != 0)
                    goto bail;

          VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

          zfs_set_fuid_feature(zfsvfs);

          /*
           * Attempt to re-establish all the active znodes with
           * their dbufs.  If a zfs_rezget() fails, then we'll let
           * any potential callers discover that via ZFS_ENTER_VERIFY_VP
           * when they try to use their znode.
           */
          mutex_enter(&zfsvfs->z_znodes_lock);
          for (zp = list_head(&zfsvfs->z_all_znodes); zp;
              zp = list_next(&zfsvfs->z_all_znodes, zp)) {
                    (void) zfs_rezget(zp);
          }
          mutex_exit(&zfsvfs->z_znodes_lock);

bail:
          /* release the VOPs */
          rw_exit(&zfsvfs->z_teardown_inactive_lock);
          rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
#ifdef __NetBSD__
          struct vnode_iterator *marker;
          vnode_t *vp;

          vfs_vnode_iterator_init(zfsvfs->z_vfs, &marker);
          while ((vp = vfs_vnode_iterator_next(marker,
              zfs_resume_selector, NULL))) {
                    vgone(vp);
          }
          vfs_vnode_iterator_destroy(marker);
          vfs_resume(zfsvfs->z_vfs);
#endif

          if (err) {
                    /*
                     * Since we couldn't setup the sa framework, try to force
                     * unmount this file system.
                     */
                    if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
                              vfs_ref(zfsvfs->z_vfs);
                              (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
                    }
          }
          return (err);
}

static void
zfs_freevfs(vfs_t *vfsp)
{
          zfsvfs_t *zfsvfs = vfsp->vfs_data;

#ifdef illumos
          /*
           * If this is a snapshot, we have an extra VFS_HOLD on our parent
           * from zfs_mount().  Release it here.  If we came through
           * zfs_mountroot() instead, we didn't grab an extra hold, so
           * skip the VFS_RELE for rootvfs.
           */
          if (zfsvfs->z_issnap && (vfsp != rootvfs))
                    VFS_RELE(zfsvfs->z_parent->z_vfs);
#endif

          zfsvfs_free(zfsvfs);

          atomic_dec_32(&zfs_active_fs_count);
}

#ifdef __FreeBSD_kernel__
#ifdef __i386__
static int desiredvnodes_backup;
#endif

static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
          int newdesiredvnodes;

          desiredvnodes_backup = desiredvnodes;

          /*
           * We calculate newdesiredvnodes the same way it is done in
           * vntblinit(). If it is equal to desiredvnodes, it means that
           * it wasn't tuned by the administrator and we can tune it down.
           */
          newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
              vm_kmem_size / (5 * (sizeof(struct vm_object) +
              sizeof(struct vnode))));
          if (newdesiredvnodes == desiredvnodes)
                    desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}

static void
zfs_vnodes_adjust_back(void)
{

#ifdef __i386__
          desiredvnodes = desiredvnodes_backup;
#endif
}
#endif /* __FreeBSD_kernel__ */

#ifdef __NetBSD__
static void
zfs_vnodes_adjust(void)
{
}

static void
zfs_vnodes_adjust_back(void)
{
}
#endif

void
zfs_init(void)
{

          printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

          /*
           * Initialize .zfs directory structures
           */
          zfsctl_init();

          /*
           * Initialize znode cache, vnode ops, etc...
           */
          zfs_znode_init();

          /*
           * Reduce number of vnodes. Originally number of vnodes is calculated
           * with UFS inode in mind. We reduce it here, because it's too big for
           * ZFS/i386.
           */
          zfs_vnodes_adjust();

          dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
}

void
zfs_fini(void)
{
          zfsctl_fini();
          zfs_znode_fini();
          zfs_vnodes_adjust_back();
}

int
zfs_busy(void)
{
          return (zfs_active_fs_count != 0);
}

int
zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
{
          int error;
          objset_t *os = zfsvfs->z_os;
          dmu_tx_t *tx;

          if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
                    return (SET_ERROR(EINVAL));

          if (newvers < zfsvfs->z_version)
                    return (SET_ERROR(EINVAL));

          if (zfs_spa_version_map(newvers) >
              spa_version(dmu_objset_spa(zfsvfs->z_os)))
                    return (SET_ERROR(ENOTSUP));

          tx = dmu_tx_create(os);
          dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
          if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
                    dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
                        ZFS_SA_ATTRS);
                    dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
          }
          error = dmu_tx_assign(tx, TXG_WAIT);
          if (error) {
                    dmu_tx_abort(tx);
                    return (error);
          }

          error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
              8, 1, &newvers, tx);

          if (error) {
                    dmu_tx_commit(tx);
                    return (error);
          }

          if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
                    uint64_t sa_obj;

                    ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
                        SPA_VERSION_SA);
                    sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
                        DMU_OT_NONE, 0, tx);

                    error = zap_add(os, MASTER_NODE_OBJ,
                        ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
                    ASSERT0(error);

                    VERIFY(0 == sa_set_sa_object(os, sa_obj));
                    sa_register_update_callback(os, zfs_sa_upgrade);
          }

          spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
              "from %llu to %llu", zfsvfs->z_version, newvers);

          dmu_tx_commit(tx);

          zfsvfs->z_version = newvers;

          zfs_set_fuid_feature(zfsvfs);

          return (0);
}

/*
 * Read a property stored within the master node.
 */
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
          const char *pname;
          int error = ENOENT;

          /*
           * Look up the file system's value for the property.  For the
           * version property, we look up a slightly different string.
           */
          if (prop == ZFS_PROP_VERSION)
                    pname = ZPL_VERSION_STR;
          else
                    pname = zfs_prop_to_name(prop);

          if (os != NULL)
                    error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);

          if (error == ENOENT) {
                    /* No value set, use the default value */
                    switch (prop) {
                    case ZFS_PROP_VERSION:
                              *value = ZPL_VERSION;
                              break;
                    case ZFS_PROP_NORMALIZE:
                    case ZFS_PROP_UTF8ONLY:
                              *value = 0;
                              break;
                    case ZFS_PROP_CASE:
                              *value = ZFS_CASE_SENSITIVE;
                              break;
                    default:
                              return (error);
                    }
                    error = 0;
          }
          return (error);
}

#if defined(__FreeBSD_kernel__) || defined(__NetBSD__)
#ifdef _KERNEL
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
          char tmpbuf[MAXPATHLEN];
          struct mount *mp;
          char *fromname;
          size_t oldlen;

          oldlen = strlen(oldname);

#ifdef __NetBSD__
          mount_iterator_t *iter;
          mountlist_iterator_init(&iter);
          while ((mp = mountlist_iterator_next(iter)) != NULL) {
#else
          mtx_lock(&mountlist_mtx);
          TAILQ_FOREACH(mp, &mountlist, mnt_list) {
#endif
                    fromname = mp->mnt_stat.f_mntfromname;
                    if (strcmp(fromname, oldname) == 0) {
                              (void)strlcpy(fromname, newname,
                                  sizeof(mp->mnt_stat.f_mntfromname));
                              continue;
                    }
                    if (strncmp(fromname, oldname, oldlen) == 0 &&
                        (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
                              (void)snprintf(tmpbuf, sizeof(tmpbuf), "%s%s",
                                  newname, fromname + oldlen);
                              (void)strlcpy(fromname, tmpbuf,
                                  sizeof(mp->mnt_stat.f_mntfromname));
                              continue;
                    }
          }
#ifdef __NetBSD__
          mountlist_iterator_destroy(iter);
#else
          mtx_unlock(&mountlist_mtx);
#endif
}
#endif
#endif