Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/param.h>
     28 #include <sys/systm.h>
     29 #include <sys/sysmacros.h>
     30 #include <sys/kmem.h>
     31 #include <sys/pathname.h>
     32 #include <sys/vnode.h>
     33 #include <sys/vfs.h>
     34 #include <sys/vfs_opreg.h>
     35 #include <sys/mntent.h>
     36 #include <sys/mount.h>
     37 #include <sys/cmn_err.h>
     38 #include "fs/fs_subr.h"
     39 #include <sys/zfs_znode.h>
     40 #include <sys/zfs_dir.h>
     41 #include <sys/zil.h>
     42 #include <sys/fs/zfs.h>
     43 #include <sys/dmu.h>
     44 #include <sys/dsl_prop.h>
     45 #include <sys/dsl_dataset.h>
     46 #include <sys/dsl_deleg.h>
     47 #include <sys/spa.h>
     48 #include <sys/zap.h>
     49 #include <sys/varargs.h>
     50 #include <sys/policy.h>
     51 #include <sys/atomic.h>
     52 #include <sys/mkdev.h>
     53 #include <sys/modctl.h>
     54 #include <sys/refstr.h>
     55 #include <sys/zfs_ioctl.h>
     56 #include <sys/zfs_ctldir.h>
     57 #include <sys/zfs_fuid.h>
     58 #include <sys/bootconf.h>
     59 #include <sys/sunddi.h>
     60 #include <sys/dnlc.h>
     61 #include <sys/dmu_objset.h>
     62 #include <sys/spa_boot.h>
     63 
     64 int zfsfstype;
     65 vfsops_t *zfs_vfsops = NULL;
     66 static major_t zfs_major;
     67 static minor_t zfs_minor;
     68 static kmutex_t	zfs_dev_mtx;
     69 
     70 extern int sys_shutdown;
     71 
     72 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
     73 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
     74 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
     75 static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
     76 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
     77 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
     78 static void zfs_freevfs(vfs_t *vfsp);
     79 
     80 static const fs_operation_def_t zfs_vfsops_template[] = {
     81 	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
     82 	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
     83 	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
     84 	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
     85 	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
     86 	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
     87 	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
     88 	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
     89 	NULL,			NULL
     90 };
     91 
     92 static const fs_operation_def_t zfs_vfsops_eio_template[] = {
     93 	VFSNAME_FREEVFS,	{ .vfs_freevfs =  zfs_freevfs },
     94 	NULL,			NULL
     95 };
     96 
     97 /*
     98  * We need to keep a count of active fs's.
     99  * This is necessary to prevent our module
    100  * from being unloaded after a umount -f
    101  */
    102 static uint32_t	zfs_active_fs_count = 0;
    103 
    104 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
    105 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
    106 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
    107 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
    108 
    109 /*
    110  * MO_DEFAULT is not used since the default value is determined
    111  * by the equivalent property.
    112  */
    113 static mntopt_t mntopts[] = {
    114 	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
    115 	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
    116 	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
    117 	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
    118 };
    119 
    120 static mntopts_t zfs_mntopts = {
    121 	sizeof (mntopts) / sizeof (mntopt_t),
    122 	mntopts
    123 };
    124 
    125 /*ARGSUSED*/
    126 int
    127 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
    128 {
    129 	/*
    130 	 * Data integrity is job one.  We don't want a compromised kernel
    131 	 * writing to the storage pool, so we never sync during panic.
    132 	 */
    133 	if (panicstr)
    134 		return (0);
    135 
    136 	/*
    137 	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
    138 	 * to sync metadata, which they would otherwise cache indefinitely.
    139 	 * Semantically, the only requirement is that the sync be initiated.
    140 	 * The DMU syncs out txgs frequently, so there's nothing to do.
    141 	 */
    142 	if (flag & SYNC_ATTR)
    143 		return (0);
    144 
    145 	if (vfsp != NULL) {
    146 		/*
    147 		 * Sync a specific filesystem.
    148 		 */
    149 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
    150 		dsl_pool_t *dp;
    151 
    152 		ZFS_ENTER(zfsvfs);
    153 		dp = dmu_objset_pool(zfsvfs->z_os);
    154 
    155 		/*
    156 		 * If the system is shutting down, then skip any
    157 		 * filesystems which may exist on a suspended pool.
    158 		 */
    159 		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
    160 			ZFS_EXIT(zfsvfs);
    161 			return (0);
    162 		}
    163 
    164 		if (zfsvfs->z_log != NULL)
    165 			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
    166 		else
    167 			txg_wait_synced(dp, 0);
    168 		ZFS_EXIT(zfsvfs);
    169 	} else {
    170 		/*
    171 		 * Sync all ZFS filesystems.  This is what happens when you
    172 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
    173 		 * request by waiting for all pools to commit all dirty data.
    174 		 */
    175 		spa_sync_allpools();
    176 	}
    177 
    178 	return (0);
    179 }
    180 
    181 static int
    182 zfs_create_unique_device(dev_t *dev)
    183 {
    184 	major_t new_major;
    185 
    186 	do {
    187 		ASSERT3U(zfs_minor, <=, MAXMIN32);
    188 		minor_t start = zfs_minor;
    189 		do {
    190 			mutex_enter(&zfs_dev_mtx);
    191 			if (zfs_minor >= MAXMIN32) {
    192 				/*
    193 				 * If we're still using the real major
    194 				 * keep out of /dev/zfs and /dev/zvol minor
    195 				 * number space.  If we're using a getudev()'ed
    196 				 * major number, we can use all of its minors.
    197 				 */
    198 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
    199 					zfs_minor = ZFS_MIN_MINOR;
    200 				else
    201 					zfs_minor = 0;
    202 			} else {
    203 				zfs_minor++;
    204 			}
    205 			*dev = makedevice(zfs_major, zfs_minor);
    206 			mutex_exit(&zfs_dev_mtx);
    207 		} while (vfs_devismounted(*dev) && zfs_minor != start);
    208 		if (zfs_minor == start) {
    209 			/*
    210 			 * We are using all ~262,000 minor numbers for the
    211 			 * current major number.  Create a new major number.
    212 			 */
    213 			if ((new_major = getudev()) == (major_t)-1) {
    214 				cmn_err(CE_WARN,
    215 				    "zfs_mount: Can't get unique major "
    216 				    "device number.");
    217 				return (-1);
    218 			}
    219 			mutex_enter(&zfs_dev_mtx);
    220 			zfs_major = new_major;
    221 			zfs_minor = 0;
    222 
    223 			mutex_exit(&zfs_dev_mtx);
    224 		} else {
    225 			break;
    226 		}
    227 		/* CONSTANTCONDITION */
    228 	} while (1);
    229 
    230 	return (0);
    231 }
    232 
    233 static void
    234 atime_changed_cb(void *arg, uint64_t newval)
    235 {
    236 	zfsvfs_t *zfsvfs = arg;
    237 
    238 	if (newval == TRUE) {
    239 		zfsvfs->z_atime = TRUE;
    240 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
    241 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
    242 	} else {
    243 		zfsvfs->z_atime = FALSE;
    244 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
    245 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
    246 	}
    247 }
    248 
    249 static void
    250 xattr_changed_cb(void *arg, uint64_t newval)
    251 {
    252 	zfsvfs_t *zfsvfs = arg;
    253 
    254 	if (newval == TRUE) {
    255 		/* XXX locking on vfs_flag? */
    256 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
    257 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
    258 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
    259 	} else {
    260 		/* XXX locking on vfs_flag? */
    261 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
    262 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
    263 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
    264 	}
    265 }
    266 
    267 static void
    268 blksz_changed_cb(void *arg, uint64_t newval)
    269 {
    270 	zfsvfs_t *zfsvfs = arg;
    271 
    272 	if (newval < SPA_MINBLOCKSIZE ||
    273 	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
    274 		newval = SPA_MAXBLOCKSIZE;
    275 
    276 	zfsvfs->z_max_blksz = newval;
    277 	zfsvfs->z_vfs->vfs_bsize = newval;
    278 }
    279 
    280 static void
    281 readonly_changed_cb(void *arg, uint64_t newval)
    282 {
    283 	zfsvfs_t *zfsvfs = arg;
    284 
    285 	if (newval) {
    286 		/* XXX locking on vfs_flag? */
    287 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
    288 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
    289 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
    290 	} else {
    291 		/* XXX locking on vfs_flag? */
    292 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    293 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
    294 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
    295 	}
    296 }
    297 
    298 static void
    299 devices_changed_cb(void *arg, uint64_t newval)
    300 {
    301 	zfsvfs_t *zfsvfs = arg;
    302 
    303 	if (newval == FALSE) {
    304 		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
    305 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
    306 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
    307 	} else {
    308 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
    309 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
    310 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
    311 	}
    312 }
    313 
    314 static void
    315 setuid_changed_cb(void *arg, uint64_t newval)
    316 {
    317 	zfsvfs_t *zfsvfs = arg;
    318 
    319 	if (newval == FALSE) {
    320 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
    321 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
    322 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
    323 	} else {
    324 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
    325 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
    326 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
    327 	}
    328 }
    329 
    330 static void
    331 exec_changed_cb(void *arg, uint64_t newval)
    332 {
    333 	zfsvfs_t *zfsvfs = arg;
    334 
    335 	if (newval == FALSE) {
    336 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
    337 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
    338 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
    339 	} else {
    340 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
    341 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
    342 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
    343 	}
    344 }
    345 
    346 /*
    347  * The nbmand mount option can be changed at mount time.
    348  * We can't allow it to be toggled on live file systems or incorrect
    349  * behavior may be seen from cifs clients
    350  *
    351  * This property isn't registered via dsl_prop_register(), but this callback
    352  * will be called when a file system is first mounted
    353  */
    354 static void
    355 nbmand_changed_cb(void *arg, uint64_t newval)
    356 {
    357 	zfsvfs_t *zfsvfs = arg;
    358 	if (newval == FALSE) {
    359 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
    360 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
    361 	} else {
    362 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
    363 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
    364 	}
    365 }
    366 
    367 static void
    368 snapdir_changed_cb(void *arg, uint64_t newval)
    369 {
    370 	zfsvfs_t *zfsvfs = arg;
    371 
    372 	zfsvfs->z_show_ctldir = newval;
    373 }
    374 
    375 static void
    376 vscan_changed_cb(void *arg, uint64_t newval)
    377 {
    378 	zfsvfs_t *zfsvfs = arg;
    379 
    380 	zfsvfs->z_vscan = newval;
    381 }
    382 
    383 static void
    384 acl_mode_changed_cb(void *arg, uint64_t newval)
    385 {
    386 	zfsvfs_t *zfsvfs = arg;
    387 
    388 	zfsvfs->z_acl_mode = newval;
    389 }
    390 
    391 static void
    392 acl_inherit_changed_cb(void *arg, uint64_t newval)
    393 {
    394 	zfsvfs_t *zfsvfs = arg;
    395 
    396 	zfsvfs->z_acl_inherit = newval;
    397 }
    398 
    399 static int
    400 zfs_register_callbacks(vfs_t *vfsp)
    401 {
    402 	struct dsl_dataset *ds = NULL;
    403 	objset_t *os = NULL;
    404 	zfsvfs_t *zfsvfs = NULL;
    405 	uint64_t nbmand;
    406 	int readonly, do_readonly = B_FALSE;
    407 	int setuid, do_setuid = B_FALSE;
    408 	int exec, do_exec = B_FALSE;
    409 	int devices, do_devices = B_FALSE;
    410 	int xattr, do_xattr = B_FALSE;
    411 	int atime, do_atime = B_FALSE;
    412 	int error = 0;
    413 
    414 	ASSERT(vfsp);
    415 	zfsvfs = vfsp->vfs_data;
    416 	ASSERT(zfsvfs);
    417 	os = zfsvfs->z_os;
    418 
    419 	/*
    420 	 * The act of registering our callbacks will destroy any mount
    421 	 * options we may have.  In order to enable temporary overrides
    422 	 * of mount options, we stash away the current values and
    423 	 * restore them after we register the callbacks.
    424 	 */
    425 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
    426 		readonly = B_TRUE;
    427 		do_readonly = B_TRUE;
    428 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
    429 		readonly = B_FALSE;
    430 		do_readonly = B_TRUE;
    431 	}
    432 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
    433 		devices = B_FALSE;
    434 		setuid = B_FALSE;
    435 		do_devices = B_TRUE;
    436 		do_setuid = B_TRUE;
    437 	} else {
    438 		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
    439 			devices = B_FALSE;
    440 			do_devices = B_TRUE;
    441 		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
    442 			devices = B_TRUE;
    443 			do_devices = B_TRUE;
    444 		}
    445 
    446 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
    447 			setuid = B_FALSE;
    448 			do_setuid = B_TRUE;
    449 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
    450 			setuid = B_TRUE;
    451 			do_setuid = B_TRUE;
    452 		}
    453 	}
    454 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
    455 		exec = B_FALSE;
    456 		do_exec = B_TRUE;
    457 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
    458 		exec = B_TRUE;
    459 		do_exec = B_TRUE;
    460 	}
    461 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
    462 		xattr = B_FALSE;
    463 		do_xattr = B_TRUE;
    464 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
    465 		xattr = B_TRUE;
    466 		do_xattr = B_TRUE;
    467 	}
    468 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
    469 		atime = B_FALSE;
    470 		do_atime = B_TRUE;
    471 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
    472 		atime = B_TRUE;
    473 		do_atime = B_TRUE;
    474 	}
    475 
    476 	/*
    477 	 * nbmand is a special property.  It can only be changed at
    478 	 * mount time.
    479 	 *
    480 	 * This is weird, but it is documented to only be changeable
    481 	 * at mount time.
    482 	 */
    483 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
    484 		nbmand = B_FALSE;
    485 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
    486 		nbmand = B_TRUE;
    487 	} else {
    488 		char osname[MAXNAMELEN];
    489 
    490 		dmu_objset_name(os, osname);
    491 		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
    492 		    NULL)) {
    493 			return (error);
    494 		}
    495 	}
    496 
    497 	/*
    498 	 * Register property callbacks.
    499 	 *
    500 	 * It would probably be fine to just check for i/o error from
    501 	 * the first prop_register(), but I guess I like to go
    502 	 * overboard...
    503 	 */
    504 	ds = dmu_objset_ds(os);
    505 	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
    506 	error = error ? error : dsl_prop_register(ds,
    507 	    "xattr", xattr_changed_cb, zfsvfs);
    508 	error = error ? error : dsl_prop_register(ds,
    509 	    "recordsize", blksz_changed_cb, zfsvfs);
    510 	error = error ? error : dsl_prop_register(ds,
    511 	    "readonly", readonly_changed_cb, zfsvfs);
    512 	error = error ? error : dsl_prop_register(ds,
    513 	    "devices", devices_changed_cb, zfsvfs);
    514 	error = error ? error : dsl_prop_register(ds,
    515 	    "setuid", setuid_changed_cb, zfsvfs);
    516 	error = error ? error : dsl_prop_register(ds,
    517 	    "exec", exec_changed_cb, zfsvfs);
    518 	error = error ? error : dsl_prop_register(ds,
    519 	    "snapdir", snapdir_changed_cb, zfsvfs);
    520 	error = error ? error : dsl_prop_register(ds,
    521 	    "aclmode", acl_mode_changed_cb, zfsvfs);
    522 	error = error ? error : dsl_prop_register(ds,
    523 	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
    524 	error = error ? error : dsl_prop_register(ds,
    525 	    "vscan", vscan_changed_cb, zfsvfs);
    526 	if (error)
    527 		goto unregister;
    528 
    529 	/*
    530 	 * Invoke our callbacks to restore temporary mount options.
    531 	 */
    532 	if (do_readonly)
    533 		readonly_changed_cb(zfsvfs, readonly);
    534 	if (do_setuid)
    535 		setuid_changed_cb(zfsvfs, setuid);
    536 	if (do_exec)
    537 		exec_changed_cb(zfsvfs, exec);
    538 	if (do_devices)
    539 		devices_changed_cb(zfsvfs, devices);
    540 	if (do_xattr)
    541 		xattr_changed_cb(zfsvfs, xattr);
    542 	if (do_atime)
    543 		atime_changed_cb(zfsvfs, atime);
    544 
    545 	nbmand_changed_cb(zfsvfs, nbmand);
    546 
    547 	return (0);
    548 
    549 unregister:
    550 	/*
    551 	 * We may attempt to unregister some callbacks that are not
    552 	 * registered, but this is OK; it will simply return ENOMSG,
    553 	 * which we will ignore.
    554 	 */
    555 	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
    556 	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
    557 	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
    558 	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
    559 	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
    560 	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
    561 	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
    562 	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
    563 	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
    564 	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
    565 	    zfsvfs);
    566 	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
    567 	return (error);
    568 
    569 }
    570 
    571 static void
    572 uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid,
    573     int64_t delta, dmu_tx_t *tx)
    574 {
    575 	uint64_t used = 0;
    576 	char buf[32];
    577 	int err;
    578 	uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
    579 
    580 	if (delta == 0)
    581 		return;
    582 
    583 	(void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid);
    584 	err = zap_lookup(os, obj, buf, 8, 1, &used);
    585 	ASSERT(err == 0 || err == ENOENT);
    586 	/* no underflow/overflow */
    587 	ASSERT(delta > 0 || used >= -delta);
    588 	ASSERT(delta < 0 || used + delta > used);
    589 	used += delta;
    590 	if (used == 0)
    591 		err = zap_remove(os, obj, buf, tx);
    592 	else
    593 		err = zap_update(os, obj, buf, 8, 1, &used, tx);
    594 	ASSERT(err == 0);
    595 }
    596 
    597 static int
    598 zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus,
    599     uint64_t *userp, uint64_t *groupp)
    600 {
    601 	znode_phys_t *znp = bonus;
    602 
    603 	if (bonustype != DMU_OT_ZNODE)
    604 		return (ENOENT);
    605 
    606 	*userp = znp->zp_uid;
    607 	*groupp = znp->zp_gid;
    608 	return (0);
    609 }
    610 
    611 static void
    612 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
    613     char *domainbuf, int buflen, uid_t *ridp)
    614 {
    615 	uint64_t fuid;
    616 	const char *domain;
    617 
    618 	fuid = strtonum(fuidstr, NULL);
    619 
    620 	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
    621 	if (domain)
    622 		(void) strlcpy(domainbuf, domain, buflen);
    623 	else
    624 		domainbuf[0] = '\0';
    625 	*ridp = FUID_RID(fuid);
    626 }
    627 
    628 static uint64_t
    629 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
    630 {
    631 	switch (type) {
    632 	case ZFS_PROP_USERUSED:
    633 		return (DMU_USERUSED_OBJECT);
    634 	case ZFS_PROP_GROUPUSED:
    635 		return (DMU_GROUPUSED_OBJECT);
    636 	case ZFS_PROP_USERQUOTA:
    637 		return (zfsvfs->z_userquota_obj);
    638 	case ZFS_PROP_GROUPQUOTA:
    639 		return (zfsvfs->z_groupquota_obj);
    640 	}
    641 	return (0);
    642 }
    643 
    644 int
    645 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    646     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
    647 {
    648 	int error;
    649 	zap_cursor_t zc;
    650 	zap_attribute_t za;
    651 	zfs_useracct_t *buf = vbuf;
    652 	uint64_t obj;
    653 
    654 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
    655 		return (ENOTSUP);
    656 
    657 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
    658 	if (obj == 0) {
    659 		*bufsizep = 0;
    660 		return (0);
    661 	}
    662 
    663 	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
    664 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
    665 	    zap_cursor_advance(&zc)) {
    666 		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
    667 		    *bufsizep)
    668 			break;
    669 
    670 		fuidstr_to_sid(zfsvfs, za.za_name,
    671 		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
    672 
    673 		buf->zu_space = za.za_first_integer;
    674 		buf++;
    675 	}
    676 	if (error == ENOENT)
    677 		error = 0;
    678 
    679 	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
    680 	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
    681 	*cookiep = zap_cursor_serialize(&zc);
    682 	zap_cursor_fini(&zc);
    683 	return (error);
    684 }
    685 
    686 /*
    687  * buf must be big enough (eg, 32 bytes)
    688  */
    689 static int
    690 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
    691     char *buf, boolean_t addok)
    692 {
    693 	uint64_t fuid;
    694 	int domainid = 0;
    695 
    696 	if (domain && domain[0]) {
    697 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
    698 		if (domainid == -1)
    699 			return (ENOENT);
    700 	}
    701 	fuid = FUID_ENCODE(domainid, rid);
    702 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
    703 	return (0);
    704 }
    705 
    706 int
    707 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    708     const char *domain, uint64_t rid, uint64_t *valp)
    709 {
    710 	char buf[32];
    711 	int err;
    712 	uint64_t obj;
    713 
    714 	*valp = 0;
    715 
    716 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
    717 		return (ENOTSUP);
    718 
    719 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
    720 	if (obj == 0)
    721 		return (0);
    722 
    723 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
    724 	if (err)
    725 		return (err);
    726 
    727 	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
    728 	if (err == ENOENT)
    729 		err = 0;
    730 	return (err);
    731 }
    732 
    733 int
    734 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
    735     const char *domain, uint64_t rid, uint64_t quota)
    736 {
    737 	char buf[32];
    738 	int err;
    739 	dmu_tx_t *tx;
    740 	uint64_t *objp;
    741 	boolean_t fuid_dirtied;
    742 
    743 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
    744 		return (EINVAL);
    745 
    746 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
    747 		return (ENOTSUP);
    748 
    749 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
    750 	    &zfsvfs->z_groupquota_obj;
    751 
    752 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
    753 	if (err)
    754 		return (err);
    755 	fuid_dirtied = zfsvfs->z_fuid_dirty;
    756 
    757 	tx = dmu_tx_create(zfsvfs->z_os);
    758 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
    759 	if (*objp == 0) {
    760 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
    761 		    zfs_userquota_prop_prefixes[type]);
    762 	}
    763 	if (fuid_dirtied)
    764 		zfs_fuid_txhold(zfsvfs, tx);
    765 	err = dmu_tx_assign(tx, TXG_WAIT);
    766 	if (err) {
    767 		dmu_tx_abort(tx);
    768 		return (err);
    769 	}
    770 
    771 	mutex_enter(&zfsvfs->z_lock);
    772 	if (*objp == 0) {
    773 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
    774 		    DMU_OT_NONE, 0, tx);
    775 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
    776 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
    777 	}
    778 	mutex_exit(&zfsvfs->z_lock);
    779 
    780 	if (quota == 0) {
    781 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
    782 		if (err == ENOENT)
    783 			err = 0;
    784 	} else {
    785 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
    786 	}
    787 	ASSERT(err == 0);
    788 	if (fuid_dirtied)
    789 		zfs_fuid_sync(zfsvfs, tx);
    790 	dmu_tx_commit(tx);
    791 	return (err);
    792 }
    793 
    794 boolean_t
    795 zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
    796 {
    797 	char buf[32];
    798 	uint64_t used, quota, usedobj, quotaobj;
    799 	int err;
    800 
    801 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
    802 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
    803 
    804 	if (quotaobj == 0 || zfsvfs->z_replay)
    805 		return (B_FALSE);
    806 
    807 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
    808 	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
    809 	if (err != 0)
    810 		return (B_FALSE);
    811 
    812 	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
    813 	if (err != 0)
    814 		return (B_FALSE);
    815 	return (used >= quota);
    816 }
    817 
    818 int
    819 zfsvfs_create(const char *osname, zfsvfs_t **zvp)
    820 {
    821 	objset_t *os;
    822 	zfsvfs_t *zfsvfs;
    823 	uint64_t zval;
    824 	int i, error;
    825 
    826 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
    827 
    828 	/*
    829 	 * We claim to always be readonly so we can open snapshots;
    830 	 * other ZPL code will prevent us from writing to snapshots.
    831 	 */
    832 	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
    833 	if (error) {
    834 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
    835 		return (error);
    836 	}
    837 
    838 	/*
    839 	 * Initialize the zfs-specific filesystem structure.
    840 	 * Should probably make this a kmem cache, shuffle fields,
    841 	 * and just bzero up to z_hold_mtx[].
    842 	 */
    843 	zfsvfs->z_vfs = NULL;
    844 	zfsvfs->z_parent = zfsvfs;
    845 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
    846 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
    847 	zfsvfs->z_os = os;
    848 
    849 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
    850 	if (error) {
    851 		goto out;
    852 	} else if (zfsvfs->z_version > ZPL_VERSION) {
    853 		(void) printf("Mismatched versions:  File system "
    854 		    "is version %llu on-disk format, which is "
    855 		    "incompatible with this software version %lld!",
    856 		    (u_longlong_t)zfsvfs->z_version, ZPL_VERSION);
    857 		error = ENOTSUP;
    858 		goto out;
    859 	}
    860 
    861 	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
    862 		goto out;
    863 	zfsvfs->z_norm = (int)zval;
    864 
    865 	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
    866 		goto out;
    867 	zfsvfs->z_utf8 = (zval != 0);
    868 
    869 	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
    870 		goto out;
    871 	zfsvfs->z_case = (uint_t)zval;
    872 
    873 	/*
    874 	 * Fold case on file systems that are always or sometimes case
    875 	 * insensitive.
    876 	 */
    877 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
    878 	    zfsvfs->z_case == ZFS_CASE_MIXED)
    879 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
    880 
    881 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
    882 
    883 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
    884 	    &zfsvfs->z_root);
    885 	if (error)
    886 		goto out;
    887 	ASSERT(zfsvfs->z_root != 0);
    888 
    889 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
    890 	    &zfsvfs->z_unlinkedobj);
    891 	if (error)
    892 		goto out;
    893 
    894 	error = zap_lookup(os, MASTER_NODE_OBJ,
    895 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
    896 	    8, 1, &zfsvfs->z_userquota_obj);
    897 	if (error && error != ENOENT)
    898 		goto out;
    899 
    900 	error = zap_lookup(os, MASTER_NODE_OBJ,
    901 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
    902 	    8, 1, &zfsvfs->z_groupquota_obj);
    903 	if (error && error != ENOENT)
    904 		goto out;
    905 
    906 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
    907 	    &zfsvfs->z_fuid_obj);
    908 	if (error && error != ENOENT)
    909 		goto out;
    910 
    911 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
    912 	    &zfsvfs->z_shares_dir);
    913 	if (error && error != ENOENT)
    914 		goto out;
    915 
    916 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
    917 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
    918 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
    919 	    offsetof(znode_t, z_link_node));
    920 	rrw_init(&zfsvfs->z_teardown_lock);
    921 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
    922 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
    923 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
    924 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
    925 
    926 	*zvp = zfsvfs;
    927 	return (0);
    928 
    929 out:
    930 	dmu_objset_disown(os, zfsvfs);
    931 	*zvp = NULL;
    932 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
    933 	return (error);
    934 }
    935 
    936 static int
    937 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
    938 {
    939 	int error;
    940 
    941 	error = zfs_register_callbacks(zfsvfs->z_vfs);
    942 	if (error)
    943 		return (error);
    944 
    945 	/*
    946 	 * Set the objset user_ptr to track its zfsvfs.
    947 	 */
    948 	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
    949 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
    950 	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
    951 
    952 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
    953 	if (zil_disable) {
    954 		zil_destroy(zfsvfs->z_log, B_FALSE);
    955 		zfsvfs->z_log = NULL;
    956 	}
    957 
    958 	/*
    959 	 * If we are not mounting (ie: online recv), then we don't
    960 	 * have to worry about replaying the log as we blocked all
    961 	 * operations out since we closed the ZIL.
    962 	 */
    963 	if (mounting) {
    964 		boolean_t readonly;
    965 
    966 		/*
    967 		 * During replay we remove the read only flag to
    968 		 * allow replays to succeed.
    969 		 */
    970 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
    971 		if (readonly != 0)
    972 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    973 		else
    974 			zfs_unlinked_drain(zfsvfs);
    975 
    976 		if (zfsvfs->z_log) {
    977 			/*
    978 			 * Parse and replay the intent log.
    979 			 *
    980 			 * Because of ziltest, this must be done after
    981 			 * zfs_unlinked_drain().  (Further note: ziltest
    982 			 * doesn't use readonly mounts, where
    983 			 * zfs_unlinked_drain() isn't called.)  This is because
    984 			 * ziltest causes spa_sync() to think it's committed,
    985 			 * but actually it is not, so the intent log contains
    986 			 * many txg's worth of changes.
    987 			 *
    988 			 * In particular, if object N is in the unlinked set in
    989 			 * the last txg to actually sync, then it could be
    990 			 * actually freed in a later txg and then reallocated
    991 			 * in a yet later txg.  This would write a "create
    992 			 * object N" record to the intent log.  Normally, this
    993 			 * would be fine because the spa_sync() would have
    994 			 * written out the fact that object N is free, before
    995 			 * we could write the "create object N" intent log
    996 			 * record.
    997 			 *
    998 			 * But when we are in ziltest mode, we advance the "open
    999 			 * txg" without actually spa_sync()-ing the changes to
   1000 			 * disk.  So we would see that object N is still
   1001 			 * allocated and in the unlinked set, and there is an
   1002 			 * intent log record saying to allocate it.
   1003 			 */
   1004 			zfsvfs->z_replay = B_TRUE;
   1005 			zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector);
   1006 			zfsvfs->z_replay = B_FALSE;
   1007 		}
   1008 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
   1009 	}
   1010 
   1011 	return (0);
   1012 }
   1013 
   1014 void
   1015 zfsvfs_free(zfsvfs_t *zfsvfs)
   1016 {
   1017 	int i;
   1018 	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
   1019 
   1020 	/*
   1021 	 * This is a barrier to prevent the filesystem from going away in
   1022 	 * zfs_znode_move() until we can safely ensure that the filesystem is
   1023 	 * not unmounted. We consider the filesystem valid before the barrier
   1024 	 * and invalid after the barrier.
   1025 	 */
   1026 	rw_enter(&zfsvfs_lock, RW_READER);
   1027 	rw_exit(&zfsvfs_lock);
   1028 
   1029 	zfs_fuid_destroy(zfsvfs);
   1030 
   1031 	mutex_destroy(&zfsvfs->z_znodes_lock);
   1032 	mutex_destroy(&zfsvfs->z_lock);
   1033 	list_destroy(&zfsvfs->z_all_znodes);
   1034 	rrw_destroy(&zfsvfs->z_teardown_lock);
   1035 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
   1036 	rw_destroy(&zfsvfs->z_fuid_lock);
   1037 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
   1038 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
   1039 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
   1040 }
   1041 
   1042 static void
   1043 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
   1044 {
   1045 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
   1046 	if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) {
   1047 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
   1048 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
   1049 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
   1050 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
   1051 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
   1052 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
   1053 	}
   1054 }
   1055 
   1056 static int
   1057 zfs_domount(vfs_t *vfsp, char *osname)
   1058 {
   1059 	dev_t mount_dev;
   1060 	uint64_t recordsize, fsid_guid;
   1061 	int error = 0;
   1062 	zfsvfs_t *zfsvfs;
   1063 
   1064 	ASSERT(vfsp);
   1065 	ASSERT(osname);
   1066 
   1067 	error = zfsvfs_create(osname, &zfsvfs);
   1068 	if (error)
   1069 		return (error);
   1070 	zfsvfs->z_vfs = vfsp;
   1071 
   1072 	/* Initialize the generic filesystem structure. */
   1073 	vfsp->vfs_bcount = 0;
   1074 	vfsp->vfs_data = NULL;
   1075 
   1076 	if (zfs_create_unique_device(&mount_dev) == -1) {
   1077 		error = ENODEV;
   1078 		goto out;
   1079 	}
   1080 	ASSERT(vfs_devismounted(mount_dev) == 0);
   1081 
   1082 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
   1083 	    NULL))
   1084 		goto out;
   1085 
   1086 	vfsp->vfs_dev = mount_dev;
   1087 	vfsp->vfs_fstype = zfsfstype;
   1088 	vfsp->vfs_bsize = recordsize;
   1089 	vfsp->vfs_flag |= VFS_NOTRUNC;
   1090 	vfsp->vfs_data = zfsvfs;
   1091 
   1092 	/*
   1093 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
   1094 	 * separates our fsid from any other filesystem types, and a
   1095 	 * 56-bit objset unique ID.  The objset unique ID is unique to
   1096 	 * all objsets open on this system, provided by unique_create().
   1097 	 * The 8-bit fs type must be put in the low bits of fsid[1]
   1098 	 * because that's where other Solaris filesystems put it.
   1099 	 */
   1100 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
   1101 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
   1102 	vfsp->vfs_fsid.val[0] = fsid_guid;
   1103 	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
   1104 	    zfsfstype & 0xFF;
   1105 
   1106 	/*
   1107 	 * Set features for file system.
   1108 	 */
   1109 	zfs_set_fuid_feature(zfsvfs);
   1110 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
   1111 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
   1112 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
   1113 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
   1114 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
   1115 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
   1116 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
   1117 	}
   1118 
   1119 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
   1120 		uint64_t pval;
   1121 
   1122 		atime_changed_cb(zfsvfs, B_FALSE);
   1123 		readonly_changed_cb(zfsvfs, B_TRUE);
   1124 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
   1125 			goto out;
   1126 		xattr_changed_cb(zfsvfs, pval);
   1127 		zfsvfs->z_issnap = B_TRUE;
   1128 
   1129 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
   1130 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
   1131 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
   1132 	} else {
   1133 		error = zfsvfs_setup(zfsvfs, B_TRUE);
   1134 	}
   1135 
   1136 	if (!zfsvfs->z_issnap)
   1137 		zfsctl_create(zfsvfs);
   1138 out:
   1139 	if (error) {
   1140 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
   1141 		zfsvfs_free(zfsvfs);
   1142 	} else {
   1143 		atomic_add_32(&zfs_active_fs_count, 1);
   1144 	}
   1145 
   1146 	return (error);
   1147 }
   1148 
   1149 void
   1150 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
   1151 {
   1152 	objset_t *os = zfsvfs->z_os;
   1153 	struct dsl_dataset *ds;
   1154 
   1155 	/*
   1156 	 * Unregister properties.
   1157 	 */
   1158 	if (!dmu_objset_is_snapshot(os)) {
   1159 		ds = dmu_objset_ds(os);
   1160 		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
   1161 		    zfsvfs) == 0);
   1162 
   1163 		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
   1164 		    zfsvfs) == 0);
   1165 
   1166 		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
   1167 		    zfsvfs) == 0);
   1168 
   1169 		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
   1170 		    zfsvfs) == 0);
   1171 
   1172 		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
   1173 		    zfsvfs) == 0);
   1174 
   1175 		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
   1176 		    zfsvfs) == 0);
   1177 
   1178 		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
   1179 		    zfsvfs) == 0);
   1180 
   1181 		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
   1182 		    zfsvfs) == 0);
   1183 
   1184 		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
   1185 		    zfsvfs) == 0);
   1186 
   1187 		VERIFY(dsl_prop_unregister(ds, "aclinherit",
   1188 		    acl_inherit_changed_cb, zfsvfs) == 0);
   1189 
   1190 		VERIFY(dsl_prop_unregister(ds, "vscan",
   1191 		    vscan_changed_cb, zfsvfs) == 0);
   1192 	}
   1193 }
   1194 
   1195 /*
   1196  * Convert a decimal digit string to a uint64_t integer.
   1197  */
   1198 static int
   1199 str_to_uint64(char *str, uint64_t *objnum)
   1200 {
   1201 	uint64_t num = 0;
   1202 
   1203 	while (*str) {
   1204 		if (*str < '0' || *str > '9')
   1205 			return (EINVAL);
   1206 
   1207 		num = num*10 + *str++ - '0';
   1208 	}
   1209 
   1210 	*objnum = num;
   1211 	return (0);
   1212 }
   1213 
   1214 /*
   1215  * The boot path passed from the boot loader is in the form of
   1216  * "rootpool-name/root-filesystem-object-number'. Convert this
   1217  * string to a dataset name: "rootpool-name/root-filesystem-name".
   1218  */
   1219 static int
   1220 zfs_parse_bootfs(char *bpath, char *outpath)
   1221 {
   1222 	char *slashp;
   1223 	uint64_t objnum;
   1224 	int error;
   1225 
   1226 	if (*bpath == 0 || *bpath == '/')
   1227 		return (EINVAL);
   1228 
   1229 	(void) strcpy(outpath, bpath);
   1230 
   1231 	slashp = strchr(bpath, '/');
   1232 
   1233 	/* if no '/', just return the pool name */
   1234 	if (slashp == NULL) {
   1235 		return (0);
   1236 	}
   1237 
   1238 	/* if not a number, just return the root dataset name */
   1239 	if (str_to_uint64(slashp+1, &objnum)) {
   1240 		return (0);
   1241 	}
   1242 
   1243 	*slashp = '\0';
   1244 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
   1245 	*slashp = '/';
   1246 
   1247 	return (error);
   1248 }
   1249 
   1250 /*
   1251  * zfs_check_global_label:
   1252  *	Check that the hex label string is appropriate for the dataset
   1253  *	being mounted into the global_zone proper.
   1254  *
   1255  *	Return an error if the hex label string is not default or
   1256  *	admin_low/admin_high.  For admin_low labels, the corresponding
   1257  *	dataset must be readonly.
   1258  */
   1259 int
   1260 zfs_check_global_label(const char *dsname, const char *hexsl)
   1261 {
   1262 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
   1263 		return (0);
   1264 	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
   1265 		return (0);
   1266 	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
   1267 		/* must be readonly */
   1268 		uint64_t rdonly;
   1269 
   1270 		if (dsl_prop_get_integer(dsname,
   1271 		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
   1272 			return (EACCES);
   1273 		return (rdonly ? 0 : EACCES);
   1274 	}
   1275 	return (EACCES);
   1276 }
   1277 
   1278 /*
   1279  * zfs_mount_label_policy:
   1280  *	Determine whether the mount is allowed according to MAC check.
   1281  *	by comparing (where appropriate) label of the dataset against
   1282  *	the label of the zone being mounted into.  If the dataset has
   1283  *	no label, create one.
   1284  *
   1285  *	Returns:
   1286  *		 0 :	access allowed
   1287  *		>0 :	error code, such as EACCES
   1288  */
   1289 static int
   1290 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
   1291 {
   1292 	int		error, retv;
   1293 	zone_t		*mntzone = NULL;
   1294 	ts_label_t	*mnt_tsl;
   1295 	bslabel_t	*mnt_sl;
   1296 	bslabel_t	ds_sl;
   1297 	char		ds_hexsl[MAXNAMELEN];
   1298 
   1299 	retv = EACCES;				/* assume the worst */
   1300 
   1301 	/*
   1302 	 * Start by getting the dataset label if it exists.
   1303 	 */
   1304 	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
   1305 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
   1306 	if (error)
   1307 		return (EACCES);
   1308 
   1309 	/*
   1310 	 * If labeling is NOT enabled, then disallow the mount of datasets
   1311 	 * which have a non-default label already.  No other label checks
   1312 	 * are needed.
   1313 	 */
   1314 	if (!is_system_labeled()) {
   1315 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
   1316 			return (0);
   1317 		return (EACCES);
   1318 	}
   1319 
   1320 	/*
   1321 	 * Get the label of the mountpoint.  If mounting into the global
   1322 	 * zone (i.e. mountpoint is not within an active zone and the
   1323 	 * zoned property is off), the label must be default or
   1324 	 * admin_low/admin_high only; no other checks are needed.
   1325 	 */
   1326 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
   1327 	if (mntzone->zone_id == GLOBAL_ZONEID) {
   1328 		uint64_t zoned;
   1329 
   1330 		zone_rele(mntzone);
   1331 
   1332 		if (dsl_prop_get_integer(osname,
   1333 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
   1334 			return (EACCES);
   1335 		if (!zoned)
   1336 			return (zfs_check_global_label(osname, ds_hexsl));
   1337 		else
   1338 			/*
   1339 			 * This is the case of a zone dataset being mounted
   1340 			 * initially, before the zone has been fully created;
   1341 			 * allow this mount into global zone.
   1342 			 */
   1343 			return (0);
   1344 	}
   1345 
   1346 	mnt_tsl = mntzone->zone_slabel;
   1347 	ASSERT(mnt_tsl != NULL);
   1348 	label_hold(mnt_tsl);
   1349 	mnt_sl = label2bslabel(mnt_tsl);
   1350 
   1351 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
   1352 		/*
   1353 		 * The dataset doesn't have a real label, so fabricate one.
   1354 		 */
   1355 		char *str = NULL;
   1356 
   1357 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
   1358 		    dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
   1359 		    ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
   1360 			retv = 0;
   1361 		if (str != NULL)
   1362 			kmem_free(str, strlen(str) + 1);
   1363 	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
   1364 		/*
   1365 		 * Now compare labels to complete the MAC check.  If the
   1366 		 * labels are equal then allow access.  If the mountpoint
   1367 		 * label dominates the dataset label, allow readonly access.
   1368 		 * Otherwise, access is denied.
   1369 		 */
   1370 		if (blequal(mnt_sl, &ds_sl))
   1371 			retv = 0;
   1372 		else if (bldominates(mnt_sl, &ds_sl)) {
   1373 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
   1374 			retv = 0;
   1375 		}
   1376 	}
   1377 
   1378 	label_rele(mnt_tsl);
   1379 	zone_rele(mntzone);
   1380 	return (retv);
   1381 }
   1382 
   1383 static int
   1384 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
   1385 {
   1386 	int error = 0;
   1387 	static int zfsrootdone = 0;
   1388 	zfsvfs_t *zfsvfs = NULL;
   1389 	znode_t *zp = NULL;
   1390 	vnode_t *vp = NULL;
   1391 	char *zfs_bootfs;
   1392 	char *zfs_devid;
   1393 
   1394 	ASSERT(vfsp);
   1395 
   1396 	/*
   1397 	 * The filesystem that we mount as root is defined in the
   1398 	 * boot property "zfs-bootfs" with a format of
   1399 	 * "poolname/root-dataset-objnum".
   1400 	 */
   1401 	if (why == ROOT_INIT) {
   1402 		if (zfsrootdone++)
   1403 			return (EBUSY);
   1404 		/*
   1405 		 * the process of doing a spa_load will require the
   1406 		 * clock to be set before we could (for example) do
   1407 		 * something better by looking at the timestamp on
   1408 		 * an uberblock, so just set it to -1.
   1409 		 */
   1410 		clkset(-1);
   1411 
   1412 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
   1413 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
   1414 			    "bootfs name");
   1415 			return (EINVAL);
   1416 		}
   1417 		zfs_devid = spa_get_bootprop("diskdevid");
   1418 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
   1419 		if (zfs_devid)
   1420 			spa_free_bootprop(zfs_devid);
   1421 		if (error) {
   1422 			spa_free_bootprop(zfs_bootfs);
   1423 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
   1424 			    error);
   1425 			return (error);
   1426 		}
   1427 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
   1428 			spa_free_bootprop(zfs_bootfs);
   1429 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
   1430 			    error);
   1431 			return (error);
   1432 		}
   1433 
   1434 		spa_free_bootprop(zfs_bootfs);
   1435 
   1436 		if (error = vfs_lock(vfsp))
   1437 			return (error);
   1438 
   1439 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
   1440 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
   1441 			goto out;
   1442 		}
   1443 
   1444 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
   1445 		ASSERT(zfsvfs);
   1446 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
   1447 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
   1448 			goto out;
   1449 		}
   1450 
   1451 		vp = ZTOV(zp);
   1452 		mutex_enter(&vp->v_lock);
   1453 		vp->v_flag |= VROOT;
   1454 		mutex_exit(&vp->v_lock);
   1455 		rootvp = vp;
   1456 
   1457 		/*
   1458 		 * Leave rootvp held.  The root file system is never unmounted.
   1459 		 */
   1460 
   1461 		vfs_add((struct vnode *)0, vfsp,
   1462 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
   1463 out:
   1464 		vfs_unlock(vfsp);
   1465 		return (error);
   1466 	} else if (why == ROOT_REMOUNT) {
   1467 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
   1468 		vfsp->vfs_flag |= VFS_REMOUNT;
   1469 
   1470 		/* refresh mount options */
   1471 		zfs_unregister_callbacks(vfsp->vfs_data);
   1472 		return (zfs_register_callbacks(vfsp));
   1473 
   1474 	} else if (why == ROOT_UNMOUNT) {
   1475 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
   1476 		(void) zfs_sync(vfsp, 0, 0);
   1477 		return (0);
   1478 	}
   1479 
   1480 	/*
   1481 	 * if "why" is equal to anything else other than ROOT_INIT,
   1482 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
   1483 	 */
   1484 	return (ENOTSUP);
   1485 }
   1486 
   1487 /*ARGSUSED*/
   1488 static int
   1489 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
   1490 {
   1491 	char		*osname;
   1492 	pathname_t	spn;
   1493 	int		error = 0;
   1494 	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
   1495 	    UIO_SYSSPACE : UIO_USERSPACE;
   1496 	int		canwrite;
   1497 
   1498 	if (mvp->v_type != VDIR)
   1499 		return (ENOTDIR);
   1500 
   1501 	mutex_enter(&mvp->v_lock);
   1502 	if ((uap->flags & MS_REMOUNT) == 0 &&
   1503 	    (uap->flags & MS_OVERLAY) == 0 &&
   1504 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
   1505 		mutex_exit(&mvp->v_lock);
   1506 		return (EBUSY);
   1507 	}
   1508 	mutex_exit(&mvp->v_lock);
   1509 
   1510 	/*
   1511 	 * ZFS does not support passing unparsed data in via MS_DATA.
   1512 	 * Users should use the MS_OPTIONSTR interface; this means
   1513 	 * that all option parsing is already done and the options struct
   1514 	 * can be interrogated.
   1515 	 */
   1516 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
   1517 		return (EINVAL);
   1518 
   1519 	/*
   1520 	 * Get the objset name (the "special" mount argument).
   1521 	 */
   1522 	if (error = pn_get(uap->spec, fromspace, &spn))
   1523 		return (error);
   1524 
   1525 	osname = spn.pn_path;
   1526 
   1527 	/*
   1528 	 * Check for mount privilege?
   1529 	 *
   1530 	 * If we don't have privilege then see if
   1531 	 * we have local permission to allow it
   1532 	 */
   1533 	error = secpolicy_fs_mount(cr, mvp, vfsp);
   1534 	if (error) {
   1535 		error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
   1536 		if (error == 0) {
   1537 			vattr_t		vattr;
   1538 
   1539 			/*
   1540 			 * Make sure user is the owner of the mount point
   1541 			 * or has sufficient privileges.
   1542 			 */
   1543 
   1544 			vattr.va_mask = AT_UID;
   1545 
   1546 			if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
   1547 				goto out;
   1548 			}
   1549 
   1550 			if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
   1551 			    VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
   1552 				error = EPERM;
   1553 				goto out;
   1554 			}
   1555 
   1556 			secpolicy_fs_mount_clearopts(cr, vfsp);
   1557 		} else {
   1558 			goto out;
   1559 		}
   1560 	}
   1561 
   1562 	/*
   1563 	 * Refuse to mount a filesystem if we are in a local zone and the
   1564 	 * dataset is not visible.
   1565 	 */
   1566 	if (!INGLOBALZONE(curproc) &&
   1567 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
   1568 		error = EPERM;
   1569 		goto out;
   1570 	}
   1571 
   1572 	error = zfs_mount_label_policy(vfsp, osname);
   1573 	if (error)
   1574 		goto out;
   1575 
   1576 	/*
   1577 	 * When doing a remount, we simply refresh our temporary properties
   1578 	 * according to those options set in the current VFS options.
   1579 	 */
   1580 	if (uap->flags & MS_REMOUNT) {
   1581 		/* refresh mount options */
   1582 		zfs_unregister_callbacks(vfsp->vfs_data);
   1583 		error = zfs_register_callbacks(vfsp);
   1584 		goto out;
   1585 	}
   1586 
   1587 	error = zfs_domount(vfsp, osname);
   1588 
   1589 	/*
   1590 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
   1591 	 * disappear due to a forced unmount.
   1592 	 */
   1593 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
   1594 		VFS_HOLD(mvp->v_vfsp);
   1595 
   1596 out:
   1597 	pn_free(&spn);
   1598 	return (error);
   1599 }
   1600 
   1601 static int
   1602 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
   1603 {
   1604 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   1605 	dev32_t d32;
   1606 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
   1607 
   1608 	ZFS_ENTER(zfsvfs);
   1609 
   1610 	dmu_objset_space(zfsvfs->z_os,
   1611 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
   1612 
   1613 	/*
   1614 	 * The underlying storage pool actually uses multiple block sizes.
   1615 	 * We report the fragsize as the smallest block size we support,
   1616 	 * and we report our blocksize as the filesystem's maximum blocksize.
   1617 	 */
   1618 	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
   1619 	statp->f_bsize = zfsvfs->z_max_blksz;
   1620 
   1621 	/*
   1622 	 * The following report "total" blocks of various kinds in the
   1623 	 * file system, but reported in terms of f_frsize - the
   1624 	 * "fragment" size.
   1625 	 */
   1626 
   1627 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
   1628 	statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
   1629 	statp->f_bavail = statp->f_bfree; /* no root reservation */
   1630 
   1631 	/*
   1632 	 * statvfs() should really be called statufs(), because it assumes
   1633 	 * static metadata.  ZFS doesn't preallocate files, so the best
   1634 	 * we can do is report the max that could possibly fit in f_files,
   1635 	 * and that minus the number actually used in f_ffree.
   1636 	 * For f_ffree, report the smaller of the number of object available
   1637 	 * and the number of blocks (each object will take at least a block).
   1638 	 */
   1639 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
   1640 	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
   1641 	statp->f_files = statp->f_ffree + usedobjs;
   1642 
   1643 	(void) cmpldev(&d32, vfsp->vfs_dev);
   1644 	statp->f_fsid = d32;
   1645 
   1646 	/*
   1647 	 * We're a zfs filesystem.
   1648 	 */
   1649 	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
   1650 
   1651 	statp->f_flag = vf_to_stf(vfsp->vfs_flag);
   1652 
   1653 	statp->f_namemax = ZFS_MAXNAMELEN;
   1654 
   1655 	/*
   1656 	 * We have all of 32 characters to stuff a string here.
   1657 	 * Is there anything useful we could/should provide?
   1658 	 */
   1659 	bzero(statp->f_fstr, sizeof (statp->f_fstr));
   1660 
   1661 	ZFS_EXIT(zfsvfs);
   1662 	return (0);
   1663 }
   1664 
   1665 static int
   1666 zfs_root(vfs_t *vfsp, vnode_t **vpp)
   1667 {
   1668 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   1669 	znode_t *rootzp;
   1670 	int error;
   1671 
   1672 	ZFS_ENTER(zfsvfs);
   1673 
   1674 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
   1675 	if (error == 0)
   1676 		*vpp = ZTOV(rootzp);
   1677 
   1678 	ZFS_EXIT(zfsvfs);
   1679 	return (error);
   1680 }
   1681 
/*
 * Teardown the zfsvfs::z_os.
 *
 * Closes the ZIL, drops every znode's dbuf hold, unregisters the property
 * callbacks and evicts cached dbufs for the objset.
 *
 * Note, if 'unmounting' is FALSE and we return 0, we return with the
 * 'z_teardown_lock' and 'z_teardown_inactive_lock' held.  If 'unmounting'
 * is TRUE, z_unmounted is set and both locks are dropped before the
 * callbacks are unregistered.
 *
 * Returns 0 on success.  Returns EIO (with both locks released) when
 * 'unmounting' is FALSE and the filesystem has already been unmounted or
 * z_os is NULL.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;

	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 * This is why the ZIL is closed before z_teardown_inactive_lock
	 * is taken below.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
		return (EIO);
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp))
		if (zp->z_dbuf) {
			ASSERT(ZTOV(zp)->v_count > 0);
			zfs_znode_dmu_fini(zp);
		}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 * (Only reachable when unmounting: the !unmounting case with a
	 * NULL z_os returned EIO above.)
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data.  If the first eviction pass reports a
	 * nonzero result, wait for the pool to sync and try once more.
	 */
	if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
		(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
	}

	return (0);
}
   1777 
   1778 /*ARGSUSED*/
   1779 static int
   1780 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
   1781 {
   1782 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   1783 	objset_t *os;
   1784 	int ret;
   1785 
   1786 	ret = secpolicy_fs_unmount(cr, vfsp);
   1787 	if (ret) {
   1788 		ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
   1789 		    ZFS_DELEG_PERM_MOUNT, cr);
   1790 		if (ret)
   1791 			return (ret);
   1792 	}
   1793 
   1794 	/*
   1795 	 * We purge the parent filesystem's vfsp as the parent filesystem
   1796 	 * and all of its snapshots have their vnode's v_vfsp set to the
   1797 	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
   1798 	 * referential for non-snapshots.
   1799 	 */
   1800 	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
   1801 
   1802 	/*
   1803 	 * Unmount any snapshots mounted under .zfs before unmounting the
   1804 	 * dataset itself.
   1805 	 */
   1806 	if (zfsvfs->z_ctldir != NULL &&
   1807 	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
   1808 		return (ret);
   1809 	}
   1810 
   1811 	if (!(fflag & MS_FORCE)) {
   1812 		/*
   1813 		 * Check the number of active vnodes in the file system.
   1814 		 * Our count is maintained in the vfs structure, but the
   1815 		 * number is off by 1 to indicate a hold on the vfs
   1816 		 * structure itself.
   1817 		 *
   1818 		 * The '.zfs' directory maintains a reference of its
   1819 		 * own, and any active references underneath are
   1820 		 * reflected in the vnode count.
   1821 		 */
   1822 		if (zfsvfs->z_ctldir == NULL) {
   1823 			if (vfsp->vfs_count > 1)
   1824 				return (EBUSY);
   1825 		} else {
   1826 			if (vfsp->vfs_count > 2 ||
   1827 			    zfsvfs->z_ctldir->v_count > 1)
   1828 				return (EBUSY);
   1829 		}
   1830 	}
   1831 
   1832 	vfsp->vfs_flag |= VFS_UNMOUNTED;
   1833 
   1834 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
   1835 	os = zfsvfs->z_os;
   1836 
   1837 	/*
   1838 	 * z_os will be NULL if there was an error in
   1839 	 * attempting to reopen zfsvfs.
   1840 	 */
   1841 	if (os != NULL) {
   1842 		/*
   1843 		 * Unset the objset user_ptr.
   1844 		 */
   1845 		mutex_enter(&os->os_user_ptr_lock);
   1846 		dmu_objset_set_user(os, NULL);
   1847 		mutex_exit(&os->os_user_ptr_lock);
   1848 
   1849 		/*
   1850 		 * Finally release the objset
   1851 		 */
   1852 		dmu_objset_disown(os, zfsvfs);
   1853 	}
   1854 
   1855 	/*
   1856 	 * We can now safely destroy the '.zfs' directory node.
   1857 	 */
   1858 	if (zfsvfs->z_ctldir != NULL)
   1859 		zfsctl_destroy(zfsvfs);
   1860 
   1861 	return (0);
   1862 }
   1863 
   1864 static int
   1865 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
   1866 {
   1867 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
   1868 	znode_t		*zp;
   1869 	uint64_t	object = 0;
   1870 	uint64_t	fid_gen = 0;
   1871 	uint64_t	gen_mask;
   1872 	uint64_t	zp_gen;
   1873 	int 		i, err;
   1874 
   1875 	*vpp = NULL;
   1876 
   1877 	ZFS_ENTER(zfsvfs);
   1878 
   1879 	if (fidp->fid_len == LONG_FID_LEN) {
   1880 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
   1881 		uint64_t	objsetid = 0;
   1882 		uint64_t	setgen = 0;
   1883 
   1884 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
   1885 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
   1886 
   1887 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
   1888 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
   1889 
   1890 		ZFS_EXIT(zfsvfs);
   1891 
   1892 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
   1893 		if (err)
   1894 			return (EINVAL);
   1895 		ZFS_ENTER(zfsvfs);
   1896 	}
   1897 
   1898 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
   1899 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
   1900 
   1901 		for (i = 0; i < sizeof (zfid->zf_object); i++)
   1902 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
   1903 
   1904 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
   1905 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
   1906 	} else {
   1907 		ZFS_EXIT(zfsvfs);
   1908 		return (EINVAL);
   1909 	}
   1910 
   1911 	/* A zero fid_gen means we are in the .zfs control directories */
   1912 	if (fid_gen == 0 &&
   1913 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
   1914 		*vpp = zfsvfs->z_ctldir;
   1915 		ASSERT(*vpp != NULL);
   1916 		if (object == ZFSCTL_INO_SNAPDIR) {
   1917 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
   1918 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
   1919 		} else {
   1920 			VN_HOLD(*vpp);
   1921 		}
   1922 		ZFS_EXIT(zfsvfs);
   1923 		return (0);
   1924 	}
   1925 
   1926 	gen_mask = -1ULL >> (64 - 8 * i);
   1927 
   1928 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
   1929 	if (err = zfs_zget(zfsvfs, object, &zp)) {
   1930 		ZFS_EXIT(zfsvfs);
   1931 		return (err);
   1932 	}
   1933 	zp_gen = zp->z_phys->zp_gen & gen_mask;
   1934 	if (zp_gen == 0)
   1935 		zp_gen = 1;
   1936 	if (zp->z_unlinked || zp_gen != fid_gen) {
   1937 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
   1938 		VN_RELE(ZTOV(zp));
   1939 		ZFS_EXIT(zfsvfs);
   1940 		return (EINVAL);
   1941 	}
   1942 
   1943 	*vpp = ZTOV(zp);
   1944 	ZFS_EXIT(zfsvfs);
   1945 	return (0);
   1946 }
   1947 
   1948 /*
   1949  * Block out VOPs and close zfsvfs_t::z_os
   1950  *
   1951  * Note, if successful, then we return with the 'z_teardown_lock' and
   1952  * 'z_teardown_inactive_lock' write held.
   1953  */
   1954 int
   1955 zfs_suspend_fs(zfsvfs_t *zfsvfs)
   1956 {
   1957 	int error;
   1958 
   1959 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
   1960 		return (error);
   1961 	dmu_objset_disown(zfsvfs->z_os, zfsvfs);
   1962 
   1963 	return (0);
   1964 }
   1965 
   1966 /*
   1967  * Reopen zfsvfs_t::z_os and release VOPs.
   1968  */
   1969 int
   1970 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
   1971 {
   1972 	int err;
   1973 
   1974 	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
   1975 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
   1976 
   1977 	err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
   1978 	    &zfsvfs->z_os);
   1979 	if (err) {
   1980 		zfsvfs->z_os = NULL;
   1981 	} else {
   1982 		znode_t *zp;
   1983 
   1984 		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
   1985 
   1986 		/*
   1987 		 * Attempt to re-establish all the active znodes with
   1988 		 * their dbufs.  If a zfs_rezget() fails, then we'll let
   1989 		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
   1990 		 * when they try to use their znode.
   1991 		 */
   1992 		mutex_enter(&zfsvfs->z_znodes_lock);
   1993 		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
   1994 		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
   1995 			(void) zfs_rezget(zp);
   1996 		}
   1997 		mutex_exit(&zfsvfs->z_znodes_lock);
   1998 
   1999 	}
   2000 
   2001 	/* release the VOPs */
   2002 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
   2003 	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
   2004 
   2005 	if (err) {
   2006 		/*
   2007 		 * Since we couldn't reopen zfsvfs::z_os, force
   2008 		 * unmount this file system.
   2009 		 */
   2010 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
   2011 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
   2012 	}
   2013 	return (err);
   2014 }
   2015 
   2016 static void
   2017 zfs_freevfs(vfs_t *vfsp)
   2018 {
   2019 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   2020 
   2021 	/*
   2022 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
   2023 	 * from zfs_mount().  Release it here.
   2024 	 */
   2025 	if (zfsvfs->z_issnap)
   2026 		VFS_RELE(zfsvfs->z_parent->z_vfs);
   2027 
   2028 	zfsvfs_free(zfsvfs);
   2029 
   2030 	atomic_add_32(&zfs_active_fs_count, -1);
   2031 }
   2032 
   2033 /*
   2034  * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
   2035  * so we can't safely do any non-idempotent initialization here.
   2036  * Leave that to zfs_init() and zfs_fini(), which are called
   2037  * from the module's _init() and _fini() entry points.
   2038  */
   2039 /*ARGSUSED*/
   2040 static int
   2041 zfs_vfsinit(int fstype, char *name)
   2042 {
   2043 	int error;
   2044 
   2045 	zfsfstype = fstype;
   2046 
   2047 	/*
   2048 	 * Setup vfsops and vnodeops tables.
   2049 	 */
   2050 	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
   2051 	if (error != 0) {
   2052 		cmn_err(CE_WARN, "zfs: bad vfs ops template");
   2053 	}
   2054 
   2055 	error = zfs_create_op_tables();
   2056 	if (error) {
   2057 		zfs_remove_op_tables();
   2058 		cmn_err(CE_WARN, "zfs: bad vnode ops template");
   2059 		(void) vfs_freevfsops_by_type(zfsfstype);
   2060 		return (error);
   2061 	}
   2062 
   2063 	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
   2064 
   2065 	/*
   2066 	 * Unique major number for all zfs mounts.
   2067 	 * If we run out of 32-bit minors, we'll getudev() another major.
   2068 	 */
   2069 	zfs_major = ddi_name_to_major(ZFS_DRIVER);
   2070 	zfs_minor = ZFS_MIN_MINOR;
   2071 
   2072 	return (0);
   2073 }
   2074 
   2075 void
   2076 zfs_init(void)
   2077 {
   2078 	/*
   2079 	 * Initialize .zfs directory structures
   2080 	 */
   2081 	zfsctl_init();
   2082 
   2083 	/*
   2084 	 * Initialize znode cache, vnode ops, etc...
   2085 	 */
   2086 	zfs_znode_init();
   2087 
   2088 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
   2089 }
   2090 
   2091 void
   2092 zfs_fini(void)
   2093 {
   2094 	zfsctl_fini();
   2095 	zfs_znode_fini();
   2096 }
   2097 
   2098 int
   2099 zfs_busy(void)
   2100 {
   2101 	return (zfs_active_fs_count != 0);
   2102 }
   2103 
   2104 int
   2105 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
   2106 {
   2107 	int error;
   2108 	objset_t *os = zfsvfs->z_os;
   2109 	dmu_tx_t *tx;
   2110 
   2111 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
   2112 		return (EINVAL);
   2113 
   2114 	if (newvers < zfsvfs->z_version)
   2115 		return (EINVAL);
   2116 
   2117 	tx = dmu_tx_create(os);
   2118 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
   2119 	error = dmu_tx_assign(tx, TXG_WAIT);
   2120 	if (error) {
   2121 		dmu_tx_abort(tx);
   2122 		return (error);
   2123 	}
   2124 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
   2125 	    8, 1, &newvers, tx);
   2126 
   2127 	if (error) {
   2128 		dmu_tx_commit(tx);
   2129 		return (error);
   2130 	}
   2131 
   2132 	spa_history_internal_log(LOG_DS_UPGRADE,
   2133 	    dmu_objset_spa(os), tx, CRED(),
   2134 	    "oldver=%llu newver=%llu dataset = %llu",
   2135 	    zfsvfs->z_version, newvers, dmu_objset_id(os));
   2136 
   2137 	dmu_tx_commit(tx);
   2138 
   2139 	zfsvfs->z_version = newvers;
   2140 
   2141 	if (zfsvfs->z_version >= ZPL_VERSION_FUID)
   2142 		zfs_set_fuid_feature(zfsvfs);
   2143 
   2144 	return (0);
   2145 }
   2146 
   2147 /*
   2148  * Read a property stored within the master node.
   2149  */
   2150 int
   2151 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
   2152 {
   2153 	const char *pname;
   2154 	int error = ENOENT;
   2155 
   2156 	/*
   2157 	 * Look up the file system's value for the property.  For the
   2158 	 * version property, we look up a slightly different string.
   2159 	 */
   2160 	if (prop == ZFS_PROP_VERSION)
   2161 		pname = ZPL_VERSION_STR;
   2162 	else
   2163 		pname = zfs_prop_to_name(prop);
   2164 
   2165 	if (os != NULL)
   2166 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
   2167 
   2168 	if (error == ENOENT) {
   2169 		/* No value set, use the default value */
   2170 		switch (prop) {
   2171 		case ZFS_PROP_VERSION:
   2172 			*value = ZPL_VERSION;
   2173 			break;
   2174 		case ZFS_PROP_NORMALIZE:
   2175 		case ZFS_PROP_UTF8ONLY:
   2176 			*value = 0;
   2177 			break;
   2178 		case ZFS_PROP_CASE:
   2179 			*value = ZFS_CASE_SENSITIVE;
   2180 			break;
   2181 		default:
   2182 			return (error);
   2183 		}
   2184 		error = 0;
   2185 	}
   2186 	return (error);
   2187 }
   2188 
   2189 static vfsdef_t vfw = {
   2190 	VFSDEF_VERSION,
   2191 	MNTTYPE_ZFS,
   2192 	zfs_vfsinit,
   2193 	VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
   2194 	    VSW_XID,
   2195 	&zfs_mntopts
   2196 };
   2197 
   2198 struct modlfs zfs_modlfs = {
   2199 	&mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
   2200 };
   2201