Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/param.h>
     28 #include <sys/systm.h>
     29 #include <sys/sysmacros.h>
     30 #include <sys/kmem.h>
     31 #include <sys/pathname.h>
     32 #include <sys/vnode.h>
     33 #include <sys/vfs.h>
     34 #include <sys/vfs_opreg.h>
     35 #include <sys/mntent.h>
     36 #include <sys/mount.h>
     37 #include <sys/cmn_err.h>
     38 #include "fs/fs_subr.h"
     39 #include <sys/zfs_znode.h>
     40 #include <sys/zfs_dir.h>
     41 #include <sys/zil.h>
     42 #include <sys/fs/zfs.h>
     43 #include <sys/dmu.h>
     44 #include <sys/dsl_prop.h>
     45 #include <sys/dsl_dataset.h>
     46 #include <sys/dsl_deleg.h>
     47 #include <sys/spa.h>
     48 #include <sys/zap.h>
     49 #include <sys/varargs.h>
     50 #include <sys/policy.h>
     51 #include <sys/atomic.h>
     52 #include <sys/mkdev.h>
     53 #include <sys/modctl.h>
     54 #include <sys/refstr.h>
     55 #include <sys/zfs_ioctl.h>
     56 #include <sys/zfs_ctldir.h>
     57 #include <sys/zfs_fuid.h>
     58 #include <sys/bootconf.h>
     59 #include <sys/sunddi.h>
     60 #include <sys/dnlc.h>
     61 #include <sys/dmu_objset.h>
     62 #include <sys/spa_boot.h>
     63 
/*
 * File system type index stored into vfs_fstype by zfs_domount().
 * NOTE(review): presumably assigned when the ZFS file system type is
 * registered; that assignment is not visible in this part of the file.
 */
int zfsfstype;
/* VFS operations vector for ZFS; NULL until it is set up elsewhere. */
vfsops_t *zfs_vfsops = NULL;
/*
 * Major/minor pair most recently handed out by
 * zfs_create_unique_device(); updated under zfs_dev_mtx.
 */
static major_t zfs_major;
static minor_t zfs_minor;
static kmutex_t	zfs_dev_mtx;

/*
 * Forward declarations of the VFS operations that are referenced by
 * zfs_vfsops_template[] and zfs_vfsops_eio_template[] below.
 */
static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
static void zfs_freevfs(vfs_t *vfsp);
     77 
/*
 * Table pairing each VFS operation name with the ZFS routine that
 * implements it.  The list is terminated by a NULL/NULL entry.
 */
static const fs_operation_def_t zfs_vfsops_template[] = {
	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
	NULL,			NULL
};
     89 
/*
 * Reduced operation table that supplies only the freevfs operation.
 * NOTE(review): judging by the name, this is presumably installed on
 * file systems whose remaining operations should fail with EIO (e.g.
 * after a forced unmount) -- confirm against its user, which is not
 * visible in this part of the file.
 */
static const fs_operation_def_t zfs_vfsops_eio_template[] = {
	VFSNAME_FREEVFS,	{ .vfs_freevfs =  zfs_freevfs },
	NULL,			NULL
};
     94 
/*
 * Count of active ZFS file systems.  It is needed to prevent this
 * module from being unloaded after a "umount -f".  zfs_domount()
 * increments it atomically on every successful mount.
 */
static uint32_t	zfs_active_fs_count = 0;
    101 
/*
 * Cancel lists: each NULL-terminated list names the option that is
 * cancelled by the option it is attached to in mntopts[] below
 * (e.g. "noatime" cancels "atime").
 */
static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };

/*
 * ZFS-specific mount options, each paired with its cancel list.
 *
 * MO_DEFAULT is not used since the default value is determined
 * by the equivalent property.
 */
static mntopt_t mntopts[] = {
	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
};

/* Option table descriptor: number of entries in mntopts[] and the table. */
static mntopts_t zfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};
    122 
    123 /*ARGSUSED*/
    124 int
    125 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
    126 {
    127 	/*
    128 	 * Data integrity is job one.  We don't want a compromised kernel
    129 	 * writing to the storage pool, so we never sync during panic.
    130 	 */
    131 	if (panicstr)
    132 		return (0);
    133 
    134 	/*
    135 	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
    136 	 * to sync metadata, which they would otherwise cache indefinitely.
    137 	 * Semantically, the only requirement is that the sync be initiated.
    138 	 * The DMU syncs out txgs frequently, so there's nothing to do.
    139 	 */
    140 	if (flag & SYNC_ATTR)
    141 		return (0);
    142 
    143 	if (vfsp != NULL) {
    144 		/*
    145 		 * Sync a specific filesystem.
    146 		 */
    147 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
    148 
    149 		ZFS_ENTER(zfsvfs);
    150 		if (zfsvfs->z_log != NULL)
    151 			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
    152 		else
    153 			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
    154 		ZFS_EXIT(zfsvfs);
    155 	} else {
    156 		/*
    157 		 * Sync all ZFS filesystems.  This is what happens when you
    158 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
    159 		 * request by waiting for all pools to commit all dirty data.
    160 		 */
    161 		spa_sync_allpools();
    162 	}
    163 
    164 	return (0);
    165 }
    166 
    167 static int
    168 zfs_create_unique_device(dev_t *dev)
    169 {
    170 	major_t new_major;
    171 
    172 	do {
    173 		ASSERT3U(zfs_minor, <=, MAXMIN32);
    174 		minor_t start = zfs_minor;
    175 		do {
    176 			mutex_enter(&zfs_dev_mtx);
    177 			if (zfs_minor >= MAXMIN32) {
    178 				/*
    179 				 * If we're still using the real major
    180 				 * keep out of /dev/zfs and /dev/zvol minor
    181 				 * number space.  If we're using a getudev()'ed
    182 				 * major number, we can use all of its minors.
    183 				 */
    184 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
    185 					zfs_minor = ZFS_MIN_MINOR;
    186 				else
    187 					zfs_minor = 0;
    188 			} else {
    189 				zfs_minor++;
    190 			}
    191 			*dev = makedevice(zfs_major, zfs_minor);
    192 			mutex_exit(&zfs_dev_mtx);
    193 		} while (vfs_devismounted(*dev) && zfs_minor != start);
    194 		if (zfs_minor == start) {
    195 			/*
    196 			 * We are using all ~262,000 minor numbers for the
    197 			 * current major number.  Create a new major number.
    198 			 */
    199 			if ((new_major = getudev()) == (major_t)-1) {
    200 				cmn_err(CE_WARN,
    201 				    "zfs_mount: Can't get unique major "
    202 				    "device number.");
    203 				return (-1);
    204 			}
    205 			mutex_enter(&zfs_dev_mtx);
    206 			zfs_major = new_major;
    207 			zfs_minor = 0;
    208 
    209 			mutex_exit(&zfs_dev_mtx);
    210 		} else {
    211 			break;
    212 		}
    213 		/* CONSTANTCONDITION */
    214 	} while (1);
    215 
    216 	return (0);
    217 }
    218 
    219 static void
    220 atime_changed_cb(void *arg, uint64_t newval)
    221 {
    222 	zfsvfs_t *zfsvfs = arg;
    223 
    224 	if (newval == TRUE) {
    225 		zfsvfs->z_atime = TRUE;
    226 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
    227 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
    228 	} else {
    229 		zfsvfs->z_atime = FALSE;
    230 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
    231 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
    232 	}
    233 }
    234 
    235 static void
    236 xattr_changed_cb(void *arg, uint64_t newval)
    237 {
    238 	zfsvfs_t *zfsvfs = arg;
    239 
    240 	if (newval == TRUE) {
    241 		/* XXX locking on vfs_flag? */
    242 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
    243 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
    244 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
    245 	} else {
    246 		/* XXX locking on vfs_flag? */
    247 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
    248 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
    249 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
    250 	}
    251 }
    252 
    253 static void
    254 blksz_changed_cb(void *arg, uint64_t newval)
    255 {
    256 	zfsvfs_t *zfsvfs = arg;
    257 
    258 	if (newval < SPA_MINBLOCKSIZE ||
    259 	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
    260 		newval = SPA_MAXBLOCKSIZE;
    261 
    262 	zfsvfs->z_max_blksz = newval;
    263 	zfsvfs->z_vfs->vfs_bsize = newval;
    264 }
    265 
    266 static void
    267 readonly_changed_cb(void *arg, uint64_t newval)
    268 {
    269 	zfsvfs_t *zfsvfs = arg;
    270 
    271 	if (newval) {
    272 		/* XXX locking on vfs_flag? */
    273 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
    274 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
    275 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
    276 	} else {
    277 		/* XXX locking on vfs_flag? */
    278 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    279 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
    280 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
    281 	}
    282 }
    283 
    284 static void
    285 devices_changed_cb(void *arg, uint64_t newval)
    286 {
    287 	zfsvfs_t *zfsvfs = arg;
    288 
    289 	if (newval == FALSE) {
    290 		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
    291 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
    292 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
    293 	} else {
    294 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
    295 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
    296 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
    297 	}
    298 }
    299 
    300 static void
    301 setuid_changed_cb(void *arg, uint64_t newval)
    302 {
    303 	zfsvfs_t *zfsvfs = arg;
    304 
    305 	if (newval == FALSE) {
    306 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
    307 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
    308 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
    309 	} else {
    310 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
    311 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
    312 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
    313 	}
    314 }
    315 
    316 static void
    317 exec_changed_cb(void *arg, uint64_t newval)
    318 {
    319 	zfsvfs_t *zfsvfs = arg;
    320 
    321 	if (newval == FALSE) {
    322 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
    323 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
    324 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
    325 	} else {
    326 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
    327 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
    328 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
    329 	}
    330 }
    331 
    332 /*
    333  * The nbmand mount option can be changed at mount time.
    334  * We can't allow it to be toggled on live file systems or incorrect
    335  * behavior may be seen from cifs clients
    336  *
    337  * This property isn't registered via dsl_prop_register(), but this callback
    338  * will be called when a file system is first mounted
    339  */
    340 static void
    341 nbmand_changed_cb(void *arg, uint64_t newval)
    342 {
    343 	zfsvfs_t *zfsvfs = arg;
    344 	if (newval == FALSE) {
    345 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
    346 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
    347 	} else {
    348 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
    349 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
    350 	}
    351 }
    352 
    353 static void
    354 snapdir_changed_cb(void *arg, uint64_t newval)
    355 {
    356 	zfsvfs_t *zfsvfs = arg;
    357 
    358 	zfsvfs->z_show_ctldir = newval;
    359 }
    360 
    361 static void
    362 vscan_changed_cb(void *arg, uint64_t newval)
    363 {
    364 	zfsvfs_t *zfsvfs = arg;
    365 
    366 	zfsvfs->z_vscan = newval;
    367 }
    368 
    369 static void
    370 acl_mode_changed_cb(void *arg, uint64_t newval)
    371 {
    372 	zfsvfs_t *zfsvfs = arg;
    373 
    374 	zfsvfs->z_acl_mode = newval;
    375 }
    376 
    377 static void
    378 acl_inherit_changed_cb(void *arg, uint64_t newval)
    379 {
    380 	zfsvfs_t *zfsvfs = arg;
    381 
    382 	zfsvfs->z_acl_inherit = newval;
    383 }
    384 
    385 static int
    386 zfs_register_callbacks(vfs_t *vfsp)
    387 {
    388 	struct dsl_dataset *ds = NULL;
    389 	objset_t *os = NULL;
    390 	zfsvfs_t *zfsvfs = NULL;
    391 	uint64_t nbmand;
    392 	int readonly, do_readonly = B_FALSE;
    393 	int setuid, do_setuid = B_FALSE;
    394 	int exec, do_exec = B_FALSE;
    395 	int devices, do_devices = B_FALSE;
    396 	int xattr, do_xattr = B_FALSE;
    397 	int atime, do_atime = B_FALSE;
    398 	int error = 0;
    399 
    400 	ASSERT(vfsp);
    401 	zfsvfs = vfsp->vfs_data;
    402 	ASSERT(zfsvfs);
    403 	os = zfsvfs->z_os;
    404 
    405 	/*
    406 	 * The act of registering our callbacks will destroy any mount
    407 	 * options we may have.  In order to enable temporary overrides
    408 	 * of mount options, we stash away the current values and
    409 	 * restore them after we register the callbacks.
    410 	 */
    411 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
    412 		readonly = B_TRUE;
    413 		do_readonly = B_TRUE;
    414 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
    415 		readonly = B_FALSE;
    416 		do_readonly = B_TRUE;
    417 	}
    418 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
    419 		devices = B_FALSE;
    420 		setuid = B_FALSE;
    421 		do_devices = B_TRUE;
    422 		do_setuid = B_TRUE;
    423 	} else {
    424 		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
    425 			devices = B_FALSE;
    426 			do_devices = B_TRUE;
    427 		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
    428 			devices = B_TRUE;
    429 			do_devices = B_TRUE;
    430 		}
    431 
    432 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
    433 			setuid = B_FALSE;
    434 			do_setuid = B_TRUE;
    435 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
    436 			setuid = B_TRUE;
    437 			do_setuid = B_TRUE;
    438 		}
    439 	}
    440 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
    441 		exec = B_FALSE;
    442 		do_exec = B_TRUE;
    443 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
    444 		exec = B_TRUE;
    445 		do_exec = B_TRUE;
    446 	}
    447 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
    448 		xattr = B_FALSE;
    449 		do_xattr = B_TRUE;
    450 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
    451 		xattr = B_TRUE;
    452 		do_xattr = B_TRUE;
    453 	}
    454 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
    455 		atime = B_FALSE;
    456 		do_atime = B_TRUE;
    457 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
    458 		atime = B_TRUE;
    459 		do_atime = B_TRUE;
    460 	}
    461 
    462 	/*
    463 	 * nbmand is a special property.  It can only be changed at
    464 	 * mount time.
    465 	 *
    466 	 * This is weird, but it is documented to only be changeable
    467 	 * at mount time.
    468 	 */
    469 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
    470 		nbmand = B_FALSE;
    471 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
    472 		nbmand = B_TRUE;
    473 	} else {
    474 		char osname[MAXNAMELEN];
    475 
    476 		dmu_objset_name(os, osname);
    477 		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
    478 		    NULL)) {
    479 			return (error);
    480 		}
    481 	}
    482 
    483 	/*
    484 	 * Register property callbacks.
    485 	 *
    486 	 * It would probably be fine to just check for i/o error from
    487 	 * the first prop_register(), but I guess I like to go
    488 	 * overboard...
    489 	 */
    490 	ds = dmu_objset_ds(os);
    491 	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
    492 	error = error ? error : dsl_prop_register(ds,
    493 	    "xattr", xattr_changed_cb, zfsvfs);
    494 	error = error ? error : dsl_prop_register(ds,
    495 	    "recordsize", blksz_changed_cb, zfsvfs);
    496 	error = error ? error : dsl_prop_register(ds,
    497 	    "readonly", readonly_changed_cb, zfsvfs);
    498 	error = error ? error : dsl_prop_register(ds,
    499 	    "devices", devices_changed_cb, zfsvfs);
    500 	error = error ? error : dsl_prop_register(ds,
    501 	    "setuid", setuid_changed_cb, zfsvfs);
    502 	error = error ? error : dsl_prop_register(ds,
    503 	    "exec", exec_changed_cb, zfsvfs);
    504 	error = error ? error : dsl_prop_register(ds,
    505 	    "snapdir", snapdir_changed_cb, zfsvfs);
    506 	error = error ? error : dsl_prop_register(ds,
    507 	    "aclmode", acl_mode_changed_cb, zfsvfs);
    508 	error = error ? error : dsl_prop_register(ds,
    509 	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
    510 	error = error ? error : dsl_prop_register(ds,
    511 	    "vscan", vscan_changed_cb, zfsvfs);
    512 	if (error)
    513 		goto unregister;
    514 
    515 	/*
    516 	 * Invoke our callbacks to restore temporary mount options.
    517 	 */
    518 	if (do_readonly)
    519 		readonly_changed_cb(zfsvfs, readonly);
    520 	if (do_setuid)
    521 		setuid_changed_cb(zfsvfs, setuid);
    522 	if (do_exec)
    523 		exec_changed_cb(zfsvfs, exec);
    524 	if (do_devices)
    525 		devices_changed_cb(zfsvfs, devices);
    526 	if (do_xattr)
    527 		xattr_changed_cb(zfsvfs, xattr);
    528 	if (do_atime)
    529 		atime_changed_cb(zfsvfs, atime);
    530 
    531 	nbmand_changed_cb(zfsvfs, nbmand);
    532 
    533 	return (0);
    534 
    535 unregister:
    536 	/*
    537 	 * We may attempt to unregister some callbacks that are not
    538 	 * registered, but this is OK; it will simply return ENOMSG,
    539 	 * which we will ignore.
    540 	 */
    541 	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
    542 	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
    543 	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
    544 	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
    545 	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
    546 	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
    547 	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
    548 	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
    549 	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
    550 	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
    551 	    zfsvfs);
    552 	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
    553 	return (error);
    554 
    555 }
    556 
    557 static int
    558 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
    559 {
    560 	int error;
    561 
    562 	error = zfs_register_callbacks(zfsvfs->z_vfs);
    563 	if (error)
    564 		return (error);
    565 
    566 	/*
    567 	 * Set the objset user_ptr to track its zfsvfs.
    568 	 */
    569 	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
    570 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
    571 	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
    572 
    573 	/*
    574 	 * If we are not mounting (ie: online recv), then we don't
    575 	 * have to worry about replaying the log as we blocked all
    576 	 * operations out since we closed the ZIL.
    577 	 */
    578 	if (mounting) {
    579 		boolean_t readonly;
    580 
    581 		/*
    582 		 * During replay we remove the read only flag to
    583 		 * allow replays to succeed.
    584 		 */
    585 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
    586 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    587 
    588 		/*
    589 		 * Parse and replay the intent log.
    590 		 */
    591 		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
    592 		    zfs_replay_vector, zfs_unlinked_drain);
    593 
    594 		zfs_unlinked_drain(zfsvfs);
    595 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
    596 	}
    597 
    598 	if (!zil_disable)
    599 		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
    600 
    601 	return (0);
    602 }
    603 
    604 static void
    605 zfs_freezfsvfs(zfsvfs_t *zfsvfs)
    606 {
    607 	mutex_destroy(&zfsvfs->z_znodes_lock);
    608 	mutex_destroy(&zfsvfs->z_online_recv_lock);
    609 	list_destroy(&zfsvfs->z_all_znodes);
    610 	rrw_destroy(&zfsvfs->z_teardown_lock);
    611 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
    612 	rw_destroy(&zfsvfs->z_fuid_lock);
    613 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
    614 }
    615 
    616 static int
    617 zfs_domount(vfs_t *vfsp, char *osname)
    618 {
    619 	dev_t mount_dev;
    620 	uint64_t recordsize, readonly;
    621 	int error = 0;
    622 	int mode;
    623 	zfsvfs_t *zfsvfs;
    624 	znode_t *zp = NULL;
    625 
    626 	ASSERT(vfsp);
    627 	ASSERT(osname);
    628 
    629 	/*
    630 	 * Initialize the zfs-specific filesystem structure.
    631 	 * Should probably make this a kmem cache, shuffle fields,
    632 	 * and just bzero up to z_hold_mtx[].
    633 	 */
    634 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
    635 	zfsvfs->z_vfs = vfsp;
    636 	zfsvfs->z_parent = zfsvfs;
    637 	zfsvfs->z_assign = TXG_NOWAIT;
    638 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
    639 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
    640 
    641 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
    642 	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
    643 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
    644 	    offsetof(znode_t, z_link_node));
    645 	rrw_init(&zfsvfs->z_teardown_lock);
    646 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
    647 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
    648 
    649 	/* Initialize the generic filesystem structure. */
    650 	vfsp->vfs_bcount = 0;
    651 	vfsp->vfs_data = NULL;
    652 
    653 	if (zfs_create_unique_device(&mount_dev) == -1) {
    654 		error = ENODEV;
    655 		goto out;
    656 	}
    657 	ASSERT(vfs_devismounted(mount_dev) == 0);
    658 
    659 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
    660 	    NULL))
    661 		goto out;
    662 
    663 	vfsp->vfs_dev = mount_dev;
    664 	vfsp->vfs_fstype = zfsfstype;
    665 	vfsp->vfs_bsize = recordsize;
    666 	vfsp->vfs_flag |= VFS_NOTRUNC;
    667 	vfsp->vfs_data = zfsvfs;
    668 
    669 	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
    670 		goto out;
    671 
    672 	mode = DS_MODE_OWNER;
    673 	if (readonly)
    674 		mode |= DS_MODE_READONLY;
    675 
    676 	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
    677 	if (error == EROFS) {
    678 		mode = DS_MODE_OWNER | DS_MODE_READONLY;
    679 		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
    680 		    &zfsvfs->z_os);
    681 	}
    682 
    683 	if (error)
    684 		goto out;
    685 
    686 	if (error = zfs_init_fs(zfsvfs, &zp))
    687 		goto out;
    688 
    689 	/* The call to zfs_init_fs leaves the vnode held, release it here. */
    690 	VN_RELE(ZTOV(zp));
    691 
    692 	/*
    693 	 * Set features for file system.
    694 	 */
    695 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
    696 	if (zfsvfs->z_use_fuids) {
    697 		vfs_set_feature(vfsp, VFSFT_XVATTR);
    698 		vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
    699 		vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
    700 		vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
    701 	}
    702 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
    703 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
    704 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
    705 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
    706 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
    707 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
    708 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
    709 	}
    710 
    711 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
    712 		uint64_t pval;
    713 
    714 		ASSERT(mode & DS_MODE_READONLY);
    715 		atime_changed_cb(zfsvfs, B_FALSE);
    716 		readonly_changed_cb(zfsvfs, B_TRUE);
    717 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
    718 			goto out;
    719 		xattr_changed_cb(zfsvfs, pval);
    720 		zfsvfs->z_issnap = B_TRUE;
    721 	} else {
    722 		error = zfsvfs_setup(zfsvfs, B_TRUE);
    723 	}
    724 
    725 	if (!zfsvfs->z_issnap)
    726 		zfsctl_create(zfsvfs);
    727 out:
    728 	if (error) {
    729 		if (zfsvfs->z_os)
    730 			dmu_objset_close(zfsvfs->z_os);
    731 		zfs_freezfsvfs(zfsvfs);
    732 	} else {
    733 		atomic_add_32(&zfs_active_fs_count, 1);
    734 	}
    735 
    736 	return (error);
    737 }
    738 
    739 void
    740 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
    741 {
    742 	objset_t *os = zfsvfs->z_os;
    743 	struct dsl_dataset *ds;
    744 
    745 	/*
    746 	 * Unregister properties.
    747 	 */
    748 	if (!dmu_objset_is_snapshot(os)) {
    749 		ds = dmu_objset_ds(os);
    750 		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
    751 		    zfsvfs) == 0);
    752 
    753 		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
    754 		    zfsvfs) == 0);
    755 
    756 		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
    757 		    zfsvfs) == 0);
    758 
    759 		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
    760 		    zfsvfs) == 0);
    761 
    762 		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
    763 		    zfsvfs) == 0);
    764 
    765 		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
    766 		    zfsvfs) == 0);
    767 
    768 		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
    769 		    zfsvfs) == 0);
    770 
    771 		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
    772 		    zfsvfs) == 0);
    773 
    774 		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
    775 		    zfsvfs) == 0);
    776 
    777 		VERIFY(dsl_prop_unregister(ds, "aclinherit",
    778 		    acl_inherit_changed_cb, zfsvfs) == 0);
    779 
    780 		VERIFY(dsl_prop_unregister(ds, "vscan",
    781 		    vscan_changed_cb, zfsvfs) == 0);
    782 	}
    783 }
    784 
    785 /*
    786  * Convert a decimal digit string to a uint64_t integer.
    787  */
    788 static int
    789 str_to_uint64(char *str, uint64_t *objnum)
    790 {
    791 	uint64_t num = 0;
    792 
    793 	while (*str) {
    794 		if (*str < '0' || *str > '9')
    795 			return (EINVAL);
    796 
    797 		num = num*10 + *str++ - '0';
    798 	}
    799 
    800 	*objnum = num;
    801 	return (0);
    802 }
    803 
    804 /*
    805  * The boot path passed from the boot loader is in the form of
    806  * "rootpool-name/root-filesystem-object-number'. Convert this
    807  * string to a dataset name: "rootpool-name/root-filesystem-name".
    808  */
    809 static int
    810 zfs_parse_bootfs(char *bpath, char *outpath)
    811 {
    812 	char *slashp;
    813 	uint64_t objnum;
    814 	int error;
    815 
    816 	if (*bpath == 0 || *bpath == '/')
    817 		return (EINVAL);
    818 
    819 	(void) strcpy(outpath, bpath);
    820 
    821 	slashp = strchr(bpath, '/');
    822 
    823 	/* if no '/', just return the pool name */
    824 	if (slashp == NULL) {
    825 		return (0);
    826 	}
    827 
    828 	/* if not a number, just return the root dataset name */
    829 	if (str_to_uint64(slashp+1, &objnum)) {
    830 		return (0);
    831 	}
    832 
    833 	*slashp = '\0';
    834 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
    835 	*slashp = '/';
    836 
    837 	return (error);
    838 }
    839 
    840 static int
    841 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
    842 {
    843 	int error = 0;
    844 	static int zfsrootdone = 0;
    845 	zfsvfs_t *zfsvfs = NULL;
    846 	znode_t *zp = NULL;
    847 	vnode_t *vp = NULL;
    848 	char *zfs_bootfs;
    849 	char *zfs_devid;
    850 
    851 	ASSERT(vfsp);
    852 
    853 	/*
    854 	 * The filesystem that we mount as root is defined in the
    855 	 * boot property "zfs-bootfs" with a format of
    856 	 * "poolname/root-dataset-objnum".
    857 	 */
    858 	if (why == ROOT_INIT) {
    859 		if (zfsrootdone++)
    860 			return (EBUSY);
    861 		/*
    862 		 * the process of doing a spa_load will require the
    863 		 * clock to be set before we could (for example) do
    864 		 * something better by looking at the timestamp on
    865 		 * an uberblock, so just set it to -1.
    866 		 */
    867 		clkset(-1);
    868 
    869 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
    870 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
    871 			    "bootfs name");
    872 			return (EINVAL);
    873 		}
    874 		zfs_devid = spa_get_bootprop("diskdevid");
    875 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
    876 		if (zfs_devid)
    877 			spa_free_bootprop(zfs_devid);
    878 		if (error) {
    879 			spa_free_bootprop(zfs_bootfs);
    880 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
    881 			    error);
    882 			return (error);
    883 		}
    884 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
    885 			spa_free_bootprop(zfs_bootfs);
    886 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
    887 			    error);
    888 			return (error);
    889 		}
    890 
    891 		spa_free_bootprop(zfs_bootfs);
    892 
    893 		if (error = vfs_lock(vfsp))
    894 			return (error);
    895 
    896 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
    897 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
    898 			goto out;
    899 		}
    900 
    901 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
    902 		ASSERT(zfsvfs);
    903 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
    904 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
    905 			goto out;
    906 		}
    907 
    908 		vp = ZTOV(zp);
    909 		mutex_enter(&vp->v_lock);
    910 		vp->v_flag |= VROOT;
    911 		mutex_exit(&vp->v_lock);
    912 		rootvp = vp;
    913 
    914 		/*
    915 		 * Leave rootvp held.  The root file system is never unmounted.
    916 		 */
    917 
    918 		vfs_add((struct vnode *)0, vfsp,
    919 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
    920 out:
    921 		vfs_unlock(vfsp);
    922 		return (error);
    923 	} else if (why == ROOT_REMOUNT) {
    924 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
    925 		vfsp->vfs_flag |= VFS_REMOUNT;
    926 
    927 		/* refresh mount options */
    928 		zfs_unregister_callbacks(vfsp->vfs_data);
    929 		return (zfs_register_callbacks(vfsp));
    930 
    931 	} else if (why == ROOT_UNMOUNT) {
    932 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
    933 		(void) zfs_sync(vfsp, 0, 0);
    934 		return (0);
    935 	}
    936 
    937 	/*
    938 	 * if "why" is equal to anything else other than ROOT_INIT,
    939 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
    940 	 */
    941 	return (ENOTSUP);
    942 }
    943 
    944 /*ARGSUSED*/
    945 static int
    946 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
    947 {
    948 	char		*osname;
    949 	pathname_t	spn;
    950 	int		error = 0;
    951 	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
    952 	    UIO_SYSSPACE : UIO_USERSPACE;
    953 	int		canwrite;
    954 
    955 	if (mvp->v_type != VDIR)
    956 		return (ENOTDIR);
    957 
    958 	mutex_enter(&mvp->v_lock);
    959 	if ((uap->flags & MS_REMOUNT) == 0 &&
    960 	    (uap->flags & MS_OVERLAY) == 0 &&
    961 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
    962 		mutex_exit(&mvp->v_lock);
    963 		return (EBUSY);
    964 	}
    965 	mutex_exit(&mvp->v_lock);
    966 
    967 	/*
    968 	 * ZFS does not support passing unparsed data in via MS_DATA.
    969 	 * Users should use the MS_OPTIONSTR interface; this means
    970 	 * that all option parsing is already done and the options struct
    971 	 * can be interrogated.
    972 	 */
    973 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
    974 		return (EINVAL);
    975 
    976 	/*
    977 	 * Get the objset name (the "special" mount argument).
    978 	 */
    979 	if (error = pn_get(uap->spec, fromspace, &spn))
    980 		return (error);
    981 
    982 	osname = spn.pn_path;
    983 
    984 	/*
    985 	 * Check for mount privilege?
    986 	 *
    987 	 * If we don't have privilege then see if
    988 	 * we have local permission to allow it
    989 	 */
    990 	error = secpolicy_fs_mount(cr, mvp, vfsp);
    991 	if (error) {
    992 		error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
    993 		if (error == 0) {
    994 			vattr_t		vattr;
    995 
    996 			/*
    997 			 * Make sure user is the owner of the mount point
    998 			 * or has sufficient privileges.
    9