Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)zfs_vfsops.c	1.38	07/12/07 SMI"
     27 
     28 #include <sys/types.h>
     29 #include <sys/param.h>
     30 #include <sys/systm.h>
     31 #include <sys/sysmacros.h>
     32 #include <sys/kmem.h>
     33 #include <sys/pathname.h>
     34 #include <sys/vnode.h>
     35 #include <sys/vfs.h>
     36 #include <sys/vfs_opreg.h>
     37 #include <sys/mntent.h>
     38 #include <sys/mount.h>
     39 #include <sys/cmn_err.h>
     40 #include "fs/fs_subr.h"
     41 #include <sys/zfs_znode.h>
     42 #include <sys/zfs_dir.h>
     43 #include <sys/zil.h>
     44 #include <sys/fs/zfs.h>
     45 #include <sys/dmu.h>
     46 #include <sys/dsl_prop.h>
     47 #include <sys/dsl_dataset.h>
     48 #include <sys/dsl_deleg.h>
     49 #include <sys/spa.h>
     50 #include <sys/zap.h>
     51 #include <sys/varargs.h>
     52 #include <sys/policy.h>
     53 #include <sys/atomic.h>
     54 #include <sys/mkdev.h>
     55 #include <sys/modctl.h>
     56 #include <sys/refstr.h>
     57 #include <sys/zfs_ioctl.h>
     58 #include <sys/zfs_ctldir.h>
     59 #include <sys/zfs_fuid.h>
     60 #include <sys/bootconf.h>
     61 #include <sys/sunddi.h>
     62 #include <sys/dnlc.h>
     63 #include <sys/dmu_objset.h>
     64 
/* Filesystem type index; zfs_domount() copies it into vfs_fstype. */
int zfsfstype;
/* ZFS vfs operations vector (not assigned in this part of the file). */
vfsops_t *zfs_vfsops = NULL;
/*
 * Major/minor pair that zfs_create_unique_device() advances to build the
 * device number of each new mount.  Updates are made with zfs_dev_mtx held.
 */
static major_t zfs_major;
static minor_t zfs_minor;
static kmutex_t	zfs_dev_mtx;
     70 
/*
 * Forward declarations of the VFS entry points that are referenced by the
 * zfs_vfsops_template[] operations table.
 */
static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
static void zfs_freevfs(vfs_t *vfsp);
     78 
/*
 * Operations table for a ZFS filesystem: pairs each VFS operation name
 * with the ZFS function that implements it.  The list is terminated by a
 * NULL/NULL entry.
 */
static const fs_operation_def_t zfs_vfsops_template[] = {
	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
	NULL,			NULL
};
     90 
/*
 * Reduced operations table that supplies only the freevfs operation.
 * NOTE(review): judging by the name, this is presumably installed when
 * every other operation should fail with EIO (e.g. after a forced
 * unmount) -- confirm against the code that uses it.
 */
static const fs_operation_def_t zfs_vfsops_eio_template[] = {
	VFSNAME_FREEVFS,	{ .vfs_freevfs =  zfs_freevfs },
	NULL,			NULL
};
     95 
/*
 * We need to keep a count of active filesystems.
 * This is necessary to prevent our module
 * from being unloaded after a umount -f.
 *
 * zfs_domount() atomically increments the count for every successful mount.
 */
static uint32_t	zfs_active_fs_count = 0;
    102 
/*
 * NULL-terminated option lists referenced from the mntopts[] entries below.
 * Each <opt>_cancel list names the opposite option; presumably that is the
 * option cancelled when <opt> is set -- confirm against mntopt_t.
 */
static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };

/*
 * Mount options specific to ZFS.
 *
 * MO_DEFAULT is not used since the default value is determined
 * by the equivalent property.
 */
static mntopt_t mntopts[] = {
	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
};

/* Option table descriptor: the number of entries in mntopts[] and the array. */
static mntopts_t zfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};
    123 
    124 /*ARGSUSED*/
    125 int
    126 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
    127 {
    128 	/*
    129 	 * Data integrity is job one.  We don't want a compromised kernel
    130 	 * writing to the storage pool, so we never sync during panic.
    131 	 */
    132 	if (panicstr)
    133 		return (0);
    134 
    135 	/*
    136 	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
    137 	 * to sync metadata, which they would otherwise cache indefinitely.
    138 	 * Semantically, the only requirement is that the sync be initiated.
    139 	 * The DMU syncs out txgs frequently, so there's nothing to do.
    140 	 */
    141 	if (flag & SYNC_ATTR)
    142 		return (0);
    143 
    144 	if (vfsp != NULL) {
    145 		/*
    146 		 * Sync a specific filesystem.
    147 		 */
    148 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
    149 
    150 		ZFS_ENTER(zfsvfs);
    151 		if (zfsvfs->z_log != NULL)
    152 			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
    153 		else
    154 			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
    155 		ZFS_EXIT(zfsvfs);
    156 	} else {
    157 		/*
    158 		 * Sync all ZFS filesystems.  This is what happens when you
    159 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
    160 		 * request by waiting for all pools to commit all dirty data.
    161 		 */
    162 		spa_sync_allpools();
    163 	}
    164 
    165 	return (0);
    166 }
    167 
/*
 * Choose a device number for a new mount and store it in *dev.
 *
 * Minor numbers under the current zfs_major are tried one after another
 * until vfs_devismounted() reports one that is not in use.  If the search
 * arrives back at the minor it started from, a new major number is
 * requested from getudev() and the search starts over.
 *
 * Returns 0 on success, or -1 if getudev() cannot supply a major number.
 */
static int
zfs_create_unique_device(dev_t *dev)
{
	major_t new_major;

	do {
		ASSERT3U(zfs_minor, <=, MAXMIN32);
		/*
		 * Remember where this pass began so that wrapping all the
		 * way around can be detected.  Note that zfs_minor is read
		 * here, and in the loop conditions below, without
		 * zfs_dev_mtx held.
		 */
		minor_t start = zfs_minor;
		do {
			mutex_enter(&zfs_dev_mtx);
			if (zfs_minor >= MAXMIN32) {
				/*
				 * If we're still using the real major
				 * keep out of /dev/zfs and /dev/zvol minor
				 * number space.  If we're using a getudev()'ed
				 * major number, we can use all of its minors.
				 */
				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
					zfs_minor = ZFS_MIN_MINOR;
				else
					zfs_minor = 0;
			} else {
				zfs_minor++;
			}
			*dev = makedevice(zfs_major, zfs_minor);
			mutex_exit(&zfs_dev_mtx);
		} while (vfs_devismounted(*dev) && zfs_minor != start);
		if (zfs_minor == start) {
			/*
			 * We are using all ~262,000 minor numbers for the
			 * current major number.  Create a new major number.
			 */
			if ((new_major = getudev()) == (major_t)-1) {
				cmn_err(CE_WARN,
				    "zfs_mount: Can't get unique major "
				    "device number.");
				return (-1);
			}
			mutex_enter(&zfs_dev_mtx);
			zfs_major = new_major;
			zfs_minor = 0;

			mutex_exit(&zfs_dev_mtx);
		} else {
			/* Found a device number that is not mounted. */
			break;
		}
		/* CONSTANTCONDITION */
	} while (1);

	return (0);
}
    219 
    220 static void
    221 atime_changed_cb(void *arg, uint64_t newval)
    222 {
    223 	zfsvfs_t *zfsvfs = arg;
    224 
    225 	if (newval == TRUE) {
    226 		zfsvfs->z_atime = TRUE;
    227 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
    228 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
    229 	} else {
    230 		zfsvfs->z_atime = FALSE;
    231 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
    232 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
    233 	}
    234 }
    235 
    236 static void
    237 xattr_changed_cb(void *arg, uint64_t newval)
    238 {
    239 	zfsvfs_t *zfsvfs = arg;
    240 
    241 	if (newval == TRUE) {
    242 		/* XXX locking on vfs_flag? */
    243 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
    244 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
    245 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
    246 	} else {
    247 		/* XXX locking on vfs_flag? */
    248 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
    249 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
    250 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
    251 	}
    252 }
    253 
    254 static void
    255 blksz_changed_cb(void *arg, uint64_t newval)
    256 {
    257 	zfsvfs_t *zfsvfs = arg;
    258 
    259 	if (newval < SPA_MINBLOCKSIZE ||
    260 	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
    261 		newval = SPA_MAXBLOCKSIZE;
    262 
    263 	zfsvfs->z_max_blksz = newval;
    264 	zfsvfs->z_vfs->vfs_bsize = newval;
    265 }
    266 
    267 static void
    268 readonly_changed_cb(void *arg, uint64_t newval)
    269 {
    270 	zfsvfs_t *zfsvfs = arg;
    271 
    272 	if (newval) {
    273 		/* XXX locking on vfs_flag? */
    274 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
    275 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
    276 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
    277 	} else {
    278 		/* XXX locking on vfs_flag? */
    279 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    280 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
    281 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
    282 	}
    283 }
    284 
    285 static void
    286 devices_changed_cb(void *arg, uint64_t newval)
    287 {
    288 	zfsvfs_t *zfsvfs = arg;
    289 
    290 	if (newval == FALSE) {
    291 		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
    292 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
    293 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
    294 	} else {
    295 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
    296 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
    297 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
    298 	}
    299 }
    300 
    301 static void
    302 setuid_changed_cb(void *arg, uint64_t newval)
    303 {
    304 	zfsvfs_t *zfsvfs = arg;
    305 
    306 	if (newval == FALSE) {
    307 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
    308 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
    309 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
    310 	} else {
    311 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
    312 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
    313 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
    314 	}
    315 }
    316 
    317 static void
    318 exec_changed_cb(void *arg, uint64_t newval)
    319 {
    320 	zfsvfs_t *zfsvfs = arg;
    321 
    322 	if (newval == FALSE) {
    323 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
    324 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
    325 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
    326 	} else {
    327 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
    328 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
    329 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
    330 	}
    331 }
    332 
    333 /*
    334  * The nbmand mount option can be changed at mount time.
    335  * We can't allow it to be toggled on live file systems or incorrect
    336  * behavior may be seen from cifs clients
    337  *
    338  * This property isn't registered via dsl_prop_register(), but this callback
    339  * will be called when a file system is first mounted
    340  */
    341 static void
    342 nbmand_changed_cb(void *arg, uint64_t newval)
    343 {
    344 	zfsvfs_t *zfsvfs = arg;
    345 	if (newval == FALSE) {
    346 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
    347 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
    348 	} else {
    349 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
    350 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
    351 	}
    352 }
    353 
    354 static void
    355 snapdir_changed_cb(void *arg, uint64_t newval)
    356 {
    357 	zfsvfs_t *zfsvfs = arg;
    358 
    359 	zfsvfs->z_show_ctldir = newval;
    360 }
    361 
    362 static void
    363 vscan_changed_cb(void *arg, uint64_t newval)
    364 {
    365 	zfsvfs_t *zfsvfs = arg;
    366 
    367 	zfsvfs->z_vscan = newval;
    368 }
    369 
    370 static void
    371 acl_mode_changed_cb(void *arg, uint64_t newval)
    372 {
    373 	zfsvfs_t *zfsvfs = arg;
    374 
    375 	zfsvfs->z_acl_mode = newval;
    376 }
    377 
    378 static void
    379 acl_inherit_changed_cb(void *arg, uint64_t newval)
    380 {
    381 	zfsvfs_t *zfsvfs = arg;
    382 
    383 	zfsvfs->z_acl_inherit = newval;
    384 }
    385 
/*
 * Register the property-change callbacks for the filesystem mounted on
 * 'vfsp', preserving any mount options that were given explicitly.
 *
 * The work is done in three steps:
 *   1. Record which of the ro/rw, nosuid, devices, setuid, exec, xattr and
 *      atime options are currently set on the vfs, and determine nbmand
 *      (from the mount options, or else from the dataset property).
 *   2. Register one callback per property with dsl_prop_register().
 *   3. Call the callbacks directly with the values recorded in step 1, so
 *      that explicit mount options override the property values.
 *
 * Returns 0 on success.  If looking up "nbmand" fails, its error is
 * returned before anything has been registered.  If any registration
 * fails, all callbacks are unregistered again and that error is returned.
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	/*
	 * For each option, the value variable is only meaningful (and only
	 * read) when the matching do_* flag has been set.
	 */
	int readonly, do_readonly = B_FALSE;
	int setuid, do_setuid = B_FALSE;
	int exec, do_exec = B_FALSE;
	int devices, do_devices = B_FALSE;
	int xattr, do_xattr = B_FALSE;
	int atime, do_atime = B_FALSE;
	int error = 0;

	ASSERT(vfsp);
	zfsvfs = vfsp->vfs_data;
	ASSERT(zfsvfs);
	os = zfsvfs->z_os;

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have.  In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	/* "nosuid" turns off both devices and setuid. */
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		devices = B_FALSE;
		setuid = B_FALSE;
		do_devices = B_TRUE;
		do_setuid = B_TRUE;
	} else {
		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
			devices = B_FALSE;
			do_devices = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
			devices = B_TRUE;
			do_devices = B_TRUE;
		}

		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
			setuid = B_FALSE;
			do_setuid = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
			setuid = B_TRUE;
			do_setuid = B_TRUE;
		}
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		xattr = B_FALSE;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		xattr = B_TRUE;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * nbmand is a special property.  It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else {
		char osname[MAXNAMELEN];

		/* No explicit option: fall back to the dataset's property. */
		dmu_objset_name(os, osname);
		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
		    NULL))
		return (error);
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 *
	 * Each registration below is attempted only if no earlier one
	 * has failed; 'error' keeps the first failure.
	 */
	ds = dmu_objset_ds(os);
	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "xattr", xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "recordsize", blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "readonly", readonly_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "devices", devices_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "setuid", setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "exec", exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "snapdir", snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclmode", acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "vscan", vscan_changed_cb, zfsvfs);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_devices)
		devices_changed_cb(zfsvfs, devices);
	if (do_xattr)
		xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);

	/* nbmand has no registered callback, so it is always applied here. */
	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

unregister:
	/*
	 * We may attempt to unregister some callbacks that are not
	 * registered, but this is OK; it will simply return ENOMSG,
	 * which we will ignore.
	 */
	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
	    zfsvfs);
	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
	return (error);

}
    556 
    557 static int
    558 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
    559 {
    560 	uint_t readonly;
    561 	int error;
    562 
    563 	error = zfs_register_callbacks(zfsvfs->z_vfs);
    564 	if (error)
    565 		return (error);
    566 
    567 	/*
    568 	 * Set the objset user_ptr to track its zfsvfs.
    569 	 */
    570 	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
    571 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
    572 	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
    573 
    574 	/*
    575 	 * If we are not mounting (ie: online recv), then we don't
    576 	 * have to worry about replaying the log as we blocked all
    577 	 * operations out since we closed the ZIL.
    578 	 */
    579 	if (mounting) {
    580 		/*
    581 		 * During replay we remove the read only flag to
    582 		 * allow replays to succeed.
    583 		 */
    584 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
    585 		if (readonly != 0)
    586 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
    587 		else
    588 			zfs_unlinked_drain(zfsvfs);
    589 
    590 		/*
    591 		 * Parse and replay the intent log.
    592 		 *
    593 		 * Because of ziltest, this must be done after
    594 		 * zfs_unlinked_drain().  (Further note: ziltest doesn't
    595 		 * use readonly mounts, where zfs_unlinked_drain() isn't
    596 		 * called.)  This is because ziltest causes spa_sync()
    597 		 * to think it's committed, but actually it is not, so
    598 		 * the intent log contains many txg's worth of changes.
    599 		 *
    600 		 * In particular, if object N is in the unlinked set in
    601 		 * the last txg to actually sync, then it could be
    602 		 * actually freed in a later txg and then reallocated in
    603 		 * a yet later txg.  This would write a "create object
    604 		 * N" record to the intent log.  Normally, this would be
    605 		 * fine because the spa_sync() would have written out
    606 		 * the fact that object N is free, before we could write
    607 		 * the "create object N" intent log record.
    608 		 *
    609 		 * But when we are in ziltest mode, we advance the "open
    610 		 * txg" without actually spa_sync()-ing the changes to
    611 		 * disk.  So we would see that object N is still
    612 		 * allocated and in the unlinked set, and there is an
    613 		 * intent log record saying to allocate it.
    614 		 */
    615 		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
    616 		    zfs_replay_vector);
    617 
    618 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
    619 	}
    620 
    621 	if (!zil_disable)
    622 		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
    623 
    624 	return (0);
    625 }
    626 
    627 static int
    628 zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
    629 {
    630 	dev_t mount_dev;
    631 	uint64_t recordsize, readonly;
    632 	int error = 0;
    633 	int mode;
    634 	zfsvfs_t *zfsvfs;
    635 	znode_t *zp = NULL;
    636 
    637 	ASSERT(vfsp);
    638 	ASSERT(osname);
    639 
    640 	/*
    641 	 * Initialize the zfs-specific filesystem structure.
    642 	 * Should probably make this a kmem cache, shuffle fields,
    643 	 * and just bzero up to z_hold_mtx[].
    644 	 */
    645 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
    646 	zfsvfs->z_vfs = vfsp;
    647 	zfsvfs->z_parent = zfsvfs;
    648 	zfsvfs->z_assign = TXG_NOWAIT;
    649 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
    650 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
    651 
    652 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
    653 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
    654 	    offsetof(znode_t, z_link_node));
    655 	rrw_init(&zfsvfs->z_teardown_lock);
    656 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
    657 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
    658 
    659 	/* Initialize the generic filesystem structure. */
    660 	vfsp->vfs_bcount = 0;
    661 	vfsp->vfs_data = NULL;
    662 
    663 	if (zfs_create_unique_device(&mount_dev) == -1) {
    664 		error = ENODEV;
    665 		goto out;
    666 	}
    667 	ASSERT(vfs_devismounted(mount_dev) == 0);
    668 
    669 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
    670 	    NULL))
    671 		goto out;
    672 
    673 	vfsp->vfs_dev = mount_dev;
    674 	vfsp->vfs_fstype = zfsfstype;
    675 	vfsp->vfs_bsize = recordsize;
    676 	vfsp->vfs_flag |= VFS_NOTRUNC;
    677 	vfsp->vfs_data = zfsvfs;
    678 
    679 	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
    680 		goto out;
    681 
    682 	if (readonly)
    683 		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
    684 	else
    685 		mode = DS_MODE_PRIMARY;
    686 
    687 	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
    688 	if (error == EROFS) {
    689 		mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
    690 		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
    691 		    &zfsvfs->z_os);
    692 	}
    693 
    694 	if (error)
    695 		goto out;
    696 
    697 	if (error = zfs_init_fs(zfsvfs, &zp, cr))
    698 		goto out;
    699 
    700 	/* The call to zfs_init_fs leaves the vnode held, release it here. */
    701 	VN_RELE(ZTOV(zp));
    702 
    703 	/*
    704 	 * Set features for file system.
    705 	 */
    706 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
    707 	if (zfsvfs->z_use_fuids) {
    708 		vfs_set_feature(vfsp, VFSFT_XVATTR);
    709 		vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
    710 		vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
    711 	}
    712 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
    713 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
    714 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
    715 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
    716 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
    717 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
    718 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
    719 	}
    720 
    721 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
    722 		uint64_t pval;
    723 
    724 		ASSERT(mode & DS_MODE_READONLY);
    725 		atime_changed_cb(zfsvfs, B_FALSE);
    726 		readonly_changed_cb(zfsvfs, B_TRUE);
    727 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
    728 			goto out;
    729 		xattr_changed_cb(zfsvfs, pval);
    730 		zfsvfs->z_issnap = B_TRUE;
    731 	} else {
    732 		error = zfsvfs_setup(zfsvfs, B_TRUE);
    733 	}
    734 
    735 	if (!zfsvfs->z_issnap)
    736 		zfsctl_create(zfsvfs);
    737 out:
    738 	if (error) {
    739 		if (zfsvfs->z_os)
    740 			dmu_objset_close(zfsvfs->z_os);
    741 		mutex_destroy(&zfsvfs->z_znodes_lock);
    742 		list_destroy(&zfsvfs->z_all_znodes);
    743 		rrw_destroy(&zfsvfs->z_teardown_lock);
    744 		rw_destroy(&zfsvfs->z_teardown_inactive_lock);
    745 		rw_destroy(&zfsvfs->z_fuid_lock);
    746 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
    747 	} else {
    748 		atomic_add_32(&zfs_active_fs_count, 1);
    749 	}
    750 
    751 	return (error);
    752 }
    753 
    754 void
    755 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
    756 {
    757 	objset_t *os = zfsvfs->z_os;
    758 	struct dsl_dataset *ds;
    759 
    760 	/*
    761 	 * Unregister properties.
    762 	 */
    763 	if (!dmu_objset_is_snapshot(os)) {
    764 		ds = dmu_objset_ds(os);
    765 		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
    766 		    zfsvfs) == 0);
    767 
    768 		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
    769 		    zfsvfs) == 0);
    770 
    771 		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
    772 		    zfsvfs) == 0);
    773 
    774 		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
    775 		    zfsvfs) == 0);
    776 
    777 		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
    778 		    zfsvfs) == 0);
    779 
    780 		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
    781 		    zfsvfs) == 0);
    782 
    783 		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
    784 		    zfsvfs) == 0);
    785 
    786 		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
    787 		    zfsvfs) == 0);
    788 
    789 		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
    790 		    zfsvfs) == 0);
    791 
    792 		VERIFY(dsl_prop_unregister(ds, "aclinherit",
    793 		    acl_inherit_changed_cb, zfsvfs) == 0);
    794 
    795 		VERIFY(dsl_prop_unregister(ds, "vscan",
    796 		    vscan_changed_cb, zfsvfs) == 0);
    797 	}
    798 }
    799 
    800 /*
    801  * Convert a decimal digit string to a uint64_t integer.
    802  */
    803 static int
    804 str_to_uint64(char *str, uint64_t *objnum)
    805 {
    806 	uint64_t num = 0;
    807 
    808 	while (*str) {
    809 		if (*str < '0' || *str > '9')
    810 			return (EINVAL);
    811 
    812 		num = num*10 + *str++ - '0';
    813 	}
    814 
    815 	*objnum = num;
    816 	return (0);
    817 }
    818 
    819 /*
    820  * The boot path passed from the boot loader is in the form of
    821  * "rootpool-name/root-filesystem-object-number'. Convert this
    822  * string to a dataset name: "rootpool-name/root-filesystem-name".
    823  */
    824 static int
    825 parse_bootpath(char *bpath, char *outpath)
    826 {
    827 	char *slashp;
    828 	uint64_t objnum;
    829 	int error;
    830 
    831 	if (*bpath == 0 || *bpath == '/')
    832 		return (EINVAL);
    833 
    834 	slashp = strchr(bpath, '/');
    835 
    836 	/* if no '/', just return the pool name */
    837 	if (slashp == NULL) {
    838 		(void) strcpy(outpath, bpath);
    839 		return (0);
    840 	}
    841 
    842 	if (error = str_to_uint64(slashp+1, &objnum))
    843 		return (error);
    844 
    845 	*slashp = '\0';
    846 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
    847 	*slashp = '/';
    848 
    849 	return (error);
    850 }
    851 
    852 static int
    853 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
    854 {
    855 	int error = 0;
    856 	int ret = 0;
    857 	static int zfsrootdone = 0;
    858 	zfsvfs_t *zfsvfs = NULL;
    859 	znode_t *zp = NULL;
    860 	vnode_t *vp = NULL;
    861 	char *zfs_bootpath;
    862 #if defined(_OBP)
    863 	int proplen;
    864 #endif
    865 
    866 	ASSERT(vfsp);
    867 
    868 	/*
    869 	 * The filesystem that we mount as root is defined in the
    870 	 * "zfs-bootfs" property.
    871 	 */
    872 	if (why == ROOT_INIT) {
    873 		if (zfsrootdone++)
    874 			return (EBUSY);
    875 
    876 #if defined(_OBP)
    877 		proplen = BOP_GETPROPLEN(bootops, "zfs-bootfs");
    878 		if (proplen == 0)
    879 			return (EIO);
    880 		zfs_bootpath = kmem_zalloc(proplen, KM_SLEEP);
    881 		if (BOP_GETPROP(bootops, "zfs-bootfs", zfs_bootpath) == -1) {
    882 			kmem_free(zfs_bootpath, proplen);
    883 			return (EIO);
    884 		}
    885 		error = parse_bootpath(zfs_bootpath, rootfs.bo_name);
    886 		kmem_free(zfs_bootpath, proplen);
    887 #else
    888 		if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
    889 		    DDI_PROP_DONTPASS, "zfs-bootfs", &zfs_bootpath) !=
    890 		    DDI_SUCCESS)
    891 			return (EIO);
    892 
    893 		error = parse_bootpath(zfs_bootpath, rootfs.bo_name);
    894 		ddi_prop_free(zfs_bootpath);
    895 #endif
    896 
    897 		if (error)
    898 			return (error);
    899 
    900 		if (error = vfs_lock(vfsp))
    901 			return (error);
    902 
    903 		if (error = zfs_domount(vfsp, rootfs.bo_name, CRED()))
    904 			goto out;
    905 
    906 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
    907 		ASSERT(zfsvfs);
    908 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp))
    909 			goto out;
    910 
    911 		vp = ZTOV(zp);
    912 		mutex_enter(&vp->v_lock);
    913 		vp->v_flag |= VROOT;
    914 		mutex_exit(&vp->v_lock);
    915 		rootvp = vp;
    916 
    917 		/*
    918 		 * The zfs_zget call above returns with a hold on vp, we release
    919 		 * it here.
    920 		 */
    921 		VN_RELE(vp);
    922 
    923 		/*
    924 		 * Mount root as readonly initially, it will be remouted
    925 		 * read/write by /lib/svc/method/fs-usr.
    926 		 */
    927 		readonly_changed_cb(vfsp->vfs_data, B_TRUE);
    928 		vfs_add((struct vnode *)0, vfsp,
    929 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
    930 out:
    931 		vfs_unlock(vfsp);
    932 		ret = (error) ? error : 0;
    933 		return (ret);
    934 	} else if (why == ROOT_REMOUNT) {
    935 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
    936 		vfsp->vfs_flag |= VFS_REMOUNT;
    937 
    938 		/* refresh mount options */
    939 		zfs_unregister_callbacks(vfsp->vfs_data);
    940 		return (zfs_register_callbacks(vfsp));
    941 
    942 	} else if (why == ROOT_UNMOUNT) {
    943 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
    944 		(void) zfs_sync(vfsp, 0, 0);
    945 		return (0);
    946 	}
    947 
    948 	/*
    949 	 * if "why" is equal to anything else other than ROOT_INIT,
    950 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
    951 	 */
    952 	return (ENOTSUP);
    953 }
    954 
    955 /*ARGSUSED*/
    956 static int
    957 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
    958 {
    959 	char		*osname;
    960 	pathname_t	spn;
    961 	int		error = 0;
    962 	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
    963 	    UIO_SYSSPACE : UIO_USERSPACE;
    964 	int		canwrite;
    965 
    966 	if (mvp->v_type != VDIR)
    967 		return (ENOTDIR);
    968 
    969 	mutex_enter(&mvp->v_lock);
    970 	if ((uap->flags & MS_REMOUNT) == 0 &&
    971 	    (uap->flags & MS_OVERLAY) == 0 &&
    972 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
    973 		mutex_exit(&mvp->v_lock);
    974 		return (EBUSY);
    975 	}
    976 	mutex_exit(&mvp->v_lock);
    977 
    978 	/*
    979 	 * ZFS does not support passing unparsed data in via MS_DATA.
    980 	 * Users should use the MS_OPTIONSTR interface; this means
    981 	 * that all option parsing is already done and the options struct
    982 	 * can be interrogated.
    983 	 */
    984 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
    985 		return (EINVAL);
    986 
    987 	/*
    988 	 * Get the objset name (the "special" mount argument).
    989 	 */
    990 	if (error = pn_get(uap->spec, fromspace, &spn))
    991 		return (error);
    992 
    993 	osname = spn.pn_path;
    994 
    995 	/*
    996 	 * Check for mount privilege?
    997 	 *
    998 	 * If we don't have privilege then see if
    999 	 * we have local permission to allow it
   1000 	 */
   1001 	error = secpolicy_fs_mount(cr, mvp,