Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     27 
     28 /*
     29  * ZFS control directory (a.k.a. ".zfs")
     30  *
     31  * This directory provides a common location for all ZFS meta-objects.
     32  * Currently, this is only the 'snapshot' directory, but this may expand in the
     33  * future.  The elements are built using the GFS primitives, as the hierarchy
     34  * does not actually exist on disk.
     35  *
     36  * For 'snapshot', we don't want to have all snapshots always mounted, because
     37  * this would take up a huge amount of space in /etc/mnttab.  We have three
     38  * types of objects:
     39  *
     40  * 	ctldir ------> snapshotdir -------> snapshot
     41  *                                             |
     42  *                                             |
     43  *                                             V
     44  *                                         mounted fs
     45  *
     46  * The 'snapshot' node contains just enough information to lookup '..' and act
     47  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
     48  * perform an automount of the underlying filesystem and return the
     49  * corresponding vnode.
     50  *
     51  * All mounts are handled automatically by the kernel, but unmounts are
     52  * (currently) handled from user land.  The main reason is that there is no
     53  * reliable way to auto-unmount the filesystem when it's "no longer in use".
     54  * When the user unmounts a filesystem, we call zfsctl_unmount(), which
     55  * unmounts any snapshots within the snapshot directory.
     56  *
     57  * The '.zfs', '.zfs/snapshot', and all directories created under
     58  * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
     59  * share the same vfs_t as the head filesystem (what '.zfs' lives under).
     60  *
 * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
     62  * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
     63  * However, vnodes within these mounted on file systems have their v_vfsp
     64  * fields set to the head filesystem to make NFS happy (see
     65  * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
     66  * so that it cannot be freed until all snapshots have been unmounted.
     67  */
     68 
     69 #include <fs/fs_subr.h>
     70 #include <sys/zfs_ctldir.h>
     71 #include <sys/zfs_ioctl.h>
     72 #include <sys/zfs_vfsops.h>
     73 #include <sys/vfs_opreg.h>
     74 #include <sys/gfs.h>
     75 #include <sys/stat.h>
     76 #include <sys/dmu.h>
     77 #include <sys/dsl_deleg.h>
     78 #include <sys/mount.h>
     79 #include <sys/sunddi.h>
     80 
     81 #include "zfs_namecheck.h"
     82 
     83 typedef struct zfsctl_node {
     84 	gfs_dir_t	zc_gfs_private;
     85 	uint64_t	zc_id;
     86 	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
     87 } zfsctl_node_t;
     88 
     89 typedef struct zfsctl_snapdir {
     90 	zfsctl_node_t	sd_node;
     91 	kmutex_t	sd_lock;
     92 	avl_tree_t	sd_snaps;
     93 } zfsctl_snapdir_t;
     94 
     95 typedef struct {
     96 	char		*se_name;
     97 	vnode_t		*se_root;
     98 	avl_node_t	se_node;
     99 } zfs_snapentry_t;
    100 
    101 static int
    102 snapentry_compare(const void *a, const void *b)
    103 {
    104 	const zfs_snapentry_t *sa = a;
    105 	const zfs_snapentry_t *sb = b;
    106 	int ret = strcmp(sa->se_name, sb->se_name);
    107 
    108 	if (ret < 0)
    109 		return (-1);
    110 	else if (ret > 0)
    111 		return (1);
    112 	else
    113 		return (0);
    114 }
    115 
    116 vnodeops_t *zfsctl_ops_root;
    117 vnodeops_t *zfsctl_ops_snapdir;
    118 vnodeops_t *zfsctl_ops_snapshot;
    119 
    120 static const fs_operation_def_t zfsctl_tops_root[];
    121 static const fs_operation_def_t zfsctl_tops_snapdir[];
    122 static const fs_operation_def_t zfsctl_tops_snapshot[];
    123 
    124 static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
    125 static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
    126 static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
    127 
    128 static gfs_opsvec_t zfsctl_opsvec[] = {
    129 	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
    130 	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
    131 	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
    132 	{ NULL }
    133 };
    134 
    135 /*
    136  * Root directory elements.  We have only a single static entry, 'snapshot'.
    137  */
    138 static gfs_dirent_t zfsctl_root_entries[] = {
    139 	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
    140 	{ NULL }
    141 };
    142 
    143 /* include . and .. in the calculation */
    144 #define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
    145     sizeof (gfs_dirent_t)) + 1)
    146 
    147 
    148 /*
    149  * Initialize the various GFS pieces we'll need to create and manipulate .zfs
    150  * directories.  This is called from the ZFS init routine, and initializes the
    151  * vnode ops vectors that we'll be using.
    152  */
    153 void
    154 zfsctl_init(void)
    155 {
    156 	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
    157 }
    158 
    159 void
    160 zfsctl_fini(void)
    161 {
    162 	/*
    163 	 * Remove vfsctl vnode ops
    164 	 */
    165 	if (zfsctl_ops_root)
    166 		vn_freevnodeops(zfsctl_ops_root);
    167 	if (zfsctl_ops_snapdir)
    168 		vn_freevnodeops(zfsctl_ops_snapdir);
    169 	if (zfsctl_ops_snapshot)
    170 		vn_freevnodeops(zfsctl_ops_snapshot);
    171 
    172 	zfsctl_ops_root = NULL;
    173 	zfsctl_ops_snapdir = NULL;
    174 	zfsctl_ops_snapshot = NULL;
    175 }
    176 
    177 /*
    178  * Return the inode number associated with the 'snapshot' directory.
    179  */
    180 /* ARGSUSED */
    181 static ino64_t
    182 zfsctl_root_inode_cb(vnode_t *vp, int index)
    183 {
    184 	ASSERT(index == 0);
    185 	return (ZFSCTL_INO_SNAPDIR);
    186 }
    187 
    188 /*
    189  * Create the '.zfs' directory.  This directory is cached as part of the VFS
    190  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
    191  * therefore checks against a vfs_count of 2 instead of 1.  This reference
    192  * is removed when the ctldir is destroyed in the unmount.
    193  */
    194 void
    195 zfsctl_create(zfsvfs_t *zfsvfs)
    196 {
    197 	vnode_t *vp, *rvp;
    198 	zfsctl_node_t *zcp;
    199 
    200 	ASSERT(zfsvfs->z_ctldir == NULL);
    201 
    202 	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
    203 	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
    204 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
    205 	zcp = vp->v_data;
    206 	zcp->zc_id = ZFSCTL_INO_ROOT;
    207 
    208 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
    209 	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
    210 	VN_RELE(rvp);
    211 
    212 	/*
    213 	 * We're only faking the fact that we have a root of a filesystem for
    214 	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
    215 	 * for us.
    216 	 */
    217 	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
    218 
    219 	zfsvfs->z_ctldir = vp;
    220 }
    221 
    222 /*
    223  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
    224  * There might still be more references if we were force unmounted, but only
    225  * new zfs_inactive() calls can occur and they don't reference .zfs
    226  */
    227 void
    228 zfsctl_destroy(zfsvfs_t *zfsvfs)
    229 {
    230 	VN_RELE(zfsvfs->z_ctldir);
    231 	zfsvfs->z_ctldir = NULL;
    232 }
    233 
    234 /*
    235  * Given a root znode, retrieve the associated .zfs directory.
    236  * Add a hold to the vnode and return it.
    237  */
    238 vnode_t *
    239 zfsctl_root(znode_t *zp)
    240 {
    241 	ASSERT(zfs_has_ctldir(zp));
    242 	VN_HOLD(zp->z_zfsvfs->z_ctldir);
    243 	return (zp->z_zfsvfs->z_ctldir);
    244 }
    245 
    246 /*
    247  * Common open routine.  Disallow any write access.
    248  */
    249 /* ARGSUSED */
    250 static int
    251 zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
    252 {
    253 	if (flags & FWRITE)
    254 		return (EACCES);
    255 
    256 	return (0);
    257 }
    258 
    259 /*
    260  * Common close routine.  Nothing to do here.
    261  */
    262 /* ARGSUSED */
    263 static int
    264 zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
    265     cred_t *cr, caller_context_t *ct)
    266 {
    267 	return (0);
    268 }
    269 
    270 /*
    271  * Common access routine.  Disallow writes.
    272  */
    273 /* ARGSUSED */
    274 static int
    275 zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    276     caller_context_t *ct)
    277 {
    278 	if (mode & VWRITE)
    279 		return (EACCES);
    280 
    281 	return (0);
    282 }
    283 
    284 /*
    285  * Common getattr function.  Fill in basic information.
    286  */
    287 static void
    288 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
    289 {
    290 	zfsctl_node_t	*zcp = vp->v_data;
    291 	timestruc_t	now;
    292 
    293 	vap->va_uid = 0;
    294 	vap->va_gid = 0;
    295 	vap->va_rdev = 0;
    296 	/*
    297 	 * We are a purly virtual object, so we have no
    298 	 * blocksize or allocated blocks.
    299 	 */
    300 	vap->va_blksize = 0;
    301 	vap->va_nblocks = 0;
    302 	vap->va_seq = 0;
    303 	vap->va_fsid = vp->v_vfsp->vfs_dev;
    304 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
    305 	    S_IROTH | S_IXOTH;
    306 	vap->va_type = VDIR;
    307 	/*
    308 	 * We live in the now (for atime).
    309 	 */
    310 	gethrestime(&now);
    311 	vap->va_atime = now;
    312 	vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
    313 }
    314 
    315 /*ARGSUSED*/
    316 static int
    317 zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
    318 {
    319 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
    320 	zfsctl_node_t	*zcp = vp->v_data;
    321 	uint64_t	object = zcp->zc_id;
    322 	zfid_short_t	*zfid;
    323 	int		i;
    324 
    325 	ZFS_ENTER(zfsvfs);
    326 
    327 	if (fidp->fid_len < SHORT_FID_LEN) {
    328 		fidp->fid_len = SHORT_FID_LEN;
    329 		ZFS_EXIT(zfsvfs);
    330 		return (ENOSPC);
    331 	}
    332 
    333 	zfid = (zfid_short_t *)fidp;
    334 
    335 	zfid->zf_len = SHORT_FID_LEN;
    336 
    337 	for (i = 0; i < sizeof (zfid->zf_object); i++)
    338 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
    339 
    340 	/* .zfs znodes always have a generation number of 0 */
    341 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
    342 		zfid->zf_gen[i] = 0;
    343 
    344 	ZFS_EXIT(zfsvfs);
    345 	return (0);
    346 }
    347 
    348 /*
    349  * .zfs inode namespace
    350  *
    351  * We need to generate unique inode numbers for all files and directories
    352  * within the .zfs pseudo-filesystem.  We use the following scheme:
    353  *
    354  * 	ENTRY			ZFSCTL_INODE
    355  * 	.zfs			1
    356  * 	.zfs/snapshot		2
    357  * 	.zfs/snapshot/<snap>	objectid(snap)
    358  */
    359 
    360 #define	ZFSCTL_INO_SNAP(id)	(id)
    361 
    362 /*
    363  * Get root directory attributes.
    364  */
    365 /* ARGSUSED */
    366 static int
    367 zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    368     caller_context_t *ct)
    369 {
    370 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
    371 
    372 	ZFS_ENTER(zfsvfs);
    373 	vap->va_nodeid = ZFSCTL_INO_ROOT;
    374 	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
    375 
    376 	zfsctl_common_getattr(vp, vap);
    377 	ZFS_EXIT(zfsvfs);
    378 
    379 	return (0);
    380 }
    381 
    382 /*
    383  * Special case the handling of "..".
    384  */
    385 /* ARGSUSED */
    386 int
    387 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    388     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    389     int *direntflags, pathname_t *realpnp)
    390 {
    391 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
    392 	int err;
    393 
    394 	/*
    395 	 * No extended attributes allowed under .zfs
    396 	 */
    397 	if (flags & LOOKUP_XATTR)
    398 		return (EINVAL);
    399 
    400 	ZFS_ENTER(zfsvfs);
    401 
    402 	if (strcmp(nm, "..") == 0) {
    403 		err = VFS_ROOT(dvp->v_vfsp, vpp);
    404 	} else {
    405 		err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
    406 		    cr, ct, direntflags, realpnp);
    407 	}
    408 
    409 	ZFS_EXIT(zfsvfs);
    410 
    411 	return (err);
    412 }
    413 
    414 static const fs_operation_def_t zfsctl_tops_root[] = {
    415 	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
    416 	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
    417 	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
    418 	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
    419 	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
    420 	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir } 	},
    421 	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
    422 	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
    423 	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
    424 	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid	}	},
    425 	{ NULL }
    426 };
    427 
    428 static int
    429 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
    430 {
    431 	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
    432 
    433 	if (snapshot_namecheck(name, NULL, NULL) != 0)
    434 		return (EILSEQ);
    435 	dmu_objset_name(os, zname);
    436 	if (strlen(zname) + 1 + strlen(name) >= len)
    437 		return (ENAMETOOLONG);
    438 	(void) strcat(zname, "@");
    439 	(void) strcat(zname, name);
    440 	return (0);
    441 }
    442 
    443 static int
    444 zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
    445 {
    446 	vnode_t *svp = sep->se_root;
    447 	int error;
    448 
    449 	ASSERT(vn_ismntpt(svp));
    450 
    451 	/* this will be dropped by dounmount() */
    452 	if ((error = vn_vfswlock(svp)) != 0)
    453 		return (error);
    454 
    455 	VN_HOLD(svp);
    456 	error = dounmount(vn_mountedvfs(svp), fflags, cr);
    457 	if (error) {
    458 		VN_RELE(svp);
    459 		return (error);
    460 	}
    461 	VFS_RELE(svp->v_vfsp);
    462 	/*
    463 	 * We can't use VN_RELE(), as that will try to invoke
    464 	 * zfsctl_snapdir_inactive(), which would cause us to destroy
    465 	 * the sd_lock mutex held by our caller.
    466 	 */
    467 	ASSERT(svp->v_count == 1);
    468 	gfs_vop_inactive(svp, cr, NULL);
    469 
    470 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
    471 	kmem_free(sep, sizeof (zfs_snapentry_t));
    472 
    473 	return (0);
    474 }
    475 
    476 static void
    477 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
    478 {
    479 	avl_index_t where;
    480 	vfs_t *vfsp;
    481 	refstr_t *pathref;
    482 	char newpath[MAXNAMELEN];
    483 	char *tail;
    484 
    485 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
    486 	ASSERT(sep != NULL);
    487 
    488 	vfsp = vn_mountedvfs(sep->se_root);
    489 	ASSERT(vfsp != NULL);
    490 
    491 	vfs_lock_wait(vfsp);
    492 
    493 	/*
    494 	 * Change the name in the AVL tree.
    495 	 */
    496 	avl_remove(&sdp->sd_snaps, sep);
    497 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
    498 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
    499 	(void) strcpy(sep->se_name, nm);
    500 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
    501 	avl_insert(&sdp->sd_snaps, sep, where);
    502 
    503 	/*
    504 	 * Change the current mountpoint info:
    505 	 * 	- update the tail of the mntpoint path
    506 	 *	- update the tail of the resource path
    507 	 */
    508 	pathref = vfs_getmntpoint(vfsp);
    509 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
    510 	VERIFY((tail = strrchr(newpath, '/')) != NULL);
    511 	*(tail+1) = '\0';
    512 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
    513 	(void) strcat(newpath, nm);
    514 	refstr_rele(pathref);
    515 	vfs_setmntpoint(vfsp, newpath);
    516 
    517 	pathref = vfs_getresource(vfsp);
    518 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
    519 	VERIFY((tail = strrchr(newpath, '@')) != NULL);
    520 	*(tail+1) = '\0';
    521 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
    522 	(void) strcat(newpath, nm);
    523 	refstr_rele(pathref);
    524 	vfs_setresource(vfsp, newpath);
    525 
    526 	vfs_unlock(vfsp);
    527 }
    528 
    529 /*ARGSUSED*/
    530 static int
    531 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    532     cred_t *cr, caller_context_t *ct, int flags)
    533 {
    534 	zfsctl_snapdir_t *sdp = sdvp->v_data;
    535 	zfs_snapentry_t search, *sep;
    536 	zfsvfs_t *zfsvfs;
    537 	avl_index_t where;
    538 	char from[MAXNAMELEN], to[MAXNAMELEN];
    539 	char real[MAXNAMELEN];
    540 	int err;
    541 
    542 	zfsvfs = sdvp->v_vfsp->vfs_data;
    543 	ZFS_ENTER(zfsvfs);
    544 
    545 	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
    546 		err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
    547 		    MAXNAMELEN, NULL);
    548 		if (err == 0) {
    549 			snm = real;
    550 		} else if (err != ENOTSUP) {
    551 			ZFS_EXIT(zfsvfs);
    552 			return (err);
    553 		}
    554 	}
    555 
    556 	ZFS_EXIT(zfsvfs);
    557 
    558 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
    559 	if (!err)
    560 		err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
    561 	if (!err)
    562 		err = zfs_secpolicy_rename_perms(from, to, cr);
    563 	if (err)
    564 		return (err);
    565 
    566 	/*
    567 	 * Cannot move snapshots out of the snapdir.
    568 	 */
    569 	if (sdvp != tdvp)
    570 		return (EINVAL);
    571 
    572 	if (strcmp(snm, tnm) == 0)
    573 		return (0);
    574 
    575 	mutex_enter(&sdp->sd_lock);
    576 
    577 	search.se_name = (char *)snm;
    578 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
    579 		mutex_exit(&sdp->sd_lock);
    580 		return (ENOENT);
    581 	}
    582 
    583 	err = dmu_objset_rename(from, to, B_FALSE);
    584 	if (err == 0)
    585 		zfsctl_rename_snap(sdp, sep, tnm);
    586 
    587 	mutex_exit(&sdp->sd_lock);
    588 
    589 	return (err);
    590 }
    591 
    592 /* ARGSUSED */
    593 static int
    594 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    595     caller_context_t *ct, int flags)
    596 {
    597 	zfsctl_snapdir_t *sdp = dvp->v_data;
    598 	zfs_snapentry_t *sep;
    599 	zfs_snapentry_t search;
    600 	zfsvfs_t *zfsvfs;
    601 	char snapname[MAXNAMELEN];
    602 	char real[MAXNAMELEN];
    603 	int err;
    604 
    605 	zfsvfs = dvp->v_vfsp->vfs_data;
    606 	ZFS_ENTER(zfsvfs);
    607 
    608 	if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
    609 
    610 		err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
    611 		    MAXNAMELEN, NULL);
    612 		if (err == 0) {
    613 			name = real;
    614 		} else if (err != ENOTSUP) {
    615 			ZFS_EXIT(zfsvfs);
    616 			return (err);
    617 		}
    618 	}
    619 
    620 	ZFS_EXIT(zfsvfs);
    621 
    622 	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
    623 	if (!err)
    624 		err = zfs_secpolicy_destroy_perms(snapname, cr);
    625 	if (err)
    626 		return (err);
    627 
    628 	mutex_enter(&sdp->sd_lock);
    629 
    630 	search.se_name = name;
    631 	sep = avl_find(&sdp->sd_snaps, &search, NULL);
    632 	if (sep) {
    633 		avl_remove(&sdp->sd_snaps, sep);
    634 		err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
    635 		if (err)
    636 			avl_add(&sdp->sd_snaps, sep);
    637 		else
    638 			err = dmu_objset_destroy(snapname);
    639 	} else {
    640 		err = ENOENT;
    641 	}
    642 
    643 	mutex_exit(&sdp->sd_lock);
    644 
    645 	return (err);
    646 }
    647 
    648 /*
    649  * This creates a snapshot under '.zfs/snapshot'.
    650  */
    651 /* ARGSUSED */
    652 static int
    653 zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
    654     cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
    655 {
    656 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
    657 	char name[MAXNAMELEN];
    658 	int err;
    659 	static enum symfollow follow = NO_FOLLOW;
    660 	static enum uio_seg seg = UIO_SYSSPACE;
    661 
    662 	if (snapshot_namecheck(dirname, NULL, NULL) != 0)
    663 		return (EILSEQ);
    664 
    665 	dmu_objset_name(zfsvfs->z_os, name);
    666 
    667 	*vpp = NULL;
    668 
    669 	err = zfs_secpolicy_snapshot_perms(name, cr);
    670 	if (err)
    671 		return (err);
    672 
    673 	if (err == 0) {
    674 		err = dmu_objset_snapshot(name, dirname, B_FALSE);
    675 		if (err)
    676 			return (err);
    677 		err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
    678 	}
    679 
    680 	return (err);
    681 }
    682 
    683 /*
    684  * Lookup entry point for the 'snapshot' directory.  Try to open the
    685  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
    686  * Perform a mount of the associated dataset on top of the vnode.
    687  */
    688 /* ARGSUSED */
    689 static int
    690 zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    691     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    692     int *direntflags, pathname_t *realpnp)
    693 {
    694 	zfsctl_snapdir_t *sdp = dvp->v_data;
    695 	objset_t *snap;
    696 	char snapname[MAXNAMELEN];
    697 	char real[MAXNAMELEN];
    698 	char *mountpoint;
    699 	zfs_snapentry_t *sep, search;
    700 	struct mounta margs;
    701 	vfs_t *vfsp;
    702 	size_t mountpoint_len;
    703 	avl_index_t where;
    704 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
    705 	int err;
    706 
    707 	/*
    708 	 * No extended attributes allowed under .zfs
    709 	 */
    710 	if (flags & LOOKUP_XATTR)
    711 		return (EINVAL);
    712 
    713 	ASSERT(dvp->v_type == VDIR);
    714 
    715 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
    716 		return (0);
    717 
    718 	/*
    719 	 * If we get a recursive call, that means we got called
    720 	 * from the domount() code while it was trying to look up the
    721 	 * spec (which looks like a local path for zfs).  We need to
    722 	 * add some flag to domount() to tell it not to do this lookup.
    723 	 */
    724 	if (MUTEX_HELD(&sdp->sd_lock))
    725 		return (ENOENT);
    726 
    727 	ZFS_ENTER(zfsvfs);
    728 
    729 	if (flags & FIGNORECASE) {
    730 		boolean_t conflict = B_FALSE;
    731 
    732 		err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
    733 		    MAXNAMELEN, &conflict);
    734 		if (err == 0) {
    735 			nm = real;
    736 		} else if (err != ENOTSUP) {
    737 			ZFS_EXIT(zfsvfs);
    738 			return (err);
    739 		}
    740 		if (realpnp)
    741 			(void) strlcpy(realpnp->pn_buf, nm,
    742 			    realpnp->pn_bufsize);
    743 		if (conflict && direntflags)
    744 			*direntflags = ED_CASE_CONFLICT;
    745 	}
    746 
    747 	mutex_enter(&sdp->sd_lock);
    748 	search.se_name = (char *)nm;
    749 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
    750 		*vpp = sep->se_root;
    751 		VN_HOLD(*vpp);
    752 		err = traverse(vpp);
    753 		if (err) {
    754 			VN_RELE(*vpp);
    755 			*vpp = NULL;
    756 		} else if (*vpp == sep->se_root) {
    757 			/*
    758 			 * The snapshot was unmounted behind our backs,
    759 			 * try to remount it.
    760 			 */
    761 			goto domount;
    762 		} else {
    763 			/*
    764 			 * VROOT was set during the traverse call.  We need
    765 			 * to clear it since we're pretending to be part
    766 			 * of our parent's vfs.
    767 			 */
    768 			(*vpp)->v_flag &= ~VROOT;
    769 		}
    770 		mutex_exit(&sdp->sd_lock);
    771 		ZFS_EXIT(zfsvfs);
    772 		return (err);
    773 	}
    774 
    775 	/*
    776 	 * The requested snapshot is not currently mounted, look it up.
    777 	 */
    778 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
    779 	if (err) {
    780 		mutex_exit(&sdp->sd_lock);
    781 		ZFS_EXIT(zfsvfs);
    782 		/*
    783 		 * handle "ls *" or "?" in a graceful manner,
    784 		 * forcing EILSEQ to ENOENT.
    785 		 * Since shell ultimately passes "*" or "?" as name to lookup
    786 		 */
    787 		return (err == EILSEQ ? ENOENT : err);
    788 	}
    789 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
    790 	    DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) {
    791 		mutex_exit(&sdp->sd_lock);
    792 		ZFS_EXIT(zfsvfs);
    793 		return (ENOENT);
    794 	}
    795 
    796 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
    797 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
    798 	(void) strcpy(sep->se_name, nm);
    799 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
    800 	avl_insert(&sdp->sd_snaps, sep, where);
    801 
    802 	dmu_objset_close(snap);
    803 domount:
    804 	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
    805 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
    806 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
    807 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
    808 	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
    809 
    810 	margs.spec = snapname;
    811 	margs.dir = mountpoint;
    812 	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
    813 	margs.fstype = "zfs";
    814 	margs.dataptr = NULL;
    815 	margs.datalen = 0;
    816 	margs.optptr = NULL;
    817 	margs.optlen = 0;
    818 
    819 	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
    820 	kmem_free(mountpoint, mountpoint_len);
    821 
    822 	if (err == 0) {
    823 		/*
    824 		 * Return the mounted root rather than the covered mount point.
    825 		 * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
    826 		 * the ZFS vnode mounted on top of the GFS node.  This ZFS
    827 		 * vnode is the root the newly created vfsp.
    828 		 */
    829 		VFS_RELE(vfsp);
    830 		err = traverse(vpp);
    831 	}
    832 
    833 	if (err == 0) {
    834 		/*
    835 		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
    836 		 *
    837 		 * This is where we lie about our v_vfsp in order to
    838 		 * make .zfs/snapshot/<snapname> accessible over NFS
    839 		 * without requiring manual mounts of <snapname>.
    840 		 */
    841 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
    842 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
    843 		(*vpp)->v_vfsp = zfsvfs->z_vfs;
    844 		(*vpp)->v_flag &= ~VROOT;
    845 	}
    846 	mutex_exit(&sdp->sd_lock);
    847 	ZFS_EXIT(zfsvfs);
    848 
    849 	/*
    850 	 * If we had an error, drop our hold on the vnode and
    851 	 * zfsctl_snapshot_inactive() will clean up.
    852 	 */
    853 	if (err) {
    854 		VN_RELE(*vpp);
    855 		*vpp = NULL;
    856 	}
    857 	return (err);
    858 }
    859 
    860 /* ARGSUSED */
    861 static int
    862 zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
    863     offset_t *offp, offset_t *nextp, void *data, int flags)
    864 {
    865 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
    866 	char snapname[MAXNAMELEN];
    867 	uint64_t id, cookie;
    868 	boolean_t case_conflict;
    869 	int error;
    870 
    871 	ZFS_ENTER(zfsvfs);
    872 
    873 	cookie = *offp;
    874 	error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
    875 	    &cookie, &case_conflict);
    876 	if (error) {
    877 		ZFS_EXIT(zfsvfs);
    878 		if (error == ENOENT) {
    879 			*eofp = 1;
    880 			return (0);
    881 		}
    882 		return (error);
    883 	}
    884 
    885 	if (flags & V_RDDIR_ENTFLAGS) {
    886 		edirent_t *eodp = dp;
    887 
    888 		(void) strcpy(eodp->ed_name, snapname);
    889 		eodp->ed_ino = ZFSCTL_INO_SNAP(id);
    890 		eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
    891 	} else {
    892 		struct dirent64 *odp = dp;
    893 
    894 		(void) strcpy(odp->d_name, snapname);
    895 		odp->d_ino = ZFSCTL_INO_SNAP(id);
    896 	}
    897 	*nextp = cookie;
    898 
    899 	ZFS_EXIT(zfsvfs);
    900 
    901 	return (0);
    902 }
    903 
    904 /*
    905  * pvp is the '.zfs' directory (zfsctl_node_t).
    906  * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
    907  *
    908  * This function is the callback to create a GFS vnode for '.zfs/snapshot'
    909  * when a lookup is performed on .zfs for "snapshot".
    910  */
    911 vnode_t *
    912 zfsctl_mknode_snapdir(vnode_t *pvp)
    913 {
    914 	vnode_t *vp;
    915 	zfsctl_snapdir_t *sdp;
    916 
    917 	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
    918 	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
    919 	    zfsctl_snapdir_readdir_cb, NULL);
    920 	sdp = vp->v_data;
    921 	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
    922 	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
    923 	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
    924 	avl_create(&sdp->sd_snaps, snapentry_compare,
    925 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
    926 	return (vp);
    927 }
    928 
    929 /* ARGSUSED */
    930 static int
    931 zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    932     caller_context_t *ct)
    933 {
    934 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
    935 	zfsctl_snapdir_t *sdp = vp->v_data;
    936 
    937 	ZFS_ENTER(zfsvfs);
    938 	zfsctl_common_getattr(vp, vap);
    939 	vap->va_nodeid = gfs_file_inode(vp);
    940 	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
    941 	ZFS_EXIT(zfsvfs);
    942 
    943 	return (0);
    944 }
    945 
    946 /* ARGSUSED */
    947 static void
    948 zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
    949 {
    950 	zfsctl_snapdir_t *sdp = vp->v_data;
    951 	void *private;
    952 
    953 	private = gfs_dir_inactive(vp);
    954 	if (private != NULL) {
    955 		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
    956 		mutex_destroy(&sdp->sd_lock);
    957 		avl_destroy(&sdp->sd_snaps);
    958 		kmem_free(private, sizeof (zfsctl_snapdir_t));
    959 	}
    960 }
    961 
    962 static const fs_operation_def_t zfsctl_tops_snapdir[] = {
    963 	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
    964 	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
    965 	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
    966 	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_snapdir_getattr } },
    967 	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
    968 	{ VOPNAME_RENAME,	{ .vop_rename = zfsctl_snapdir_rename }	},
    969 	{ VOPNAME_RMDIR,	{ .vop_rmdir = zfsctl_snapdir_remove }	},
    970 	{ VOPNAME_MKDIR,	{ .vop_mkdir = zfsctl_snapdir_mkdir }	},
    971 	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
    972 	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_snapdir_lookup }	},
    973 	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
    974 	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapdir_inactive } },
    975 	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
    976 	{ NULL }
    977 };
    978 
    979 /*
    980  * pvp is the GFS vnode '.zfs/snapshot'.
    981  *
    982  * This creates a GFS node under '.zfs/snapshot' representing each
    983  * snapshot.  This newly created GFS node is what we mount snapshot
    984  * vfs_t's ontop of.
    985  */
    986 static vnode_t *
    987 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
    988 {
    989 	vnode_t *vp;
    990 	zfsctl_node_t *zcp;
    991 
    992 	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
    993 	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
    994 	zcp = vp->v_data;
    995 	zcp->zc_id = objset;
    996 	VFS_HOLD(vp->v_vfsp);
    997 
    998 	return (vp);
    999 }
   1000 
   1001 static void
   1002 zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
   1003 {
   1004 	zfsctl_snapdir_t *sdp;
   1005 	zfs_snapentry_t *sep, *next;
   1006 	vnode_t *dvp;
   1007 
   1008 	VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
   1009 	sdp =