Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)zfs_ctldir.c	1.18	07/12/09 SMI"
     27 
     28 /*
     29  * ZFS control directory (a.k.a. ".zfs")
     30  *
     31  * This directory provides a common location for all ZFS meta-objects.
     32  * Currently, this is only the 'snapshot' directory, but this may expand in the
     33  * future.  The elements are built using the GFS primitives, as the hierarchy
     34  * does not actually exist on disk.
     35  *
     36  * For 'snapshot', we don't want to have all snapshots always mounted, because
     37  * this would take up a huge amount of space in /etc/mnttab.  We have three
     38  * types of objects:
     39  *
     40  * 	ctldir ------> snapshotdir -------> snapshot
     41  *                                             |
     42  *                                             |
     43  *                                             V
     44  *                                         mounted fs
     45  *
     46  * The 'snapshot' node contains just enough information to lookup '..' and act
     47  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
     48  * perform an automount of the underlying filesystem and return the
     49  * corresponding vnode.
     50  *
     51  * All mounts are handled automatically by the kernel, but unmounts are
     52  * (currently) handled from user land.  The main reason is that there is no
     53  * reliable way to auto-unmount the filesystem when it's "no longer in use".
     54  * When the user unmounts a filesystem, we call zfsctl_unmount(), which
     55  * unmounts any snapshots within the snapshot directory.
     56  *
     57  * The '.zfs', '.zfs/snapshot', and all directories created under
     58  * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
     59  * share the same vfs_t as the head filesystem (what '.zfs' lives under).
     60  *
 * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
     62  * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
     63  * However, vnodes within these mounted on file systems have their v_vfsp
     64  * fields set to the head filesystem to make NFS happy (see
     65  * zfsctl_snapdir_lookup()).
     66  */
     67 
     68 #include <fs/fs_subr.h>
     69 #include <sys/zfs_ctldir.h>
     70 #include <sys/zfs_ioctl.h>
     71 #include <sys/zfs_vfsops.h>
     72 #include <sys/vfs_opreg.h>
     73 #include <sys/gfs.h>
     74 #include <sys/stat.h>
     75 #include <sys/dmu.h>
     76 #include <sys/dsl_deleg.h>
     77 #include <sys/mount.h>
     78 
/*
 * One entry in a snapshot directory's AVL tree (zfsctl_snapdir_t.sd_snaps),
 * keyed by snapshot name (see snapentry_compare()).
 */
typedef struct {
	char		*se_name;	/* kmem-allocated, NUL-terminated name */
	vnode_t		*se_root;	/* GFS vnode used as the mount point */
	avl_node_t	se_node;	/* linkage into sd_snaps */
} zfs_snapentry_t;
     84 
     85 static int
     86 snapentry_compare(const void *a, const void *b)
     87 {
     88 	const zfs_snapentry_t *sa = a;
     89 	const zfs_snapentry_t *sb = b;
     90 	int ret = strcmp(sa->se_name, sb->se_name);
     91 
     92 	if (ret < 0)
     93 		return (-1);
     94 	else if (ret > 0)
     95 		return (1);
     96 	else
     97 		return (0);
     98 }
     99 
/*
 * Vnode operation vectors for the three kinds of GFS nodes in this file.
 * They are populated by zfsctl_init() and released by zfsctl_fini().
 */
vnodeops_t *zfsctl_ops_root;
vnodeops_t *zfsctl_ops_snapdir;
vnodeops_t *zfsctl_ops_snapshot;

/* Operation templates; the tables themselves are defined further below. */
static const fs_operation_def_t zfsctl_tops_root[];
static const fs_operation_def_t zfsctl_tops_snapdir[];
static const fs_operation_def_t zfsctl_tops_snapshot[];

/* Node constructors referenced before their definitions. */
static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);

/*
 * Table handed to gfs_make_opsvec() in zfsctl_init(): each row pairs a name
 * and an operation template with the vnodeops_t pointer declared above.
 * The list is terminated by a NULL entry.
 */
static gfs_opsvec_t zfsctl_opsvec[] = {
	{ ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
	{ ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
	{ ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
	{ NULL }
};
    117 
/*
 * Private data common to every .zfs GFS node.  The gfs_dir_t is the first
 * member; the rest is ZFS-specific.
 */
typedef struct zfsctl_node {
	gfs_dir_t	zc_gfs_private;
	uint64_t	zc_id;		/* id encoded into the FID */
	timestruc_t	zc_cmtime;	/* ctime and mtime, always the same */
} zfsctl_node_t;
    123 
/*
 * Private data of the '.zfs/snapshot' directory node: the common node state
 * plus the tree of snapshot entries created by lookups.
 */
typedef struct zfsctl_snapdir {
	zfsctl_node_t	sd_node;	/* common node state; first member */
	kmutex_t	sd_lock;	/* held while sd_snaps is walked/changed */
	avl_tree_t	sd_snaps;	/* zfs_snapentry_t nodes, keyed by name */
} zfsctl_snapdir_t;
    129 
/*
 * Root directory elements.  We have only a single static entry, 'snapshot'.
 * The array is terminated by a NULL entry.
 */
static gfs_dirent_t zfsctl_root_entries[] = {
	{ "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
	{ NULL }
};

/*
 * include . and .. in the calculation
 *
 * The element count of the array already includes the NULL terminator, so
 * adding one more yields: real entries + "." + "..".
 */
#define	NROOT_ENTRIES	((sizeof (zfsctl_root_entries) / \
    sizeof (gfs_dirent_t)) + 1)
    141 
    142 
/*
 * Initialize the various GFS pieces we'll need to create and manipulate .zfs
 * directories.  This is called from the ZFS init routine, and initializes the
 * vnode ops vectors that we'll be using.
 *
 * A failure to build the ops vectors is treated as fatal (VERIFY).
 */
void
zfsctl_init(void)
{
	VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
}
    153 
    154 void
    155 zfsctl_fini(void)
    156 {
    157 	/*
    158 	 * Remove vfsctl vnode ops
    159 	 */
    160 	if (zfsctl_ops_root)
    161 		vn_freevnodeops(zfsctl_ops_root);
    162 	if (zfsctl_ops_snapdir)
    163 		vn_freevnodeops(zfsctl_ops_snapdir);
    164 	if (zfsctl_ops_snapshot)
    165 		vn_freevnodeops(zfsctl_ops_snapshot);
    166 
    167 	zfsctl_ops_root = NULL;
    168 	zfsctl_ops_snapdir = NULL;
    169 	zfsctl_ops_snapshot = NULL;
    170 }
    171 
/*
 * Return the inode number associated with the 'snapshot' directory.
 *
 * Passed to gfs_root_create() as the inode callback for '.zfs'.  The root
 * has a single static entry, so 'index' is asserted to be 0.
 */
/* ARGSUSED */
static ino64_t
zfsctl_root_inode_cb(vnode_t *vp, int index)
{
	ASSERT(index == 0);
	return (ZFSCTL_INO_SNAPDIR);
}
    182 
    183 /*
    184  * Create the '.zfs' directory.  This directory is cached as part of the VFS
    185  * structure.  This results in a hold on the vfs_t.  The code in zfs_umount()
    186  * therefore checks against a vfs_count of 2 instead of 1.  This reference
    187  * is removed when the ctldir is destroyed in the unmount.
    188  */
    189 void
    190 zfsctl_create(zfsvfs_t *zfsvfs)
    191 {
    192 	vnode_t *vp, *rvp;
    193 	zfsctl_node_t *zcp;
    194 
    195 	ASSERT(zfsvfs->z_ctldir == NULL);
    196 
    197 	vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
    198 	    zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
    199 	    zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
    200 	zcp = vp->v_data;
    201 	zcp->zc_id = ZFSCTL_INO_ROOT;
    202 
    203 	VERIFY(VFS_ROOT(zfsvfs->z_vfs, &rvp) == 0);
    204 	ZFS_TIME_DECODE(&zcp->zc_cmtime, VTOZ(rvp)->z_phys->zp_crtime);
    205 	VN_RELE(rvp);
    206 
    207 	/*
    208 	 * We're only faking the fact that we have a root of a filesystem for
    209 	 * the sake of the GFS interfaces.  Undo the flag manipulation it did
    210 	 * for us.
    211 	 */
    212 	vp->v_flag &= ~(VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT);
    213 
    214 	zfsvfs->z_ctldir = vp;
    215 }
    216 
/*
 * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
 * There might still be more references if we were force unmounted, but only
 * new zfs_inactive() calls can occur and they don't reference .zfs
 *
 * Drops the hold cached in zfsvfs->z_ctldir and clears the pointer.
 */
void
zfsctl_destroy(zfsvfs_t *zfsvfs)
{
	VN_RELE(zfsvfs->z_ctldir);
	zfsvfs->z_ctldir = NULL;
}
    228 
    229 /*
    230  * Given a root znode, retrieve the associated .zfs directory.
    231  * Add a hold to the vnode and return it.
    232  */
    233 vnode_t *
    234 zfsctl_root(znode_t *zp)
    235 {
    236 	ASSERT(zfs_has_ctldir(zp));
    237 	VN_HOLD(zp->z_zfsvfs->z_ctldir);
    238 	return (zp->z_zfsvfs->z_ctldir);
    239 }
    240 
    241 /*
    242  * Common open routine.  Disallow any write access.
    243  */
    244 /* ARGSUSED */
    245 static int
    246 zfsctl_common_open(vnode_t **vpp, int flags, cred_t *cr, caller_context_t *ct)
    247 {
    248 	if (flags & FWRITE)
    249 		return (EACCES);
    250 
    251 	return (0);
    252 }
    253 
/*
 * Common close routine.  Nothing to do here.
 *
 * Always returns 0; none of the arguments are examined.
 */
/* ARGSUSED */
static int
zfsctl_common_close(vnode_t *vpp, int flags, int count, offset_t off,
    cred_t *cr, caller_context_t *ct)
{
	return (0);
}
    264 
    265 /*
    266  * Common access routine.  Disallow writes.
    267  */
    268 /* ARGSUSED */
    269 static int
    270 zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    271     caller_context_t *ct)
    272 {
    273 	if (mode & VWRITE)
    274 		return (EACCES);
    275 
    276 	return (0);
    277 }
    278 
    279 /*
    280  * Common getattr function.  Fill in basic information.
    281  */
    282 static void
    283 zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
    284 {
    285 	zfsctl_node_t	*zcp = vp->v_data;
    286 	timestruc_t	now;
    287 
    288 	vap->va_uid = 0;
    289 	vap->va_gid = 0;
    290 	vap->va_rdev = 0;
    291 	/*
    292 	 * We are a purly virtual object, so we have no
    293 	 * blocksize or allocated blocks.
    294 	 */
    295 	vap->va_blksize = 0;
    296 	vap->va_nblocks = 0;
    297 	vap->va_seq = 0;
    298 	vap->va_fsid = vp->v_vfsp->vfs_dev;
    299 	vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
    300 	    S_IROTH | S_IXOTH;
    301 	vap->va_type = VDIR;
    302 	/*
    303 	 * We live in the now (for atime).
    304 	 */
    305 	gethrestime(&now);
    306 	vap->va_atime = now;
    307 	vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
    308 }
    309 
    310 /*ARGSUSED*/
    311 static int
    312 zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
    313 {
    314 	zfsvfs_t	*zfsvfs = vp->v_vfsp->vfs_data;
    315 	zfsctl_node_t	*zcp = vp->v_data;
    316 	uint64_t	object = zcp->zc_id;
    317 	zfid_short_t	*zfid;
    318 	int		i;
    319 
    320 	ZFS_ENTER(zfsvfs);
    321 
    322 	if (fidp->fid_len < SHORT_FID_LEN) {
    323 		fidp->fid_len = SHORT_FID_LEN;
    324 		ZFS_EXIT(zfsvfs);
    325 		return (ENOSPC);
    326 	}
    327 
    328 	zfid = (zfid_short_t *)fidp;
    329 
    330 	zfid->zf_len = SHORT_FID_LEN;
    331 
    332 	for (i = 0; i < sizeof (zfid->zf_object); i++)
    333 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
    334 
    335 	/* .zfs znodes always have a generation number of 0 */
    336 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
    337 		zfid->zf_gen[i] = 0;
    338 
    339 	ZFS_EXIT(zfsvfs);
    340 	return (0);
    341 }
    342 
/*
 * .zfs inode namespace
 *
 * We need to generate unique inode numbers for all files and directories
 * within the .zfs pseudo-filesystem.  We use the following scheme:
 *
 * 	ENTRY			ZFSCTL_INODE
 * 	.zfs			1
 * 	.zfs/snapshot		2
 * 	.zfs/snapshot/<snap>	objectid(snap)
 */

/* Inode number of a snapshot entry: the snapshot's id, unchanged. */
#define	ZFSCTL_INO_SNAP(id)	(id)
    356 
    357 /*
    358  * Get root directory attributes.
    359  */
    360 /* ARGSUSED */
    361 static int
    362 zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    363     caller_context_t *ct)
    364 {
    365 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
    366 
    367 	ZFS_ENTER(zfsvfs);
    368 	vap->va_nodeid = ZFSCTL_INO_ROOT;
    369 	vap->va_nlink = vap->va_size = NROOT_ENTRIES;
    370 
    371 	zfsctl_common_getattr(vp, vap);
    372 	ZFS_EXIT(zfsvfs);
    373 
    374 	return (0);
    375 }
    376 
    377 /*
    378  * Special case the handling of "..".
    379  */
    380 /* ARGSUSED */
    381 int
    382 zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    383     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    384     int *direntflags, pathname_t *realpnp)
    385 {
    386 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
    387 	int err;
    388 
    389 	/*
    390 	 * No extended attributes allowed under .zfs
    391 	 */
    392 	if (flags & LOOKUP_XATTR)
    393 		return (EINVAL);
    394 
    395 	ZFS_ENTER(zfsvfs);
    396 
    397 	if (strcmp(nm, "..") == 0) {
    398 		err = VFS_ROOT(dvp->v_vfsp, vpp);
    399 	} else {
    400 		err = gfs_dir_lookup(dvp, nm, vpp, cr);
    401 	}
    402 
    403 	ZFS_EXIT(zfsvfs);
    404 
    405 	return (err);
    406 }
    407 
/*
 * Vnode operation template for the '.zfs' directory.  It is registered under
 * the name ".zfs" in zfsctl_opsvec; the list is NULL-terminated.
 */
static const fs_operation_def_t zfsctl_tops_root[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_root_getattr }	},
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir } 	},
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_root_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = gfs_vop_inactive }	},
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid	}	},
	{ NULL }
};
    421 
    422 static int
    423 zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
    424 {
    425 	objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
    426 
    427 	dmu_objset_name(os, zname);
    428 	if (strlen(zname) + 1 + strlen(name) >= len)
    429 		return (ENAMETOOLONG);
    430 	(void) strcat(zname, "@");
    431 	(void) strcat(zname, name);
    432 	return (0);
    433 }
    434 
/*
 * Unmount the snapshot called 'name' that is mounted under the snapshot
 * directory 'dvp', and drop its entry from the directory's AVL tree.
 *
 * The caller must hold sdp->sd_lock (asserted).  Returns ENOENT if there is
 * no entry for 'name' in the tree, the error from vn_vfswlock() or
 * dounmount() if either fails, and 0 on success.  'force' is passed through
 * to dounmount().
 */
int
zfsctl_unmount_snap(vnode_t *dvp, const char *name, int force, cred_t *cr)
{
	zfsctl_snapdir_t *sdp = dvp->v_data;
	zfs_snapentry_t search, *sep;
	avl_index_t where;
	int err;

	ASSERT(MUTEX_HELD(&sdp->sd_lock));

	/* Only se_name is consulted by the tree's comparator. */
	search.se_name = (char *)name;
	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL)
		return (ENOENT);

	ASSERT(vn_ismntpt(sep->se_root));

	/* this will be dropped by dounmount() */
	if ((err = vn_vfswlock(sep->se_root)) != 0)
		return (err);

	/*
	 * Take our own hold on the mount point across the unmount.  Note
	 * that the unmount itself is done with kcred, not the caller's cred.
	 */
	VN_HOLD(sep->se_root);
	err = dounmount(vn_mountedvfs(sep->se_root), force, kcred);
	if (err) {
		VN_RELE(sep->se_root);
		return (err);
	}
	/* Our hold is expected to be the only one left; dispose of the node. */
	ASSERT(sep->se_root->v_count == 1);
	gfs_vop_inactive(sep->se_root, cr, NULL);

	/* Forget the entry and free its name and the entry itself. */
	avl_remove(&sdp->sd_snaps, sep);
	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
	kmem_free(sep, sizeof (zfs_snapentry_t));

	return (0);
}
    470 
    471 
    472 static void
    473 zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
    474 {
    475 	avl_index_t where;
    476 	vfs_t *vfsp;
    477 	refstr_t *pathref;
    478 	char newpath[MAXNAMELEN];
    479 	char *tail;
    480 
    481 	ASSERT(MUTEX_HELD(&sdp->sd_lock));
    482 	ASSERT(sep != NULL);
    483 
    484 	vfsp = vn_mountedvfs(sep->se_root);
    485 	ASSERT(vfsp != NULL);
    486 
    487 	vfs_lock_wait(vfsp);
    488 
    489 	/*
    490 	 * Change the name in the AVL tree.
    491 	 */
    492 	avl_remove(&sdp->sd_snaps, sep);
    493 	kmem_free(sep->se_name, strlen(sep->se_name) + 1);
    494 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
    495 	(void) strcpy(sep->se_name, nm);
    496 	VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
    497 	avl_insert(&sdp->sd_snaps, sep, where);
    498 
    499 	/*
    500 	 * Change the current mountpoint info:
    501 	 * 	- update the tail of the mntpoint path
    502 	 *	- update the tail of the resource path
    503 	 */
    504 	pathref = vfs_getmntpoint(vfsp);
    505 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
    506 	VERIFY((tail = strrchr(newpath, '/')) != NULL);
    507 	*(tail+1) = '\0';
    508 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
    509 	(void) strcat(newpath, nm);
    510 	refstr_rele(pathref);
    511 	vfs_setmntpoint(vfsp, newpath);
    512 
    513 	pathref = vfs_getresource(vfsp);
    514 	(void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
    515 	VERIFY((tail = strrchr(newpath, '@')) != NULL);
    516 	*(tail+1) = '\0';
    517 	ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
    518 	(void) strcat(newpath, nm);
    519 	refstr_rele(pathref);
    520 	vfs_setresource(vfsp, newpath);
    521 
    522 	vfs_unlock(vfsp);
    523 }
    524 
    525 /*ARGSUSED*/
    526 static int
    527 zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    528     cred_t *cr, caller_context_t *ct, int flags)
    529 {
    530 	zfsctl_snapdir_t *sdp = sdvp->v_data;
    531 	zfs_snapentry_t search, *sep;
    532 	avl_index_t where;
    533 	char from[MAXNAMELEN], to[MAXNAMELEN];
    534 	int err;
    535 
    536 	err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
    537 	if (err)
    538 		return (err);
    539 
    540 	err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
    541 	if (err)
    542 		return (err);
    543 
    544 	if (err = zfs_secpolicy_rename_perms(from, to, cr))
    545 		return (err);
    546 	/*
    547 	 * Cannot move snapshots out of the snapdir.
    548 	 */
    549 	if (sdvp != tdvp)
    550 		return (EINVAL);
    551 
    552 	if (strcmp(snm, tnm) == 0)
    553 		return (0);
    554 
    555 	mutex_enter(&sdp->sd_lock);
    556 
    557 	search.se_name = (char *)snm;
    558 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
    559 		mutex_exit(&sdp->sd_lock);
    560 		return (ENOENT);
    561 	}
    562 
    563 	err = dmu_objset_rename(from, to, B_FALSE);
    564 	if (err == 0)
    565 		zfsctl_rename_snap(sdp, sep, tnm);
    566 
    567 	mutex_exit(&sdp->sd_lock);
    568 
    569 	return (err);
    570 }
    571 
    572 /* ARGSUSED */
    573 static int
    574 zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    575     caller_context_t *ct, int flags)
    576 {
    577 	zfsctl_snapdir_t *sdp = dvp->v_data;
    578 	char snapname[MAXNAMELEN];
    579 	int err;
    580 
    581 	err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
    582 	if (err)
    583 		return (err);
    584 
    585 	if (err = zfs_secpolicy_destroy_perms(snapname, cr))
    586 		return (err);
    587 
    588 	mutex_enter(&sdp->sd_lock);
    589 
    590 	err = zfsctl_unmount_snap(dvp, name, MS_FORCE, cr);
    591 	if (err) {
    592 		mutex_exit(&sdp->sd_lock);
    593 		return (err);
    594 	}
    595 
    596 	err = dmu_objset_destroy(snapname);
    597 
    598 	mutex_exit(&sdp->sd_lock);
    599 
    600 	return (err);
    601 }
    602 
    603 /*
    604  * This creates a snapshot under '.zfs/snapshot'.
    605  */
    606 /* ARGSUSED */
    607 static int
    608 zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t  **vpp,
    609     cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
    610 {
    611 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
    612 	char name[MAXNAMELEN];
    613 	int err;
    614 	static enum symfollow follow = NO_FOLLOW;
    615 	static enum uio_seg seg = UIO_SYSSPACE;
    616 
    617 	dmu_objset_name(zfsvfs->z_os, name);
    618 
    619 	*vpp = NULL;
    620 
    621 	err = zfs_secpolicy_snapshot_perms(name, cr);
    622 	if (err)
    623 		return (err);
    624 
    625 	if (err == 0) {
    626 		err = dmu_objset_snapshot(name, dirname, B_FALSE);
    627 		if (err)
    628 			return (err);
    629 		err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
    630 	}
    631 
    632 	return (err);
    633 }
    634 
    635 /*
    636  * Lookup entry point for the 'snapshot' directory.  Try to open the
    637  * snapshot if it exist, creating the pseudo filesystem vnode as necessary.
    638  * Perform a mount of the associated dataset on top of the vnode.
    639  */
    640 /* ARGSUSED */
    641 static int
    642 zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    643     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    644     int *direntflags, pathname_t *realpnp)
    645 {
    646 	zfsctl_snapdir_t *sdp = dvp->v_data;
    647 	objset_t *snap;
    648 	char snapname[MAXNAMELEN];
    649 	char *mountpoint;
    650 	zfs_snapentry_t *sep, search;
    651 	struct mounta margs;
    652 	vfs_t *vfsp;
    653 	size_t mountpoint_len;
    654 	avl_index_t where;
    655 	zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
    656 	int err;
    657 
    658 	/*
    659 	 * No extended attributes allowed under .zfs
    660 	 */
    661 	if (flags & LOOKUP_XATTR)
    662 		return (EINVAL);
    663 
    664 	ASSERT(dvp->v_type == VDIR);
    665 
    666 	if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0)
    667 		return (0);
    668 
    669 	/*
    670 	 * If we get a recursive call, that means we got called
    671 	 * from the domount() code while it was trying to look up the
    672 	 * spec (which looks like a local path for zfs).  We need to
    673 	 * add some flag to domount() to tell it not to do this lookup.
    674 	 */
    675 	if (MUTEX_HELD(&sdp->sd_lock))
    676 		return (ENOENT);
    677 
    678 	ZFS_ENTER(zfsvfs);
    679 
    680 	mutex_enter(&sdp->sd_lock);
    681 	search.se_name = (char *)nm;
    682 	if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
    683 		*vpp = sep->se_root;
    684 		VN_HOLD(*vpp);
    685 		err = traverse(vpp);
    686 		if (err) {
    687 			VN_RELE(*vpp);
    688 			*vpp = NULL;
    689 		} else if (*vpp == sep->se_root) {
    690 			/*
    691 			 * The snapshot was unmounted behind our backs,
    692 			 * try to remount it.
    693 			 */
    694 			goto domount;
    695 		}
    696 		mutex_exit(&sdp->sd_lock);
    697 		ZFS_EXIT(zfsvfs);
    698 		return (err);
    699 	}
    700 
    701 	/*
    702 	 * The requested snapshot is not currently mounted, look it up.
    703 	 */
    704 	err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
    705 	if (err) {
    706 		mutex_exit(&sdp->sd_lock);
    707 		ZFS_EXIT(zfsvfs);
    708 		return (err);
    709 	}
    710 	if (dmu_objset_open(snapname, DMU_OST_ZFS,
    711 	    DS_MODE_STANDARD | DS_MODE_READONLY, &snap) != 0) {
    712 		mutex_exit(&sdp->sd_lock);
    713 		ZFS_EXIT(zfsvfs);
    714 		return (ENOENT);
    715 	}
    716 
    717 	sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
    718 	sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
    719 	(void) strcpy(sep->se_name, nm);
    720 	*vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
    721 	avl_insert(&sdp->sd_snaps, sep, where);
    722 
    723 	dmu_objset_close(snap);
    724 domount:
    725 	mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) +
    726 	    strlen("/.zfs/snapshot/") + strlen(nm) + 1;
    727 	mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
    728 	(void) snprintf(mountpoint, mountpoint_len, "%s/.zfs/snapshot/%s",
    729 	    refstr_value(dvp->v_vfsp->vfs_mntpt), nm);
    730 
    731 	margs.spec = snapname;
    732 	margs.dir = mountpoint;
    733 	margs.flags = MS_SYSSPACE | MS_NOMNTTAB;
    734 	margs.fstype = "zfs";
    735 	margs.dataptr = NULL;
    736 	margs.datalen = 0;
    737 	margs.optptr = NULL;
    738 	margs.optlen = 0;
    739 
    740 	err = domount("zfs", &margs, *vpp, kcred, &vfsp);
    741 	kmem_free(mountpoint, mountpoint_len);
    742 
    743 	if (err == 0) {
    744 		/*
    745 		 * Return the mounted root rather than the covered mount point.
    746 		 * Takes the GFS vnode at .zfs/snapshot/<snapname> and returns
    747 		 * the ZFS vnode mounted on top of the GFS node.  This ZFS
    748 		 * vnode is the root the newly created vfsp.
    749 		 */
    750 		VFS_RELE(vfsp);
    751 		err = traverse(vpp);
    752 	}
    753 
    754 	if (err == 0) {
    755 		/*
    756 		 * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
    757 		 *
    758 		 * This is where we lie about our v_vfsp in order to
    759 		 * make .zfs/snapshot/<snapname> accessible over NFS
    760 		 * without requiring manual mounts of <snapname>.
    761 		 */
    762 		ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
    763 		VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
    764 		(*vpp)->v_vfsp = zfsvfs->z_vfs;
    765 		(*vpp)->v_flag &= ~VROOT;
    766 	}
    767 	mutex_exit(&sdp->sd_lock);
    768 	ZFS_EXIT(zfsvfs);
    769 
    770 	/*
    771 	 * If we had an error, drop our hold on the vnode and
    772 	 * zfsctl_snapshot_inactive() will clean up.
    773 	 */
    774 	if (err) {
    775 		VN_RELE(*vpp);
    776 		*vpp = NULL;
    777 	}
    778 	return (err);
    779 }
    780 
    781 /* ARGSUSED */
    782 static int
    783 zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
    784     offset_t *offp, offset_t *nextp, void *data, int flags)
    785 {
    786 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
    787 	char snapname[MAXNAMELEN];
    788 	uint64_t id, cookie;
    789 	boolean_t case_conflict;
    790 	int error;
    791 
    792 	ZFS_ENTER(zfsvfs);
    793 
    794 	cookie = *offp;
    795 	error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
    796 	    &cookie, &case_conflict);
    797 	if (error) {
    798 		ZFS_EXIT(zfsvfs);
    799 		if (error == ENOENT) {
    800 			*eofp = 1;
    801 			return (0);
    802 		}
    803 		return (error);
    804 	}
    805 
    806 	if (flags & V_RDDIR_ENTFLAGS) {
    807 		edirent_t *eodp = dp;
    808 
    809 		(void) strcpy(eodp->ed_name, snapname);
    810 		eodp->ed_ino = ZFSCTL_INO_SNAP(id);
    811 		eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
    812 	} else {
    813 		struct dirent64 *odp = dp;
    814 
    815 		(void) strcpy(odp->d_name, snapname);
    816 		odp->d_ino = ZFSCTL_INO_SNAP(id);
    817 	}
    818 	*nextp = cookie;
    819 
    820 	ZFS_EXIT(zfsvfs);
    821 
    822 	return (0);
    823 }
    824 
    825 /*
    826  * pvp is the '.zfs' directory (zfsctl_node_t).
    827  * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
    828  *
    829  * This function is the callback to create a GFS vnode for '.zfs/snapshot'
    830  * when a lookup is performed on .zfs for "snapshot".
    831  */
    832 vnode_t *
    833 zfsctl_mknode_snapdir(vnode_t *pvp)
    834 {
    835 	vnode_t *vp;
    836 	zfsctl_snapdir_t *sdp;
    837 
    838 	vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp,
    839 	    zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
    840 	    zfsctl_snapdir_readdir_cb, NULL);
    841 	sdp = vp->v_data;
    842 	sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
    843 	sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
    844 	mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
    845 	avl_create(&sdp->sd_snaps, snapentry_compare,
    846 	    sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
    847 	return (vp);
    848 }
    849 
    850 /* ARGSUSED */
    851 static int
    852 zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    853     caller_context_t *ct)
    854 {
    855 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
    856 	zfsctl_snapdir_t *sdp = vp->v_data;
    857 
    858 	ZFS_ENTER(zfsvfs);
    859 	zfsctl_common_getattr(vp, vap);
    860 	vap->va_nodeid = gfs_file_inode(vp);
    861 	vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
    862 	ZFS_EXIT(zfsvfs);
    863 
    864 	return (0);
    865 }
    866 
    867 /* ARGSUSED */
    868 static void
    869 zfsctl_snapdir_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
    870 {
    871 	zfsctl_snapdir_t *sdp = vp->v_data;
    872 	void *private;
    873 
    874 	private = gfs_dir_inactive(vp);
    875 	if (private != NULL) {
    876 		ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
    877 		mutex_destroy(&sdp->sd_lock);
    878 		avl_destroy(&sdp->sd_snaps);
    879 		kmem_free(private, sizeof (zfsctl_snapdir_t));
    880 	}
    881 }
    882 
/*
 * Vnode operation template for the '.zfs/snapshot' directory.  It is
 * registered under the name ".zfs/snapshot" in zfsctl_opsvec.  Besides the
 * read-only operations it wires up rename, rmdir and mkdir; the list is
 * NULL-terminated.
 */
static const fs_operation_def_t zfsctl_tops_snapdir[] = {
	{ VOPNAME_OPEN,		{ .vop_open = zfsctl_common_open }	},
	{ VOPNAME_CLOSE,	{ .vop_close = zfsctl_common_close }	},
	{ VOPNAME_IOCTL,	{ .error = fs_inval }			},
	{ VOPNAME_GETATTR,	{ .vop_getattr = zfsctl_snapdir_getattr } },
	{ VOPNAME_ACCESS,	{ .vop_access = zfsctl_common_access }	},
	{ VOPNAME_RENAME,	{ .vop_rename = zfsctl_snapdir_rename }	},
	{ VOPNAME_RMDIR,	{ .vop_rmdir = zfsctl_snapdir_remove }	},
	{ VOPNAME_MKDIR,	{ .vop_mkdir = zfsctl_snapdir_mkdir }	},
	{ VOPNAME_READDIR,	{ .vop_readdir = gfs_vop_readdir }	},
	{ VOPNAME_LOOKUP,	{ .vop_lookup = zfsctl_snapdir_lookup }	},
	{ VOPNAME_SEEK,		{ .vop_seek = fs_seek }			},
	{ VOPNAME_INACTIVE,	{ .vop_inactive = zfsctl_snapdir_inactive } },
	{ VOPNAME_FID,		{ .vop_fid = zfsctl_common_fid }	},
	{ NULL }
};
    899 
    900 /*
    901  * pvp is the GFS vnode '.zfs/snapshot'.
    902  *
    903  * This creates a GFS node under '.zfs/snapshot' representing each
    904  * snapshot.  This newly created GFS node is what we mount snapshot
    905  * vfs_t's ontop of.
    906  */
    907 static vnode_t *
    908 zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
    909 {
    910 	vnode_t *vp;
    911 	zfsctl_node_t *zcp;
    912 
    913 	vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp,
    914 	    zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
    915 	zcp = vp->v_data;
    916 	zcp->zc_id = objset;
    917 
    918 	return (vp);
    919 }
    920 
    921 static void
    922 zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
    923 {
    924 	zfsctl_snapdir_t *sdp;
    925 	zfs_snapentry_t *sep, *next;
    926 	vnode_t *dvp;
    927 
    928 	VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr) == 0);
    929 	sdp = dvp->v_data;
    930 
    931 	mutex_enter(&sdp->sd_lock);
    932 
    933 	if (vp->v_count > 1) {
    934 		mutex_exit(&sdp->sd_lock);
    935 		return;
    936 	}
    937 	ASSERT(!vn_ismntpt(vp));
    938 
    939 	sep = avl_first(&sdp->sd_snaps);
    940 	while (sep != NULL) {
    941 		next = AVL_NEXT(&sdp->sd_snaps, sep);
    942 
    943 		if (sep->se_root == vp) {
    944 			avl_remove(&sdp->sd_snaps, sep);
    945 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
    946 			kmem_free(sep, sizeof (zfs_snapentry_t));
    947 			break;
    948 		}
    949 		sep = next;
    950 	}
    951 	ASSERT(sep != NULL);
    952 
    953 	mutex_exit(&sdp->sd_lock);
    954 	VN_RELE(dvp);
    955 
    956 	/*
    957 	 * Dispose of the vnode for the snapshot mount point.
    958 	 * This is safe to do because once this entry has been removed
    959 	 * from the AVL tree, it can't be found again, so cannot become
    960 	 * "active".  If we lookup the same name again we will end up
    961 	 * creating a new vnode.
    962 	 */
    963 	gfs_vop_inactive(vp, cr, ct);
    964 }
    965 
    966 
    967 /*
    968  * These VP's should never see the light of day.  They should always
    969  * be covered.
    970  */
    971 static const fs_operation_def_t zfsctl_tops_snapshot[] = {
    972 	VOPNAME_INACTIVE, { .vop_inactive =  zfsctl_snapshot_inactive },
    973 	NULL, NULL
    974 };
    975 
    976 int
    977 zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
    978 {
    979 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
    980 	vnode_t *dvp, *vp;
    981 	zfsctl_snapdir_t *sdp;
    982 	zfsctl_node_t *zcp;
    983 	zfs_snapentry_t *sep;
    984 	int error;
    985 
    986 	ASSERT(zfsvfs->z_ctldir != NULL);
    987 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
    988 	    NULL, 0, NULL, kcred, NULL, NULL, NULL);
    989 	if (error != 0)
    990 		return (error);
    991 	sdp = dvp->v_data;
    992 
    993 	mutex_enter(&sdp->sd_lock);
    994 	sep = avl_first(&sdp->sd_snaps);
    995 	while (sep != NULL) {
    996 		vp = sep->se_root;
    997 		zcp = vp->v_data;
    998 		if (zcp->zc_id == objsetid)
    999 			break;
   1000 
   1001 		sep = AVL_NEXT(&sdp->sd_snaps, sep);
   1002 	}
   1003 
   1004 	if (sep != NULL) {
   1005 		VN_HOLD(vp);
   1006 		/*
   1007 		 * Return the mounted root rather than the covered mount point.
   1008 		 * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
   1009 		 * and returns the ZFS vnode mounted on top of the GFS node.
   1010 		 * This ZFS vnode is the root of the vfs for objset 'objsetid'.
   1011 		 */
   1012 		error = traverse(&vp);
   1013 		if (error == 0) {
   1014 			if (vp == sep->se_root)
   1015 				error = EINVAL;
   1016 			else
   1017 				*zfsvfsp = VTOZ(vp)->z_zfsvfs;
   1018 		}
   1019 		mutex_exit(&sdp->sd_lock);
   1020 		VN_RELE(vp);
   1021 	} else {
   1022 		error = EINVAL;
   1023 		mutex_exit(&sdp->sd_lock);
   1024 	}
   1025 
   1026 	VN_RELE(dvp);
   1027 
   1028 	return (error);
   1029 }
   1030 
   1031 /*
   1032  * Unmount any snapshots for the given filesystem.  This is called from
   1033  * zfs_umount() - if we have a ctldir, then go through and unmount all the
   1034  * snapshots.
   1035  */
   1036 int
   1037 zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
   1038 {
   1039 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
   1040 	vnode_t *dvp, *svp;
   1041 	zfsctl_snapdir_t *sdp;
   1042 	zfs_snapentry_t *sep, *next;
   1043 	int error;
   1044 
   1045 	ASSERT(zfsvfs->z_ctldir != NULL);
   1046 	error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
   1047 	    NULL, 0, NULL, cr, NULL, NULL, NULL);
   1048 	if (error != 0)
   1049 		return (error);
   1050 	sdp = dvp->v_data;
   1051 
   1052 	mutex_enter(&sdp->sd_lock);
   1053 
   1054 	sep = avl_first(&sdp->sd_snaps);
   1055 	while (sep != NULL) {
   1056 		svp = sep->se_root;
   1057 		next = AVL_NEXT(&sdp->sd_snaps, sep);
   1058 
   1059 		/*
   1060 		 * If this snapshot is not mounted, then it must
   1061 		 * have just been unmounted by somebody else, and
   1062 		 * will be cleaned up by zfsctl_snapdir_inactive().
   1063 		 */
   1064 		if (vn_ismntpt(svp)) {
   1065 			if ((error = vn_vfswlock(svp)) != 0)
   1066 				goto out;
   1067 
   1068 			VN_HOLD(svp);
   1069 			error = dounmount(vn_mountedvfs(svp), fflags, cr);
   1070 			if (error) {
   1071 				VN_RELE(svp);
   1072 				goto out;
   1073 			}
   1074 
   1075 			avl_remove(&sdp->sd_snaps, sep);
   1076 			kmem_free(sep->se_name, strlen(sep->se_name) + 1);
   1077 			kmem_free(sep, sizeof (zfs_snapentry_t));
   1078 
   1079 			/*
   1080 			 * We can't use VN_RELE(), as that will try to
   1081 			 * invoke zfsctl_snapdir_inactive(), and that
   1082 			 * would lead to an attempt to re-grab the sd_lock.
   1083 			 */
   1084 			ASSERT3U(svp->v_count, ==, 1);
   1085 			gfs_vop_inactive(svp, cr, NULL);
   1086 		}
   1087 		sep = next;
   1088 	}
   1089 out:
   1090 	mutex_exit(&sdp->sd_lock);
   1091 	VN_RELE(dvp);
   1092 
   1093 	return (error);
   1094 }
   1095