Home | History | Annotate | Download | only in fs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /* Portions Copyright 2007 Shivakumar GN */
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 #include <sys/types.h>
     30 #include <sys/cmn_err.h>
     31 #include <sys/debug.h>
     32 #include <sys/dirent.h>
     33 #include <sys/kmem.h>
     34 #include <sys/mman.h>
     35 #include <sys/mutex.h>
     36 #include <sys/sysmacros.h>
     37 #include <sys/systm.h>
     38 #include <sys/sunddi.h>
     39 #include <sys/uio.h>
     40 #include <sys/vmsystm.h>
     41 #include <sys/vfs.h>
     42 #include <sys/vnode.h>
     43 
     44 #include <vm/as.h>
     45 #include <vm/seg_vn.h>
     46 
     47 #include <sys/gfs.h>
     48 
     49 /*
     50  * Generic pseudo-filesystem routines.
     51  *
     52  * There are significant similarities between the implementation of certain file
     53  * system entry points across different filesystems.  While one could attempt to
     54  * "choke up on the bat" and incorporate common functionality into a VOP
     55  * preamble or postamble, such an approach is limited in the benefit it can
     56  * provide.  In this file we instead define a toolkit of routines which can be
     57  * called from a filesystem (with in-kernel pseudo-filesystems being the focus
     58  * of the exercise) in a more component-like fashion.
     59  *
     60  * There are three basic classes of routines:
     61  *
     62  * 1) Lowlevel support routines
     63  *
     64  *    These routines are designed to play a support role for existing
     65  *    pseudo-filesystems (such as procfs).  They simplify common tasks,
     66  *    without forcing the filesystem to hand over management to GFS.  The
     67  *    routines covered are:
     68  *
     69  *	gfs_readdir_init()
     70  *	gfs_readdir_emit()
     71  *	gfs_readdir_emitn()
     72  *	gfs_readdir_pred()
     73  *	gfs_readdir_fini()
     74  *	gfs_lookup_dot()
     75  *
     76  * 2) Complete GFS management
     77  *
     78  *    These routines take a more active role in management of the
     79  *    pseudo-filesystem.  They handle the relationship between vnode private
     80  *    data and VFS data, as well as the relationship between vnodes in the
     81  *    directory hierarchy.
     82  *
     83  *    In order to use these interfaces, the first member of every private
     84  *    v_data must be a gfs_file_t or a gfs_dir_t.  This hands over all control
     85  *    to GFS.
     86  *
     87  * 	gfs_file_create()
     88  * 	gfs_dir_create()
     89  * 	gfs_root_create()
     90  *
     91  *	gfs_file_inactive()
     92  *	gfs_dir_inactive()
     93  *	gfs_dir_lookup()
     94  *	gfs_dir_readdir()
     95  *
     96  * 	gfs_vop_inactive()
     97  * 	gfs_vop_lookup()
     98  * 	gfs_vop_readdir()
     99  * 	gfs_vop_map()
    100  *
    101  * 3) Single File pseudo-filesystems
    102  *
 *    This routine creates a rooted file to be overlaid on top of another
    104  *    file in the physical filespace.
    105  *
    106  *    Note that the parent is NULL (actually the vfs), but there is nothing
    107  *    technically keeping such a file from utilizing the "Complete GFS
    108  *    management" set of routines.
    109  *
    110  * 	gfs_root_create_file()
    111  */
    112 
    113 /*
    114  * gfs_make_opsvec: take an array of vnode type definitions and create
    115  * their vnodeops_t structures
    116  *
    117  * This routine takes an array of gfs_opsvec_t's.  It could
    118  * alternatively take an array of gfs_opsvec_t*'s, which would allow
    119  * vnode types to be completely defined in files external to the caller
    120  * of gfs_make_opsvec().  As it stands, much more sharing takes place --
    121  * both the caller and the vnode type provider need to access gfsv_ops
    122  * and gfsv_template, and the caller also needs to know gfsv_name.
    123  */
    124 int
    125 gfs_make_opsvec(gfs_opsvec_t *vec)
    126 {
    127 	int error, i;
    128 
    129 	for (i = 0; ; i++) {
    130 		if (vec[i].gfsv_name == NULL)
    131 			return (0);
    132 		error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template,
    133 		    vec[i].gfsv_ops);
    134 		if (error)
    135 			break;
    136 	}
    137 
    138 	cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'",
    139 	    vec[i].gfsv_name);
    140 	for (i--; i >= 0; i--) {
    141 		vn_freevnodeops(*vec[i].gfsv_ops);
    142 		*vec[i].gfsv_ops = NULL;
    143 	}
    144 	return (error);
    145 }
    146 
    147 /*
    148  * Low level directory routines
    149  *
    150  * These routines provide some simple abstractions for reading directories.
    151  * They are designed to be used by existing pseudo filesystems (namely procfs)
    152  * that already have a complicated management infrastructure.
    153  */
    154 
    155 /*
    156  * gfs_get_parent_ino: used to obtain a parent inode number and the
    157  * inode number of the given vnode in preparation for calling gfs_readdir_init.
    158  */
    159 int
    160 gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
    161     ino64_t *pino, ino64_t *ino)
    162 {
    163 	vnode_t *parent;
    164 	gfs_dir_t *dp = dvp->v_data;
    165 	int error;
    166 
    167 	*ino = dp->gfsd_file.gfs_ino;
    168 	parent = dp->gfsd_file.gfs_parent;
    169 
    170 	if (parent == NULL) {
    171 		*pino = *ino;		/* root of filesystem */
    172 	} else if (dvp->v_flag & V_XATTRDIR) {
    173 		vattr_t va;
    174 
    175 		va.va_mask = AT_NODEID;
    176 		error = VOP_GETATTR(parent, &va, 0, cr, ct);
    177 		if (error)
    178 			return (error);
    179 		*pino = va.va_nodeid;
    180 	} else {
    181 		*pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
    182 	}
    183 
    184 	return (0);
    185 }
    186 
    187 /*
    188  * gfs_readdir_init: initiate a generic readdir
    189  *   st		- a pointer to an uninitialized gfs_readdir_state_t structure
    190  *   name_max	- the directory's maximum file name length
    191  *   ureclen	- the exported file-space record length (1 for non-legacy FSs)
    192  *   uiop	- the uiop passed to readdir
    193  *   parent	- the parent directory's inode
    194  *   self	- this directory's inode
    195  *   flags	- flags from VOP_READDIR
    196  *
    197  * Returns 0 or a non-zero errno.
    198  *
    199  * Typical VOP_READDIR usage of gfs_readdir_*:
    200  *
    201  *	if ((error = gfs_readdir_init(...)) != 0)
    202  *		return (error);
    203  *	eof = 0;
 *	while ((error = gfs_readdir_pred(..., &voffset)) == 0) {
 *		if (!consumer_entry_at(voffset))
 *			voffset = consumer_next_entry(voffset);
 *		if (consumer_eof(voffset)) {
 *			eof = 1;
    209  *			break;
    210  *		}
    211  *		if ((error = gfs_readdir_emit(..., voffset,
    212  *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
    213  *			break;
    214  *	}
    215  *	return (gfs_readdir_fini(..., error, eofp, eof));
    216  *
    217  * As you can see, a zero result from gfs_readdir_pred() or
    218  * gfs_readdir_emit() indicates that processing should continue,
    219  * whereas a non-zero result indicates that the loop should terminate.
    220  * Most consumers need do nothing more than let gfs_readdir_fini()
    221  * determine what the cause of failure was and return the appropriate
    222  * value.
    223  */
    224 int
    225 gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
    226     uio_t *uiop, ino64_t parent, ino64_t self, int flags)
    227 {
    228 	size_t dirent_size;
    229 
    230 	if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
    231 	    (uiop->uio_loffset % ureclen) != 0)
    232 		return (EINVAL);
    233 
    234 	st->grd_ureclen = ureclen;
    235 	st->grd_oresid = uiop->uio_resid;
    236 	st->grd_namlen = name_max;
    237 	if (flags & V_RDDIR_ENTFLAGS)
    238 		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
    239 	else
    240 		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
    241 	st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
    242 	st->grd_parent = parent;
    243 	st->grd_self = self;
    244 	st->grd_flags = flags;
    245 
    246 	return (0);
    247 }
    248 
    249 /*
    250  * gfs_readdir_emit_int: internal routine to emit directory entry
    251  *
    252  *   st		- the current readdir state, which must have d_ino/ed_ino
    253  *		  and d_name/ed_name set
    254  *   uiop	- caller-supplied uio pointer
    255  *   next	- the offset of the next entry
    256  */
    257 static int
    258 gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next)
    259 {
    260 	int reclen;
    261 	dirent64_t *dp;
    262 	edirent_t *edp;
    263 
    264 	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
    265 		edp = st->grd_dirent;
    266 		reclen = EDIRENT_RECLEN(strlen(edp->ed_name));
    267 	} else {
    268 		dp = st->grd_dirent;
    269 		reclen = DIRENT64_RECLEN(strlen(dp->d_name));
    270 	}
    271 
    272 	if (reclen > uiop->uio_resid) {
    273 		/*
    274 		 * Error if no entries were returned yet
    275 		 */
    276 		if (uiop->uio_resid == st->grd_oresid)
    277 			return (EINVAL);
    278 		return (-1);
    279 	}
    280 
    281 	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
    282 		edp->ed_off = next;
    283 		edp->ed_reclen = (ushort_t)reclen;
    284 	} else {
    285 		dp->d_off = next;
    286 		dp->d_reclen = (ushort_t)reclen;
    287 	}
    288 
    289 	if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
    290 		return (EFAULT);
    291 
    292 	uiop->uio_loffset = next;
    293 
    294 	return (0);
    295 }
    296 
    297 /*
    298  * gfs_readdir_emit: emit a directory entry
    299  *   voff       - the virtual offset (obtained from gfs_readdir_pred)
    300  *   ino        - the entry's inode
    301  *   name       - the entry's name
    302  *   eflags	- value for ed_eflags (if processing edirent_t)
    303  *
    304  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
    305  * readdir loop should terminate.  A non-zero result (either errno or
    306  * -1) from this function is typically passed directly to
    307  * gfs_readdir_fini().
    308  */
    309 int
    310 gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    311     ino64_t ino, const char *name, int eflags)
    312 {
    313 	offset_t off = (voff + 2) * st->grd_ureclen;
    314 
    315 	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
    316 		edirent_t *edp = st->grd_dirent;
    317 
    318 		edp->ed_ino = ino;
    319 		(void) strncpy(edp->ed_name, name, st->grd_namlen);
    320 		edp->ed_eflags = eflags;
    321 	} else {
    322 		dirent64_t *dp = st->grd_dirent;
    323 
    324 		dp->d_ino = ino;
    325 		(void) strncpy(dp->d_name, name, st->grd_namlen);
    326 	}
    327 
    328 	/*
    329 	 * Inter-entry offsets are invalid, so we assume a record size of
    330 	 * grd_ureclen and explicitly set the offset appropriately.
    331 	 */
    332 	return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen));
    333 }
    334 
    335 /*
    336  * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
    337  * instead of a string for the entry's name.
    338  */
    339 int
    340 gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    341     ino64_t ino, unsigned long num)
    342 {
    343 	char buf[40];
    344 
    345 	numtos(num, buf);
    346 	return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
    347 }
    348 
    349 /*
    350  * gfs_readdir_pred: readdir loop predicate
    351  *   voffp - a pointer in which the next virtual offset should be stored
    352  *
    353  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
    354  * readdir loop should terminate.  A non-zero result (either errno or
    355  * -1) from this function is typically passed directly to
    356  * gfs_readdir_fini().
    357  */
    358 int
    359 gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp)
    360 {
    361 	offset_t off, voff;
    362 	int error;
    363 
    364 top:
    365 	if (uiop->uio_resid <= 0)
    366 		return (-1);
    367 
    368 	off = uiop->uio_loffset / st->grd_ureclen;
    369 	voff = off - 2;
    370 	if (off == 0) {
    371 		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
    372 		    ".", 0)) == 0)
    373 			goto top;
    374 	} else if (off == 1) {
    375 		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
    376 		    "..", 0)) == 0)
    377 			goto top;
    378 	} else {
    379 		*voffp = voff;
    380 		return (0);
    381 	}
    382 
    383 	return (error);
    384 }
    385 
    386 /*
    387  * gfs_readdir_fini: generic readdir cleanup
    388  *   error	- if positive, an error to return
    389  *   eofp	- the eofp passed to readdir
    390  *   eof	- the eof value
    391  *
    392  * Returns a 0 on success, a non-zero errno on failure.  This result
    393  * should be returned from readdir.
    394  */
    395 int
    396 gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
    397 {
    398 	size_t dirent_size;
    399 
    400 	if (st->grd_flags & V_RDDIR_ENTFLAGS)
    401 		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
    402 	else
    403 		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
    404 	kmem_free(st->grd_dirent, dirent_size);
    405 	if (error > 0)
    406 		return (error);
    407 	if (eofp)
    408 		*eofp = eof;
    409 	return (0);
    410 }
    411 
    412 /*
    413  * gfs_lookup_dot
    414  *
    415  * Performs a basic check for "." and ".." directory entries.
    416  */
    417 int
    418 gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
    419 {
    420 	if (*nm == '\0' || strcmp(nm, ".") == 0) {
    421 		VN_HOLD(dvp);
    422 		*vpp = dvp;
    423 		return (0);
    424 	} else if (strcmp(nm, "..") == 0) {
    425 		if (pvp == NULL) {
    426 			ASSERT(dvp->v_flag & VROOT);
    427 			VN_HOLD(dvp);
    428 			*vpp = dvp;
    429 		} else {
    430 			VN_HOLD(pvp);
    431 			*vpp = pvp;
    432 		}
    433 		return (0);
    434 	}
    435 
    436 	return (-1);
    437 }
    438 
    439 /*
    440  * gfs_file_create(): create a new GFS file
    441  *
    442  *   size	- size of private data structure (v_data)
    443  *   pvp	- parent vnode (GFS directory)
    444  *   ops	- vnode operations vector
    445  *
    446  * In order to use this interface, the parent vnode must have been created by
    447  * gfs_dir_create(), and the private data stored in v_data must have a
    448  * 'gfs_file_t' as its first field.
    449  *
    450  * Given these constraints, this routine will automatically:
    451  *
    452  * 	- Allocate v_data for the vnode
    453  * 	- Initialize necessary fields in the vnode
    454  * 	- Hold the parent
    455  */
    456 vnode_t *
    457 gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops)
    458 {
    459 	gfs_file_t *fp;
    460 	vnode_t *vp;
    461 
    462 	/*
    463 	 * Allocate vnode and internal data structure
    464 	 */
    465 	fp = kmem_zalloc(size, KM_SLEEP);
    466 	vp = vn_alloc(KM_SLEEP);
    467 
    468 	/*
    469 	 * Set up various pointers
    470 	 */
    471 	fp->gfs_vnode = vp;
    472 	fp->gfs_parent = pvp;
    473 	vp->v_data = fp;
    474 	fp->gfs_size = size;
    475 	fp->gfs_type = GFS_FILE;
    476 
    477 	/*
    478 	 * Initialize vnode and hold parent.
    479 	 */
    480 	vn_setops(vp, ops);
    481 	if (pvp) {
    482 		VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0);
    483 		VN_HOLD(pvp);
    484 	}
    485 
    486 	return (vp);
    487 }
    488 
    489 /*
    490  * gfs_dir_create: creates a new directory in the parent
    491  *
 *   struct_size - size of private data structure (v_data)
 *   pvp	- parent vnode (GFS directory)
 *   ops	- vnode operations vector
 *   entries	- NULL-terminated list of static entries (if any)
 *   inode_cb	- inode callback (see gfs_dir_readdir)
 *   maxlen	- maximum length of a directory entry
 *   readdir_cb	- readdir callback (see gfs_dir_readdir)
 *   lookup_cb	- lookup callback (see gfs_dir_lookup)
    500  *
    501  * In order to use this function, the first member of the private vnode
    502  * structure (v_data) must be a gfs_dir_t.  For each directory, there are
    503  * static entries, defined when the structure is initialized, and dynamic
    504  * entries, retrieved through callbacks.
    505  *
 * If a directory has static entries, then it must supply an inode callback,
    507  * which will compute the inode number based on the parent and the index.
    508  * For a directory with dynamic entries, the caller must supply a readdir
    509  * callback and a lookup callback.  If a static lookup fails, we fall back to
    510  * the supplied lookup callback, if any.
    511  *
    512  * This function also performs the same initialization as gfs_file_create().
    513  */
    514 vnode_t *
    515 gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops,
    516     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    517     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
    518 {
    519 	vnode_t *vp;
    520 	gfs_dir_t *dp;
    521 	gfs_dirent_t *de;
    522 
    523 	vp = gfs_file_create(struct_size, pvp, ops);
    524 	vp->v_type = VDIR;
    525 
    526 	dp = vp->v_data;
    527 	dp->gfsd_file.gfs_type = GFS_DIR;
    528 	dp->gfsd_maxlen = maxlen;
    529 
    530 	if (entries != NULL) {
    531 		for (de = entries; de->gfse_name != NULL; de++)
    532 			dp->gfsd_nstatic++;
    533 
    534 		dp->gfsd_static = kmem_alloc(
    535 		    dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
    536 		bcopy(entries, dp->gfsd_static,
    537 		    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
    538 	}
    539 
    540 	dp->gfsd_readdir = readdir_cb;
    541 	dp->gfsd_lookup = lookup_cb;
    542 	dp->gfsd_inode = inode_cb;
    543 
    544 	mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
    545 
    546 	return (vp);
    547 }
    548 
    549 /*
    550  * gfs_root_create(): create a root vnode for a GFS filesystem
    551  *
    552  * Similar to gfs_dir_create(), this creates a root vnode for a filesystem.  The
    553  * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
    554  */
    555 vnode_t *
    556 gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
    557     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    558     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
    559 {
    560 	vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb,
    561 	    maxlen, readdir_cb, lookup_cb);
    562 
    563 	/* Manually set the inode */
    564 	((gfs_file_t *)vp->v_data)->gfs_ino = ino;
    565 
    566 	VFS_HOLD(vfsp);
    567 	VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0);
    568 	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
    569 
    570 	return (vp);
    571 }
    572 
    573 /*
    574  * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
    575  *
    576  * Similar to gfs_root_create(), this creates a root vnode for a file to
    577  * be the pseudo-filesystem.
    578  */
    579 vnode_t *
    580 gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
    581 {
    582 	vnode_t	*vp = gfs_file_create(size, NULL, ops);
    583 
    584 	((gfs_file_t *)vp->v_data)->gfs_ino = ino;
    585 
    586 	VFS_HOLD(vfsp);
    587 	VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
    588 	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
    589 
    590 	return (vp);
    591 }
    592 
    593 /*
    594  * gfs_file_inactive()
    595  *
    596  * Called from the VOP_INACTIVE() routine.  If necessary, this routine will
    597  * remove the given vnode from the parent directory and clean up any references
    598  * in the VFS layer.
    599  *
    600  * If the vnode was not removed (due to a race with vget), then NULL is
    601  * returned.  Otherwise, a pointer to the private data is returned.
    602  */
    603 void *
    604 gfs_file_inactive(vnode_t *vp)
    605 {
    606 	int i;
    607 	gfs_dirent_t *ge = NULL;
    608 	gfs_file_t *fp = vp->v_data;
    609 	gfs_dir_t *dp = NULL;
    610 	void *data;
    611 
    612 	if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
    613 		goto found;
    614 
    615 	dp = fp->gfs_parent->v_data;
    616 
    617 	/*
    618 	 * First, see if this vnode is cached in the parent.
    619 	 */
    620 	gfs_dir_lock(dp);
    621 
    622 	/*
    623 	 * Find it in the set of static entries.
    624 	 */
    625 	for (i = 0; i < dp->gfsd_nstatic; i++)  {
    626 		ge = &dp->gfsd_static[i];
    627 
    628 		if (ge->gfse_vnode == vp)
    629 			goto found;
    630 	}
    631 
    632 	/*
    633 	 * If 'ge' is NULL, then it is a dynamic entry.
    634 	 */
    635 	ge = NULL;
    636 
    637 found:
    638 	if (vp->v_flag & V_XATTRDIR) {
    639 		mutex_enter(&fp->gfs_parent->v_lock);
    640 	}
    641 	mutex_enter(&vp->v_lock);
    642 	if (vp->v_count == 1) {
    643 		/*
    644 		 * Really remove this vnode
    645 		 */
    646 		data = vp->v_data;
    647 		if (ge != NULL) {
    648 			/*
    649 			 * If this was a statically cached entry, simply set the
    650 			 * cached vnode to NULL.
    651 			 */
    652 			ge->gfse_vnode = NULL;
    653 		}
    654 		if (vp->v_flag & V_XATTRDIR) {
    655 			fp->gfs_parent->v_xattrdir = NULL;
    656 			mutex_exit(&fp->gfs_parent->v_lock);
    657 		}
    658 		mutex_exit(&vp->v_lock);
    659 
    660 		/*
    661 		 * Free vnode and release parent
    662 		 */
    663 		if (fp->gfs_parent) {
    664 			if (dp) {
    665 				gfs_dir_unlock(dp);
    666 			}
    667 			VN_RELE(fp->gfs_parent);
    668 		} else {
    669 			ASSERT(vp->v_vfsp != NULL);
    670 			VFS_RELE(vp->v_vfsp);
    671 		}
    672 		vn_free(vp);
    673 	} else {
    674 		vp->v_count--;
    675 		data = NULL;
    676 		mutex_exit(&vp->v_lock);
    677 		if (vp->v_flag & V_XATTRDIR) {
    678 			mutex_exit(&fp->gfs_parent->v_lock);
    679 		}
    680 		if (dp)
    681 			gfs_dir_unlock(dp);
    682 	}
    683 
    684 	return (data);
    685 }
    686 
    687 /*
    688  * gfs_dir_inactive()
    689  *
    690  * Same as above, but for directories.
    691  */
    692 void *
    693 gfs_dir_inactive(vnode_t *vp)
    694 {
    695 	gfs_dir_t *dp;
    696 
    697 	ASSERT(vp->v_type == VDIR);
    698 
    699 	if ((dp = gfs_file_inactive(vp)) != NULL) {
    700 		mutex_destroy(&dp->gfsd_lock);
    701 		if (dp->gfsd_nstatic)
    702 			kmem_free(dp->gfsd_static,
    703 			    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
    704 	}
    705 
    706 	return (dp);
    707 }
    708 
    709 /*
    710  * gfs_dir_lookup_dynamic()
    711  *
    712  * This routine looks up the provided name amongst the dynamic entries
    713  * in the gfs directory and returns the corresponding vnode, if found.
    714  *
    715  * The gfs directory is expected to be locked by the caller prior to
    716  * calling this function.  The directory will be unlocked during the
    717  * execution of this function, but will be locked upon return from the
    718  * function.  This function returns 0 on success, non-zero on error.
    719  *
    720  * The dynamic lookups are performed by invoking the lookup
    721  * callback, which is passed to this function as the first argument.
    722  * The arguments to the callback are:
    723  *
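 * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, ino64_t *inop,
 *     cred_t *cr, int flags, int *direntflgs, pathname_t *realpnp);
 *
 *	pvp	- parent vnode
 *	nm	- name of entry
 *	vpp	- pointer to resulting vnode
 *	inop	- output parameter, inode number of the entry; on success it is
 *		stored in the resulting vnode's gfs_file_t (except for
 *		extended attribute directories)
 *	cr	- pointer to cred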
    731  *	flags	- flags value from lookup request
    732  *		ignored here; currently only used to request
    733  *		insensitive lookups
    734  *	direntflgs - output parameter, directory entry flags
    735  *		ignored here; currently only used to indicate a lookup
    736  *		has more than one possible match when case is not considered
    737  *	realpnp	- output parameter, real pathname
    738  *		ignored here; when lookup was performed case-insensitively,
    739  *		this field contains the "real" name of the file.
    740  *
    741  * 	Returns 0 on success, non-zero on error.
    742  */
    743 static int
    744 gfs_dir_lookup_dynamic(gfs_lookup_cb callback, gfs_dir_t *dp,
    745     const char *nm, vnode_t *dvp, vnode_t **vpp, cred_t *cr, int flags,
    746     int *direntflags, pathname_t *realpnp)
    747 {
    748 	gfs_file_t *fp;
    749 	ino64_t ino;
    750 	int ret;
    751 
    752 	ASSERT(GFS_DIR_LOCKED(dp));
    753 
    754 	/*
    755 	 * Drop the directory lock, as the lookup routine
    756 	 * will need to allocate memory, or otherwise deadlock on this
    757 	 * directory.
    758 	 */
    759 	gfs_dir_unlock(dp);
    760 	ret = callback(dvp, nm, vpp, &ino, cr, flags, direntflags, realpnp);
    761 	gfs_dir_lock(dp);
    762 
    763 	/*
    764 	 * The callback for extended attributes returns a vnode
    765 	 * with v_data from an underlying fs.
    766 	 */
    767 	if (ret == 0 && !IS_XATTRDIR(dvp)) {
    768 		fp = (gfs_file_t *)((*vpp)->v_data);
    769 		fp->gfs_index = -1;
    770 		fp->gfs_ino = ino;
    771 	}
    772 
    773 	return (ret);
    774 }
    775 
    776 /*
    777  * gfs_dir_lookup_static()
    778  *
    779  * This routine looks up the provided name amongst the static entries
    780  * in the gfs directory and returns the corresponding vnode, if found.
    781  * The first argument to the function is a pointer to the comparison
    782  * function this function should use to decide if names are a match.
    783  *
    784  * If a match is found, and GFS_CACHE_VNODE is set and the vnode
    785  * exists, we simply return the existing vnode.  Otherwise, we call
    786  * the static entry's callback routine, caching the result if
    787  * necessary.  If the idx pointer argument is non-NULL, we use it to
    788  * return the index of the matching static entry.
    789  *
    790  * The gfs directory is expected to be locked by the caller prior to calling
    791  * this function.  The directory may be unlocked during the execution of
    792  * this function, but will be locked upon return from the function.
    793  *
    794  * This function returns 0 if a match is found, ENOENT if not.
    795  */
    796 static int
    797 gfs_dir_lookup_static(int (*compare)(const char *, const char *),
    798     gfs_dir_t *dp, const char *nm, vnode_t *dvp, int *idx,
    799     vnode_t **vpp, pathname_t *rpnp)
    800 {
    801 	gfs_dirent_t *ge;
    802 	vnode_t *vp = NULL;
    803 	int i;
    804 
    805 	ASSERT(GFS_DIR_LOCKED(dp));
    806 
    807 	/*
    808 	 * Search static entries.
    809 	 */
    810 	for (i = 0; i < dp->gfsd_nstatic; i++) {
    811 		ge = &dp->gfsd_static[i];
    812 
    813 		if (compare(ge->gfse_name, nm) == 0) {
    814 			if (rpnp)
    815 				(void) strlcpy(rpnp->pn_buf, ge->gfse_name,
    816 				    rpnp->pn_bufsize);
    817 
    818 			if (ge->gfse_vnode) {
    819 				ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
    820 				vp = ge->gfse_vnode;
    821 				VN_HOLD(vp);
    822 				break;
    823 			}
    824 
    825 			/*
    826 			 * We drop the directory lock, as the constructor will
    827 			 * need to do KM_SLEEP allocations.  If we return from
    828 			 * the constructor only to find that a parallel
    829 			 * operation has completed, and GFS_CACHE_VNODE is set
    830 			 * for this entry, we discard the result in favor of
    831 			 * the cached vnode.
    832 			 */
    833 			gfs_dir_unlock(dp);
    834 			vp = ge->gfse_ctor(dvp);
    835 			gfs_dir_lock(dp);
    836 
    837 			((gfs_file_t *)vp->v_data)->gfs_index = i;
    838 
    839 			/* Set the inode according to the callback. */
    840 			((gfs_file_t *)vp->v_data)->gfs_ino =
    841 			    dp->gfsd_inode(dvp, i);
    842 
    843 			if (ge->gfse_flags & GFS_CACHE_VNODE) {
    844 				if (ge->gfse_vnode == NULL) {
    845 					ge->gfse_vnode = vp;
    846 				} else {
    847 					/*
    848 					 * A parallel constructor beat us to it;
    849 					 * return existing vnode.  We have to be
    850 					 * careful because we can't release the
    851 					 * current vnode while holding the
    852 					 * directory lock; its inactive routine
    853 					 * will try to lock this directory.
    854 					 */
    855 					vnode_t *oldvp = vp;
    856 					vp = ge->gfse_vnode;
    857 					VN_HOLD(vp);
    858 
    859 					gfs_dir_unlock(dp);
    860 					VN_RELE(oldvp);
    861 					gfs_dir_lock(dp);
    862 				}
    863 			}
    864 			break;
    865 		}
    866 	}
    867 
    868 	if (vp == NULL)
    869 		return (ENOENT);
    870 	else if (idx)
    871 		*idx = i;
    872 	*vpp = vp;
    873 	return (0);
    874 }
    875 
    876 /*
    877  * gfs_dir_lookup()
    878  *
    879  * Looks up the given name in the directory and returns the corresponding
    880  * vnode, if found.
    881  *
    882  * First, we search statically defined entries, if any, with a call to
    883  * gfs_dir_lookup_static().  If no static entry is found, and we have
    884  * a callback function we try a dynamic lookup via gfs_dir_lookup_dynamic().
    885  *
    886  * This function returns 0 on success, non-zero on error.
    887  */
    888 int
    889 gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr,
    890     int flags, int *direntflags, pathname_t *realpnp)
    891 {
    892 	gfs_dir_t *dp = dvp->v_data;
    893 	boolean_t casecheck;
    894 	vnode_t *dynvp = NULL;
    895 	vnode_t *vp = NULL;
    896 	int (*compare)(const char *, const char *);
    897 	int error, idx;
    898 
    899 	ASSERT(dvp->v_type == VDIR);
    900 
    901 	if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
    902 		return (0);
    903 
    904 	casecheck = (flags & FIGNORECASE) != 0 && direntflags != NULL;
    905 	if (vfs_has_feature(dvp->v_vfsp, VFSFT_NOCASESENSITIVE) ||
    906 	    (flags & FIGNORECASE))
    907 		compare = strcasecmp;
    908 	else
    909 		compare = strcmp;
    910 
    911 	gfs_dir_lock(dp);
    912 
    913 	error = gfs_dir_lookup_static(compare, dp, nm, dvp, &idx, &vp, realpnp);
    914 
    915 	if (vp && casecheck) {
    916 		gfs_dirent_t *ge;
    917 		int i;
    918 
    919 		for (i = idx + 1; i < dp->gfsd_nstatic; i++) {
    920 			ge = &dp->gfsd_static[i];
    921 
    922 			if (strcasecmp(ge->gfse_name, nm) == 0) {
    923 				*direntflags |= ED_CASE_CONFLICT;
    924 				goto out;
    925 			}
    926 		}
    927 	}
    928 
    929 	if ((error || casecheck) && dp->gfsd_lookup)
    930 		error = gfs_dir_lookup_dynamic(dp->gfsd_lookup, dp, nm, dvp,
    931 		    &dynvp, cr, flags, direntflags, vp ? NULL : realpnp);
    932 
    933 	if (vp && dynvp) {
    934 		/* static and dynamic entries are case-insensitive conflict */
    935 		ASSERT(casecheck);
    936 		*direntflags |= ED_CASE_CONFLICT;
    937 		VN_RELE(dynvp);
    938 	} else if (vp == NULL) {
    939 		vp = dynvp;
    940 	} else if (error == ENOENT) {
    941 		error = 0;
    942 	} else if (error) {
    943 		VN_RELE(vp);
    944 		vp = NULL;
    945 	}
    946 
    947 out:
    948 	gfs_dir_unlock(dp);
    949 
    950 	*vpp = vp;
    951 	return (error);
    952 }
    953 
    954 /*
    955  * gfs_dir_readdir: does a readdir() on the given directory
    956  *
    957  *    dvp	- directory vnode
    958  *    uiop	- uio structure
    959  *    eofp	- eof pointer
    960  *    data	- arbitrary data passed to readdir callback
    961  *
    962  * This routine does all the readdir() dirty work.  Even so, the caller must
    963  * supply two callbacks in order to get full compatibility.
    964  *
    965  * If the directory contains static entries, an inode callback must be
    966  * specified.  This avoids having to create every vnode and call VOP_GETATTR()
    967  * when reading the directory.  This function has the following arguments:
    968  *
    969  *	ino_t gfs_inode_cb(vnode_t *vp, int index);
    970  *
    971  * 	vp	- vnode for the directory
    972  * 	index	- index in original gfs_dirent_t array
    973  *
    974  * 	Returns the inode number for the given entry.
    975  *
    976  * For directories with dynamic entries, a readdir callback must be provided.
    977  * This is significantly more complex, thanks to the particulars of
    978  * VOP_READDIR().
    979  *
    980  *	int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
    981  *	    offset_t *off, offset_t *nextoff, void *data, int flags)
    982  *
    983  *	vp	- directory vnode
    984  *	dp	- directory entry, sized according to maxlen given to
    985  *		  gfs_dir_create().  callback must fill in d_name and
    986  *		  d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
    987  *		  (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS
    988  *		  is set in 'flags'.
    989  *	eofp	- callback must set to 1 when EOF has been reached
    990  *	off	- on entry, the last offset read from the directory.  Callback
    991  *		  must set to the offset of the current entry, typically left
    992  *		  untouched.
    993  *	nextoff	- callback must set to offset of next entry.  Typically
    994  *		  (off + 1)
    995  *	data	- caller-supplied data
    996  *	flags	- VOP_READDIR flags
    997  *
    998  *	Return 0 on success, or error on failure.
    999  */
   1000 int
   1001 gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr,
   1002     caller_context_t *ct, int flags)
   1003 {
   1004 	gfs_readdir_state_t gstate;
   1005 	int error, eof = 0;
   1006 	ino64_t ino, pino;
   1007 	offset_t off, next;
   1008 	gfs_dir_t *dp = dvp->v_data;
   1009 
   1010 	error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino);
   1011 	if (error)
   1012 		return (error);
   1013 
   1014 	if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
   1015 	    pino, ino, flags)) != 0)
   1016 		return (error);
   1017 
   1018 	while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 &&
   1019 	    !eof) {
   1020 
   1021 		if (off >= 0 && off < dp->gfsd_nstatic) {
   1022 			ino = dp->gfsd_inode(dvp, off);
   1023 
   1024 			if ((error = gfs_readdir_emit(&gstate, uiop,
   1025 			    off, ino, dp->gfsd_static[off].gfse_name, 0))
   1026 			    != 0)
   1027 				break;
   1028 
   1029 		} else if (dp->gfsd_readdir) {
   1030 			off -= dp->gfsd_nstatic;
   1031 
   1032 			if ((error = dp->gfsd_readdir(dvp,
   1033 			    gstate.grd_dirent, &eof, &off, &next,
   1034 			    data, flags)) != 0 || eof)
   1035 				break;
   1036 
   1037 			off += dp->gfsd_nstatic + 2;
   1038 			next += dp->gfsd_nstatic + 2;
   1039 
   1040 			if ((error = gfs_readdir_emit_int(&gstate, uiop,
   1041 			    next)) != 0)
   1042 				break;
   1043 		} else {
   1044 			/*
   1045 			 * Offset is beyond the end of the static entries, and
   1046 			 * we have no dynamic entries.  Set EOF.
   1047 			 */
   1048 			eof = 1;
   1049 		}
   1050 	}
   1051 
   1052 	return (gfs_readdir_fini(&gstate, error, eofp, eof));
   1053 }
   1054 
   1055 
   1056 /*
   1057  * gfs_vop_lookup: VOP_LOOKUP() entry point
   1058  *
   1059  * For use directly in vnode ops table.  Given a GFS directory, calls
   1060  * gfs_dir_lookup() as necessary.
   1061  */
   1062 /* ARGSUSED */
   1063 int
   1064 gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
   1065     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
   1066     int *direntflags, pathname_t *realpnp)
   1067 {
   1068 	return (gfs_dir_lookup(dvp, nm, vpp, cr, flags, direntflags, realpnp));
   1069 }
   1070 
   1071 /*
   1072  * gfs_vop_readdir: VOP_READDIR() entry point
   1073  *
   1074  * For use directly in vnode ops table.  Given a GFS directory, calls
   1075  * gfs_dir_readdir() as necessary.
   1076  */
   1077 /* ARGSUSED */
   1078 int
   1079 gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
   1080     caller_context_t *ct, int flags)
   1081 {
   1082 	return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags));
   1083 }
   1084 
   1085 
   1086 /*
   1087  * gfs_vop_map: VOP_MAP() entry point
   1088  *
   1089  * Convenient routine for handling pseudo-files that wish to allow mmap() calls.
   1090  * This function only works for readonly files, and uses the read function for
   1091  * the vnode to fill in the data.  The mapped data is immediately faulted in and
   1092  * filled with the necessary data during this call; there are no getpage() or
   1093  * putpage() routines.
   1094  */
   1095 /* ARGSUSED */
   1096 int
   1097 gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
   1098     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
   1099     caller_context_t *ct)
   1100 {
   1101 	int rv;
   1102 	ssize_t resid = len;
   1103 
   1104 	/*
   1105 	 * Check for bad parameters
   1106 	 */
   1107 #ifdef _ILP32
   1108 	if (len > MAXOFF_T)
   1109 		return (ENOMEM);
   1110 #endif
   1111 	if (vp->v_flag & VNOMAP)
   1112 		return (ENOTSUP);
   1113 	if (off > MAXOFF_T)
   1114 		return (EFBIG);
   1115 	if ((long)off < 0 || (long)(off + len) < 0)
   1116 		return (EINVAL);
   1117 	if (vp->v_type != VREG)
   1118 		return (ENODEV);
   1119 	if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
   1120 		return (EACCES);
   1121 
   1122 	/*
   1123 	 * Find appropriate address if needed, otherwise clear address range.
   1124 	 */
   1125 	as_rangelock(as);
   1126 	rv = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
   1127 	if (rv != 0) {
   1128 		as_rangeunlock(as);
   1129 		return (rv);
   1130 	}
   1131 
   1132 	/*
   1133 	 * Create mapping
   1134 	 */
   1135 	rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
   1136 	as_rangeunlock(as);
   1137 	if (rv != 0)
   1138 		return (rv);
   1139 
   1140 	/*
   1141 	 * Fill with data from read()
   1142 	 */
   1143 	rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
   1144 	    0, (rlim64_t)0, cred, &resid);
   1145 
   1146 	if (rv == 0 && resid != 0)
   1147 		rv = ENXIO;
   1148 
   1149 	if (rv != 0) {
   1150 		as_rangelock(as);
   1151 		(void) as_unmap(as, *addrp, len);
   1152 		as_rangeunlock(as);
   1153 	}
   1154 
   1155 	return (rv);
   1156 }
   1157 
   1158 /*
   1159  * gfs_vop_inactive: VOP_INACTIVE() entry point
   1160  *
   1161  * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or
   1162  * gfs_dir_inactive() as necessary, and kmem_free()s associated private data.
   1163  */
   1164 /* ARGSUSED */
   1165 void
   1166 gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
   1167 {
   1168 	gfs_file_t *fp = vp->v_data;
   1169 	void *data;
   1170 
   1171 	if (fp->gfs_type == GFS_DIR)
   1172 		data = gfs_dir_inactive(vp);
   1173 	else
   1174 		data = gfs_file_inactive(vp);
   1175 
   1176 	if (data != NULL)
   1177 		kmem_free(data, fp->gfs_size);
   1178 }
   1179