Home | History | Annotate | Download | only in fs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /* Portions Copyright 2007 Shivakumar GN */
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)gfs.c	1.13	07/12/09 SMI"
     28 
     29 #include <sys/types.h>
     30 #include <sys/cmn_err.h>
     31 #include <sys/debug.h>
     32 #include <sys/dirent.h>
     33 #include <sys/kmem.h>
     34 #include <sys/mman.h>
     35 #include <sys/mutex.h>
     36 #include <sys/sysmacros.h>
     37 #include <sys/systm.h>
     38 #include <sys/uio.h>
     39 #include <sys/vmsystm.h>
     40 #include <sys/vfs.h>
     41 #include <sys/vnode.h>
     42 
     43 #include <vm/as.h>
     44 #include <vm/seg_vn.h>
     45 
     46 #include <sys/gfs.h>
     47 
     48 /*
     49  * Generic pseudo-filesystem routines.
     50  *
     51  * There are significant similarities between the implementation of certain file
     52  * system entry points across different filesystems.  While one could attempt to
     53  * "choke up on the bat" and incorporate common functionality into a VOP
     54  * preamble or postamble, such an approach is limited in the benefit it can
     55  * provide.  In this file we instead define a toolkit of routines which can be
     56  * called from a filesystem (with in-kernel pseudo-filesystems being the focus
     57  * of the exercise) in a more component-like fashion.
     58  *
     59  * There are three basic classes of routines:
     60  *
     61  * 1) Lowlevel support routines
     62  *
     63  *    These routines are designed to play a support role for existing
     64  *    pseudo-filesystems (such as procfs).  They simplify common tasks,
     65  *    without forcing the filesystem to hand over management to GFS.  The
     66  *    routines covered are:
     67  *
     68  *	gfs_readdir_init()
     69  *	gfs_readdir_emit()
     70  *	gfs_readdir_emitn()
     71  *	gfs_readdir_pred()
     72  *	gfs_readdir_fini()
     73  *	gfs_lookup_dot()
     74  *
     75  * 2) Complete GFS management
     76  *
     77  *    These routines take a more active role in management of the
     78  *    pseudo-filesystem.  They handle the relationship between vnode private
     79  *    data and VFS data, as well as the relationship between vnodes in the
     80  *    directory hierarchy.
     81  *
     82  *    In order to use these interfaces, the first member of every private
     83  *    v_data must be a gfs_file_t or a gfs_dir_t.  This hands over all control
     84  *    to GFS.
     85  *
     86  * 	gfs_file_create()
     87  * 	gfs_dir_create()
     88  * 	gfs_root_create()
     89  *
     90  *	gfs_file_inactive()
     91  *	gfs_dir_inactive()
     92  *	gfs_dir_lookup()
     93  *	gfs_dir_readdir()
     94  *
     95  * 	gfs_vop_inactive()
     96  * 	gfs_vop_lookup()
     97  * 	gfs_vop_readdir()
     98  * 	gfs_vop_map()
     99  *
    100  * 3) Single File pseudo-filesystems
    101  *
 *    This routine creates a rooted file to be overlaid on top of another
 *    file in the physical filespace.
    104  *
    105  *    Note that the parent is NULL (actually the vfs), but there is nothing
    106  *    technically keeping such a file from utilizing the "Complete GFS
    107  *    management" set of routines.
    108  *
    109  * 	gfs_root_create_file()
    110  */
    111 
    112 /*
    113  * gfs_make_opsvec: take an array of vnode type definitions and create
    114  * their vnodeops_t structures
    115  *
    116  * This routine takes an array of gfs_opsvec_t's.  It could
    117  * alternatively take an array of gfs_opsvec_t*'s, which would allow
    118  * vnode types to be completely defined in files external to the caller
    119  * of gfs_make_opsvec().  As it stands, much more sharing takes place --
    120  * both the caller and the vnode type provider need to access gfsv_ops
    121  * and gfsv_template, and the caller also needs to know gfsv_name.
    122  */
    123 int
    124 gfs_make_opsvec(gfs_opsvec_t *vec)
    125 {
    126 	int error, i;
    127 
    128 	for (i = 0; ; i++) {
    129 		if (vec[i].gfsv_name == NULL)
    130 			return (0);
    131 		error = vn_make_ops(vec[i].gfsv_name, vec[i].gfsv_template,
    132 		    vec[i].gfsv_ops);
    133 		if (error)
    134 			break;
    135 	}
    136 
    137 	cmn_err(CE_WARN, "gfs_make_opsvec: bad vnode ops template for '%s'",
    138 	    vec[i].gfsv_name);
    139 	for (i--; i >= 0; i--) {
    140 		vn_freevnodeops(*vec[i].gfsv_ops);
    141 		*vec[i].gfsv_ops = NULL;
    142 	}
    143 	return (error);
    144 }
    145 
    146 /*
    147  * Low level directory routines
    148  *
    149  * These routines provide some simple abstractions for reading directories.
    150  * They are designed to be used by existing pseudo filesystems (namely procfs)
    151  * that already have a complicated management infrastructure.
    152  */
    153 
    154 /*
    155  * gfs_get_parent_ino: used to obtain a parent inode number and the
    156  * inode number of the given vnode in preparation for calling gfs_readdir_init.
    157  */
    158 int
    159 gfs_get_parent_ino(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
    160     ino64_t *pino, ino64_t *ino)
    161 {
    162 	vnode_t *parent;
    163 	gfs_dir_t *dp = dvp->v_data;
    164 	int error;
    165 
    166 	*ino = dp->gfsd_file.gfs_ino;
    167 	parent = dp->gfsd_file.gfs_parent;
    168 
    169 	if (parent == NULL) {
    170 		*pino = *ino;		/* root of filesystem */
    171 	} else if (dvp->v_flag & V_XATTRDIR) {
    172 		vattr_t va;
    173 
    174 		va.va_mask = AT_NODEID;
    175 		error = VOP_GETATTR(parent, &va, 0, cr, ct);
    176 		if (error)
    177 			return (error);
    178 		*pino = va.va_nodeid;
    179 	} else {
    180 		*pino = ((gfs_file_t *)(parent->v_data))->gfs_ino;
    181 	}
    182 
    183 	return (0);
    184 }
    185 
    186 /*
    187  * gfs_readdir_init: initiate a generic readdir
    188  *   st		- a pointer to an uninitialized gfs_readdir_state_t structure
    189  *   name_max	- the directory's maximum file name length
    190  *   ureclen	- the exported file-space record length (1 for non-legacy FSs)
    191  *   uiop	- the uiop passed to readdir
    192  *   parent	- the parent directory's inode
    193  *   self	- this directory's inode
    194  *   flags	- flags from VOP_READDIR
    195  *
    196  * Returns 0 or a non-zero errno.
    197  *
    198  * Typical VOP_READDIR usage of gfs_readdir_*:
    199  *
    200  *	if ((error = gfs_readdir_init(...)) != 0)
    201  *		return (error);
    202  *	eof = 0;
    203  *	while ((error = gfs_readdir_pred(..., &voffset)) != 0) {
    204  *		if (!consumer_entry_at(voffset))
    205  *			voffset = consumer_next_entry(voffset);
    206  *		if (consumer_eof(voffset)) {
    207  *			eof = 1
    208  *			break;
    209  *		}
    210  *		if ((error = gfs_readdir_emit(..., voffset,
    211  *		    consumer_ino(voffset), consumer_name(voffset))) != 0)
    212  *			break;
    213  *	}
    214  *	return (gfs_readdir_fini(..., error, eofp, eof));
    215  *
    216  * As you can see, a zero result from gfs_readdir_pred() or
    217  * gfs_readdir_emit() indicates that processing should continue,
    218  * whereas a non-zero result indicates that the loop should terminate.
    219  * Most consumers need do nothing more than let gfs_readdir_fini()
    220  * determine what the cause of failure was and return the appropriate
    221  * value.
    222  */
    223 int
    224 gfs_readdir_init(gfs_readdir_state_t *st, int name_max, int ureclen,
    225     uio_t *uiop, ino64_t parent, ino64_t self, int flags)
    226 {
    227 	size_t dirent_size;
    228 
    229 	if (uiop->uio_loffset < 0 || uiop->uio_resid <= 0 ||
    230 	    (uiop->uio_loffset % ureclen) != 0)
    231 		return (EINVAL);
    232 
    233 	st->grd_ureclen = ureclen;
    234 	st->grd_oresid = uiop->uio_resid;
    235 	st->grd_namlen = name_max;
    236 	if (flags & V_RDDIR_ENTFLAGS)
    237 		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
    238 	else
    239 		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
    240 	st->grd_dirent = kmem_zalloc(dirent_size, KM_SLEEP);
    241 	st->grd_parent = parent;
    242 	st->grd_self = self;
    243 	st->grd_flags = flags;
    244 
    245 	return (0);
    246 }
    247 
    248 /*
    249  * gfs_readdir_emit_int: internal routine to emit directory entry
    250  *
    251  *   st		- the current readdir state, which must have d_ino/ed_ino
    252  *		  and d_name/ed_name set
    253  *   uiop	- caller-supplied uio pointer
    254  *   next	- the offset of the next entry
    255  */
    256 static int
    257 gfs_readdir_emit_int(gfs_readdir_state_t *st, uio_t *uiop, offset_t next)
    258 {
    259 	int reclen;
    260 	dirent64_t *dp;
    261 	edirent_t *edp;
    262 
    263 	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
    264 		edp = st->grd_dirent;
    265 		reclen = EDIRENT_RECLEN(strlen(edp->ed_name));
    266 	} else {
    267 		dp = st->grd_dirent;
    268 		reclen = DIRENT64_RECLEN(strlen(dp->d_name));
    269 	}
    270 
    271 	if (reclen > uiop->uio_resid) {
    272 		/*
    273 		 * Error if no entries were returned yet
    274 		 */
    275 		if (uiop->uio_resid == st->grd_oresid)
    276 			return (EINVAL);
    277 		return (-1);
    278 	}
    279 
    280 	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
    281 		edp->ed_off = next;
    282 		edp->ed_reclen = (ushort_t)reclen;
    283 	} else {
    284 		dp->d_off = next;
    285 		dp->d_reclen = (ushort_t)reclen;
    286 	}
    287 
    288 	if (uiomove((caddr_t)st->grd_dirent, reclen, UIO_READ, uiop))
    289 		return (EFAULT);
    290 
    291 	uiop->uio_loffset = next;
    292 
    293 	return (0);
    294 }
    295 
    296 /*
    297  * gfs_readdir_emit: emit a directory entry
    298  *   voff       - the virtual offset (obtained from gfs_readdir_pred)
    299  *   ino        - the entry's inode
    300  *   name       - the entry's name
    301  *   eflags	- value for ed_eflags (if processing edirent_t)
    302  *
    303  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
    304  * readdir loop should terminate.  A non-zero result (either errno or
    305  * -1) from this function is typically passed directly to
    306  * gfs_readdir_fini().
    307  */
    308 int
    309 gfs_readdir_emit(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    310     ino64_t ino, const char *name, int eflags)
    311 {
    312 	offset_t off = (voff + 2) * st->grd_ureclen;
    313 
    314 	if (st->grd_flags & V_RDDIR_ENTFLAGS) {
    315 		edirent_t *edp = st->grd_dirent;
    316 
    317 		edp->ed_ino = ino;
    318 		(void) strncpy(edp->ed_name, name, st->grd_namlen);
    319 		edp->ed_eflags = eflags;
    320 	} else {
    321 		dirent64_t *dp = st->grd_dirent;
    322 
    323 		dp->d_ino = ino;
    324 		(void) strncpy(dp->d_name, name, st->grd_namlen);
    325 	}
    326 
    327 	/*
    328 	 * Inter-entry offsets are invalid, so we assume a record size of
    329 	 * grd_ureclen and explicitly set the offset appropriately.
    330 	 */
    331 	return (gfs_readdir_emit_int(st, uiop, off + st->grd_ureclen));
    332 }
    333 
    334 /*
    335  * gfs_readdir_emitn: like gfs_readdir_emit(), but takes an integer
    336  * instead of a string for the entry's name.
    337  */
    338 int
    339 gfs_readdir_emitn(gfs_readdir_state_t *st, uio_t *uiop, offset_t voff,
    340     ino64_t ino, unsigned long num)
    341 {
    342 	char buf[40];
    343 
    344 	numtos(num, buf);
    345 	return (gfs_readdir_emit(st, uiop, voff, ino, buf, 0));
    346 }
    347 
    348 /*
    349  * gfs_readdir_pred: readdir loop predicate
    350  *   voffp - a pointer in which the next virtual offset should be stored
    351  *
    352  * Returns a 0 on success, a non-zero errno on failure, or -1 if the
    353  * readdir loop should terminate.  A non-zero result (either errno or
    354  * -1) from this function is typically passed directly to
    355  * gfs_readdir_fini().
    356  */
    357 int
    358 gfs_readdir_pred(gfs_readdir_state_t *st, uio_t *uiop, offset_t *voffp)
    359 {
    360 	offset_t off, voff;
    361 	int error;
    362 
    363 top:
    364 	if (uiop->uio_resid <= 0)
    365 		return (-1);
    366 
    367 	off = uiop->uio_loffset / st->grd_ureclen;
    368 	voff = off - 2;
    369 	if (off == 0) {
    370 		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_self,
    371 		    ".", 0)) == 0)
    372 			goto top;
    373 	} else if (off == 1) {
    374 		if ((error = gfs_readdir_emit(st, uiop, voff, st->grd_parent,
    375 		    "..", 0)) == 0)
    376 			goto top;
    377 	} else {
    378 		*voffp = voff;
    379 		return (0);
    380 	}
    381 
    382 	return (error);
    383 }
    384 
    385 /*
    386  * gfs_readdir_fini: generic readdir cleanup
    387  *   error	- if positive, an error to return
    388  *   eofp	- the eofp passed to readdir
    389  *   eof	- the eof value
    390  *
    391  * Returns a 0 on success, a non-zero errno on failure.  This result
    392  * should be returned from readdir.
    393  */
    394 int
    395 gfs_readdir_fini(gfs_readdir_state_t *st, int error, int *eofp, int eof)
    396 {
    397 	size_t dirent_size;
    398 
    399 	if (st->grd_flags & V_RDDIR_ENTFLAGS)
    400 		dirent_size = EDIRENT_RECLEN(st->grd_namlen);
    401 	else
    402 		dirent_size = DIRENT64_RECLEN(st->grd_namlen);
    403 	kmem_free(st->grd_dirent, dirent_size);
    404 	if (error > 0)
    405 		return (error);
    406 	if (eofp)
    407 		*eofp = eof;
    408 	return (0);
    409 }
    410 
    411 /*
    412  * gfs_lookup_dot
    413  *
    414  * Performs a basic check for "." and ".." directory entries.
    415  */
    416 int
    417 gfs_lookup_dot(vnode_t **vpp, vnode_t *dvp, vnode_t *pvp, const char *nm)
    418 {
    419 	if (*nm == '\0' || strcmp(nm, ".") == 0) {
    420 		VN_HOLD(dvp);
    421 		*vpp = dvp;
    422 		return (0);
    423 	} else if (strcmp(nm, "..") == 0) {
    424 		if (pvp == NULL) {
    425 			ASSERT(dvp->v_flag & VROOT);
    426 			VN_HOLD(dvp);
    427 			*vpp = dvp;
    428 		} else {
    429 			VN_HOLD(pvp);
    430 			*vpp = pvp;
    431 		}
    432 		return (0);
    433 	}
    434 
    435 	return (-1);
    436 }
    437 
    438 /*
    439  * gfs_file_create(): create a new GFS file
    440  *
    441  *   size	- size of private data structure (v_data)
    442  *   pvp	- parent vnode (GFS directory)
    443  *   ops	- vnode operations vector
    444  *
    445  * In order to use this interface, the parent vnode must have been created by
    446  * gfs_dir_create(), and the private data stored in v_data must have a
    447  * 'gfs_file_t' as its first field.
    448  *
    449  * Given these constraints, this routine will automatically:
    450  *
    451  * 	- Allocate v_data for the vnode
    452  * 	- Initialize necessary fields in the vnode
    453  * 	- Hold the parent
    454  */
    455 vnode_t *
    456 gfs_file_create(size_t size, vnode_t *pvp, vnodeops_t *ops)
    457 {
    458 	gfs_file_t *fp;
    459 	vnode_t *vp;
    460 
    461 	/*
    462 	 * Allocate vnode and internal data structure
    463 	 */
    464 	fp = kmem_zalloc(size, KM_SLEEP);
    465 	vp = vn_alloc(KM_SLEEP);
    466 
    467 	/*
    468 	 * Set up various pointers
    469 	 */
    470 	fp->gfs_vnode = vp;
    471 	fp->gfs_parent = pvp;
    472 	vp->v_data = fp;
    473 	fp->gfs_size = size;
    474 	fp->gfs_type = GFS_FILE;
    475 
    476 	/*
    477 	 * Initialize vnode and hold parent.
    478 	 */
    479 	vn_setops(vp, ops);
    480 	if (pvp) {
    481 		VN_SET_VFS_TYPE_DEV(vp, pvp->v_vfsp, VREG, 0);
    482 		VN_HOLD(pvp);
    483 	}
    484 
    485 	return (vp);
    486 }
    487 
    488 /*
    489  * gfs_dir_create: creates a new directory in the parent
    490  *
    491  *   size	- size of private data structure (v_data)
    492  *   pvp	- parent vnode (GFS directory)
    493  *   ops	- vnode operations vector
    494  *   entries	- NULL-terminated list of static entries (if any)
    495  *   maxlen	- maximum length of a directory entry
    496  *   readdir_cb	- readdir callback (see gfs_dir_readdir)
    497  *   inode_cb	- inode callback (see gfs_dir_readdir)
    498  *   lookup_cb	- lookup callback (see gfs_dir_lookup)
    499  *
    500  * In order to use this function, the first member of the private vnode
    501  * structure (v_data) must be a gfs_dir_t.  For each directory, there are
    502  * static entries, defined when the structure is initialized, and dynamic
    503  * entries, retrieved through callbacks.
    504  *
    505  * If a directory has static entries, then it must supply a inode callback,
    506  * which will compute the inode number based on the parent and the index.
    507  * For a directory with dynamic entries, the caller must supply a readdir
    508  * callback and a lookup callback.  If a static lookup fails, we fall back to
    509  * the supplied lookup callback, if any.
    510  *
    511  * This function also performs the same initialization as gfs_file_create().
    512  */
    513 vnode_t *
    514 gfs_dir_create(size_t struct_size, vnode_t *pvp, vnodeops_t *ops,
    515     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    516     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
    517 {
    518 	vnode_t *vp;
    519 	gfs_dir_t *dp;
    520 	gfs_dirent_t *de;
    521 
    522 	vp = gfs_file_create(struct_size, pvp, ops);
    523 	vp->v_type = VDIR;
    524 
    525 	dp = vp->v_data;
    526 	dp->gfsd_file.gfs_type = GFS_DIR;
    527 	dp->gfsd_maxlen = maxlen;
    528 
    529 	if (entries != NULL) {
    530 		for (de = entries; de->gfse_name != NULL; de++)
    531 			dp->gfsd_nstatic++;
    532 
    533 		dp->gfsd_static = kmem_alloc(
    534 		    dp->gfsd_nstatic * sizeof (gfs_dirent_t), KM_SLEEP);
    535 		bcopy(entries, dp->gfsd_static,
    536 		    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
    537 	}
    538 
    539 	dp->gfsd_readdir = readdir_cb;
    540 	dp->gfsd_lookup = lookup_cb;
    541 	dp->gfsd_inode = inode_cb;
    542 
    543 	mutex_init(&dp->gfsd_lock, NULL, MUTEX_DEFAULT, NULL);
    544 
    545 	return (vp);
    546 }
    547 
    548 /*
    549  * gfs_root_create(): create a root vnode for a GFS filesystem
    550  *
    551  * Similar to gfs_dir_create(), this creates a root vnode for a filesystem.  The
    552  * only difference is that it takes a vfs_t instead of a vnode_t as its parent.
    553  */
    554 vnode_t *
    555 gfs_root_create(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino,
    556     gfs_dirent_t *entries, gfs_inode_cb inode_cb, int maxlen,
    557     gfs_readdir_cb readdir_cb, gfs_lookup_cb lookup_cb)
    558 {
    559 	vnode_t *vp = gfs_dir_create(size, NULL, ops, entries, inode_cb,
    560 	    maxlen, readdir_cb, lookup_cb);
    561 
    562 	/* Manually set the inode */
    563 	((gfs_file_t *)vp->v_data)->gfs_ino = ino;
    564 
    565 	VFS_HOLD(vfsp);
    566 	VN_SET_VFS_TYPE_DEV(vp, vfsp, VDIR, 0);
    567 	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
    568 
    569 	return (vp);
    570 }
    571 
    572 /*
    573  * gfs_root_create_file(): create a root vnode for a GFS file as a filesystem
    574  *
    575  * Similar to gfs_root_create(), this creates a root vnode for a file to
    576  * be the pseudo-filesystem.
    577  */
    578 vnode_t *
    579 gfs_root_create_file(size_t size, vfs_t *vfsp, vnodeops_t *ops, ino64_t ino)
    580 {
    581 	vnode_t	*vp = gfs_file_create(size, NULL, ops);
    582 
    583 	((gfs_file_t *)vp->v_data)->gfs_ino = ino;
    584 
    585 	VFS_HOLD(vfsp);
    586 	VN_SET_VFS_TYPE_DEV(vp, vfsp, VREG, 0);
    587 	vp->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
    588 
    589 	return (vp);
    590 }
    591 
    592 /*
    593  * gfs_file_inactive()
    594  *
    595  * Called from the VOP_INACTIVE() routine.  If necessary, this routine will
    596  * remove the given vnode from the parent directory and clean up any references
    597  * in the VFS layer.
    598  *
    599  * If the vnode was not removed (due to a race with vget), then NULL is
    600  * returned.  Otherwise, a pointer to the private data is returned.
    601  */
    602 void *
    603 gfs_file_inactive(vnode_t *vp)
    604 {
    605 	int i;
    606 	gfs_dirent_t *ge = NULL;
    607 	gfs_file_t *fp = vp->v_data;
    608 	gfs_dir_t *dp = NULL;
    609 	void *data;
    610 
    611 	if (fp->gfs_parent == NULL || (vp->v_flag & V_XATTRDIR))
    612 		goto found;
    613 
    614 	dp = fp->gfs_parent->v_data;
    615 
    616 	/*
    617 	 * First, see if this vnode is cached in the parent.
    618 	 */
    619 	gfs_dir_lock(dp);
    620 
    621 	/*
    622 	 * Find it in the set of static entries.
    623 	 */
    624 	for (i = 0; i < dp->gfsd_nstatic; i++)  {
    625 		ge = &dp->gfsd_static[i];
    626 
    627 		if (ge->gfse_vnode == vp)
    628 			goto found;
    629 	}
    630 
    631 	/*
    632 	 * If 'ge' is NULL, then it is a dynamic entry.
    633 	 */
    634 	ge = NULL;
    635 
    636 found:
    637 	if (vp->v_flag & V_XATTRDIR) {
    638 		mutex_enter(&fp->gfs_parent->v_lock);
    639 	}
    640 	mutex_enter(&vp->v_lock);
    641 	if (vp->v_count == 1) {
    642 		/*
    643 		 * Really remove this vnode
    644 		 */
    645 		data = vp->v_data;
    646 		if (ge != NULL) {
    647 			/*
    648 			 * If this was a statically cached entry, simply set the
    649 			 * cached vnode to NULL.
    650 			 */
    651 			ge->gfse_vnode = NULL;
    652 		}
    653 		if (vp->v_flag & V_XATTRDIR) {
    654 			fp->gfs_parent->v_xattrdir = NULL;
    655 			mutex_exit(&fp->gfs_parent->v_lock);
    656 		}
    657 		mutex_exit(&vp->v_lock);
    658 
    659 		/*
    660 		 * Free vnode and release parent
    661 		 */
    662 		if (fp->gfs_parent) {
    663 			if (dp) {
    664 				gfs_dir_unlock(dp);
    665 			}
    666 			VN_RELE(fp->gfs_parent);
    667 		} else {
    668 			ASSERT(vp->v_vfsp != NULL);
    669 			VFS_RELE(vp->v_vfsp);
    670 		}
    671 		vn_free(vp);
    672 	} else {
    673 		vp->v_count--;
    674 		data = NULL;
    675 		mutex_exit(&vp->v_lock);
    676 		if (vp->v_flag & V_XATTRDIR) {
    677 			mutex_exit(&fp->gfs_parent->v_lock);
    678 		}
    679 		if (dp)
    680 			gfs_dir_unlock(dp);
    681 	}
    682 
    683 	return (data);
    684 }
    685 
    686 /*
    687  * gfs_dir_inactive()
    688  *
    689  * Same as above, but for directories.
    690  */
    691 void *
    692 gfs_dir_inactive(vnode_t *vp)
    693 {
    694 	gfs_dir_t *dp;
    695 
    696 	ASSERT(vp->v_type == VDIR);
    697 
    698 	if ((dp = gfs_file_inactive(vp)) != NULL) {
    699 		mutex_destroy(&dp->gfsd_lock);
    700 		if (dp->gfsd_nstatic)
    701 			kmem_free(dp->gfsd_static,
    702 			    dp->gfsd_nstatic * sizeof (gfs_dirent_t));
    703 	}
    704 
    705 	return (dp);
    706 }
    707 
    708 /*
    709  * gfs_dir_lookup()
    710  *
    711  * Looks up the given name in the directory and returns the corresponding vnode,
    712  * if found.
    713  *
    714  * First, we search statically defined entries, if any.  If a match is found,
    715  * and GFS_CACHE_VNODE is set and the vnode exists, we simply return the
    716  * existing vnode.  Otherwise, we call the static entry's callback routine,
    717  * caching the result if necessary.
    718  *
    719  * If no static entry is found, we invoke the lookup callback, if any.  The
    720  * arguments to this callback are:
    721  *
    722  * int gfs_lookup_cb(vnode_t *pvp, const char *nm, vnode_t **vpp, cred_t *cr);
    723  *
    724  *	pvp	- parent vnode
    725  *	nm	- name of entry
    726  *	vpp	- pointer to resulting vnode
    727  *	cr	- pointer to cred
    728  *
    729  * 	Returns 0 on success, non-zero on error.
    730  */
    731 int
    732 gfs_dir_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, cred_t *cr)
    733 {
    734 	int i;
    735 	gfs_dirent_t *ge;
    736 	vnode_t *vp;
    737 	gfs_dir_t *dp = dvp->v_data;
    738 	int ret = 0;
    739 
    740 	ASSERT(dvp->v_type == VDIR);
    741 
    742 	if (gfs_lookup_dot(vpp, dvp, dp->gfsd_file.gfs_parent, nm) == 0)
    743 		return (0);
    744 
    745 	gfs_dir_lock(dp);
    746 
    747 	/*
    748 	 * Search static entries.
    749 	 */
    750 	for (i = 0; i < dp->gfsd_nstatic; i++) {
    751 		ge = &dp->gfsd_static[i];
    752 
    753 		if (strcmp(ge->gfse_name, nm) == 0) {
    754 			if (ge->gfse_vnode) {
    755 				ASSERT(ge->gfse_flags & GFS_CACHE_VNODE);
    756 				vp = ge->gfse_vnode;
    757 				VN_HOLD(vp);
    758 				goto out;
    759 			}
    760 
    761 			/*
    762 			 * We drop the directory lock, as the constructor will
    763 			 * need to do KM_SLEEP allocations.  If we return from
    764 			 * the constructor only to find that a parallel
    765 			 * operation has completed, and GFS_CACHE_VNODE is set
    766 			 * for this entry, we discard the result in favor of the
    767 			 * cached vnode.
    768 			 */
    769 			gfs_dir_unlock(dp);
    770 			vp = ge->gfse_ctor(dvp);
    771 			gfs_dir_lock(dp);
    772 
    773 			((gfs_file_t *)vp->v_data)->gfs_index = i;
    774 
    775 			/* Set the inode according to the callback. */
    776 			((gfs_file_t *)vp->v_data)->gfs_ino =
    777 			    dp->gfsd_inode(dvp, i);
    778 
    779 			if (ge->gfse_flags & GFS_CACHE_VNODE) {
    780 				if (ge->gfse_vnode == NULL) {
    781 					ge->gfse_vnode = vp;
    782 				} else {
    783 					/*
    784 					 * A parallel constructor beat us to it;
    785 					 * return existing vnode.  We have to be
    786 					 * careful because we can't release the
    787 					 * current vnode while holding the
    788 					 * directory lock; its inactive routine
    789 					 * will try to lock this directory.
    790 					 */
    791 					vnode_t *oldvp = vp;
    792 					vp = ge->gfse_vnode;
    793 					VN_HOLD(vp);
    794 
    795 					gfs_dir_unlock(dp);
    796 					VN_RELE(oldvp);
    797 					gfs_dir_lock(dp);
    798 				}
    799 			}
    800 
    801 			goto out;
    802 		}
    803 	}
    804 
    805 	/*
    806 	 * See if there is a dynamic constructor.
    807 	 */
    808 	if (dp->gfsd_lookup) {
    809 		ino64_t ino;
    810 		gfs_file_t *fp;
    811 
    812 		/*
    813 		 * Once again, drop the directory lock, as the lookup routine
    814 		 * will need to allocate memory, or otherwise deadlock on this
    815 		 * directory.
    816 		 */
    817 		gfs_dir_unlock(dp);
    818 		ret = dp->gfsd_lookup(dvp, nm, &vp, &ino, cr);
    819 		gfs_dir_lock(dp);
    820 		if (ret != 0)
    821 			goto out;
    822 
    823 		/*
    824 		 * The lookup_cb might be returning a non-GFS vnode.
    825 		 * Currently this is true for extended attributes,
    826 		 * where we're returning a vnode with v_data from an
    827 		 * underlying fs.
    828 		 */
    829 		if ((dvp->v_flag & V_XATTRDIR) == 0) {
    830 			fp = (gfs_file_t *)vp->v_data;
    831 			fp->gfs_index = -1;
    832 			fp->gfs_ino = ino;
    833 		}
    834 	} else {
    835 		/*
    836 		 * No static entry found, and there is no lookup callback, so
    837 		 * return ENOENT.
    838 		 */
    839 		ret = ENOENT;
    840 	}
    841 
    842 out:
    843 	gfs_dir_unlock(dp);
    844 
    845 	if (ret == 0)
    846 		*vpp = vp;
    847 	else
    848 		*vpp = NULL;
    849 
    850 	return (ret);
    851 }
    852 
    853 /*
    854  * gfs_dir_readdir: does a readdir() on the given directory
    855  *
    856  *    dvp	- directory vnode
    857  *    uiop	- uio structure
    858  *    eofp	- eof pointer
    859  *    data	- arbitrary data passed to readdir callback
    860  *
    861  * This routine does all the readdir() dirty work.  Even so, the caller must
    862  * supply two callbacks in order to get full compatibility.
    863  *
    864  * If the directory contains static entries, an inode callback must be
    865  * specified.  This avoids having to create every vnode and call VOP_GETATTR()
    866  * when reading the directory.  This function has the following arguments:
    867  *
    868  *	ino_t gfs_inode_cb(vnode_t *vp, int index);
    869  *
    870  * 	vp	- vnode for the directory
    871  * 	index	- index in original gfs_dirent_t array
    872  *
    873  * 	Returns the inode number for the given entry.
    874  *
    875  * For directories with dynamic entries, a readdir callback must be provided.
    876  * This is significantly more complex, thanks to the particulars of
    877  * VOP_READDIR().
    878  *
    879  *	int gfs_readdir_cb(vnode_t *vp, void *dp, int *eofp,
    880  *	    offset_t *off, offset_t *nextoff, void *data, int flags)
    881  *
    882  *	vp	- directory vnode
    883  *	dp	- directory entry, sized according to maxlen given to
    884  *		  gfs_dir_create().  callback must fill in d_name and
    885  *		  d_ino (if a dirent64_t), or ed_name, ed_ino, and ed_eflags
    886  *		  (if an edirent_t). edirent_t is used if V_RDDIR_ENTFLAGS
    887  *		  is set in 'flags'.
    888  *	eofp	- callback must set to 1 when EOF has been reached
    889  *	off	- on entry, the last offset read from the directory.  Callback
    890  *		  must set to the offset of the current entry, typically left
    891  *		  untouched.
    892  *	nextoff	- callback must set to offset of next entry.  Typically
    893  *		  (off + 1)
    894  *	data	- caller-supplied data
    895  *	flags	- VOP_READDIR flags
    896  *
    897  *	Return 0 on success, or error on failure.
    898  */
    899 int
    900 gfs_dir_readdir(vnode_t *dvp, uio_t *uiop, int *eofp, void *data, cred_t *cr,
    901     caller_context_t *ct, int flags)
    902 {
    903 	gfs_readdir_state_t gstate;
    904 	int error, eof = 0;
    905 	ino64_t ino, pino;
    906 	offset_t off, next;
    907 	gfs_dir_t *dp = dvp->v_data;
    908 
    909 	error = gfs_get_parent_ino(dvp, cr, ct, &pino, &ino);
    910 	if (error)
    911 		return (error);
    912 
    913 	if ((error = gfs_readdir_init(&gstate, dp->gfsd_maxlen, 1, uiop,
    914 	    pino, ino, flags)) != 0)
    915 		return (error);
    916 
    917 	while ((error = gfs_readdir_pred(&gstate, uiop, &off)) == 0 &&
    918 	    !eof) {
    919 
    920 		if (off >= 0 && off < dp->gfsd_nstatic) {
    921 			ino = dp->gfsd_inode(dvp, off);
    922 
    923 			if ((error = gfs_readdir_emit(&gstate, uiop,
    924 			    off, ino, dp->gfsd_static[off].gfse_name, 0))
    925 			    != 0)
    926 				break;
    927 
    928 		} else if (dp->gfsd_readdir) {
    929 			off -= dp->gfsd_nstatic;
    930 
    931 			if ((error = dp->gfsd_readdir(dvp,
    932 			    gstate.grd_dirent, &eof, &off, &next,
    933 			    data, flags)) != 0 || eof)
    934 				break;
    935 
    936 			off += dp->gfsd_nstatic + 2;
    937 			next += dp->gfsd_nstatic + 2;
    938 
    939 			if ((error = gfs_readdir_emit_int(&gstate, uiop,
    940 			    next)) != 0)
    941 				break;
    942 		} else {
    943 			/*
    944 			 * Offset is beyond the end of the static entries, and
    945 			 * we have no dynamic entries.  Set EOF.
    946 			 */
    947 			eof = 1;
    948 		}
    949 	}
    950 
    951 	return (gfs_readdir_fini(&gstate, error, eofp, eof));
    952 }
    953 
    954 
    955 /*
    956  * gfs_vop_lookup: VOP_LOOKUP() entry point
    957  *
    958  * For use directly in vnode ops table.  Given a GFS directory, calls
    959  * gfs_dir_lookup() as necessary.
    960  */
    961 /* ARGSUSED */
    962 int
    963 gfs_vop_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
    964     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    965     int *direntflags, pathname_t *realpnp)
    966 {
    967 	return (gfs_dir_lookup(dvp, nm, vpp, cr));
    968 }
    969 
    970 /*
    971  * gfs_vop_readdir: VOP_READDIR() entry point
    972  *
    973  * For use directly in vnode ops table.  Given a GFS directory, calls
    974  * gfs_dir_readdir() as necessary.
    975  */
    976 /* ARGSUSED */
    977 int
    978 gfs_vop_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
    979     caller_context_t *ct, int flags)
    980 {
    981 	return (gfs_dir_readdir(vp, uiop, eofp, NULL, cr, ct, flags));
    982 }
    983 
    984 
    985 /*
    986  * gfs_vop_map: VOP_MAP() entry point
    987  *
    988  * Convenient routine for handling pseudo-files that wish to allow mmap() calls.
    989  * This function only works for readonly files, and uses the read function for
    990  * the vnode to fill in the data.  The mapped data is immediately faulted in and
    991  * filled with the necessary data during this call; there are no getpage() or
    992  * putpage() routines.
    993  */
    994 /* ARGSUSED */
    995 int
    996 gfs_vop_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    997     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cred,
    998     caller_context_t *ct)
    999 {
   1000 	int rv;
   1001 	ssize_t resid = len;
   1002 
   1003 	/*
   1004 	 * Check for bad parameters
   1005 	 */
   1006 #ifdef _ILP32
   1007 	if (len > MAXOFF_T)
   1008 		return (ENOMEM);
   1009 #endif
   1010 	if (vp->v_flag & VNOMAP)
   1011 		return (ENOTSUP);
   1012 	if (off > MAXOFF_T)
   1013 		return (EFBIG);
   1014 	if ((long)off < 0 || (long)(off + len) < 0)
   1015 		return (EINVAL);
   1016 	if (vp->v_type != VREG)
   1017 		return (ENODEV);
   1018 	if ((prot & (PROT_EXEC | PROT_WRITE)) != 0)
   1019 		return (EACCES);
   1020 
   1021 	/*
   1022 	 * Find appropriate address if needed, otherwise clear address range.
   1023 	 */
   1024 	as_rangelock(as);
   1025 	if ((flags & MAP_FIXED) == 0) {
   1026 		map_addr(addrp, len, (offset_t)off, 1, flags);
   1027 		if (*addrp == NULL) {
   1028 			as_rangeunlock(as);
   1029 			return (ENOMEM);
   1030 		}
   1031 	} else {
   1032 		(void) as_unmap(as, *addrp, len);
   1033 	}
   1034 
   1035 	/*
   1036 	 * Create mapping
   1037 	 */
   1038 	rv = as_map(as, *addrp, len, segvn_create, zfod_argsp);
   1039 	as_rangeunlock(as);
   1040 	if (rv != 0)
   1041 		return (rv);
   1042 
   1043 	/*
   1044 	 * Fill with data from read()
   1045 	 */
   1046 	rv = vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_USERSPACE,
   1047 	    0, (rlim64_t)0, cred, &resid);
   1048 
   1049 	if (rv == 0 && resid != 0)
   1050 		rv = ENXIO;
   1051 
   1052 	if (rv != 0) {
   1053 		as_rangelock(as);
   1054 		(void) as_unmap(as, *addrp, len);
   1055 		as_rangeunlock(as);
   1056 	}
   1057 
   1058 	return (rv);
   1059 }
   1060 
   1061 /*
   1062  * gfs_vop_inactive: VOP_INACTIVE() entry point
   1063  *
   1064  * Given a vnode that is a GFS file or directory, call gfs_file_inactive() or
   1065  * gfs_dir_inactive() as necessary, and kmem_free()s associated private data.
   1066  */
   1067 /* ARGSUSED */
   1068 void
   1069 gfs_vop_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
   1070 {
   1071 	gfs_file_t *fp = vp->v_data;
   1072 	void *data;
   1073 
   1074 	if (fp->gfs_type == GFS_DIR)
   1075 		data = gfs_dir_inactive(vp);
   1076 	else
   1077 		data = gfs_file_inactive(vp);
   1078 
   1079 	if (data != NULL)
   1080 		kmem_free(data, fp->gfs_size);
   1081 }
   1082