Home | History | Annotate | Download | only in fs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 
     40 #pragma ident	"@(#)vnode.c	1.107	07/12/26 SMI"
     41 
     42 #include <sys/types.h>
     43 #include <sys/param.h>
     44 #include <sys/t_lock.h>
     45 #include <sys/errno.h>
     46 #include <sys/cred.h>
     47 #include <sys/user.h>
     48 #include <sys/uio.h>
     49 #include <sys/file.h>
     50 #include <sys/pathname.h>
     51 #include <sys/vfs.h>
     52 #include <sys/vfs_opreg.h>
     53 #include <sys/vnode.h>
     54 #include <sys/rwstlock.h>
     55 #include <sys/fem.h>
     56 #include <sys/stat.h>
     57 #include <sys/mode.h>
     58 #include <sys/conf.h>
     59 #include <sys/sysmacros.h>
     60 #include <sys/cmn_err.h>
     61 #include <sys/systm.h>
     62 #include <sys/kmem.h>
     63 #include <sys/debug.h>
     64 #include <c2/audit.h>
     65 #include <sys/acl.h>
     66 #include <sys/nbmlock.h>
     67 #include <sys/fcntl.h>
     68 #include <fs/fs_subr.h>
     69 
/*
 * Determine if this vnode is a file that is read-only.
 *
 * Character devices, block devices and FIFOs never count: for those types
 * the expression is false without vn_is_readonly() being called.  For every
 * other vnode type the result is whatever vn_is_readonly(vp) reports.
 *
 * Note that vp is evaluated more than once, so it must not have side effects.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))
     74 
/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t pointers for per-FS-type vopstats.  This array has the
 * same number of entries as and parallel to the vfssw table.  (Arguably, it
 * could be part of the vfssw table.)  Once it's initialized, it's accessed
 * using the same fstype index that is used to index into the vfssw table.
 * The array itself is allocated (zero-filled, nfstype entries) by
 * vopstats_startup().
 */
vopstats_t **vopstats_fstype;

/*
 * vopstats initialization template used for fast initialization via bcopy().
 * Built once by vopstats_startup(); it remains NULL if vopstats are disabled.
 */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  The lock
 * protects updates to vskstat_tree.
 */
avl_tree_t	vskstat_tree;
kmutex_t	vskstat_tree_lock;

/*
 * Global variable which enables/disables the vopstats collection.
 * When it is zero, vopstats_startup() skips all of its setup work.
 */
int vopstats_enabled = 1;

/*
 * forward declarations for internal vnode specific data (vsd)
 */
static void *vsd_realloc(void *, size_t, size_t);
    109 
    110 /*
    111  * VSD -- VNODE SPECIFIC DATA
    112  * The v_data pointer is typically used by a file system to store a
    113  * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
    114  * However, there are times when additional project private data needs
    115  * to be stored separately from the data (node) pointed to by v_data.
    116  * This additional data could be stored by the file system itself or
    117  * by a completely different kernel entity.  VSD provides a way for
    118  * callers to obtain a key and store a pointer to private data associated
    119  * with a vnode.
    120  *
    121  * Callers are responsible for protecting the vsd by holding v_lock
    122  * for calls to vsd_set() and vsd_get().
    123  */
    124 
/*
 * File-private state for vnode specific data (VSD).
 *
 * vsd_lock protects:
 *   vsd_nkeys - creation and deletion of vsd keys
 *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 *   vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t		vsd_lock;
static uint_t		vsd_nkeys;	 /* size of destructor array */
/* list of vsd_node's; starts out NULL */
static list_t *vsd_list = NULL;
/* per-key destructor funcs: an array of pointers to void (*)(void *) */
static void 		(**vsd_destructor)(void *);
    137 
/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
 *
 * Rundown of the macro arguments and internal variables:
 * vp - Pointer to the vnode
 * counter - Partial name of the structure member to update in vopstats for
 *	counts; it is pasted after "n" (e.g. "open" selects nopen) and also
 *	onto the DTrace probe name __dtrace_probe___fsinfo_<counter>
 * bytecounter - Name of the structure member to update in vopstats for bytes
 * bytesval - Value to add to the byte counter
 * vfsp - The vnode's vfs, taken from (vp)->v_vfsp
 * vsp - Pointer to a vopstats structure; first the one embedded in the vfs
 *	(vfs_vopstats), then the per-fstype one (vfs_fstypevsp) if non-NULL
 *
 * VOPSTATS_UPDATE() does nothing unless the vnode has a vfs with a non-NULL
 * vfs_implp, VFS_STATS is set in vfs_flag, and the vnode type is not VBAD.
 * Otherwise it fires the fsinfo probe with (vp, 0, address of the counter),
 * bumps the per-vfs counter, and bumps the per-fstype counter when one is
 * attached.
 *
 * The macro expands to a bare brace-enclosed block that declares its own
 * locals (vfsp, vsp, stataddr), and vp is evaluated more than once.
 */

#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}
    169 
/*
 * I/O flavor of the vopstats update: in addition to bumping the n<counter>
 * operation count, it adds bytesval to the <bytecounter> member, both in the
 * per-vfs vopstats and (when vfs_fstypevsp is non-NULL) in the per-fstype
 * vopstats.  The fsinfo DTrace probe receives bytesval as its size argument.
 *
 * The same preconditions apply as for the count-only variant: nothing happens
 * unless the vnode's vfs is non-NULL, has a non-NULL vfs_implp, has VFS_STATS
 * set, and the vnode type is not VBAD.
 *
 * vp and bytesval are each evaluated more than once, so neither may have
 * side effects.  The expansion is a bare brace-enclosed block.
 */
#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}
    187 
/*
 * If the filesystem does not support XIDs (VFS_XID is clear in vfs_flag),
 * replace cr with the credential returned by crgetmapped(cr).
 * If the vfsp is NULL the credential is left alone; perhaps we should also
 * map in that case?
 *
 * cr is assigned to, so it must be a modifiable lvalue in the caller.
 * The expansion is a bare brace-enclosed block declaring a local vfsp.
 */
#define	VOPXID_MAP_CR(vp, cr)	{					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)		\
		cr = crgetmapped(cr);					\
	}
    197 
/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 *
 * iftovt_tab maps a file-format code to a vnode type and has 16 entries;
 * codes with no corresponding vnode type map to VNON.
 * NOTE(review): presumably indexed by the S_IFMT bits of a mode shifted
 * down to a 0-15 value -- confirm against the IFTOVT() definition.
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};
    206 
/*
 * Maps a vnode type to the matching stat(2) S_IF* format bits; entries with
 * no stat(2) equivalent are 0.
 * NOTE(review): presumably indexed directly by enum vtype, so the order here
 * must track that enum -- confirm against <sys/vnode.h>.
 */
ushort_t vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
	S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
};
    211 
/*
 * The system vnode cache: kmem cache handle for vnodes.
 * NOTE(review): it is not created in this part of the file -- confirm where
 * it is set up.
 */

kmem_cache_t *vn_cache;
    217 
    218 
/*
 * Vnode operations vector.
 *
 * One entry per vnode operation, terminated by an all-NULL/zero entry.
 * Each entry lists, in order: the operation's name (VOPNAME_*), the offset
 * of the matching function pointer inside struct vnodeops, and two fallback
 * functions.
 * NOTE(review): presumably the first function is the default installed when
 * a file system does not supply the operation and the second is the one
 * installed when the file system asks for an error stub -- confirm against
 * the fs_operation_trans_def_t definition and its consumer.
 *
 * Entries whose fallbacks are cast to fs_generic_func_p use functions whose
 * signature differs from the generic one.
 */

static const fs_operation_trans_def_t vn_ops_table[] = {
	VOPNAME_OPEN, offsetof(struct vnodeops, vop_open),
	    fs_nosys, fs_nosys,

	VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close),
	    fs_nosys, fs_nosys,

	VOPNAME_READ, offsetof(struct vnodeops, vop_read),
	    fs_nosys, fs_nosys,

	VOPNAME_WRITE, offsetof(struct vnodeops, vop_write),
	    fs_nosys, fs_nosys,

	VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl),
	    fs_nosys, fs_nosys,

	VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl),
	    fs_setfl, fs_nosys,

	VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr),
	    fs_nosys, fs_nosys,

	VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr),
	    fs_nosys, fs_nosys,

	VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access),
	    fs_nosys, fs_nosys,

	VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup),
	    fs_nosys, fs_nosys,

	VOPNAME_CREATE, offsetof(struct vnodeops, vop_create),
	    fs_nosys, fs_nosys,

	VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove),
	    fs_nosys, fs_nosys,

	VOPNAME_LINK, offsetof(struct vnodeops, vop_link),
	    fs_nosys, fs_nosys,

	VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename),
	    fs_nosys, fs_nosys,

	VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir),
	    fs_nosys, fs_nosys,

	VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir),
	    fs_nosys, fs_nosys,

	VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir),
	    fs_nosys, fs_nosys,

	VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink),
	    fs_nosys, fs_nosys,

	VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink),
	    fs_nosys, fs_nosys,

	VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync),
	    fs_nosys, fs_nosys,

	VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive),
	    fs_nosys, fs_nosys,

	VOPNAME_FID, offsetof(struct vnodeops, vop_fid),
	    fs_nosys, fs_nosys,

	VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock),
	    fs_rwlock, fs_rwlock,

	VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock),
	    (fs_generic_func_p) fs_rwunlock,
	    (fs_generic_func_p) fs_rwunlock,	/* no errors allowed */

	VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek),
	    fs_nosys, fs_nosys,

	VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp),
	    fs_cmp, fs_cmp,		/* no errors allowed */

	VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock),
	    fs_frlock, fs_nosys,

	VOPNAME_SPACE, offsetof(struct vnodeops, vop_space),
	    fs_nosys, fs_nosys,

	VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp),
	    fs_nosys, fs_nosys,

	VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage),
	    fs_nosys, fs_nosys,

	VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage),
	    fs_nosys, fs_nosys,

	VOPNAME_MAP, offsetof(struct vnodeops, vop_map),
	    (fs_generic_func_p) fs_nosys_map,
	    (fs_generic_func_p) fs_nosys_map,

	VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap),
	    (fs_generic_func_p) fs_nosys_addmap,
	    (fs_generic_func_p) fs_nosys_addmap,

	VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap),
	    fs_nosys, fs_nosys,

	VOPNAME_POLL, offsetof(struct vnodeops, vop_poll),
	    (fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll,

	VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump),
	    fs_nosys, fs_nosys,

	VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf),
	    fs_pathconf, fs_nosys,

	VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio),
	    fs_nosys, fs_nosys,

	VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl),
	    fs_nosys, fs_nosys,

	VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose),
	    (fs_generic_func_p) fs_dispose,
	    (fs_generic_func_p) fs_nodispose,

	VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr),
	    fs_nosys, fs_nosys,

	VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr),
	    fs_fab_acl, fs_nosys,

	VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock),
	    fs_shrlock, fs_nosys,

	VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent),
	    (fs_generic_func_p) fs_vnevent_nosupport,
	    (fs_generic_func_p) fs_vnevent_nosupport,

	/* End-of-table marker */
	NULL, 0, NULL, NULL
};
    363 
    364 /* Extensible attribute (xva) routines. */
    365 
    366 /*
    367  * Zero out the structure, set the size of the requested/returned bitmaps,
    368  * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer
    369  * to the returned attributes array.
    370  */
    371 void
    372 xva_init(xvattr_t *xvap)
    373 {
    374 	bzero(xvap, sizeof (xvattr_t));
    375 	xvap->xva_mapsize = XVA_MAPSIZE;
    376 	xvap->xva_magic = XVA_MAGIC;
    377 	xvap->xva_vattr.va_mask = AT_XVATTR;
    378 	xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0];
    379 }
    380 
    381 /*
    382  * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t
    383  * structure.  Otherwise, returns NULL.
    384  */
    385 xoptattr_t *
    386 xva_getxoptattr(xvattr_t *xvap)
    387 {
    388 	xoptattr_t *xoap = NULL;
    389 	if (xvap->xva_vattr.va_mask & AT_XVATTR)
    390 		xoap = &xvap->xva_xoptattrs;
    391 	return (xoap);
    392 }
    393 
    394 /*
    395  * Used by the AVL routines to compare two vsk_anchor_t structures in the tree.
    396  * We use the f_fsid reported by VFS_STATVFS() since we use that for the
    397  * kstat name.
    398  */
    399 static int
    400 vska_compar(const void *n1, const void *n2)
    401 {
    402 	int ret;
    403 	ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid;
    404 	ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid;
    405 
    406 	if (p1 < p2) {
    407 		ret = -1;
    408 	} else if (p1 > p2) {
    409 		ret = 1;
    410 	} else {
    411 		ret = 0;
    412 	}
    413 
    414 	return (ret);
    415 }
    416 
    417 /*
    418  * Used to create a single template which will be bcopy()ed to a newly
    419  * allocated vsanchor_combo_t structure in new_vsanchor(), below.
    420  */
    421 static vopstats_t *
    422 create_vopstats_template()
    423 {
    424 	vopstats_t		*vsp;
    425 
    426 	vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP);
    427 	bzero(vsp, sizeof (*vsp));	/* Start fresh */
    428 
    429 	/* VOP_OPEN */
    430 	kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64);
    431 	/* VOP_CLOSE */
    432 	kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64);
    433 	/* VOP_READ I/O */
    434 	kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64);
    435 	kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64);
    436 	/* VOP_WRITE I/O */
    437 	kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64);
    438 	kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64);
    439 	/* VOP_IOCTL */
    440 	kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64);
    441 	/* VOP_SETFL */
    442 	kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64);
    443 	/* VOP_GETATTR */
    444 	kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64);
    445 	/* VOP_SETATTR */
    446 	kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64);
    447 	/* VOP_ACCESS */
    448 	kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64);
    449 	/* VOP_LOOKUP */
    450 	kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64);
    451 	/* VOP_CREATE */
    452 	kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64);
    453 	/* VOP_REMOVE */
    454 	kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64);
    455 	/* VOP_LINK */
    456 	kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64);
    457 	/* VOP_RENAME */
    458 	kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64);
    459 	/* VOP_MKDIR */
    460 	kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64);
    461 	/* VOP_RMDIR */
    462 	kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64);
    463 	/* VOP_READDIR I/O */
    464 	kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64);
    465 	kstat_named_init(&vsp->readdir_bytes, "readdir_bytes",
    466 	    KSTAT_DATA_UINT64);
    467 	/* VOP_SYMLINK */
    468 	kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64);
    469 	/* VOP_READLINK */
    470 	kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64);
    471 	/* VOP_FSYNC */
    472 	kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64);
    473 	/* VOP_INACTIVE */
    474 	kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64);
    475 	/* VOP_FID */
    476 	kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64);
    477 	/* VOP_RWLOCK */
    478 	kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64);
    479 	/* VOP_RWUNLOCK */
    480 	kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64);
    481 	/* VOP_SEEK */
    482 	kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64);
    483 	/* VOP_CMP */
    484 	kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64);
    485 	/* VOP_FRLOCK */
    486 	kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64);
    487 	/* VOP_SPACE */
    488 	kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64);
    489 	/* VOP_REALVP */
    490 	kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64);
    491 	/* VOP_GETPAGE */
    492 	kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64);
    493 	/* VOP_PUTPAGE */
    494 	kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64);
    495 	/* VOP_MAP */
    496 	kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64);
    497 	/* VOP_ADDMAP */
    498 	kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64);
    499 	/* VOP_DELMAP */
    500 	kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64);
    501 	/* VOP_POLL */
    502 	kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64);
    503 	/* VOP_DUMP */
    504 	kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64);
    505 	/* VOP_PATHCONF */
    506 	kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64);
    507 	/* VOP_PAGEIO */
    508 	kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64);
    509 	/* VOP_DUMPCTL */
    510 	kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64);
    511 	/* VOP_DISPOSE */
    512 	kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64);
    513 	/* VOP_SETSECATTR */
    514 	kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64);
    515 	/* VOP_GETSECATTR */
    516 	kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64);
    517 	/* VOP_SHRLOCK */
    518 	kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64);
    519 	/* VOP_VNEVENT */
    520 	kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64);
    521 
    522 	return (vsp);
    523 }
    524 
    525 /*
    526  * Creates a kstat structure associated with a vopstats structure.
    527  */
    528 kstat_t *
    529 new_vskstat(char *ksname, vopstats_t *vsp)
    530 {
    531 	kstat_t		*ksp;
    532 
    533 	if (!vopstats_enabled) {
    534 		return (NULL);
    535 	}
    536 
    537 	ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED,
    538 	    sizeof (vopstats_t)/sizeof (kstat_named_t),
    539 	    KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE);
    540 	if (ksp) {
    541 		ksp->ks_data = vsp;
    542 		kstat_install(ksp);
    543 	}
    544 
    545 	return (ksp);
    546 }
    547 
    548 /*
    549  * Called from vfsinit() to initialize the support mechanisms for vopstats
    550  */
    551 void
    552 vopstats_startup()
    553 {
    554 	if (!vopstats_enabled)
    555 		return;
    556 
    557 	/*
    558 	 * Creates the AVL tree which holds per-vfs vopstat anchors.  This
    559 	 * is necessary since we need to check if a kstat exists before we
    560 	 * attempt to create it.  Also, initialize its lock.
    561 	 */
    562 	avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t),
    563 	    offsetof(vsk_anchor_t, vsk_node));
    564 	mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL);
    565 
    566 	vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache",
    567 	    sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL,
    568 	    NULL, NULL, 0);
    569 
    570 	/*
    571 	 * Set up the array of pointers for the vopstats-by-FS-type.
    572 	 * The entries will be allocated/initialized as each file system
    573 	 * goes through modload/mod_installfs.
    574 	 */
    575 	vopstats_fstype = (vopstats_t **)kmem_zalloc(
    576 	    (sizeof (vopstats_t *) * nfstype), KM_SLEEP);
    577 
    578 	/* Set up the global vopstats initialization template */
    579 	vs_templatep = create_vopstats_template();
    580 }
    581 
    582 /*
    583  * We need to have the all of the counters zeroed.
    584  * The initialization of the vopstats_t includes on the order of
    585  * 50 calls to kstat_named_init().  Rather that do that on every call,
    586  * we do it once in a template (vs_templatep) then bcopy it over.
    587  */
    588 void
    589 initialize_vopstats(vopstats_t *vsp)
    590 {
    591 	if (vsp == NULL)
    592 		return;
    593 
    594 	bcopy(vs_templatep, vsp, sizeof (vopstats_t));
    595 }
    596 
    597 /*
    598  * If possible, determine which vopstats by fstype to use and
    599  * return a pointer to the caller.
    600  */
    601 vopstats_t *
    602 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp)
    603 {
    604 	int		fstype = 0;	/* Index into vfssw[] */
    605 	vopstats_t	*vsp = NULL;
    606 
    607 	if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 ||
    608 	    !vopstats_enabled)
    609 		return (NULL);
    610 	/*
    611 	 * Set up the fstype.  We go to so much trouble because all versions
    612 	 * of NFS use the same fstype in their vfs even though they have
    613 	 * distinct entries in the vfssw[] table.
    614 	 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry.
    615 	 */
    616 	if (vswp) {
    617 		fstype = vswp - vfssw;	/* Gets us the index */
    618 	} else {
    619 		fstype = vfsp->vfs_fstype;
    620 	}
    621 
    622 	/*
    623 	 * Point to the per-fstype vopstats. The only valid values are
    624 	 * non-zero positive values less than the number of vfssw[] table
    625 	 * entries.
    626 	 */
    627 	if (fstype > 0 && fstype < nfstype) {
    628 		vsp = vopstats_fstype[fstype];
    629 	}
    630 
    631 	return (vsp);
    632 }
    633 
    634 /*
    635  * Generate a kstat name, create the kstat structure, and allocate a
    636  * vsk_anchor_t to hold it together.  Return the pointer to the vsk_anchor_t
    637  * to the caller.  This must only be called from a mount.
    638  */
    639 vsk_anchor_t *
    640 get_vskstat_anchor(vfs_t *vfsp)
    641 {
    642 	char		kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */
    643 	statvfs64_t	statvfsbuf;		/* Needed to find f_fsid */
    644 	vsk_anchor_t	*vskp = NULL;		/* vfs <--> kstat anchor */
    645 	kstat_t		*ksp;			/* Ptr to new kstat */
    646 	avl_index_t	where;			/* Location in the AVL tree */
    647 
    648 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
    649 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
    650 		return (NULL);
    651 
    652 	/* Need to get the fsid to build a kstat name */
    653 	if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) {
    654 		/* Create a name for our kstats based on fsid */
    655 		(void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx",
    656 		    VOPSTATS_STR, statvfsbuf.f_fsid);
    657 
    658 		/* Allocate and initialize the vsk_anchor_t */
    659 		vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP);
    660 		bzero(vskp, sizeof (*vskp));
    661 		vskp->vsk_fsid = statvfsbuf.f_fsid;
    662 
    663 		mutex_enter(&vskstat_tree_lock);
    664 		if (avl_find(&vskstat_tree, vskp, &where) == NULL) {
    665 			avl_insert(&vskstat_tree, vskp, where);
    666 			mutex_exit(&vskstat_tree_lock);
    667 
    668 			/*
    669 			 * Now that we've got the anchor in the AVL
    670 			 * tree, we can create the kstat.
    671 			 */
    672 			ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats);
    673 			if (ksp) {
    674 				vskp->vsk_ksp = ksp;
    675 			}
    676 		} else {
    677 			/* Oops, found one! Release memory and lock. */
    678 			mutex_exit(&vskstat_tree_lock);
    679 			kmem_cache_free(vsk_anchor_cache, vskp);
    680 			vskp = NULL;
    681 		}
    682 	}
    683 	return (vskp);
    684 }
    685 
    686 /*
    687  * We're in the process of tearing down the vfs and need to cleanup
    688  * the data structures associated with the vopstats. Must only be called
    689  * from dounmount().
    690  */
    691 void
    692 teardown_vopstats(vfs_t *vfsp)
    693 {
    694 	vsk_anchor_t	*vskap;
    695 	avl_index_t	where;
    696 
    697 	if (vfsp == NULL || vfsp->vfs_implp == NULL ||
    698 	    (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled)
    699 		return;
    700 
    701 	/* This is a safe check since VFS_STATS must be set (see above) */
    702 	if ((vskap = vfsp->vfs_vskap) == NULL)
    703 		return;
    704 
    705 	/* Whack the pointer right away */
    706 	vfsp->vfs_vskap = NULL;
    707 
    708 	/* Lock the tree, remove the node, and delete the kstat */
    709 	mutex_enter(&vskstat_tree_lock);
    710 	if (avl_find(&vskstat_tree, vskap, &where)) {
    711 		avl_remove(&vskstat_tree, vskap);
    712 	}
    713 
    714 	if (vskap->vsk_ksp) {
    715 		kstat_delete(vskap->vsk_ksp);
    716 	}
    717 	mutex_exit(&vskstat_tree_lock);
    718 
    719 	kmem_cache_free(vsk_anchor_cache, vskap);
    720 }
    721 
    722 /*
    723  * Read or write a vnode.  Called from kernel code.
    724  */
    725 int
    726 vn_rdwr(
    727 	enum uio_rw rw,
    728 	struct vnode *vp,
    729 	caddr_t base,
    730 	ssize_t len,
    731 	offset_t offset,
    732 	enum uio_seg seg,
    733 	int ioflag,
    734 	rlim64_t ulimit,	/* meaningful only if rw is UIO_WRITE */
    735 	cred_t *cr,
    736 	ssize_t *residp)
    737 {
    738 	struct uio uio;
    739 	struct iovec iov;
    740 	int error;
    741 	int in_crit = 0;
    742 
    743 	if (rw == UIO_WRITE && ISROFILE(vp))
    744 		return (EROFS);
    745 
    746 	if (len < 0)
    747 		return (EIO);
    748 
    749 	VOPXID_MAP_CR(vp, cr);
    750 
    751 	iov.iov_base = base;
    752 	iov.iov_len = len;
    753 	uio.uio_iov = &iov;
    754 	uio.uio_iovcnt = 1;
    755 	uio.uio_loffset = offset;
    756 	uio.uio_segflg = (short)seg;
    757 	uio.uio_resid = len;
    758 	uio.uio_llimit = ulimit;
    759 
    760 	/*
    761 	 * We have to enter the critical region before calling VOP_RWLOCK
    762 	 * to avoid a deadlock with ufs.
    763 	 */
    764 	if (nbl_need_check(vp)) {
    765 		int svmand;
    766 
    767 		nbl_start_crit(vp, RW_READER);
    768 		in_crit = 1;
    769 		error = nbl_svmand(vp, cr, &svmand);
    770 		if (error != 0)
    771 			goto done;
    772 		if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ,
    773 		    uio.uio_offset, uio.uio_resid, svmand, NULL)) {
    774 			error = EACCES;
    775 			goto done;
    776 		}
    777 	}
    778 
    779 	(void) VOP_RWLOCK(vp,
    780 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
    781 	if (rw == UIO_WRITE) {
    782 		uio.uio_fmode = FWRITE;
    783 		uio.uio_extflg = UIO_COPY_DEFAULT;
    784 		error = VOP_WRITE(vp, &uio, ioflag, cr, NULL);
    785 	} else {
    786 		uio.uio_fmode = FREAD;
    787 		uio.uio_extflg = UIO_COPY_CACHED;
    788 		error = VOP_READ(vp, &uio, ioflag, cr, NULL);
    789 	}
    790 	VOP_RWUNLOCK(vp,
    791 	    rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL);
    792 	if (residp)
    793 		*residp = uio.uio_resid;
    794 	else if (uio.uio_resid)
    795 		error = EIO;
    796 
    797 done:
    798 	if (in_crit)
    799 		nbl_end_crit(vp);
    800 	return (error);
    801 }
    802 
    803 /*
    804  * Release a vnode.  Call VOP_INACTIVE on last reference or
    805  * decrement reference count.
    806  *
    807  * To avoid race conditions, the v_count is left at 1 for
    808  * the call to VOP_INACTIVE. This prevents another thread
    809  * from reclaiming and releasing the vnode *before* the
    810  * VOP_INACTIVE routine has a chance to destroy the vnode.
    811  * We can't have more than 1 thread calling VOP_INACTIVE
    812  * on a vnode.
    813  */
    814 void
    815 vn_rele(vnode_t *vp)
    816 {
    817 	if (vp->v_count == 0)
    818 		cmn_err(CE_PANIC, "vn_rele: vnode ref count 0");
    819 	mutex_enter(&vp->v_lock);
    820 	if (vp->v_count == 1) {
    821 		mutex_exit(&vp->v_lock);
    822 		VOP_INACTIVE(vp, CRED(), NULL);
    823 	} else {
    824 		vp->v_count--;
    825 		mutex_exit(&vp->v_lock);
    826 	}
    827 }
    828 
    829 /*
    830  * Like vn_rele() except that it clears v_stream under v_lock.
    831  * This is used by sockfs when it dismantels the association between
    832  * the sockfs node and the vnode in the underlaying file system.
    833  * v_lock has to be held to prevent a thread coming through the lookupname
    834  * path from accessing a stream head that is going away.
    835  */
    836 void
    837 vn_rele_stream(vnode_t *vp)
    838 {
    839 	if (vp->v_count == 0)
    840 		cmn_err(CE_PANIC, "vn_rele: vnode ref count 0");
    841 	mutex_enter(&vp->v_lock);
    842 	vp->v_stream = NULL;
    843 	if (vp->v_count == 1) {
    844 		mutex_exit(&vp->v_lock);
    845 		VOP_INACTIVE(vp, CRED(), NULL);
    846 	} else {
    847 		vp->v_count--;
    848 		mutex_exit(&vp->v_lock);
    849 	}
    850 }
    851 
    852 int
    853 vn_open(
    854 	char *pnamep,
    855 	enum uio_seg seg,
    856 	int filemode,
    857 	int createmode,
    858 	struct vnode **vpp,
    859 	enum create crwhy,
    860 	mode_t umask)
    861 {
    862 	return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy,
    863 	    umask, NULL, -1));
    864 }
    865 
    866 
    867 /*
    868  * Open/create a vnode.
    869  * This may be callable by the kernel, the only known use
    870  * of user context being that the current user credentials
    871  * are used for permissions.  crwhy is defined iff filemode & FCREAT.
    872  */
    873 int
    874 vn_openat(
    875 	char *pnamep,
    876 	enum uio_seg seg,
    877 	int filemode,
    878 	int createmode,
    879 	struct vnode **vpp,
    880 	enum create crwhy,
    881 	mode_t umask,
    882 	struct vnode *startvp,
    883 	int fd)
    884 {
    885 	struct vnode *vp;
    886 	int mode;
    887 	int accessflags;
    888 	int error;
    889 	int in_crit = 0;
    890 	int open_done = 0;
    891 	int shrlock_done = 0;
    892 	struct vattr vattr;
    893 	enum symfollow follow;
    894 	int estale_retry = 0;
    895 	struct shrlock shr;
    896 	struct shr_locowner shr_own;
    897 
    898 	mode = 0;
    899 	accessflags = 0;
    900 	if (filemode & FREAD)
    901 		mode |= VREAD;
    902 	if (filemode & (FWRITE|FTRUNC))
    903 		mode |= VWRITE;
    904 	if (filemode & FXATTRDIROPEN)
    905 		mode |= VEXEC;
    906 
    907 	/* symlink interpretation */
    908 	if (filemode & FNOFOLLOW)
    909 		follow = NO_FOLLOW;
    910 	else
    911 		follow = FOLLOW;
    912 
    913 	if (filemode & FAPPEND)
    914 		accessflags |= V_APPEND;
    915 
    916 top:
    917 	if (filemode & FCREAT) {
    918 		enum vcexcl excl;
    919 
    920 		/*
    921 		 * Wish to create a file.
    922 		 */
    923 		vattr.va_type = VREG;
    924 		vattr.va_mode = createmode;
    925 		vattr.va_mask = AT_TYPE|AT_MODE;
    926 		if (filemode & FTRUNC) {
    927 			vattr.va_size = 0;
    928 			vattr.va_mask |= AT_SIZE;
    929 		}
    930 		if (filemode & FEXCL)
    931 			excl = EXCL;
    932 		else
    933 			excl = NONEXCL;
    934 
    935 		if (error =
    936 		    vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy,
    937 		    (filemode & ~(FTRUNC|FEXCL)), umask, startvp))
    938 			return (error);
    939 	} else {
    940 		/*
    941 		 * Wish to open a file.  Just look it up.
    942 		 */
    943 		if (error = lookupnameat(pnamep, seg, follow,
    944 		    NULLVPP, &vp, startvp)) {
    945 			if ((error == ESTALE) &&
    946 			    fs_need_estale_retry(estale_retry++))
    947 				goto top;
    948 			return (error);
    949 		}
    950 
    951 		/*
    952 		 * Get the attributes to check whether file is large.
    953 		 * We do this only if the FOFFMAX flag is not set and
    954 		 * only for regular files.
    955 		 */
    956 
    957 		if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) {
    958 			vattr.va_mask = AT_SIZE;
    959 			if ((error = VOP_GETATTR(vp, &vattr, 0,
    960 			    CRED(), NULL))) {
    961 				goto out;
    962 			}
    963 			if (vattr.va_size > (u_offset_t)MAXOFF32_T) {
    964 				/*
    965 				 * Large File API - regular open fails
    966 				 * if FOFFMAX flag is set in file mode
    967 				 */
    968 				error = EOVERFLOW;
    969 				goto out;
    970 			}
    971 		}
    972 		/*
    973 		 * Can't write directories, active texts, or
    974 		 * read-only filesystems.  Can't truncate files
    975 		 * on which mandatory locking is in effect.
    976 		 */
    977 		if (filemode & (FWRITE|FTRUNC)) {
    978 			/*
    979 			 * Allow writable directory if VDIROPEN flag is set.
    980 			 */
    981 			if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) {
    982 				error = EISDIR;
    983 				goto out;
    984 			}
    985 			if (ISROFILE(vp)) {
    986 				error = EROFS;
    987 				goto out;
    988 			}
    989 			/*
    990 			 * Can't truncate files on which
    991 			 * sysv mandatory locking is in effect.
    992 			 */
    993 			if (filemode & FTRUNC) {
    994 				vnode_t *rvp;
    995 
    996 				if (VOP_REALVP(vp, &rvp, NULL) != 0)
    997 					rvp = vp;
    998 				if (rvp->v_filocks != NULL) {
    999 					vattr.va_mask = AT_MODE;
   1000 					if ((error = VOP_GETATTR(vp,
   1001 					    &vattr, 0, CRED(), NULL)) == 0 &&
   1002 					    MANDLOCK(vp, vattr.va_mode))
   1003 						error = EAGAIN;
   1004 				}
   1005 			}
   1006 			if (error)
   1007 				goto out;
   1008 		}
   1009 		/*
   1010 		 * Check permissions.
   1011 		 */
   1012 		if (error = VOP_ACCESS(vp, mode, accessflags, CRED(), NULL))
   1013 			goto out;
   1014 	}
   1015 
   1016 	/*
   1017 	 * Do remaining checks for FNOFOLLOW and FNOLINKS.
   1018 	 */
   1019 	if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) {
   1020 		error = ELOOP;
   1021 		goto out;
   1022 	}
   1023 	if (filemode & FNOLINKS) {
   1024 		vattr.va_mask = AT_NLINK;
   1025 		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))) {
   1026 			goto out;
   1027 		}
   1028 		if (vattr.va_nlink != 1) {