1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 
*/


#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <sys/pathname.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/rwstlock.h>
#include <sys/fem.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/conf.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <c2/audit.h>
#include <sys/acl.h>
#include <sys/nbmlock.h>
#include <sys/fcntl.h>
#include <fs/fs_subr.h>

/*
 * Determine if this vnode is a file that is read-only.
 * Character devices, block devices and FIFOs never qualify; for every
 * other vnode type the answer is whatever vn_is_readonly() reports.
 */
#define	ISROFILE(vp)	\
	((vp)->v_type != VCHR && (vp)->v_type != VBLK && \
	    (vp)->v_type != VFIFO && vn_is_readonly(vp))

/* Tunable via /etc/system; used only by admin/install */
int nfs_global_client_only;

/*
 * Array of vopstats_t for per-FS-type vopstats.  This array has the same
 * number of entries as and parallel to the vfssw table.  (Arguably, it could
 * be part of the vfssw table.)  Once it's initialized, it's accessed using
 * the same fstype index that is used to index into the vfssw table.
 * Allocated (zero-filled, nfstype entries) by vopstats_startup().
 */
vopstats_t **vopstats_fstype;

/*
 * vopstats initialization template used for fast initialization via bcopy().
 * Built once by create_vopstats_template() and copied by
 * initialize_vopstats().
 */
static vopstats_t *vs_templatep;

/* Kmem cache handle for vsk_anchor_t allocations */
kmem_cache_t *vsk_anchor_cache;

/* file events cleanup routine */
extern void free_fopdata(vnode_t *);

/*
 * Root of AVL tree for the kstats associated with vopstats.  Lock protects
 * updates to vskstat_tree.
 */
avl_tree_t vskstat_tree;
kmutex_t vskstat_tree_lock;

/*
 * Global variable which enables/disables the vopstats collection.
 * The vopstats setup, lookup and teardown routines in this file all
 * return early when it is zero.
 */
int vopstats_enabled = 1;

/*
 * forward declarations for internal vnode specific data (vsd)
 */
static void *vsd_realloc(void *, size_t, size_t);

/*
 * VSD -- VNODE SPECIFIC DATA
 * The v_data pointer is typically used by a file system to store a
 * pointer to the file system's private node (e.g. ufs inode, nfs rnode).
 * However, there are times when additional project private data needs
 * to be stored separately from the data (node) pointed to by v_data.
 * This additional data could be stored by the file system itself or
 * by a completely different kernel entity.  VSD provides a way for
 * callers to obtain a key and store a pointer to private data associated
 * with a vnode.
 *
 * Callers are responsible for protecting the vsd by holding v_lock
 * for calls to vsd_set() and vsd_get().
 */

/*
 * vsd_lock protects:
 *   vsd_nkeys - creation and deletion of vsd keys
 *   vsd_list - insertion and deletion of vsd_node in the vsd_list
 *   vsd_destructor - adding and removing destructors to the list
 */
static kmutex_t vsd_lock;
static uint_t vsd_nkeys;	/* size of destructor array */
/* list of vsd_node's */
static list_t *vsd_list = NULL;
/* per-key destructor funcs */
static void (**vsd_destructor)(void *);

/*
 * The following is the common set of actions needed to update the
 * vopstats structure from a vnode op.  Both VOPSTATS_UPDATE() and
 * VOPSTATS_UPDATE_IO() do almost the same thing, except for the
 * recording of the bytes transferred.  Since the code is similar
 * but small, it is nearly a duplicate.  Consequently any changes
 * to one may need to be reflected in the other.
* Rundown of the variables:
 *	vp - Pointer to the vnode
 *	counter - Partial name structure member to update in vopstats for counts
 *	bytecounter - Partial name structure member to update in vopstats for
 *		bytes
 *	bytesval - Value to update in vopstats for bytes
 *	fstype - Index into vopstats_fstype[], same as index into vfssw[]
 *		(not referenced by the macros themselves)
 *	vsp - Pointer to vopstats structure (first the per-vfs vfs_vopstats,
 *		then the per-fstype vfs_fstypevsp when that is non-NULL)
 */

/*
 * Count one vnode operation.  Nothing happens unless the vnode has a vfs
 * with a non-NULL vfs_implp, VFS_STATS is set in vfs_flag, and the vnode
 * type is not VBAD.  When those hold, the fsinfo DTrace probe for the
 * operation is fired with a byte count of 0, the per-vfs n<counter> is
 * incremented, and the per-fstype n<counter> is incremented as well if
 * vfs_fstypevsp is set.
 *
 * The expansion is a bare brace block that declares locals named vfsp,
 * vsp and stataddr; arguments must not rely on caller variables with
 * those names.
 */
#define	VOPSTATS_UPDATE(vp, counter) {					\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, 0, stataddr);	\
		(*stataddr)++;						\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
		}							\
	}								\
}

/*
 * Same as VOPSTATS_UPDATE(), but additionally passes bytesval to the
 * DTrace probe and adds it to <bytecounter> in both the per-vfs and the
 * per-fstype vopstats.
 *
 * Note that bytesval is expanded up to three times, so it must be an
 * expression without side effects.
 */
#define	VOPSTATS_UPDATE_IO(vp, counter, bytecounter, bytesval) {	\
	vfs_t *vfsp = (vp)->v_vfsp;					\
	if (vfsp && vfsp->vfs_implp &&					\
	    (vfsp->vfs_flag & VFS_STATS) && (vp)->v_type != VBAD) {	\
		vopstats_t *vsp = &vfsp->vfs_vopstats;			\
		uint64_t *stataddr = &(vsp->n##counter.value.ui64);	\
		extern void __dtrace_probe___fsinfo_##counter(vnode_t *, \
		    size_t, uint64_t *);				\
		__dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
		(*stataddr)++;						\
		vsp->bytecounter.value.ui64 += bytesval;		\
		if ((vsp = vfsp->vfs_fstypevsp) != NULL) {		\
			vsp->n##counter.value.ui64++;			\
			vsp->bytecounter.value.ui64 += bytesval;	\
		}							\
	}								\
}

/*
 * If the filesystem does not support XIDs map credential
 * If the vfsp is NULL, perhaps we should also map?
191 */ 192 #define VOPXID_MAP_CR(vp, cr) { \ 193 vfs_t *vfsp = (vp)->v_vfsp; \ 194 if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0) \ 195 cr = crgetmapped(cr); \ 196 } 197 198 /* 199 * Convert stat(2) formats to vnode types and vice versa. (Knows about 200 * numerical order of S_IFMT and vnode types.) 201 */ 202 enum vtype iftovt_tab[] = { 203 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, 204 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON 205 }; 206 207 ushort_t vttoif_tab[] = { 208 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 209 S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0 210 }; 211 212 /* 213 * The system vnode cache. 214 */ 215 216 kmem_cache_t *vn_cache; 217 218 219 /* 220 * Vnode operations vector. 221 */ 222 223 static const fs_operation_trans_def_t vn_ops_table[] = { 224 VOPNAME_OPEN, offsetof(struct vnodeops, vop_open), 225 fs_nosys, fs_nosys, 226 227 VOPNAME_CLOSE, offsetof(struct vnodeops, vop_close), 228 fs_nosys, fs_nosys, 229 230 VOPNAME_READ, offsetof(struct vnodeops, vop_read), 231 fs_nosys, fs_nosys, 232 233 VOPNAME_WRITE, offsetof(struct vnodeops, vop_write), 234 fs_nosys, fs_nosys, 235 236 VOPNAME_IOCTL, offsetof(struct vnodeops, vop_ioctl), 237 fs_nosys, fs_nosys, 238 239 VOPNAME_SETFL, offsetof(struct vnodeops, vop_setfl), 240 fs_setfl, fs_nosys, 241 242 VOPNAME_GETATTR, offsetof(struct vnodeops, vop_getattr), 243 fs_nosys, fs_nosys, 244 245 VOPNAME_SETATTR, offsetof(struct vnodeops, vop_setattr), 246 fs_nosys, fs_nosys, 247 248 VOPNAME_ACCESS, offsetof(struct vnodeops, vop_access), 249 fs_nosys, fs_nosys, 250 251 VOPNAME_LOOKUP, offsetof(struct vnodeops, vop_lookup), 252 fs_nosys, fs_nosys, 253 254 VOPNAME_CREATE, offsetof(struct vnodeops, vop_create), 255 fs_nosys, fs_nosys, 256 257 VOPNAME_REMOVE, offsetof(struct vnodeops, vop_remove), 258 fs_nosys, fs_nosys, 259 260 VOPNAME_LINK, offsetof(struct vnodeops, vop_link), 261 fs_nosys, fs_nosys, 262 263 VOPNAME_RENAME, offsetof(struct vnodeops, vop_rename), 264 fs_nosys, fs_nosys, 265 
266 VOPNAME_MKDIR, offsetof(struct vnodeops, vop_mkdir), 267 fs_nosys, fs_nosys, 268 269 VOPNAME_RMDIR, offsetof(struct vnodeops, vop_rmdir), 270 fs_nosys, fs_nosys, 271 272 VOPNAME_READDIR, offsetof(struct vnodeops, vop_readdir), 273 fs_nosys, fs_nosys, 274 275 VOPNAME_SYMLINK, offsetof(struct vnodeops, vop_symlink), 276 fs_nosys, fs_nosys, 277 278 VOPNAME_READLINK, offsetof(struct vnodeops, vop_readlink), 279 fs_nosys, fs_nosys, 280 281 VOPNAME_FSYNC, offsetof(struct vnodeops, vop_fsync), 282 fs_nosys, fs_nosys, 283 284 VOPNAME_INACTIVE, offsetof(struct vnodeops, vop_inactive), 285 fs_nosys, fs_nosys, 286 287 VOPNAME_FID, offsetof(struct vnodeops, vop_fid), 288 fs_nosys, fs_nosys, 289 290 VOPNAME_RWLOCK, offsetof(struct vnodeops, vop_rwlock), 291 fs_rwlock, fs_rwlock, 292 293 VOPNAME_RWUNLOCK, offsetof(struct vnodeops, vop_rwunlock), 294 (fs_generic_func_p) fs_rwunlock, 295 (fs_generic_func_p) fs_rwunlock, /* no errors allowed */ 296 297 VOPNAME_SEEK, offsetof(struct vnodeops, vop_seek), 298 fs_nosys, fs_nosys, 299 300 VOPNAME_CMP, offsetof(struct vnodeops, vop_cmp), 301 fs_cmp, fs_cmp, /* no errors allowed */ 302 303 VOPNAME_FRLOCK, offsetof(struct vnodeops, vop_frlock), 304 fs_frlock, fs_nosys, 305 306 VOPNAME_SPACE, offsetof(struct vnodeops, vop_space), 307 fs_nosys, fs_nosys, 308 309 VOPNAME_REALVP, offsetof(struct vnodeops, vop_realvp), 310 fs_nosys, fs_nosys, 311 312 VOPNAME_GETPAGE, offsetof(struct vnodeops, vop_getpage), 313 fs_nosys, fs_nosys, 314 315 VOPNAME_PUTPAGE, offsetof(struct vnodeops, vop_putpage), 316 fs_nosys, fs_nosys, 317 318 VOPNAME_MAP, offsetof(struct vnodeops, vop_map), 319 (fs_generic_func_p) fs_nosys_map, 320 (fs_generic_func_p) fs_nosys_map, 321 322 VOPNAME_ADDMAP, offsetof(struct vnodeops, vop_addmap), 323 (fs_generic_func_p) fs_nosys_addmap, 324 (fs_generic_func_p) fs_nosys_addmap, 325 326 VOPNAME_DELMAP, offsetof(struct vnodeops, vop_delmap), 327 fs_nosys, fs_nosys, 328 329 VOPNAME_POLL, offsetof(struct vnodeops, vop_poll), 330 
(fs_generic_func_p) fs_poll, (fs_generic_func_p) fs_nosys_poll, 331 332 VOPNAME_DUMP, offsetof(struct vnodeops, vop_dump), 333 fs_nosys, fs_nosys, 334 335 VOPNAME_PATHCONF, offsetof(struct vnodeops, vop_pathconf), 336 fs_pathconf, fs_nosys, 337 338 VOPNAME_PAGEIO, offsetof(struct vnodeops, vop_pageio), 339 fs_nosys, fs_nosys, 340 341 VOPNAME_DUMPCTL, offsetof(struct vnodeops, vop_dumpctl), 342 fs_nosys, fs_nosys, 343 344 VOPNAME_DISPOSE, offsetof(struct vnodeops, vop_dispose), 345 (fs_generic_func_p) fs_dispose, 346 (fs_generic_func_p) fs_nodispose, 347 348 VOPNAME_SETSECATTR, offsetof(struct vnodeops, vop_setsecattr), 349 fs_nosys, fs_nosys, 350 351 VOPNAME_GETSECATTR, offsetof(struct vnodeops, vop_getsecattr), 352 fs_fab_acl, fs_nosys, 353 354 VOPNAME_SHRLOCK, offsetof(struct vnodeops, vop_shrlock), 355 fs_shrlock, fs_nosys, 356 357 VOPNAME_VNEVENT, offsetof(struct vnodeops, vop_vnevent), 358 (fs_generic_func_p) fs_vnevent_nosupport, 359 (fs_generic_func_p) fs_vnevent_nosupport, 360 361 NULL, 0, NULL, NULL 362 }; 363 364 /* Extensible attribute (xva) routines. */ 365 366 /* 367 * Zero out the structure, set the size of the requested/returned bitmaps, 368 * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer 369 * to the returned attributes array. 370 */ 371 void 372 xva_init(xvattr_t *xvap) 373 { 374 bzero(xvap, sizeof (xvattr_t)); 375 xvap->xva_mapsize = XVA_MAPSIZE; 376 xvap->xva_magic = XVA_MAGIC; 377 xvap->xva_vattr.va_mask = AT_XVATTR; 378 xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; 379 } 380 381 /* 382 * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t 383 * structure. Otherwise, returns NULL. 384 */ 385 xoptattr_t * 386 xva_getxoptattr(xvattr_t *xvap) 387 { 388 xoptattr_t *xoap = NULL; 389 if (xvap->xva_vattr.va_mask & AT_XVATTR) 390 xoap = &xvap->xva_xoptattrs; 391 return (xoap); 392 } 393 394 /* 395 * Used by the AVL routines to compare two vsk_anchor_t structures in the tree. 
396 * We use the f_fsid reported by VFS_STATVFS() since we use that for the 397 * kstat name. 398 */ 399 static int 400 vska_compar(const void *n1, const void *n2) 401 { 402 int ret; 403 ulong_t p1 = ((vsk_anchor_t *)n1)->vsk_fsid; 404 ulong_t p2 = ((vsk_anchor_t *)n2)->vsk_fsid; 405 406 if (p1 < p2) { 407 ret = -1; 408 } else if (p1 > p2) { 409 ret = 1; 410 } else { 411 ret = 0; 412 } 413 414 return (ret); 415 } 416 417 /* 418 * Used to create a single template which will be bcopy()ed to a newly 419 * allocated vsanchor_combo_t structure in new_vsanchor(), below. 420 */ 421 static vopstats_t * 422 create_vopstats_template() 423 { 424 vopstats_t *vsp; 425 426 vsp = kmem_alloc(sizeof (vopstats_t), KM_SLEEP); 427 bzero(vsp, sizeof (*vsp)); /* Start fresh */ 428 429 /* VOP_OPEN */ 430 kstat_named_init(&vsp->nopen, "nopen", KSTAT_DATA_UINT64); 431 /* VOP_CLOSE */ 432 kstat_named_init(&vsp->nclose, "nclose", KSTAT_DATA_UINT64); 433 /* VOP_READ I/O */ 434 kstat_named_init(&vsp->nread, "nread", KSTAT_DATA_UINT64); 435 kstat_named_init(&vsp->read_bytes, "read_bytes", KSTAT_DATA_UINT64); 436 /* VOP_WRITE I/O */ 437 kstat_named_init(&vsp->nwrite, "nwrite", KSTAT_DATA_UINT64); 438 kstat_named_init(&vsp->write_bytes, "write_bytes", KSTAT_DATA_UINT64); 439 /* VOP_IOCTL */ 440 kstat_named_init(&vsp->nioctl, "nioctl", KSTAT_DATA_UINT64); 441 /* VOP_SETFL */ 442 kstat_named_init(&vsp->nsetfl, "nsetfl", KSTAT_DATA_UINT64); 443 /* VOP_GETATTR */ 444 kstat_named_init(&vsp->ngetattr, "ngetattr", KSTAT_DATA_UINT64); 445 /* VOP_SETATTR */ 446 kstat_named_init(&vsp->nsetattr, "nsetattr", KSTAT_DATA_UINT64); 447 /* VOP_ACCESS */ 448 kstat_named_init(&vsp->naccess, "naccess", KSTAT_DATA_UINT64); 449 /* VOP_LOOKUP */ 450 kstat_named_init(&vsp->nlookup, "nlookup", KSTAT_DATA_UINT64); 451 /* VOP_CREATE */ 452 kstat_named_init(&vsp->ncreate, "ncreate", KSTAT_DATA_UINT64); 453 /* VOP_REMOVE */ 454 kstat_named_init(&vsp->nremove, "nremove", KSTAT_DATA_UINT64); 455 /* VOP_LINK */ 456 
kstat_named_init(&vsp->nlink, "nlink", KSTAT_DATA_UINT64); 457 /* VOP_RENAME */ 458 kstat_named_init(&vsp->nrename, "nrename", KSTAT_DATA_UINT64); 459 /* VOP_MKDIR */ 460 kstat_named_init(&vsp->nmkdir, "nmkdir", KSTAT_DATA_UINT64); 461 /* VOP_RMDIR */ 462 kstat_named_init(&vsp->nrmdir, "nrmdir", KSTAT_DATA_UINT64); 463 /* VOP_READDIR I/O */ 464 kstat_named_init(&vsp->nreaddir, "nreaddir", KSTAT_DATA_UINT64); 465 kstat_named_init(&vsp->readdir_bytes, "readdir_bytes", 466 KSTAT_DATA_UINT64); 467 /* VOP_SYMLINK */ 468 kstat_named_init(&vsp->nsymlink, "nsymlink", KSTAT_DATA_UINT64); 469 /* VOP_READLINK */ 470 kstat_named_init(&vsp->nreadlink, "nreadlink", KSTAT_DATA_UINT64); 471 /* VOP_FSYNC */ 472 kstat_named_init(&vsp->nfsync, "nfsync", KSTAT_DATA_UINT64); 473 /* VOP_INACTIVE */ 474 kstat_named_init(&vsp->ninactive, "ninactive", KSTAT_DATA_UINT64); 475 /* VOP_FID */ 476 kstat_named_init(&vsp->nfid, "nfid", KSTAT_DATA_UINT64); 477 /* VOP_RWLOCK */ 478 kstat_named_init(&vsp->nrwlock, "nrwlock", KSTAT_DATA_UINT64); 479 /* VOP_RWUNLOCK */ 480 kstat_named_init(&vsp->nrwunlock, "nrwunlock", KSTAT_DATA_UINT64); 481 /* VOP_SEEK */ 482 kstat_named_init(&vsp->nseek, "nseek", KSTAT_DATA_UINT64); 483 /* VOP_CMP */ 484 kstat_named_init(&vsp->ncmp, "ncmp", KSTAT_DATA_UINT64); 485 /* VOP_FRLOCK */ 486 kstat_named_init(&vsp->nfrlock, "nfrlock", KSTAT_DATA_UINT64); 487 /* VOP_SPACE */ 488 kstat_named_init(&vsp->nspace, "nspace", KSTAT_DATA_UINT64); 489 /* VOP_REALVP */ 490 kstat_named_init(&vsp->nrealvp, "nrealvp", KSTAT_DATA_UINT64); 491 /* VOP_GETPAGE */ 492 kstat_named_init(&vsp->ngetpage, "ngetpage", KSTAT_DATA_UINT64); 493 /* VOP_PUTPAGE */ 494 kstat_named_init(&vsp->nputpage, "nputpage", KSTAT_DATA_UINT64); 495 /* VOP_MAP */ 496 kstat_named_init(&vsp->nmap, "nmap", KSTAT_DATA_UINT64); 497 /* VOP_ADDMAP */ 498 kstat_named_init(&vsp->naddmap, "naddmap", KSTAT_DATA_UINT64); 499 /* VOP_DELMAP */ 500 kstat_named_init(&vsp->ndelmap, "ndelmap", KSTAT_DATA_UINT64); 501 /* VOP_POLL */ 
502 kstat_named_init(&vsp->npoll, "npoll", KSTAT_DATA_UINT64); 503 /* VOP_DUMP */ 504 kstat_named_init(&vsp->ndump, "ndump", KSTAT_DATA_UINT64); 505 /* VOP_PATHCONF */ 506 kstat_named_init(&vsp->npathconf, "npathconf", KSTAT_DATA_UINT64); 507 /* VOP_PAGEIO */ 508 kstat_named_init(&vsp->npageio, "npageio", KSTAT_DATA_UINT64); 509 /* VOP_DUMPCTL */ 510 kstat_named_init(&vsp->ndumpctl, "ndumpctl", KSTAT_DATA_UINT64); 511 /* VOP_DISPOSE */ 512 kstat_named_init(&vsp->ndispose, "ndispose", KSTAT_DATA_UINT64); 513 /* VOP_SETSECATTR */ 514 kstat_named_init(&vsp->nsetsecattr, "nsetsecattr", KSTAT_DATA_UINT64); 515 /* VOP_GETSECATTR */ 516 kstat_named_init(&vsp->ngetsecattr, "ngetsecattr", KSTAT_DATA_UINT64); 517 /* VOP_SHRLOCK */ 518 kstat_named_init(&vsp->nshrlock, "nshrlock", KSTAT_DATA_UINT64); 519 /* VOP_VNEVENT */ 520 kstat_named_init(&vsp->nvnevent, "nvnevent", KSTAT_DATA_UINT64); 521 522 return (vsp); 523 } 524 525 /* 526 * Creates a kstat structure associated with a vopstats structure. 527 */ 528 kstat_t * 529 new_vskstat(char *ksname, vopstats_t *vsp) 530 { 531 kstat_t *ksp; 532 533 if (!vopstats_enabled) { 534 return (NULL); 535 } 536 537 ksp = kstat_create("unix", 0, ksname, "misc", KSTAT_TYPE_NAMED, 538 sizeof (vopstats_t)/sizeof (kstat_named_t), 539 KSTAT_FLAG_VIRTUAL|KSTAT_FLAG_WRITABLE); 540 if (ksp) { 541 ksp->ks_data = vsp; 542 kstat_install(ksp); 543 } 544 545 return (ksp); 546 } 547 548 /* 549 * Called from vfsinit() to initialize the support mechanisms for vopstats 550 */ 551 void 552 vopstats_startup() 553 { 554 if (!vopstats_enabled) 555 return; 556 557 /* 558 * Creates the AVL tree which holds per-vfs vopstat anchors. This 559 * is necessary since we need to check if a kstat exists before we 560 * attempt to create it. Also, initialize its lock. 
561 */ 562 avl_create(&vskstat_tree, vska_compar, sizeof (vsk_anchor_t), 563 offsetof(vsk_anchor_t, vsk_node)); 564 mutex_init(&vskstat_tree_lock, NULL, MUTEX_DEFAULT, NULL); 565 566 vsk_anchor_cache = kmem_cache_create("vsk_anchor_cache", 567 sizeof (vsk_anchor_t), sizeof (uintptr_t), NULL, NULL, NULL, 568 NULL, NULL, 0); 569 570 /* 571 * Set up the array of pointers for the vopstats-by-FS-type. 572 * The entries will be allocated/initialized as each file system 573 * goes through modload/mod_installfs. 574 */ 575 vopstats_fstype = (vopstats_t **)kmem_zalloc( 576 (sizeof (vopstats_t *) * nfstype), KM_SLEEP); 577 578 /* Set up the global vopstats initialization template */ 579 vs_templatep = create_vopstats_template(); 580 } 581 582 /* 583 * We need to have the all of the counters zeroed. 584 * The initialization of the vopstats_t includes on the order of 585 * 50 calls to kstat_named_init(). Rather that do that on every call, 586 * we do it once in a template (vs_templatep) then bcopy it over. 587 */ 588 void 589 initialize_vopstats(vopstats_t *vsp) 590 { 591 if (vsp == NULL) 592 return; 593 594 bcopy(vs_templatep, vsp, sizeof (vopstats_t)); 595 } 596 597 /* 598 * If possible, determine which vopstats by fstype to use and 599 * return a pointer to the caller. 600 */ 601 vopstats_t * 602 get_fstype_vopstats(vfs_t *vfsp, struct vfssw *vswp) 603 { 604 int fstype = 0; /* Index into vfssw[] */ 605 vopstats_t *vsp = NULL; 606 607 if (vfsp == NULL || (vfsp->vfs_flag & VFS_STATS) == 0 || 608 !vopstats_enabled) 609 return (NULL); 610 /* 611 * Set up the fstype. We go to so much trouble because all versions 612 * of NFS use the same fstype in their vfs even though they have 613 * distinct entries in the vfssw[] table. 614 * NOTE: A special vfs (e.g., EIO_vfs) may not have an entry. 615 */ 616 if (vswp) { 617 fstype = vswp - vfssw; /* Gets us the index */ 618 } else { 619 fstype = vfsp->vfs_fstype; 620 } 621 622 /* 623 * Point to the per-fstype vopstats. 
The only valid values are 624 * non-zero positive values less than the number of vfssw[] table 625 * entries. 626 */ 627 if (fstype > 0 && fstype < nfstype) { 628 vsp = vopstats_fstype[fstype]; 629 } 630 631 return (vsp); 632 } 633 634 /* 635 * Generate a kstat name, create the kstat structure, and allocate a 636 * vsk_anchor_t to hold it together. Return the pointer to the vsk_anchor_t 637 * to the caller. This must only be called from a mount. 638 */ 639 vsk_anchor_t * 640 get_vskstat_anchor(vfs_t *vfsp) 641 { 642 char kstatstr[KSTAT_STRLEN]; /* kstat name for vopstats */ 643 statvfs64_t statvfsbuf; /* Needed to find f_fsid */ 644 vsk_anchor_t *vskp = NULL; /* vfs <--> kstat anchor */ 645 kstat_t *ksp; /* Ptr to new kstat */ 646 avl_index_t where; /* Location in the AVL tree */ 647 648 if (vfsp == NULL || vfsp->vfs_implp == NULL || 649 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) 650 return (NULL); 651 652 /* Need to get the fsid to build a kstat name */ 653 if (VFS_STATVFS(vfsp, &statvfsbuf) == 0) { 654 /* Create a name for our kstats based on fsid */ 655 (void) snprintf(kstatstr, KSTAT_STRLEN, "%s%lx", 656 VOPSTATS_STR, statvfsbuf.f_fsid); 657 658 /* Allocate and initialize the vsk_anchor_t */ 659 vskp = kmem_cache_alloc(vsk_anchor_cache, KM_SLEEP); 660 bzero(vskp, sizeof (*vskp)); 661 vskp->vsk_fsid = statvfsbuf.f_fsid; 662 663 mutex_enter(&vskstat_tree_lock); 664 if (avl_find(&vskstat_tree, vskp, &where) == NULL) { 665 avl_insert(&vskstat_tree, vskp, where); 666 mutex_exit(&vskstat_tree_lock); 667 668 /* 669 * Now that we've got the anchor in the AVL 670 * tree, we can create the kstat. 671 */ 672 ksp = new_vskstat(kstatstr, &vfsp->vfs_vopstats); 673 if (ksp) { 674 vskp->vsk_ksp = ksp; 675 } 676 } else { 677 /* Oops, found one! Release memory and lock. 
*/ 678 mutex_exit(&vskstat_tree_lock); 679 kmem_cache_free(vsk_anchor_cache, vskp); 680 vskp = NULL; 681 } 682 } 683 return (vskp); 684 } 685 686 /* 687 * We're in the process of tearing down the vfs and need to cleanup 688 * the data structures associated with the vopstats. Must only be called 689 * from dounmount(). 690 */ 691 void 692 teardown_vopstats(vfs_t *vfsp) 693 { 694 vsk_anchor_t *vskap; 695 avl_index_t where; 696 697 if (vfsp == NULL || vfsp->vfs_implp == NULL || 698 (vfsp->vfs_flag & VFS_STATS) == 0 || !vopstats_enabled) 699 return; 700 701 /* This is a safe check since VFS_STATS must be set (see above) */ 702 if ((vskap = vfsp->vfs_vskap) == NULL) 703 return; 704 705 /* Whack the pointer right away */ 706 vfsp->vfs_vskap = NULL; 707 708 /* Lock the tree, remove the node, and delete the kstat */ 709 mutex_enter(&vskstat_tree_lock); 710 if (avl_find(&vskstat_tree, vskap, &where)) { 711 avl_remove(&vskstat_tree, vskap); 712 } 713 714 if (vskap->vsk_ksp) { 715 kstat_delete(vskap->vsk_ksp); 716 } 717 mutex_exit(&vskstat_tree_lock); 718 719 kmem_cache_free(vsk_anchor_cache, vskap); 720 } 721 722 /* 723 * Read or write a vnode. Called from kernel code. 724 */ 725 int 726 vn_rdwr( 727 enum uio_rw rw, 728 struct vnode *vp, 729 caddr_t base, 730 ssize_t len, 731 offset_t offset, 732 enum uio_seg seg, 733 int ioflag, 734 rlim64_t ulimit, /* meaningful only if rw is UIO_WRITE */ 735 cred_t *cr, 736 ssize_t *residp) 737 { 738 struct uio uio; 739 struct iovec iov; 740 int error; 741 int in_crit = 0; 742 743 if (rw == UIO_WRITE && ISROFILE(vp)) 744 return (EROFS); 745 746 if (len < 0) 747 return (EIO); 748 749 VOPXID_MAP_CR(vp, cr); 750 751 iov.iov_base = base; 752 iov.iov_len = len; 753 uio.uio_iov = &iov; 754 uio.uio_iovcnt = 1; 755 uio.uio_loffset = offset; 756 uio.uio_segflg = (short)seg; 757 uio.uio_resid = len; 758 uio.uio_llimit = ulimit; 759 760 /* 761 * We have to enter the critical region before calling VOP_RWLOCK 762 * to avoid a deadlock with ufs. 
763 */ 764 if (nbl_need_check(vp)) { 765 int svmand; 766 767 nbl_start_crit(vp, RW_READER); 768 in_crit = 1; 769 error = nbl_svmand(vp, cr, &svmand); 770 if (error != 0) 771 goto done; 772 if (nbl_conflict(vp, rw == UIO_WRITE ? NBL_WRITE : NBL_READ, 773 uio.uio_offset, uio.uio_resid, svmand, NULL)) { 774 error = EACCES; 775 goto done; 776 } 777 } 778 779 (void) VOP_RWLOCK(vp, 780 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); 781 if (rw == UIO_WRITE) { 782 uio.uio_fmode = FWRITE; 783 uio.uio_extflg = UIO_COPY_DEFAULT; 784 error = VOP_WRITE(vp, &uio, ioflag, cr, NULL); 785 } else { 786 uio.uio_fmode = FREAD; 787 uio.uio_extflg = UIO_COPY_CACHED; 788 error = VOP_READ(vp, &uio, ioflag, cr, NULL); 789 } 790 VOP_RWUNLOCK(vp, 791 rw == UIO_WRITE ? V_WRITELOCK_TRUE : V_WRITELOCK_FALSE, NULL); 792 if (residp) 793 *residp = uio.uio_resid; 794 else if (uio.uio_resid) 795 error = EIO; 796 797 done: 798 if (in_crit) 799 nbl_end_crit(vp); 800 return (error); 801 } 802 803 /* 804 * Release a vnode. Call VOP_INACTIVE on last reference or 805 * decrement reference count. 806 * 807 * To avoid race conditions, the v_count is left at 1 for 808 * the call to VOP_INACTIVE. This prevents another thread 809 * from reclaiming and releasing the vnode *before* the 810 * VOP_INACTIVE routine has a chance to destroy the vnode. 811 * We can't have more than 1 thread calling VOP_INACTIVE 812 * on a vnode. 813 */ 814 void 815 vn_rele(vnode_t *vp) 816 { 817 VERIFY(vp->v_count > 0); 818 mutex_enter(&vp->v_lock); 819 if (vp->v_count == 1) { 820 mutex_exit(&vp->v_lock); 821 VOP_INACTIVE(vp, CRED(), NULL); 822 return; 823 } 824 vp->v_count--; 825 mutex_exit(&vp->v_lock); 826 } 827 828 /* 829 * Release a vnode referenced by the DNLC. Multiple DNLC references are treated 830 * as a single reference, so v_count is not decremented until the last DNLC hold 831 * is released. This makes it possible to distinguish vnodes that are referenced 832 * only by the DNLC. 
833 */ 834 void 835 vn_rele_dnlc(vnode_t *vp) 836 { 837 VERIFY((vp->v_count > 0) && (vp->v_count_dnlc > 0)); 838 mutex_enter(&vp->v_lock); 839 if (--vp->v_count_dnlc == 0) { 840 if (vp->v_count == 1) { 841 mutex_exit(&vp->v_lock); 842 VOP_INACTIVE(vp, CRED(), NULL); 843 return; 844 } 845 vp->v_count--; 846 } 847 mutex_exit(&vp->v_lock); 848 } 849 850 /* 851 * Like vn_rele() except that it clears v_stream under v_lock. 852 * This is used by sockfs when it dismantels the association between 853 * the sockfs node and the vnode in the underlaying file system. 854 * v_lock has to be held to prevent a thread coming through the lookupname 855 * path from accessing a stream head that is going away. 856 */ 857 void 858 vn_rele_stream(vnode_t *vp) 859 { 860 VERIFY(vp->v_count > 0); 861 mutex_enter(&vp->v_lock); 862 vp->v_stream = NULL; 863 if (vp->v_count == 1) { 864 mutex_exit(&vp->v_lock); 865 VOP_INACTIVE(vp, CRED(), NULL); 866 return; 867 } 868 vp->v_count--; 869 mutex_exit(&vp->v_lock); 870 } 871 872 int 873 vn_open( 874 char *pnamep, 875 enum uio_seg seg, 876 int filemode, 877 int createmode, 878 struct vnode **vpp, 879 enum create crwhy, 880 mode_t umask) 881 { 882 return (vn_openat(pnamep, seg, filemode, createmode, vpp, crwhy, 883 umask, NULL, -1)); 884 } 885 886 887 /* 888 * Open/create a vnode. 889 * This may be callable by the kernel, the only known use 890 * of user context being that the current user credentials 891 * are used for permissions. crwhy is defined iff filemode & FCREAT. 
892 */ 893 int 894 vn_openat( 895 char *pnamep, 896 enum uio_seg seg, 897 int filemode, 898 int createmode, 899 struct vnode **vpp, 900 enum create crwhy, 901 mode_t umask, 902 struct vnode *startvp, 903 int fd) 904 { 905 struct vnode *vp; 906 int mode; 907 int accessflags; 908 int error; 909 int in_crit = 0; 910 int open_done = 0; 911 int shrlock_done = 0; 912 struct vattr vattr; 913 enum symfollow follow; 914 int estale_retry = 0; 915 struct shrlock shr; 916 struct shr_locowner shr_own; 917 918 mode = 0; 919 accessflags = 0; 920 if (filemode & FREAD) 921 mode |= VREAD; 922 if (filemode & (FWRITE|FTRUNC)) 923 mode |= VWRITE; 924 if (filemode & FXATTRDIROPEN) 925 mode |= VEXEC; 926 927 /* symlink interpretation */ 928 if (filemode & FNOFOLLOW) 929 follow = NO_FOLLOW; 930 else 931 follow = FOLLOW; 932 933 if (filemode & FAPPEND) 934 accessflags |= V_APPEND; 935 936 top: 937 if (filemode & FCREAT) { 938 enum vcexcl excl; 939 940 /* 941 * Wish to create a file. 942 */ 943 vattr.va_type = VREG; 944 vattr.va_mode = createmode; 945 vattr.va_mask = AT_TYPE|AT_MODE; 946 if (filemode & FTRUNC) { 947 vattr.va_size = 0; 948 vattr.va_mask |= AT_SIZE; 949 } 950 if (filemode & FEXCL) 951 excl = EXCL; 952 else 953 excl = NONEXCL; 954 955 if (error = 956 vn_createat(pnamep, seg, &vattr, excl, mode, &vp, crwhy, 957 (filemode & ~(FTRUNC|FEXCL)), umask, startvp)) 958 return (error); 959 } else { 960 /* 961 * Wish to open a file. Just look it up. 962 */ 963 if (error = lookupnameat(pnamep, seg, follow, 964 NULLVPP, &vp, startvp)) { 965 if ((error == ESTALE) && 966 fs_need_estale_retry(estale_retry++)) 967 goto top; 968 return (error); 969 } 970 971 /* 972 * Get the attributes to check whether file is large. 973 * We do this only if the FOFFMAX flag is not set and 974 * only for regular files. 
975 */ 976 977 if (!(filemode & FOFFMAX) && (vp->v_type == VREG)) { 978 vattr.va_mask = AT_SIZE; 979 if ((error = VOP_GETATTR(vp, &vattr, 0, 980 CRED(), NULL))) { 981 goto out; 982 } 983 if (vattr.va_size > (u_offset_t)MAXOFF32_T) { 984 /* 985 * Large File API - regular open fails 986 * if FOFFMAX flag is set in file mode 987 */ 988 error = EOVERFLOW; 989 goto out; 990 } 991 } 992 /* 993 * Can't write directories, active texts, or 994 * read-only filesystems. Can't truncate files 995 * on which mandatory locking is in effect. 996 */ 997 if (filemode & (FWRITE|FTRUNC)) { 998 /* 999 * Allow writable directory if VDIROPEN flag is set. 1000 */ 1001 if (vp->v_type == VDIR && !(vp->v_flag & VDIROPEN)) { 1002 error = EISDIR; 1003 goto out; 1004 } 1005 if (ISROFILE(vp)) { 1006 error = EROFS; 1007 goto out; 1008 } 1009 /* 1010 * Can't truncate files on which 1011 * sysv mandatory locking is in effect. 1012 */ 1013 if (filemode & FTRUNC) { 1014 vnode_t *rvp; 1015 1016 if (VOP_REALVP(vp, &rvp, NULL) != 0) 1017 rvp = vp; 1018 if (rvp->v_filocks != NULL) { 1019 vattr.va_mask = AT_MODE; 1020 if ((error = VOP_GETATTR(vp, 1021 &vattr, 0, CRED(), NULL)) == 0 && 1022 MANDLOCK(vp, vattr.va_mode)) 1023 error = EAGAIN; 1024 } 1025 } 1026 if (error) 1027 goto out;