1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Portions of this source code were derived from Berkeley 4.3 BSD 32 * under license from the Regents of the University of California. 
33 */ 34 35 #include <sys/types.h> 36 #include <sys/t_lock.h> 37 #include <sys/ksynch.h> 38 #include <sys/param.h> 39 #include <sys/time.h> 40 #include <sys/systm.h> 41 #include <sys/sysmacros.h> 42 #include <sys/resource.h> 43 #include <sys/signal.h> 44 #include <sys/cred.h> 45 #include <sys/user.h> 46 #include <sys/buf.h> 47 #include <sys/vfs.h> 48 #include <sys/vfs_opreg.h> 49 #include <sys/vnode.h> 50 #include <sys/proc.h> 51 #include <sys/disp.h> 52 #include <sys/file.h> 53 #include <sys/fcntl.h> 54 #include <sys/flock.h> 55 #include <sys/atomic.h> 56 #include <sys/kmem.h> 57 #include <sys/uio.h> 58 #include <sys/dnlc.h> 59 #include <sys/conf.h> 60 #include <sys/mman.h> 61 #include <sys/pathname.h> 62 #include <sys/debug.h> 63 #include <sys/vmsystm.h> 64 #include <sys/cmn_err.h> 65 #include <sys/filio.h> 66 #include <sys/policy.h> 67 68 #include <sys/fs/ufs_fs.h> 69 #include <sys/fs/ufs_lockfs.h> 70 #include <sys/fs/ufs_filio.h> 71 #include <sys/fs/ufs_inode.h> 72 #include <sys/fs/ufs_fsdir.h> 73 #include <sys/fs/ufs_quota.h> 74 #include <sys/fs/ufs_log.h> 75 #include <sys/fs/ufs_snap.h> 76 #include <sys/fs/ufs_trans.h> 77 #include <sys/fs/ufs_panic.h> 78 #include <sys/fs/ufs_bio.h> 79 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! 
*/ 80 #include <sys/errno.h> 81 #include <sys/fssnap_if.h> 82 #include <sys/unistd.h> 83 #include <sys/sunddi.h> 84 85 #include <sys/filio.h> /* _FIOIO */ 86 87 #include <vm/hat.h> 88 #include <vm/page.h> 89 #include <vm/pvn.h> 90 #include <vm/as.h> 91 #include <vm/seg.h> 92 #include <vm/seg_map.h> 93 #include <vm/seg_vn.h> 94 #include <vm/seg_kmem.h> 95 #include <vm/rm.h> 96 #include <sys/swap.h> 97 98 #include <fs/fs_subr.h> 99 100 #include <sys/fs/decomp.h> 101 102 static struct instats ins; 103 104 static int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t); 105 static int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *, 106 caddr_t, struct page **, size_t, enum seg_rw, int); 107 static int ufs_open(struct vnode **, int, struct cred *, caller_context_t *); 108 static int ufs_close(struct vnode *, int, int, offset_t, struct cred *, 109 caller_context_t *); 110 static int ufs_read(struct vnode *, struct uio *, int, struct cred *, 111 struct caller_context *); 112 static int ufs_write(struct vnode *, struct uio *, int, struct cred *, 113 struct caller_context *); 114 static int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *, 115 int *, caller_context_t *); 116 static int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *, 117 caller_context_t *); 118 static int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *, 119 caller_context_t *); 120 static int ufs_access(struct vnode *, int, int, struct cred *, 121 caller_context_t *); 122 static int ufs_lookup(struct vnode *, char *, struct vnode **, 123 struct pathname *, int, struct vnode *, struct cred *, 124 caller_context_t *, int *, pathname_t *); 125 static int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl, 126 int, struct vnode **, struct cred *, int, 127 caller_context_t *, vsecattr_t *); 128 static int ufs_remove(struct vnode *, char *, struct cred *, 129 caller_context_t *, int); 130 static int ufs_link(struct vnode *, 
struct vnode *, char *, struct cred *, 131 caller_context_t *, int); 132 static int ufs_rename(struct vnode *, char *, struct vnode *, char *, 133 struct cred *, caller_context_t *, int); 134 static int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **, 135 struct cred *, caller_context_t *, int, vsecattr_t *); 136 static int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *, 137 caller_context_t *, int); 138 static int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *, 139 caller_context_t *, int); 140 static int ufs_symlink(struct vnode *, char *, struct vattr *, char *, 141 struct cred *, caller_context_t *, int); 142 static int ufs_readlink(struct vnode *, struct uio *, struct cred *, 143 caller_context_t *); 144 static int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *); 145 static void ufs_inactive(struct vnode *, struct cred *, caller_context_t *); 146 static int ufs_fid(struct vnode *, struct fid *, caller_context_t *); 147 static int ufs_rwlock(struct vnode *, int, caller_context_t *); 148 static void ufs_rwunlock(struct vnode *, int, caller_context_t *); 149 static int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *); 150 static int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t, 151 struct flk_callback *, struct cred *, 152 caller_context_t *); 153 static int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t, 154 cred_t *, caller_context_t *); 155 static int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *, 156 struct page **, size_t, struct seg *, caddr_t, 157 enum seg_rw, struct cred *, caller_context_t *); 158 static int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *, 159 caller_context_t *); 160 static int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *); 161 static int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t, 162 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *); 
163 static int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 164 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *); 165 static int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t, size_t, 166 uint_t, uint_t, uint_t, struct cred *, caller_context_t *); 167 static int ufs_poll(vnode_t *, short, int, short *, struct pollhead **, 168 caller_context_t *); 169 static int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t, 170 caller_context_t *); 171 static int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *, 172 caller_context_t *); 173 static int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int, 174 struct cred *, caller_context_t *); 175 static int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *); 176 static daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *, 177 daddr32_t *, int, int); 178 static int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *, 179 caller_context_t *); 180 static int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *, 181 caller_context_t *); 182 static int ufs_priv_access(void *, int, struct cred *); 183 extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *); 184 185 /* 186 * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions. 187 * 188 * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet. 
189 */ 190 struct vnodeops *ufs_vnodeops; 191 192 /* NOTE: "not blkd" below means that the operation isn't blocked by lockfs */ 193 const fs_operation_def_t ufs_vnodeops_template[] = { 194 VOPNAME_OPEN, { .vop_open = ufs_open }, /* not blkd */ 195 VOPNAME_CLOSE, { .vop_close = ufs_close }, /* not blkd */ 196 VOPNAME_READ, { .vop_read = ufs_read }, 197 VOPNAME_WRITE, { .vop_write = ufs_write }, 198 VOPNAME_IOCTL, { .vop_ioctl = ufs_ioctl }, 199 VOPNAME_GETATTR, { .vop_getattr = ufs_getattr }, 200 VOPNAME_SETATTR, { .vop_setattr = ufs_setattr }, 201 VOPNAME_ACCESS, { .vop_access = ufs_access }, 202 VOPNAME_LOOKUP, { .vop_lookup = ufs_lookup }, 203 VOPNAME_CREATE, { .vop_create = ufs_create }, 204 VOPNAME_REMOVE, { .vop_remove = ufs_remove }, 205 VOPNAME_LINK, { .vop_link = ufs_link }, 206 VOPNAME_RENAME, { .vop_rename = ufs_rename }, 207 VOPNAME_MKDIR, { .vop_mkdir = ufs_mkdir }, 208 VOPNAME_RMDIR, { .vop_rmdir = ufs_rmdir }, 209 VOPNAME_READDIR, { .vop_readdir = ufs_readdir }, 210 VOPNAME_SYMLINK, { .vop_symlink = ufs_symlink }, 211 VOPNAME_READLINK, { .vop_readlink = ufs_readlink }, 212 VOPNAME_FSYNC, { .vop_fsync = ufs_fsync }, 213 VOPNAME_INACTIVE, { .vop_inactive = ufs_inactive }, /* not blkd */ 214 VOPNAME_FID, { .vop_fid = ufs_fid }, 215 VOPNAME_RWLOCK, { .vop_rwlock = ufs_rwlock }, /* not blkd */ 216 VOPNAME_RWUNLOCK, { .vop_rwunlock = ufs_rwunlock }, /* not blkd */ 217 VOPNAME_SEEK, { .vop_seek = ufs_seek }, 218 VOPNAME_FRLOCK, { .vop_frlock = ufs_frlock }, 219 VOPNAME_SPACE, { .vop_space = ufs_space }, 220 VOPNAME_GETPAGE, { .vop_getpage = ufs_getpage }, 221 VOPNAME_PUTPAGE, { .vop_putpage = ufs_putpage }, 222 VOPNAME_MAP, { .vop_map = ufs_map }, 223 VOPNAME_ADDMAP, { .vop_addmap = ufs_addmap }, /* not blkd */ 224 VOPNAME_DELMAP, { .vop_delmap = ufs_delmap }, /* not blkd */ 225 VOPNAME_POLL, { .vop_poll = ufs_poll }, /* not blkd */ 226 VOPNAME_DUMP, { .vop_dump = ufs_dump }, 227 VOPNAME_PATHCONF, { .vop_pathconf = ufs_l_pathconf }, 228 VOPNAME_PAGEIO, { 
.vop_pageio = ufs_pageio }, 229 VOPNAME_DUMPCTL, { .vop_dumpctl = ufs_dumpctl }, 230 VOPNAME_GETSECATTR, { .vop_getsecattr = ufs_getsecattr }, 231 VOPNAME_SETSECATTR, { .vop_setsecattr = ufs_setsecattr }, 232 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 233 NULL, NULL 234 }; 235 236 #define MAX_BACKFILE_COUNT 9999 237 238 /* 239 * Created by ufs_dumpctl() to store a file's disk block info into memory. 240 * Used by ufs_dump() to dump data to disk directly. 241 */ 242 struct dump { 243 struct inode *ip; /* the file we contain */ 244 daddr_t fsbs; /* number of blocks stored */ 245 struct timeval32 time; /* time stamp for the struct */ 246 daddr32_t dblk[1]; /* place holder for block info */ 247 }; 248 249 static struct dump *dump_info = NULL; 250 251 /* 252 * Previously there was no special action required for ordinary files. 253 * (Devices are handled through the device file system.) 254 * Now we support Large Files and Large File API requires open to 255 * fail if file is large. 256 * We could take care to prevent data corruption 257 * by doing an atomic check of size and truncate if file is opened with 258 * FTRUNC flag set but traditionally this is being done by the vfs/vnode 259 * layers. So taking care of truncation here is a change in the existing 260 * semantics of VOP_OPEN and therefore we chose not to implement any thing 261 * here. The check for the size of the file > 2GB is being done at the 262 * vfs layer in routine vn_open(). 263 */ 264 265 /* ARGSUSED */ 266 static int 267 ufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct) 268 { 269 return (0); 270 } 271 272 /*ARGSUSED*/ 273 static int 274 ufs_close(struct vnode *vp, int flag, int count, offset_t offset, 275 struct cred *cr, caller_context_t *ct) 276 { 277 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 278 cleanshares(vp, ttoproc(curthread)->p_pid); 279 280 /* 281 * Push partially filled cluster at last close. 
282 * ``last close'' is approximated because the dnlc 283 * may have a hold on the vnode. 284 * Checking for VBAD here will also act as a forced umount check. 285 */ 286 if (vp->v_count <= 2 && vp->v_type != VBAD) { 287 struct inode *ip = VTOI(vp); 288 if (ip->i_delaylen) { 289 ins.in_poc.value.ul++; 290 (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen, 291 B_ASYNC | B_FREE, cr); 292 ip->i_delaylen = 0; 293 } 294 } 295 296 return (0); 297 } 298 299 /*ARGSUSED*/ 300 static int 301 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, 302 struct caller_context *ct) 303 { 304 struct inode *ip = VTOI(vp); 305 struct ufsvfs *ufsvfsp; 306 struct ulockfs *ulp = NULL; 307 int error = 0; 308 int intrans = 0; 309 310 ASSERT(RW_READ_HELD(&ip->i_rwlock)); 311 312 /* 313 * Mandatory locking needs to be done before ufs_lockfs_begin() 314 * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep. 315 */ 316 if (MANDLOCK(vp, ip->i_mode)) { 317 /* 318 * ufs_getattr ends up being called by chklock 319 */ 320 error = chklock(vp, FREAD, uiop->uio_loffset, 321 uiop->uio_resid, uiop->uio_fmode, ct); 322 if (error) 323 goto out; 324 } 325 326 ufsvfsp = ip->i_ufsvfs; 327 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK); 328 if (error) 329 goto out; 330 331 /* 332 * In the case that a directory is opened for reading as a file 333 * (eg "cat .") with the O_RSYNC, O_SYNC and O_DSYNC flags set. 334 * The locking order had to be changed to avoid a deadlock with 335 * an update taking place on that directory at the same time. 
336 */ 337 if ((ip->i_mode & IFMT) == IFDIR) { 338 339 rw_enter(&ip->i_contents, RW_READER); 340 error = rdip(ip, uiop, ioflag, cr); 341 rw_exit(&ip->i_contents); 342 343 if (error) { 344 if (ulp) 345 ufs_lockfs_end(ulp); 346 goto out; 347 } 348 349 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 350 TRANS_ISTRANS(ufsvfsp)) { 351 rw_exit(&ip->i_rwlock); 352 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 353 error); 354 ASSERT(!error); 355 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 356 TOP_READ_SIZE); 357 rw_enter(&ip->i_rwlock, RW_READER); 358 } 359 } else { 360 /* 361 * Only transact reads to files opened for sync-read and 362 * sync-write on a file system that is not write locked. 363 * 364 * The ``not write locked'' check prevents problems with 365 * enabling/disabling logging on a busy file system. E.g., 366 * logging exists at the beginning of the read but does not 367 * at the end. 368 * 369 */ 370 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) && 371 TRANS_ISTRANS(ufsvfsp)) { 372 TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE, 373 error); 374 ASSERT(!error); 375 intrans = 1; 376 } 377 378 rw_enter(&ip->i_contents, RW_READER); 379 error = rdip(ip, uiop, ioflag, cr); 380 rw_exit(&ip->i_contents); 381 382 if (intrans) { 383 TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC, 384 TOP_READ_SIZE); 385 } 386 } 387 388 if (ulp) { 389 ufs_lockfs_end(ulp); 390 } 391 out: 392 393 return (error); 394 } 395 396 extern int ufs_HW; /* high water mark */ 397 extern int ufs_LW; /* low water mark */ 398 int ufs_WRITES = 1; /* XXX - enable/disable */ 399 int ufs_throttles = 0; /* throttling count */ 400 int ufs_allow_shared_writes = 1; /* directio shared writes */ 401 402 static int 403 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag) 404 { 405 int shared_write; 406 407 /* 408 * If the FDSYNC flag is set then ignore the global 409 * ufs_allow_shared_writes in this case. 
410 */ 411 shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes; 412 413 /* 414 * Filter to determine if this request is suitable as a 415 * concurrent rewrite. This write must not allocate blocks 416 * by extending the file or filling in holes. No use trying 417 * through FSYNC descriptors as the inode will be synchronously 418 * updated after the write. The uio structure has not yet been 419 * checked for sanity, so assume nothing. 420 */ 421 return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) && 422 (uiop->uio_loffset >= (offset_t)0) && 423 (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) && 424 ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) && 425 !(ioflag & FSYNC) && !bmap_has_holes(ip) && 426 shared_write); 427 } 428 429 /*ARGSUSED*/ 430 static int 431 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr, 432 caller_context_t *ct) 433 { 434 struct inode *ip = VTOI(vp); 435 struct ufsvfs *ufsvfsp; 436 struct ulockfs *ulp; 437 int retry = 1; 438 int error, resv, resid = 0; 439 int directio_status; 440 int exclusive; 441 int rewriteflg; 442 long start_resid = uiop->uio_resid; 443 444 ASSERT(RW_LOCK_HELD(&ip->i_rwlock)); 445 446 retry_mandlock: 447 /* 448 * Mandatory locking needs to be done before ufs_lockfs_begin() 449 * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep. 450 * Check for forced unmounts normally done in ufs_lockfs_begin(). 
451 */ 452 if ((ufsvfsp = ip->i_ufsvfs) == NULL) { 453 error = EIO; 454 goto out; 455 } 456 if (MANDLOCK(vp, ip->i_mode)) { 457 458 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 459 460 /* 461 * ufs_getattr ends up being called by chklock 462 */ 463 error = chklock(vp, FWRITE, uiop->uio_loffset, 464 uiop->uio_resid, uiop->uio_fmode, ct); 465 if (error) 466 goto out; 467 } 468 469 /* i_rwlock can change in chklock */ 470 exclusive = rw_write_held(&ip->i_rwlock); 471 rewriteflg = ufs_check_rewrite(ip, uiop, ioflag); 472 473 /* 474 * Check for fast-path special case of directio re-writes. 475 */ 476 if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) && 477 !exclusive && rewriteflg) { 478 479 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 480 if (error) 481 goto out; 482 483 rw_enter(&ip->i_contents, RW_READER); 484 error = ufs_directio_write(ip, uiop, ioflag, 1, cr, 485 &directio_status); 486 if (directio_status == DIRECTIO_SUCCESS) { 487 uint_t i_flag_save; 488 489 if (start_resid != uiop->uio_resid) 490 error = 0; 491 /* 492 * Special treatment of access times for re-writes. 493 * If IMOD is not already set, then convert it 494 * to IMODACC for this operation. This defers 495 * entering a delta into the log until the inode 496 * is flushed. This mimics what is done for read 497 * operations and inode access time. 498 */ 499 mutex_enter(&ip->i_tlock); 500 i_flag_save = ip->i_flag; 501 ip->i_flag |= IUPD | ICHG; 502 ip->i_seq++; 503 ITIMES_NOLOCK(ip); 504 if ((i_flag_save & IMOD) == 0) { 505 ip->i_flag &= ~IMOD; 506 ip->i_flag |= IMODACC; 507 } 508 mutex_exit(&ip->i_tlock); 509 rw_exit(&ip->i_contents); 510 if (ulp) 511 ufs_lockfs_end(ulp); 512 goto out; 513 } 514 rw_exit(&ip->i_contents); 515 if (ulp) 516 ufs_lockfs_end(ulp); 517 } 518 519 if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) { 520 rw_exit(&ip->i_rwlock); 521 rw_enter(&ip->i_rwlock, RW_WRITER); 522 /* 523 * Mandatory locking could have been enabled 524 * after dropping the i_rwlock. 
525 */ 526 if (MANDLOCK(vp, ip->i_mode)) 527 goto retry_mandlock; 528 } 529 530 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK); 531 if (error) 532 goto out; 533 534 /* 535 * Amount of log space needed for this write 536 */ 537 if (!rewriteflg || !(ioflag & FDSYNC)) 538 TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid); 539 540 /* 541 * Throttle writes. 542 */ 543 if (ufs_WRITES && (ip->i_writes > ufs_HW)) { 544 mutex_enter(&ip->i_tlock); 545 while (ip->i_writes > ufs_HW) { 546 ufs_throttles++; 547 cv_wait(&ip->i_wrcv, &ip->i_tlock); 548 } 549 mutex_exit(&ip->i_tlock); 550 } 551 552 /* 553 * Enter Transaction 554 * 555 * If the write is a rewrite there is no need to open a transaction 556 * if the FDSYNC flag is set and not the FSYNC. In this case just 557 * set the IMODACC flag to modify do the update at a later time 558 * thus avoiding the overhead of the logging transaction that is 559 * not required. 560 */ 561 if (ioflag & (FSYNC|FDSYNC)) { 562 if (ulp) { 563 if (rewriteflg) { 564 uint_t i_flag_save; 565 566 rw_enter(&ip->i_contents, RW_READER); 567 mutex_enter(&ip->i_tlock); 568 i_flag_save = ip->i_flag; 569 ip->i_flag |= IUPD | ICHG; 570 ip->i_seq++; 571 ITIMES_NOLOCK(ip); 572 if ((i_flag_save & IMOD) == 0) { 573 ip->i_flag &= ~IMOD; 574 ip->i_flag |= IMODACC; 575 } 576 mutex_exit(&ip->i_tlock); 577 rw_exit(&ip->i_contents); 578 } else { 579 int terr = 0; 580 TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv, 581 terr); 582 ASSERT(!terr); 583 } 584 } 585 } else { 586 if (ulp) 587 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv); 588 } 589 590 /* 591 * Write the file 592 */ 593 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 594 rw_enter(&ip->i_contents, RW_WRITER); 595 if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) { 596 /* 597 * In append mode start at end of file. 
598 */ 599 uiop->uio_loffset = ip->i_size; 600 } 601 602 /* 603 * Mild optimisation, don't call ufs_trans_write() unless we have to 604 * Also, suppress file system full messages if we will retry. 605 */ 606 if (retry) 607 ip->i_flag |= IQUIET; 608 if (resid) { 609 TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid); 610 } else { 611 error = wrip(ip, uiop, ioflag, cr); 612 } 613 ip->i_flag &= ~IQUIET; 614 615 rw_exit(&ip->i_contents); 616 rw_exit(&ufsvfsp->vfs_dqrwlock); 617 618 /* 619 * Leave Transaction 620 */ 621 if (ulp) { 622 if (ioflag & (FSYNC|FDSYNC)) { 623 if (!rewriteflg) { 624 int terr = 0; 625 626 TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC, 627 resv); 628 if (error == 0) 629 error = terr; 630 } 631 } else { 632 TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv); 633 } 634 ufs_lockfs_end(ulp); 635 } 636 out: 637 if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 638 /* 639 * Any blocks tied up in pending deletes? 640 */ 641 ufs_delete_drain_wait(ufsvfsp, 1); 642 retry = 0; 643 goto retry_mandlock; 644 } 645 646 if (error == ENOSPC && (start_resid != uiop->uio_resid)) 647 error = 0; 648 649 return (error); 650 } 651 652 /* 653 * Don't cache write blocks to files with the sticky bit set. 654 * Used to keep swap files from blowing the page cache on a server. 655 */ 656 int stickyhack = 1; 657 658 /* 659 * Free behind hacks. The pager is busted. 660 * XXX - need to pass the information down to writedone() in a flag like B_SEQ 661 * or B_FREE_IF_TIGHT_ON_MEMORY. 662 */ 663 int freebehind = 1; 664 int smallfile = 0; 665 u_offset_t smallfile64 = 32 * 1024; 666 667 /* 668 * While we should, in most cases, cache the pages for write, we 669 * may also want to cache the pages for read as long as they are 670 * frequently re-usable. 671 * 672 * If cache_read_ahead = 1, the pages for read will go to the tail 673 * of the cache list when they are released, otherwise go to the head. 
 */
int	cache_read_ahead = 0;

/*
 * Freebehind exists so that as we read large files sequentially we
 * don't consume most of memory with pages from a few files.  It takes
 * longer to re-read multiple small files from disk than it does to read
 * one large one sequentially.  As system memory grows customers need
 * to retain bigger chunks of files in memory.  The advent of the
 * cachelist opens up the possibility of freeing pages to the head or
 * tail of the list.
 *
 * Not freeing a page is a bet that the page will be read again before
 * its segmap slot is needed for something else.  If we lose the bet,
 * it means some other thread is burdened with the page free we did
 * not do.  If we win we save a free and reclaim.
 *
 * Freeing it at the tail vs the head of cachelist is a bet that the
 * page will survive until the next read.  It's also saying that this
 * page is more likely to be re-used than a page freed some time ago
 * and never reclaimed.
 *
 * Freebehind maintains a range of file offsets [smallfile1; smallfile2]
 *
 *    0 < offset < smallfile1 : pages are not freed.
 *    smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
 *    smallfile2 < offset : pages freed to head of cachelist.
 *
 * The range is computed at most once per second and depends on
 * freemem and ncpus_online.  Both parameters are bounded to be
 * >= smallfile && >= smallfile64.
705 * 706 * smallfile1 = (free memory / ncpu) / 1000 707 * smallfile2 = (free memory / ncpu) / 10 708 * 709 * A few examples values: 710 * 711 * Free Mem (in Bytes) [smallfile1; smallfile2] [smallfile1; smallfile2] 712 * ncpus_online = 4 ncpus_online = 64 713 * ------------------ ----------------------- ----------------------- 714 * 1G [256K; 25M] [32K; 1.5M] 715 * 10G [2.5M; 250M] [156K; 15M] 716 * 100G [25M; 2.5G] [1.5M; 150M] 717 * 718 */ 719 720 #define SMALLFILE1_D 1000 721 #define SMALLFILE2_D 10 722 static u_offset_t smallfile1 = 32 * 1024; 723 static u_offset_t smallfile2 = 32 * 1024; 724 static clock_t smallfile_update = 0; /* lbolt value of when to recompute */ 725 uint_t smallfile1_d = SMALLFILE1_D; 726 uint_t smallfile2_d = SMALLFILE2_D; 727 728 /* 729 * wrip does the real work of write requests for ufs. 730 */ 731 int 732 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr) 733 { 734 rlim64_t limit = uio->uio_llimit; 735 u_offset_t off; 736 u_offset_t old_i_size; 737 struct fs *fs; 738 struct vnode *vp; 739 struct ufsvfs *ufsvfsp; 740 caddr_t base; 741 long start_resid = uio->uio_resid; /* save starting resid */ 742 long premove_resid; /* resid before uiomove() */ 743 uint_t flags; 744 int newpage; 745 int iupdat_flag, directio_status; 746 int n, on, mapon; 747 int error, pagecreate; 748 int do_dqrwlock; /* drop/reacquire vfs_dqrwlock */ 749 int32_t iblocks; 750 int new_iblocks; 751 752 /* 753 * ip->i_size is incremented before the uiomove 754 * is done on a write. If the move fails (bad user 755 * address) reset ip->i_size. 756 * The better way would be to increment ip->i_size 757 * only if the uiomove succeeds. 758 */ 759 int i_size_changed = 0; 760 o_mode_t type; 761 int i_seq_needed = 0; 762 763 vp = ITOV(ip); 764 765 /* 766 * check for forced unmount - should not happen as 767 * the request passed the lockfs checks. 
768 */ 769 if ((ufsvfsp = ip->i_ufsvfs) == NULL) 770 return (EIO); 771 772 fs = ip->i_fs; 773 774 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 775 776 /* check for valid filetype */ 777 type = ip->i_mode & IFMT; 778 if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) && 779 (type != IFLNK) && (type != IFSHAD)) { 780 return (EIO); 781 } 782 783 /* 784 * the actual limit of UFS file size 785 * is UFS_MAXOFFSET_T 786 */ 787 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 788 limit = MAXOFFSET_T; 789 790 if (uio->uio_loffset >= limit) { 791 proc_t *p = ttoproc(curthread); 792 793 mutex_enter(&p->p_lock); 794 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls, 795 p, RCA_UNSAFE_SIGINFO); 796 mutex_exit(&p->p_lock); 797 return (EFBIG); 798 } 799 800 /* 801 * if largefiles are disallowed, the limit is 802 * the pre-largefiles value of 2GB 803 */ 804 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) 805 limit = MIN(UFS_MAXOFFSET_T, limit); 806 else 807 limit = MIN(MAXOFF32_T, limit); 808 809 if (uio->uio_loffset < (offset_t)0) { 810 return (EINVAL); 811 } 812 if (uio->uio_resid == 0) { 813 return (0); 814 } 815 816 if (uio->uio_loffset >= limit) 817 return (EFBIG); 818 819 ip->i_flag |= INOACC; /* don't update ref time in getpage */ 820 821 if (ioflag & (FSYNC|FDSYNC)) { 822 ip->i_flag |= ISYNC; 823 iupdat_flag = 1; 824 } 825 /* 826 * Try to go direct 827 */ 828 if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) { 829 uio->uio_llimit = limit; 830 error = ufs_directio_write(ip, uio, ioflag, 0, cr, 831 &directio_status); 832 /* 833 * If ufs_directio wrote to the file or set the flags, 834 * we need to update i_seq, but it may be deferred. 
835 */ 836 if (start_resid != uio->uio_resid || 837 (ip->i_flag & (ICHG|IUPD))) { 838 i_seq_needed = 1; 839 ip->i_flag |= ISEQ; 840 } 841 if (directio_status == DIRECTIO_SUCCESS) 842 goto out; 843 } 844 845 /* 846 * Behavior with respect to dropping/reacquiring vfs_dqrwlock: 847 * 848 * o shadow inodes: vfs_dqrwlock is not held at all 849 * o quota updates: vfs_dqrwlock is read or write held 850 * o other updates: vfs_dqrwlock is read held 851 * 852 * The first case is the only one where we do not hold 853 * vfs_dqrwlock at all while entering wrip(). 854 * We must make sure not to downgrade/drop vfs_dqrwlock if we 855 * have it as writer, i.e. if we are updating the quota inode. 856 * There is no potential deadlock scenario in this case as 857 * ufs_getpage() takes care of this and avoids reacquiring 858 * vfs_dqrwlock in that case. 859 * 860 * This check is done here since the above conditions do not change 861 * and we possibly loop below, so save a few cycles. 862 */ 863 if ((type == IFSHAD) || 864 (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) { 865 do_dqrwlock = 0; 866 } else { 867 do_dqrwlock = 1; 868 } 869 870 /* 871 * Large Files: We cast MAXBMASK to offset_t 872 * inorder to mask out the higher bits. Since offset_t 873 * is a signed value, the high order bit set in MAXBMASK 874 * value makes it do the right thing by having all bits 1 875 * in the higher word. May be removed for _SOLARIS64_. 
876 */ 877 878 fs = ip->i_fs; 879 do { 880 u_offset_t uoff = uio->uio_loffset; 881 off = uoff & (offset_t)MAXBMASK; 882 mapon = (int)(uoff & (offset_t)MAXBOFFSET); 883 on = (int)blkoff(fs, uoff); 884 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid); 885 new_iblocks = 1; 886 887 if (type == IFREG && uoff + n >= limit) { 888 if (uoff >= limit) { 889 error = EFBIG; 890 goto out; 891 } 892 /* 893 * since uoff + n >= limit, 894 * therefore n >= limit - uoff, and n is an int 895 * so it is safe to cast it to an int 896 */ 897 n = (int)(limit - (rlim64_t)uoff); 898 } 899 if (uoff + n > ip->i_size) { 900 /* 901 * We are extending the length of the file. 902 * bmap is used so that we are sure that 903 * if we need to allocate new blocks, that it 904 * is done here before we up the file size. 905 */ 906 error = bmap_write(ip, uoff, (int)(on + n), 907 mapon == 0, NULL, cr); 908 /* 909 * bmap_write never drops i_contents so if 910 * the flags are set it changed the file. 911 */ 912 if (ip->i_flag & (ICHG|IUPD)) { 913 i_seq_needed = 1; 914 ip->i_flag |= ISEQ; 915 } 916 if (error) 917 break; 918 /* 919 * There is a window of vulnerability here. 920 * The sequence of operations: allocate file 921 * system blocks, uiomove the data into pages, 922 * and then update the size of the file in the 923 * inode, must happen atomically. However, due 924 * to current locking constraints, this can not 925 * be done. 926 */ 927 ASSERT(ip->i_writer == NULL); 928 ip->i_writer = curthread; 929 i_size_changed = 1; 930 /* 931 * If we are writing from the beginning of 932 * the mapping, we can just create the 933 * pages without having to read them. 934 */ 935 pagecreate = (mapon == 0); 936 } else if (n == MAXBSIZE) { 937 /* 938 * Going to do a whole mappings worth, 939 * so we can just create the pages w/o 940 * having to read them in. But before 941 * we do that, we need to make sure any 942 * needed blocks are allocated first. 
943 */ 944 iblocks = ip->i_blocks; 945 error = bmap_write(ip, uoff, (int)(on + n), 946 BI_ALLOC_ONLY, NULL, cr); 947 /* 948 * bmap_write never drops i_contents so if 949 * the flags are set it changed the file. 950 */ 951 if (ip->i_flag & (ICHG|IUPD)) { 952 i_seq_needed = 1; 953 ip->i_flag |= ISEQ; 954 } 955 if (error) 956 break; 957 pagecreate = 1; 958 /* 959 * check if the new created page needed the 960 * allocation of new disk blocks. 961 */ 962 if (iblocks == ip->i_blocks) 963 new_iblocks = 0; /* no new blocks allocated */ 964 } else { 965 pagecreate = 0; 966 /* 967 * In sync mode flush the indirect blocks which 968 * may have been allocated and not written on 969 * disk. In above cases bmap_write will allocate 970 * in sync mode. 971 */ 972 if (ioflag & (FSYNC|FDSYNC)) { 973 error = ufs_indirblk_sync(ip, uoff); 974 if (error) 975 break; 976 } 977 } 978 979 /* 980 * At this point we can enter ufs_getpage() in one 981 * of two ways: 982 * 1) segmap_getmapflt() calls ufs_getpage() when the 983 * forcefault parameter is true (pagecreate == 0) 984 * 2) uiomove() causes a page fault. 985 * 986 * We have to drop the contents lock to prevent the VM 987 * system from trying to reacquire it in ufs_getpage() 988 * should the uiomove cause a pagefault. 989 * 990 * We have to drop the reader vfs_dqrwlock here as well. 991 */ 992 rw_exit(&ip->i_contents); 993 if (do_dqrwlock) { 994 ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock)); 995 ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock))); 996 rw_exit(&ufsvfsp->vfs_dqrwlock); 997 } 998 999 newpage = 0; 1000 premove_resid = uio->uio_resid; 1001 if (vpm_enable) { 1002 /* 1003 * Copy data. If new pages are created, part of 1004 * the page that is not written will be initizliazed 1005 * with zeros. 
1006 */ 1007 error = vpm_data_copy(vp, (off + mapon), (uint_t)n, 1008 uio, !pagecreate, &newpage, 0, S_WRITE); 1009 } else { 1010 1011 base = segmap_getmapflt(segkmap, vp, (off + mapon), 1012 (uint_t)n, !pagecreate, S_WRITE); 1013 1014 /* 1015 * segmap_pagecreate() returns 1 if it calls 1016 * page_create_va() to allocate any pages. 1017 */ 1018 1019 if (pagecreate