Home | History | Annotate | Download | only in ufs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     28 /*	  All Rights Reserved  	*/
     29 
     30 /*
     31  * Portions of this source code were derived from Berkeley 4.3 BSD
     32  * under license from the Regents of the University of California.
     33  */
     34 
     35 #include <sys/types.h>
     36 #include <sys/t_lock.h>
     37 #include <sys/ksynch.h>
     38 #include <sys/param.h>
     39 #include <sys/time.h>
     40 #include <sys/systm.h>
     41 #include <sys/sysmacros.h>
     42 #include <sys/resource.h>
     43 #include <sys/signal.h>
     44 #include <sys/cred.h>
     45 #include <sys/user.h>
     46 #include <sys/buf.h>
     47 #include <sys/vfs.h>
     48 #include <sys/vfs_opreg.h>
     49 #include <sys/vnode.h>
     50 #include <sys/proc.h>
     51 #include <sys/disp.h>
     52 #include <sys/file.h>
     53 #include <sys/fcntl.h>
     54 #include <sys/flock.h>
     55 #include <sys/atomic.h>
     56 #include <sys/kmem.h>
     57 #include <sys/uio.h>
     58 #include <sys/dnlc.h>
     59 #include <sys/conf.h>
     60 #include <sys/mman.h>
     61 #include <sys/pathname.h>
     62 #include <sys/debug.h>
     63 #include <sys/vmsystm.h>
     64 #include <sys/cmn_err.h>
     65 #include <sys/filio.h>
     66 #include <sys/policy.h>
     67 
     68 #include <sys/fs/ufs_fs.h>
     69 #include <sys/fs/ufs_lockfs.h>
     70 #include <sys/fs/ufs_filio.h>
     71 #include <sys/fs/ufs_inode.h>
     72 #include <sys/fs/ufs_fsdir.h>
     73 #include <sys/fs/ufs_quota.h>
     74 #include <sys/fs/ufs_log.h>
     75 #include <sys/fs/ufs_snap.h>
     76 #include <sys/fs/ufs_trans.h>
     77 #include <sys/fs/ufs_panic.h>
     78 #include <sys/fs/ufs_bio.h>
     79 #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
     80 #include <sys/errno.h>
     81 #include <sys/fssnap_if.h>
     82 #include <sys/unistd.h>
     83 #include <sys/sunddi.h>
     84 
     85 #include <sys/filio.h>		/* _FIOIO */
     86 
     87 #include <vm/hat.h>
     88 #include <vm/page.h>
     89 #include <vm/pvn.h>
     90 #include <vm/as.h>
     91 #include <vm/seg.h>
     92 #include <vm/seg_map.h>
     93 #include <vm/seg_vn.h>
     94 #include <vm/seg_kmem.h>
     95 #include <vm/rm.h>
     96 #include <sys/swap.h>
     97 
     98 #include <fs/fs_subr.h>
     99 
    100 #include <sys/fs/decomp.h>
    101 
    102 static struct instats ins;
    103 
    104 static 	int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
    105 static	int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *,
    106 		caddr_t, struct page **, size_t, enum seg_rw, int);
    107 static	int ufs_open(struct vnode **, int, struct cred *, caller_context_t *);
    108 static	int ufs_close(struct vnode *, int, int, offset_t, struct cred *,
    109 		caller_context_t *);
    110 static	int ufs_read(struct vnode *, struct uio *, int, struct cred *,
    111 		struct caller_context *);
    112 static	int ufs_write(struct vnode *, struct uio *, int, struct cred *,
    113 		struct caller_context *);
    114 static	int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *,
    115 		int *, caller_context_t *);
    116 static	int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *,
    117 		caller_context_t *);
    118 static	int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
    119 		caller_context_t *);
    120 static	int ufs_access(struct vnode *, int, int, struct cred *,
    121 		caller_context_t *);
    122 static	int ufs_lookup(struct vnode *, char *, struct vnode **,
    123 		struct pathname *, int, struct vnode *, struct cred *,
    124 		caller_context_t *, int *, pathname_t *);
    125 static	int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
    126 		int, struct vnode **, struct cred *, int,
    127 		caller_context_t *, vsecattr_t  *);
    128 static	int ufs_remove(struct vnode *, char *, struct cred *,
    129 		caller_context_t *, int);
    130 static	int ufs_link(struct vnode *, struct vnode *, char *, struct cred *,
    131 		caller_context_t *, int);
    132 static	int ufs_rename(struct vnode *, char *, struct vnode *, char *,
    133 		struct cred *, caller_context_t *, int);
    134 static	int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
    135 		struct cred *, caller_context_t *, int, vsecattr_t *);
    136 static	int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *,
    137 		caller_context_t *, int);
    138 static	int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *,
    139 		caller_context_t *, int);
    140 static	int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
    141 		struct cred *, caller_context_t *, int);
    142 static	int ufs_readlink(struct vnode *, struct uio *, struct cred *,
    143 		caller_context_t *);
    144 static	int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *);
    145 static	void ufs_inactive(struct vnode *, struct cred *, caller_context_t *);
    146 static	int ufs_fid(struct vnode *, struct fid *, caller_context_t *);
    147 static	int ufs_rwlock(struct vnode *, int, caller_context_t *);
    148 static	void ufs_rwunlock(struct vnode *, int, caller_context_t *);
    149 static	int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
    150 static	int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
    151 		struct flk_callback *, struct cred *,
    152 		caller_context_t *);
    153 static  int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
    154 		cred_t *, caller_context_t *);
    155 static	int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
    156 		struct page **, size_t, struct seg *, caddr_t,
    157 		enum seg_rw, struct cred *, caller_context_t *);
    158 static	int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *,
    159 		caller_context_t *);
    160 static	int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
    161 static	int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
    162 		uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
    163 static	int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
    164 		uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
    165 static	int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
    166 		uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
    167 static	int ufs_poll(vnode_t *, short, int, short *, struct pollhead **,
    168 		caller_context_t *);
    169 static	int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t,
    170     caller_context_t *);
    171 static	int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *,
    172 		caller_context_t *);
    173 static	int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int,
    174 		struct cred *, caller_context_t *);
    175 static	int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *);
    176 static	daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
    177 		daddr32_t *, int, int);
    178 static	int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
    179 		caller_context_t *);
    180 static	int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
    181 		caller_context_t *);
    182 static	int ufs_priv_access(void *, int, struct cred *);
    183 extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
    184 
    185 /*
    186  * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
    187  *
    188  * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
    189  */
    190 struct vnodeops *ufs_vnodeops;
    191 
    192 /* NOTE: "not blkd" below  means that the operation isn't blocked by lockfs */
    193 const fs_operation_def_t ufs_vnodeops_template[] = {
    194 	VOPNAME_OPEN,		{ .vop_open = ufs_open },	/* not blkd */
    195 	VOPNAME_CLOSE,		{ .vop_close = ufs_close },	/* not blkd */
    196 	VOPNAME_READ,		{ .vop_read = ufs_read },
    197 	VOPNAME_WRITE,		{ .vop_write = ufs_write },
    198 	VOPNAME_IOCTL,		{ .vop_ioctl = ufs_ioctl },
    199 	VOPNAME_GETATTR,	{ .vop_getattr = ufs_getattr },
    200 	VOPNAME_SETATTR,	{ .vop_setattr = ufs_setattr },
    201 	VOPNAME_ACCESS,		{ .vop_access = ufs_access },
    202 	VOPNAME_LOOKUP,		{ .vop_lookup = ufs_lookup },
    203 	VOPNAME_CREATE,		{ .vop_create = ufs_create },
    204 	VOPNAME_REMOVE,		{ .vop_remove = ufs_remove },
    205 	VOPNAME_LINK,		{ .vop_link = ufs_link },
    206 	VOPNAME_RENAME,		{ .vop_rename = ufs_rename },
    207 	VOPNAME_MKDIR,		{ .vop_mkdir = ufs_mkdir },
    208 	VOPNAME_RMDIR,		{ .vop_rmdir = ufs_rmdir },
    209 	VOPNAME_READDIR,	{ .vop_readdir = ufs_readdir },
    210 	VOPNAME_SYMLINK,	{ .vop_symlink = ufs_symlink },
    211 	VOPNAME_READLINK,	{ .vop_readlink = ufs_readlink },
    212 	VOPNAME_FSYNC,		{ .vop_fsync = ufs_fsync },
    213 	VOPNAME_INACTIVE,	{ .vop_inactive = ufs_inactive }, /* not blkd */
    214 	VOPNAME_FID,		{ .vop_fid = ufs_fid },
    215 	VOPNAME_RWLOCK,		{ .vop_rwlock = ufs_rwlock },	/* not blkd */
    216 	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = ufs_rwunlock }, /* not blkd */
    217 	VOPNAME_SEEK,		{ .vop_seek = ufs_seek },
    218 	VOPNAME_FRLOCK,		{ .vop_frlock = ufs_frlock },
    219 	VOPNAME_SPACE,		{ .vop_space = ufs_space },
    220 	VOPNAME_GETPAGE,	{ .vop_getpage = ufs_getpage },
    221 	VOPNAME_PUTPAGE,	{ .vop_putpage = ufs_putpage },
    222 	VOPNAME_MAP,		{ .vop_map = ufs_map },
    223 	VOPNAME_ADDMAP,		{ .vop_addmap = ufs_addmap },	/* not blkd */
    224 	VOPNAME_DELMAP,		{ .vop_delmap = ufs_delmap },	/* not blkd */
    225 	VOPNAME_POLL,		{ .vop_poll = ufs_poll },	/* not blkd */
    226 	VOPNAME_DUMP,		{ .vop_dump = ufs_dump },
    227 	VOPNAME_PATHCONF,	{ .vop_pathconf = ufs_l_pathconf },
    228 	VOPNAME_PAGEIO,		{ .vop_pageio = ufs_pageio },
    229 	VOPNAME_DUMPCTL,	{ .vop_dumpctl = ufs_dumpctl },
    230 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = ufs_getsecattr },
    231 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = ufs_setsecattr },
    232 	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
    233 	NULL,			NULL
    234 };
    235 
    236 #define	MAX_BACKFILE_COUNT	9999
    237 
    238 /*
    239  * Created by ufs_dumpctl() to store a file's disk block info into memory.
    240  * Used by ufs_dump() to dump data to disk directly.
    241  */
    242 struct dump {
    243 	struct inode	*ip;		/* the file we contain */
    244 	daddr_t		fsbs;		/* number of blocks stored */
    245 	struct timeval32 time;		/* time stamp for the struct */
    246 	daddr32_t 	dblk[1];	/* place holder for block info */
    247 };
    248 
    249 static struct dump *dump_info = NULL;
    250 
/*
 * Previously there was no special action required for ordinary files.
 * (Devices are handled through the device file system.)
 * Now we support Large Files, and the Large File API requires open to
 * fail if the file is large.
 * We could take care to prevent data corruption by doing an atomic
 * check of size and truncate if the file is opened with the FTRUNC flag
 * set, but traditionally this is done by the vfs/vnode layers. Taking
 * care of truncation here would change the existing semantics of
 * VOP_OPEN, and therefore we chose not to implement anything here.
 * The check for a file size > 2GB is done at the vfs layer in routine
 * vn_open().
 */
    264 
    265 /* ARGSUSED */
    266 static int
    267 ufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct)
    268 {
    269 	return (0);
    270 }
    271 
    272 /*ARGSUSED*/
    273 static int
    274 ufs_close(struct vnode *vp, int flag, int count, offset_t offset,
    275 	struct cred *cr, caller_context_t *ct)
    276 {
    277 	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
    278 	cleanshares(vp, ttoproc(curthread)->p_pid);
    279 
    280 	/*
    281 	 * Push partially filled cluster at last close.
    282 	 * ``last close'' is approximated because the dnlc
    283 	 * may have a hold on the vnode.
    284 	 * Checking for VBAD here will also act as a forced umount check.
    285 	 */
    286 	if (vp->v_count <= 2 && vp->v_type != VBAD) {
    287 		struct inode *ip = VTOI(vp);
    288 		if (ip->i_delaylen) {
    289 			ins.in_poc.value.ul++;
    290 			(void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
    291 			    B_ASYNC | B_FREE, cr);
    292 			ip->i_delaylen = 0;
    293 		}
    294 	}
    295 
    296 	return (0);
    297 }
    298 
    299 /*ARGSUSED*/
    300 static int
    301 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    302 	struct caller_context *ct)
    303 {
    304 	struct inode *ip = VTOI(vp);
    305 	struct ufsvfs *ufsvfsp;
    306 	struct ulockfs *ulp = NULL;
    307 	int error = 0;
    308 	int intrans = 0;
    309 
    310 	ASSERT(RW_READ_HELD(&ip->i_rwlock));
    311 
    312 	/*
    313 	 * Mandatory locking needs to be done before ufs_lockfs_begin()
    314 	 * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
    315 	 */
    316 	if (MANDLOCK(vp, ip->i_mode)) {
    317 		/*
    318 		 * ufs_getattr ends up being called by chklock
    319 		 */
    320 		error = chklock(vp, FREAD, uiop->uio_loffset,
    321 		    uiop->uio_resid, uiop->uio_fmode, ct);
    322 		if (error)
    323 			goto out;
    324 	}
    325 
    326 	ufsvfsp = ip->i_ufsvfs;
    327 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
    328 	if (error)
    329 		goto out;
    330 
    331 	/*
    332 	 * In the case that a directory is opened for reading as a file
    333 	 * (eg "cat .") with the  O_RSYNC, O_SYNC and O_DSYNC flags set.
    334 	 * The locking order had to be changed to avoid a deadlock with
    335 	 * an update taking place on that directory at the same time.
    336 	 */
    337 	if ((ip->i_mode & IFMT) == IFDIR) {
    338 
    339 		rw_enter(&ip->i_contents, RW_READER);
    340 		error = rdip(ip, uiop, ioflag, cr);
    341 		rw_exit(&ip->i_contents);
    342 
    343 		if (error) {
    344 			if (ulp)
    345 				ufs_lockfs_end(ulp);
    346 			goto out;
    347 		}
    348 
    349 		if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
    350 		    TRANS_ISTRANS(ufsvfsp)) {
    351 			rw_exit(&ip->i_rwlock);
    352 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
    353 			    error);
    354 			ASSERT(!error);
    355 			TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
    356 			    TOP_READ_SIZE);
    357 			rw_enter(&ip->i_rwlock, RW_READER);
    358 		}
    359 	} else {
    360 		/*
    361 		 * Only transact reads to files opened for sync-read and
    362 		 * sync-write on a file system that is not write locked.
    363 		 *
    364 		 * The ``not write locked'' check prevents problems with
    365 		 * enabling/disabling logging on a busy file system.  E.g.,
    366 		 * logging exists at the beginning of the read but does not
    367 		 * at the end.
    368 		 *
    369 		 */
    370 		if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
    371 		    TRANS_ISTRANS(ufsvfsp)) {
    372 			TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
    373 			    error);
    374 			ASSERT(!error);
    375 			intrans = 1;
    376 		}
    377 
    378 		rw_enter(&ip->i_contents, RW_READER);
    379 		error = rdip(ip, uiop, ioflag, cr);
    380 		rw_exit(&ip->i_contents);
    381 
    382 		if (intrans) {
    383 			TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
    384 			    TOP_READ_SIZE);
    385 		}
    386 	}
    387 
    388 	if (ulp) {
    389 		ufs_lockfs_end(ulp);
    390 	}
    391 out:
    392 
    393 	return (error);
    394 }
    395 
    396 extern	int	ufs_HW;		/* high water mark */
    397 extern	int	ufs_LW;		/* low water mark */
    398 int	ufs_WRITES = 1;		/* XXX - enable/disable */
    399 int	ufs_throttles = 0;	/* throttling count */
    400 int	ufs_allow_shared_writes = 1;	/* directio shared writes */
    401 
    402 static int
    403 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
    404 {
    405 	int	shared_write;
    406 
    407 	/*
    408 	 * If the FDSYNC flag is set then ignore the global
    409 	 * ufs_allow_shared_writes in this case.
    410 	 */
    411 	shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes;
    412 
    413 	/*
    414 	 * Filter to determine if this request is suitable as a
    415 	 * concurrent rewrite. This write must not allocate blocks
    416 	 * by extending the file or filling in holes. No use trying
    417 	 * through FSYNC descriptors as the inode will be synchronously
    418 	 * updated after the write. The uio structure has not yet been
    419 	 * checked for sanity, so assume nothing.
    420 	 */
    421 	return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&
    422 	    (uiop->uio_loffset >= (offset_t)0) &&
    423 	    (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
    424 	    ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
    425 	    !(ioflag & FSYNC) && !bmap_has_holes(ip) &&
    426 	    shared_write);
    427 }
    428 
    429 /*ARGSUSED*/
    430 static int
    431 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
    432 	caller_context_t *ct)
    433 {
    434 	struct inode *ip = VTOI(vp);
    435 	struct ufsvfs *ufsvfsp;
    436 	struct ulockfs *ulp;
    437 	int retry = 1;
    438 	int error, resv, resid = 0;
    439 	int directio_status;
    440 	int exclusive;
    441 	int rewriteflg;
    442 	long start_resid = uiop->uio_resid;
    443 
    444 	ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
    445 
    446 retry_mandlock:
    447 	/*
    448 	 * Mandatory locking needs to be done before ufs_lockfs_begin()
    449 	 * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
    450 	 * Check for forced unmounts normally done in ufs_lockfs_begin().
    451 	 */
    452 	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
    453 		error = EIO;
    454 		goto out;
    455 	}
    456 	if (MANDLOCK(vp, ip->i_mode)) {
    457 
    458 		ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
    459 
    460 		/*
    461 		 * ufs_getattr ends up being called by chklock
    462 		 */
    463 		error = chklock(vp, FWRITE, uiop->uio_loffset,
    464 		    uiop->uio_resid, uiop->uio_fmode, ct);
    465 		if (error)
    466 			goto out;
    467 	}
    468 
    469 	/* i_rwlock can change in chklock */
    470 	exclusive = rw_write_held(&ip->i_rwlock);
    471 	rewriteflg = ufs_check_rewrite(ip, uiop, ioflag);
    472 
    473 	/*
    474 	 * Check for fast-path special case of directio re-writes.
    475 	 */
    476 	if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
    477 	    !exclusive && rewriteflg) {
    478 
    479 		error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
    480 		if (error)
    481 			goto out;
    482 
    483 		rw_enter(&ip->i_contents, RW_READER);
    484 		error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
    485 		    &directio_status);
    486 		if (directio_status == DIRECTIO_SUCCESS) {
    487 			uint_t i_flag_save;
    488 
    489 			if (start_resid != uiop->uio_resid)
    490 				error = 0;
    491 			/*
    492 			 * Special treatment of access times for re-writes.
    493 			 * If IMOD is not already set, then convert it
    494 			 * to IMODACC for this operation. This defers
    495 			 * entering a delta into the log until the inode
    496 			 * is flushed. This mimics what is done for read
    497 			 * operations and inode access time.
    498 			 */
    499 			mutex_enter(&ip->i_tlock);
    500 			i_flag_save = ip->i_flag;
    501 			ip->i_flag |= IUPD | ICHG;
    502 			ip->i_seq++;
    503 			ITIMES_NOLOCK(ip);
    504 			if ((i_flag_save & IMOD) == 0) {
    505 				ip->i_flag &= ~IMOD;
    506 				ip->i_flag |= IMODACC;
    507 			}
    508 			mutex_exit(&ip->i_tlock);
    509 			rw_exit(&ip->i_contents);
    510 			if (ulp)
    511 				ufs_lockfs_end(ulp);
    512 			goto out;
    513 		}
    514 		rw_exit(&ip->i_contents);
    515 		if (ulp)
    516 			ufs_lockfs_end(ulp);
    517 	}
    518 
    519 	if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
    520 		rw_exit(&ip->i_rwlock);
    521 		rw_enter(&ip->i_rwlock, RW_WRITER);
    522 		/*
    523 		 * Mandatory locking could have been enabled
    524 		 * after dropping the i_rwlock.
    525 		 */
    526 		if (MANDLOCK(vp, ip->i_mode))
    527 			goto retry_mandlock;
    528 	}
    529 
    530 	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
    531 	if (error)
    532 		goto out;
    533 
    534 	/*
    535 	 * Amount of log space needed for this write
    536 	 */
    537 	if (!rewriteflg || !(ioflag & FDSYNC))
    538 		TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);
    539 
    540 	/*
    541 	 * Throttle writes.
    542 	 */
    543 	if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
    544 		mutex_enter(&ip->i_tlock);
    545 		while (ip->i_writes > ufs_HW) {
    546 			ufs_throttles++;
    547 			cv_wait(&ip->i_wrcv, &ip->i_tlock);
    548 		}
    549 		mutex_exit(&ip->i_tlock);
    550 	}
    551 
    552 	/*
    553 	 * Enter Transaction
    554 	 *
    555 	 * If the write is a rewrite there is no need to open a transaction
    556 	 * if the FDSYNC flag is set and not the FSYNC.  In this case just
    557 	 * set the IMODACC flag to modify do the update at a later time
    558 	 * thus avoiding the overhead of the logging transaction that is
    559 	 * not required.
    560 	 */
    561 	if (ioflag & (FSYNC|FDSYNC)) {
    562 		if (ulp) {
    563 			if (rewriteflg) {
    564 				uint_t i_flag_save;
    565 
    566 				rw_enter(&ip->i_contents, RW_READER);
    567 				mutex_enter(&ip->i_tlock);
    568 				i_flag_save = ip->i_flag;
    569 				ip->i_flag |= IUPD | ICHG;
    570 				ip->i_seq++;
    571 				ITIMES_NOLOCK(ip);
    572 				if ((i_flag_save & IMOD) == 0) {
    573 					ip->i_flag &= ~IMOD;
    574 					ip->i_flag |= IMODACC;
    575 				}
    576 				mutex_exit(&ip->i_tlock);
    577 				rw_exit(&ip->i_contents);
    578 			} else {
    579 				int terr = 0;
    580 				TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv,
    581 				    terr);
    582 				ASSERT(!terr);
    583 			}
    584 		}
    585 	} else {
    586 		if (ulp)
    587 			TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
    588 	}
    589 
    590 	/*
    591 	 * Write the file
    592 	 */
    593 	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
    594 	rw_enter(&ip->i_contents, RW_WRITER);
    595 	if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
    596 		/*
    597 		 * In append mode start at end of file.
    598 		 */
    599 		uiop->uio_loffset = ip->i_size;
    600 	}
    601 
    602 	/*
    603 	 * Mild optimisation, don't call ufs_trans_write() unless we have to
    604 	 * Also, suppress file system full messages if we will retry.
    605 	 */
    606 	if (retry)
    607 		ip->i_flag |= IQUIET;
    608 	if (resid) {
    609 		TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
    610 	} else {
    611 		error = wrip(ip, uiop, ioflag, cr);
    612 	}
    613 	ip->i_flag &= ~IQUIET;
    614 
    615 	rw_exit(&ip->i_contents);
    616 	rw_exit(&ufsvfsp->vfs_dqrwlock);
    617 
    618 	/*
    619 	 * Leave Transaction
    620 	 */
    621 	if (ulp) {
    622 		if (ioflag & (FSYNC|FDSYNC)) {
    623 			if (!rewriteflg) {
    624 				int terr = 0;
    625 
    626 				TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC,
    627 				    resv);
    628 				if (error == 0)
    629 					error = terr;
    630 			}
    631 		} else {
    632 			TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
    633 		}
    634 		ufs_lockfs_end(ulp);
    635 	}
    636 out:
    637 	if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
    638 		/*
    639 		 * Any blocks tied up in pending deletes?
    640 		 */
    641 		ufs_delete_drain_wait(ufsvfsp, 1);
    642 		retry = 0;
    643 		goto retry_mandlock;
    644 	}
    645 
    646 	if (error == ENOSPC && (start_resid != uiop->uio_resid))
    647 		error = 0;
    648 
    649 	return (error);
    650 }
    651 
    652 /*
    653  * Don't cache write blocks to files with the sticky bit set.
    654  * Used to keep swap files from blowing the page cache on a server.
    655  */
    656 int stickyhack = 1;
    657 
    658 /*
    659  * Free behind hacks.  The pager is busted.
    660  * XXX - need to pass the information down to writedone() in a flag like B_SEQ
    661  * or B_FREE_IF_TIGHT_ON_MEMORY.
    662  */
    663 int	freebehind = 1;
    664 int	smallfile = 0;
    665 u_offset_t smallfile64 = 32 * 1024;
    666 
    667 /*
    668  * While we should, in most cases, cache the pages for write, we
    669  * may also want to cache the pages for read as long as they are
    670  * frequently re-usable.
    671  *
    672  * If cache_read_ahead = 1, the pages for read will go to the tail
    673  * of the cache list when they are released, otherwise go to the head.
    674  */
    675 int	cache_read_ahead = 0;
    676 
/*
 * Freebehind exists so that as we read large files sequentially we
 * don't consume most of memory with pages from a few files. It takes
 * longer to re-read multiple small files from disk than it does to read
 * one large one sequentially.  As system memory grows customers need
 * to retain bigger chunks of files in memory.  The advent of the
 * cachelist opens up the possibility of freeing pages to the head or
 * tail of the list.
 *
 * Not freeing a page is a bet that the page will be read again before
 * its segmap slot is needed for something else. If we lose the bet,
 * it means some other thread is burdened with the page free we did
 * not do. If we win we save a free and reclaim.
 *
 * Freeing it at the tail vs the head of the cachelist is a bet that the
 * page will survive until the next read.  It's also saying that this
 * page is more likely to be re-used than a page freed some time ago
 * and never reclaimed.
 *
 * Freebehind maintains a range of file offsets [smallfile1; smallfile2]
 *
 *            0 < offset < smallfile1 : pages are not freed.
 *   smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
 *   smallfile2 < offset              : pages freed to head of cachelist.
 *
 * The range is computed at most once per second and depends on
 * freemem and ncpus_online.  Both parameters are bounded to be
 * >= smallfile && >= smallfile64.
 *
 * smallfile1 = (free memory / ncpu) / 1000
 * smallfile2 = (free memory / ncpu) / 10
 *
 * A few example values:
 *
 *       Free Mem (in Bytes) [smallfile1; smallfile2]  [smallfile1; smallfile2]
 *                                 ncpus_online = 4          ncpus_online = 64
 *       ------------------  -----------------------   -----------------------
 *             1G                   [256K;  25M]               [32K; 1.5M]
 *            10G                   [2.5M; 250M]              [156K; 15M]
 *           100G                    [25M; 2.5G]              [1.5M; 150M]
 *
 */
    719 
    720 #define	SMALLFILE1_D 1000
    721 #define	SMALLFILE2_D 10
    722 static u_offset_t smallfile1 = 32 * 1024;
    723 static u_offset_t smallfile2 = 32 * 1024;
    724 static clock_t smallfile_update = 0; /* lbolt value of when to recompute */
    725 uint_t smallfile1_d = SMALLFILE1_D;
    726 uint_t smallfile2_d = SMALLFILE2_D;
    727 
    728 /*
    729  * wrip does the real work of write requests for ufs.
    730  */
    731 int
    732 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
    733 {
    734 	rlim64_t limit = uio->uio_llimit;
    735 	u_offset_t off;
    736 	u_offset_t old_i_size;
    737 	struct fs *fs;
    738 	struct vnode *vp;
    739 	struct ufsvfs *ufsvfsp;
    740 	caddr_t base;
    741 	long start_resid = uio->uio_resid;	/* save starting resid */
    742 	long premove_resid;			/* resid before uiomove() */
    743 	uint_t flags;
    744 	int newpage;
    745 	int iupdat_flag, directio_status;
    746 	int n, on, mapon;
    747 	int error, pagecreate;
    748 	int do_dqrwlock;		/* drop/reacquire vfs_dqrwlock */
    749 	int32_t	iblocks;
    750 	int	new_iblocks;
    751 
    752 	/*
    753 	 * ip->i_size is incremented before the uiomove
    754 	 * is done on a write.  If the move fails (bad user
    755 	 * address) reset ip->i_size.
    756 	 * The better way would be to increment ip->i_size
    757 	 * only if the uiomove succeeds.
    758 	 */
    759 	int i_size_changed = 0;
    760 	o_mode_t type;
    761 	int i_seq_needed = 0;
    762 
    763 	vp = ITOV(ip);
    764 
    765 	/*
    766 	 * check for forced unmount - should not happen as
    767 	 * the request passed the lockfs checks.
    768 	 */
    769 	if ((ufsvfsp = ip->i_ufsvfs) == NULL)
    770 		return (EIO);
    771 
    772 	fs = ip->i_fs;
    773 
    774 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
    775 
    776 	/* check for valid filetype */
    777 	type = ip->i_mode & IFMT;
    778 	if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
    779 	    (type != IFLNK) && (type != IFSHAD)) {
    780 		return (EIO);
    781 	}
    782 
    783 	/*
    784 	 * the actual limit of UFS file size
    785 	 * is UFS_MAXOFFSET_T
    786 	 */
    787 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
    788 		limit = MAXOFFSET_T;
    789 
    790 	if (uio->uio_loffset >= limit) {
    791 		proc_t *p = ttoproc(curthread);
    792 
    793 		mutex_enter(&p->p_lock);
    794 		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
    795 		    p, RCA_UNSAFE_SIGINFO);
    796 		mutex_exit(&p->p_lock);
    797 		return (EFBIG);
    798 	}
    799 
    800 	/*
    801 	 * if largefiles are disallowed, the limit is
    802 	 * the pre-largefiles value of 2GB
    803 	 */
    804 	if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
    805 		limit = MIN(UFS_MAXOFFSET_T, limit);
    806 	else
    807 		limit = MIN(MAXOFF32_T, limit);
    808 
    809 	if (uio->uio_loffset < (offset_t)0) {
    810 		return (EINVAL);
    811 	}
    812 	if (uio->uio_resid == 0) {
    813 		return (0);
    814 	}
    815 
    816 	if (uio->uio_loffset >= limit)
    817 		return (EFBIG);
    818 
    819 	ip->i_flag |= INOACC;	/* don't update ref time in getpage */
    820 
    821 	if (ioflag & (FSYNC|FDSYNC)) {
    822 		ip->i_flag |= ISYNC;
    823 		iupdat_flag = 1;
    824 	}
    825 	/*
    826 	 * Try to go direct
    827 	 */
    828 	if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
    829 		uio->uio_llimit = limit;
    830 		error = ufs_directio_write(ip, uio, ioflag, 0, cr,
    831 		    &directio_status);
    832 		/*
    833 		 * If ufs_directio wrote to the file or set the flags,
    834 		 * we need to update i_seq, but it may be deferred.
    835 		 */
    836 		if (start_resid != uio->uio_resid ||
    837 		    (ip->i_flag & (ICHG|IUPD))) {
    838 			i_seq_needed = 1;
    839 			ip->i_flag |= ISEQ;
    840 		}
    841 		if (directio_status == DIRECTIO_SUCCESS)
    842 			goto out;
    843 	}
    844 
    845 	/*
    846 	 * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
    847 	 *
    848 	 * o shadow inodes: vfs_dqrwlock is not held at all
    849 	 * o quota updates: vfs_dqrwlock is read or write held
    850 	 * o other updates: vfs_dqrwlock is read held
    851 	 *
    852 	 * The first case is the only one where we do not hold
    853 	 * vfs_dqrwlock at all while entering wrip().
    854 	 * We must make sure not to downgrade/drop vfs_dqrwlock if we
    855 	 * have it as writer, i.e. if we are updating the quota inode.
    856 	 * There is no potential deadlock scenario in this case as
    857 	 * ufs_getpage() takes care of this and avoids reacquiring
    858 	 * vfs_dqrwlock in that case.
    859 	 *
    860 	 * This check is done here since the above conditions do not change
    861 	 * and we possibly loop below, so save a few cycles.
    862 	 */
    863 	if ((type == IFSHAD) ||
    864 	    (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
    865 		do_dqrwlock = 0;
    866 	} else {
    867 		do_dqrwlock = 1;
    868 	}
    869 
    870 	/*
    871 	 * Large Files: We cast MAXBMASK to offset_t
    872 	 * in order to mask out the higher bits. Since offset_t
    873 	 * is a signed value, the high order bit set in MAXBMASK
    874 	 * value makes it do the right thing by having all bits 1
    875 	 * in the higher word. May be removed for _SOLARIS64_.
    876 	 */
    877 
    878 	fs = ip->i_fs;
    879 	do {
    880 		u_offset_t uoff = uio->uio_loffset;
    881 		off = uoff & (offset_t)MAXBMASK;
    882 		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
    883 		on = (int)blkoff(fs, uoff);
    884 		n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
    885 		new_iblocks = 1;
    886 
    887 		if (type == IFREG && uoff + n >= limit) {
    888 			if (uoff >= limit) {
    889 				error = EFBIG;
    890 				goto out;
    891 			}
    892 			/*
    893 			 * since uoff + n >= limit, it follows that
    894 			 * limit - uoff <= n, and n is an int,
    895 			 * so it is safe to cast limit - uoff to an int
    896 			 */
    897 			n = (int)(limit - (rlim64_t)uoff);
    898 		}
    899 		if (uoff + n > ip->i_size) {
    900 			/*
    901 			 * We are extending the length of the file.
    902 			 * bmap is used so that we are sure that
    903 			 * if we need to allocate new blocks, that it
    904 			 * is done here before we up the file size.
    905 			 */
    906 			error = bmap_write(ip, uoff, (int)(on + n),
    907 			    mapon == 0, NULL, cr);
    908 			/*
    909 			 * bmap_write never drops i_contents so if
    910 			 * the flags are set it changed the file.
    911 			 */
    912 			if (ip->i_flag & (ICHG|IUPD)) {
    913 				i_seq_needed = 1;
    914 				ip->i_flag |= ISEQ;
    915 			}
    916 			if (error)
    917 				break;
    918 			/*
    919 			 * There is a window of vulnerability here.
    920 			 * The sequence of operations: allocate file
    921 			 * system blocks, uiomove the data into pages,
    922 			 * and then update the size of the file in the
    923 			 * inode, must happen atomically.  However, due
    924 			 * to current locking constraints, this can not
    925 			 * be done.
    926 			 */
    927 			ASSERT(ip->i_writer == NULL);
    928 			ip->i_writer = curthread;
    929 			i_size_changed = 1;
    930 			/*
    931 			 * If we are writing from the beginning of
    932 			 * the mapping, we can just create the
    933 			 * pages without having to read them.
    934 			 */
    935 			pagecreate = (mapon == 0);
    936 		} else if (n == MAXBSIZE) {
    937 			/*
    938 			 * Going to do a whole mapping's worth,
    939 			 * so we can just create the pages w/o
    940 			 * having to read them in.  But before
    941 			 * we do that, we need to make sure any
    942 			 * needed blocks are allocated first.
    943 			 */
    944 			iblocks = ip->i_blocks;
    945 			error = bmap_write(ip, uoff, (int)(on + n),
    946 			    BI_ALLOC_ONLY, NULL, cr);
    947 			/*
    948 			 * bmap_write never drops i_contents so if
    949 			 * the flags are set it changed the file.
    950 			 */
    951 			if (ip->i_flag & (ICHG|IUPD)) {
    952 				i_seq_needed = 1;
    953 				ip->i_flag |= ISEQ;
    954 			}
    955 			if (error)
    956 				break;
    957 			pagecreate = 1;
    958 			/*
    959 			 * check if the newly created page needed the
    960 			 * allocation of new disk blocks.
    961 			 */
    962 			if (iblocks == ip->i_blocks)
    963 				new_iblocks = 0; /* no new blocks allocated */
    964 		} else {
    965 			pagecreate = 0;
    966 			/*
    967 			 * In sync mode flush the indirect blocks which
    968 			 * may have been allocated and not written on
    969 			 * disk. In above cases bmap_write will allocate
    970 			 * in sync mode.
    971 			 */
    972 			if (ioflag & (FSYNC|FDSYNC)) {
    973 				error = ufs_indirblk_sync(ip, uoff);
    974 				if (error)
    975 					break;
    976 			}
    977 		}
    978 
    979 		/*
    980 		 * At this point we can enter ufs_getpage() in one
    981 		 * of two ways:
    982 		 * 1) segmap_getmapflt() calls ufs_getpage() when the
    983 		 *    forcefault parameter is true (pagecreate == 0)
    984 		 * 2) uiomove() causes a page fault.
    985 		 *
    986 		 * We have to drop the contents lock to prevent the VM
    987 		 * system from trying to reacquire it in ufs_getpage()
    988 		 * should the uiomove cause a pagefault.
    989 		 *
    990 		 * We have to drop the reader vfs_dqrwlock here as well.
    991 		 */
    992 		rw_exit(&ip->i_contents);
    993 		if (do_dqrwlock) {
    994 			ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
    995 			ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
    996 			rw_exit(&ufsvfsp->vfs_dqrwlock);
    997 		}
    998 
    999 		newpage = 0;
   1000 		premove_resid = uio->uio_resid;
   1001 		if (vpm_enable) {
   1002 			/*
   1003 			 * Copy data. If new pages are created, part of
   1004 			 * the page that is not written will be initialized
   1005 			 * with zeros.
   1006 			 */
   1007 			error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
   1008 			    uio, !pagecreate, &newpage, 0, S_WRITE);
   1009 		} else {
   1010 
   1011 			base = segmap_getmapflt(segkmap, vp, (off + mapon),
   1012 			    (uint_t)n, !pagecreate, S_WRITE);
   1013 
   1014 			/*
   1015 			 * segmap_pagecreate() returns 1 if it calls
   1016 			 * page_create_va() to allocate any pages.
   1017 			 */
   1018 
   1019 			if (pagecreate