Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /* Portions Copyright 2007 Jeremy Teo */
     27 
     28 #pragma ident	"@(#)zfs_vnops.c	1.66	08/01/04 SMI"
     29 
     30 #include <sys/types.h>
     31 #include <sys/param.h>
     32 #include <sys/time.h>
     33 #include <sys/systm.h>
     34 #include <sys/sysmacros.h>
     35 #include <sys/resource.h>
     36 #include <sys/vfs.h>
     37 #include <sys/vfs_opreg.h>
     38 #include <sys/vnode.h>
     39 #include <sys/file.h>
     40 #include <sys/stat.h>
     41 #include <sys/kmem.h>
     42 #include <sys/taskq.h>
     43 #include <sys/uio.h>
     44 #include <sys/vmsystm.h>
     45 #include <sys/atomic.h>
     46 #include <sys/vm.h>
     47 #include <vm/seg_vn.h>
     48 #include <vm/pvn.h>
     49 #include <vm/as.h>
     50 #include <sys/mman.h>
     51 #include <sys/pathname.h>
     52 #include <sys/cmn_err.h>
     53 #include <sys/errno.h>
     54 #include <sys/unistd.h>
     55 #include <sys/zfs_dir.h>
     56 #include <sys/zfs_acl.h>
     57 #include <sys/zfs_ioctl.h>
     58 #include <sys/fs/zfs.h>
     59 #include <sys/dmu.h>
     60 #include <sys/spa.h>
     61 #include <sys/txg.h>
     62 #include <sys/dbuf.h>
     63 #include <sys/zap.h>
     64 #include <sys/dirent.h>
     65 #include <sys/policy.h>
     66 #include <sys/sunddi.h>
     67 #include <sys/filio.h>
     68 #include "fs/fs_subr.h"
     69 #include <sys/zfs_ctldir.h>
     70 #include <sys/zfs_fuid.h>
     71 #include <sys/dnlc.h>
     72 #include <sys/zfs_rlock.h>
     73 #include <sys/extdirent.h>
     74 #include <sys/kidmap.h>
     75 #include <sys/cred_impl.h>
     76 #include <sys/attr.h>
     77 
     78 /*
     79  * Programming rules.
     80  *
     81  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
     82  * properly lock its in-core state, create a DMU transaction, do the work,
     83  * record this work in the intent log (ZIL), commit the DMU transaction,
     84  * and wait for the intent log to commit if it is a synchronous operation.
     85  * Moreover, the vnode ops must work in both normal and log replay context.
     86  * The ordering of events is important to avoid deadlocks and references
     87  * to freed memory.  The example below illustrates the following Big Rules:
     88  *
     89  *  (1) A check must be made in each zfs thread for a mounted file system.
     90  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
     91  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
     92  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
     93  *      can return EIO from the calling function.
     94  *
     95  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
     96  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
     97  *	First, if it's the last reference, the vnode/znode
     98  *	can be freed, so the zp may point to freed memory.  Second, the last
     99  *	reference will call zfs_zinactive(), which may induce a lot of work --
    100  *	pushing cached pages (which acquires range locks) and syncing out
    101  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
    102  *	which could deadlock the system if you were already holding one.
    103  *
    104  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
    105  *	as they can span dmu_tx_assign() calls.
    106  *
    107  *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
    108  *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
    109  *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
    110  *	This is critical because we don't want to block while holding locks.
    111  *	Note, in particular, that if a lock is sometimes acquired before
    112  *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
    113  *	use a non-blocking assign can deadlock the system.  The scenario:
    114  *
    115  *	Thread A has grabbed a lock before calling dmu_tx_assign().
    116  *	Thread B is in an already-assigned tx, and blocks for this lock.
    117  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
    118  *	forever, because the previous txg can't quiesce until B's tx commits.
    119  *
    120  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
    121  *	then drop all locks, call dmu_tx_wait(), and try again.
    122  *
    123  *  (5)	If the operation succeeded, generate the intent log entry for it
    124  *	before dropping locks.  This ensures that the ordering of events
    125  *	in the intent log matches the order in which they actually occurred.
    126  *
    127  *  (6)	At the end of each vnode op, the DMU tx must always commit,
    128  *	regardless of whether there were any errors.
    129  *
    130  *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
    131  *	to ensure that synchronous semantics are provided when necessary.
    132  *
    133  * In general, this is how things should be ordered in each vnode op:
    134  *
    135  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
    136  * top:
    137  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
    138  *	rw_enter(...);			// grab any other locks you need
    139  *	tx = dmu_tx_create(...);	// get DMU tx
    140  *	dmu_tx_hold_*();		// hold each object you might modify
    141  *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
    142  *	if (error) {
    143  *		rw_exit(...);		// drop locks
    144  *		zfs_dirent_unlock(dl);	// unlock directory entry
    145  *		VN_RELE(...);		// release held vnodes
    146  *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
    147  *			dmu_tx_wait(tx);
    148  *			dmu_tx_abort(tx);
    149  *			goto top;
    150  *		}
    151  *		dmu_tx_abort(tx);	// abort DMU tx
    152  *		ZFS_EXIT(zfsvfs);	// finished in zfs
    153  *		return (error);		// really out of space
    154  *	}
    155  *	error = do_real_work();		// do whatever this VOP does
    156  *	if (error == 0)
    157  *		zfs_log_*(...);		// on success, make ZIL entry
    158  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
    159  *	rw_exit(...);			// drop locks
    160  *	zfs_dirent_unlock(dl);		// unlock directory entry
    161  *	VN_RELE(...);			// release held vnodes
    162  *	zil_commit(zilog, seq, foid);	// synchronous when necessary
    163  *	ZFS_EXIT(zfsvfs);		// finished in zfs
    164  *	return (error);			// done, report error
    165  */
    166 
    167 /* ARGSUSED */
    168 static int
    169 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
    170 {
    171 	znode_t	*zp = VTOZ(*vpp);
    172 
    173 	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
    174 	    ((flag & FAPPEND) == 0)) {
    175 		return (EPERM);
    176 	}
    177 
    178 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
    179 	    ZTOV(zp)->v_type == VREG &&
    180 	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
    181 	    zp->z_phys->zp_size > 0)
    182 		if (fs_vscan(*vpp, cr, 0) != 0)
    183 			return (EACCES);
    184 
    185 	/* Keep a count of the synchronous opens in the znode */
    186 	if (flag & (FSYNC | FDSYNC))
    187 		atomic_inc_32(&zp->z_sync_cnt);
    188 
    189 	return (0);
    190 }
    191 
    192 /* ARGSUSED */
    193 static int
    194 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    195     caller_context_t *ct)
    196 {
    197 	znode_t	*zp = VTOZ(vp);
    198 
    199 	/* Decrement the synchronous opens in the znode */
    200 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
    201 		atomic_dec_32(&zp->z_sync_cnt);
    202 
    203 	/*
    204 	 * Clean up any locks held by this process on the vp.
    205 	 */
    206 	cleanlocks(vp, ddi_get_pid(), 0);
    207 	cleanshares(vp, ddi_get_pid());
    208 
    209 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
    210 	    ZTOV(zp)->v_type == VREG &&
    211 	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
    212 	    zp->z_phys->zp_size > 0)
    213 		VERIFY(fs_vscan(vp, cr, 1) == 0);
    214 
    215 	return (0);
    216 }
    217 
    218 /*
    219  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
    220  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
    221  */
    222 static int
    223 zfs_holey(vnode_t *vp, int cmd, offset_t *off)
    224 {
    225 	znode_t	*zp = VTOZ(vp);
    226 	uint64_t noff = (uint64_t)*off; /* new offset */
    227 	uint64_t file_sz;
    228 	int error;
    229 	boolean_t hole;
    230 
    231 	file_sz = zp->z_phys->zp_size;
    232 	if (noff >= file_sz)  {
    233 		return (ENXIO);
    234 	}
    235 
    236 	if (cmd == _FIO_SEEK_HOLE)
    237 		hole = B_TRUE;
    238 	else
    239 		hole = B_FALSE;
    240 
    241 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
    242 
    243 	/* end of file? */
    244 	if ((error == ESRCH) || (noff > file_sz)) {
    245 		/*
    246 		 * Handle the virtual hole at the end of file.
    247 		 */
    248 		if (hole) {
    249 			*off = file_sz;
    250 			return (0);
    251 		}
    252 		return (ENXIO);
    253 	}
    254 
    255 	if (noff < *off)
    256 		return (error);
    257 	*off = noff;
    258 	return (error);
    259 }
    260 
    261 /* ARGSUSED */
    262 static int
    263 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    264     int *rvalp, caller_context_t *ct)
    265 {
    266 	offset_t off;
    267 	int error;
    268 	zfsvfs_t *zfsvfs;
    269 	znode_t *zp;
    270 
    271 	switch (com) {
    272 	case _FIOFFS:
    273 		return (zfs_sync(vp->v_vfsp, 0, cred));
    274 
    275 		/*
    276 		 * The following two ioctls are used by bfu.  Faking out,
    277 		 * necessary to avoid bfu errors.
    278 		 */
    279 	case _FIOGDIO:
    280 	case _FIOSDIO:
    281 		return (0);
    282 
    283 	case _FIO_SEEK_DATA:
    284 	case _FIO_SEEK_HOLE:
    285 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
    286 			return (EFAULT);
    287 
    288 		zp = VTOZ(vp);
    289 		zfsvfs = zp->z_zfsvfs;
    290 		ZFS_ENTER(zfsvfs);
    291 		ZFS_VERIFY_ZP(zp);
    292 
    293 		/* offset parameter is in/out */
    294 		error = zfs_holey(vp, com, &off);
    295 		ZFS_EXIT(zfsvfs);
    296 		if (error)
    297 			return (error);
    298 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
    299 			return (EFAULT);
    300 		return (0);
    301 	}
    302 	return (ENOTTY);
    303 }
    304 
    305 /*
    306  * When a file is memory mapped, we must keep the IO data synchronized
    307  * between the DMU cache and the memory mapped pages.  What this means:
    308  *
    309  * On Write:	If we find a memory mapped page, we write to *both*
    310  *		the page and the dmu buffer.
    311  *
    312  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
    313  *	the file is memory mapped.
    314  */
    315 static int
    316 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
    317 {
    318 	znode_t	*zp = VTOZ(vp);
    319 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    320 	int64_t	start, off;
    321 	int len = nbytes;
    322 	int error = 0;
    323 
    324 	start = uio->uio_loffset;
    325 	off = start & PAGEOFFSET;
    326 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
    327 		page_t *pp;
    328 		uint64_t bytes = MIN(PAGESIZE - off, len);
    329 		uint64_t woff = uio->uio_loffset;
    330 
    331 		/*
    332 		 * We don't want a new page to "appear" in the middle of
    333 		 * the file update (because it may not get the write
    334 		 * update data), so we grab a lock to block
    335 		 * zfs_getpage().
    336 		 */
    337 		rw_enter(&zp->z_map_lock, RW_WRITER);
    338 		if (pp = page_lookup(vp, start, SE_SHARED)) {
    339 			caddr_t va;
    340 
    341 			rw_exit(&zp->z_map_lock);
    342 			va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1L);
    343 			error = uiomove(va+off, bytes, UIO_WRITE, uio);
    344 			if (error == 0) {
    345 				dmu_write(zfsvfs->z_os, zp->z_id,
    346 				    woff, bytes, va+off, tx);
    347 			}
    348 			ppmapout(va);
    349 			page_unlock(pp);
    350 		} else {
    351 			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
    352 			    uio, bytes, tx);
    353 			rw_exit(&zp->z_map_lock);
    354 		}
    355 		len -= bytes;
    356 		off = 0;
    357 		if (error)
    358 			break;
    359 	}
    360 	return (error);
    361 }
    362 
    363 /*
    364  * When a file is memory mapped, we must keep the IO data synchronized
    365  * between the DMU cache and the memory mapped pages.  What this means:
    366  *
    367  * On Read:	We "read" preferentially from memory mapped pages,
    368  *		else we default from the dmu buffer.
    369  *
    370  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
    371  *	the file is memory mapped.
    372  */
    373 static int
    374 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
    375 {
    376 	znode_t *zp = VTOZ(vp);
    377 	objset_t *os = zp->z_zfsvfs->z_os;
    378 	int64_t	start, off;
    379 	int len = nbytes;
    380 	int error = 0;
    381 
    382 	start = uio->uio_loffset;
    383 	off = start & PAGEOFFSET;
    384 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
    385 		page_t *pp;
    386 		uint64_t bytes = MIN(PAGESIZE - off, len);
    387 
    388 		if (pp = page_lookup(vp, start, SE_SHARED)) {
    389 			caddr_t va;
    390 
    391 			va = ppmapin(pp, PROT_READ, (caddr_t)-1L);
    392 			error = uiomove(va + off, bytes, UIO_READ, uio);
    393 			ppmapout(va);
    394 			page_unlock(pp);
    395 		} else {
    396 			error = dmu_read_uio(os, zp->z_id, uio, bytes);
    397 		}
    398 		len -= bytes;
    399 		off = 0;
    400 		if (error)
    401 			break;
    402 	}
    403 	return (error);
    404 }
    405 
/*
 * Largest number of bytes zfs_read() transfers per loop iteration.
 * Each chunk is additionally shortened by P2PHASE(offset, chunk size)
 * so that it stops at a chunk-size boundary.
 * NOTE(review): P2PHASE presumably requires this to be a power of two
 * -- confirm before tuning to other values.
 */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
    407 
    408 /*
    409  * Read bytes from specified file into supplied buffer.
    410  *
    411  *	IN:	vp	- vnode of file to be read from.
    412  *		uio	- structure supplying read location, range info,
    413  *			  and return buffer.
    414  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
    415  *		cr	- credentials of caller.
    416  *		ct	- caller context
    417  *
    418  *	OUT:	uio	- updated offset and range, buffer filled.
    419  *
    420  *	RETURN:	0 if success
    421  *		error code if failure
    422  *
    423  * Side Effects:
    424  *	vp - atime updated if byte count > 0
    425  */
    426 /* ARGSUSED */
    427 static int
    428 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
    429 {
    430 	znode_t		*zp = VTOZ(vp);
    431 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
    432 	objset_t	*os;
    433 	ssize_t		n, nbytes;
    434 	int		error;
    435 	rl_t		*rl;
    436 
    437 	ZFS_ENTER(zfsvfs);
    438 	ZFS_VERIFY_ZP(zp);
    439 	os = zfsvfs->z_os;
    440 
    441 	/*
    442 	 * Validate file offset
    443 	 */
    444 	if (uio->uio_loffset < (offset_t)0) {
    445 		ZFS_EXIT(zfsvfs);
    446 		return (EINVAL);
    447 	}
    448 
    449 	/*
    450 	 * Fasttrack empty reads
    451 	 */
    452 	if (uio->uio_resid == 0) {
    453 		ZFS_EXIT(zfsvfs);
    454 		return (0);
    455 	}
    456 
    457 	/*
    458 	 * Check for mandatory locks
    459 	 */
    460 	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
    461 		if (error = chklock(vp, FREAD,
    462 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
    463 			ZFS_EXIT(zfsvfs);
    464 			return (error);
    465 		}
    466 	}
    467 
    468 	/*
    469 	 * If we're in FRSYNC mode, sync out this znode before reading it.
    470 	 */
    471 	if (ioflag & FRSYNC)
    472 		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
    473 
    474 	/*
    475 	 * Lock the range against changes.
    476 	 */
    477 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
    478 
    479 	/*
    480 	 * If we are reading past end-of-file we can skip
    481 	 * to the end; but we might still need to set atime.
    482 	 */
    483 	if (uio->uio_loffset >= zp->z_phys->zp_size) {
    484 		error = 0;
    485 		goto out;
    486 	}
    487 
    488 	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
    489 	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
    490 
    491 	while (n > 0) {
    492 		nbytes = MIN(n, zfs_read_chunk_size -
    493 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
    494 
    495 		if (vn_has_cached_data(vp))
    496 			error = mappedread(vp, nbytes, uio);
    497 		else
    498 			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
    499 		if (error)
    500 			break;
    501 
    502 		n -= nbytes;
    503 	}
    504 
    505 out:
    506 	zfs_range_unlock(rl);
    507 
    508 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
    509 	ZFS_EXIT(zfsvfs);
    510 	return (error);
    511 }
    512 
    513 /*
    514  * Fault in the pages of the first n bytes specified by the uio structure.
    515  * 1 byte in each page is touched and the uio struct is unmodified.
    516  * Any error will exit this routine as this is only a best
    517  * attempt to get the pages resident. This is a copy of ufs_trans_touch().
    518  */
    519 static void
    520 zfs_prefault_write(ssize_t n, struct uio *uio)
    521 {
    522 	struct iovec *iov;
    523 	ulong_t cnt, incr;
    524 	caddr_t p;
    525 	uint8_t tmp;
    526 
    527 	iov = uio->uio_iov;
    528 
    529 	while (n) {
    530 		cnt = MIN(iov->iov_len, n);
    531 		if (cnt == 0) {
    532 			/* empty iov entry */
    533 			iov++;
    534 			continue;
    535 		}
    536 		n -= cnt;
    537 		/*
    538 		 * touch each page in this segment.
    539 		 */
    540 		p = iov->iov_base;
    541 		while (cnt) {
    542 			switch (uio->uio_segflg) {
    543 			case UIO_USERSPACE:
    544 			case UIO_USERISPACE:
    545 				if (fuword8(p, &tmp))
    546 					return;
    547 				break;
    548 			case UIO_SYSSPACE:
    549 				if (kcopy(p, &tmp, 1))
    550 					return;
    551 				break;
    552 			}
    553 			incr = MIN(cnt, PAGESIZE);
    554 			p += incr;
    555 			cnt -= incr;
    556 		}
    557 		/*
    558 		 * touch the last byte in case it straddles a page.
    559 		 */
    560 		p--;
    561 		switch (uio->uio_segflg) {
    562 		case UIO_USERSPACE:
    563 		case UIO_USERISPACE:
    564 			if (fuword8(p, &tmp))
    565 				return;
    566 			break;
    567 		case UIO_SYSSPACE:
    568 			if (kcopy(p, &tmp, 1))
    569 				return;
    570 			break;
    571 		}
    572 		iov++;
    573 	}
    574 }
    575 
/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND flag set if in append mode.
 *			  FSYNC/FDSYNC cause the intent log to be
 *			  committed before returning.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success (including a partial write)
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	uint64_t	pflags = zp->z_phys->zp_flags;
	int		error;

	/*
	 * If immutable or not appending then return EPERM
	 *
	 * NOTE(review): pflags and zp_size are read from z_phys before
	 * ZFS_ENTER()/ZFS_VERIFY_ZP() below -- confirm this is safe.
	 */
	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_phys->zp_size)))
		return (EPERM);

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	/* Cap the caller's file size limit at the largest valid offset. */
	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	/* Starting at or beyond the size limit: nothing can be written. */
	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	/* Trim the request so that it ends at the size limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	/* File size once the whole (trimmed) request has been written. */
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		/*
		 * Start a transaction.
		 */
		woff = uio->uio_loffset;
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				/*
				 * Wait, then retry this same chunk with a
				 * fresh tx; n and the uio are unchanged.
				 */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
		rw_enter(&zp->z_map_lock, RW_READER);

		/*
		 * tx_bytes is computed as the change in uio_resid across
		 * the write, i.e. the bytes actually consumed from the uio.
		 */
		tx_bytes = uio->uio_resid;
		if (vn_has_cached_data(vp)) {
			/* mappedwrite() takes z_map_lock itself, as writer. */
			rw_exit(&zp->z_map_lock);
			error = mappedwrite(vp, nbytes, uio, tx);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, nbytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		tx_bytes -= uio->uio_resid;

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		/* A partial chunk has been logged above; stop on error. */
		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Synchronous write: commit the intent log before returning. */
	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}
    831 
    832 void
    833 zfs_get_done(dmu_buf_t *db, void *vzgd)
    834 {
    835 	zgd_t *zgd = (zgd_t *)vzgd;
    836 	rl_t *rl = zgd->zgd_rl;
    837 	vnode_t *vp = ZTOV(rl->r_zp);
    838 
    839 	dmu_buf_rele(db, vzgd);
    840 	zfs_range_unlock(rl);
    841 	VN_RELE(vp);
    842 	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
    843 	kmem_free(zgd, sizeof (zgd_t));
    844 }
    845 
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 *	IN:	arg	- the zfsvfs_t of the file system.
 *		lr	- write log record; supplies the object id,
 *			  offset and length of the write.
 *		buf	- destination for the data of an immediate
 *			  write, or NULL for an indirect write.
 *		zio	- passed through to dmu_sync() for indirect
 *			  writes.
 *
 *	OUT:	buf	- filled with the file data (immediate write).
 *		lr	- lr_blkoff and lr_blkptr set (indirect write).
 *
 *	RETURN:	0 if success, or if dmu_sync() returned EINPROGRESS
 *		    (cleanup is then finished by zfs_get_done())
 *		ENOENT if the file no longer exists, has been unlinked,
 *		    or has been truncated below the record's offset
 *		error code from dmu_sync() otherwise
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;		/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		VN_RELE(ZTOV(zp));
		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				/* non-power-of-2 size: lock from offset 0 */
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			if (zp->z_blksz == dlen)
				break;
			/* block size changed while locking; try again */
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		/*
		 * The zgd carries what zfs_get_done() needs to finish
		 * up; it is also used as the tag for the dbuf hold.
		 */
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT((error && error != EINPROGRESS) ||
		    lr->lr_length <= zp->z_blksz);
		if (error == 0)
			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf.  We will finish everything
		 * up in the zfs_get_done() callback.
		 */
		if (error == EINPROGRESS)
			return (0);
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	/* Common exit: drop the range lock and the hold from zfs_zget(). */
	zfs_range_unlock(rl);
	VN_RELE(ZTOV(zp));
	return (error);
}
    946 
    947 /*ARGSUSED*/
    948 static int
    949 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    950     caller_context_t *ct)
    951 {
    952 	znode_t *zp = VTOZ(vp);
    953 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    954 	int error;
    955 
    956 	ZFS_ENTER(zfsvfs);
    957 	ZFS_VERIFY_ZP(zp);
    958 
    959 	if (flag & V_ACE_MASK)
    960 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
    961 	else
    962 		error = zfs_zaccess_rwx(zp, mode, flag, cr);
    963 
    964 	ZFS_EXIT(zfsvfs);
    965 	return (error);
    966 }
    967 
    968 /*
    969  * Lookup an entry in a directory, or an extended attribute directory.
    970  * If it exists, return a held vnode reference for it.
    971  *
    972  *	IN:	dvp	- vnode of directory to search.
    973  *		nm	- name of entry to lookup.
    974  *		pnp	- full pathname to lookup [UNUSED].
    975  *		flags	- LOOKUP_XATTR set if looking for an attribute.
    976  *		rdir	- root directory vnode [UNUSED].
    977  *		cr	- credentials of caller.
    978  *		ct	- caller context
    979  *		direntflags - directory lookup flags
    980  *		realpnp - returned pathname.
    981  *
    982  *	OUT:	vpp	- vnode of located entry, NULL if not found.
    983  *
    984  *	RETURN:	0 if success
    985  *		error code if failure
    986  *
    987  * Timestamps:
    988  *	NA
    989  */
    990 /* ARGSUSED */
    991 static int
    992 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    993     int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
    994     int *direntflags, pathname_t *realpnp)
    995 {
    996 	znode_t *zdp = VTOZ(dvp);
    997 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
    998 	int	error;
    999 
   1000 	ZFS_ENTER(zfsvfs);
   1001 	ZFS_VERIFY_ZP(zdp);
   1002 
   1003 	*vpp = NULL;
   1004 
   1005 	if (flags & LOOKUP_XATTR) {
   1006 		/*
   1007 		 * If the xattr property is off, refuse the lookup request.
   1008 		 */
   1009 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
   1010 			ZFS_EXIT(zfsvfs);
   1011 			return (EINVAL);
   1012 		}
   1013 
   1014 		/*
   1015 		 * We don't allow recursive attributes..
   1016 		 * Maybe someday we will.
   1017 		 */
   1018 		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
   1019 			ZFS_EXIT(zfsvfs);
   1020 			return (EINVAL);
   1021 		}
   1022 
   1023 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
   1024 			ZFS_EXIT(zfsvfs);
   1025 			return (error);
   1026 		}
   1027 
   1028 		/*
   1029 		 * Do we have permission to get into attribute directory?
   1030 		 */
   1031 
   1032 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
   1033 		    B_FALSE, cr)) {
   1034 			VN_RELE(*vpp);
   1035 			*vpp = NULL;
   1036 		}
   1037 
   1038 		ZFS_EXIT(zfsvfs);
   1039 		return (error);
   1040 	}
   1041 
   1042 	if (dvp->v_type != VDIR) {
   1043 		ZFS_EXIT(zfsvfs);
   1044 		return (ENOTDIR);
   1045 	}
   1046 
   1047 	/*
   1048 	 * Check accessibility of directory.
   1049 	 */
   1050 
   1051 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
   1052 		ZFS_EXIT(zfsvfs);
   1053 		return (error);
   1054 	}
   1055 
   1056 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
   1057 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
   1058 		ZFS_EXIT(zfsvfs);
   1059 		return (EILSEQ);
   1060 	}
   1061 
   1062 	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
   1063 	if (error == 0) {
   1064 		/*
   1065 		 * Convert device special files
   1066 		 */
   1067 		if (IS_DEVVP(*vpp)) {
   1068 			vnode_t	*svp;
   1069 
   1070 			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
   1071 			VN_RELE(*vpp);
   1072 			if (svp == NULL)
   1073 				error = ENOSYS;
   1074 			else
   1075 				*vpp = svp;
   1076 		}
   1077 	}
   1078 
   1079 	ZFS_EXIT(zfsvfs);
   1080 	return (error);
   1081 }
   1082 
   1083 /*
   1084  * Attempt to create a new entry in a directory.  If the entry
   1085  * already exists, truncate the file if permissible, else return
   1086  * an error.  Return the vp of the created or trunc'd file.
   1087  *
   1088  *	IN:	dvp	- vnode of directory to put new file entry in.
   1089  *		name	- name of new file entry.
   1090  *		vap	- attributes of new file.
   1091  *		excl	- flag indicating exclusive or non-exclusive mode.
   1092  *		mode	- mode to open file with.
   1093  *		cr	- credentials of caller.
   1094  *		flag	- large file flag [UNUSED].
   1095  *		ct	- caller context
   1096  *		vsecp 	- ACL to be set
   1097  *
   1098  *	OUT:	vpp	- vnode of created or trunc'd entry.
   1099  *
   1100  *	RETURN:	0 if success
   1101  *		error code if failure
   1102  *
   1103  * Timestamps:
   1104  *	dvp - ctime|mtime updated if new entry created
   1105  *	 vp - ctime|mtime always, atime if new
   1106  */
   1107 
   1108 /* ARGSUSED */
   1109 static int
   1110 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
   1111     int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
   1112     vsecattr_t *vsecp)
   1113 {
   1114 	znode_t		*zp, *dzp = VTOZ(dvp);
   1115 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
   1116 	zilog_t		*zilog;
   1117 	objset_t	*os;
   1118 	zfs_dirlock_t	*dl;
   1119 	dmu_tx_t	*tx;
   1120 	int		error;
   1121 	zfs_acl_t	*aclp = NULL;
   1122 	zfs_fuid_info_t *fuidp = NULL;
   1123 
   1124 	/*
   1125 	 * If we have an ephemeral id, ACL, or XVATTR then
   1126 	 * make sure file system is at proper version
   1127 	 */
   1128 
   1129 	if (zfsvfs->z_use_fuids == B_FALSE &&
   1130 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
   1131 	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
   1132 		return (EINVAL);
   1133 
   1134 	ZFS_ENTER(z