Home | History | Annotate | Download | only in zfs
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /* Portions Copyright 2007 Jeremy Teo */
     27 
     28 #include <sys/types.h>
     29 #include <sys/param.h>
     30 #include <sys/time.h>
     31 #include <sys/systm.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/resource.h>
     34 #include <sys/vfs.h>
     35 #include <sys/vfs_opreg.h>
     36 #include <sys/vnode.h>
     37 #include <sys/file.h>
     38 #include <sys/stat.h>
     39 #include <sys/kmem.h>
     40 #include <sys/taskq.h>
     41 #include <sys/uio.h>
     42 #include <sys/vmsystm.h>
     43 #include <sys/atomic.h>
     44 #include <sys/vm.h>
     45 #include <vm/seg_vn.h>
     46 #include <vm/pvn.h>
     47 #include <vm/as.h>
     48 #include <vm/kpm.h>
     49 #include <vm/seg_kpm.h>
     50 #include <sys/mman.h>
     51 #include <sys/pathname.h>
     52 #include <sys/cmn_err.h>
     53 #include <sys/errno.h>
     54 #include <sys/unistd.h>
     55 #include <sys/zfs_dir.h>
     56 #include <sys/zfs_acl.h>
     57 #include <sys/zfs_ioctl.h>
     58 #include <sys/fs/zfs.h>
     59 #include <sys/dmu.h>
     60 #include <sys/spa.h>
     61 #include <sys/txg.h>
     62 #include <sys/dbuf.h>
     63 #include <sys/zap.h>
     64 #include <sys/dirent.h>
     65 #include <sys/policy.h>
     66 #include <sys/sunddi.h>
     67 #include <sys/filio.h>
     68 #include "fs/fs_subr.h"
     69 #include <sys/zfs_ctldir.h>
     70 #include <sys/zfs_fuid.h>
     71 #include <sys/dnlc.h>
     72 #include <sys/zfs_rlock.h>
     73 #include <sys/extdirent.h>
     74 #include <sys/kidmap.h>
     75 #include <sys/cred_impl.h>
     76 #include <sys/attr.h>
     77 
     78 /*
     79  * Programming rules.
     80  *
     81  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
     82  * properly lock its in-core state, create a DMU transaction, do the work,
     83  * record this work in the intent log (ZIL), commit the DMU transaction,
     84  * and wait for the intent log to commit if it is a synchronous operation.
     85  * Moreover, the vnode ops must work in both normal and log replay context.
     86  * The ordering of events is important to avoid deadlocks and references
     87  * to freed memory.  The example below illustrates the following Big Rules:
     88  *
     89  *  (1) A check must be made in each zfs thread for a mounted file system.
     90  *	This is done avoiding races using ZFS_ENTER(zfsvfs).
     91  *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
     92  *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
     93  *      can return EIO from the calling function.
     94  *
     95  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
     96  *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
     97  *	First, if it's the last reference, the vnode/znode
     98  *	can be freed, so the zp may point to freed memory.  Second, the last
     99  *	reference will call zfs_zinactive(), which may induce a lot of work --
    100  *	pushing cached pages (which acquires range locks) and syncing out
    101  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
    102  *	which could deadlock the system if you were already holding one.
    103  *
    104  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
    105  *	as they can span dmu_tx_assign() calls.
    106  *
    107  *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
    108  *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
    109  *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
    110  *	This is critical because we don't want to block while holding locks.
    111  *	Note, in particular, that if a lock is sometimes acquired before
    112  *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
    113  *	use a non-blocking assign can deadlock the system.  The scenario:
    114  *
    115  *	Thread A has grabbed a lock before calling dmu_tx_assign().
    116  *	Thread B is in an already-assigned tx, and blocks for this lock.
    117  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
    118  *	forever, because the previous txg can't quiesce until B's tx commits.
    119  *
    120  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
    121  *	then drop all locks, call dmu_tx_wait(), and try again.
    122  *
    123  *  (5)	If the operation succeeded, generate the intent log entry for it
    124  *	before dropping locks.  This ensures that the ordering of events
    125  *	in the intent log matches the order in which they actually occurred.
    126  *
    127  *  (6)	At the end of each vnode op, the DMU tx must always commit,
    128  *	regardless of whether there were any errors.
    129  *
    130  *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
    131  *	to ensure that synchronous semantics are provided when necessary.
    132  *
    133  * In general, this is how things should be ordered in each vnode op:
    134  *
    135  *	ZFS_ENTER(zfsvfs);		// exit if unmounted
    136  * top:
    137  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
    138  *	rw_enter(...);			// grab any other locks you need
    139  *	tx = dmu_tx_create(...);	// get DMU tx
    140  *	dmu_tx_hold_*();		// hold each object you might modify
    141  *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
    142  *	if (error) {
    143  *		rw_exit(...);		// drop locks
    144  *		zfs_dirent_unlock(dl);	// unlock directory entry
    145  *		VN_RELE(...);		// release held vnodes
    146  *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
    147  *			dmu_tx_wait(tx);
    148  *			dmu_tx_abort(tx);
    149  *			goto top;
    150  *		}
    151  *		dmu_tx_abort(tx);	// abort DMU tx
    152  *		ZFS_EXIT(zfsvfs);	// finished in zfs
    153  *		return (error);		// really out of space
    154  *	}
    155  *	error = do_real_work();		// do whatever this VOP does
    156  *	if (error == 0)
    157  *		zfs_log_*(...);		// on success, make ZIL entry
    158  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
    159  *	rw_exit(...);			// drop locks
    160  *	zfs_dirent_unlock(dl);		// unlock directory entry
    161  *	VN_RELE(...);			// release held vnodes
    162  *	zil_commit(zilog, seq, foid);	// synchronous when necessary
    163  *	ZFS_EXIT(zfsvfs);		// finished in zfs
    164  *	return (error);			// done, report error
    165  */
    166 
    167 /* ARGSUSED */
    168 static int
    169 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
    170 {
    171 	znode_t	*zp = VTOZ(*vpp);
    172 
    173 	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
    174 	    ((flag & FAPPEND) == 0)) {
    175 		return (EPERM);
    176 	}
    177 
    178 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
    179 	    ZTOV(zp)->v_type == VREG &&
    180 	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
    181 	    zp->z_phys->zp_size > 0)
    182 		if (fs_vscan(*vpp, cr, 0) != 0)
    183 			return (EACCES);
    184 
    185 	/* Keep a count of the synchronous opens in the znode */
    186 	if (flag & (FSYNC | FDSYNC))
    187 		atomic_inc_32(&zp->z_sync_cnt);
    188 
    189 	return (0);
    190 }
    191 
    192 /* ARGSUSED */
    193 static int
    194 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    195     caller_context_t *ct)
    196 {
    197 	znode_t	*zp = VTOZ(vp);
    198 
    199 	/* Decrement the synchronous opens in the znode */
    200 	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
    201 		atomic_dec_32(&zp->z_sync_cnt);
    202 
    203 	/*
    204 	 * Clean up any locks held by this process on the vp.
    205 	 */
    206 	cleanlocks(vp, ddi_get_pid(), 0);
    207 	cleanshares(vp, ddi_get_pid());
    208 
    209 	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
    210 	    ZTOV(zp)->v_type == VREG &&
    211 	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
    212 	    zp->z_phys->zp_size > 0)
    213 		VERIFY(fs_vscan(vp, cr, 1) == 0);
    214 
    215 	return (0);
    216 }
    217 
    218 /*
    219  * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
    220  * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
    221  */
    222 static int
    223 zfs_holey(vnode_t *vp, int cmd, offset_t *off)
    224 {
    225 	znode_t	*zp = VTOZ(vp);
    226 	uint64_t noff = (uint64_t)*off; /* new offset */
    227 	uint64_t file_sz;
    228 	int error;
    229 	boolean_t hole;
    230 
    231 	file_sz = zp->z_phys->zp_size;
    232 	if (noff >= file_sz)  {
    233 		return (ENXIO);
    234 	}
    235 
    236 	if (cmd == _FIO_SEEK_HOLE)
    237 		hole = B_TRUE;
    238 	else
    239 		hole = B_FALSE;
    240 
    241 	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
    242 
    243 	/* end of file? */
    244 	if ((error == ESRCH) || (noff > file_sz)) {
    245 		/*
    246 		 * Handle the virtual hole at the end of file.
    247 		 */
    248 		if (hole) {
    249 			*off = file_sz;
    250 			return (0);
    251 		}
    252 		return (ENXIO);
    253 	}
    254 
    255 	if (noff < *off)
    256 		return (error);
    257 	*off = noff;
    258 	return (error);
    259 }
    260 
    261 /* ARGSUSED */
    262 static int
    263 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    264     int *rvalp, caller_context_t *ct)
    265 {
    266 	offset_t off;
    267 	int error;
    268 	zfsvfs_t *zfsvfs;
    269 	znode_t *zp;
    270 
    271 	switch (com) {
    272 	case _FIOFFS:
    273 		return (zfs_sync(vp->v_vfsp, 0, cred));
    274 
    275 		/*
    276 		 * The following two ioctls are used by bfu.  Faking out,
    277 		 * necessary to avoid bfu errors.
    278 		 */
    279 	case _FIOGDIO:
    280 	case _FIOSDIO:
    281 		return (0);
    282 
    283 	case _FIO_SEEK_DATA:
    284 	case _FIO_SEEK_HOLE:
    285 		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
    286 			return (EFAULT);
    287 
    288 		zp = VTOZ(vp);
    289 		zfsvfs = zp->z_zfsvfs;
    290 		ZFS_ENTER(zfsvfs);
    291 		ZFS_VERIFY_ZP(zp);
    292 
    293 		/* offset parameter is in/out */
    294 		error = zfs_holey(vp, com, &off);
    295 		ZFS_EXIT(zfsvfs);
    296 		if (error)
    297 			return (error);
    298 		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
    299 			return (EFAULT);
    300 		return (0);
    301 	}
    302 	return (ENOTTY);
    303 }
    304 
    305 /*
    306  * Utility functions to map and unmap a single physical page.  These
    307  * are used to manage the mappable copies of ZFS file data, and therefore
    308  * do not update ref/mod bits.
    309  */
    310 caddr_t
    311 zfs_map_page(page_t *pp, enum seg_rw rw)
    312 {
    313 	if (kpm_enable)
    314 		return (hat_kpm_mapin(pp, 0));
    315 	ASSERT(rw == S_READ || rw == S_WRITE);
    316 	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
    317 	    (caddr_t)-1));
    318 }
    319 
    320 void
    321 zfs_unmap_page(page_t *pp, caddr_t addr)
    322 {
    323 	if (kpm_enable) {
    324 		hat_kpm_mapout(pp, 0, addr);
    325 	} else {
    326 		ppmapout(addr);
    327 	}
    328 }
    329 
    330 /*
    331  * When a file is memory mapped, we must keep the IO data synchronized
    332  * between the DMU cache and the memory mapped pages.  What this means:
    333  *
    334  * On Write:	If we find a memory mapped page, we write to *both*
    335  *		the page and the dmu buffer.
    336  *
    337  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
    338  *	the file is memory mapped.
    339  */
    340 static int
    341 mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
    342 {
    343 	znode_t	*zp = VTOZ(vp);
    344 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    345 	int64_t	start, off;
    346 	int len = nbytes;
    347 	int error = 0;
    348 
    349 	start = uio->uio_loffset;
    350 	off = start & PAGEOFFSET;
    351 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
    352 		page_t *pp;
    353 		uint64_t bytes = MIN(PAGESIZE - off, len);
    354 		uint64_t woff = uio->uio_loffset;
    355 
    356 		/*
    357 		 * We don't want a new page to "appear" in the middle of
    358 		 * the file update (because it may not get the write
    359 		 * update data), so we grab a lock to block
    360 		 * zfs_getpage().
    361 		 */
    362 		rw_enter(&zp->z_map_lock, RW_WRITER);
    363 		if (pp = page_lookup(vp, start, SE_SHARED)) {
    364 			caddr_t va;
    365 
    366 			rw_exit(&zp->z_map_lock);
    367 			va = zfs_map_page(pp, S_WRITE);
    368 			error = uiomove(va+off, bytes, UIO_WRITE, uio);
    369 			if (error == 0) {
    370 				dmu_write(zfsvfs->z_os, zp->z_id,
    371 				    woff, bytes, va+off, tx);
    372 			}
    373 			zfs_unmap_page(pp, va);
    374 			page_unlock(pp);
    375 		} else {
    376 			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
    377 			    uio, bytes, tx);
    378 			rw_exit(&zp->z_map_lock);
    379 		}
    380 		len -= bytes;
    381 		off = 0;
    382 		if (error)
    383 			break;
    384 	}
    385 	return (error);
    386 }
    387 
    388 /*
    389  * When a file is memory mapped, we must keep the IO data synchronized
    390  * between the DMU cache and the memory mapped pages.  What this means:
    391  *
    392  * On Read:	We "read" preferentially from memory mapped pages,
    393  *		else we default from the dmu buffer.
    394  *
    395  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
    396  *	the file is memory mapped.
    397  */
    398 static int
    399 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
    400 {
    401 	znode_t *zp = VTOZ(vp);
    402 	objset_t *os = zp->z_zfsvfs->z_os;
    403 	int64_t	start, off;
    404 	int len = nbytes;
    405 	int error = 0;
    406 
    407 	start = uio->uio_loffset;
    408 	off = start & PAGEOFFSET;
    409 	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
    410 		page_t *pp;
    411 		uint64_t bytes = MIN(PAGESIZE - off, len);
    412 
    413 		if (pp = page_lookup(vp, start, SE_SHARED)) {
    414 			caddr_t va;
    415 
    416 			va = zfs_map_page(pp, S_READ);
    417 			error = uiomove(va + off, bytes, UIO_READ, uio);
    418 			zfs_unmap_page(pp, va);
    419 			page_unlock(pp);
    420 		} else {
    421 			error = dmu_read_uio(os, zp->z_id, uio, bytes);
    422 		}
    423 		len -= bytes;
    424 		off = 0;
    425 		if (error)
    426 			break;
    427 	}
    428 	return (error);
    429 }
    430 
/*
 * Upper bound, in bytes, on a single read request issued by zfs_read().
 * zfs_read() also trims each request so that it ends on a multiple of
 * this value.
 */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
    432 
    433 /*
    434  * Read bytes from specified file into supplied buffer.
    435  *
    436  *	IN:	vp	- vnode of file to be read from.
    437  *		uio	- structure supplying read location, range info,
    438  *			  and return buffer.
    439  *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
    440  *		cr	- credentials of caller.
    441  *		ct	- caller context
    442  *
    443  *	OUT:	uio	- updated offset and range, buffer filled.
    444  *
    445  *	RETURN:	0 if success
    446  *		error code if failure
    447  *
    448  * Side Effects:
    449  *	vp - atime updated if byte count > 0
    450  */
    451 /* ARGSUSED */
    452 static int
    453 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
    454 {
    455 	znode_t		*zp = VTOZ(vp);
    456 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
    457 	objset_t	*os;
    458 	ssize_t		n, nbytes;
    459 	int		error;
    460 	rl_t		*rl;
    461 
    462 	ZFS_ENTER(zfsvfs);
    463 	ZFS_VERIFY_ZP(zp);
    464 	os = zfsvfs->z_os;
    465 
    466 	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
    467 		ZFS_EXIT(zfsvfs);
    468 		return (EACCES);
    469 	}
    470 
    471 	/*
    472 	 * Validate file offset
    473 	 */
    474 	if (uio->uio_loffset < (offset_t)0) {
    475 		ZFS_EXIT(zfsvfs);
    476 		return (EINVAL);
    477 	}
    478 
    479 	/*
    480 	 * Fasttrack empty reads
    481 	 */
    482 	if (uio->uio_resid == 0) {
    483 		ZFS_EXIT(zfsvfs);
    484 		return (0);
    485 	}
    486 
    487 	/*
    488 	 * Check for mandatory locks
    489 	 */
    490 	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
    491 		if (error = chklock(vp, FREAD,
    492 		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
    493 			ZFS_EXIT(zfsvfs);
    494 			return (error);
    495 		}
    496 	}
    497 
    498 	/*
    499 	 * If we're in FRSYNC mode, sync out this znode before reading it.
    500 	 */
    501 	if (ioflag & FRSYNC)
    502 		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
    503 
    504 	/*
    505 	 * Lock the range against changes.
    506 	 */
    507 	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
    508 
    509 	/*
    510 	 * If we are reading past end-of-file we can skip
    511 	 * to the end; but we might still need to set atime.
    512 	 */
    513 	if (uio->uio_loffset >= zp->z_phys->zp_size) {
    514 		error = 0;
    515 		goto out;
    516 	}
    517 
    518 	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
    519 	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
    520 
    521 	while (n > 0) {
    522 		nbytes = MIN(n, zfs_read_chunk_size -
    523 		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
    524 
    525 		if (vn_has_cached_data(vp))
    526 			error = mappedread(vp, nbytes, uio);
    527 		else
    528 			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
    529 		if (error) {
    530 			/* convert checksum errors into IO errors */
    531 			if (error == ECKSUM)
    532 				error = EIO;
    533 			break;
    534 		}
    535 
    536 		n -= nbytes;
    537 	}
    538 
    539 out:
    540 	zfs_range_unlock(rl);
    541 
    542 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
    543 	ZFS_EXIT(zfsvfs);
    544 	return (error);
    545 }
    546 
    547 /*
    548  * Fault in the pages of the first n bytes specified by the uio structure.
    549  * 1 byte in each page is touched and the uio struct is unmodified.
    550  * Any error will exit this routine as this is only a best
    551  * attempt to get the pages resident. This is a copy of ufs_trans_touch().
    552  */
    553 static void
    554 zfs_prefault_write(ssize_t n, struct uio *uio)
    555 {
    556 	struct iovec *iov;
    557 	ulong_t cnt, incr;
    558 	caddr_t p;
    559 	uint8_t tmp;
    560 
    561 	iov = uio->uio_iov;
    562 
    563 	while (n) {
    564 		cnt = MIN(iov->iov_len, n);
    565 		if (cnt == 0) {
    566 			/* empty iov entry */
    567 			iov++;
    568 			continue;
    569 		}
    570 		n -= cnt;
    571 		/*
    572 		 * touch each page in this segment.
    573 		 */
    574 		p = iov->iov_base;
    575 		while (cnt) {
    576 			switch (uio->uio_segflg) {
    577 			case UIO_USERSPACE:
    578 			case UIO_USERISPACE:
    579 				if (fuword8(p, &tmp))
    580 					return;
    581 				break;
    582 			case UIO_SYSSPACE:
    583 				if (kcopy(p, &tmp, 1))
    584 					return;
    585 				break;
    586 			}
    587 			incr = MIN(cnt, PAGESIZE);
    588 			p += incr;
    589 			cnt -= incr;
    590 		}
    591 		/*
    592 		 * touch the last byte in case it straddles a page.
    593 		 */
    594 		p--;
    595 		switch (uio->uio_segflg) {
    596 		case UIO_USERSPACE:
    597 		case UIO_USERISPACE:
    598 			if (fuword8(p, &tmp))
    599 				return;
    600 			break;
    601 		case UIO_SYSSPACE:
    602 			if (kcopy(p, &tmp, 1))
    603 				return;
    604 			break;
    605 		}
    606 		iov++;
    607 	}
    608 }
    609 
/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND flag set if in append mode; FSYNC/FDSYNC
 *			  cause the intent log to be committed before
 *			  returning.
 *		cr	- credentials of caller; consulted when deciding
 *			  whether setuid/setgid bits may be retained.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success (including a partial write outside of replay)
 *		EPERM if the file is immutable/read-only, or append-only
 *		    and the write would not append
 *		EINVAL if the starting offset is negative
 *		EFBIG if the starting offset is at or beyond the limit
 *		other error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	uint64_t	pflags;
	int		error;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	/* Never let the effective limit exceed the largest file offset. */
	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If immutable or not appending then return EPERM
	 */
	pflags = zp->z_phys->zp_flags;
	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_phys->zp_size))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	zilog = zfsvfs->z_log;

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	/* Starting at or beyond the limit: nothing can be written. */
	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	/* Clip the request so that it ends at the limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	/* File size once the whole (clipped) request has been written. */
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		/*
		 * Start a transaction.
		 */
		woff = uio->uio_loffset;
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				/*
				 * Wait, then retry this chunk.  Nothing
				 * has been consumed from uio yet, so the
				 * next pass starts at the same offset.
				 */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
		rw_enter(&zp->z_map_lock, RW_READER);

		/*
		 * tx_bytes ends up as the number of bytes this pass
		 * actually consumed from uio (resid before minus after).
		 * mappedwrite() takes z_map_lock itself, so the lock is
		 * dropped before calling it.
		 */
		tx_bytes = uio->uio_resid;
		if (vn_has_cached_data(vp)) {
			rw_exit(&zp->z_map_lock);
			error = mappedwrite(vp, nbytes, uio, tx);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, nbytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		tx_bytes -= uio->uio_resid;

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		/* Log before committing: see Programming Rule (5). */
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		/* A partial chunk has been recorded above; stop here. */
		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Synchronous write: wait for the intent log to commit. */
	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}
    869 
    870 void
    871 zfs_get_done(dmu_buf_t *db, void *vzgd)
    872 {
    873 	zgd_t *zgd = (zgd_t *)vzgd;
    874 	rl_t *rl = zgd->zgd_rl;
    875 	vnode_t *vp = ZTOV(rl->r_zp);
    876 
    877 	dmu_buf_rele(db, vzgd);
    878 	zfs_range_unlock(rl);
    879 	VN_RELE(vp);
    880 	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
    881 	kmem_free(zgd, sizeof (zgd_t));
    882 }
    883 
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 *	IN:	arg	- the zfsvfs_t of the file system being logged.
 *		lr	- write log record; supplies the object id, offset
 *			  and length, and receives the block pointer and
 *			  block offset for an indirect write.
 *		buf	- destination for the data of an immediate write,
 *			  or NULL to request an indirect write.
 *		zio	- passed through to dmu_sync() for indirect writes.
 *
 *	RETURN:	0 if success, or if the indirect write is still in
 *		    progress (zfs_get_done() then finishes the cleanup)
 *		ENOENT if the file is gone, unlinked, or truncated below
 *		    the record's offset
 *		otherwise the error from dmu_sync()
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;		/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		VN_RELE(ZTOV(zp));
		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			/* Block size still what we locked for?  Done. */
			if (zp->z_blksz == dlen)
				break;
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		/*
		 * The zgd carries what zfs_get_done() needs to finish up.
		 * The same pointer is also the tag given to
		 * dmu_buf_hold() and dmu_buf_rele().
		 */
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT((error && error != EINPROGRESS) ||
		    lr->lr_length <= zp->z_blksz);
		if (error == 0)
			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf.  We will finish everything
		 * up in the zfs_get_done() callback.
		 */
		if (error == EINPROGRESS)
			return (0);
		/* Finished (or failed) synchronously: clean up here. */
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	zfs_range_unlock(rl);
	VN_RELE(ZTOV(zp));
	return (error);
}
    984 
    985 /*ARGSUSED*/
    986 static int
    987 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    988     caller_context_t *ct)
    989 {
    990 	znode_t *zp = VTOZ(vp);
    991 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
    992 	int error;
    993 
    994 	ZFS_ENTER(zfsvfs);
    995 	ZFS_VERIFY_ZP(zp);
    996 
    997 	if (flag & V_ACE_MASK)
    998 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
    999 	else
   1000 		error = zfs_zaccess_rwx(zp, mode, flag, cr);
   1001 
   1002 	ZFS_EXIT(zfsvfs);
   1003 	return (error);
   1004 }
   1005 
   1006 /*
   1007  * Lookup an entry in a directory, or an extended attribute directory.
   1008  * If it exists, return a held vnode reference for it.
   1009  *
   1010  *	IN:	dvp	- vnode of directory to search.
   1011  *		nm	- name of entry to lookup.
   1012  *		pnp	- full pathname to lookup [UNUSED].
   1013  *		flags	- LOOKUP_XATTR set if looking for an attribute.
   1014  *		rdir	- root directory vnode [UNUSED].
   1015  *		cr	- credentials of caller.
   1016  *		ct	- caller context
   1017  *		direntflags - directory lookup flags
   1018  *		realpnp - returned pathname.
   1019  *
   1020  *	OUT:	vpp	- vnode of located entry, NULL if not found.
   1021  *
   1022  *	RETURN:	0 if success
   1023  *		error code if failure
   1024  *
   1025  * Timestamps:
   1026  *	NA
   1027  */
   1028 /* ARGSUSED */
   1029 static int
   1030 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
   1031     int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
   1032     int *direntflags, pathname_t *realpnp)
   1033 {
   1034 	znode_t *zdp = VTOZ(dvp);
   1035 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
   1036 	int	error;
   1037 
   1038 	ZFS_ENTER(zfsvfs);
   1039 	ZFS_VERIFY_ZP(zdp);
   1040 
   1041 	*vpp = NULL;
   1042 
   1043 	if (flags & LOOKUP_XATTR) {
   1044 		/*
   1045 		 * If the xattr property is off, refuse the lookup request.
   1046 		 */
   1047 		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
   1048 			ZFS_EXIT(zfsvfs);
   1049 			return (EINVAL);
   1050 		}
   1051 
   1052 		/*
   1053 		 * We don't allow recursive attributes..
   1054 		 * Maybe someday we will.
   1055 		 */
   1056 		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
   1057 			ZFS_EXIT(zfsvfs);
   1058 			return (EINVAL);
   1059 		}
   1060 
   1061 		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
   1062 			ZFS_EXIT(zfsvfs);
   1063 			return (error);
   1064 		}
   1065 
   1066 		/*
   1067 		 * Do we have permission to get into attribute directory?
   1068 		 */
   1069 
   1070 		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
   1071 		    B_FALSE, cr)) {
   1072 			VN_RELE(*vpp);
   1073 			*vpp = NULL;
   1074 		}
   1075 
   1076 		ZFS_EXIT(zfsvfs);
   1077 		return (error);
   1078 	}
   1079 
   1080 	if (dvp->v_type != VDIR) {
   1081 		ZFS_EXIT(zfsvfs);
   1082 		return (ENOTDIR);
   1083 	}
   1084 
   1085 	/*
   1086 	 * Check accessibility of directory.
   1087 	 */
   1088 
   1089 	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
   1090 		ZFS_EXIT(zfsvfs);
   1091 		return (error);
   1092 	}
   1093 
   1094 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
   1095 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
   1096 		ZFS_EXIT(zfsvfs);
   1097 		return (EILSEQ);
   1098 	}
   1099 
   1100 	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
   1101 	if (error == 0) {
   1102 		/*
   1103 		 * Convert device special files
   1104 		 */
   1105 		if (IS_DEVVP(*vpp)) {
   1106 			vnode_t	*svp;
   1107 
   1108 			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
   1109 			VN_RELE(*vpp);
   1110 			if (svp == NULL)
   1111 				error = ENOSYS;
   1112 			else
   1113 				*vpp = svp;
   1114 		}
   1115 	}
   1116 
   1117 	ZFS_EXIT(zfsvfs);
   1118 	return (error);
   1119 }
   1120 
   1121 /*
   1122  * Attempt to create a new entry in a directory.  If the entry
   1123  * already exists, truncate the file if permissible, else return
   1124  * an error.  Return the vp of the created or trunc'd file.
   1125  *
   1126  *	IN:	dvp	- vnode of directory to put new file entry in.
   1127  *		name	- name of new file entry.
   1128  *		vap	- attributes of new file.
   1129  *		excl	- flag indicating exclusive or non-exclusive mode.
   1130  *		mode	- mode to open file with.
   1131  *		cr	- credentials of caller.
   1132  *		flag	- large file flag [UNUSED].
   1133  *		ct	- caller context
   1134  *		vsecp 	- ACL to be set
   1135  *
   1136  *	OUT:	vpp	- vnode of created or trunc'd entry.
   1137  *
   1138  *	RETURN:	0 if success
   1139  *		error code if failure
   1140  *
   1141  * Timestamps:
   1142  *	dvp - ctime|mtime updated if new entry created
   1143  *	 vp - ctime|mtime always, atime if new
   1144  */
   1145