Home | History | Annotate | Download | only in syscall
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * Portions of this source code were derived from Berkeley 4.3 BSD
     31  * under license from the Regents of the University of California.
     32  */
     33 
     34 #pragma ident	"@(#)rw.c	1.42	07/10/25 SMI"
     35 
     36 #include <sys/param.h>
     37 #include <sys/isa_defs.h>
     38 #include <sys/types.h>
     39 #include <sys/inttypes.h>
     40 #include <sys/sysmacros.h>
     41 #include <sys/cred.h>
     42 #include <sys/user.h>
     43 #include <sys/systm.h>
     44 #include <sys/errno.h>
     45 #include <sys/vnode.h>
     46 #include <sys/file.h>
     47 #include <sys/proc.h>
     48 #include <sys/cpuvar.h>
     49 #include <sys/uio.h>
     50 #include <sys/debug.h>
     51 #include <sys/rctl.h>
     52 #include <sys/nbmlock.h>
     53 
     54 #define	COPYOUT_MAX_CACHE	(1<<17)		/* 128K */
     55 
     56 size_t copyout_max_cached = COPYOUT_MAX_CACHE;	/* global so it's patchable */
     57 
/*
 * read, write, pread, pwrite, readv, and writev syscalls.
 *
 * 64-bit open:	all opens are large file opens.
 * Large Files: the behaviour of read depends on whether the fd
 *		corresponds to a large file open or not.
 * 32-bit open:	FOFFMAX flag not set.
 *		Reads succeed up to offset MAXOFF32_T - 1.  A read at
 *		MAXOFF32_T returns EOVERFLOW if count is non-zero and
 *		the size of the file is > MAXOFF32_T.  If the size of the
 *		file is <= MAXOFF32_T, a read at >= MAXOFF32_T returns EOF.
 */
     70 
     71 /*
     72  * Native system call
     73  */
     74 ssize_t
     75 read(int fdes, void *cbuf, size_t count)
     76 {
     77 	struct uio auio;
     78 	struct iovec aiov;
     79 	file_t *fp;
     80 	register vnode_t *vp;
     81 	struct cpu *cp;
     82 	int fflag, ioflag, rwflag;
     83 	ssize_t cnt, bcount;
     84 	int error = 0;
     85 	u_offset_t fileoff;
     86 	int in_crit = 0;
     87 
     88 	if ((cnt = (ssize_t)count) < 0)
     89 		return (set_errno(EINVAL));
     90 	if ((fp = getf(fdes)) == NULL)
     91 		return (set_errno(EBADF));
     92 	if (((fflag = fp->f_flag) & FREAD) == 0) {
     93 		error = EBADF;
     94 		goto out;
     95 	}
     96 	vp = fp->f_vnode;
     97 
     98 	if (vp->v_type == VREG && cnt == 0) {
     99 		goto out;
    100 	}
    101 
    102 	rwflag = 0;
    103 	aiov.iov_base = cbuf;
    104 	aiov.iov_len = cnt;
    105 
    106 	/*
    107 	 * We have to enter the critical region before calling VOP_RWLOCK
    108 	 * to avoid a deadlock with write() calls.
    109 	 */
    110 	if (nbl_need_check(vp)) {
    111 		int svmand;
    112 
    113 		nbl_start_crit(vp, RW_READER);
    114 		in_crit = 1;
    115 		error = nbl_svmand(vp, fp->f_cred, &svmand);
    116 		if (error != 0)
    117 			goto out;
    118 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand,
    119 		    NULL)) {
    120 			error = EACCES;
    121 			goto out;
    122 		}
    123 	}
    124 
    125 	(void) VOP_RWLOCK(vp, rwflag, NULL);
    126 
    127 	/*
    128 	 * We do the following checks inside VOP_RWLOCK so as to
    129 	 * prevent file size from changing while these checks are
    130 	 * being done. Also, we load fp's offset to the local
    131 	 * variable fileoff because we can have a parallel lseek
    132 	 * going on (f_offset is not protected by any lock) which
    133 	 * could change f_offset. We need to see the value only
    134 	 * once here and take a decision. Seeing it more than once
    135 	 * can lead to incorrect functionality.
    136 	 */
    137 
    138 	fileoff = (u_offset_t)fp->f_offset;
    139 	if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
    140 		struct vattr va;
    141 		va.va_mask = AT_SIZE;
    142 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
    143 			VOP_RWUNLOCK(vp, rwflag, NULL);
    144 			goto out;
    145 		}
    146 		if (fileoff >= va.va_size) {
    147 			cnt = 0;
    148 			VOP_RWUNLOCK(vp, rwflag, NULL);
    149 			goto out;
    150 		} else {
    151 			error = EOVERFLOW;
    152 			VOP_RWUNLOCK(vp, rwflag, NULL);
    153 			goto out;
    154 		}
    155 	}
    156 	if ((vp->v_type == VREG) &&
    157 	    (fileoff + cnt > OFFSET_MAX(fp))) {
    158 		cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
    159 	}
    160 	auio.uio_loffset = fileoff;
    161 	auio.uio_iov = &aiov;
    162 	auio.uio_iovcnt = 1;
    163 	auio.uio_resid = bcount = cnt;
    164 	auio.uio_segflg = UIO_USERSPACE;
    165 	auio.uio_llimit = MAXOFFSET_T;
    166 	auio.uio_fmode = fflag;
    167 	/*
    168 	 * Only use bypass caches when the count is large enough
    169 	 */
    170 	if (bcount <= copyout_max_cached)
    171 		auio.uio_extflg = UIO_COPY_CACHED;
    172 	else
    173 		auio.uio_extflg = UIO_COPY_DEFAULT;
    174 
    175 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    176 
    177 	/* If read sync is not asked for, filter sync flags */
    178 	if ((ioflag & FRSYNC) == 0)
    179 		ioflag &= ~(FSYNC|FDSYNC);
    180 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
    181 	cnt -= auio.uio_resid;
    182 	CPU_STATS_ENTER_K();
    183 	cp = CPU;
    184 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
    185 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
    186 	CPU_STATS_EXIT_K();
    187 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
    188 
    189 	if (vp->v_type == VFIFO)	/* Backward compatibility */
    190 		fp->f_offset = cnt;
    191 	else if (((fp->f_flag & FAPPEND) == 0) ||
    192 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
    193 		fp->f_offset = auio.uio_loffset;
    194 	VOP_RWUNLOCK(vp, rwflag, NULL);
    195 
    196 	if (error == EINTR && cnt != 0)
    197 		error = 0;
    198 out:
    199 	if (in_crit)
    200 		nbl_end_crit(vp);
    201 	releasef(fdes);
    202 	if (error)
    203 		return (set_errno(error));
    204 	return (cnt);
    205 }
    206 
    207 /*
    208  * Native system call
    209  */
    210 ssize_t
    211 write(int fdes, void *cbuf, size_t count)
    212 {
    213 	struct uio auio;
    214 	struct iovec aiov;
    215 	file_t *fp;
    216 	register vnode_t *vp;
    217 	struct cpu *cp;
    218 	int fflag, ioflag, rwflag;
    219 	ssize_t cnt, bcount;
    220 	int error = 0;
    221 	u_offset_t fileoff;
    222 	int in_crit = 0;
    223 
    224 	if ((cnt = (ssize_t)count) < 0)
    225 		return (set_errno(EINVAL));
    226 	if ((fp = getf(fdes)) == NULL)
    227 		return (set_errno(EBADF));
    228 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
    229 		error = EBADF;
    230 		goto out;
    231 	}
    232 	vp = fp->f_vnode;
    233 
    234 	if (vp->v_type == VREG && cnt == 0) {
    235 		goto out;
    236 	}
    237 
    238 	rwflag = 1;
    239 	aiov.iov_base = cbuf;
    240 	aiov.iov_len = cnt;
    241 
    242 	/*
    243 	 * We have to enter the critical region before calling VOP_RWLOCK
    244 	 * to avoid a deadlock with ufs.
    245 	 */
    246 	if (nbl_need_check(vp)) {
    247 		int svmand;
    248 
    249 		nbl_start_crit(vp, RW_READER);
    250 		in_crit = 1;
    251 		error = nbl_svmand(vp, fp->f_cred, &svmand);
    252 		if (error != 0)
    253 			goto out;
    254 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand,
    255 		    NULL)) {
    256 			error = EACCES;
    257 			goto out;
    258 		}
    259 	}
    260 
    261 	(void) VOP_RWLOCK(vp, rwflag, NULL);
    262 
    263 	fileoff = fp->f_offset;
    264 	if (vp->v_type == VREG) {
    265 
    266 		/*
    267 		 * We raise psignal if write for >0 bytes causes
    268 		 * it to exceed the ulimit.
    269 		 */
    270 		if (fileoff >= curproc->p_fsz_ctl) {
    271 			VOP_RWUNLOCK(vp, rwflag, NULL);
    272 
    273 			mutex_enter(&curproc->p_lock);
    274 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
    275 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
    276 			mutex_exit(&curproc->p_lock);
    277 
    278 			error = EFBIG;
    279 			goto out;
    280 		}
    281 		/*
    282 		 * We return EFBIG if write is done at an offset
    283 		 * greater than the offset maximum for this file structure.
    284 		 */
    285 
    286 		if (fileoff >= OFFSET_MAX(fp)) {
    287 			VOP_RWUNLOCK(vp, rwflag, NULL);
    288 			error = EFBIG;
    289 			goto out;
    290 		}
    291 		/*
    292 		 * Limit the bytes to be written  upto offset maximum for
    293 		 * this open file structure.
    294 		 */
    295 		if (fileoff + cnt > OFFSET_MAX(fp))
    296 			cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
    297 	}
    298 	auio.uio_loffset = fileoff;
    299 	auio.uio_iov = &aiov;
    300 	auio.uio_iovcnt = 1;
    301 	auio.uio_resid = bcount = cnt;
    302 	auio.uio_segflg = UIO_USERSPACE;
    303 	auio.uio_llimit = curproc->p_fsz_ctl;
    304 	auio.uio_fmode = fflag;
    305 	auio.uio_extflg = UIO_COPY_DEFAULT;
    306 
    307 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    308 
    309 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
    310 	cnt -= auio.uio_resid;
    311 	CPU_STATS_ENTER_K();
    312 	cp = CPU;
    313 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
    314 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
    315 	CPU_STATS_EXIT_K();
    316 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
    317 
    318 	if (vp->v_type == VFIFO)	/* Backward compatibility */
    319 		fp->f_offset = cnt;
    320 	else if (((fp->f_flag & FAPPEND) == 0) ||
    321 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
    322 		fp->f_offset = auio.uio_loffset;
    323 	VOP_RWUNLOCK(vp, rwflag, NULL);
    324 
    325 	if (error == EINTR && cnt != 0)
    326 		error = 0;
    327 out:
    328 	if (in_crit)
    329 		nbl_end_crit(vp);
    330 	releasef(fdes);
    331 	if (error)
    332 		return (set_errno(error));
    333 	return (cnt);
    334 }
    335 
    336 ssize_t
    337 pread(int fdes, void *cbuf, size_t count, off_t offset)
    338 {
    339 	struct uio auio;
    340 	struct iovec aiov;
    341 	file_t *fp;
    342 	register vnode_t *vp;
    343 	struct cpu *cp;
    344 	int fflag, ioflag, rwflag;
    345 	ssize_t bcount;
    346 	int error = 0;
    347 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
    348 #ifdef _SYSCALL32_IMPL
    349 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
    350 		MAXOFF32_T : MAXOFFSET_T;
    351 #else
    352 	const u_offset_t maxoff = MAXOFF32_T;
    353 #endif
    354 	int in_crit = 0;
    355 
    356 	if ((bcount = (ssize_t)count) < 0)
    357 		return (set_errno(EINVAL));
    358 
    359 	if ((fp = getf(fdes)) == NULL)
    360 		return (set_errno(EBADF));
    361 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
    362 		error = EBADF;
    363 		goto out;
    364 	}
    365 
    366 	rwflag = 0;
    367 	vp = fp->f_vnode;
    368 
    369 	if (vp->v_type == VREG) {
    370 
    371 		if (bcount == 0)
    372 			goto out;
    373 
    374 		/*
    375 		 * Return EINVAL if an invalid offset comes to pread.
    376 		 * Negative offset from user will cause this error.
    377 		 */
    378 
    379 		if (fileoff > maxoff) {
    380 			error = EINVAL;
    381 			goto out;
    382 		}
    383 		/*
    384 		 * Limit offset such that we don't read or write
    385 		 * a file beyond the maximum offset representable in
    386 		 * an off_t structure.
    387 		 */
    388 		if (fileoff + bcount > maxoff)
    389 			bcount = (ssize_t)((offset_t)maxoff - fileoff);
    390 	} else if (vp->v_type == VFIFO) {
    391 		error = ESPIPE;
    392 		goto out;
    393 	}
    394 
    395 	/*
    396 	 * We have to enter the critical region before calling VOP_RWLOCK
    397 	 * to avoid a deadlock with ufs.
    398 	 */
    399 	if (nbl_need_check(vp)) {
    400 		int svmand;
    401 
    402 		nbl_start_crit(vp, RW_READER);
    403 		in_crit = 1;
    404 		error = nbl_svmand(vp, fp->f_cred, &svmand);
    405 		if (error != 0)
    406 			goto out;
    407 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
    408 		    NULL)) {
    409 			error = EACCES;
    410 			goto out;
    411 		}
    412 	}
    413 
    414 	aiov.iov_base = cbuf;
    415 	aiov.iov_len = bcount;
    416 	(void) VOP_RWLOCK(vp, rwflag, NULL);
    417 	if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
    418 		struct vattr va;
    419 		va.va_mask = AT_SIZE;
    420 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL))) {
    421 			VOP_RWUNLOCK(vp, rwflag, NULL);
    422 			goto out;
    423 		}
    424 		VOP_RWUNLOCK(vp, rwflag, NULL);
    425 
    426 		/*
    427 		 * We have to return EOF if fileoff is >= file size.
    428 		 */
    429 		if (fileoff >= va.va_size) {
    430 			bcount = 0;
    431 			goto out;
    432 		}
    433 
    434 		/*
    435 		 * File is greater than or equal to maxoff and therefore
    436 		 * we return EOVERFLOW.
    437 		 */
    438 		error = EOVERFLOW;
    439 		goto out;
    440 	}
    441 	auio.uio_loffset = fileoff;
    442 	auio.uio_iov = &aiov;
    443 	auio.uio_iovcnt = 1;
    444 	auio.uio_resid = bcount;
    445 	auio.uio_segflg = UIO_USERSPACE;
    446 	auio.uio_llimit = MAXOFFSET_T;
    447 	auio.uio_fmode = fflag;
    448 	auio.uio_extflg = UIO_COPY_CACHED;
    449 
    450 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    451 
    452 	/* If read sync is not asked for, filter sync flags */
    453 	if ((ioflag & FRSYNC) == 0)
    454 		ioflag &= ~(FSYNC|FDSYNC);
    455 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
    456 	bcount -= auio.uio_resid;
    457 	CPU_STATS_ENTER_K();
    458 	cp = CPU;
    459 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
    460 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
    461 	CPU_STATS_EXIT_K();
    462 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
    463 	VOP_RWUNLOCK(vp, rwflag, NULL);
    464 
    465 	if (error == EINTR && bcount != 0)
    466 		error = 0;
    467 out:
    468 	if (in_crit)
    469 		nbl_end_crit(vp);
    470 	releasef(fdes);
    471 	if (error)
    472 		return (set_errno(error));
    473 	return (bcount);
    474 }
    475 
    476 ssize_t
    477 pwrite(int fdes, void *cbuf, size_t count, off_t offset)
    478 {
    479 	struct uio auio;
    480 	struct iovec aiov;
    481 	file_t *fp;
    482 	register vnode_t *vp;
    483 	struct cpu *cp;
    484 	int fflag, ioflag, rwflag;
    485 	ssize_t bcount;
    486 	int error = 0;
    487 	u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
    488 #ifdef _SYSCALL32_IMPL
    489 	u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
    490 		MAXOFF32_T : MAXOFFSET_T;
    491 #else
    492 	const u_offset_t maxoff = MAXOFF32_T;
    493 #endif
    494 	int in_crit = 0;
    495 
    496 	if ((bcount = (ssize_t)count) < 0)
    497 		return (set_errno(EINVAL));
    498 	if ((fp = getf(fdes)) == NULL)
    499 		return (set_errno(EBADF));
    500 	if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
    501 		error = EBADF;
    502 		goto out;
    503 	}
    504 
    505 	rwflag = 1;
    506 	vp = fp->f_vnode;
    507 
    508 	if (vp->v_type == VREG) {
    509 
    510 		if (bcount == 0)
    511 			goto out;
    512 
    513 		/*
    514 		 * return EINVAL for offsets that cannot be
    515 		 * represented in an off_t.
    516 		 */
    517 		if (fileoff > maxoff) {
    518 			error = EINVAL;
    519 			goto out;
    520 		}
    521 		/*
    522 		 * Take appropriate action if we are trying to write above the
    523 		 * resource limit.
    524 		 */
    525 		if (fileoff >= curproc->p_fsz_ctl) {
    526 			mutex_enter(&curproc->p_lock);
    527 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
    528 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
    529 			mutex_exit(&curproc->p_lock);
    530 
    531 			error = EFBIG;
    532 			goto out;
    533 		}
    534 		/*
    535 		 * Don't allow pwrite to cause file sizes to exceed
    536 		 * maxoff.
    537 		 */
    538 		if (fileoff == maxoff) {
    539 			error = EFBIG;
    540 			goto out;
    541 		}
    542 		if (fileoff + count > maxoff)
    543 			bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
    544 	} else if (vp->v_type == VFIFO) {
    545 		error = ESPIPE;
    546 		goto out;
    547 	}
    548 
    549 	/*
    550 	 * We have to enter the critical region before calling VOP_RWLOCK
    551 	 * to avoid a deadlock with ufs.
    552 	 */
    553 	if (nbl_need_check(vp)) {
    554 		int svmand;
    555 
    556 		nbl_start_crit(vp, RW_READER);
    557 		in_crit = 1;
    558 		error = nbl_svmand(vp, fp->f_cred, &svmand);
    559 		if (error != 0)
    560 			goto out;
    561 		if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand,
    562 		    NULL)) {
    563 			error = EACCES;
    564 			goto out;
    565 		}
    566 	}
    567 
    568 	aiov.iov_base = cbuf;
    569 	aiov.iov_len = bcount;
    570 	(void) VOP_RWLOCK(vp, rwflag, NULL);
    571 	auio.uio_loffset = fileoff;
    572 	auio.uio_iov = &aiov;
    573 	auio.uio_iovcnt = 1;
    574 	auio.uio_resid = bcount;
    575 	auio.uio_segflg = UIO_USERSPACE;
    576 	auio.uio_llimit = curproc->p_fsz_ctl;
    577 	auio.uio_fmode = fflag;
    578 	auio.uio_extflg = UIO_COPY_CACHED;
    579 
    580 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    581 
    582 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
    583 	bcount -= auio.uio_resid;
    584 	CPU_STATS_ENTER_K();
    585 	cp = CPU;
    586 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
    587 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
    588 	CPU_STATS_EXIT_K();
    589 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
    590 	VOP_RWUNLOCK(vp, rwflag, NULL);
    591 
    592 	if (error == EINTR && bcount != 0)
    593 		error = 0;
    594 out:
    595 	if (in_crit)
    596 		nbl_end_crit(vp);
    597 	releasef(fdes);
    598 	if (error)
    599 		return (set_errno(error));
    600 	return (bcount);
    601 }
    602 
    603 /*
    604  * XXX -- The SVID refers to IOV_MAX, but doesn't define it.  Grrrr....
    605  * XXX -- However, SVVS expects readv() and writev() to fail if
    606  * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source),
    607  * XXX -- so I guess that's the "interface".
    608  */
    609 #define	DEF_IOV_MAX	16
    610 
    611 ssize_t
    612 readv(int fdes, struct iovec *iovp, int iovcnt)
    613 {
    614 	struct uio auio;
    615 	struct iovec aiov[DEF_IOV_MAX];
    616 	file_t *fp;
    617 	register vnode_t *vp;
    618 	struct cpu *cp;
    619 	int fflag, ioflag, rwflag;
    620 	ssize_t count, bcount;
    621 	int error = 0;
    622 	int i;
    623 	u_offset_t fileoff;
    624 	int in_crit = 0;
    625 
    626 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
    627 		return (set_errno(EINVAL));
    628 
    629 #ifdef _SYSCALL32_IMPL
    630 	/*
    631 	 * 32-bit callers need to have their iovec expanded,
    632 	 * while ensuring that they can't move more than 2Gbytes
    633 	 * of data in a single call.
    634 	 */
    635 	if (get_udatamodel() == DATAMODEL_ILP32) {
    636 		struct iovec32 aiov32[DEF_IOV_MAX];
    637 		ssize32_t count32;
    638 
    639 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
    640 			return (set_errno(EFAULT));
    641 
    642 		count32 = 0;
    643 		for (i = 0; i < iovcnt; i++) {
    644 			ssize32_t iovlen32 = aiov32[i].iov_len;
    645 			count32 += iovlen32;
    646 			if (iovlen32 < 0 || count32 < 0)
    647 				return (set_errno(EINVAL));
    648 			aiov[i].iov_len = iovlen32;
    649 			aiov[i].iov_base =
    650 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
    651 		}
    652 	} else
    653 #endif
    654 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
    655 		return (set_errno(EFAULT));
    656 
    657 	count = 0;
    658 	for (i = 0; i < iovcnt; i++) {
    659 		ssize_t iovlen = aiov[i].iov_len;
    660 		count += iovlen;
    661 		if (iovlen < 0 || count < 0)
    662 			return (set_errno(EINVAL));
    663 	}
    664 	if ((fp = getf(fdes)) == NULL)
    665 		return (set_errno(EBADF));
    666 	if (((fflag = fp->f_flag) & FREAD) == 0) {
    667 		error = EBADF;
    668 		goto out;
    669 	}
    670 	vp = fp->f_vnode;
    671 	if (vp->v_type == VREG && count == 0) {
    672 		goto out;
    673 	}
    674 
    675 	rwflag = 0;
    676 
    677 	/*
    678 	 * We have to enter the critical region before calling VOP_RWLOCK
    679 	 * to avoid a deadlock with ufs.
    680 	 */
    681 	if (nbl_need_check(vp)) {
    682 		int svmand;
    683 
    684 		nbl_start_crit(vp, RW_READER);
    685 		in_crit = 1;
    686 		error = nbl_svmand(vp, fp->f_cred, &svmand);
    687 		if (error != 0)
    688 			goto out;
    689 		if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand,
    690 		    NULL)) {
    691 			error = EACCES;
    692 			goto out;
    693 		}
    694 	}
    695 
    696 	(void) VOP_RWLOCK(vp, rwflag, NULL);
    697 	fileoff = fp->f_offset;
    698 
    699 	/*
    700 	 * Behaviour is same as read. Please see comments in read.
    701 	 */
    702 
    703 	if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
    704 		struct vattr va;
    705 		va.va_mask = AT_SIZE;
    706 		if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL)))  {
    707 			VOP_RWUNLOCK(vp, rwflag, NULL);
    708 			goto out;
    709 		}
    710 		if (fileoff >= va.va_size) {
    711 			VOP_RWUNLOCK(vp, rwflag, NULL);
    712 			count = 0;
    713 			goto out;
    714 		} else {
    715 			VOP_RWUNLOCK(vp, rwflag, NULL);
    716 			error = EOVERFLOW;
    717 			goto out;
    718 		}
    719 	}
    720 	if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
    721 		count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
    722 	}
    723 	auio.uio_loffset = fileoff;
    724 	auio.uio_iov = aiov;
    725 	auio.uio_iovcnt = iovcnt;
    726 	auio.uio_resid = bcount = count;
    727 	auio.uio_segflg = UIO_USERSPACE;
    728 	auio.uio_llimit = MAXOFFSET_T;
    729 	auio.uio_fmode = fflag;
    730 	if (bcount <= copyout_max_cached)
    731 		auio.uio_extflg = UIO_COPY_CACHED;
    732 	else
    733 		auio.uio_extflg = UIO_COPY_DEFAULT;
    734 
    735 
    736 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    737 
    738 	/* If read sync is not asked for, filter sync flags */
    739 	if ((ioflag & FRSYNC) == 0)
    740 		ioflag &= ~(FSYNC|FDSYNC);
    741 	error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
    742 	count -= auio.uio_resid;
    743 	CPU_STATS_ENTER_K();
    744 	cp = CPU;
    745 	CPU_STATS_ADDQ(cp, sys, sysread, 1);
    746 	CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
    747 	CPU_STATS_EXIT_K();
    748 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
    749 
    750 	if (vp->v_type == VFIFO)	/* Backward compatibility */
    751 		fp->f_offset = count;
    752 	else if (((fp->f_flag & FAPPEND) == 0) ||
    753 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
    754 		fp->f_offset = auio.uio_loffset;
    755 
    756 	VOP_RWUNLOCK(vp, rwflag, NULL);
    757 
    758 	if (error == EINTR && count != 0)
    759 		error = 0;
    760 out:
    761 	if (in_crit)
    762 		nbl_end_crit(vp);
    763 	releasef(fdes);
    764 	if (error)
    765 		return (set_errno(error));
    766 	return (count);
    767 }
    768 
    769 ssize_t
    770 writev(int fdes, struct iovec *iovp, int iovcnt)
    771 {
    772 	struct uio auio;
    773 	struct iovec aiov[DEF_IOV_MAX];
    774 	file_t *fp;
    775 	register vnode_t *vp;
    776 	struct cpu *cp;
    777 	int fflag, ioflag, rwflag;
    778 	ssize_t count, bcount;
    779 	int error = 0;
    780 	int i;
    781 	u_offset_t fileoff;
    782 	int in_crit = 0;
    783 
    784 	if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
    785 		return (set_errno(EINVAL));
    786 
    787 #ifdef _SYSCALL32_IMPL
    788 	/*
    789 	 * 32-bit callers need to have their iovec expanded,
    790 	 * while ensuring that they can't move more than 2Gbytes
    791 	 * of data in a single call.
    792 	 */
    793 	if (get_udatamodel() == DATAMODEL_ILP32) {
    794 		struct iovec32 aiov32[DEF_IOV_MAX];
    795 		ssize32_t count32;
    796 
    797 		if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
    798 			return (set_errno(EFAULT));
    799 
    800 		count32 = 0;
    801 		for (i = 0; i < iovcnt; i++) {
    802 			ssize32_t iovlen = aiov32[i].iov_len;
    803 			count32 += iovlen;
    804 			if (iovlen < 0 || count32 < 0)
    805 				return (set_errno(EINVAL));
    806 			aiov[i].iov_len = iovlen;
    807 			aiov[i].iov_base =
    808 			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
    809 		}
    810 	} else
    811 #endif
    812 	if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
    813 		return (set_errno(EFAULT));
    814 
    815 	count = 0;
    816 	for (i = 0; i < iovcnt; i++) {
    817 		ssize_t iovlen = aiov[i].iov_len;
    818 		count += iovlen;
    819 		if (iovlen < 0 || count < 0)
    820 			return (set_errno(EINVAL));
    821 	}
    822 	if ((fp = getf(fdes)) == NULL)
    823 		return (set_errno(EBADF));
    824 	if (((fflag = fp->f_flag) & FWRITE) == 0) {
    825 		error = EBADF;
    826 		goto out;
    827 	}
    828 	vp = fp->f_vnode;
    829 	if (vp->v_type == VREG && count == 0) {
    830 		goto out;
    831 	}
    832 
    833 	rwflag = 1;
    834 
    835 	/*
    836 	 * We have to enter the critical region before calling VOP_RWLOCK
    837 	 * to avoid a deadlock with ufs.
    838 	 */
    839 	if (nbl_need_check(vp)) {
    840 		int svmand;
    841 
    842 		nbl_start_crit(vp, RW_READER);
    843 		in_crit = 1;
    844 		error = nbl_svmand(vp, fp->f_cred, &svmand);
    845 		if (error != 0)
    846 			goto out;
    847 		if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand,
    848 		    NULL)) {
    849 			error = EACCES;
    850 			goto out;
    851 		}
    852 	}
    853 
    854 	(void) VOP_RWLOCK(vp, rwflag, NULL);
    855 
    856 	fileoff = fp->f_offset;
    857 
    858 	/*
    859 	 * Behaviour is same as write. Please see comments for write.
    860 	 */
    861 
    862 	if (vp->v_type == VREG) {
    863 		if (fileoff >= curproc->p_fsz_ctl) {
    864 			VOP_RWUNLOCK(vp, rwflag, NULL);
    865 			mutex_enter(&curproc->p_lock);
    866 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
    867 			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
    868 			mutex_exit(&curproc->p_lock);
    869 			error = EFBIG;
    870 			goto out;
    871 		}
    872 		if (fileoff >= OFFSET_MAX(fp)) {
    873 			VOP_RWUNLOCK(vp, rwflag, NULL);
    874 			error = EFBIG;
    875 			goto out;
    876 		}
    877 		if (fileoff + count > OFFSET_MAX(fp))
    878 			count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
    879 	}
    880 	auio.uio_loffset = fileoff;
    881 	auio.uio_iov = aiov;
    882 	auio.uio_iovcnt = iovcnt;
    883 	auio.uio_resid = bcount = count;
    884 	auio.uio_segflg = UIO_USERSPACE;
    885 	auio.uio_llimit = curproc->p_fsz_ctl;
    886 	auio.uio_fmode = fflag;
    887 	auio.uio_extflg = UIO_COPY_DEFAULT;
    888 
    889 	ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    890 
    891 	error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
    892 	count -= auio.uio_resid;
    893 	CPU_STATS_ENTER_K();
    894 	cp = CPU;
    895 	CPU_STATS_ADDQ(cp, sys, syswrite, 1);
    896 	CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
    897 	CPU_STATS_EXIT_K();
    898 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
    899 
    900 	if (vp->v_type == VFIFO)	/* Backward compatibility */
    901 		fp->f_offset = count;
    902 	else if (((fp->f_flag & FAPPEND) == 0) ||
    903 		(vp->v_type != VREG) || (bcount != 0))	/* POSIX */
    904 		fp->f_offset = auio.uio_loffset;
    905 	VOP_RWUNLOCK(vp, rwflag, NULL);
    906 
    907 	if (error == EINTR && count != 0)
    908 		error = 0;
    909 out:
    910 	if (in_crit)
    911 		nbl_end_crit(vp);
    912 	releasef(fdes);
    913 	if (error)
    914 		return (set_errno(error));
    915 	return (count);
    916 }
    917 
    918 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
    919 
    920 /*
    921  * This syscall supplies 64-bit file offsets to 32-bit applications only.
    922  */
    923 ssize32_t
    924 pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
    925     uint32_t offset_2)
    926 {
    927 	struct uio auio;
    928 	struct iovec aiov;
    929 	file_t *fp;
    930 	register vnode_t *vp;
    931 	struct cpu *cp;
    932 	int fflag, ioflag, rwflag;
    933 	ssize_t bcount;
    934 	int error = 0;
    935 	u_offset_t fileoff;
    936 	int in_crit = 0;
    937 
    938 #if defined(_LITTLE_ENDIAN)
    939 	fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
    940 #else
    941 	fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
    942 #endif
    943 
    944 	if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
    945 		return (set_errno(EINVAL));
    946 
    947 	if ((fp = getf(fdes)) == NULL)
    948 		return (set_errno(EBADF));
    949 	if (((fflag = fp->f_flag) & (FREAD)) == 0) {
    950 		error = EBADF;
    951 		goto out;
    952 	}
    953 
    954 	rwflag = 0;
    955 	vp = fp->f_vnode;
    956 
    957 	if (vp->v_type == VREG) {
    958 
    959 		if (bcount == 0)
    960 			goto out;
    961 
    962 		/*
    963 		 * Same as pread. See comments in pread.
    964 		 */
    965 
    966 		if (fileoff > MAXOFFSET_T) {
    967 			error = EINVAL;
    968 			goto out;
    969 		}
    970 		if (fileoff + bcount > MAXOFFSET_T)
    971 			bcount = (ssize_t)(MAXOFFSET_T - fileoff);
    972 	} else if (vp->v_type == VFIFO) {
    973 		error = ESPIPE;
    974 		goto out;
    975 	}
    976 
    977 	/*
    978 	 * We have to enter the critical region before calling VOP_RWLOCK
    979 	 * to avoid a deadlock with ufs.
    980 	 */
    981 	if (nbl_need_check(vp)) {
    982 		int svmand;
    983 
    984 		nbl_start_crit(vp, RW_READER);
    985 		in_crit = 1;
    986 		error = nbl_svmand(vp, fp->f_cred, &svmand);
    987 		if (error != 0)
    988 			goto out;
    989 		if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand,
    990 		    NULL)) {
    991 			error = EACCES;
    992 			goto out;
    993 		}
    994 	}
    995 
    996 	aiov.iov_base = cbuf;
    997 	aiov.iov_len = bcount;
    998 	(void) VOP_RWLOCK(vp, rwflag