Home | History | Annotate | Download | only in syscall
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)sendfile.c	1.37	07/10/25 SMI"
     28 
     29 #include <sys/types.h>
     30 #include <sys/t_lock.h>
     31 #include <sys/param.h>
     32 #include <sys/systm.h>
     33 #include <sys/buf.h>
     34 #include <sys/conf.h>
     35 #include <sys/cred.h>
     36 #include <sys/kmem.h>
     37 #include <sys/sysmacros.h>
     38 #include <sys/vfs.h>
     39 #include <sys/vnode.h>
     40 #include <sys/debug.h>
     41 #include <sys/errno.h>
     42 #include <sys/time.h>
     43 #include <sys/file.h>
     44 #include <sys/open.h>
     45 #include <sys/user.h>
     46 #include <sys/termios.h>
     47 #include <sys/stream.h>
     48 #include <sys/strsubr.h>
     49 #include <sys/sunddi.h>
     50 #include <sys/esunddi.h>
     51 #include <sys/flock.h>
     52 #include <sys/modctl.h>
     53 #include <sys/cmn_err.h>
     54 #include <sys/vmsystm.h>
     55 
     56 #include <sys/socket.h>
     57 #include <sys/socketvar.h>
     58 /* swilly code in sys/socketvar.h turns off DEBUG */
     59 #ifdef __lint
     60 #define	DEBUG
     61 #endif
     62 
     63 #include <netinet/in.h>
     64 #include <sys/sendfile.h>
     65 #include <sys/un.h>
     66 #include <sys/tihdr.h>
     67 #include <sys/atomic.h>
     68 
     69 #include <inet/common.h>
     70 #include <inet/ip.h>
     71 #include <inet/ip6.h>
     72 #include <inet/tcp.h>
     73 
/*
 * Helpers implemented outside this file. In the code visible here,
 * sosendfile64() is called by sendvec64() when the destination is a socket
 * and the first source descriptor is a regular file, and snf_segmap() is
 * called by sendvec_chunk() when it decides to take the segmap path.
 */
     74 extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
     75 		ssize32_t *);
     76 extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
     77 		int, ssize_t *);
     78 extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, uint_t,
     79 		ssize_t *, boolean_t);
     80 
/*
 * Lock-mode arguments handed to VOP_RWLOCK()/VOP_RWUNLOCK() in this file:
 * readflg is used on the vnodes being read from, rwflag on the vnode being
 * written to.
 */
     81 #define	readflg	(V_WRITELOCK_FALSE)
     82 #define	rwflag	(V_WRITELOCK_TRUE)
     83 
     84 /*
     85  * kstrwritemp() has very similar semantics as that of strwrite().
     86  * The main difference is it obtains mblks from the caller and also
     87  * does not do any copy as done in strwrite() from user buffers to
     88  * kernel buffers.
     89  *
     90  * Currently, this routine is used by sendfile to send data allocated
     91  * within the kernel without any copying. This interface does not use the
     92  * synchronous stream interface as synch. stream interface implies
     93  * copying.
     94  */
     95 int
     96 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
     97 {
     98 	struct stdata *stp;
     99 	struct queue *wqp;
    100 	mblk_t *newmp;
    101 	char waitflag;
    102 	int tempmode;
    103 	int error = 0;
    104 	int done = 0;
    105 	struct sonode *so;
    106 	boolean_t direct;
    107 
    108 	ASSERT(vp->v_stream);
    109 	stp = vp->v_stream;
    110 
    111 	so = VTOSO(vp);
    112 	direct = (so->so_state & SS_DIRECT);
    113 
    114 	/*
    115 	 * This is the sockfs direct fast path. canputnext() need
    116 	 * not be accurate so we don't grab the sd_lock here. If
    117 	 * we get flow-controlled, we grab sd_lock just before the
    118 	 * do..while loop below to emulate what strwrite() does.
    119 	 */
    120 	wqp = stp->sd_wrq;
    121 	if (canputnext(wqp) && direct &&
    122 	    !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
    123 		return (sostream_direct(so, NULL, mp, CRED()));
    124 	} else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
    125 		/* Fast check of flags before acquiring the lock */
    126 		mutex_enter(&stp->sd_lock);
    127 		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
    128 		mutex_exit(&stp->sd_lock);
    129 		if (error != 0) {
    130 			if (!(stp->sd_flag & STPLEX) &&
    131 			    (stp->sd_wput_opt & SW_SIGPIPE)) {
    132 				tsignal(curthread, SIGPIPE);
    133 				error = EPIPE;
    134 			}
    135 			return (error);
    136 		}
    137 	}
    138 
    139 	waitflag = WRITEWAIT;
    140 	if (stp->sd_flag & OLDNDELAY)
    141 		tempmode = fmode & ~FNDELAY;
    142 	else
    143 		tempmode = fmode;
    144 
    145 	mutex_enter(&stp->sd_lock);
    146 	do {
    147 		if (canputnext(wqp)) {
    148 			mutex_exit(&stp->sd_lock);
    149 			if (stp->sd_wputdatafunc != NULL) {
    150 				newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
    151 				    NULL, NULL, NULL);
    152 				if (newmp == NULL) {
    153 					/* The caller will free mp */
    154 					return (ECOMM);
    155 				}
    156 				mp = newmp;
    157 			}
    158 			putnext(wqp, mp);
    159 			return (0);
    160 		}
    161 		error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
    162 		    &done);
    163 	} while (error == 0 && !done);
    164 
    165 	mutex_exit(&stp->sd_lock);
    166 	/*
    167 	 * EAGAIN tells the application to try again. ENOMEM
    168 	 * is returned only if the memory allocation size
    169 	 * exceeds the physical limits of the system. ENOMEM
    170 	 * can't be true here.
    171 	 */
    172 	if (error == ENOMEM)
    173 		error = EAGAIN;
    174 	return (error);
    175 }
    176 
/*
 * Upper bound on the number of vector entries sendvec64() copies in from
 * user space and processes per pass; it also sizes that function's on-stack
 * vector array.
 */
    177 #define	SEND_MAX_CHUNK	16
    178 
    179 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
    180 /*
    181  * 64 bit offsets for 32 bit applications only running either on
    182  * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer
    183  * more than 2GB of data.
    184  */
    185 int
    186 sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
    187     int copy_cnt, ssize32_t *count)
    188 {
    189 	struct vnode *vp;
    190 	ushort_t fflag;
    191 	int ioflag;
    192 	size32_t cnt;
    193 	ssize32_t sfv_len;
    194 	ssize32_t tmpcount;
    195 	u_offset_t sfv_off;
    196 	struct uio auio;
    197 	struct iovec aiov;
    198 	int i, error;
    199 
    200 	fflag = fp->f_flag;
    201 	vp = fp->f_vnode;
    202 	for (i = 0; i < copy_cnt; i++) {
    203 
    204 		if (ISSIG(curthread, JUSTLOOKING))
    205 			return (EINTR);
    206 
    207 		/*
    208 		 * Do similar checks as "write" as we are writing
    209 		 * sfv_len bytes into "vp".
    210 		 */
    211 		sfv_len = (ssize32_t)sfv->sfv_len;
    212 
    213 		if (sfv_len == 0)
    214 			continue;
    215 
    216 		if (sfv_len < 0)
    217 			return (EINVAL);
    218 
    219 		if (vp->v_type == VREG) {
    220 			if (*fileoff >= curproc->p_fsz_ctl) {
    221 				mutex_enter(&curproc->p_lock);
    222 				(void) rctl_action(
    223 				    rctlproc_legacy[RLIMIT_FSIZE],
    224 				    curproc->p_rctls, curproc, RCA_SAFE);
    225 				mutex_exit(&curproc->p_lock);
    226 				return (EFBIG);
    227 			}
    228 
    229 			if (*fileoff >= OFFSET_MAX(fp))
    230 				return (EFBIG);
    231 
    232 			if (*fileoff + sfv_len > OFFSET_MAX(fp))
    233 				return (EINVAL);
    234 		}
    235 
    236 		tmpcount = *count + sfv_len;
    237 		if (tmpcount < 0)
    238 			return (EINVAL);
    239 
    240 		sfv_off = sfv->sfv_off;
    241 
    242 		auio.uio_extflg = UIO_COPY_DEFAULT;
    243 		if (sfv->sfv_fd == SFV_FD_SELF) {
    244 			aiov.iov_len = sfv_len;
    245 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
    246 			auio.uio_loffset = *fileoff;
    247 			auio.uio_iovcnt = 1;
    248 			auio.uio_resid = sfv_len;
    249 			auio.uio_iov = &aiov;
    250 			auio.uio_segflg = UIO_USERSPACE;
    251 			auio.uio_llimit = curproc->p_fsz_ctl;
    252 			auio.uio_fmode = fflag;
    253 			ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    254 			while (sfv_len > 0) {
    255 				error = VOP_WRITE(vp, &auio, ioflag,
    256 				    fp->f_cred, NULL);
    257 				cnt = sfv_len - auio.uio_resid;
    258 				sfv_len -= cnt;
    259 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
    260 				if (vp->v_type == VREG)
    261 					*fileoff += cnt;
    262 				*count += cnt;
    263 				if (error != 0)
    264 					return (error);
    265 			}
    266 		} else {
    267 			file_t	*ffp;
    268 			vnode_t	*readvp;
    269 			size_t	size;
    270 			caddr_t	ptr;
    271 
    272 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
    273 				return (EBADF);
    274 
    275 			if ((ffp->f_flag & FREAD) == 0) {
    276 				releasef(sfv->sfv_fd);
    277 				return (EBADF);
    278 			}
    279 
    280 			readvp = ffp->f_vnode;
    281 			if (readvp->v_type != VREG) {
    282 				releasef(sfv->sfv_fd);
    283 				return (EINVAL);
    284 			}
    285 
    286 			/*
    287 			 * No point reading and writing to same vp,
    288 			 * as long as both are regular files. readvp is not
    289 			 * locked; but since we got it from an open file the
    290 			 * contents will be valid during the time of access.
    291 			 */
    292 			if (vn_compare(vp, readvp)) {
    293 				releasef(sfv->sfv_fd);
    294 				return (EINVAL);
    295 			}
    296 
    297 			/*
    298 			 * Note: we assume readvp != vp. "vp" is already
    299 			 * locked, and "readvp" must not be.
    300 			 */
    301 			(void) VOP_RWLOCK(readvp, readflg, NULL);
    302 
    303 			/*
    304 			 * Same checks as in pread64.
    305 			 */
    306 			if (sfv_off > MAXOFFSET_T) {
    307 				VOP_RWUNLOCK(readvp, readflg, NULL);
    308 				releasef(sfv->sfv_fd);
    309 				return (EINVAL);
    310 			}
    311 
    312 			if (sfv_off + sfv_len > MAXOFFSET_T)
    313 				sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
    314 
    315 			/* Find the native blocksize to transfer data */
    316 			size = MIN(vp->v_vfsp->vfs_bsize,
    317 			    readvp->v_vfsp->vfs_bsize);
    318 			size = sfv_len < size ? sfv_len : size;
    319 			ptr = kmem_alloc(size, KM_SLEEP);
    320 
    321 			while (sfv_len > 0) {
    322 				size_t	iov_len;
    323 
    324 				iov_len = MIN(size, sfv_len);
    325 				aiov.iov_base = ptr;
    326 				aiov.iov_len = iov_len;
    327 				auio.uio_loffset = sfv_off;
    328 				auio.uio_iov = &aiov;
    329 				auio.uio_iovcnt = 1;
    330 				auio.uio_resid = iov_len;
    331 				auio.uio_segflg = UIO_SYSSPACE;
    332 				auio.uio_llimit = MAXOFFSET_T;
    333 				auio.uio_fmode = ffp->f_flag;
    334 				ioflag = auio.uio_fmode &
    335 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    336 
    337 				/*
    338 				 * If read sync is not asked for,
    339 				 * filter sync flags
    340 				 */
    341 				if ((ioflag & FRSYNC) == 0)
    342 					ioflag &= ~(FSYNC|FDSYNC);
    343 				error = VOP_READ(readvp, &auio, ioflag,
    344 				    fp->f_cred, NULL);
    345 				if (error) {
    346 					kmem_free(ptr, size);
    347 					VOP_RWUNLOCK(readvp, readflg, NULL);
    348 					releasef(sfv->sfv_fd);
    349 					return (error);
    350 				}
    351 
    352 				/*
    353 				 * Check how must data was really read.
    354 				 * Decrement the 'len' and increment the
    355 				 * 'off' appropriately.
    356 				 */
    357 				cnt = iov_len - auio.uio_resid;
    358 				if (cnt == 0) {
    359 					/*
    360 					 * If we were reading a pipe (currently
    361 					 * not implemented), we may now lose
    362 					 * data.
    363 					 */
    364 					kmem_free(ptr, size);
    365 					VOP_RWUNLOCK(readvp, readflg, NULL);
    366 					releasef(sfv->sfv_fd);
    367 					return (EINVAL);
    368 				}
    369 				sfv_len -= cnt;
    370 				sfv_off += cnt;
    371 
    372 				aiov.iov_base = ptr;
    373 				aiov.iov_len = cnt;
    374 				auio.uio_loffset = *fileoff;
    375 				auio.uio_resid = cnt;
    376 				auio.uio_segflg = UIO_SYSSPACE;
    377 				auio.uio_llimit = curproc->p_fsz_ctl;
    378 				auio.uio_fmode = fflag;
    379 				ioflag = auio.uio_fmode &
    380 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    381 				error = VOP_WRITE(vp, &auio, ioflag,
    382 				    fp->f_cred, NULL);
    383 
    384 				/*
    385 				 * Check how much data was written. Increment
    386 				 * the 'len' and decrement the 'off' if all
    387 				 * the data was not written.
    388 				 */
    389 				cnt -= auio.uio_resid;
    390 				sfv_len += auio.uio_resid;
    391 				sfv_off -= auio.uio_resid;
    392 				ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
    393 				if (vp->v_type == VREG)
    394 					*fileoff += cnt;
    395 				*count += cnt;
    396 				if (error != 0) {
    397 					kmem_free(ptr, size);
    398 					VOP_RWUNLOCK(readvp, readflg, NULL);
    399 					releasef(sfv->sfv_fd);
    400 					return (error);
    401 				}
    402 			}
    403 			VOP_RWUNLOCK(readvp, readflg, NULL);
    404 			releasef(sfv->sfv_fd);
    405 			kmem_free(ptr, size);
    406 		}
    407 		sfv++;
    408 	}
    409 	return (0);
    410 }
    411 
    412 ssize32_t
    413 sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
    414 	size32_t *xferred, int fildes)
    415 {
    416 	u_offset_t		fileoff;
    417 	int			copy_cnt;
    418 	const struct ksendfilevec64 *copy_vec;
    419 	struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
    420 	struct vnode *vp;
    421 	int error;
    422 	ssize32_t count = 0;
    423 
    424 	vp = fp->f_vnode;
    425 	(void) VOP_RWLOCK(vp, rwflag, NULL);
    426 
    427 	copy_vec = vec;
    428 	fileoff = fp->f_offset;
    429 
    430 	do {
    431 		copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
    432 		if (copyin(copy_vec, sfv, copy_cnt *
    433 		    sizeof (struct ksendfilevec64))) {
    434 			error = EFAULT;
    435 			break;
    436 		}
    437 
    438 		/*
    439 		 * Optimize the regular file over
    440 		 * the socket case.
    441 		 */
    442 		if (vp->v_type == VSOCK && sfv->sfv_fd != SFV_FD_SELF) {
    443 			file_t *rfp;
    444 			vnode_t *rvp;
    445 
    446 			if ((rfp = getf(sfv->sfv_fd)) == NULL) {
    447 				error = EBADF;
    448 				break;
    449 			}
    450 			if ((rfp->f_flag & FREAD) == 0) {
    451 				releasef(sfv->sfv_fd);
    452 				error = EBADF;
    453 				break;
    454 			}
    455 			rvp = rfp->f_vnode;
    456 			if (rvp->v_type == VREG) {
    457 				error = sosendfile64(fp, rfp, sfv, &count);
    458 				if (error)
    459 					break;
    460 				copy_vec++;
    461 				sfvcnt--;
    462 				continue;
    463 			}
    464 			releasef(sfv->sfv_fd);
    465 		}
    466 		error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
    467 		if (error != 0)
    468 			break;
    469 
    470 		copy_vec += copy_cnt;
    471 		sfvcnt -= copy_cnt;
    472 	} while (sfvcnt > 0);
    473 
    474 	if (vp->v_type == VREG)
    475 		fp->f_offset += count;
    476 
    477 	VOP_RWUNLOCK(vp, rwflag, NULL);
    478 	if (copyout(&count, xferred, sizeof (count)))
    479 		error = EFAULT;
    480 	releasef(fildes);
    481 	if (error != 0)
    482 		return (set_errno(error));
    483 	return (count);
    484 }
    485 #endif
    486 
    487 int
    488 sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    489     int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
    490 {
    491 	struct vnode *vp;
    492 	struct uio auio;
    493 	struct iovec aiov;
    494 	ushort_t fflag;
    495 	int ioflag;
    496 	int i, error;
    497 	size_t cnt;
    498 	ssize_t sfv_len;
    499 	u_offset_t sfv_off;
    500 #ifdef _SYSCALL32_IMPL
    501 	model_t model = get_udatamodel();
    502 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
    503 		MAXOFF32_T : MAXOFFSET_T;
    504 #else
    505 	const u_offset_t maxoff = MAXOFF32_T;
    506 #endif
    507 	mblk_t *dmp = NULL;
    508 	int wroff;
    509 	int buf_left = 0;
    510 	size_t	iov_len;
    511 	mblk_t  *head, *tmp;
    512 	size_t  size = total_size;
    513 	size_t  extra;
    514 	int tail_len;
    515 
    516 	fflag = fp->f_flag;
    517 	vp = fp->f_vnode;
    518 
    519 	ASSERT(vp->v_type == VSOCK);
    520 	ASSERT(maxblk > 0);
    521 
    522 	wroff = (int)vp->v_stream->sd_wroff;
    523 	tail_len = (int)vp->v_stream->sd_tail;
    524 	extra = wroff + tail_len;
    525 
    526 	buf_left = MIN(total_size, maxblk);
    527 	head = dmp = allocb(buf_left + extra, BPRI_HI);
    528 	if (head == NULL)
    529 		return (ENOMEM);
    530 	head->b_wptr = head->b_rptr = head->b_rptr + wroff;
    531 
    532 	auio.uio_extflg = UIO_COPY_DEFAULT;
    533 	for (i = 0; i < copy_cnt; i++) {
    534 		if (ISSIG(curthread, JUSTLOOKING))
    535 			return (EINTR);
    536 
    537 		/*
    538 		 * Do similar checks as "write" as we are writing
    539 		 * sfv_len bytes into "vp".
    540 		 */
    541 		sfv_len = (ssize_t)sfv->sfv_len;
    542 
    543 		if (sfv_len == 0) {
    544 			sfv++;
    545 			continue;
    546 		}
    547 
    548 		/* Make sure sfv_len is not negative */
    549 #ifdef _SYSCALL32_IMPL
    550 		if (model == DATAMODEL_ILP32) {
    551 			if ((ssize32_t)sfv_len < 0)
    552 				return (EINVAL);
    553 		} else
    554 #endif
    555 		if (sfv_len < 0)
    556 			return (EINVAL);
    557 
    558 		/* Check for overflow */
    559 #ifdef _SYSCALL32_IMPL
    560 		if (model == DATAMODEL_ILP32) {
    561 			if (((ssize32_t)(*count + sfv_len)) < 0)
    562 				return (EINVAL);
    563 		} else
    564 #endif
    565 		if ((*count + sfv_len) < 0)
    566 			return (EINVAL);
    567 
    568 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
    569 
    570 		if (sfv->sfv_fd == SFV_FD_SELF) {
    571 			while (sfv_len > 0) {
    572 				if (buf_left == 0) {
    573 					tmp = dmp;
    574 					buf_left = MIN(total_size, maxblk);
    575 					iov_len = MIN(buf_left, sfv_len);
    576 					dmp = allocb(buf_left + extra, BPRI_HI);
    577 					if (dmp == NULL) {
    578 						freemsg(head);
    579 						return (ENOMEM);
    580 					}
    581 					dmp->b_wptr = dmp->b_rptr =
    582 					    dmp->b_rptr + wroff;
    583 					tmp->b_cont = dmp;
    584 				} else {
    585 					iov_len = MIN(buf_left, sfv_len);
    586 				}
    587 
    588 				aiov.iov_len = iov_len;
    589 				aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
    590 				auio.uio_loffset = *fileoff;
    591 				auio.uio_iovcnt = 1;
    592 				auio.uio_resid = iov_len;
    593 				auio.uio_iov = &aiov;
    594 				auio.uio_segflg = UIO_USERSPACE;
    595 				auio.uio_llimit = curproc->p_fsz_ctl;
    596 				auio.uio_fmode = fflag;
    597 
    598 				buf_left -= iov_len;
    599 				total_size -= iov_len;
    600 				sfv_len -= iov_len;
    601 				sfv_off += iov_len;
    602 
    603 				error = uiomove((caddr_t)dmp->b_wptr,
    604 				    iov_len, UIO_WRITE, &auio);
    605 				if (error != 0) {
    606 					freemsg(head);
    607 					return (error);
    608 				}
    609 				dmp->b_wptr += iov_len;
    610 			}
    611 		} else {
    612 			file_t	*ffp;
    613 			vnode_t	*readvp;
    614 
    615 			if ((ffp = getf(sfv->sfv_fd)) == NULL) {
    616 				freemsg(head);
    617 				return (EBADF);
    618 			}
    619 
    620 			if ((ffp->f_flag & FREAD) == 0) {
    621 				releasef(sfv->sfv_fd);
    622 				freemsg(head);
    623 				return (EACCES);
    624 			}
    625 
    626 			readvp = ffp->f_vnode;
    627 			if (readvp->v_type != VREG) {
    628 				releasef(sfv->sfv_fd);
    629 				freemsg(head);
    630 				return (EINVAL);
    631 			}
    632 
    633 			/*
    634 			 * No point reading and writing to same vp,
    635 			 * as long as both are regular files. readvp is not
    636 			 * locked; but since we got it from an open file the
    637 			 * contents will be valid during the time of access.
    638 			 */
    639 
    640 			if (vn_compare(vp, readvp)) {
    641 				releasef(sfv->sfv_fd);
    642 				freemsg(head);
    643 				return (EINVAL);
    644 			}
    645 
    646 			/*
    647 			 * Note: we assume readvp != vp. "vp" is already
    648 			 * locked, and "readvp" must not be.
    649 			 */
    650 
    651 			(void) VOP_RWLOCK(readvp, readflg, NULL);
    652 
    653 			/* Same checks as in pread */
    654 			if (sfv_off > maxoff) {
    655 				VOP_RWUNLOCK(readvp, readflg, NULL);
    656 				releasef(sfv->sfv_fd);
    657 				freemsg(head);
    658 				return (EINVAL);
    659 			}
    660 			if (sfv_off + sfv_len > maxoff) {
    661 				total_size -= (sfv_off + sfv_len - maxoff);
    662 				sfv_len = (ssize_t)((offset_t)maxoff -
    663 				    sfv_off);
    664 			}
    665 
    666 			while (sfv_len > 0) {
    667 				if (buf_left == 0) {
    668 					tmp = dmp;
    669 					buf_left = MIN(total_size, maxblk);
    670 					iov_len = MIN(buf_left, sfv_len);
    671 					dmp = allocb(buf_left + extra, BPRI_HI);
    672 					if (dmp == NULL) {
    673 						VOP_RWUNLOCK(readvp, readflg,
    674 									NULL);
    675 						releasef(sfv->sfv_fd);
    676 						freemsg(head);
    677 						return (ENOMEM);
    678 					}
    679 					dmp->b_wptr = dmp->b_rptr =
    680 					    dmp->b_rptr + wroff;
    681 					tmp->b_cont = dmp;
    682 				} else {
    683 					iov_len = MIN(buf_left, sfv_len);
    684 				}
    685 				aiov.iov_base = (caddr_t)dmp->b_wptr;
    686 				aiov.iov_len = iov_len;
    687 				auio.uio_loffset = sfv_off;
    688 				auio.uio_iov = &aiov;
    689 				auio.uio_iovcnt = 1;
    690 				auio.uio_resid = iov_len;
    691 				auio.uio_segflg = UIO_SYSSPACE;
    692 				auio.uio_llimit = MAXOFFSET_T;
    693 				auio.uio_fmode = ffp->f_flag;
    694 				ioflag = auio.uio_fmode &
    695 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    696 
    697 				/*
    698 				 * If read sync is not asked for,
    699 				 * filter sync flags
    700 				 */
    701 				if ((ioflag & FRSYNC) == 0)
    702 					ioflag &= ~(FSYNC|FDSYNC);
    703 				error = VOP_READ(readvp, &auio, ioflag,
    704 				    fp->f_cred, NULL);
    705 				if (error != 0) {
    706 					/*
    707 					 * If we were reading a pipe (currently
    708 					 * not implemented), we may now loose
    709 					 * data.
    710 					 */
    711 					VOP_RWUNLOCK(readvp, readflg, NULL);
    712 					releasef(sfv->sfv_fd);
    713 					freemsg(head);
    714 					return (error);
    715 				}
    716 
    717 				/*
    718 				 * Check how much data was really read.
    719 				 * Decrement the 'len' and increment the
    720 				 * 'off' appropriately.
    721 				 */
    722 				cnt = iov_len - auio.uio_resid;
    723 				if (cnt == 0) {
    724 					VOP_RWUNLOCK(readvp, readflg, NULL);
    725 					releasef(sfv->sfv_fd);
    726 					freemsg(head);
    727 					return (EINVAL);
    728 				}
    729 				sfv_len -= cnt;
    730 				sfv_off += cnt;
    731 				total_size -= cnt;
    732 				buf_left -= cnt;
    733 
    734 				dmp->b_wptr += cnt;
    735 			}
    736 			VOP_RWUNLOCK(readvp, readflg, NULL);
    737 			releasef(sfv->sfv_fd);
    738 		}
    739 		sfv++;
    740 	}
    741 
    742 	ASSERT(total_size == 0);
    743 	error = kstrwritemp(vp, head, fflag);
    744 	if (error != 0) {
    745 		freemsg(head);
    746 		return (error);
    747 	}
    748 	ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
    749 	*count += size;
    750 
    751 	return (0);
    752 }
    753 
    754 
    755 int
    756 sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
    757     int copy_cnt, ssize_t *count)
    758 {
    759 	struct vnode *vp;
    760 	struct uio auio;
    761 	struct iovec aiov;
    762 	ushort_t fflag;
    763 	int ioflag;
    764 	int i, error;
    765 	size_t cnt;
    766 	ssize_t sfv_len;
    767 	u_offset_t sfv_off;
    768 #ifdef _SYSCALL32_IMPL
    769 	model_t model = get_udatamodel();
    770 	u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
    771 		MAXOFF32_T : MAXOFFSET_T;
    772 #else
    773 	const u_offset_t maxoff = MAXOFF32_T;
    774 #endif
    775 	mblk_t	*dmp = NULL;
    776 	char	*buf = NULL;
    777 	size_t  extra;
    778 	int maxblk, wroff, tail_len;
    779 	struct sonode *so;
    780 	stdata_t *stp;
    781 
    782 	fflag = fp->f_flag;
    783 	vp = fp->f_vnode;
    784 
    785 	if (vp->v_type == VSOCK) {
    786 		so = VTOSO(vp);
    787 		stp = vp->v_stream;
    788 		wroff = (int)stp->sd_wroff;
    789 		tail_len = (int)stp->sd_tail;
    790 		maxblk = (int)stp->sd_maxblk;
    791 		extra = wroff + tail_len;
    792 	}
    793 
    794 	auio.uio_extflg = UIO_COPY_DEFAULT;
    795 	for (i = 0; i < copy_cnt; i++) {
    796 		if (ISSIG(curthread, JUSTLOOKING))
    797 			return (EINTR);
    798 
    799 		/*
    800 		 * Do similar checks as "write" as we are writing
    801 		 * sfv_len bytes into "vp".
    802 		 */
    803 		sfv_len = (ssize_t)sfv->sfv_len;
    804 
    805 		if (sfv_len == 0) {
    806 			sfv++;
    807 			continue;
    808 		}
    809 
    810 		/* Make sure sfv_len is not negative */
    811 #ifdef _SYSCALL32_IMPL
    812 		if (model == DATAMODEL_ILP32) {
    813 			if ((ssize32_t)sfv_len < 0)
    814 				return (EINVAL);
    815 		} else
    816 #endif
    817 		if (sfv_len < 0)
    818 			return (EINVAL);
    819 
    820 		if (vp->v_type == VREG) {
    821 			if (*fileoff >= curproc->p_fsz_ctl) {
    822 				mutex_enter(&curproc->p_lock);
    823 				(void) rctl_action(
    824 				    rctlproc_legacy[RLIMIT_FSIZE],
    825 				    curproc->p_rctls, curproc, RCA_SAFE);
    826 				mutex_exit(&curproc->p_lock);
    827 
    828 				return (EFBIG);
    829 			}
    830 
    831 			if (*fileoff >= maxoff)
    832 				return (EFBIG);
    833 
    834 			if (*fileoff + sfv_len > maxoff)
    835 				return (EINVAL);
    836 		}
    837 
    838 		/* Check for overflow */
    839 #ifdef _SYSCALL32_IMPL
    840 		if (model == DATAMODEL_ILP32) {
    841 			if (((ssize32_t)(*count + sfv_len)) < 0)
    842 				return (EINVAL);
    843 		} else
    844 #endif
    845 		if ((*count + sfv_len) < 0)
    846 			return (EINVAL);
    847 
    848 		sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
    849 
    850 		if (sfv->sfv_fd == SFV_FD_SELF) {
    851 			aiov.iov_len = sfv_len;
    852 			aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
    853 			auio.uio_loffset = *fileoff;
    854 			auio.uio_iovcnt = 1;
    855 			auio.uio_resid = sfv_len;
    856 			auio.uio_iov = &aiov;
    857 			auio.uio_segflg = UIO_USERSPACE;
    858 			auio.uio_llimit = curproc->p_fsz_ctl;
    859 			auio.uio_fmode = fflag;
    860 
    861 			if (vp->v_type == VSOCK) {
    862 
    863 				/*
    864 				 * Optimize for the socket case
    865 				 */
    866 
    867 				dmp = allocb(sfv_len + extra, BPRI_HI);
    868 				if (dmp == NULL)
    869 					return (ENOMEM);
    870 				dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
    871 				error = uiomove((caddr_t)dmp->b_wptr,
    872 				    sfv_len, UIO_WRITE, &auio);
    873 				if (error != 0) {
    874 					freeb(dmp);
    875 					return (error);
    876 				}
    877 				dmp->b_wptr += sfv_len;
    878 				error = kstrwritemp(vp, dmp, fflag);
    879 				if (error != 0) {
    880 					freeb(dmp);
    881 					return (error);
    882 				}
    883 				ttolwp(curthread)->lwp_ru.ioch +=
    884 				    (ulong_t)sfv_len;
    885 				*count += sfv_len;
    886 			} else {
    887 				ioflag = auio.uio_fmode &
    888 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
    889 				while (sfv_len > 0) {
    890 					error = VOP_WRITE(vp, &auio, ioflag,
    891 					    fp->f_cred, NULL);
    892 					cnt = sfv_len - auio.uio_resid;
    893 					sfv_len -= cnt;
    894 					ttolwp(curthread)->lwp_ru.ioch +=
    895 					    (ulong_t)cnt;
    896 					*fileoff += cnt;
    897 					*count += cnt;
    898 					if (error != 0)
    899 						return (error);
    900 				}
    901 			}
    902 		} else {
    903 			int segmapit = 0;
    904 			file_t	*ffp;
    905 			vnode_t	*readvp;
    906 			struct vnode *realvp;
    907 			size_t	size;
    908 			caddr_t	ptr;
    909 
    910 			if ((ffp = getf(sfv->sfv_fd)) == NULL)
    911 				return (EBADF);
    912 
    913 			if ((ffp->f_flag & FREAD) == 0) {
    914 				releasef(sfv->sfv_fd);
    915 				return (EBADF);
    916 			}
    917 
    918 			readvp = ffp->f_vnode;
    919 			if (VOP_REALVP(readvp, &realvp, NULL) == 0)
    920 				readvp = realvp;
    921 			if (readvp->v_type != VREG) {
    922 				releasef(sfv->sfv_fd);
    923 				return (EINVAL);
    924 			}
    925 
    926 			/*
    927 			 * No point reading and writing to same vp,
    928 			 * as long as both are regular files. readvp is not
    929 			 * locked; but since we got it from an open file the
    930 			 * contents will be valid during the time of access.
    931 			 */
    932 			if (vn_compare(vp, readvp)) {
    933 				releasef(sfv->sfv_fd);
    934 				return (EINVAL);
    935 			}
    936 
    937 			/*
    938 			 * Note: we assume readvp != vp. "vp" is already
    939 			 * locked, and "readvp" must not be.
    940 			 */
    941 			(void) VOP_RWLOCK(readvp, readflg, NULL);
    942 
    943 			/* Same checks as in pread */
    944 			if (sfv_off > maxoff) {
    945 				VOP_RWUNLOCK(readvp, readflg, NULL);
    946 				releasef(sfv->sfv_fd);
    947 				return (EINVAL);
    948 			}
    949 			if (sfv_off + sfv_len > maxoff) {
    950 				sfv_len = (ssize_t)((offset_t)maxoff -
    951 				    sfv_off);
    952 			}
    953 			/* Find the native blocksize to transfer data */
    954 			size = MIN(vp->v_vfsp->vfs_bsize,
    955 			    readvp->v_vfsp->vfs_bsize);
    956 			size = sfv_len < size ? sfv_len : size;
    957 
    958 			if (vp->v_type != VSOCK) {
    959 				segmapit = 0;
    960 				buf = kmem_alloc(size, KM_NOSLEEP);
    961 				if (buf == NULL) {
    962 					VOP_RWUNLOCK(readvp, readflg, NULL);
    963 					releasef(sfv->sfv_fd);
    964 					return (ENOMEM);
    965 				}
    966 			} else {
    967 				/*
    968 				 * For sockets acting as an SSL proxy, we
    969 				 * need to adjust the size to the maximum
    970 				 * SSL record size set in the stream head.
    971 				 */
    972 				if (so->so_kssl_ctx != NULL)
    973 					size = MIN(size, maxblk);
    974 
    975 				if (vn_has_flocks(readvp) ||
    976 				    readvp->v_flag & VNOMAP ||
    977 				    stp->sd_copyflag & STZCVMUNSAFE) {
    978 					segmapit = 0;
    979 				} else if (stp->sd_copyflag & STZCVMSAFE) {
    980 					segmapit = 1;
    981 				} else {
    982 					int on = 1;
    983 					if (SOP_SETSOCKOPT(VTOSO(vp),
    984 					    SOL_SOCKET, SO_SND_COPYAVOID,
    985 					    &on, sizeof (on)) == 0)
    986 					segmapit = 1;
    987 				}
    988 			}
    989 
    990 			if (segmapit) {
    991 				boolean_t nowait;
    992 				uint_t maxpsz;
    993 
    994 				nowait = (sfv->sfv_flag & SFV_NOWAIT) != 0;
    995 				maxpsz = stp->sd_qn_maxpsz;
    996 				if (maxpsz == INFPSZ)
    997 					maxpsz = maxphys;
    998 				maxpsz = roundup(maxpsz, MAXBSIZE);
    999 				error = snf_segmap(fp, readvp, sfv_off,
   1000 					    (u_offset_t)sfv_len, maxpsz,
   1001 					    (ssize_t *)&cnt, nowait);
   1002 				releasef(sfv->sfv_fd);
   1003 				*count += cnt;
   1004 				if (error)
   1005 					return (error);
   1006 				sfv++;
   1007 				continue;
   1008 			}
   1009 
   1010 			while (sfv_len > 0) {
   1011 				size_t	iov_len;
   1012 
   1013 				iov_len = MIN(size, sfv_len);
   1014 
   1015 				if (vp->v_type == VSOCK) {
   1016 					dmp = allocb(iov_len + extra, BPRI_HI);
   1017 					if (dmp == NULL) {
   1018 						VOP_RWUNLOCK(readvp, readflg,
   1019 						    NULL);
   1020 						releasef(sfv->sfv_fd);
   1021 						return (ENOMEM);
   1022 					}
   1023 					dmp->b_wptr = dmp->b_rptr =
   1024 					    dmp->b_rptr + wroff;
   1025 					ptr = (caddr_t)dmp->b_rptr;
   1026 				} else {
   1027 					ptr = buf;
   1028 				}
   1029 
   1030 				aiov.iov_base = ptr;
   1031 				aiov.iov_len = iov_len;
   1032 				auio.uio_loffset = sfv_off;
   1033 				auio.uio_iov = &aiov;
   1034 				auio.uio_iovcnt = 1;
   1035 				auio.uio_resid = iov_len;
   1036 				auio.uio_segflg = UIO_SYSSPACE;
   1037 				auio.uio_llimit = MAXOFFSET_T;
   1038 				auio.uio_fmode = ffp->f_flag;
   1039 				ioflag = auio.uio_fmode &
   1040 				    (FAPPEND|FSYNC|FDSYNC|FRSYNC);
   1041 
   1042 				/*
   1043 				 * If read sync is not asked for,
   1044 				 * filter sync flags
   1045 				 */
   1046 				if ((ioflag & FRSYNC) == 0)
   1047 					ioflag &= ~(FSYNC|FDSYNC);
   1048 				error = VOP_READ(readvp, &auio, ioflag,
   1049 				    fp->f_cred, NULL);
   1050 				if (error != 0) {
   1051 					/*
   1052 					 * If we were reading a pipe (currently
   1053 					 * not implemented), we may now lose
   1054 					 * data.
   1055 					 */
   1056 					if (vp->v_type == VSOCK)
   1057 						freeb(dmp);
   1058 					else
   1059 						kmem_free(buf, size);
   1060 					VOP_RWUNLOCK(