Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)aio.c	1.111	07/06/25 SMI"
     28 
     29 /*
     30  * Kernel asynchronous I/O.
     31  * This is only for raw devices now (as of Nov. 1993).
     32  */
     33 
     34 #include <sys/types.h>
     35 #include <sys/errno.h>
     36 #include <sys/conf.h>
     37 #include <sys/file.h>
     38 #include <sys/fs/snode.h>
     39 #include <sys/unistd.h>
     40 #include <sys/cmn_err.h>
     41 #include <vm/as.h>
     42 #include <vm/faultcode.h>
     43 #include <sys/sysmacros.h>
     44 #include <sys/procfs.h>
     45 #include <sys/kmem.h>
     46 #include <sys/autoconf.h>
     47 #include <sys/ddi_impldefs.h>
     48 #include <sys/sunddi.h>
     49 #include <sys/aio_impl.h>
     50 #include <sys/debug.h>
     51 #include <sys/param.h>
     52 #include <sys/systm.h>
     53 #include <sys/vmsystm.h>
     54 #include <sys/fs/pxfs_ki.h>
     55 #include <sys/contract/process_impl.h>
     56 
     57 /*
     58  * external entry point.
     59  */
     60 #ifdef _LP64
     61 static int64_t kaioc(long, long, long, long, long, long);
     62 #endif
     63 static int kaio(ulong_t *, rval_t *);
     64 
     65 
/*
 * Run-mode selectors handed to the aiocb-based helpers (aliowait,
 * aiosuspend, aioerror, aiorw, aio_cancel) so they know which user-level
 * control block layout the caller used:
 *	AIO_64		passed by kaioc() (the _LP64 entry point)
 *	AIO_32		passed by kaio() for the plain requests
 *	AIO_LARGEFILE	passed by kaio() for the "...64" large-file requests
 */
#define	AIO_64	0
#define	AIO_32	1
#define	AIO_LARGEFILE	2
     69 
     70 /*
     71  * implementation specific functions (private)
     72  */
     73 #ifdef _LP64
     74 static int alio(int, aiocb_t **, int, struct sigevent *);
     75 #endif
     76 static int aionotify(void);
     77 static int aioinit(void);
     78 static int aiostart(void);
     79 static void alio_cleanup(aio_t *, aiocb_t **, int, int);
     80 static int (*check_vp(struct vnode *, int))(vnode_t *, struct aio_req *,
     81     cred_t *);
     82 static void lio_set_error(aio_req_t *, int portused);
     83 static aio_t *aio_aiop_alloc();
     84 static int aio_req_alloc(aio_req_t **, aio_result_t *);
     85 static int aio_lio_alloc(aio_lio_t **);
     86 static aio_req_t *aio_req_done(void *);
     87 static aio_req_t *aio_req_remove(aio_req_t *);
     88 static int aio_req_find(aio_result_t *, aio_req_t **);
     89 static int aio_hash_insert(struct aio_req_t *, aio_t *);
     90 static int aio_req_setup(aio_req_t **, aio_t *, aiocb_t *,
     91     aio_result_t *, vnode_t *);
     92 static int aio_cleanup_thread(aio_t *);
     93 static aio_lio_t *aio_list_get(aio_result_t *);
     94 static void lio_set_uerror(void *, int);
     95 extern void aio_zerolen(aio_req_t *);
     96 static int aiowait(struct timeval *, int, long	*);
     97 static int aiowaitn(void *, uint_t, uint_t *, timespec_t *);
     98 static int aio_unlock_requests(caddr_t iocblist, int iocb_index,
     99     aio_req_t *reqlist, aio_t *aiop, model_t model);
    100 static int aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max);
    101 static int aiosuspend(void *, int, struct  timespec *, int,
    102     long	*, int);
    103 static int aliowait(int, void *, int, void *, int);
    104 static int aioerror(void *, int);
    105 static int aio_cancel(int, void *, long	*, int);
    106 static int arw(int, int, char *, int, offset_t, aio_result_t *, int);
    107 static int aiorw(int, void *, int, int);
    108 
    109 static int alioLF(int, void *, int, void *);
    110 static int aio_req_setupLF(aio_req_t **, aio_t *, aiocb64_32_t *,
    111     aio_result_t *, vnode_t *);
    112 static int alio32(int, void *, int, void *);
    113 static int driver_aio_write(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
    114 static int driver_aio_read(vnode_t *vp, struct aio_req *aio, cred_t *cred_p);
    115 
    116 #ifdef  _SYSCALL32_IMPL
    117 static void aiocb_LFton(aiocb64_32_t *, aiocb_t *);
    118 void	aiocb_32ton(aiocb32_t *, aiocb_t *);
    119 #endif /* _SYSCALL32_IMPL */
    120 
    121 /*
    122  * implementation specific functions (external)
    123  */
    124 void aio_req_free(aio_t *, aio_req_t *);
    125 
    126 /*
    127  * Event Port framework
    128  */
    129 
    130 void aio_req_free_port(aio_t *, aio_req_t *);
    131 static int aio_port_callback(void *, int *, pid_t, int, void *);
    132 
    133 /*
    134  * This is the loadable module wrapper.
    135  */
    136 #include <sys/modctl.h>
    137 #include <sys/syscall.h>
    138 
#ifdef _LP64

/*
 * System call table entry used on an _LP64 kernel: kaioc() takes its six
 * arguments directly.
 * NOTE(review): the initializers are positional; presumably they are the
 * argument count, the SE_* flags and the handler -- confirm against the
 * struct sysent definition.
 */
static struct sysent kaio_sysent = {
	6,
	SE_NOUNLOAD | SE_64RVAL | SE_ARGC,
	(int (*)())kaioc
};

#ifdef _SYSCALL32_IMPL
/*
 * Entry used for the 32-bit compatibility system call on an _LP64 kernel;
 * it is serviced by kaio(), which reads its arguments from an array
 * (indices 0 through 6 are used).
 */
static struct sysent kaio_sysent32 = {
	7,
	SE_NOUNLOAD | SE_64RVAL,
	kaio
};
#endif  /* _SYSCALL32_IMPL */

#else   /* _LP64 */

/*
 * Entry used when the kernel is not _LP64: kaio() is the only entry point.
 */
static struct sysent kaio_sysent = {
	7,
	SE_NOUNLOAD | SE_32RVAL1,
	kaio
};

#endif  /* _LP64 */
    164 
/*
 * Module linkage information for the kernel.
 *
 * modlsys describes the native system call entry (kaio_sysent);
 * modlsys32, present only with _SYSCALL32_IMPL, describes the 32-bit
 * compatibility entry (kaio_sysent32).  Both are collected in modlinkage,
 * which is what _init(), _fini() and _info() hand to the module framework.
 */

static struct modlsys modlsys = {
	&mod_syscallops,
	"kernel Async I/O",
	&kaio_sysent
};

#ifdef  _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"kernel Async I/O for 32 bit compatibility",
	&kaio_sysent32
};
#endif  /* _SYSCALL32_IMPL */


/* NULL-terminated list of the linkage structures above. */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef  _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
    192 
    193 int
    194 _init(void)
    195 {
    196 	int retval;
    197 
    198 	if ((retval = mod_install(&modlinkage)) != 0)
    199 		return (retval);
    200 
    201 	return (0);
    202 }
    203 
    204 int
    205 _fini(void)
    206 {
    207 	int retval;
    208 
    209 	retval = mod_remove(&modlinkage);
    210 
    211 	return (retval);
    212 }
    213 
    214 int
    215 _info(struct modinfo *modinfop)
    216 {
    217 	return (mod_info(&modlinkage, modinfop));
    218 }
    219 
    220 #ifdef	_LP64
    221 static int64_t
    222 kaioc(
    223 	long	a0,
    224 	long	a1,
    225 	long	a2,
    226 	long	a3,
    227 	long	a4,
    228 	long	a5)
    229 {
    230 	int	error;
    231 	long	rval = 0;
    232 
    233 	switch ((int)a0 & ~AIO_POLL_BIT) {
    234 	case AIOREAD:
    235 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
    236 		    (offset_t)a4, (aio_result_t *)a5, FREAD);
    237 		break;
    238 	case AIOWRITE:
    239 		error = arw((int)a0, (int)a1, (char *)a2, (int)a3,
    240 		    (offset_t)a4, (aio_result_t *)a5, FWRITE);
    241 		break;
    242 	case AIOWAIT:
    243 		error = aiowait((struct timeval *)a1, (int)a2, &rval);
    244 		break;
    245 	case AIOWAITN:
    246 		error = aiowaitn((void *)a1, (uint_t)a2, (uint_t *)a3,
    247 		    (timespec_t *)a4);
    248 		break;
    249 	case AIONOTIFY:
    250 		error = aionotify();
    251 		break;
    252 	case AIOINIT:
    253 		error = aioinit();
    254 		break;
    255 	case AIOSTART:
    256 		error = aiostart();
    257 		break;
    258 	case AIOLIO:
    259 		error = alio((int)a1, (aiocb_t **)a2, (int)a3,
    260 		    (struct sigevent *)a4);
    261 		break;
    262 	case AIOLIOWAIT:
    263 		error = aliowait((int)a1, (void *)a2, (int)a3,
    264 		    (struct sigevent *)a4, AIO_64);
    265 		break;
    266 	case AIOSUSPEND:
    267 		error = aiosuspend((void *)a1, (int)a2, (timespec_t *)a3,
    268 		    (int)a4, &rval, AIO_64);
    269 		break;
    270 	case AIOERROR:
    271 		error = aioerror((void *)a1, AIO_64);
    272 		break;
    273 	case AIOAREAD:
    274 		error = aiorw((int)a0, (void *)a1, FREAD, AIO_64);
    275 		break;
    276 	case AIOAWRITE:
    277 		error = aiorw((int)a0, (void *)a1, FWRITE, AIO_64);
    278 		break;
    279 	case AIOCANCEL:
    280 		error = aio_cancel((int)a1, (void *)a2, &rval, AIO_64);
    281 		break;
    282 
    283 	/*
    284 	 * The large file related stuff is valid only for
    285 	 * 32 bit kernel and not for 64 bit kernel
    286 	 * On 64 bit kernel we convert large file calls
    287 	 * to regular 64bit calls.
    288 	 */
    289 
    290 	default:
    291 		error = EINVAL;
    292 	}
    293 	if (error)
    294 		return ((int64_t)set_errno(error));
    295 	return (rval);
    296 }
    297 #endif
    298 
    299 static int
    300 kaio(
    301 	ulong_t *uap,
    302 	rval_t *rvp)
    303 {
    304 	long rval = 0;
    305 	int	error = 0;
    306 	offset_t	off;
    307 
    308 
    309 		rvp->r_vals = 0;
    310 #if defined(_LITTLE_ENDIAN)
    311 	off = ((u_offset_t)uap[5] << 32) | (u_offset_t)uap[4];
    312 #else
    313 	off = ((u_offset_t)uap[4] << 32) | (u_offset_t)uap[5];
    314 #endif
    315 
    316 	switch (uap[0] & ~AIO_POLL_BIT) {
    317 	/*
    318 	 * It must be the 32 bit system call on 64 bit kernel
    319 	 */
    320 	case AIOREAD:
    321 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
    322 		    (int)uap[3], off, (aio_result_t *)uap[6], FREAD));
    323 	case AIOWRITE:
    324 		return (arw((int)uap[0], (int)uap[1], (char *)uap[2],
    325 		    (int)uap[3], off, (aio_result_t *)uap[6], FWRITE));
    326 	case AIOWAIT:
    327 		error = aiowait((struct	timeval *)uap[1], (int)uap[2],
    328 		    &rval);
    329 		break;
    330 	case AIOWAITN:
    331 		error = aiowaitn((void *)uap[1], (uint_t)uap[2],
    332 		    (uint_t *)uap[3], (timespec_t *)uap[4]);
    333 		break;
    334 	case AIONOTIFY:
    335 		return (aionotify());
    336 	case AIOINIT:
    337 		return (aioinit());
    338 	case AIOSTART:
    339 		return (aiostart());
    340 	case AIOLIO:
    341 		return (alio32((int)uap[1], (void *)uap[2], (int)uap[3],
    342 		    (void *)uap[4]));
    343 	case AIOLIOWAIT:
    344 		return (aliowait((int)uap[1], (void *)uap[2],
    345 		    (int)uap[3], (struct sigevent *)uap[4], AIO_32));
    346 	case AIOSUSPEND:
    347 		error = aiosuspend((void *)uap[1], (int)uap[2],
    348 		    (timespec_t *)uap[3], (int)uap[4],
    349 		    &rval, AIO_32);
    350 		break;
    351 	case AIOERROR:
    352 		return (aioerror((void *)uap[1], AIO_32));
    353 	case AIOAREAD:
    354 		return (aiorw((int)uap[0], (void *)uap[1],
    355 		    FREAD, AIO_32));
    356 	case AIOAWRITE:
    357 		return (aiorw((int)uap[0], (void *)uap[1],
    358 		    FWRITE, AIO_32));
    359 	case AIOCANCEL:
    360 		error = (aio_cancel((int)uap[1], (void *)uap[2], &rval,
    361 		    AIO_32));
    362 		break;
    363 	case AIOLIO64:
    364 		return (alioLF((int)uap[1], (void *)uap[2],
    365 		    (int)uap[3], (void *)uap[4]));
    366 	case AIOLIOWAIT64:
    367 		return (aliowait(uap[1], (void *)uap[2],
    368 		    (int)uap[3], (void *)uap[4], AIO_LARGEFILE));
    369 	case AIOSUSPEND64:
    370 		error = aiosuspend((void *)uap[1], (int)uap[2],
    371 		    (timespec_t *)uap[3], (int)uap[4], &rval,
    372 		    AIO_LARGEFILE);
    373 		break;
    374 	case AIOERROR64:
    375 		return (aioerror((void *)uap[1], AIO_LARGEFILE));
    376 	case AIOAREAD64:
    377 		return (aiorw((int)uap[0], (void *)uap[1], FREAD,
    378 		    AIO_LARGEFILE));
    379 	case AIOAWRITE64:
    380 		return (aiorw((int)uap[0], (void *)uap[1], FWRITE,
    381 		    AIO_LARGEFILE));
    382 	case AIOCANCEL64:
    383 		error = (aio_cancel((int)uap[1], (void *)uap[2],
    384 		    &rval, AIO_LARGEFILE));
    385 		break;
    386 	default:
    387 		return (EINVAL);
    388 	}
    389 
    390 	rvp->r_val1 = rval;
    391 	return (error);
    392 }
    393 
    394 /*
    395  * wake up LWPs in this process that are sleeping in
    396  * aiowait().
    397  */
    398 static int
    399 aionotify(void)
    400 {
    401 	aio_t	*aiop;
    402 
    403 	aiop = curproc->p_aio;
    404 	if (aiop == NULL)
    405 		return (0);
    406 
    407 	mutex_enter(&aiop->aio_mutex);
    408 	aiop->aio_notifycnt++;
    409 	cv_broadcast(&aiop->aio_waitcv);
    410 	mutex_exit(&aiop->aio_mutex);
    411 
    412 	return (0);
    413 }
    414 
    415 static int
    416 timeval2reltime(struct timeval *timout, timestruc_t *rqtime,
    417 	timestruc_t **rqtp, int *blocking)
    418 {
    419 #ifdef	_SYSCALL32_IMPL
    420 	struct timeval32 wait_time_32;
    421 #endif
    422 	struct timeval wait_time;
    423 	model_t	model = get_udatamodel();
    424 
    425 	*rqtp = NULL;
    426 	if (timout == NULL) {		/* wait indefinitely */
    427 		*blocking = 1;
    428 		return (0);
    429 	}
    430 
    431 	/*
    432 	 * Need to correctly compare with the -1 passed in for a user
    433 	 * address pointer, with both 32 bit and 64 bit apps.
    434 	 */
    435 	if (model == DATAMODEL_NATIVE) {
    436 		if ((intptr_t)timout == (intptr_t)-1) {	/* don't wait */
    437 			*blocking = 0;
    438 			return (0);
    439 		}
    440 
    441 		if (copyin(timout, &wait_time, sizeof (wait_time)))
    442 			return (EFAULT);
    443 	}
    444 #ifdef	_SYSCALL32_IMPL
    445 	else {
    446 		/*
    447 		 * -1 from a 32bit app. It will not get sign extended.
    448 		 * don't wait if -1.
    449 		 */
    450 		if ((intptr_t)timout == (intptr_t)((uint32_t)-1)) {
    451 			*blocking = 0;
    452 			return (0);
    453 		}
    454 
    455 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
    456 			return (EFAULT);
    457 		TIMEVAL32_TO_TIMEVAL(&wait_time, &wait_time_32);
    458 	}
    459 #endif  /* _SYSCALL32_IMPL */
    460 
    461 	if (wait_time.tv_sec == 0 && wait_time.tv_usec == 0) {	/* don't wait */
    462 		*blocking = 0;
    463 		return (0);
    464 	}
    465 
    466 	if (wait_time.tv_sec < 0 ||
    467 	    wait_time.tv_usec < 0 || wait_time.tv_usec >= MICROSEC)
    468 		return (EINVAL);
    469 
    470 	rqtime->tv_sec = wait_time.tv_sec;
    471 	rqtime->tv_nsec = wait_time.tv_usec * 1000;
    472 	*rqtp = rqtime;
    473 	*blocking = 1;
    474 
    475 	return (0);
    476 }
    477 
    478 static int
    479 timespec2reltime(timespec_t *timout, timestruc_t *rqtime,
    480 	timestruc_t **rqtp, int *blocking)
    481 {
    482 #ifdef	_SYSCALL32_IMPL
    483 	timespec32_t wait_time_32;
    484 #endif
    485 	model_t	model = get_udatamodel();
    486 
    487 	*rqtp = NULL;
    488 	if (timout == NULL) {
    489 		*blocking = 1;
    490 		return (0);
    491 	}
    492 
    493 	if (model == DATAMODEL_NATIVE) {
    494 		if (copyin(timout, rqtime, sizeof (*rqtime)))
    495 			return (EFAULT);
    496 	}
    497 #ifdef	_SYSCALL32_IMPL
    498 	else {
    499 		if (copyin(timout, &wait_time_32, sizeof (wait_time_32)))
    500 			return (EFAULT);
    501 		TIMESPEC32_TO_TIMESPEC(rqtime, &wait_time_32);
    502 	}
    503 #endif  /* _SYSCALL32_IMPL */
    504 
    505 	if (rqtime->tv_sec == 0 && rqtime->tv_nsec == 0) {
    506 		*blocking = 0;
    507 		return (0);
    508 	}
    509 
    510 	if (rqtime->tv_sec < 0 ||
    511 	    rqtime->tv_nsec < 0 || rqtime->tv_nsec >= NANOSEC)
    512 		return (EINVAL);
    513 
    514 	*rqtp = rqtime;
    515 	*blocking = 1;
    516 
    517 	return (0);
    518 }
    519 
    520 /*ARGSUSED*/
    521 static int
    522 aiowait(
    523 	struct timeval	*timout,
    524 	int	dontblockflg,
    525 	long	*rval)
    526 {
    527 	int 		error;
    528 	aio_t		*aiop;
    529 	aio_req_t	*reqp;
    530 	clock_t		status;
    531 	int		blocking;
    532 	int		timecheck;
    533 	timestruc_t	rqtime;
    534 	timestruc_t	*rqtp;
    535 
    536 	aiop = curproc->p_aio;
    537 	if (aiop == NULL)
    538 		return (EINVAL);
    539 
    540 	/*
    541 	 * Establish the absolute future time for the timeout.
    542 	 */
    543 	error = timeval2reltime(timout, &rqtime, &rqtp, &blocking);
    544 	if (error)
    545 		return (error);
    546 	if (rqtp) {
    547 		timestruc_t now;
    548 		timecheck = timechanged;
    549 		gethrestime(&now);
    550 		timespecadd(rqtp, &now);
    551 	}
    552 
    553 	mutex_enter(&aiop->aio_mutex);
    554 	for (;;) {
    555 		/* process requests on poll queue */
    556 		if (aiop->aio_pollq) {
    557 			mutex_exit(&aiop->aio_mutex);
    558 			aio_cleanup(0);
    559 			mutex_enter(&aiop->aio_mutex);
    560 		}
    561 		if ((reqp = aio_req_remove(NULL)) != NULL) {
    562 			*rval = (long)reqp->aio_req_resultp;
    563 			break;
    564 		}
    565 		/* user-level done queue might not be empty */
    566 		if (aiop->aio_notifycnt > 0) {
    567 			aiop->aio_notifycnt--;
    568 			*rval = 1;
    569 			break;
    570 		}
    571 		/* don't block if no outstanding aio */
    572 		if (aiop->aio_outstanding == 0 && dontblockflg) {
    573 			error = EINVAL;
    574 			break;
    575 		}
    576 		if (blocking) {
    577 			status = cv_waituntil_sig(&aiop->aio_waitcv,
    578 			    &aiop->aio_mutex, rqtp, timecheck);
    579 
    580 			if (status > 0)		/* check done queue again */
    581 				continue;
    582 			if (status == 0) {	/* interrupted by a signal */
    583 				error = EINTR;
    584 				*rval = -1;
    585 			} else {		/* timer expired */
    586 				error = ETIME;
    587 			}
    588 		}
    589 		break;
    590 	}
    591 	mutex_exit(&aiop->aio_mutex);
    592 	if (reqp) {
    593 		aphysio_unlock(reqp);
    594 		aio_copyout_result(reqp);
    595 		mutex_enter(&aiop->aio_mutex);
    596 		aio_req_free(aiop, reqp);
    597 		mutex_exit(&aiop->aio_mutex);
    598 	}
    599 	return (error);
    600 }
    601 
    602 /*
    603  * aiowaitn can be used to reap completed asynchronous requests submitted with
    604  * lio_listio, aio_read or aio_write.
    605  * This function only reaps asynchronous raw I/Os.
    606  */
    607 
    608 /*ARGSUSED*/
    609 static int
    610 aiowaitn(void *uiocb, uint_t nent, uint_t *nwait, timespec_t *timout)
    611 {
    612 	int 		error = 0;
    613 	aio_t		*aiop;
    614 	aio_req_t	*reqlist = NULL;
    615 	caddr_t		iocblist = NULL;	/* array of iocb ptr's */
    616 	uint_t		waitcnt, cnt = 0;	/* iocb cnt */
    617 	size_t		iocbsz;			/* users iocb size */
    618 	size_t		riocbsz;		/* returned iocb size */
    619 	int		iocb_index = 0;
    620 	model_t		model = get_udatamodel();
    621 	int		blocking = 1;
    622 	int		timecheck;
    623 	timestruc_t	rqtime;
    624 	timestruc_t	*rqtp;
    625 
    626 	aiop = curproc->p_aio;
    627 
    628 	if (aiop == NULL || aiop->aio_outstanding == 0)
    629 		return (EAGAIN);
    630 
    631 	if (copyin(nwait, &waitcnt, sizeof (uint_t)))
    632 		return (EFAULT);
    633 
    634 	/* set *nwait to zero, if we must return prematurely */
    635 	if (copyout(&cnt, nwait, sizeof (uint_t)))
    636 		return (EFAULT);
    637 
    638 	if (waitcnt == 0) {
    639 		blocking = 0;
    640 		rqtp = NULL;
    641 		waitcnt = nent;
    642 	} else {
    643 		error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
    644 		if (error)
    645 			return (error);
    646 	}
    647 
    648 	if (model == DATAMODEL_NATIVE)
    649 		iocbsz = (sizeof (aiocb_t *) * nent);
    650 #ifdef	_SYSCALL32_IMPL
    651 	else
    652 		iocbsz = (sizeof (caddr32_t) * nent);
    653 #endif  /* _SYSCALL32_IMPL */
    654 
    655 	/*
    656 	 * Only one aio_waitn call is allowed at a time.
    657 	 * The active aio_waitn will collect all requests
    658 	 * out of the "done" list and if necessary it will wait
    659 	 * for some/all pending requests to fulfill the nwait
    660 	 * parameter.
    661 	 * A second or further aio_waitn calls will sleep here
    662 	 * until the active aio_waitn finishes and leaves the kernel
    663 	 * If the second call does not block (poll), then return
    664 	 * immediately with the error code : EAGAIN.
    665 	 * If the second call should block, then sleep here, but
    666 	 * do not touch the timeout. The timeout starts when this
    667 	 * aio_waitn-call becomes active.
    668 	 */
    669 
    670 	mutex_enter(&aiop->aio_mutex);
    671 
    672 	while (aiop->aio_flags & AIO_WAITN) {
    673 		if (blocking == 0) {
    674 			mutex_exit(&aiop->aio_mutex);
    675 			return (EAGAIN);
    676 		}
    677 
    678 		/* block, no timeout */
    679 		aiop->aio_flags |= AIO_WAITN_PENDING;
    680 		if (!cv_wait_sig(&aiop->aio_waitncv, &aiop->aio_mutex)) {
    681 			mutex_exit(&aiop->aio_mutex);
    682 			return (EINTR);
    683 		}
    684 	}
    685 
    686 	/*
    687 	 * Establish the absolute future time for the timeout.
    688 	 */
    689 	if (rqtp) {
    690 		timestruc_t now;
    691 		timecheck = timechanged;
    692 		gethrestime(&now);
    693 		timespecadd(rqtp, &now);
    694 	}
    695 
    696 	if (iocbsz > aiop->aio_iocbsz && aiop->aio_iocb != NULL) {
    697 		kmem_free(aiop->aio_iocb, aiop->aio_iocbsz);
    698 		aiop->aio_iocb = NULL;
    699 	}
    700 
    701 	if (aiop->aio_iocb == NULL) {
    702 		iocblist = kmem_zalloc(iocbsz, KM_NOSLEEP);
    703 		if (iocblist == NULL) {
    704 			mutex_exit(&aiop->aio_mutex);
    705 			return (ENOMEM);
    706 		}
    707 		aiop->aio_iocb = (aiocb_t **)iocblist;
    708 		aiop->aio_iocbsz = iocbsz;
    709 	} else {
    710 		iocblist = (char *)aiop->aio_iocb;
    711 	}
    712 
    713 	aiop->aio_waitncnt = waitcnt;
    714 	aiop->aio_flags |= AIO_WAITN;
    715 
    716 	for (;;) {
    717 		/* push requests on poll queue to done queue */
    718 		if (aiop->aio_pollq) {
    719 			mutex_exit(&aiop->aio_mutex);
    720 			aio_cleanup(0);
    721 			mutex_enter(&aiop->aio_mutex);
    722 		}
    723 
    724 		/* check for requests on done queue */
    725 		if (aiop->aio_doneq) {
    726 			cnt += aio_reqlist_concat(aiop, &reqlist, nent - cnt);
    727 			aiop->aio_waitncnt = waitcnt - cnt;
    728 		}
    729 
    730 		/* user-level done queue might not be empty */
    731 		if (aiop->aio_notifycnt > 0) {
    732 			aiop->aio_notifycnt--;
    733 			error = 0;
    734 			break;
    735 		}
    736 
    737 		/*
    738 		 * if we are here second time as a result of timer
    739 		 * expiration, we reset error if there are enough
    740 		 * aiocb's to satisfy request.
    741 		 * We return also if all requests are already done
    742 		 * and we picked up the whole done queue.
    743 		 */
    744 
    745 		if ((cnt >= waitcnt) || (cnt > 0 && aiop->aio_pending == 0 &&
    746 		    aiop->aio_doneq == NULL)) {
    747 			error = 0;
    748 			break;
    749 		}
    750 
    751 		if ((cnt < waitcnt) && blocking) {
    752 			int rval = cv_waituntil_sig(&aiop->aio_waitcv,
    753 			    &aiop->aio_mutex, rqtp, timecheck);
    754 			if (rval > 0)
    755 				continue;
    756 			if (rval < 0) {
    757 				error = ETIME;
    758 				blocking = 0;
    759 				continue;
    760 			}
    761 			error = EINTR;
    762 		}
    763 		break;
    764 	}
    765 
    766 	mutex_exit(&aiop->aio_mutex);
    767 
    768 	if (cnt > 0) {
    769 
    770 		iocb_index = aio_unlock_requests(iocblist, iocb_index, reqlist,
    771 		    aiop, model);
    772 
    773 		if (model == DATAMODEL_NATIVE)
    774 			riocbsz = (sizeof (aiocb_t *) * cnt);
    775 #ifdef	_SYSCALL32_IMPL
    776 		else
    777 			riocbsz = (sizeof (caddr32_t) * cnt);
    778 #endif  /* _SYSCALL32_IMPL */
    779 
    780 		if (copyout(iocblist, uiocb, riocbsz) ||
    781 		    copyout(&cnt, nwait, sizeof (uint_t)))
    782 			error = EFAULT;
    783 	}
    784 
    785 	if (aiop->aio_iocbsz > AIO_IOCB_MAX) {
    786 		kmem_free(iocblist, aiop->aio_iocbsz);
    787 		aiop->aio_iocb = NULL;
    788 	}
    789 
    790 	/* check if there is another thread waiting for execution */
    791 	mutex_enter(&aiop->aio_mutex);
    792 	aiop->aio_flags &= ~AIO_WAITN;
    793 	if (aiop->aio_flags & AIO_WAITN_PENDING) {
    794 		aiop->aio_flags &= ~AIO_WAITN_PENDING;
    795 		cv_signal(&aiop->aio_waitncv);
    796 	}
    797 	mutex_exit(&aiop->aio_mutex);
    798 
    799 	return (error);
    800 }
    801 
    802 /*
    803  * aio_unlock_requests
    804  * copyouts the result of the request as well as the return value.
    805  * It builds the list of completed asynchronous requests,
    806  * unlocks the allocated memory ranges and
    807  * put the aio request structure back into the free list.
    808  */
    809 
    810 static int
    811 aio_unlock_requests(
    812 	caddr_t	iocblist,
    813 	int	iocb_index,
    814 	aio_req_t *reqlist,
    815 	aio_t	*aiop,
    816 	model_t	model)
    817 {
    818 	aio_req_t	*reqp, *nreqp;
    819 
    820 	if (model == DATAMODEL_NATIVE) {
    821 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
    822 			(((caddr_t *)iocblist)[iocb_index++]) =
    823 			    reqp->aio_req_iocb.iocb;
    824 			nreqp = reqp->aio_req_next;
    825 			aphysio_unlock(reqp);
    826 			aio_copyout_result(reqp);
    827 			mutex_enter(&aiop->aio_mutex);
    828 			aio_req_free(aiop, reqp);
    829 			mutex_exit(&aiop->aio_mutex);
    830 		}
    831 	}
    832 #ifdef	_SYSCALL32_IMPL
    833 	else {
    834 		for (reqp = reqlist; reqp != NULL;  reqp = nreqp) {
    835 			((caddr32_t *)iocblist)[iocb_index++] =
    836 			    reqp->aio_req_iocb.iocb32;
    837 			nreqp = reqp->aio_req_next;
    838 			aphysio_unlock(reqp);
    839 			aio_copyout_result(reqp);
    840 			mutex_enter(&aiop->aio_mutex);
    841 			aio_req_free(aiop, reqp);
    842 			mutex_exit(&aiop->aio_mutex);
    843 		}
    844 	}
    845 #endif	/* _SYSCALL32_IMPL */
    846 	return (iocb_index);
    847 }
    848 
    849 /*
    850  * aio_reqlist_concat
    851  * moves "max" elements from the done queue to the reqlist queue and removes
    852  * the AIO_DONEQ flag.
    853  * - reqlist queue is a simple linked list
    854  * - done queue is a double linked list
    855  */
    856 
    857 static int
    858 aio_reqlist_concat(aio_t *aiop, aio_req_t **reqlist, int max)
    859 {
    860 	aio_req_t *q2, *q2work, *list;
    861 	int count = 0;
    862 
    863 	list = *reqlist;
    864 	q2 = aiop->aio_doneq;
    865 	q2work = q2;
    866 	while (max-- > 0) {
    867 		q2work->aio_req_flags &= ~AIO_DONEQ;
    868 		q2work = q2work->aio_req_next;
    869 		count++;
    870 		if (q2work == q2)
    871 			break;
    872 	}
    873 
    874 	if (q2work == q2) {
    875 		/* all elements revised */
    876 		q2->aio_req_prev->aio_req_next = list;
    877 		list = q2;
    878 		aiop->aio_doneq = NULL;
    879 	} else {
    880 		/*
    881 		 * max < elements in the doneq
    882 		 * detach only the required amount of elements
    883 		 * out of the doneq
    884 		 */
    885 		q2work->aio_req_prev->aio_req_next = list;
    886 		list = q2;
    887 
    888 		aiop->aio_doneq = q2work;
    889 		q2work->aio_req_prev = q2->aio_req_prev;
    890 		q2->aio_req_prev->aio_req_next = q2work;
    891 	}
    892 	*reqlist = list;
    893 	return (count);
    894 }
    895 
    896 /*ARGSUSED*/
    897 static int
    898 aiosuspend(
    899 	void	*aiocb,
    900 	int	nent,
    901 	struct	timespec	*timout,
    902 	int	flag,
    903 	long	*rval,
    904 	int	run_mode)
    905 {
    906 	int 		error;
    907 	aio_t		*aiop;
    908 	aio_req_t	*reqp, *found, *next;
    909 	caddr_t		cbplist = NULL;
    910 	aiocb_t		*cbp, **ucbp;
    911 #ifdef	_SYSCALL32_IMPL
    912 	aiocb32_t	*cbp32;
    913 	caddr32_t	*ucbp32;
    914 #endif  /* _SYSCALL32_IMPL */
    915 	aiocb64_32_t	*cbp64;
    916 	int		rv;
    917 	int		i;
    918 	size_t		ssize;
    919 	model_t		model = get_udatamodel();
    920 	int		blocking;
    921 	int		timecheck;
    922 	timestruc_t	rqtime;
    923 	timestruc_t	*rqtp;
    924 
    925 	aiop = curproc->p_aio;
    926 	if (aiop == NULL || nent <= 0)
    927 		return (EINVAL);
    928 
    929 	/*
    930 	 * Establish the absolute future time for the timeout.
    931 	 */
    932 	error = timespec2reltime(timout, &rqtime, &rqtp, &blocking);
    933 	if (error)
    934 		return (error);
    935 	if (rqtp) {
    936 		timestruc_t now;
    937 		timecheck = timechanged;
    938 		gethrestime(&now);
    939 		timespecadd(rqtp, &now);
    940 	}
    941 
    942 	/*
    943 	 * If we are not blocking and there's no IO complete
    944 	 * skip aiocb copyin.
    945 	 */
    946 	if (!blocking && (aiop->aio_pollq == NULL) &&
    947 	    (aiop->aio_doneq == NULL)) {
    948 		return (EAGAIN);
    949 	}
    950 
    951 	if (model == DATAMODEL_NATIVE)
    952 		ssize = (sizeof (aiocb_t *) * nent);
    953 #ifdef	_SYSCALL32_IMPL
    954 	else
    955 		ssize = (sizeof (caddr32_t) * nent);
    956 #endif  /* _SYSCALL32_IMPL */
    957 
    958 	cbplist = kmem_alloc(ssize, KM_NOSLEEP);
    959 	if (cbplist == NULL)
    960 		return (ENOMEM);
    961 
    962 	if (copyin(aiocb, cbplist, ssize)) {
    963 		error = EFAULT;
    964 		goto done;
    965 	}
    966 
    967 	found = NULL;
    968 	/*
    969 	 * we need to get the aio_cleanupq_mutex since we call
    970 	 * aio_req_done().
    971 	 */
    972 	mutex_enter(&aiop->aio_cleanupq_mutex);
    973 	mutex_enter(&aiop->aio_mutex);
    974 	for (;;) {
    975 		/* push requests on poll queue to done queue */
    976 		if (aiop->aio_pollq) {
    977 			mutex_exit(&aiop->aio_mutex);
    978 			mutex_exit(&aiop->aio_cleanupq_mutex);
    979 			aio_cleanup(0);
    980 			mutex_enter(&aiop->aio_cleanupq_mutex);
    981 			mutex_enter(&aiop->aio_mutex);
    982 		}
    983 		/* check for requests on done queue */
    984 		if (aiop->aio_doneq) {
    985 			if (model == DATAMODEL_NATIVE)
    986 				ucbp = (aiocb_t **)cbplist;
    987 #ifdef	_SYSCALL32_IMPL
    988 			else
    989 				ucbp32 = (caddr32_t *)cbplist;
    990 #endif  /* _SYSCALL32_IMPL */
    991 			for (i = 0; i < nent; i++) {
    992 				if (model == DATAMODEL_NATIVE) {
    993 					if ((cbp = *ucbp++) == NULL)
    994 						continue;
    995 					if (run_mode != AIO_LARGEFILE)
    996 						reqp = aio_req_done(
    997 						    &cbp->aio_resultp);
    998 					else {
    999 						cbp64 = (aiocb64_32_t *)cbp;
   1000 						reqp = aio_req_done(
   1001 						    &cbp64->aio_resultp);
   1002 					}
   1003 				}
   1004 #ifdef	_SYSCALL32_IMPL
   1005 				else {
   1006 					if (run_mode == AIO_32) {
   1007 						if ((cbp32 =
   1008 						    (aiocb32_t *)(uintptr_t)
   1009 						    *ucbp32++) == NULL)
   1010 							continue;
   1011 						reqp = aio_req_done(
   1012 						    &cbp32->aio_resultp);
   1013 					} else if (run_mode == AIO_LARGEFILE) {
   1014 						if ((cbp64 =
   1015 						    (aiocb64_32_t *)(uintptr_t)
   1016 						    *ucbp32++) == NULL)
   1017 							continue;
   1018 						reqp = aio_req_done(
   1019 						    &cbp64->aio_resultp);
   1020 					}
   1021 
   1022 				}
   1023 #endif  /* _SYSCALL32_IMPL */
   1024 				if (reqp) {
   1025 					reqp->aio_req_next = found;
   1026 					found = reqp;
   1027 				}
   1028 				if (aiop->aio_doneq == NULL)
   1029 					break;
   1030 			}
   1031 			if (found)
   1032 				break;
   1033 		}
   1034 		if (aiop->aio_notifycnt > 0) {
   1035 			/*
   1036 			 * nothing on the kernel's queue. the user
   1037 			 * has notified the kernel that it has items
   1038 			 * on a user-level queue.
   1039 			 */
   1040 			aiop->aio_notifycnt--;
   1041 			*rval = 1;
   1042 			error = 0;
   1043 			break;
   1044 		}
   1045 		/* don't block if nothing is outstanding */
   1046 		if (aiop->aio_outstanding == 0) {
   1047 			error = EAGAIN;
   1048 			break;
   1049 		}
   1050 		if (blocking) {
   1051 			/*
   1052 			 * drop the aio_cleanupq_mutex as we are
   1053 			 * going to block.
   1054 			 */
   1055 			mutex_exit(&aiop->aio_cleanupq_mutex);
   1056 			rv = cv_waituntil_sig(&aiop->aio_waitcv,
   1057 			    &aiop->aio_mutex, rqtp, timecheck);
   1058 			/*
   1059 			 * we have to drop aio_mutex and
   1060 			 * grab it in the right order.
   1061 			 */
   1062 			mutex_exit(&aiop->aio_mutex);
   1063 			mutex_enter(&aiop->aio_cleanupq_mutex);
   1064 			mutex_enter(&aiop->aio_mutex);
   1065 			if (rv > 0)	/* check done queue again */
   1066 				continue;
   1067 			if (rv == 0)	/* interrupted by a signal */
   1068 				error = EINTR;
   1069 			else		/* timer expired */
   1070 				error = ETIME;
   1071 		} else {
   1072 			error = EAGAIN;
   1073 		}
   1074 		break;
   1075 	}
   1076 	mutex_exit(&aiop->aio_mutex);
   1077 	mutex_exit(&aiop->aio_cleanupq_mutex);
   1078 	for (reqp = found; reqp != NULL; reqp = next) {
   1079 		next = reqp->aio_req_next;
   1080 		aphysio_unlock(reqp);
   1081 		aio_copyout_result(reqp);
   1082 		mutex_enter(&aiop->aio_mutex);
   1083 		aio_req_free(aiop, reqp);
   1084 		mutex_exit(&aiop->aio_mutex);
   1085 	}
   1086 done:
   1087 	kmem_free(cbplist, ssize);
   1088 	return (error);
   1089 }
   1090 
   1091 /*
   1092  * initialize aio by allocating an aio_t struct for this
   1093  * process.
   1094  */
   1095 static int
   1096 aioinit(void)
   1097 {
   1098 	proc_t *p = curproc;
   1099 	aio_t *aiop;
   1100 	mutex_enter(&p->p_lock);
   1101 	if ((aiop = p->p_aio) == NULL) {