Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)aio_subr.c	1.83	07/06/25 SMI"
     28 
     29 #include <sys/types.h>
     30 #include <sys/proc.h>
     31 #include <sys/file.h>
     32 #include <sys/errno.h>
     33 #include <sys/param.h>
     34 #include <sys/sysmacros.h>
     35 #include <sys/cmn_err.h>
     36 #include <sys/systm.h>
     37 #include <vm/as.h>
     38 #include <vm/page.h>
     39 #include <sys/uio.h>
     40 #include <sys/kmem.h>
     41 #include <sys/debug.h>
     42 #include <sys/aio_impl.h>
     43 #include <sys/epm.h>
     44 #include <sys/fs/snode.h>
     45 #include <sys/siginfo.h>
     46 #include <sys/cpuvar.h>
     47 #include <sys/tnf_probe.h>
     48 #include <sys/conf.h>
     49 #include <sys/sdt.h>
     50 
     51 int aphysio(int (*)(), int (*)(), dev_t, int, void (*)(), struct aio_req *);
     52 void aio_done(struct buf *);
     53 void aphysio_unlock(aio_req_t *);
     54 void aio_cleanup(int);
     55 void aio_cleanup_exit(void);
     56 
     57 /*
     58  * private functions
     59  */
     60 static void aio_sigev_send(proc_t *, sigqueue_t *);
     61 static void aio_hash_delete(aio_t *, aio_req_t *);
     62 static void aio_lio_free(aio_t *, aio_lio_t *);
     63 static void aio_cleanup_cleanupq(aio_t *, aio_req_t *, int);
     64 static int aio_cleanup_notifyq(aio_t *, aio_req_t *, int);
     65 static void aio_cleanup_pollq(aio_t *, aio_req_t *, int);
     66 static void aio_cleanup_portq(aio_t *, aio_req_t *, int);
     67 
     68 /*
     69  * async version of physio() that doesn't wait synchronously
     70  * for the driver's strategy routine to complete.
     71  */
     72 
     73 int
     74 aphysio(
     75 	int (*strategy)(struct buf *),
     76 	int (*cancel)(struct buf *),
     77 	dev_t dev,
     78 	int rw,
     79 	void (*mincnt)(struct buf *),
     80 	struct aio_req *aio)
     81 {
     82 	struct uio *uio = aio->aio_uio;
     83 	aio_req_t *reqp = (aio_req_t *)aio->aio_private;
     84 	struct buf *bp = &reqp->aio_req_buf;
     85 	struct iovec *iov;
     86 	struct as *as;
     87 	char *a;
     88 	int	error;
     89 	size_t	c;
     90 	struct page **pplist;
     91 	struct dev_ops *ops = devopsp[getmajor(dev)];
     92 
     93 	if (uio->uio_loffset < 0)
     94 		return (EINVAL);
     95 #ifdef	_ILP32
     96 	/*
     97 	 * For 32-bit kernels, check against SPEC_MAXOFFSET_T which represents
     98 	 * the maximum size that can be supported by the IO subsystem.
     99 	 * XXX this code assumes a D_64BIT driver.
    100 	 */
    101 	if (uio->uio_loffset > SPEC_MAXOFFSET_T)
    102 		return (EINVAL);
    103 #endif	/* _ILP32 */
    104 
    105 	TNF_PROBE_5(aphysio_start, "kaio", /* CSTYLED */,
    106 		tnf_opaque, bp, bp,
    107 		tnf_device, device, dev,
    108 		tnf_offset, blkno, btodt(uio->uio_loffset),
    109 		tnf_size, size, uio->uio_iov->iov_len,
    110 		tnf_bioflags, rw, rw);
    111 
    112 	if (rw == B_READ) {
    113 		CPU_STATS_ADD_K(sys, phread, 1);
    114 	} else {
    115 		CPU_STATS_ADD_K(sys, phwrite, 1);
    116 	}
    117 
    118 	iov = uio->uio_iov;
    119 	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
    120 	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
    121 
    122 	bp->b_error = 0;
    123 	bp->b_flags = B_BUSY | B_PHYS | B_ASYNC | rw;
    124 	bp->b_edev = dev;
    125 	bp->b_dev = cmpdev(dev);
    126 	bp->b_lblkno = btodt(uio->uio_loffset);
    127 	bp->b_offset = uio->uio_loffset;
    128 	(void) ops->devo_getinfo(NULL, DDI_INFO_DEVT2DEVINFO,
    129 	    (void *)bp->b_edev, (void **)&bp->b_dip);
    130 
    131 	/*
    132 	 * Clustering: Clustering can set the b_iodone, b_forw and
    133 	 * b_proc fields to cluster-specifc values.
    134 	 */
    135 	if (bp->b_iodone == NULL) {
    136 		bp->b_iodone = (int (*)()) aio_done;
    137 		/* b_forw points at an aio_req_t structure */
    138 		bp->b_forw = (struct buf *)reqp;
    139 		bp->b_proc = curproc;
    140 	}
    141 
    142 	a = bp->b_un.b_addr = iov->iov_base;
    143 	c = bp->b_bcount = iov->iov_len;
    144 
    145 	(*mincnt)(bp);
    146 	if (bp->b_bcount != iov->iov_len)
    147 		return (ENOTSUP);
    148 
    149 	as = bp->b_proc->p_as;
    150 
    151 	error = as_pagelock(as, &pplist, a,
    152 	    c, rw == B_READ? S_WRITE : S_READ);
    153 	if (error != 0) {
    154 		bp->b_flags |= B_ERROR;
    155 		bp->b_error = error;
    156 		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
    157 		return (error);
    158 	}
    159 	reqp->aio_req_flags |= AIO_PAGELOCKDONE;
    160 	bp->b_shadow = pplist;
    161 	if (pplist != NULL) {
    162 		bp->b_flags |= B_SHADOW;
    163 	}
    164 
    165 	if (cancel != anocancel)
    166 		cmn_err(CE_PANIC,
    167 		    "aphysio: cancellation not supported, use anocancel");
    168 
    169 	reqp->aio_req_cancel = cancel;
    170 
    171 	DTRACE_IO1(start, struct buf *, bp);
    172 
    173 	return ((*strategy)(bp));
    174 }
    175 
    176 /*ARGSUSED*/
    177 int
    178 anocancel(struct buf *bp)
    179 {
    180 	return (ENXIO);
    181 }
    182 
    183 /*
    184  * Called from biodone().
    185  * Notify process that a pending AIO has finished.
    186  */
    187 
    188 /*
    189  * Clustering: This function is made non-static as it is used
    190  * by clustering s/w as contract private interface.
    191  */
    192 
    193 void
    194 aio_done(struct buf *bp)
    195 {
    196 	proc_t *p;
    197 	struct as *as;
    198 	aio_req_t *reqp;
    199 	aio_lio_t *head = NULL;
    200 	aio_t *aiop;
    201 	sigqueue_t *sigev = NULL;
    202 	sigqueue_t *lio_sigev = NULL;
    203 	port_kevent_t *pkevp = NULL;
    204 	port_kevent_t *lio_pkevp = NULL;
    205 	int fd;
    206 	int cleanupqflag;
    207 	int pollqflag;
    208 	int portevpend;
    209 	void (*func)();
    210 	int use_port = 0;
    211 	int reqp_flags = 0;
    212 
    213 	p = bp->b_proc;
    214 	as = p->p_as;
    215 	reqp = (aio_req_t *)bp->b_forw;
    216 	fd = reqp->aio_req_fd;
    217 
    218 	TNF_PROBE_5(aphysio_end, "kaio", /* CSTYLED */,
    219 		tnf_opaque, bp, bp,
    220 		tnf_device, device, bp->b_edev,
    221 		tnf_offset, blkno, btodt(reqp->aio_req_uio.uio_loffset),
    222 		tnf_size, size, reqp->aio_req_uio.uio_iov->iov_len,
    223 		tnf_bioflags, rw, (bp->b_flags & (B_READ|B_WRITE)));
    224 
    225 	/*
    226 	 * mapout earlier so that more kmem is available when aio is
    227 	 * heavily used. bug #1262082
    228 	 */
    229 	if (bp->b_flags & B_REMAPPED)
    230 		bp_mapout(bp);
    231 
    232 	/* decrement fd's ref count by one, now that aio request is done. */
    233 	areleasef(fd, P_FINFO(p));
    234 
    235 	aiop = p->p_aio;
    236 	ASSERT(aiop != NULL);
    237 
    238 	mutex_enter(&aiop->aio_portq_mutex);
    239 	mutex_enter(&aiop->aio_mutex);
    240 	ASSERT(aiop->aio_pending > 0);
    241 	ASSERT(reqp->aio_req_flags & AIO_PENDING);
    242 	aiop->aio_pending--;
    243 	reqp->aio_req_flags &= ~AIO_PENDING;
    244 	reqp_flags = reqp->aio_req_flags;
    245 	if ((pkevp = reqp->aio_req_portkev) != NULL) {
    246 		/* Event port notification is desired for this transaction */
    247 		if (reqp->aio_req_flags & AIO_CLOSE_PORT) {
    248 			/*
    249 			 * The port is being closed and it is waiting for
    250 			 * pending asynchronous I/O transactions to complete.
    251 			 */
    252 			portevpend = --aiop->aio_portpendcnt;
    253 			aio_deq(&aiop->aio_portpending, reqp);
    254 			aio_enq(&aiop->aio_portq, reqp, 0);
    255 			mutex_exit(&aiop->aio_mutex);
    256 			mutex_exit(&aiop->aio_portq_mutex);
    257 			port_send_event(pkevp);
    258 			if (portevpend == 0)
    259 				cv_broadcast(&aiop->aio_portcv);
    260 			return;
    261 		}
    262 
    263 		if (aiop->aio_flags & AIO_CLEANUP) {
    264 			/*
    265 			 * aio_cleanup_thread() is waiting for completion of
    266 			 * transactions.
    267 			 */
    268 			mutex_enter(&as->a_contents);
    269 			aio_deq(&aiop->aio_portpending, reqp);
    270 			aio_enq(&aiop->aio_portcleanupq, reqp, 0);
    271 			cv_signal(&aiop->aio_cleanupcv);
    272 			mutex_exit(&as->a_contents);
    273 			mutex_exit(&aiop->aio_mutex);
    274 			mutex_exit(&aiop->aio_portq_mutex);
    275 			return;
    276 		}
    277 
    278 		aio_deq(&aiop->aio_portpending, reqp);
    279 		aio_enq(&aiop->aio_portq, reqp, 0);
    280 
    281 		use_port = 1;
    282 	} else {
    283 		/*
    284 		 * when the AIO_CLEANUP flag is enabled for this
    285 		 * process, or when the AIO_POLL bit is set for
    286 		 * this request, special handling is required.
    287 		 * otherwise the request is put onto the doneq.
    288 		 */
    289 		cleanupqflag = (aiop->aio_flags & AIO_CLEANUP);
    290 		pollqflag = (reqp->aio_req_flags & AIO_POLL);
    291 		if (cleanupqflag | pollqflag) {
    292 
    293 			if (cleanupqflag)
    294 				mutex_enter(&as->a_contents);
    295 
    296 			/*
    297 			 * requests with their AIO_POLL bit set are put
    298 			 * on the pollq, requests with sigevent structures
    299 			 * or with listio heads are put on the notifyq, and
    300 			 * the remaining requests don't require any special
    301 			 * cleanup handling, so they're put onto the default
    302 			 * cleanupq.
    303 			 */
    304 			if (pollqflag)
    305 				aio_enq(&aiop->aio_pollq, reqp, AIO_POLLQ);
    306 			else if (reqp->aio_req_sigqp || reqp->aio_req_lio)
    307 				aio_enq(&aiop->aio_notifyq, reqp, AIO_NOTIFYQ);
    308 			else
    309 				aio_enq(&aiop->aio_cleanupq, reqp,
    310 				    AIO_CLEANUPQ);
    311 
    312 			if (cleanupqflag) {
    313 				cv_signal(&aiop->aio_cleanupcv);
    314 				mutex_exit(&as->a_contents);
    315 				mutex_exit(&aiop->aio_mutex);
    316 				mutex_exit(&aiop->aio_portq_mutex);
    317 			} else {
    318 				ASSERT(pollqflag);
    319 				/* block aio_cleanup_exit until we're done */
    320 				aiop->aio_flags |= AIO_DONE_ACTIVE;
    321 				mutex_exit(&aiop->aio_mutex);
    322 				mutex_exit(&aiop->aio_portq_mutex);
    323 				/*
    324 				 * let the cleanup processing happen from an AST
    325 				 * set an AST on all threads in this process
    326 				 */
    327 				mutex_enter(&p->p_lock);
    328 				set_proc_ast(p);
    329 				mutex_exit(&p->p_lock);
    330 				mutex_enter(&aiop->aio_mutex);
    331 				/* wakeup anybody waiting in aiowait() */
    332 				cv_broadcast(&aiop->aio_waitcv);
    333 
    334 				/* wakeup aio_cleanup_exit if needed */
    335 				if (aiop->aio_flags & AIO_CLEANUP)
    336 					cv_signal(&aiop->aio_cleanupcv);
    337 				aiop->aio_flags &= ~AIO_DONE_ACTIVE;
    338 				mutex_exit(&aiop->aio_mutex);
    339 			}
    340 			return;
    341 		}
    342 
    343 		/*
    344 		 * save req's sigevent pointer, and check its
    345 		 * value after releasing aio_mutex lock.
    346 		 */
    347 		sigev = reqp->aio_req_sigqp;
    348 		reqp->aio_req_sigqp = NULL;
    349 
    350 		/* put request on done queue. */
    351 		aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
    352 	} /* portkevent */
    353 
    354 	/*
    355 	 * when list IO notification is enabled, a notification or
    356 	 * signal is sent only when all entries in the list are done.
    357 	 */
    358 	if ((head = reqp->aio_req_lio) != NULL) {
    359 		ASSERT(head->lio_refcnt > 0);
    360 		if (--head->lio_refcnt == 0) {
    361 			/*
    362 			 * save lio's sigevent pointer, and check
    363 			 * its value after releasing aio_mutex lock.
    364 			 */
    365 			lio_sigev = head->lio_sigqp;
    366 			head->lio_sigqp = NULL;
    367 			cv_signal(&head->lio_notify);
    368 			if (head->lio_port >= 0 &&
    369 			    (lio_pkevp = head->lio_portkev) != NULL)
    370 				head->lio_port = -1;
    371 		}
    372 	}
    373 
    374 	/*
    375 	 * if AIO_WAITN set then
    376 	 * send signal only when we reached the
    377 	 * required amount of IO's finished
    378 	 * or when all IO's are done
    379 	 */
    380 	if (aiop->aio_flags & AIO_WAITN) {
    381 		if (aiop->aio_waitncnt > 0)
    382 			aiop->aio_waitncnt--;
    383 		if (aiop->aio_pending == 0 ||
    384 		    aiop->aio_waitncnt == 0)
    385 			cv_broadcast(&aiop->aio_waitcv);
    386 	} else {
    387 		cv_broadcast(&aiop->aio_waitcv);
    388 	}
    389 
    390 	mutex_exit(&aiop->aio_mutex);
    391 	mutex_exit(&aiop->aio_portq_mutex);
    392 
    393 	/*
    394 	 * Could the cleanup thread be waiting for AIO with locked
    395 	 * resources to finish?
    396 	 * Ideally in that case cleanup thread should block on cleanupcv,
    397 	 * but there is a window, where it could miss to see a new aio
    398 	 * request that sneaked in.
    399 	 */
    400 	mutex_enter(&as->a_contents);
    401 	if ((reqp_flags & AIO_PAGELOCKDONE) && AS_ISUNMAPWAIT(as))
    402 		cv_broadcast(&as->a_cv);
    403 	mutex_exit(&as->a_contents);
    404 
    405 	if (sigev)
    406 		aio_sigev_send(p, sigev);
    407 	else if (!use_port && head == NULL) {
    408 		/*
    409 		 * Send a SIGIO signal when the process has a handler enabled.
    410 		 */
    411 		if ((func = PTOU(p)->u_signal[SIGIO - 1]) != SIG_DFL &&
    412 		    func != SIG_IGN)
    413 			psignal(p, SIGIO);
    414 	}
    415 	if (pkevp)
    416 		port_send_event(pkevp);
    417 	if (lio_sigev)
    418 		aio_sigev_send(p, lio_sigev);
    419 	if (lio_pkevp)
    420 		port_send_event(lio_pkevp);
    421 }
    422 
    423 /*
    424  * send a queued signal to the specified process when
    425  * the event signal is non-NULL. A return value of 1
    426  * will indicate that a signal is queued, and 0 means that
    427  * no signal was specified, nor sent.
    428  */
    429 static void
    430 aio_sigev_send(proc_t *p, sigqueue_t *sigev)
    431 {
    432 	ASSERT(sigev != NULL);
    433 
    434 	mutex_enter(&p->p_lock);
    435 	sigaddqa(p, NULL, sigev);
    436 	mutex_exit(&p->p_lock);
    437 }
    438 
    439 /*
    440  * special case handling for zero length requests. the aio request
    441  * short circuits the normal completion path since all that's required
    442  * to complete this request is to copyout a zero to the aio request's
    443  * return value.
    444  */
    445 void
    446 aio_zerolen(aio_req_t *reqp)
    447 {
    448 
    449 	struct buf *bp = &reqp->aio_req_buf;
    450 
    451 	reqp->aio_req_flags |= AIO_ZEROLEN;
    452 
    453 	bp->b_forw = (struct buf *)reqp;
    454 	bp->b_proc = curproc;
    455 
    456 	bp->b_resid = 0;
    457 	bp->b_flags = 0;
    458 
    459 	aio_done(bp);
    460 }
    461 
    462 /*
    463  * unlock pages previously locked by as_pagelock
    464  */
    465 void
    466 aphysio_unlock(aio_req_t *reqp)
    467 {
    468 	struct buf *bp;
    469 	struct iovec *iov;
    470 	int flags;
    471 
    472 	if (reqp->aio_req_flags & AIO_PHYSIODONE)
    473 		return;
    474 
    475 	reqp->aio_req_flags |= AIO_PHYSIODONE;
    476 
    477 	if (reqp->aio_req_flags & AIO_ZEROLEN)
    478 		return;
    479 
    480 	bp = &reqp->aio_req_buf;
    481 	iov = reqp->aio_req_uio.uio_iov;
    482 	flags = (((bp->b_flags & B_READ) == B_READ) ? S_WRITE : S_READ);
    483 	if (reqp->aio_req_flags & AIO_PAGELOCKDONE) {
    484 		as_pageunlock(bp->b_proc->p_as,
    485 			bp->b_flags & B_SHADOW ? bp->b_shadow : NULL,
    486 			iov->iov_base, iov->iov_len, flags);
    487 		reqp->aio_req_flags &= ~AIO_PAGELOCKDONE;
    488 	}
    489 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
    490 	bp->b_flags |= B_DONE;
    491 }
    492 
    493 /*
    494  * deletes a requests id from the hash table of outstanding io.
    495  */
    496 static void
    497 aio_hash_delete(aio_t *aiop, struct aio_req_t *reqp)
    498 {
    499 	long index;
    500 	aio_result_t *resultp = reqp->aio_req_resultp;
    501 	aio_req_t *current;
    502 	aio_req_t **nextp;
    503 
    504 	index = AIO_HASH(resultp);
    505 	nextp = (aiop->aio_hash + index);
    506 	while ((current = *nextp) != NULL) {
    507 		if (current->aio_req_resultp == resultp) {
    508 			*nextp = current->aio_hash_next;
    509 			return;
    510 		}
    511 		nextp = &current->aio_hash_next;
    512 	}
    513 }
    514 
    515 /*
    516  * Put a list head struct onto its free list.
    517  */
    518 static void
    519 aio_lio_free(aio_t *aiop, aio_lio_t *head)
    520 {
    521 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
    522 
    523 	if (head->lio_sigqp != NULL)
    524 		kmem_free(head->lio_sigqp, sizeof (sigqueue_t));
    525 	head->lio_next = aiop->aio_lio_free;
    526 	aiop->aio_lio_free = head;
    527 }
    528 
    529 /*
    530  * Put a reqp onto the freelist.
    531  */
    532 void
    533 aio_req_free(aio_t *aiop, aio_req_t *reqp)
    534 {
    535 	aio_lio_t *liop;
    536 
    537 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
    538 
    539 	if (reqp->aio_req_portkev) {
    540 		port_free_event(reqp->aio_req_portkev);
    541 		reqp->aio_req_portkev = NULL;
    542 	}
    543 
    544 	if ((liop = reqp->aio_req_lio) != NULL) {
    545 		if (--liop->lio_nent == 0)
    546 			aio_lio_free(aiop, liop);
    547 		reqp->aio_req_lio = NULL;
    548 	}
    549 	if (reqp->aio_req_sigqp != NULL) {
    550 		kmem_free(reqp->aio_req_sigqp, sizeof (sigqueue_t));
    551 		reqp->aio_req_sigqp = NULL;
    552 	}
    553 	reqp->aio_req_next = aiop->aio_free;
    554 	reqp->aio_req_prev = NULL;
    555 	aiop->aio_free = reqp;
    556 	aiop->aio_outstanding--;
    557 	if (aiop->aio_outstanding == 0)
    558 		cv_broadcast(&aiop->aio_waitcv);
    559 	aio_hash_delete(aiop, reqp);
    560 }
    561 
    562 /*
    563  * Put a reqp onto the freelist.
    564  */
    565 void
    566 aio_req_free_port(aio_t *aiop, aio_req_t *reqp)
    567 {
    568 	ASSERT(MUTEX_HELD(&aiop->aio_mutex));
    569 
    570 	reqp->aio_req_next = aiop->aio_free;
    571 	reqp->aio_req_prev = NULL;
    572 	aiop->aio_free = reqp;
    573 	aiop->aio_outstanding--;
    574 	aio_hash_delete(aiop, reqp);
    575 }
    576 
    577 
    578 /*
    579  * Verify the integrity of a queue.
    580  */
    581 #if defined(DEBUG)
    582 static void
    583 aio_verify_queue(aio_req_t *head,
    584 	aio_req_t *entry_present, aio_req_t *entry_missing)
    585 {
    586 	aio_req_t *reqp;
    587 	int found = 0;
    588 	int present = 0;
    589 
    590 	if ((reqp = head) != NULL) {
    591 		do {
    592 			ASSERT(reqp->aio_req_prev->aio_req_next == reqp);
    593 			ASSERT(reqp->aio_req_next->aio_req_prev == reqp);
    594 			if (entry_present == reqp)
    595 				found++;
    596 			if (entry_missing == reqp)
    597 				present++;
    598 		} while ((reqp = reqp->aio_req_next) != head);
    599 	}
    600 	ASSERT(entry_present == NULL || found == 1);
    601 	ASSERT(entry_missing == NULL || present == 0);
    602 }
    603 #else
    604 #define	aio_verify_queue(x, y, z)
    605 #endif
    606 
    607 /*
    608  * Put a request onto the tail of a queue.
    609  */
    610 void
    611 aio_enq(aio_req_t **qhead, aio_req_t *reqp, int qflg_new)
    612 {
    613 	aio_req_t *head;
    614 	aio_req_t *prev;
    615 
    616 	aio_verify_queue(*qhead, NULL, reqp);
    617 
    618 	if ((head = *qhead) == NULL) {
    619 		reqp->aio_req_next = reqp;
    620 		reqp->aio_req_prev = reqp;
    621 		*qhead = reqp;
    622 	} else {
    623 		reqp->aio_req_next = head;
    624 		reqp->aio_req_prev = prev = head->aio_req_prev;
    625 		prev->aio_req_next = reqp;
    626 		head->aio_req_prev = reqp;
    627 	}
    628 	reqp->aio_req_flags |= qflg_new;
    629 }
    630 
    631 /*
    632  * Remove a request from its queue.
    633  */
    634 void
    635 aio_deq(aio_req_t **qhead, aio_req_t *reqp)
    636 {
    637 	aio_verify_queue(*qhead, reqp, NULL);
    638 
    639 	if (reqp->aio_req_next == reqp) {
    640 		*qhead = NULL;
    641 	} else {
    642 		reqp->aio_req_prev->aio_req_next = reqp->aio_req_next;
    643 		reqp->aio_req_next->aio_req_prev = reqp->aio_req_prev;
    644 		if (*qhead == reqp)
    645 			*qhead = reqp->aio_req_next;
    646 	}
    647 	reqp->aio_req_next = NULL;
    648 	reqp->aio_req_prev = NULL;
    649 }
    650 
    651 /*
    652  * concatenate a specified queue with the cleanupq. the specified
    653  * queue is put onto the tail of the cleanupq. all elements on the
    654  * specified queue should have their aio_req_flags field cleared.
    655  */
    656 /*ARGSUSED*/
    657 void
    658 aio_cleanupq_concat(aio_t *aiop, aio_req_t *q2, int qflg)
    659 {
    660 	aio_req_t *cleanupqhead, *q2tail;
    661 	aio_req_t *reqp = q2;
    662 
    663 	do {
    664 		ASSERT(reqp->aio_req_flags & qflg);
    665 		reqp->aio_req_flags &= ~qflg;
    666 		reqp->aio_req_flags |= AIO_CLEANUPQ;
    667 	} while ((reqp = reqp->aio_req_next) != q2);
    668 
    669 	cleanupqhead = aiop->aio_cleanupq;
    670 	if (cleanupqhead == NULL)
    671 		aiop->aio_cleanupq = q2;
    672 	else {
    673 		cleanupqhead->aio_req_prev->aio_req_next = q2;
    674 		q2tail = q2->aio_req_prev;
    675 		q2tail->aio_req_next = cleanupqhead;
    676 		q2->aio_req_prev = cleanupqhead->aio_req_prev;
    677 		cleanupqhead->aio_req_prev = q2tail;
    678 	}
    679 }
    680 
    681 /*
    682  * cleanup aio requests that are on the per-process poll queue.
    683  */
    684 void
    685 aio_cleanup(int flag)
    686 {
    687 	aio_t *aiop = curproc->p_aio;
    688 	aio_req_t *pollqhead, *cleanupqhead, *notifyqhead;
    689 	aio_req_t *cleanupport;
    690 	aio_req_t *portq = NULL;
    691 	void (*func)();
    692 	int signalled = 0;
    693 	int qflag = 0;
    694 	int exitflg;
    695 
    696 	ASSERT(aiop != NULL);
    697 
    698 	if (flag == AIO_CLEANUP_EXIT)
    699 		exitflg = AIO_CLEANUP_EXIT;
    700 	else
    701 		exitflg = 0;
    702 
    703 	/*
    704 	 * We need to get the aio_cleanupq_mutex because we are calling
    705 	 * aio_cleanup_cleanupq()
    706 	 */
    707 	mutex_enter(&aiop->aio_cleanupq_mutex);
    708 	/*
    709 	 * take all the requests off the cleanupq, the notifyq,
    710 	 * and the pollq.
    711 	 */
    712 	mutex_enter(&aiop->aio_mutex);
    713 	if ((cleanupqhead = aiop->aio_cleanupq) != NULL) {
    714 		aiop->aio_cleanupq = NULL;
    715 		qflag++;
    716 	}
    717 	if ((notifyqhead = aiop->aio_notifyq) != NULL) {
    718 		aiop->aio_notifyq = NULL;
    719 		qflag++;
    720 	}
    721 	if ((pollqhead = aiop->aio_pollq) != NULL) {
    722 		aiop->aio_pollq = NULL;
    723 		qflag++;
    724 	}
    725 	if (flag) {
    726 		if ((portq = aiop->aio_portq) != NULL)
    727 			qflag++;
    728 
    729 		if ((cleanupport = aiop->aio_portcleanupq) != NULL) {
    730 			aiop->aio_portcleanupq = NULL;
    731 			qflag++;
    732 		}
    733 	}
    734 	mutex_exit(&aiop->aio_mutex);
    735 
    736 	/*
    737 	 * return immediately if cleanupq, pollq, and
    738 	 * notifyq are all empty. someone else must have
    739 	 * emptied them.
    740 	 */
    741 	if (!qflag) {
    742 		mutex_exit(&aiop->aio_cleanupq_mutex);
    743 		return;
    744 	}
    745 
    746 	/*
    747 	 * do cleanup for the various queues.
    748 	 */
    749 	if (cleanupqhead)
    750 		aio_cleanup_cleanupq(aiop, cleanupqhead, exitflg);
    751 	mutex_exit(&aiop->aio_cleanupq_mutex);
    752 	if (notifyqhead)
    753 		signalled = aio_cleanup_notifyq(aiop, notifyqhead, exitflg);
    754 	if (pollqhead)
    755 		aio_cleanup_pollq(aiop, pollqhead, exitflg);
    756 	if (flag && (cleanupport || portq))
    757 		aio_cleanup_portq(aiop, cleanupport, exitflg);
    758 
    759 	if (exitflg)
    760 		return;
    761 
    762 	/*
    763 	 * If we have an active aio_cleanup_thread it's possible for
    764 	 * this routine to push something on to the done queue after
    765 	 * an aiowait/aiosuspend thread has already decided to block.
    766 	 * This being the case, we need a cv_broadcast here to wake
    767 	 * these threads up. It is simpler and cleaner to do this
    768 	 * broadcast here than in the individual cleanup routines.
    769 	 */
    770 
    771 	mutex_enter(&aiop->aio_mutex);
    772 	cv_broadcast(&aiop->aio_waitcv);
    773 	mutex_exit(&aiop->aio_mutex);
    774 
    775 	/*
    776 	 * Only if the process wasn't already signalled,
    777 	 * determine if a SIGIO signal should be delievered.
    778 	 */
    779 	if (!signalled &&
    780 	    (func = PTOU(curproc)->u_signal[SIGIO - 1]) != SIG_DFL &&
    781 	    func != SIG_IGN)
    782 		psignal(curproc, SIGIO);
    783 }
    784 
    785 
    786 /*
    787  * Do cleanup for every element of the port cleanup queue.
    788  */
    789 static void
    790 aio_cleanup_portq(aio_t *aiop, aio_req_t *cleanupq, int exitflag)
    791 {
    792 	aio_req_t	*reqp;
    793 	aio_req_t	*next;
    794 	aio_req_t	*headp;
    795 	aio_lio_t	*liop;
    796 
    797 	/* first check the portq */
    798 	if (exitflag || ((aiop->aio_flags & AIO_CLEANUP_PORT) == 0)) {
    799 		mutex_enter(&aiop->aio_mutex);
    800 		if (aiop->aio_flags & AIO_CLEANUP)
    801 			aiop->aio_flags |= AIO_CLEANUP_PORT;
    802 		mutex_exit(&aiop->aio_mutex);
    803 
    804 		/*
    805 		 * It is not allowed to hold locks during aphysio_unlock().
    806 		 * The aio_done() interrupt function will try to acquire
    807 		 * aio_mutex and aio_portq_mutex.  Therefore we disconnect
    808 		 * the portq list from the aiop for the duration of the
    809 		 * aphysio_unlock() loop below.
    810 		 */
    811 		mutex_enter(&aiop->aio_portq_mutex);
    812 		headp = aiop->aio_portq;
    813 		aiop->aio_portq = NULL;
    814 		mutex_exit(&aiop->aio_portq_mutex);
    815 		if ((reqp = headp) != NULL) {
    816 			do {
    817 				next = reqp->aio_req_next;
    818 				aphysio_unlock(reqp);
    819 				if (exitflag) {
    820 					mutex_enter(&aiop->aio_mutex);
    821 					aio_req_free(aiop, reqp);
    822 					mutex_exit(&aiop->aio_mutex);
    823 				}
    824 			} while ((reqp = next) != headp);
    825 		}
    826 
    827 		if (headp != NULL && exitflag == 0) {
    828 			/* move unlocked requests back to the port queue */
    829 			aio_req_t *newq;
    830 
    831 			mutex_enter(&aiop->aio_portq_mutex);
    832 			if ((newq = aiop->aio_portq) != NULL) {
    833 				aio_req_t *headprev = headp->aio_req_prev;
    834 				aio_req_t *newqprev = newq->aio_req_prev;
    835 
    836 				headp->aio_req_prev = newqprev;
    837 				newq->aio_req_prev = headprev;
    838 				headprev->aio_req_next = newq;
    839 				newqprev->aio_req_next = headp;
    840 			}
    841 			aiop->aio_portq = headp;
    842 			cv_broadcast(&aiop->aio_portcv);
    843 			mutex_exit(&aiop->aio_portq_mutex);
    844 		}
    845 	}
    846 
    847 	/* now check the port cleanup queue */
    848 	if ((reqp = cleanupq) == NULL)
    849 		return;
    850 	do {
    851 		next = reqp->aio_req_next;
    852 		aphysio_unlock(reqp);
    853 		if (exitflag) {
    854 			mutex_enter(&aiop->aio_mutex);
    855 			aio_req_free(aiop, reqp);
    856 			mutex_exit(&aiop->aio_mutex);
    857 		} else {
    858 			mutex_enter(&aiop->aio_portq_mutex);
    859 			aio_enq(&aiop->aio_portq, reqp, 0);
    860 			mutex_exit(&aiop->aio_portq_mutex);
    861 			port_send_event(reqp->aio_req_portkev);
    862 			if ((liop = reqp->aio_req_lio) != NULL) {
    863 				int send_event = 0;
    864 
    865 				mutex_enter(&aiop->aio_mutex);
    866 				ASSERT(liop->lio_refcnt > 0);
    867 				if (--liop->lio_refcnt == 0) {
    868 					if (liop->lio_port >= 0 &&
    869 					    liop->lio_portkev) {
    870 						liop->lio_port = -1;
    871 						send_event = 1;
    872 					}
    873 				}
    874 				mutex_exit(&aiop->aio_mutex);
    875 				if (send_event)
    876 					port_send_event(liop->lio_portkev);
    877 			}
    878 		}
    879 	} while ((reqp = next) != cleanupq);
    880 }
    881 
    882 /*
    883  * Do cleanup for every element of the cleanupq.
    884  */
    885 static void
    886 aio_cleanup_cleanupq(aio_t *aiop, aio_req_t *qhead, int exitflg)
    887 {
    888 	aio_req_t *reqp, *next;
    889 
    890 	ASSERT(MUTEX_HELD(&aiop->aio_cleanupq_mutex));
    891 
    892 	/*
    893 	 * Since aio_req_done() or aio_req_find() use the HASH list to find
    894 	 * the required requests, they could potentially take away elements
    895 	 * if they are already done (AIO_DONEQ is set).
    896 	 * The aio_cleanupq_mutex protects the queue for the duration of the
    897 	 * loop from aio_req_done() and aio_req_find().
    898 	 */
    899 	if ((reqp = qhead) == NULL)
    900 		return;
    901 	do {
    902 		ASSERT(reqp->aio_req_flags & AIO_CLEANUPQ);
    903 		ASSERT(reqp->aio_req_portkev == NULL);
    904 		next = reqp->aio_req_next;
    905 		aphysio_unlock(reqp);
    906 		mutex_enter(&aiop->aio_mutex);
    907 		if (exitflg)
    908 			aio_req_free(aiop, reqp);
    909 		else
    910 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
    911 		mutex_exit(&aiop->aio_mutex);
    912 	} while ((reqp = next) != qhead);
    913 }
    914 
    915 /*
    916  * do cleanup for every element of the notify queue.
    917  */
    918 static int
    919 aio_cleanup_notifyq(aio_t *aiop, aio_req_t *qhead, int exitflg)
    920 {
    921 	aio_req_t *reqp, *next;
    922 	aio_lio_t *liohead;
    923 	sigqueue_t *sigev, *lio_sigev = NULL;
    924 	int signalled = 0;
    925 
    926 	if ((reqp = qhead) == NULL)
    927 		return (0);
    928 	do {
    929 		ASSERT(reqp->aio_req_flags & AIO_NOTIFYQ);
    930 		next = reqp->aio_req_next;
    931 		aphysio_unlock(reqp);
    932 		if (exitflg) {
    933 			mutex_enter(&aiop->aio_mutex);
    934 			aio_req_free(aiop, reqp);
    935 			mutex_exit(&aiop->aio_mutex);
    936 		} else {
    937 			mutex_enter(&aiop->aio_mutex);
    938 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
    939 			sigev = reqp->aio_req_sigqp;
    940 			reqp->aio_req_sigqp = NULL;
    941 			if ((liohead = reqp->aio_req_lio) != NULL) {
    942 				ASSERT(liohead->lio_refcnt > 0);
    943 				if (--liohead->lio_refcnt == 0) {
    944 					cv_signal(&liohead->lio_notify);
    945 					lio_sigev = liohead->lio_sigqp;
    946 					liohead->lio_sigqp = NULL;
    947 				}
    948 			}
    949 			mutex_exit(&aiop->aio_mutex);
    950 			if (sigev) {
    951 				signalled++;
    952 				aio_sigev_send(reqp->aio_req_buf.b_proc,
    953 				    sigev);
    954 			}
    955 			if (lio_sigev) {
    956 				signalled++;
    957 				aio_sigev_send(reqp->aio_req_buf.b_proc,
    958 				    lio_sigev);
    959 			}
    960 		}
    961 	} while ((reqp = next) != qhead);
    962 
    963 	return (signalled);
    964 }
    965 
    966 /*
    967  * Do cleanup for every element of the poll queue.
    968  */
    969 static void
    970 aio_cleanup_pollq(aio_t *aiop, aio_req_t *qhead, int exitflg)
    971 {
    972 	aio_req_t *reqp, *next;
    973 
    974 	/*
    975 	 * As no other threads should be accessing the queue at this point,
    976 	 * it isn't necessary to hold aio_mutex while we traverse its elements.
    977 	 */
    978 	if ((reqp = qhead) == NULL)
    979 		return;
    980 	do {
    981 		ASSERT(reqp->aio_req_flags & AIO_POLLQ);
    982 		next = reqp->aio_req_next;
    983 		aphysio_unlock(reqp);
    984 		if (exitflg) {
    985 			mutex_enter(&aiop->aio_mutex);
    986 			aio_req_free(aiop, reqp);
    987 			mutex_exit(&aiop->aio_mutex);
    988 		} else {
    989 			aio_copyout_result(reqp);
    990 			mutex_enter(&aiop->aio_mutex);
    991 			aio_enq(&aiop->aio_doneq, reqp, AIO_DONEQ);
    992 			mutex_exit(&aiop->aio_mutex);
    993 		}
    994 	} while ((reqp = next) != qhead);
    995 }
    996 
    997 /*
    998  * called by exit(). waits for all outstanding kaio to finish
    999  * before the kaio resources are freed.
   1000  */
   1001 void
   1002 aio_cleanup_exit(void)
   1003 {
   1004 	proc_t *p = curproc;
   1005 	aio_t *aiop = p->p_aio;
   1006 	aio_req_t *reqp, *next, *head;
   1007 	aio_lio_t *nxtlio, *liop;
   1008 
   1009 	/*
   1010 	 * wait for all outstanding kaio to complete. process
   1011 	 * is now single-threaded; no other kaio requests can
   1012 	 * happen once aio_pending is zero.
   1013 	 */
   1014 	mutex_enter(&aiop->aio_mutex);
   1015 	aiop->aio_flags |= AIO_CLEANUP;
   1016 	while ((aiop->aio_pending != 0) || (aiop->aio_flags & AIO_DONE_ACTIVE))
   1017 		cv_wait(&aiop->aio_cleanupcv, &aiop->aio_mutex);
   1018 	mutex_exit(&aiop->aio_mutex);
   1019 
   1020 	/* cleanup the cleanup-thread queues. */
   1021 	aio_cleanup(AIO_CLEANUP_EXIT);
   1022 
   1023 	/*
   1024 	 * Although this process is now single-threaded, we
   1025 	 * still need to protect ourselves against a race with
   1026 	 * aio_cleanup_dr_delete_memory().
   1027 	 */
   1028 	mutex_enter(&p->p_lock);
   1029 
   1030 	/*
   1031 	 * free up the done queue's resources.
   1032 	 */
   1033 	if ((head = aiop->aio_doneq) != NULL) {
   1034 		aiop->aio_doneq = NULL;
   1035 		reqp = head;
   1036 		do {
   1037 			next = reqp->aio_req_next;
   1038 			aphysio_unlock(reqp);
   1039 			kmem_free