Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #pragma ident	"@(#)vm_as.c	1.175	07/12/10 SMI"
     40 
     41 /*
     42  * VM - address spaces.
     43  */
     44 
     45 #include <sys/types.h>
     46 #include <sys/t_lock.h>
     47 #include <sys/param.h>
     48 #include <sys/errno.h>
     49 #include <sys/systm.h>
     50 #include <sys/mman.h>
     51 #include <sys/sysmacros.h>
     52 #include <sys/cpuvar.h>
     53 #include <sys/sysinfo.h>
     54 #include <sys/kmem.h>
     55 #include <sys/vnode.h>
     56 #include <sys/vmsystm.h>
     57 #include <sys/cmn_err.h>
     58 #include <sys/debug.h>
     59 #include <sys/tnf_probe.h>
     60 #include <sys/vtrace.h>
     61 
     62 #include <vm/hat.h>
     63 #include <vm/xhat.h>
     64 #include <vm/as.h>
     65 #include <vm/seg.h>
     66 #include <vm/seg_vn.h>
     67 #include <vm/seg_dev.h>
     68 #include <vm/seg_kmem.h>
     69 #include <vm/seg_map.h>
     70 #include <vm/seg_spt.h>
     71 #include <vm/page.h>
     72 
/*
 * Tunable retry delay, in clock ticks.
 * NOTE(review): judging by the name this is the back-off used when a
 * lock-ordering deadlock is avoided elsewhere in this file -- confirm
 * against the users of deadlk_wait.
 */
clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

/*
 * kmem cache backing every struct as; created in as_init(), allocated
 * from in as_alloc() and returned to in as_free().
 */
static struct kmem_cache *as_cache;

/*
 * Forward declarations (the definitions are not part of this section of
 * the file).
 */
static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
     80 
     81 
/*
 * Verifying the segment lists is very time-consuming; it may not be
 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define	VERIFY_SEGLIST
/*
 * Run-time switch for as_verify(): while this is zero as_verify()
 * returns immediately without walking the segment tree.
 */
int do_as_verify = 0;
#endif
     90 
     91 /*
     92  * Allocate a new callback data structure entry and fill in the events of
     93  * interest, the address range of interest, and the callback argument.
     94  * Link the entry on the as->a_callbacks list. A callback entry for the
     95  * entire address space may be specified with vaddr = 0 and size = -1.
     96  *
     97  * CALLERS RESPONSIBILITY: If not calling from within the process context for
     98  * the specified as, the caller must guarantee persistence of the specified as
     99  * for the duration of this function (eg. pages being locked within the as
    100  * will guarantee persistence).
    101  */
    102 int
    103 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    104 		caddr_t vaddr, size_t size, int sleepflag)
    105 {
    106 	struct as_callback 	*current_head, *cb;
    107 	caddr_t 		saddr;
    108 	size_t 			rsize;
    109 
    110 	/* callback function and an event are mandatory */
    111 	if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
    112 		return (EINVAL);
    113 
    114 	/* Adding a callback after as_free has been called is not allowed */
    115 	if (as == &kas)
    116 		return (ENOMEM);
    117 
    118 	/*
    119 	 * vaddr = 0 and size = -1 is used to indicate that the callback range
    120 	 * is the entire address space so no rounding is done in that case.
    121 	 */
    122 	if (size != -1) {
    123 		saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
    124 		rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
    125 		    (size_t)saddr;
    126 		/* check for wraparound */
    127 		if (saddr + rsize < saddr)
    128 			return (ENOMEM);
    129 	} else {
    130 		if (vaddr != 0)
    131 			return (EINVAL);
    132 		saddr = vaddr;
    133 		rsize = size;
    134 	}
    135 
    136 	/* Allocate and initialize a callback entry */
    137 	cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
    138 	if (cb == NULL)
    139 		return (EAGAIN);
    140 
    141 	cb->ascb_func = cb_func;
    142 	cb->ascb_arg = arg;
    143 	cb->ascb_events = events;
    144 	cb->ascb_saddr = saddr;
    145 	cb->ascb_len = rsize;
    146 
    147 	/* Add the entry to the list */
    148 	mutex_enter(&as->a_contents);
    149 	current_head = as->a_callbacks;
    150 	as->a_callbacks = cb;
    151 	cb->ascb_next = current_head;
    152 
    153 	/*
    154 	 * The call to this function may lose in a race with
    155 	 * a pertinent event - eg. a thread does long term memory locking
    156 	 * but before the callback is added another thread executes as_unmap.
    157 	 * A broadcast here resolves that.
    158 	 */
    159 	if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
    160 		AS_CLRUNMAPWAIT(as);
    161 		cv_broadcast(&as->a_cv);
    162 	}
    163 
    164 	mutex_exit(&as->a_contents);
    165 	return (0);
    166 }
    167 
    168 /*
    169  * Search the callback list for an entry which pertains to arg.
    170  *
    171  * This is called from within the client upon completion of the callback.
    172  * RETURN VALUES:
    173  *	AS_CALLBACK_DELETED  (callback entry found and deleted)
    174  *	AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
    175  *	AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
    176  *			entry will be made in as_do_callbacks)
    177  *
    178  * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
    179  * set, it indicates that as_do_callbacks is processing this entry.  The
    180  * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
    181  * to unblock as_do_callbacks, in case it is blocked.
    182  *
    183  * CALLERS RESPONSIBILITY: If not calling from within the process context for
    184  * the specified as, the caller must guarantee persistence of the specified as
    185  * for the duration of this function (eg. pages being locked within the as
    186  * will guarantee persistence).
    187  */
    188 uint_t
    189 as_delete_callback(struct as *as, void *arg)
    190 {
    191 	struct as_callback **prevcb = &as->a_callbacks;
    192 	struct as_callback *cb;
    193 	uint_t rc = AS_CALLBACK_NOTFOUND;
    194 
    195 	mutex_enter(&as->a_contents);
    196 	for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
    197 		if (cb->ascb_arg != arg)
    198 			continue;
    199 
    200 		/*
    201 		 * If the events indicate AS_CALLBACK_CALLED, just clear
    202 		 * AS_ALL_EVENT in the events field and wakeup the thread
    203 		 * that may be waiting in as_do_callbacks.  as_do_callbacks
    204 		 * will take care of removing this entry from the list.  In
    205 		 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
    206 		 * (AS_CALLBACK_CALLED not set), just remove it from the
    207 		 * list, return the memory and return AS_CALLBACK_DELETED.
    208 		 */
    209 		if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
    210 			/* leave AS_CALLBACK_CALLED */
    211 			cb->ascb_events &= ~AS_ALL_EVENT;
    212 			rc = AS_CALLBACK_DELETE_DEFERRED;
    213 			cv_broadcast(&as->a_cv);
    214 		} else {
    215 			*prevcb = cb->ascb_next;
    216 			kmem_free(cb, sizeof (struct as_callback));
    217 			rc = AS_CALLBACK_DELETED;
    218 		}
    219 		break;
    220 	}
    221 	mutex_exit(&as->a_contents);
    222 	return (rc);
    223 }
    224 
    225 /*
    226  * Searches the as callback list for a matching entry.
    227  * Returns a pointer to the first matching callback, or NULL if
    228  * nothing is found.
    229  * This function never sleeps so it is ok to call it with more
    230  * locks held but the (required) a_contents mutex.
    231  *
    232  * See also comment on as_do_callbacks below.
    233  */
    234 static struct as_callback *
    235 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    236 			size_t event_len)
    237 {
    238 	struct as_callback	*cb;
    239 
    240 	ASSERT(MUTEX_HELD(&as->a_contents));
    241 	for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
    242 		/*
    243 		 * If the callback has not already been called, then
    244 		 * check if events or address range pertains.  An event_len
    245 		 * of zero means do an unconditional callback.
    246 		 */
    247 		if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
    248 		    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
    249 		    (event_addr + event_len < cb->ascb_saddr) ||
    250 		    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
    251 			continue;
    252 		}
    253 		break;
    254 	}
    255 	return (cb);
    256 }
    257 
    258 /*
    259  * Executes a given callback and removes it from the callback list for
    260  * this address space.
    261  * This function may sleep so the caller must drop all locks except
    262  * a_contents before calling this func.
    263  *
    264  * See also comments on as_do_callbacks below.
    265  */
    266 static void
    267 as_execute_callback(struct as *as, struct as_callback *cb,
    268 				uint_t events)
    269 {
    270 	struct as_callback **prevcb;
    271 	void	*cb_arg;
    272 
    273 	ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
    274 	cb->ascb_events |= AS_CALLBACK_CALLED;
    275 	mutex_exit(&as->a_contents);
    276 	(*cb->ascb_func)(as, cb->ascb_arg, events);
    277 	mutex_enter(&as->a_contents);
    278 	/*
    279 	 * the callback function is required to delete the callback
    280 	 * when the callback function determines it is OK for
    281 	 * this thread to continue. as_delete_callback will clear
    282 	 * the AS_ALL_EVENT in the events field when it is deleted.
    283 	 * If the callback function called as_delete_callback,
    284 	 * events will already be cleared and there will be no blocking.
    285 	 */
    286 	while ((cb->ascb_events & events) != 0) {
    287 		cv_wait(&as->a_cv, &as->a_contents);
    288 	}
    289 	/*
    290 	 * This entry needs to be taken off the list. Normally, the
    291 	 * callback func itself does that, but unfortunately the list
    292 	 * may have changed while the callback was running because the
    293 	 * a_contents mutex was dropped and someone else other than the
    294 	 * callback func itself could have called as_delete_callback,
    295 	 * so we have to search to find this entry again.  The entry
    296 	 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
    297 	 */
    298 	cb_arg = cb->ascb_arg;
    299 	prevcb = &as->a_callbacks;
    300 	for (cb = as->a_callbacks; cb != NULL;
    301 	    prevcb = &cb->ascb_next, cb = *prevcb) {
    302 		if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
    303 		    (cb_arg != cb->ascb_arg)) {
    304 			continue;
    305 		}
    306 		*prevcb = cb->ascb_next;
    307 		kmem_free(cb, sizeof (struct as_callback));
    308 		break;
    309 	}
    310 }
    311 
    312 /*
    313  * Check the callback list for a matching event and intersection of
    314  * address range. If there is a match invoke the callback.  Skip an entry if:
    315  *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
    316  *    - not event of interest
    317  *    - not address range of interest
    318  *
    319  * An event_len of zero indicates a request for an unconditional callback
    320  * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
    321  * a_contents lock must be dropped before a callback, so only one callback
    322  * can be done before returning. Return -1 (true) if a callback was
    323  * executed and removed from the list, else return 0 (false).
    324  *
    325  * The logically separate parts, i.e. finding a matching callback and
    326  * executing a given callback have been separated into two functions
    327  * so that they can be called with different sets of locks held beyond
    328  * the always-required a_contents. as_find_callback does not sleep so
    329  * it is ok to call it if more locks than a_contents (i.e. the a_lock
    330  * rwlock) are held. as_execute_callback on the other hand may sleep
    331  * so all locks beyond a_contents must be dropped by the caller if one
    332  * does not want to end comatose.
    333  */
    334 static int
    335 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    336 			size_t event_len)
    337 {
    338 	struct as_callback *cb;
    339 
    340 	if ((cb = as_find_callback(as, events, event_addr, event_len))) {
    341 		as_execute_callback(as, cb, events);
    342 		return (-1);
    343 	}
    344 	return (0);
    345 }
    346 
    347 /*
    348  * Search for the segment containing addr. If a segment containing addr
    349  * exists, that segment is returned.  If no such segment exists, and
    350  * the list spans addresses greater than addr, then the first segment
    351  * whose base is greater than addr is returned; otherwise, NULL is
    352  * returned unless tail is true, in which case the last element of the
    353  * list is returned.
    354  *
    355  * a_seglast is used to cache the last found segment for repeated
    356  * searches to the same addr (which happens frequently).
    357  */
    358 struct seg *
    359 as_findseg(struct as *as, caddr_t addr, int tail)
    360 {
    361 	struct seg *seg = as->a_seglast;
    362 	avl_index_t where;
    363 
    364 	ASSERT(AS_LOCK_HELD(as, &as->a_lock));
    365 
    366 	if (seg != NULL &&
    367 	    seg->s_base <= addr &&
    368 	    addr < seg->s_base + seg->s_size)
    369 		return (seg);
    370 
    371 	seg = avl_find(&as->a_segtree, &addr, &where);
    372 	if (seg != NULL)
    373 		return (as->a_seglast = seg);
    374 
    375 	seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
    376 	if (seg == NULL && tail)
    377 		seg = avl_last(&as->a_segtree);
    378 	return (as->a_seglast = seg);
    379 }
    380 
    381 #ifdef VERIFY_SEGLIST
    382 /*
    383  * verify that the linked list is coherent
    384  */
    385 static void
    386 as_verify(struct as *as)
    387 {
    388 	struct seg *seg, *seglast, *p, *n;
    389 	uint_t nsegs = 0;
    390 
    391 	if (do_as_verify == 0)
    392 		return;
    393 
    394 	seglast = as->a_seglast;
    395 
    396 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
    397 		ASSERT(seg->s_as == as);
    398 		p = AS_SEGPREV(as, seg);
    399 		n = AS_SEGNEXT(as, seg);
    400 		ASSERT(p == NULL || p->s_as == as);
    401 		ASSERT(p == NULL || p->s_base < seg->s_base);
    402 		ASSERT(n == NULL || n->s_base > seg->s_base);
    403 		ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
    404 		if (seg == seglast)
    405 			seglast = NULL;
    406 		nsegs++;
    407 	}
    408 	ASSERT(seglast == NULL);
    409 	ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
    410 }
    411 #endif /* VERIFY_SEGLIST */
    412 
    413 /*
    414  * Add a new segment to the address space. The avl_find()
    415  * may be expensive so we attempt to use last segment accessed
    416  * in as_gap() as an insertion point.
    417  */
    418 int
    419 as_addseg(struct as  *as, struct seg *newseg)
    420 {
    421 	struct seg *seg;
    422 	caddr_t addr;
    423 	caddr_t eaddr;
    424 	avl_index_t where;
    425 
    426 	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
    427 
    428 	as->a_updatedir = 1;	/* inform /proc */
    429 	gethrestime(&as->a_updatetime);
    430 
    431 	if (as->a_lastgaphl != NULL) {
    432 		struct seg *hseg = NULL;
    433 		struct seg *lseg = NULL;
    434 
    435 		if (as->a_lastgaphl->s_base > newseg->s_base) {
    436 			hseg = as->a_lastgaphl;
    437 			lseg = AVL_PREV(&as->a_segtree, hseg);
    438 		} else {
    439 			lseg = as->a_lastgaphl;
    440 			hseg = AVL_NEXT(&as->a_segtree, lseg);
    441 		}
    442 
    443 		if (hseg && lseg && lseg->s_base < newseg->s_base &&
    444 		    hseg->s_base > newseg->s_base) {
    445 			avl_insert_here(&as->a_segtree, newseg, lseg,
    446 			    AVL_AFTER);
    447 			as->a_lastgaphl = NULL;
    448 			as->a_seglast = newseg;
    449 			return (0);
    450 		}
    451 		as->a_lastgaphl = NULL;
    452 	}
    453 
    454 	addr = newseg->s_base;
    455 	eaddr = addr + newseg->s_size;
    456 again:
    457 
    458 	seg = avl_find(&as->a_segtree, &addr, &where);
    459 
    460 	if (seg == NULL)
    461 		seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
    462 
    463 	if (seg == NULL)
    464 		seg = avl_last(&as->a_segtree);
    465 
    466 	if (seg != NULL) {
    467 		caddr_t base = seg->s_base;
    468 
    469 		/*
    470 		 * If top of seg is below the requested address, then
    471 		 * the insertion point is at the end of the linked list,
    472 		 * and seg points to the tail of the list.  Otherwise,
    473 		 * the insertion point is immediately before seg.
    474 		 */
    475 		if (base + seg->s_size > addr) {
    476 			if (addr >= base || eaddr > base) {
    477 #ifdef __sparc
    478 				extern struct seg_ops segnf_ops;
    479 
    480 				/*
    481 				 * no-fault segs must disappear if overlaid.
    482 				 * XXX need new segment type so
    483 				 * we don't have to check s_ops
    484 				 */
    485 				if (seg->s_ops == &segnf_ops) {
    486 					seg_unmap(seg);
    487 					goto again;
    488 				}
    489 #endif
    490 				return (-1);	/* overlapping segment */
    491 			}
    492 		}
    493 	}
    494 	as->a_seglast = newseg;
    495 	avl_insert(&as->a_segtree, newseg, where);
    496 
    497 #ifdef VERIFY_SEGLIST
    498 	as_verify(as);
    499 #endif
    500 	return (0);
    501 }
    502 
    503 struct seg *
    504 as_removeseg(struct as *as, struct seg *seg)
    505 {
    506 	avl_tree_t *t;
    507 
    508 	ASSERT(AS_WRITE_HELD(as, &as->a_lock));
    509 
    510 	as->a_updatedir = 1;	/* inform /proc */
    511 	gethrestime(&as->a_updatetime);
    512 
    513 	if (seg == NULL)
    514 		return (NULL);
    515 
    516 	t = &as->a_segtree;
    517 	if (as->a_seglast == seg)
    518 		as->a_seglast = NULL;
    519 	as->a_lastgaphl = NULL;
    520 
    521 	/*
    522 	 * if this segment is at an address higher than
    523 	 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
    524 	 */
    525 	if (as->a_lastgap &&
    526 	    (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
    527 		as->a_lastgap = AVL_NEXT(t, seg);
    528 
    529 	/*
    530 	 * remove the segment from the seg tree
    531 	 */
    532 	avl_remove(t, seg);
    533 
    534 #ifdef VERIFY_SEGLIST
    535 	as_verify(as);
    536 #endif
    537 	return (seg);
    538 }
    539 
    540 /*
    541  * Find a segment containing addr.
    542  */
    543 struct seg *
    544 as_segat(struct as *as, caddr_t addr)
    545 {
    546 	struct seg *seg = as->a_seglast;
    547 
    548 	ASSERT(AS_LOCK_HELD(as, &as->a_lock));
    549 
    550 	if (seg != NULL && seg->s_base <= addr &&
    551 	    addr < seg->s_base + seg->s_size)
    552 		return (seg);
    553 
    554 	seg = avl_find(&as->a_segtree, &addr, NULL);
    555 	return (seg);
    556 }
    557 
    558 /*
    559  * Serialize all searches for holes in an address space to
    560  * prevent two or more threads from allocating the same virtual
    561  * address range.  The address space must not be "read/write"
    562  * locked by the caller since we may block.
    563  */
    564 void
    565 as_rangelock(struct as *as)
    566 {
    567 	mutex_enter(&as->a_contents);
    568 	while (AS_ISCLAIMGAP(as))
    569 		cv_wait(&as->a_cv, &as->a_contents);
    570 	AS_SETCLAIMGAP(as);
    571 	mutex_exit(&as->a_contents);
    572 }
    573 
    574 /*
    575  * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
    576  */
    577 void
    578 as_rangeunlock(struct as *as)
    579 {
    580 	mutex_enter(&as->a_contents);
    581 	AS_CLRCLAIMGAP(as);
    582 	cv_signal(&as->a_cv);
    583 	mutex_exit(&as->a_contents);
    584 }
    585 
    586 /*
    587  * compar segments (or just an address) by segment address range
    588  */
    589 static int
    590 as_segcompar(const void *x, const void *y)
    591 {
    592 	struct seg *a = (struct seg *)x;
    593 	struct seg *b = (struct seg *)y;
    594 
    595 	if (a->s_base < b->s_base)
    596 		return (-1);
    597 	if (a->s_base >= b->s_base + b->s_size)
    598 		return (1);
    599 	return (0);
    600 }
    601 
    602 
    603 void
    604 as_avlinit(struct as *as)
    605 {
    606 	avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
    607 	    offsetof(struct seg, s_tree));
    608 	avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
    609 	    offsetof(struct watched_page, wp_link));
    610 }
    611 
    612 /*ARGSUSED*/
    613 static int
    614 as_constructor(void *buf, void *cdrarg, int kmflags)
    615 {
    616 	struct as *as = buf;
    617 
    618 	mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
    619 	cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
    620 	rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
    621 	as_avlinit(as);
    622 	return (0);
    623 }
    624 
    625 /*ARGSUSED1*/
    626 static void
    627 as_destructor(void *buf, void *cdrarg)
    628 {
    629 	struct as *as = buf;
    630 
    631 	avl_destroy(&as->a_segtree);
    632 	mutex_destroy(&as->a_contents);
    633 	cv_destroy(&as->a_cv);
    634 	rw_destroy(&as->a_lock);
    635 }
    636 
    637 void
    638 as_init(void)
    639 {
    640 	as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
    641 	    as_constructor, as_destructor, NULL, NULL, NULL, 0);
    642 }
    643 
    644 /*
    645  * Allocate and initialize an address space data structure.
    646  * We call hat_alloc to allow any machine dependent
    647  * information in the hat structure to be initialized.
    648  */
    649 struct as *
    650 as_alloc(void)
    651 {
    652 	struct as *as;
    653 
    654 	as = kmem_cache_alloc(as_cache, KM_SLEEP);
    655 
    656 	as->a_flags		= 0;
    657 	as->a_vbits		= 0;
    658 	as->a_hrm		= NULL;
    659 	as->a_seglast		= NULL;
    660 	as->a_size		= 0;
    661 	as->a_updatedir		= 0;
    662 	gethrestime(&as->a_updatetime);
    663 	as->a_objectdir		= NULL;
    664 	as->a_sizedir		= 0;
    665 	as->a_userlimit		= (caddr_t)USERLIMIT;
    666 	as->a_lastgap		= NULL;
    667 	as->a_lastgaphl		= NULL;
    668 	as->a_callbacks		= NULL;
    669 
    670 	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
    671 	as->a_hat = hat_alloc(as);	/* create hat for default system mmu */
    672 	AS_LOCK_EXIT(as, &as->a_lock);
    673 
    674 	as->a_xhat = NULL;
    675 
    676 	return (as);
    677 }
    678 
/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 *
 * The function restarts from `top' whenever unmapping a segment returns
 * EAGAIN; `called' makes sure that the one-time steps (marking the as
 * busy and starting the hat teardown) are only performed on the first
 * pass.
 */
void
as_free(struct as *as)
{
	struct hat *hat = as->a_hat;
	struct seg *seg, *next;
	int called = 0;		/* set once the one-time teardown has begun */

top:
	/*
	 * Invoke ALL callbacks. as_do_callbacks will do one callback
	 * per call, and not return (-1) until the callback has completed.
	 * When as_do_callbacks returns zero, all callbacks have completed.
	 */
	mutex_enter(&as->a_contents);
	while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
		;

	/* This will prevent new XHATs from attaching to as */
	if (!called)
		AS_SETBUSY(as);
	mutex_exit(&as->a_contents);
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);

	if (!called) {
		called = 1;
		hat_free_start(hat);
		if (as->a_xhat != NULL)
			xhat_free_start_all(as);
	}
	/*
	 * Unmap every segment.  The successor is fetched before the unmap
	 * call, so the walk does not depend on seg after SEGOP_UNMAP.
	 */
	for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
		int err;

		next = AS_SEGNEXT(as, seg);
		err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
		if (err == EAGAIN) {
			mutex_enter(&as->a_contents);
			if (as->a_callbacks) {
				/*
				 * Callbacks are registered: just drop the
				 * as lock; they are run after the jump back
				 * to top.
				 */
				AS_LOCK_EXIT(as, &as->a_lock);
			} else {
				/*
				 * Memory is currently locked. Wait for a
				 * wakeup on a_cv with the unmap-wait flag
				 * cleared, then try the operation again.
				 * The as lock is released before sleeping.
				 */
				if (AS_ISUNMAPWAIT(as) == 0)
					cv_broadcast(&as->a_cv);
				AS_SETUNMAPWAIT(as);
				AS_LOCK_EXIT(as, &as->a_lock);
				while (AS_ISUNMAPWAIT(as))
					cv_wait(&as->a_cv, &as->a_contents);
			}
			mutex_exit(&as->a_contents);
			goto top;
		} else {
			/*
			 * We do not expect any other error return at this
			 * time. This is similar to an ASSERT in seg_unmap()
			 */
			ASSERT(err == 0);
		}
	}
	/* All segments are gone; finish the hat teardown begun above. */
	hat_free_end(hat);
	if (as->a_xhat != NULL)
		xhat_free_end_all(as);
	AS_LOCK_EXIT(as, &as->a_lock);

	/* /proc stuff */
	ASSERT(avl_numnodes(&as->a_wpage) == 0);
	if (as->a_objectdir) {
		kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
		as->a_objectdir = NULL;
		as->a_sizedir = 0;
	}

	/*
	 * Free the struct as back to kmem.  Assert it has no segments.
	 */
	ASSERT(avl_numnodes(&as->a_segtree) == 0);
	kmem_cache_free(as_cache, as);
}
    765 
    766 int
    767 as_dup(struct as *as, struct as **outas)
    768 {
    769 	struct as *newas;
    770 	struct seg *seg, *newseg;
    771 	int error;
    772 
    773 	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
    774 	as_clearwatch(as);
    775 	newas = as_alloc();
    776 	newas->a_userlimit = as->a_userlimit;
    777 	AS_LOCK_ENTER(newas, &newas->a_lock, RW_WRITER);
    778 
    779 	/* This will prevent new XHATs from attaching */
    780 	mutex_enter(&as->a_contents);
    781 	AS_SETBUSY(as);
    782 	mutex_exit(&as->a_contents);
    783 	mutex_enter(&newas->a_contents);
    784 	AS_SETBUSY(newas);
    785 	mutex_exit(&newas->a_contents);
    786 
    787 	(void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
    788 
    789 	for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
    790 
    791 		if (seg->s_flags & S_PURGE)
    792 			continue;
    793 
    794 		newseg = seg_alloc(newas, seg->s_base, seg->s_size);
    795 		if (newseg == NULL) {
    796 			AS_LOCK_EXIT(newas, &newas->a_lock);
    797 			as_setwatch(as);
    798 			mutex_enter(&as->a_contents);
    799 			AS_CLRBUSY(as);
    800 			mutex_exit(&as->a_contents);
    801 			AS_LOCK_EXIT(as, &as->a_lock);
    802 			as_free(newas);
    803 			return (-1);
    804 		}
    805 		if ((error = SEGOP_DUP(seg, newseg)) != 0) {
    806 			/*
    807 			 * We call seg_free() on the new seg
    808 			 * because the segment is not set up
    809 			 * completely; i.e. it has no ops.
    810 			 */
    811 			as_setwatch(as);
    812 			mutex_enter(&as->a_contents);
    813 			AS_CLRBUSY(as);
    814 			mutex_exit(&as->a_contents);
    815 			AS_LOCK_EXIT(as, &as->a_lock);
    816 			seg_free(newseg);
    817 			AS_LOCK_EXIT(newas, &newas->a_lock);
    818 			as_free(newas);
    819 			return (error);
    820 		}
    821 		newas->a_size += seg->s_size;
    822 	}
    823 
    824 	error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
    825 	if (as->a_xhat != NULL)
    826 		error |= xhat_dup_all(as, newas, NULL, 0, HAT_DUP_ALL);
    827 
    828 	mutex_enter(&newas->a_contents);
    829 	AS_CLRBUSY(newas);
    830 	mutex_exit(&newas->a_contents);
    831 	AS_LOCK_EXIT(newas, &newas->a_lock);
    832 
    833 	as_setwatch(as);
    834 	mutex_enter(&as->a_contents);
    835 	AS_CLRBUSY(as);
    836 	mutex_exit(&as->a_contents);
    837 	AS_LOCK_EXIT(as, &as->a_lock);
    838 	if (error != 0) {
    839 		as_free(newas);
    840 		return (error);
    841 	}
    842 	*outas = newas;
    843 	return (0);
    844 }
    845 
    846 /*
    847  * Handle a ``fault'' at addr for size bytes.
    848  */
    849 faultcode_t
    850 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    851 	enum fault_type type, enum seg_rw rw)
    852 {
    853 	struct seg *seg;
    854 	caddr_t raddr;			/* rounded down addr */
    855 	size_t rsize;			/* rounded up size */
    856 	size_t ssize;
    857 	faultcode_t res = 0;
    858 	caddr_t addrsav;
    859 	struct seg *segsav;
    860 	int as_lock_held;
    861 	klwp_t *lwp = ttolwp(curthread);
    862 	int is_xhat = 0;
    863 	int holding_wpage = 0;
    864 	extern struct seg_ops   segdev_ops;
    865 
    866 
    867 
    868 	if (as->a_hat != hat) {
    869 		/* This must be an XHAT then */
    870 		is_xhat = 1;
    871 
    872 		if ((type != F_INVAL) || (as == &kas))
    873 			return (FC_NOSUPPORT);
    874 	}
    875 
    876 retry:
    877 	if (!is_xhat) {
    878 		/*
    879 		 * Indicate that the lwp is not to be stopped while waiting
    880 		 * for a pagefault.  This is to avoid deadlock while debugging
    881 		 * a process via /proc over NFS (in particular).
    882 		 */
    883 		if (lwp != NULL)
    884 			lwp->lwp_nostop++;
    885 
    886 		/*
    887 		 * same length must be used when we softlock and softunlock.
    888 		 * We don't support softunlocking lengths less than
    889 		 * the original length when there is largepage support.
    890 		 * See seg_dev.c for more comments.
    891 		 */
    892 		switch (type) {
    893 
    894 		case F_SOFTLOCK:
    895 			CPU_STATS_ADD_K(vm, softlock, 1);
    896 			break;
    897 
    898 		case F_SOFTUNLOCK:
    899 			break;
    900 
    901 		case F_PROT:
    902 			CPU_STATS_ADD_K(vm, prot_fault, 1);
    903 			break;
    904 
    905 		case F_INVAL:
    906 			CPU_STATS_ENTER_K();
    907 			CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
    908 			if (as == &kas)
    909 				CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
    910 			CPU_STATS_EXIT_K();
    911 			break;
    912 		}
    913 	}
    914 
    915 	/* Kernel probe */
    916 	TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
    917 	    tnf_opaque,	address,	addr,
    918 	    tnf_fault_type,	fault_type,	type,
    919 	    tnf_seg_access,	access,		rw);
    920 
    921 	raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
    922 	rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
    923 	    (size_t)raddr;
    924 
    925 	/*
    926 	 * XXX -- Don't grab the as lock for segkmap. We should grab it for
    927 	 * correctness, but then we could be stuck holding this lock for
    928 	 * a LONG time if the fault needs to be resolved on a slow
    929 	 * filesystem, and then no-one will be able to exec new commands,
    930 	 * as exec'ing requires the write lock on the as.
    931 	 */
    932 	if (as == &kas && segkmap && segkmap->s_base <= raddr &&
    933 	    raddr + size < segkmap->s_base + segkmap->s_size) {
    934 		/*
    935 		 * if (as==&kas), this can't be XHAT: we've already returned
    936 		 * FC_NOSUPPORT.
    937 		 */
    938 		seg = segkmap;
    939 		as_lock_held = 0;
    940 	} else {
    941 		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
    942 		if (is_xhat && avl_numnodes(&as->a_wpage) != 0) {
    943 			/*
    944 			 * Grab and hold the writers' lock on the as
    945 			 * if the fault is to a watched page.
    946 			 * This will keep CPUs from "peeking" at the
    947 			 * address range while we're temporarily boosting
    948 			 * the permissions for the XHAT device to
    949 			 * resolve the fault in the segment layer.
    950 			 *
    951 			 * We could check whether faulted address
    952 			 * is within a watched page and only then grab
    953 			 * the writer lock, but this is simpler.
    954 			 */
    955 			AS_LOCK_EXIT(as, &as->a_lock);
    956 			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
    957 		}
    958 
    959 		seg = as_segat(as, raddr);
    960 		if (seg == NULL) {
    961 			AS_LOCK_EXIT(as, &as->a_lock);
    962 			if ((lwp != NULL) && (!is_xhat))
    963 				lwp->lwp_nostop--;
    964 			return (FC_NOMAP);
    965 		}
    966 
    967 		as_lock_held = 1;
    968 	}
    969 
    970 	addrsav = raddr;
    971 	segsav = seg;
    972 
    973 	for (; rsize != 0; rsize -= ssize, raddr += ssize) {
    974 		if (raddr >= seg->s_base + seg->s_size) {
    975 			seg = AS_SEGNEXT(as, seg);
    976 			if (seg == NULL || raddr != seg->s_base) {
    977 				res = FC_NOMAP;
    978 				break;
    979 			}
    980 		}
    981 		if (raddr + rsize > seg->s_base + seg->s_size)
    982 			ssize = seg->s_base + seg->s_size - raddr;
    983 		else
    984 			ssize = rsize;
    985 
    986 		if (!is_xhat || (seg->s_ops != &segdev_ops)) {
    987 
    988 			if (is_xhat && avl_numnodes(&as->a_wpage) != 0 &&
    989 			    pr_is_watchpage_as(raddr, rw, as)) {
    990 				/*
    991 				 * Handle watch pages.  If we're faulting on a
    992 				 * watched page from an X-hat, we have to
    993 				 * restore the original permissions while we
    994 				 * handle the fault.
    995 				 */
    996 				as_clearwatch(as);
    997 				holding_wpage = 1;
    998 			}
    999 
   1000 			res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
   1001 
   1002 			/* Restore watchpoints */
   1003 			if (holding_wpage) {
   1004 				as_setwatch(as);
   1005 				holding_wpage = 0;
   1006 			}
   1007 
   1008 			if (res != 0)
   1009 				break;
   1010 		} else {
   1011 			/* XHAT does not support seg_dev */
   1012 			res = FC_NOSUPPORT;
   1013 			break;
   1014 		}
   1015 	}
   1016 
   1017 	/*
   1018 	 * If we were SOFTLOCKing and encountered a failure,
   1019 	 * we must SOFTUNLOCK the range we already did. (Maybe we
   1020 	 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
   1021 	 * right here...)
   1022 	 */
   1023 	if (res != 0 && type == F_SOFTLOCK) {
   1024 		for (seg = segsav; addrsav < raddr; addrsav += ssize) {
   1025 			if (addrsav >= seg->s_base + seg->s_size)
   1026 				seg = AS_SEGNEXT(as, seg);
   1027 			ASSERT(seg != NULL);
   1028 			/*
   1029 			 * Now call the fault routine again to perform the
   1030 			 * unlock using S_OTHER instead of the rw variable
   1031 			 * since we never got a chance to touch the pages.
   1032 			 */
   1033 			if (raddr > seg->s_base + seg->s_size)
   1034 				ssize = seg->s_base + seg->s_size - addrsav;
   1035 			else
   1036 				ssize = raddr - addrsav;
   1037 			(void) SEGOP_FAULT(hat, seg, addrsav, ssize,
   1038 			    F_SOFTUNLOCK, S_OTHER);
   1039 		}
   1040 	}
   1041 	if (as_lock_held)
   1042 		AS_LOCK_EXIT(as, &as->a_lock);
   1043 	if ((lwp != NULL) && (!is_xhat))
   1044 		lwp->lwp_nostop--;
   1045 
   1046 	/*
   1047 	 * If the lower levels returned EDEADLK for a fault,
   1048 	 * it means that we should retry the fault.  Let's also wait
   1049 	 * a bit to let the deadlock-causing condition clear.
   1050 	 * This is part of a gross hack to work around a design flaw
   1051 	 * in the ufs/sds logging code and should go away when the
   1052 	 * logging code is re-designed to fix the problem. See bug
   1053 	 * 4125102 for details of the problem.
   1054 	 */
   1055 	if (FC_ERRNO(res) == EDEADLK) {
   1056 		delay(deadlk_wait);
   1057 		res = 0;
   1058 		goto retry;
   1059 	}
   1060 	return (res);
   1061 }
   1062 
   1063 
   1064 
   1065 /*
   1066  * Asynchronous ``fault'' at addr for size bytes.
   1067  */
   1068 faultcode_t
   1069 as_faulta(struct as *as, caddr_t addr, size_t size)
   1070 {
   1071 	struct seg *seg;
   1072 	caddr_t raddr;			/* rounded down addr */
   1073 	size_t rsize;			/* rounded up size */
   1074 	faultcode_t res = 0;
   1075 	klwp_t *lwp = ttolwp(curthread);
   1076 
   1077 retry:
   1078 	/*
   1079 	 * Indicate that the lwp is not to be stopped while waiting
   1080 	 * for a pagefault.  This is to a