Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)page_lock.c	1.66	07/08/16 SMI"
     27 
     28 /*
     29  * VM - page locking primitives
     30  */
     31 #include <sys/param.h>
     32 #include <sys/t_lock.h>
     33 #include <sys/vtrace.h>
     34 #include <sys/debug.h>
     35 #include <sys/cmn_err.h>
     36 #include <sys/vnode.h>
     37 #include <sys/bitmap.h>
     38 #include <sys/lockstat.h>
     39 #include <sys/sysmacros.h>
     40 #include <sys/condvar_impl.h>
     41 #include <vm/page.h>
     42 #include <vm/seg_enum.h>
     43 #include <vm/vm_dep.h>
     44 
     45 /*
     46  * This global mutex is for logical page locking.
     47  * The following fields in the page structure are protected
     48  * by this lock:
     49  *
     50  *	p_lckcnt
     51  *	p_cowcnt
     52  */
kmutex_t page_llock;		/* guards p_lckcnt and p_cowcnt */

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 *
 * Neither this mutex nor page_llock is initialized explicitly in this
 * file; page_lock_init() below has an empty body.
 */
kmutex_t  page_freelock;
     61 
     62 /*
     63  * The hash table, page_hash[], the p_selock fields, and the
     64  * list of pages associated with vnodes are protected by arrays of mutexes.
     65  *
     66  * Unless the hashes are changed radically, the table sizes must be
     67  * a power of two.  Also, we typically need more mutexes for the
     68  * vnodes since these locks are occasionally held for long periods.
     69  * And since there seem to be two special vnodes (kvp and swapvp),
     70  * we make room for private mutexes for them.
     71  *
     72  * The pse_mutex[] array holds the mutexes to protect the p_selock
     73  * fields of all page_t structures.
     74  *
     75  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
     76  * when given a pointer to a page_t.
     77  *
     78  * PIO_TABLE_SIZE must be a power of two.  One could argue that we
     79  * should go to the trouble of setting it up at run time and base it
     80  * on memory size rather than the number of compile time CPUs.
     81  *
     82  * XX64	We should be using physmem size to calculate PIO_SHIFT.
     83  *
     84  *	These might break in 64 bit world.
     85  */
     86 #define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
     87 #define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */
     88 
     89 pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
     90 kmutex_t	pio_mutex[PIO_TABLE_SIZE];
     91 
     92 #define	PAGE_IO_MUTEX(pp) \
     93 	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
     94 
     95 /*
     96  * The pse_mutex[] array is allocated in the platform startup code
     97  * based on the size of the machine at startup.
     98  */
     99 extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
    100 extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
    101 extern int pse_shift;			/* log2(pse_table_size) */
    102 #define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
    103 	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
    104 	(pse_table_size - 1)].pad_mutex
    105 
    106 #define	PSZC_MTX_TABLE_SIZE	128
    107 #define	PSZC_MTX_TABLE_SHIFT	7
    108 
    109 static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
    110 
    111 #define	PAGE_SZC_MUTEX(_pp) \
    112 	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
    113 		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
    114 		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
    115 		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
    116 
    117 /*
    118  * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
    119  * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
    120  * and p_vpnext).
    121  *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel vnodes kvp and zvp are referenced
 * frequently enough to warrant their own mutexes.
    126  *
    127  * The VP_HASH_FUNC returns the index into the vph_mutex array given
    128  * an address of a vnode.
    129  */
    130 
    131 /*
    132  * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
    133  *	Need to review again.
    134  */
/*
 * Number of hashed slots in vph_mutex[]: 8 << VP_SHIFT on LP64 kernels,
 * 2 << VP_SHIFT otherwise.  Both forms are powers of two, which
 * VP_HASH_FUNC() relies on when it masks with (VPH_TABLE_SIZE - 1).
 */
#if defined(_LP64)
#define	VPH_TABLE_SIZE  (1 << (VP_SHIFT + 3))
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
#endif

/*
 * Hash a vnode address into [0, VPH_TABLE_SIZE): four copies of the
 * address, shifted right by 6, 8, 10 and 12 bits, are added together
 * and the sum is masked to the table size.
 */
#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))
    147 
extern	struct vnode	kvp;

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.  page_vnode_mutex() hands out these two slots by
 * comparing the vnode address against &kvp and &zvp; other vnodes are
 * hashed into slots 0 .. VPH_TABLE_SIZE - 1 with VP_HASH_FUNC().
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
    157 
    158 /*
    159  * Initialize the locks used by the Virtual Memory Management system.
    160  */
    161 void
    162 page_lock_init()
    163 {
    164 }
    165 
    166 /*
    167  * Return a value for pse_shift based on npg (the number of physical pages)
    168  * and ncpu (the maximum number of CPUs).  This is called by platform startup
    169  * code.
    170  *
    171  * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
    172  * locks grew approximately as the square of the number of threads executing.
    173  * So the primary scaling factor used is NCPU^2.  The size of the machine in
    174  * megabytes is used as an upper bound, particularly for sun4v machines which
    175  * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
    176  * (128) is used as a minimum.  Since the size of the table has to be a power
    177  * of two, the calculated size is rounded up to the next power of two.
    178  */
    179 /*ARGSUSED*/
    180 int
    181 size_pse_array(pgcnt_t npg, int ncpu)
    182 {
    183 	size_t size;
    184 	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
    185 
    186 	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
    187 	size += (1 << (highbit(size) - 1)) - 1;
    188 	return (highbit(size) - 1);
    189 }
    190 
    191 /*
    192  * At present we only use page ownership to aid debugging, so it's
    193  * OK if the owner field isn't exact.  In the 32-bit world two thread ids
    194  * can map to the same owner because we just 'or' in 0x80000000 and
    195  * then clear the second highest bit, so that (for example) 0x2faced00
    196  * and 0xafaced00 both map to 0xafaced00.
    197  * In the 64-bit world, p_selock may not be large enough to hold a full
    198  * thread pointer.  If we ever need precise ownership (e.g. if we implement
    199  * priority inheritance for page locks) then p_selock should become a
    200  * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
    201  */
/*
 * SE_WRITER is the p_selock value stored by an exclusive holder: the
 * current thread pointer with the INT_MIN (sign) bit forced on and the
 * SE_EWANTED bit cleared.  Code in this file tests p_selock < 0 to
 * detect an exclusive holder.
 *
 * SE_READER is the amount added to p_selock for each shared holder.
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)
    212 
    213 #ifdef VM_STATS
    214 uint_t	vph_kvp_count;
    215 uint_t	vph_swapfsvp_count;
    216 uint_t	vph_other;
    217 #endif /* VM_STATS */
    218 
    219 #ifdef VM_STATS
    220 uint_t	page_lock_count;
    221 uint_t	page_lock_miss;
    222 uint_t	page_lock_miss_lock;
    223 uint_t	page_lock_reclaim;
    224 uint_t	page_lock_bad_reclaim;
    225 uint_t	page_lock_same_page;
    226 uint_t	page_lock_upgrade;
    227 uint_t	page_lock_retired;
    228 uint_t	page_lock_upgrade_failed;
    229 uint_t	page_lock_deleted;
    230 
    231 uint_t	page_trylock_locked;
    232 uint_t	page_trylock_failed;
    233 uint_t	page_trylock_missed;
    234 
    235 uint_t	page_try_reclaim_upgrade;
    236 #endif /* VM_STATS */
    237 
    238 /*
    239  * Acquire the "shared/exclusive" lock on a page.
    240  *
    241  * Returns 1 on success and locks the page appropriately.
    242  *	   0 on failure and does not lock the page.
    243  *
    244  * If `lock' is non-NULL, it will be dropped and reacquired in the
    245  * failure case.  This routine can block, and if it does
    246  * it will always return a failure since the page identity [vp, off]
    247  * or state may have changed.
    248  */
    249 
    250 int
    251 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
    252 {
    253 	return (page_lock_es(pp, se, lock, reclaim, 0));
    254 }
    255 
    256 /*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may avoid being starved by
 * shared-lock (reader) requests by setting SE_EXCL_WANTED in the es
 * parameter.  In this case, when an exclusive lock cannot be acquired,
 * p_selock's SE_EWANTED bit is set, and shared-lock requests are denied
 * while it remains set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
    263  *
    264  * The se and es parameters determine if the lock should be granted
    265  * based on the following decision table:
    266  *
    267  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
    268  * ----------- -------------- -------------------  ---------
    269  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
    270  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
    271  * SE_EXCL        none         any lock/any        deny
    272  * SE_SHARED      n/a [2]        shared/0          grant
    273  * SE_SHARED      n/a [2]      unlocked/0          grant
    274  * SE_SHARED      n/a            shared/1          deny
    275  * SE_SHARED      n/a          unlocked/1          deny
    276  * SE_SHARED      n/a              excl/any        deny
    277  *
    278  * Notes:
    279  * [1] The code grants an exclusive lock to the caller and clears the bit
    280  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
    281  *   bit's value.  This was deemed acceptable as we are not concerned about
    282  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
    283  *   fifo mechanism should also be implemented. Meantime, the thread that
    284  *   set SE_EWANTED should be prepared to catch this condition and reset it
    285  *
    286  * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
    288  *
    289  * Notes on values of "es":
    290  *
    291  *   es & 1: page_lookup_create will attempt page relocation
    292  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
    293  *       memory thread); this prevents reader-starvation of waiting
    294  *       writer thread(s) by giving priority to writers over readers.
    295  *   es & SE_RETIRED: caller wants to lock pages even if they are
    296  *       retired.  Default is to deny the lock if the page is retired.
    297  *
    298  * And yes, we know, the semantics of this function are too complicated.
    299  * It's on the list to be cleaned up.
    300  */
/*
 * Summary of the contract implemented below:
 *
 *   pp       page whose p_selock is to be acquired.
 *   se       SE_SHARED or SE_EXCL.
 *   lock     optional mutex held by the caller (asserted held when
 *            non-NULL); it is dropped and reacquired only on the path
 *            that blocks in cv_wait().
 *   reclaim  when P_RECLAIM and the page is free, the page is locked
 *            exclusively and page_reclaim() is called on it.
 *   es       extended flags, see the notes above.
 *
 * Returns 1 with the page locked, or 0 with the page not locked.  The
 * retired-page and SE_DELETED failures return 0 immediately, without
 * blocking and without touching `lock'; every other failure blocks on
 * pp->p_cv once and then returns 0 so that the caller can retry.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	/* SE_EXCL_WANTED is only meaningful together with SE_EXCL. */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* Retired pages are refused unless the caller passed SE_RETIRED. */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	/*
	 * es == 1 (exactly) on a completely unlocked page turns a shared
	 * request into an exclusive one.  `upgraded' is not set here, so
	 * the lock is not downgraded again before returning.
	 */
	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		/*
		 * Shared request: granted only when no writer holds the
		 * page (p_selock >= 0) and no writer has set SE_EWANTED.
		 */
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		/* A deleted page fails at once; there is nothing to wait for. */
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		/* Drop the caller's lock before sleeping. */
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE:page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				/*
				 * The caller asked for a shared lock; give
				 * back the exclusive lock we forced above.
				 */
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
    476 
    477 /*
    478  * Clear the SE_EWANTED bit from p_selock.  This function allows
    479  * callers of page_lock_es and page_try_reclaim_lock to clear
    480  * their setting of this bit if they decide they no longer wish
    481  * to gain exclusive access to the page.  Currently only
    482  * delete_memory_thread uses this when the delete memory
    483  * operation is cancelled.
    484  */
    485 void
    486 page_lock_clr_exclwanted(page_t *pp)
    487 {
    488 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    489 
    490 	mutex_enter(pse);
    491 	pp->p_selock &= ~SE_EWANTED;
    492 	if (CV_HAS_WAITERS(&pp->p_cv))
    493 		cv_broadcast(&pp->p_cv);
    494 	mutex_exit(pse);
    495 }
    496 
    497 /*
    498  * Read the comments inside of page_lock_es() carefully.
    499  *
    500  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
    501  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
    502  * This is used by threads subject to reader-starvation (eg. memory delete).
    503  *
    504  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
    505  * it is expected that it will retry at a later time.  Threads that will
    506  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
    507  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
    508  * the bit is cleared.)
    509  */
    510 int
    511 page_try_reclaim_lock(page_t *pp, se_t se, int es)
    512 {
    513 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    514 	selock_t old;
    515 
    516 	mutex_enter(pse);
    517 
    518 	old = pp->p_selock;
    519 
    520 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
    521 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
    522 
    523 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
    524 		mutex_exit(pse);
    525 		VM_STAT_ADD(page_trylock_failed);
    526 		return (0);
    527 	}
    528 
    529 	if (se == SE_SHARED && es == 1 && old == 0) {
    530 		se = SE_EXCL;
    531 	}
    532 
    533 	if (se == SE_SHARED) {
    534 		if (!PP_ISFREE(pp)) {
    535 			if (old >= 0) {
    536 				/*
    537 				 * Readers are not allowed when excl wanted
    538 				 */
    539 				if ((old & SE_EWANTED) == 0) {
    540 					pp->p_selock = old + SE_READER;
    541 					mutex_exit(pse);
    542 					return (1);
    543 				}
    544 			}
    545 			mutex_exit(pse);
    546 			return (0);
    547 		}
    548 		/*
    549 		 * The page is free, so we really want SE_EXCL (below)
    550 		 */
    551 		VM_STAT_ADD(page_try_reclaim_upgrade);
    552 	}
    553 
    554 	/*
    555 	 * The caller wants a writer lock.  We try for it only if
    556 	 * SE_EWANTED is not set, or if the caller specified
    557 	 * SE_EXCL_WANTED.
    558 	 */
    559 	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
    560 		if ((old & ~SE_EWANTED) == 0) {
    561 			/* no reader/writer lock held */
    562 			THREAD_KPRI_REQUEST();
    563 			/* this clears out our setting of the SE_EWANTED bit */
    564 			pp->p_selock = SE_WRITER;
    565 			mutex_exit(pse);
    566 			return (1);
    567 		}
    568 	}
    569 	if (es & SE_EXCL_WANTED) {
    570 		/* page is locked, set the SE_EWANTED bit */
    571 		pp->p_selock |= SE_EWANTED;
    572 	}
    573 	mutex_exit(pse);
    574 	return (0);
    575 }
    576 
    577 /*
    578  * Acquire a page's "shared/exclusive" lock, but never block.
    579  * Returns 1 on success, 0 on failure.
    580  */
    581 int
    582 page_trylock(page_t *pp, se_t se)
    583 {
    584 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    585 
    586 	mutex_enter(pse);
    587 	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
    588 	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
    589 		/*
    590 		 * Fail if a thread wants exclusive access and page is
    591 		 * retired, if the page is slated for retirement, or a
    592 		 * share lock is requested.
    593 		 */
    594 		mutex_exit(pse);
    595 		VM_STAT_ADD(page_trylock_failed);
    596 		return (0);
    597 	}
    598 
    599 	if (se == SE_EXCL) {
    600 		if (pp->p_selock == 0) {
    601 			THREAD_KPRI_REQUEST();
    602 			pp->p_selock = SE_WRITER;
    603 			mutex_exit(pse);
    604 			return (1);
    605 		}
    606 	} else {
    607 		if (pp->p_selock >= 0) {
    608 			pp->p_selock += SE_READER;
    609 			mutex_exit(pse);
    610 			return (1);
    611 		}
    612 	}
    613 	mutex_exit(pse);
    614 	return (0);
    615 }
    616 
    617 /*
    618  * Variant of page_unlock() specifically for the page freelist
    619  * code. The mere existence of this code is a vile hack that
    620  * has resulted due to the backwards locking order of the page
    621  * freelist manager; please don't call it.
    622  */
    623 void
    624 page_unlock_nocapture(page_t *pp)
    625 {
    626 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    627 	selock_t old;
    628 
    629 	mutex_enter(pse);
    630 
    631 	old = pp->p_selock;
    632 	if ((old & ~SE_EWANTED) == SE_READER) {
    633 		pp->p_selock = old & ~SE_READER;
    634 		if (CV_HAS_WAITERS(&pp->p_cv))
    635 			cv_broadcast(&pp->p_cv);
    636 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
    637 		panic("page_unlock_nocapture: page %p is deleted", pp);
    638 	} else if (old < 0) {
    639 		THREAD_KPRI_RELEASE();
    640 		pp->p_selock &= SE_EWANTED;
    641 		if (CV_HAS_WAITERS(&pp->p_cv))
    642 			cv_broadcast(&pp->p_cv);
    643 	} else if ((old & ~SE_EWANTED) > SE_READER) {
    644 		pp->p_selock = old - SE_READER;
    645 	} else {
    646 		panic("page_unlock_nocapture: page %p is not locked", pp);
    647 	}
    648 
    649 	mutex_exit(pse);
    650 }
    651 
    652 /*
    653  * Release the page's "shared/exclusive" lock and wake up anyone
    654  * who might be waiting for it.
    655  */
    656 void
    657 page_unlock(page_t *pp)
    658 {
    659 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    660 	selock_t old;
    661 
    662 	mutex_enter(pse);
    663 
    664 	old = pp->p_selock;
    665 	if ((old & ~SE_EWANTED) == SE_READER) {
    666 		pp->p_selock = old & ~SE_READER;
    667 		if (CV_HAS_WAITERS(&pp->p_cv))
    668 			cv_broadcast(&pp->p_cv);
    669 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
    670 		panic("page_unlock: page %p is deleted", pp);
    671 	} else if (old < 0) {
    672 		THREAD_KPRI_RELEASE();
    673 		pp->p_selock &= SE_EWANTED;
    674 		if (CV_HAS_WAITERS(&pp->p_cv))
    675 			cv_broadcast(&pp->p_cv);
    676 	} else if ((old & ~SE_EWANTED) > SE_READER) {
    677 		pp->p_selock = old - SE_READER;
    678 	} else {
    679 		panic("page_unlock: page %p is not locked", pp);
    680 	}
    681 
    682 	if (pp->p_selock == 0) {
    683 		/*
    684 		 * If the T_CAPTURING bit is set, that means that we should
    685 		 * not try and capture the page again as we could recurse
    686 		 * which could lead to a stack overflow panic or spending a
    687 		 * relatively long time in the kernel making no progress.
    688 		 */
    689 		if ((pp->p_toxic & PR_CAPTURE) &&
    690 		    !(curthread->t_flag & T_CAPTURING) &&
    691 		    !PP_RETIRED(pp)) {
    692 			THREAD_KPRI_REQUEST();
    693 			pp->p_selock = SE_WRITER;
    694 			mutex_exit(pse);
    695 			page_unlock_capture(pp);
    696 		} else {
    697 			mutex_exit(pse);
    698 		}
    699 	} else {
    700 		mutex_exit(pse);
    701 	}
    702 }
    703 
    704 /*
    705  * Try to upgrade the lock on the page from a "shared" to an
    706  * "exclusive" lock.  Since this upgrade operation is done while
    707  * holding the mutex protecting this page, no one else can acquire this page's
    708  * lock and change the page. Thus, it is safe to drop the "shared"
    709  * lock and attempt to acquire the "exclusive" lock.
    710  *
    711  * Returns 1 on success, 0 on failure.
    712  */
    713 int
    714 page_tryupgrade(page_t *pp)
    715 {
    716 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    717 
    718 	mutex_enter(pse);
    719 	if (!(pp->p_selock & SE_EWANTED)) {
    720 		/* no threads want exclusive access, try upgrade */
    721 		if (pp->p_selock == SE_READER) {
    722 			THREAD_KPRI_REQUEST();
    723 			/* convert to exclusive lock */
    724 			pp->p_selock = SE_WRITER;
    725 			mutex_exit(pse);
    726 			return (1);
    727 		}
    728 	}
    729 	mutex_exit(pse);
    730 	return (0);
    731 }
    732 
    733 /*
    734  * Downgrade the "exclusive" lock on the page to a "shared" lock
    735  * while holding the mutex protecting this page's p_selock field.
    736  */
    737 void
    738 page_downgrade(page_t *pp)
    739 {
    740 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    741 	int excl_waiting;
    742 
    743 	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
    744 	ASSERT(PAGE_EXCL(pp));
    745 
    746 	mutex_enter(pse);
    747 	excl_waiting =  pp->p_selock & SE_EWANTED;
    748 	THREAD_KPRI_RELEASE();
    749 	pp->p_selock = SE_READER | excl_waiting;
    750 	if (CV_HAS_WAITERS(&pp->p_cv))
    751 		cv_broadcast(&pp->p_cv);
    752 	mutex_exit(pse);
    753 }
    754 
    755 void
    756 page_lock_delete(page_t *pp)
    757 {
    758 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
    759 
    760 	ASSERT(PAGE_EXCL(pp));
    761 	ASSERT(pp->p_vnode == NULL);
    762 	ASSERT(pp->p_offset == (u_offset_t)-1);
    763 	ASSERT(!PP_ISFREE(pp));
    764 
    765 	mutex_enter(pse);
    766 	THREAD_KPRI_RELEASE();
    767 	pp->p_selock = SE_DELETED;
    768 	if (CV_HAS_WAITERS(&pp->p_cv))
    769 		cv_broadcast(&pp->p_cv);
    770 	mutex_exit(pse);
    771 }
    772 
    773 int
    774 page_deleted(page_t *pp)
    775 {
    776 	return (pp->p_selock == SE_DELETED);
    777 }
    778 
    779 /*
    780  * Implement the io lock for pages
    781  */
    782 void
    783 page_iolock_init(page_t *pp)
    784 {
    785 	pp->p_iolock_state = 0;
    786 	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
    787 }
    788 
    789 /*
    790  * Acquire the i/o lock on a page.
    791  */
    792 void
    793 page_io_lock(page_t *pp)
    794 {
    795 	kmutex_t *pio;
    796 
    797 	pio = PAGE_IO_MUTEX(pp);
    798 	mutex_enter(pio);
    799 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
    800 		cv_wait(&(pp->p_io_cv), pio);
    801 	}
    802 	pp->p_iolock_state |= PAGE_IO_INUSE;
    803 	mutex_exit(pio);
    804 }
    805 
    806 /*
    807  * Release the i/o lock on a page.
    808  */
    809 void
    810 page_io_unlock(page_t *pp)
    811 {
    812 	kmutex_t *pio;
    813 
    814 	pio = PAGE_IO_MUTEX(pp);
    815 	mutex_enter(pio);
    816 	cv_broadcast(&pp->p_io_cv);
    817 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
    818 	mutex_exit(pio);
    819 }
    820 
    821 /*
    822  * Try to acquire the i/o lock on a page without blocking.
    823  * Returns 1 on success, 0 on failure.
    824  */
    825 int
    826 page_io_trylock(page_t *pp)
    827 {
    828 	kmutex_t *pio;
    829 
    830 	if (pp->p_iolock_state & PAGE_IO_INUSE)
    831 		return (0);
    832 
    833 	pio = PAGE_IO_MUTEX(pp);
    834 	mutex_enter(pio);
    835 
    836 	if (pp->p_iolock_state & PAGE_IO_INUSE) {
    837 		mutex_exit(pio);
    838 		return (0);
    839 	}
    840 	pp->p_iolock_state |= PAGE_IO_INUSE;
    841 	mutex_exit(pio);
    842 
    843 	return (1);
    844 }
    845 
    846 /*
    847  * Wait until the i/o lock is not held.
    848  */
    849 void
    850 page_io_wait(page_t *pp)
    851 {
    852 	kmutex_t *pio;
    853 
    854 	pio = PAGE_IO_MUTEX(pp);
    855 	mutex_enter(pio);
    856 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
    857 		cv_wait(&(pp->p_io_cv), pio);
    858 	}
    859 	mutex_exit(pio);
    860 }
    861 
    862 /*
    863  * Returns 1 on success, 0 on failure.
    864  */
    865 int
    866 page_io_locked(page_t *pp)
    867 {
    868 	return (pp->p_iolock_state & PAGE_IO_INUSE);
    869 }
    870 
    871 /*
    872  * Assert that the i/o lock on a page is held.
    873  * Returns 1 on success, 0 on failure.
    874  */
    875 int
    876 page_iolock_assert(page_t *pp)
    877 {
    878 	return (page_io_locked(pp));
    879 }
    880 
    881 /*
    882  * Wrapper exported to kernel routines that are built
    883  * platform-independent (the macro is platform-dependent;
    884  * the size of vph_mutex[] is based on NCPU).
    885  *
    886  * Note that you can do stress testing on this by setting the
    887  * variable page_vnode_mutex_stress to something other than
    888  * zero in a DEBUG kernel in a debugger after loading the kernel.
    889  * Setting it after the kernel is running may not work correctly.
    890  */
    891 #ifdef DEBUG
    892 static int page_vnode_mutex_stress = 0;
    893 #endif
    894 
    895 kmutex_t *
    896 page_vnode_mutex(vnode_t *vp)
    897 {
    898 	if (vp == &kvp)
    899 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
    900 
    901 	if (vp == &zvp)
    902 		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
    903 #ifdef DEBUG
    904 	if (page_vnode_mutex_stress != 0)
    905 		return (&vph_mutex[0]);
    906 #endif
    907 
    908 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
    909 }
    910 
    911 kmutex_t *
    912 page_se_mutex(page_t *pp)
    913 {
    914 	return (PAGE_SE_MUTEX(pp));
    915 }
    916 
    917 #ifdef VM_STATS
    918 uint_t pszclck_stat[4];
    919 #endif
    920 /*
    921  * Find, take and return a mutex held by hat_page_demote().
    922  * Called by page_demote_vp_pages() before hat_page_demote() call and by
    923  * routines that want to block hat_page_demote() but can't do it
    924  * via locking all constituent pages.
    925  *
    926  * Return NULL if p_szc is 0.
    927  *
    928  * It should only be used for pages that can be demoted by hat_page_demote()
    929  * i.e. non swapfs file system pages.  The logic here is lifted from
    930  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
    931  * since the page is locked and not free.
    932  *
    933  * Hash of the root page is used to find the lock.
    934  * To find the root in the presense of hat_page_demote() chageing the location
    935  * of the root this routine relies on the fact that hat_page_demote() changes
    936  * root last.
    937  *
    938  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
    939  * returned pp's p_szc may be any value.
    940  */
    941 kmutex_t *
    942 page_szc_lock(page_t *pp)
    943 {
    944 	kmutex_t	*mtx;
    945 	page_t		*rootpp;
    946 	uint_t		szc;
    947 	uint_t		rszc;
    948 	uint_t		pszc = pp->p_szc;
    949 
    950 	ASSERT(pp != NULL);
    951 	ASSERT(PAGE_LOCKED(pp));
    952 	ASSERT(!PP_ISFREE(pp));
    953 	ASSERT(pp->p_vnode != NULL);
    954 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
    955 	ASSERT(!PP_ISKAS(pp));
    956 
    957 again:
    958 	if (pszc == 0) {
    959 		VM_STAT_ADD(pszclck_stat[0]);
    960 		return (NULL);
    961 	}
    962 
    963 	/* The lock lives in the root page */
    964 
    965 	rootpp = PP_GROUPLEADER(pp, pszc);
    966 	mtx = PAGE_SZC_MUTEX(rootpp);
    967 	mutex_enter(mtx);
    968 
    969 	/*
    970 	 * since p_szc can only decrease if pp == rootpp
    971 	 * rootpp will be always the same i.e we have the right root
    972 	 * regardless of rootpp->p_szc.
    973 	 * If location of pp's root didn't change after we took
    974 	 * the lock we have the right root. return mutex hashed off it.
    975 	 */
    976 	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
    977 		VM_STAT_ADD(pszclck_stat[1]);
    978 		return (mtx);
    979 	}
    980 
    981 	/*
    982 	 * root location changed because page got demoted.
    983 	 * locate the new root.
    984 	 */
    985 	if (rszc < pszc) {
    986 		szc = pp->p_szc;
    987 		ASSERT(szc < pszc);
    988 		mutex_exit(mtx);
    989 		pszc = szc;
    990 		VM_STAT_ADD(pszclck_stat[2]);
    991 		goto again;
    992 	}
    993 
    994 	VM_STAT_ADD(pszclck_stat[3]);
    995 	/*
    996 	 * current hat_page_demote not done yet.
    997 	 * wait for it to finish.
    998 	 */
    999 	mutex_exit(mtx);
   1000 	rootpp = PP_GROUPLEADER(rootpp, rszc);
   1001 	mtx = PAGE_SZC_MUTEX(rootpp);
   1002 	mutex_enter(mtx);
   1003 	mutex_exit(mtx);
   1004 	ASSERT(rootpp->p_szc < rszc);
   1005 	goto again;
   1006 }
   1007 
   1008 int
   1009 page_szc_lock_assert(page_t *pp)
   1010 {
   1011 	page_t *rootpp = PP_PAGEROOT(pp);
   1012 	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
   1013 
   1014 	return (MUTEX_HELD(mtx));
   1015 }
   1016 
   1017 /*
   1018  * memseg locking
   1019  */
   1020 static krwlock_t memsegslock;
   1021 
   1022 /*
   1023  * memlist (phys_install, phys_avail) locking.
   1024  */
   1025 static krwlock_t memlists_lock;
   1026 
   1027 void
   1028 memsegs_lock(int writer)
   1029 {
   1030 	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
   1031 }
   1032 
   1033 /*ARGSUSED*/
   1034 void
   1035 memsegs_unlock(int writer)
   1036 {
   1037 	rw_exit(&memsegslock);
   1038 }
   1039 
   1040 int
   1041 memsegs_lock_held(void)
   1042 {
   1043 	return (RW_LOCK_HELD(&memsegslock));
   1044 }
   1045 
   1046 void
   1047 memlist_read_lock(void)
   1048 {
   1049 	rw_enter(&memlists_lock, RW_READER);
   1050 }
   1051 
   1052 void
   1053 memlist_read_unlock(void)
   1054 {
   1055 	rw_exit(&memlists_lock);
   1056 }
   1057 
   1058 void
   1059 memlist_write_lock(void)
   1060 {
   1061 	rw_enter(&memlists_lock, RW_WRITER);
   1062 }
   1063 
   1064 void
   1065 memlist_write_unlock(void)
   1066 {
   1067 	rw_exit(&memlists_lock);
   1068 }
   1069