Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #ifndef	_VM_PAGE_H
     40 #define	_VM_PAGE_H
     41 
     42 #pragma ident	"@(#)page.h	1.188	07/11/12 SMI"
     43 
     44 #include <vm/seg.h>
     45 
     46 #ifdef	__cplusplus
     47 extern "C" {
     48 #endif
     49 
     50 #if defined(_KERNEL) || defined(_KMEMUSER)
     51 
     52 /*
     53  * Shared/Exclusive lock.
     54  */
     55 
     56 /*
     57  * Types of page locking supported by page_lock & friends.
     58  */
     59 typedef enum {
     60 	SE_SHARED,		/* shared lock (enum value is 0) */
     61 	SE_EXCL			/* exclusive lock (enum value is 1) */
     62 } se_t;
     63 
     64 /*
     65  * For requesting that page_lock reclaim the page from the free list.
     66  */
     67 typedef enum {
     68 	P_RECLAIM,		/* reclaim page from free list */
     69 	P_NO_RECLAIM		/* DON'T reclaim the page	*/
     70 } reclaim_t;
     71 
     72 /*
     73  * Callers of page_try_reclaim_lock and page_lock_es can use this flag
     74  * to get SE_EXCL access before reader/writers are given access.
     75  */
     76 #define	SE_EXCL_WANTED	0x02
     77 
     78 /*
     79  * All page_*lock() requests will be denied unless this flag is set in
     80  * the 'es' parameter.
     81  */
     82 #define	SE_RETIRED	0x04
     83 
     84 #endif	/* _KERNEL | _KMEMUSER */
     85 
     86 typedef int	selock_t;
     87 
     88 /*
     89  * Define VM_STATS to turn on all sorts of statistic gathering about
     90  * the VM layer.  By default, it is only turned on when DEBUG is
     91  * also defined.  Without it the VM_STAT_*() macros expand to nothing.
     92  */
     93 #ifdef DEBUG
     94 #define	VM_STATS
     95 #endif	/* DEBUG */
     96 
     97 #ifdef VM_STATS
     98 #define	VM_STAT_ADD(stat)			(stat)++
     99 #define	VM_STAT_COND_ADD(cond, stat)		((void) (!(cond) || (stat)++))
    100 #else	/* !VM_STATS */
    101 #define	VM_STAT_ADD(stat)
    102 #define	VM_STAT_COND_ADD(cond, stat)
    103 #endif	/* VM_STATS */
    104 
    105 #ifdef _KERNEL
    106 
    107 /*
    108  * Acquire/release the global page logical lock, page_llock (pp is unused).
    109  */
    110 #define	page_struct_lock(pp)	mutex_enter(&page_llock)
    111 #define	page_struct_unlock(pp)	mutex_exit(&page_llock)
    112 
    113 #endif	/* _KERNEL */
    114 
    115 #include <sys/t_lock.h>
    116 
    117 struct as;
    118 
    119 /*
    120  * Each physical page has a page structure, which is used to maintain
    121  * these pages as a cache.  A page can be found via a hashed lookup
    122  * based on the [vp, offset].  If a page has a [vp, offset] identity,
    123  * then it is entered on a doubly linked circular list off the
    124  * vnode using the vpnext/vpprev pointers.   If the p_free bit
    125  * is on, then the page is also on a doubly linked circular free
    126  * list using next/prev pointers.  If the "p_selock" and "p_iolock"
    127  * are held, then the page is currently being read in (exclusive p_selock)
    128  * or written back (shared p_selock).  In this case, the next/prev pointers
    129  * are used to link the pages together for a consecutive i/o request.  If
    130  * the page is being brought in from its backing store, then other processes
    131  * will wait for the i/o to complete before attaching to the page since it
    132  * will have an "exclusive" lock.
    133  *
    134  * Each page structure has the locks described below along with
    135  * the fields they protect:
    136  *
    137  *	p_selock	This is a per-page shared/exclusive lock that is
    138  *			used to implement the logical shared/exclusive
    139  *			lock for each page.  The "shared" lock is normally
    140  *			used in most cases while the "exclusive" lock is
    141  *			required to destroy or retain exclusive access to
    142  *			a page (e.g., while reading in pages).  The appropriate
    143  *			lock is always held whenever there is any reference
    144  *			to a page structure (e.g., during i/o).
    145  *			(Note that with the addition of the "writer-lock-wanted"
    146  *			semantics (via SE_EWANTED), threads must not acquire
    147  *			multiple reader locks or else a deadly embrace will
    148  *			occur in the following situation: thread 1 obtains a
    149  *			reader lock; next thread 2 fails to get a writer lock
    150  *			but specified SE_EWANTED so it will wait by either
    151  *			blocking (when using page_lock_es) or spinning while
    152  *			retrying (when using page_try_reclaim_lock) until the
    153  *			reader lock is released; then thread 1 attempts to
    154  *			get another reader lock but is denied due to
    155  *			SE_EWANTED being set, and now both threads are in a
    156  *			deadly embrace.)
    157  *
    158  *				p_hash
    159  *				p_vnode
    160  *				p_offset
    161  *
    162  *				p_free
    163  *				p_age
    164  *
    165  *	p_iolock	This is a binary semaphore lock that provides
    166  *			exclusive access to the i/o list links in each
    167  *			page structure.  It is always held while the page
    168  *			is on an i/o list (i.e., involved in i/o).  That is,
    169  *			even though a page may be only `shared' locked
    170  *			while it is doing a write, the following fields may
    171  *			change anyway.  Normally, the page must be
    172  *			`exclusively' locked to change anything in it.
    173  *
    174  *				p_next
    175  *				p_prev
    176  *
    177  * The following fields are protected by the global page_llock:
    178  *
    179  *				p_lckcnt
    180  *				p_cowcnt
    181  *
    182  * The following lists are protected by the global page_freelock:
    183  *
    184  *				page_cachelist
    185  *				page_freelist
    186  *
    187  * The following, for our purposes, are protected by
    188  * the global freemem_lock:
    189  *
    190  *				freemem
    191  *				freemem_wait
    192  *				freemem_cv
    193  *
    194  * The following fields are protected by hat layer lock(s).  When a page
    195  * structure is not mapped and is not associated with a vnode (after a call
    196  * to page_hashout() for example) the p_nrm field may be modified with out
    197  * holding the hat layer lock:
    198  *
    199  *				p_nrm
    200  *				p_mapping
    201  *				p_share
    202  *
    203  * The following field is file system dependent.  How it is used and
    204  * the locking strategies applied are up to the individual file system
    205  * implementation.
    206  *
    207  *				p_fsdata
    208  *
    209  * The page structure is used to represent and control the system's
    210  * physical pages.  There is one instance of the structure for each
    211  * page that is not permanently allocated.  For example, the pages that
    212  * hold the page structures are permanently held by the kernel
    213  * and hence do not need page structures to track them.  The array
    214  * of page structures is allocated early on in the kernel's life and
    215  * is based on the amount of available physical memory.
    216  *
    217  * Each page structure may simultaneously appear on several linked lists.
    218  * The lists are:  hash list, free or in i/o list, and a vnode's page list.
    219  * Each type of list is protected by a different group of mutexes as described
    220  * below:
    221  *
    222  * The hash list is used to quickly find a page when the page's vnode and
    223  * offset within the vnode are known.  Each page that is hashed is
    224  * connected via the `p_hash' field.  The anchor for each hash is in the
    225  * array `page_hash'.  An array of mutexes, `ph_mutex', protects the
    226  * lists anchored by page_hash[].  To either search or modify a given hash
    227  * list, the appropriate mutex in the ph_mutex array must be held.
    228  *
    229  * The free list contains pages that are `free to be given away'.  For
    230  * efficiency reasons, pages on this list are placed in two categories:
    231  * pages that are still associated with a vnode, and pages that are not
    232  * associated with a vnode.  Free pages always have their `p_free' bit set,
    233  * free pages that are still associated with a vnode also have their
    234  * `p_age' bit set.  Pages on the free list are connected via their
    235  * `p_next' and `p_prev' fields.  When a page is involved in some sort
    236  * of i/o, it is not free and these fields may be used to link associated
    237  * pages together.  At the moment, the free list is protected by a
    238  * single mutex `page_freelock'.  The list of free pages still associated
    239  * with a vnode is anchored by `page_cachelist' while other free pages
    240  * are anchored in architecture dependent ways (to handle page coloring etc.).
    241  *
    242  * Pages associated with a given vnode appear on a list anchored in the
    243  * vnode by the `v_pages' field.  They are linked together with
    244  * `p_vpnext' and `p_vpprev'.  The field `p_offset' contains a page's
    245  * offset within the vnode.  The pages on this list are not kept in
    246  * offset order.  These lists, in a manner similar to the hash lists,
    247  * are protected by an array of mutexes called `vph_mutex'.  Before
    248  * searching or modifying this chain the appropriate mutex in the
    249  * vph_mutex[] array must be held.
    250  *
    251  * Again, each of the lists that a page can appear on is protected by a
    252  * mutex.  Before reading or writing any of the fields comprising the
    253  * list, the appropriate lock must be held.  These list locks should only
    254  * be held for very short intervals.
    255  *
    256  * In addition to the list locks, each page structure contains a
    257  * shared/exclusive lock that protects various fields within it.
    258  * To modify one of these fields, the `p_selock' must be exclusively held.
    259  * To read a field with a degree of certainty, the lock must be at least
    260  * held shared.
    261  *
    262  * Removing a page structure from one of the lists requires holding
    263  * the appropriate list lock and the page's p_selock.  A page may be
    264  * prevented from changing identity, being freed, or otherwise modified
    265  * by acquiring p_selock shared.
    266  *
    267  * To avoid deadlocks, a strict locking protocol must be followed.  Basically
    268  * there are two cases:  In the first case, the page structure in question
    269  * is known ahead of time (e.g., when the page is to be added or removed
    270  * from a list).  In the second case, the page structure is not known and
    271  * must be found by searching one of the lists.
    272  *
    273  * When adding or removing a known page to one of the lists, first the
    274  * page must be exclusively locked (since at least one of its fields
    275  * will be modified), second the lock protecting the list must be acquired,
    276  * third the page inserted or deleted, and finally the list lock dropped.
    277  *
    278  * The more interesting case occurs when the particular page structure
    279  * is not known ahead of time.  For example, when a call is made to
    280  * page_lookup(), it is not known if a page with the desired (vnode and
    281  * offset pair) identity exists.  So the appropriate mutex in ph_mutex is
    282  * acquired, the hash list searched, and if the desired page is found
    283  * an attempt is made to lock it.  The attempt to acquire p_selock must
    284  * not block while the hash list lock is held.  A deadlock could occur
    285  * if some other process was trying to remove the page from the list.
    286  * The removing process (following the above protocol) would have exclusively
    287  * locked the page, and be spinning waiting to acquire the lock protecting
    288  * the hash list.  Since the searching process holds the hash list lock
    289  * and is waiting to acquire the page lock, a deadlock occurs.
    290  *
    291  * The proper scheme to follow is: first, lock the appropriate list,
    292  * search the list, and if the desired page is found either use
    293  * page_trylock() (which will not block) or pass the address of the
    294  * list lock to page_lock().  If page_lock() can not acquire the page's
    295  * lock, it will drop the list lock before going to sleep.  page_lock()
    296  * returns a value to indicate if the list lock was dropped allowing the
    297  * calling program to react appropriately (i.e., retry the operation).
    298  *
    299  * If the list lock was dropped before the attempt at locking the page
    300  * was made, checks would have to be made to ensure that the page had
    301  * not changed identity before its lock was obtained.  This is because
    302  * the interval between dropping the list lock and acquiring the page
    303  * lock is indeterminate.
    304  *
    305  * In addition, when both a hash list lock (ph_mutex[]) and a vnode list
    306  * lock (vph_mutex[]) are needed, the hash list lock must be acquired first.
    307  * The routine page_hashin() is a good example of this sequence.
    308  * This sequence is ASSERTed by checking that the vph_mutex[] is not held
    309  * just before each acquisition of one of the mutexes in ph_mutex[].
    310  *
    311  * So, as a quick summary:
    312  *
    313  * 	pse_mutex[]'s protect the p_selock and p_cv fields.
    314  *
    315  * 	p_selock protects the p_free, p_age, p_vnode, p_offset and p_hash.
    316  *
    317  * 	ph_mutex[]'s protect the page_hash[] array and its chains.
    318  *
    319  * 	vph_mutex[]'s protect the v_pages field and the vp page chains.
    320  *
    321  *	First lock the page, then the hash chain, then the vnode chain.  When
    322  *	this is not possible `trylocks' must be used.  Sleeping while holding
    323  *	any of these mutexes (p_selock is not a mutex) is not allowed.
    324  *
    325  *
    326  *	field		reading		writing		    ordering
    327  *	======================================================================
    328  *	p_vnode		p_selock(E,S)	p_selock(E)
    329  *	p_offset
    330  *	p_free
    331  *	p_age
    332  *	=====================================================================
    333  *	p_hash		p_selock(E,S)	p_selock(E) &&	    p_selock, ph_mutex
    334  *					ph_mutex[]
    335  *	=====================================================================
    336  *	p_vpnext	p_selock(E,S)	p_selock(E) &&	    p_selock, vph_mutex
    337  *	p_vpprev			vph_mutex[]
    338  *	=====================================================================
    339  *	When the p_free bit is set:
    340  *
    341  *	p_next		p_selock(E,S)	p_selock(E) &&	    p_selock,
    342  *	p_prev				page_freelock	    page_freelock
    343  *
    344  *	When the p_free bit is not set:
    345  *
    346  *	p_next		p_selock(E,S)	p_selock(E) &&	    p_selock, p_iolock
    347  *	p_prev				p_iolock
    348  *	=====================================================================
    349  *	p_selock	pse_mutex[]	pse_mutex[]	    can't acquire any
    350  *	p_cv						    other mutexes or
    351  *							    sleep while holding
    352  *							    this lock.
    353  *	=====================================================================
    354  *	p_lckcnt	p_selock(E,S)	p_selock(E) &&
    355  *	p_cowcnt			page_llock
    356  *	=====================================================================
    357  *	p_nrm		hat layer lock	hat layer lock
    358  *	p_mapping
    359  *	p_pagenum
    360  *	=====================================================================
    361  *
    362  *	where:
    363  *		E----> exclusive version of p_selock.
    364  *		S----> shared version of p_selock.
    365  *
    366  *
    367  *	Global data structures and variable:
    368  *
    369  *	field		reading		writing		    ordering
    370  *	=====================================================================
    371  *	page_hash[]	ph_mutex[]	ph_mutex[]	    can hold this lock
    372  *							    before acquiring
    373  *							    a vph_mutex or
    374  *							    pse_mutex.
    375  *	=====================================================================
    376  *	vp->v_pages	vph_mutex[]	vph_mutex[]	    can only acquire
    377  *							    a pse_mutex while
    378  *							    holding this lock.
    379  *	=====================================================================
    380  *	page_cachelist	page_freelock	page_freelock	    can't acquire any
    381  *	page_freelist	page_freelock	page_freelock
    382  *	=====================================================================
    383  *	freemem		freemem_lock	freemem_lock	    can't acquire any
    384  *	freemem_wait					    other mutexes while
    385  *	freemem_cv					    holding this mutex.
    386  *	=====================================================================
    387  *
    388  * Page relocation, PG_NORELOC and P_NORELOC.
    389  *
    390  * Pages may be relocated using the page_relocate() interface. Relocation
    391  * involves moving the contents and identity of a page to another, free page.
    392  * To relocate a page, the SE_EXCL lock must be obtained. The way to prevent
    393  * a page from being relocated is to hold the SE_SHARED lock (the SE_EXCL
    394  * lock must not be held indefinitely). If the page is going to be held
    395  * SE_SHARED indefinitely, then the PG_NORELOC hint should be passed
    396  * to page_create_va so that pages that are prevented from being relocated
    397  * can be managed differently by the platform specific layer.
    398  *
    399  * Pages locked in memory using page_pp_lock (p_lckcnt/p_cowcnt != 0)
    400  * are guaranteed to be held in memory, but can still be relocated
    401  * providing the SE_EXCL lock can be obtained.
    402  *
    403  * The P_NORELOC bit in the page_t.p_state field is provided for use by
    404  * the platform specific code in managing pages when the PG_NORELOC
    405  * hint is used.
    406  *
    407  * Memory delete and page locking.
    408  *
    409  * The set of all usable pages is managed using the global page list as
    410  * implemented by the memseg structure defined below. When memory is added
    411  * or deleted this list changes. Additions to this list guarantee that the
    412  * list is never corrupt.  In order to avoid the necessity of an additional
    413  * lock to protect against failed accesses to the memseg being deleted and,
    414  * more importantly, the page_ts, the memseg structure is never freed and the
    415  * page_t virtual address space is remapped to a page (or pages) of
    416  * zeros.  If a page_t is manipulated while it is p_selock'd, or if it is
    417  * locked indirectly via a hash or freelist lock, it is not possible for
    418  * memory delete to collect the page and so that part of the page list is
    419  * prevented from being deleted. If the page is referenced outside of one
    420  * of these locks, it is possible for the page_t being referenced to be
    421  * deleted.  Examples of this are page_t pointers returned by
    422  * page_numtopp_nolock, page_first and page_next.  Providing the page_t
    423  * is re-checked after taking the p_selock (for p_vnode != NULL), the
    424  * remapping to the zero pages will be detected.
    425  *
    426  *
    427  * Page size (p_szc field) and page locking.
    428  *
    429  * p_szc field of free pages is changed by free list manager under freelist
    430  * locks and is of no concern to the rest of VM subsystem.
    431  *
    432  * p_szc changes of allocated anonymous (swapfs) pages can be done only after
    433  * exclusively locking all constituent pages and calling hat_pageunload() on
    434  * each of them. To prevent p_szc changes of non free anonymous (swapfs) large
    435  * pages it's enough to either lock SHARED any of constituent pages or prevent
    436  * hat_pageunload() by holding hat level lock that protects mapping lists (this
    437  * method is for hat code only)
    438  *
    439  * To increase (promote) p_szc of allocated non anonymous file system pages
    440  * one has to first lock exclusively all involved constituent pages and call
    441  * hat_pageunload() on each of them. To prevent p_szc promote it's enough to
    442  * either lock SHARED any of constituent pages that will be needed to make a
    443  * large page or prevent hat_pageunload() by holding hat level lock that
    444  * protects mapping lists (this method is for hat code only).
    445  *
    446  * To decrease (demote) p_szc of an allocated non anonymous file system large
    447  * page one can either use the same method as used for changing p_szc of
    448  * anonymous large pages or if it's not possible to lock all constituent pages
    449  * exclusively a different method can be used. In the second method one only
    450  * has to exclusively lock one of constituent pages but then one has to
    451  * acquire further locks by calling page_szc_lock() and
    452  * hat_page_demote(). hat_page_demote() acquires hat level locks and then
    453  * demotes the page. This mechanism relies on the fact that any code that
    454  * needs to prevent p_szc of a file system large page from changing either
    455  * locks all constituent large pages at least SHARED or locks some pages at
    456  * least SHARED and calls page_szc_lock() or uses hat level page locks.
    457  * Demotion using this method is implemented by page_demote_vp_pages().
    458  * Please see comments in front of page_demote_vp_pages(), hat_page_demote()
    459  * and page_szc_lock() for more details.
    460  *
    461  * Lock order: p_selock, page_szc_lock, ph_mutex/vph_mutex/freelist,
    462  * hat level locks.
    463  */
    464 
    465 typedef struct page {
    466 	u_offset_t	p_offset;	/* offset into vnode for this page */
    467 	struct vnode	*p_vnode;	/* vnode that this page is named by */
    468 	selock_t	p_selock;	/* shared/exclusive lock on the page */
    469 #if defined(_LP64)
    470 	uint_t		p_vpmref;	/* vpm ref - index of the vpmap_t */
    471 #endif
    472 	struct page	*p_hash;	/* hash by [vnode, offset] */
    473 	struct page	*p_vpnext;	/* next page in vnode list */
    474 	struct page	*p_vpprev;	/* prev page in vnode list */
    475 	struct page	*p_next;	/* next page in free/intrans lists */
    476 	struct page	*p_prev;	/* prev page in free/intrans lists */
    477 	ushort_t	p_lckcnt;	/* number of locks on page data */
    478 	ushort_t	p_cowcnt;	/* number of copy-on-write locks */
    479 	kcondvar_t	p_cv;		/* page struct's condition var */
    480 	kcondvar_t	p_io_cv;	/* for iolock */
    481 	uchar_t		p_iolock_state;	/* replaces p_iolock */
    482 	volatile uchar_t p_szc;		/* page size code */
    483 	uchar_t		p_fsdata;	/* file system dependent byte */
    484 	uchar_t		p_state;	/* p_free, p_noreloc */
    485 	uchar_t		p_nrm;		/* non-cache, ref, mod, readonly bits */
    486 #if defined(__sparc)
    487 	uchar_t		p_vcolor;	/* virtual color */
    488 #else
    489 	uchar_t		p_embed;	/* x86 - changes p_mapping & p_index */
    490 #endif
    491 	uchar_t		p_index;	/* MPSS mapping info. Not used on x86 */
    492 	uchar_t		p_toxic;	/* page has an unrecoverable error */
    493 	void		*p_mapping;	/* hat specific translation info */
    494 	pfn_t		p_pagenum;	/* physical page number */
    495 
    496 	uint_t		p_share;	/* number of translations */
    497 #if defined(_LP64)
    498 	uint_t		p_sharepad;	/* pad for growing p_share */
    499 #endif
    500 	uint_t		p_slckcnt;	/* number of softlocks */
    501 #if defined(__sparc)
    502 	uint_t		p_kpmref;	/* number of kpm mapping sharers */
    503 	struct kpme	*p_kpmelist;	/* kpm specific mapping info */
    504 #else
    505 	/* index of entry in p_map when p_embed is set */
    506 	uint_t		p_mlentry;
    507 #endif
    508 #if defined(_LP64)
    509 	kmutex_t	p_ilock;	/* protects p_vpmref */
    510 #else
    511 	uint64_t	p_msresv_2;	/* page allocation debugging */
    512 #endif
    513 } page_t;
    514 
    515 
    516 typedef	page_t	devpage_t;
    517 #define	devpage	page
    518 
    519 #define	PAGE_LOCK_MAXIMUM \
    520 	((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)
    521 
    522 #define	PAGE_SLOCK_MAXIMUM UINT_MAX
    523 
    524 /*
    525  * Page hash table is a power-of-two in size, externally chained
    526  * through the hash field.  PAGE_HASHAVELEN is the average length
    527  * desired for this chain, from which the size of the page_hash
    528  * table is derived at boot time and stored in the kernel variable
    529  * page_hashsz.  In the hash function it is given by PAGE_HASHSZ.
    530  *
    531  * PAGE_HASH_FUNC returns an index into the page_hash[] array.  This
    532  * index is also used to derive the mutex that protects the chain.
    533  *
    534  * In constructing the hash function, first we dispose of unimportant bits
    535  * (page offset from "off" and the low 3 bits of "vp" which are zero for
    536  * struct alignment). Then shift and sum the remaining bits a couple times
    537  * in order to get as many source bits from the two source values into the
    538  * resulting hashed value.  Note that this will perform quickly, since the
    539  * shifting/summing are fast register to register operations with no additional
    540  * memory references).
    541  */
    542 #if defined(_LP64)
    543 
    544 #if NCPU < 4
    545 #define	PH_TABLE_SIZE	128
    546 #define	VP_SHIFT	7
    547 #else
    548 #define	PH_TABLE_SIZE	1024
    549 #define	VP_SHIFT	9
    550 #endif
    551 
    552 #else	/* 32 bits */
    553 
    554 #if NCPU < 4
    555 #define	PH_TABLE_SIZE	16
    556 #define	VP_SHIFT	7
    557 #else
    558 #define	PH_TABLE_SIZE	128
    559 #define	VP_SHIFT	9
    560 #endif
    561 
    562 #endif	/* _LP64 */
    563 
    564 /*
    565  * Shift used by the hash function below so that as many bits as possible
    566  * filter thru PAGE_HASH_FUNC() and PAGE_HASH_MUTEX().  NOTE(review): fixed
    567  * at 7, which is LOG2(PH_TABLE_SIZE) only for PH_TABLE_SIZE == 128.
    568  */
    569 #define	PH_SHIFT_SIZE   (7)
    570 
    571 #define	PAGE_HASHSZ	page_hashsz
    572 #define	PAGE_HASHAVELEN		4
    573 #define	PAGE_HASH_FUNC(vp, off) \
    574 	((((uintptr_t)(off) >> PAGESHIFT) + \
    575 		((uintptr_t)(off) >> (PAGESHIFT + PH_SHIFT_SIZE)) + \
    576 		((uintptr_t)(vp) >> 3) + \
    577 		((uintptr_t)(vp) >> (3 + PH_SHIFT_SIZE)) + \
    578 		((uintptr_t)(vp) >> (3 + 2 * PH_SHIFT_SIZE))) & \
    579 		(PAGE_HASHSZ - 1))
    580 #ifdef _KERNEL
    581 
    582 /*
    583  * The page hash value is re-hashed to an index for the ph_mutex array.
    584  *
    585  * For 64 bit kernels, the mutex array is padded out to prevent false
    586  * sharing of cache sub-blocks (64 bytes) of adjacent mutexes.
    587  *
    588  * For 32 bit kernels, we don't want to waste kernel address space with
    589  * padding, so instead we rely on the hash function to introduce skew of
    590  * adjacent vnode/offset indexes (the left shift part of the hash function).
    591  * Since sizeof (kmutex_t) is 8, we shift an additional 3 to skew to a different
    592  * 64 byte sub-block.
    593  */
    594 typedef struct pad_mutex {
    595 	kmutex_t	pad_mutex;	/* the mutex PAGE_HASH_MUTEX() returns */
    596 #ifdef _LP64
    597 	char		pad_pad[64 - sizeof (kmutex_t)];
    598 #endif
    599 } pad_mutex_t;
    600 extern pad_mutex_t ph_mutex[];	/* indexed via PAGE_HASH_MUTEX() */
    601 
    602 #define	PAGE_HASH_MUTEX(x) \
    603 	(&(ph_mutex[((x) + ((x) >> VP_SHIFT) + ((x) << 3)) & \
    604 		(PH_TABLE_SIZE - 1)].pad_mutex))
    605 
    606 /*
    607  * Flags used while creating pages.
    608  */
    609 #define	PG_EXCL		0x0001
    610 #define	PG_WAIT		0x0002
    611 #define	PG_PHYSCONTIG	0x0004		/* NOT SUPPORTED */
    612 #define	PG_MATCH_COLOR	0x0008		/* SUPPORTED by free list routines */
    613 #define	PG_NORELOC	0x0010		/* Non-relocatable alloc hint. */
    614 					/* Page must be PP_ISNORELOC */
    615 #define	PG_PANIC	0x0020		/* system will panic if alloc fails */
    616 #define	PG_PUSHPAGE	0x0040		/* alloc may use reserve */
    617 #define	PG_LOCAL	0x0080		/* alloc from given lgrp only */
    618 
    619 /*
    620  * When p_selock has the SE_EWANTED bit set, threads waiting for SE_EXCL
    621  * access are given priority over all other waiting threads.
    622  */
    623 #define	SE_EWANTED	0x40000000
    624 #define	PAGE_LOCKED(pp)		(((pp)->p_selock & ~SE_EWANTED) != 0)
    625 #define	PAGE_SHARED(pp)		(((pp)->p_selock & ~SE_EWANTED) > 0)
    626 #define	PAGE_EXCL(pp)		((pp)->p_selock < 0)
    627 #define	PAGE_LOCKED_SE(pp, se)	\
    628 	((se) == SE_EXCL ? PAGE_EXCL(pp) : PAGE_SHARED(pp))
    629 
    630 extern	long page_hashsz;
    631 extern	page_t **page_hash;
    632 
    633 extern	kmutex_t page_llock;		/* page logical lock mutex */
    634 extern	kmutex_t freemem_lock;		/* freemem lock */
    635 
    636 extern	pgcnt_t	total_pages;		/* total pages in the system */
    637 
    638 /*
    639  * Variables controlling locking of physical memory.
    640  */
    641 extern	pgcnt_t	pages_pp_maximum;	/* tuning: lock + claim <= max */
    642 extern	void init_pages_pp_maximum(void);
    643 
    644 struct lgrp;
    645 
    646 /* page_list_{add,sub} flags */
    647 
    648 /* which list */
    649 #define	PG_FREE_LIST	0x0001
    650 #define	PG_CACHE_LIST	0x0002
    651 
    652 /* where on list */
    653 #define	PG_LIST_TAIL	0x0010
    654 #define	PG_LIST_HEAD	0x0020
    655 
    656 /* called from */
    657 #define	PG_LIST_ISINIT	0x1000
    658 
    659 /*
    660  * Page frame operations.
         *
         * Prototypes only: the routines are defined outside this part of the
         * header.  Most identify a page either by page_t pointer or by a
         * (vnode, offset) pair.  se_t arguments presumably name the lock mode to
         * take on the page (cf. SE_EXCL in PAGE_LOCKED_SE above) -- confirm
         * against the definitions.
    661  */
    662 page_t	*page_lookup(struct vnode *, u_offset_t, se_t);
    663 page_t	*page_lookup_create(struct vnode *, u_offset_t, se_t, page_t *,
    664 	spgcnt_t *, int);
    665 page_t	*page_lookup_nowait(struct vnode *, u_offset_t, se_t);
    666 page_t	*page_find(struct vnode *, u_offset_t);
    667 page_t	*page_exists(struct vnode *, u_offset_t);
    668 int	page_exists_physcontig(vnode_t *, u_offset_t, uint_t, page_t *[]);
    669 int	page_exists_forreal(struct vnode *, u_offset_t, uint_t *);
    670 void	page_needfree(spgcnt_t);
    671 page_t	*page_create(struct vnode *, u_offset_t, size_t, uint_t);
    672 int	page_alloc_pages(struct vnode *, struct seg *, caddr_t, page_t **,
    673 	page_t **, uint_t, int, int);
    674 page_t  *page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes,
    675 	uint_t flags, struct seg *seg, caddr_t vaddr, void *arg);
    676 page_t	*page_create_va(struct vnode *, u_offset_t, size_t, uint_t,
    677 	struct seg *, caddr_t);
    678 int	page_create_wait(size_t npages, uint_t flags);
    679 void    page_create_putback(ssize_t npages);
    680 void	page_free(page_t *, int);
    681 void	page_free_at_startup(page_t *);
    682 void	page_free_pages(page_t *);
    683 void	free_vp_pages(struct vnode *, u_offset_t, size_t);
    684 int	page_reclaim(page_t *, kmutex_t *);
    685 int	page_reclaim_pages(page_t *, kmutex_t *, uint_t);
    686 void	page_destroy(page_t *, int);
    687 void	page_destroy_pages(page_t *);
    688 void	page_destroy_free(page_t *);
    689 void	page_rename(page_t *, struct vnode *, u_offset_t);
    690 int	page_hashin(page_t *, struct vnode *, u_offset_t, kmutex_t *);
    691 void	page_hashout(page_t *, kmutex_t *);
    692 int	page_num_hashin(pfn_t, struct vnode *, u_offset_t);
    693 void	page_add(page_t **, page_t *);
    694 void	page_add_common(page_t **, page_t *);
    695 void	page_sub(page_t **, page_t *);
    696 void	page_sub_common(page_t **, page_t *);
    697 page_t	*page_get_freelist(struct vnode *, u_offset_t, struct seg *,
    698 		caddr_t, size_t, uint_t, struct lgrp *);
    699 
    700 page_t	*page_get_cachelist(struct vnode *, u_offset_t, struct seg *,
    701 		caddr_t, uint_t, struct lgrp *);
    702 #if defined(__i386) || defined(__amd64)	/* declared on x86 only */
    703 int	page_chk_freelist(uint_t);
    704 #endif	/* __i386 || __amd64 */
        /*
         * Page list manipulation.  Per the "page_list_{add,sub} flags" comment
         * above, the int argument of page_list_add()/page_list_sub() takes the
         * PG_* flags.
         */
    705 void	page_list_add(page_t *, int);
    706 void	page_boot_demote(page_t *);
    707 void	page_promote_size(page_t *, uint_t);
    708 void	page_list_add_pages(page_t *, int);
    709 void	page_list_sub(page_t *, int);
    710 void	page_list_sub_pages(page_t *, uint_t);
    711 void	page_list_xfer(page_t *, int, int);
    712 void	page_list_break(page_t **, page_t **, size_t);
    713 void	page_list_concat(page_t **, page_t **);
    714 void	page_vpadd(page_t **, page_t *);
    715 void	page_vpsub(page_t **, page_t *);
    716 int	page_lock(page_t *, se_t, kmutex_t *, reclaim_t);  /* se_t: lock mode */
    717 int	page_lock_es(page_t *, se_t, kmutex_t *, reclaim_t, int);
    718 void page_lock_clr_exclwanted(page_t *);
    719 int	page_trylock(page_t *, se_t);
    720 int	page_try_reclaim_lock(page_t *, se_t, int);
    721 int	page_tryupgrade(page_t *);
    722 void	page_downgrade(page_t *);
    723 void	page_unlock(page_t *);
    724 void	page_unlock_nocapture(page_t *);
    725 void	page_lock_delete(page_t *);
    726 int	page_deleted(page_t *);
        /*
         * Physical-memory lock, reservation and claim accounting.
         * NOTE(review): presumably these are the routines bounded by
         * pages_pp_maximum ("lock + claim <= max") -- confirm in the definitions.
         */
    727 int	page_pp_lock(page_t *, int, int);
    728 void	page_pp_unlock(page_t *, int, int);
    729 int	page_resv(pgcnt_t, uint_t);
    730 void	page_unresv(pgcnt_t);
    731 void	page_pp_useclaim(page_t *, page_t *, uint_t);
    732 int	page_addclaim(page_t *);
    733 int	page_subclaim(page_t *);
    734 int	page_addclaim_pages(page_t **);
    735 int	page_subclaim_pages(page_t **);
    736 pfn_t	page_pptonum(page_t *);		/* page_t -> page frame number */
    737 page_t	*page_numtopp(pfn_t, se_t);	/* page frame number -> page_t */
    738 page_t	*page_numtopp_noreclaim(pfn_t, se_t);
    739 page_t	*page_numtopp_nolock(pfn_t);
    740 page_t	*page_numtopp_nowait(pfn_t, se_t);
        /*
         * page_t iteration.  page_first() takes no arguments; it is declared
         * with (void) rather than an empty list so that it is a real prototype
         * and calls are checked by the compiler, like the other zero-argument
         * routines in this header.
         */
    741 page_t	*page_first(void);
    742 page_t	*page_next(page_t *);
    743 page_t	*page_list_next(page_t *);
    744 page_t	*page_nextn(page_t *, ulong_t);
    745 page_t	*page_next_scan_init(void **);
    746 page_t	*page_next_scan_large(page_t *, ulong_t *, void **);
    747 void	prefetch_page_r(void *);
    748 int	ppcopy(page_t *, page_t *);
    749 void	page_relocate_hash(page_t *, page_t *);
    750 void	pagezero(page_t *, uint_t, uint_t);
    751 void	pagescrub(page_t *, uint_t, uint_t);
        /*
         * Page i/o lock.  NOTE(review): presumably backed by p_iolock_state and
         * the PAGE_IO_INUSE / PAGE_IO_WANTED constants below -- confirm.
         */
    752 void	page_io_lock(page_t *);
    753 void	page_io_unlock(page_t *);
    754 int	page_io_trylock(page_t *);
    755 int	page_iolock_assert(page_t *);
    756 void	page_iolock_init(page_t *);
    757 void	page_io_wait(page_t *);
    758 int	page_io_locked(page_t *);
    759 pgcnt_t	page_busy(int);
    760 void	page_lock_init(void);
    761 ulong_t	page_share_cnt(page_t *);
    762 int	page_isshared(page_t *);
    763 int	page_isfree(page_t *);
    764 int	page_isref(page_t *);
    765 int	page_ismod(page_t *);
    766 int	page_release(page_t *, int);	/* status: see PGREL_* below */
    767 void	page_retire_init(void);	/* page retire; see p_toxic flags below */
    768 int	page_retire(uint64_t, uchar_t);
    769 int	page_retire_check(uint64_t, uint64_t *);
    770 int	page_unretire(uint64_t);
    771 int	page_unretire_pp(page_t *, int);	/* int: PR_UNR_* flags */
    772 void	page_tryretire(page_t *);
        /*
         * Takes no arguments; declared with (void) rather than an empty list so
         * that it is a real prototype, consistent with page_retire_init(void)
         * and the pend_count routines.
         */
    773 void	page_retire_mdboot(void);
    774 uint64_t	page_retire_pend_count(void);
    775 void	page_retire_incr_pend_count(void);
    776 void	page_retire_decr_pend_count(void);
        /* Clear / set PR_* bits in p_toxic (see the flag description below). */
    777 void	page_clrtoxic(page_t *, uchar_t);
    778 void	page_settoxic(page_t *, uchar_t);
    779 
    780 int	page_mem_avail(pgcnt_t);
    781 int	page_reclaim_mem(pgcnt_t, pgcnt_t, int);
    782 
    783 void page_set_props(page_t *, uint_t);
    784 void page_clr_all_props(page_t *);
    785 int page_clear_lck_cow(page_t *, int);
    786 
        /* Accessors returning the kmutex_t associated with a vnode or a page. */
    787 kmutex_t	*page_vnode_mutex(struct vnode *);
    788 kmutex_t	*page_se_mutex(struct page *);
    789 kmutex_t	*page_szc_lock(struct page *);
    790 int		page_szc_lock_assert(struct page *pp);
    791 
    792 /*
    793  * Page relocation interfaces. page_relocate() is generic.
    794  * page_get_replacement_page() is provided by the PSM.
    795  * page_free_replacement_page() is generic.
         *
         * The uint_t argument of page_get_replacement_page() presumably takes the
         * PGR_* flags defined further down in this header (PGR_SAMESZC,
         * PGR_NORELOC) -- confirm against the PSM implementation.
    796  */
    797 int group_page_trylock(page_t *, se_t);
    798 void group_page_unlock(page_t *);
    799 int page_relocate(page_t **, page_t **, int, int, spgcnt_t *, struct lgrp *);
    800 int do_page_relocate(page_t **, page_t **, int, spgcnt_t *, struct lgrp *);
    801 page_t *page_get_replacement_page(page_t *, struct lgrp *, uint_t);
    802 void page_free_replacement_page(page_t *);
    803 int page_relocate_cage(page_t **, page_t **);
    804 
        /* Large page demotion. */
    805 int page_try_demote_pages(page_t *);
    806 int page_try_demote_free_pages(page_t *);
    807 void page_demote_free_pages(page_t *);
    808 
    809 struct anon_map;	/* forward declaration for page_mark_migrate() */
    810 
    811 void page_mark_migrate(struct seg *, caddr_t, size_t, struct anon_map *,
    812     ulong_t, vnode_t *, u_offset_t, int);
    813 void page_migrate(struct seg *, caddr_t, page_t **, pgcnt_t);
    814 
    815 /*
    816  * Tell the PIM we are adding physical memory
         * (NOTE(review): "PIM" is not expanded anywhere in this part of the
         * header; presumably the platform-independent VM code -- confirm.)
    817  */
    818 void add_physmem(page_t *, size_t, pfn_t);
    819 void add_physmem_cb(page_t *, pfn_t);	/* callback for page_t part */
    820 
    821 /*
    822  * hw_page_array[] is configured with hardware supported page sizes by
    823  * platform specific code.
         *
         * Each element is one hw_pagesize_t.  Only hp_pgcnt is documented in the
         * original; the other field notes are inferred from the names and should
         * be confirmed against the platform code that fills in the array.
    824  */
    825 typedef struct {
    826 	size_t	hp_size;	/* presumably page size in bytes */
    827 	uint_t	hp_shift;	/* presumably log2(hp_size) */
    828 	uint_t  hp_colors;	/* presumably # of page colors */
    829 	pgcnt_t	hp_pgcnt;	/* base pagesize cnt */
    830 } hw_pagesize_t;
    831 
    832 extern hw_pagesize_t	hw_page_array[];
    833 extern uint_t		page_coloring_shift;
    834 extern uint_t		page_colors_mask;
    835 extern int		cpu_page_colors;
    836 extern uint_t		colorequiv;
    837 extern uchar_t		colorequivszc[];
    838 
        /* Page size queries; the uint_t argument is presumably a size code. */
    839 uint_t	page_num_pagesizes(void);
    840 uint_t	page_num_user_pagesizes(int);
    841 size_t	page_get_pagesize(uint_t);
    842 size_t	page_get_user_pagesize(uint_t n);
    843 pgcnt_t	page_get_pagecnt(uint_t);
    844 uint_t	page_get_shift(uint_t);
    845 int	page_szc(size_t);
    846 int	page_szc_user_filtered(size_t);
    847 
    848 /* page_get_replacement_page() flags */
    849 #define	PGR_SAMESZC	0x1	/* only look for page size same as orig */
    850 #define	PGR_NORELOC	0x2	/* allocate a P_NORELOC page */
    851 
    852 /*
    853  * macros for "masked arithmetic"
    854  * The purpose is to step through all combinations of a set of bits while
    855  * keeping some other bits fixed. Fixed bits need not be contiguous. The
    856  * variable bits need not be contiguous either, or even right aligned. The
    857  * trick is to set all fixed bits to 1, then increment, then restore the
    858  * fixed bits. If incrementing causes a carry from a low bit position, the
    859  * carry propagates thru the fixed bits, because they are temporarily set to 1.
    860  *	v is the value
    861  *	i is the increment
    862  *	eq_mask defines the fixed bits
    863  *	mask limits the size of the result
         *
         * Example with eq_mask = 0x4 and mask = 0xf, starting from v = 0x4:
         * repeated INC_MASKED yields 0x5, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf and then
         * wraps back to 0x4; bit 2 (the fixed bit) keeps its value throughout.
         *
         * v and eq_mask are each expanded twice, so arguments with side effects
         * must not be passed.
    864  */
    865 #define	ADD_MASKED(v, i, eq_mask, mask) \
    866 	(((((v) | (eq_mask)) + (i)) & (mask) & ~(eq_mask)) | ((v) & (eq_mask)))
    867 
    868 /*
    869  * convenience macro which increments by 1
    870  */
    871 #define	INC_MASKED(v, eq_mask, mask) ADD_MASKED(v, 1, eq_mask, mask)
    872 
    873 #endif	/* _KERNEL */
    874 
    875 /*
    876  * Constants used for the p_iolock_state
    877  */
    878 #define	PAGE_IO_INUSE	0x1
    879 #define	PAGE_IO_WANTED	0x2
    880 
    881 /*
    882  * Constants used for page_release status
         *
         * These are enumerated codes, not independent bits: PGREL_MOD (0x3) has
         * the same bit pattern as PGREL_NOTREL | PGREL_CLEAN, so a status is
         * presumably meant to be compared with ==, not tested with & -- confirm
         * against page_release().
    883  */
    884 #define	PGREL_NOTREL    0x1
    885 #define	PGREL_CLEAN	0x2
    886 #define	PGREL_MOD	0x3
    887 
    888 /*
    889  * The p_state field holds what used to be the p_age and p_free
    890  * bits.  These fields are protected by p_selock (see above).
    891  */
    892 #define	P_FREE		0x80		/* Page on free list */
    893 #define	P_NORELOC	0x40		/* Page is non-relocatable */
    894 #define	P_MIGRATE	0x20		/* Migrate page on next touch */
    895 #define	P_SWAP		0x10		/* belongs to vnode that is V_ISSWAP */
    896 #define	P_BOOTPAGES	0x08		/* member of bootpages list */
    897 
        /*
         * Tests.  The single-flag tests yield the masked bit itself (0 or the
         * flag value), not a normalized 0/1.  There is no "aged" bit: PP_ISAGED
         * is true when the page is free and has no vnode.  PP_ISKAS compares
         * p_vnode against the kvp and zvp vnodes, which are declared elsewhere.
         */
    898 #define	PP_ISFREE(pp)		((pp)->p_state & P_FREE)
    899 #define	PP_ISAGED(pp)		(((pp)->p_state & P_FREE) && \
    900 					((pp)->p_vnode == NULL))
    901 #define	PP_ISNORELOC(pp)	((pp)->p_state & P_NORELOC)
    902 #define	PP_ISKAS(pp)		(((pp)->p_vnode == &kvp) || \
    903 					    ((pp)->p_vnode == &zvp))
    904 #define	PP_ISNORELOCKERNEL(pp)	(PP_ISNORELOC(pp) && PP_ISKAS(pp))
    905 #define	PP_ISMIGRATE(pp)	((pp)->p_state & P_MIGRATE)
    906 #define	PP_ISSWAP(pp)		((pp)->p_state & P_SWAP)
    907 #define	PP_ISBOOTPAGES(pp)	((pp)->p_state & P_BOOTPAGES)
    908 
        /*
         * Setters.  PP_SETFREE also clears P_MIGRATE.  PP_SETAGED does not modify
         * p_state; it only ASSERTs that the page already satisfies PP_ISAGED.
         */
    909 #define	PP_SETFREE(pp)		((pp)->p_state = ((pp)->p_state & ~P_MIGRATE) \
    910 				| P_FREE)
    911 #define	PP_SETAGED(pp)		ASSERT(PP_ISAGED(pp))
    912 #define	PP_SETNORELOC(pp)	((pp)->p_state |= P_NORELOC)
    913 #define	PP_SETMIGRATE(pp)	((pp)->p_state |= P_MIGRATE)
    914 #define	PP_SETSWAP(pp)		((pp)->p_state |= P_SWAP)
    915 #define	PP_SETBOOTPAGES(pp)	((pp)->p_state |= P_BOOTPAGES)
    916 
        /*
         * Clearers.  PP_CLRAGED likewise does not modify p_state; it only ASSERTs
         * that the page is not aged.
         */
    917 #define	PP_CLRFREE(pp)		((pp)->p_state &= ~P_FREE)
    918 #define	PP_CLRAGED(pp)		ASSERT(!PP_ISAGED(pp))
    919 #define	PP_CLRNORELOC(pp)	((pp)->p_state &= ~P_NORELOC)
    920 #define	PP_CLRMIGRATE(pp)	((pp)->p_state &= ~P_MIGRATE)
    921 #define	PP_CLRSWAP(pp)		((pp)->p_state &= ~P_SWAP)
    922 #define	PP_CLRBOOTPAGES(pp)	((pp)->p_state &= ~P_BOOTPAGES)
    923 
    924 /*
    925  * Flags for page_t p_toxic, for tracking memory hardware errors.
    926  *
    927  * These flags are OR'ed into p_toxic with page_settoxic() to track which
    928  * error(s) have occurred on a given page. The flags are cleared with
    929  * page_clrtoxic(). Both page_settoxic() and page_clrtoxic() use atomic
    930  * primitives to manipulate the p_toxic field so no other locking is needed.
    931  *
    932  * When an error occurs on a page, p_toxic is set to record the error. The
    933  * error could be a memory error or something else (i.e. a datapath). The Page
    934  * Retire mechanism does not try to determine the exact cause of the error;
    935  * Page Retire rightly leaves that sort of determination to FMA's Diagnostic
    936  * Engine (DE).
    937  *
    938  * Note that, while p_toxic bits can be set without holding any locks, they
    939  * should only be cleared while holding the page exclusively locked.
    940  * There is one exception to this, the PR_CAPTURE bit is protected by a mutex
    941  * within the page capture logic and thus to set or clear the bit, that mutex
    942  * needs to be held.  The page does not need to be locked but the page_clrtoxic
    943  * function must be used as we need an atomic operation.
    944  * Also note that there is what amounts to a hack to prevent recursion with
    945  * large pages such that if we are unlocking a page and the PR_CAPTURE bit is
    946  * set, we will only try to capture the page if the current thread's
    947  * T_CAPTURING flag is not set.  If the flag is set, the unlock will not try
    948  * to capture the page even though the PR_CAPTURE bit is set.
    949  *
    950  * Pages with PR_UE or PR_FMA flags are retired unconditionally, while pages
    951  * with PR_MCE are retired if the system has not retired too many of them.
    952  *
    953  * A page must be exclusively locked to be retired. Pages can be retired if
    954  * they are mapped, modified, or both, as long as they are not marked PR_UE,
    955  * since pages with uncorrectable errors cannot be relocated in memory.
    956  * Once a page has been successfully retired it is zeroed, attached to the
    957  * retired_pages vnode and, finally, PR_RETIRED is set in p_toxic. The other
    958  * p_toxic bits are NOT cleared. Pages are not left locked after retiring them
    959  * to avoid special case code throughout the kernel; rather, page_*lock() will
    960  * fail to lock the page, unless SE_RETIRED is passed as an argument.
    961  *
    962  * While we have your attention, go take a look at the comments at the
    963  * beginning of page_retire.c too.
    964  */
    965 #define	PR_OK		0x00	/* no problem */
    966 #define	PR_MCE		0x01	/* page has seen two or more CEs */
    967 #define	PR_UE		0x02	/* page has an unhandled UE */
    968 #define	PR_UE_SCRUBBED	0x04	/* page has seen a UE but was cleaned */
    969 #define	PR_FMA		0x08	/* A DE wants this page retired */
    970 #define	PR_CAPTURE	0x10	/* Generic page capture flag */
    971 #define	PR_RESV		0x20	/* Reserved for future use */
    972 #define	PR_MSG		0x40	/* message(s) already printed for this page */
    973 #define	PR_RETIRED	0x80	/* This page has been retired */
    974 
        /*
         * Flag groups.  PR_TOXICFLAGS (0xCF) is every bit above except PR_CAPTURE
         * (0x10) and PR_RESV (0x20).
         */
    975 #define	PR_REASONS	(PR_UE | PR_MCE | PR_FMA)
    976 #define	PR_TOXIC	(PR_UE)
    977 #define	PR_ERRMASK	(PR_UE | PR_UE_SCRUBBED | PR_MCE | PR_FMA)
    978 #define	PR_TOXICFLAGS	(0xCF)
    979 
        /*
         * Tests on p_toxic.  PP_RETIRED and PP_TOXIC yield the masked bits, not
         * 0/1.  PP_PR_REQ: some PR_REASONS bit is set and the page is not yet
         * retired.  PP_PR_NOSHARE: of PR_RETIRED, PR_FMA and PR_UE only PR_FMA is
         * set, and the page does not satisfy PP_ISKAS.
         */
    980 #define	PP_RETIRED(pp)	((pp)->p_toxic & PR_RETIRED)
    981 #define	PP_TOXIC(pp)	((pp)->p_toxic & PR_TOXIC)
    982 #define	PP_PR_REQ(pp)	(((pp)->p_toxic & PR_REASONS) && !PP_RETIRED(pp))
    983 #define	PP_PR_NOSHARE(pp)						\
    984 	((((pp)->p_toxic & (PR_RETIRED | PR_FMA | PR_UE)) == PR_FMA) &&	\
    985 	!PP_ISKAS(pp))
    986 
    987 /*
    988  * Flags for page_unretire_pp
         * (presumably passed as the int argument of page_unretire_pp(); the three
         * values are distinct bits)
    989  */
    990 #define	PR_UNR_FREE	0x1
    991 #define	PR_UNR_CLEAN	0x2
    992 #define	PR_UNR_TEMP	0x4
    993 
    994 /*
    995  * kpm large page description.
    996  * The virtual address range of segkpm is divided into chunks of
    997  * kpm_pgsz. Each chunk is controlled by a kpm_page_t. The ushort
    998  * is sufficient for 2^^15 * PAGESIZE, so e.g. the maximum kpm_pgsz
    999  * for 8K is 256M and 2G for 64K pages. It is kept as small as
   1000  * possible to save physical memory space.
   1001  *
   1002  * There are 2 segkpm mapping windows within the virtual address
   1003  * space when we have to prevent VAC alias conflicts. The so called
   1004  * Alias window (mappings are always by PAGESIZE) is controlled by
   1005  * kp_refcnta. The regular window is controlled by kp_refcnt for the
   1006  * normal operation, which is to use the largest available pagesize.
   1007  * When VAC alias conflicts are present within a chunk in the regular
   1008  * window the large page mapping is broken up into smaller PAGESIZE
   1009  * mappings. kp_refcntc is used to control the pages that are involved
   1010  * in the conflict and kp_refcnts holds the active mappings done
   1011  * with the small page size. In non vac conflict mode kp_refcntc is
   1012  * also used as "go" indication (-1) for the trap level tsbmiss
   1013  * handler.
   1014  */
   1015 typedef struct kpm_page {	/* four signed 16-bit counters per chunk */
   1016 	short kp_refcnt;	/* pages mapped large */
   1017 	short kp_refcnta;	/* pages mapped in Alias window */
   1018 	short kp_refcntc;	/* TL-tsbmiss flag; #vac alias conflict pages */
   1019 	short kp_refcnts;	/* vac alias: pages mapped small */
   1020 } kpm_page_t;
   1021 
   1022 /*
   1023  * Note: khl_lock offset changes must be reflected in sfmmu_asm.s
         * (so do not add, remove, resize or reorder members ahead of khl_lock
         * without updating the assembly).
   1024  */
   1025 typedef struct kpm_hlk {
   1026 	kmutex_t khl_mutex;	/* kpm_page mutex */
   1027 	uint_t   khl_lock;	/* trap level tsbmiss handling */
   1028 } kpm_hlk_t;
   1029 
   1030 /*
   1031  * kpm small page description.
   1032  * When kpm_pgsz is equal to PAGESIZE a smaller representation is used
   1033  * to save memory space. Alias range mappings and regular segkpm
   1034  * mappings are done in units of PAGESIZE and can share the mapping
   1035  * information and the mappings are always distinguishable by their
   1036  * virtual address. Other information needed for VAC conflict prevention
   1037  * is already available on a per page basis. There are basically 3 states
   1038  * a kpm_spage can have: not mapped (0), mapped in Alias range or virtually
   1039  * uncached (1) and mapped in the regular segkpm window (-1). The -1 value
   1040  * is also used as "go" indication for the segkpm trap level tsbmiss
   1041  * handler for small pages (value is kept the same as it is used for large
   1042  * mappings).
   1043  */
   1044 typedef struct kpm_spage {	/* states 0, 1, -1: see description above */
   1045 	char	kp_mapped;	/* page mapped small */
   1046 } kpm_spage_t;
   1047 
   1048 /*
   1049  * Note: kshl_lock offset changes must be reflected in sfmmu_asm.s
         * (kshl_lock is currently the only member, i.e. at offset 0).
   1050  */
   1051 typedef struct kpm_shlk {
   1052 	uint_t   kshl_lock;	/* trap level tsbmiss handling */
   1053 } kpm_shlk_t;
   1054 
   1055 /*
   1056  * Each segment of physical memory is described by a memseg struct.
   1057  * Within a segment, memory is considered contiguous. The members
   1058  * can be categorized as follows:
   1059  * . Platform independent:
   1060  *         pages, epages, pages_base, pages_end, next, lnext.
   1061  * . 64bit only but platform independent:
   1062  *         kpm_pbase, kpm_nkpmpgs, kpm_pages, kpm_spages.
   1063  * . Really platform or mmu specific:
   1064  *         pagespa, epagespa, nextpa, kpm_pagespa.
   1065  * . Mixed:
   1066  *         msegflags.
         *
         * NOTE(review): in the definition below, every member after 'next' --
         * including lnext and the kpm_* members that the list above calls
         * platform independent -- is compiled only when __sparc is defined.
         * kpm_pages and kpm_spages are not members themselves; they are macro
         * aliases for the two arms of the mseg_un union.
   1067  */
   1068 struct memseg {
   1069 	page_t *pages, *epages;		/* [from, to] in page array */
   1070 	pfn_t pages_base, pages_end;	/* [from, to] in page numbers */
   1071 	struct memseg *next;		/* next segment in list */
   1072 #if defined(__sparc)
   1073 	struct memseg *lnext;		/* next segment in deleted list */
   1074 	uint64_t pagespa, epagespa;	/* [from, to] page array physical */
   1075 	uint64_t nextpa;		/* physical next pointer */
   1076 	pfn_t	kpm_pbase;		/* start of kpm range */
   1077 	pgcnt_t kpm_nkpmpgs;		/* # of kpm_pgsz pages */
   1078 	union _mseg_un {
   1079 		kpm_page_t  *kpm_lpgs;	/* ptr to kpm_page array */
   1080 		kpm_spage_t *kpm_spgs;	/* ptr to kpm_spage array */
   1081 	} mseg_un;
   1082 	uint64_t kpm_pagespa;		/* physical ptr to kpm (s)pages array */
   1083 	uint_t msegflags;		/* memseg flags */
   1084 #endif /* __sparc */
   1085 };
   1086 
   1087 /*
         * memseg union aliases
         *
         * Defined unconditionally, although the mseg_un member they expand to
         * exists in struct memseg only when __sparc is defined.
         */
   1088 #define	kpm_pages	mseg_un.kpm_lpgs
   1089 #define	kpm_spages	mseg_un.kpm_spgs
   1090 
   1091 /* msegflags */
   1092 #define	MEMSEG_DYNAMIC		0x1	/* DR: memory was added dynamically */
   1093 
   1094 /*
         * memseg support macros
         *
         * MSEG_NPAGES is pages_end - pages_base with no +1.  NOTE(review): that
         * is the full page count only if pages_end is one past the last pfn,
         * whereas the struct comment describes the pair as "[from, to]" --
         * confirm which convention the memseg code uses.
         */
   1095 #define	MSEG_NPAGES(SEG)	((SEG)->pages_end - (SEG)->pages_base)
   1096 
   1097 /* memseg hash */