Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)page_retire.c	1.8	07/03/26 SMI"
     27 
     28 /*
     29  * Page Retire - Big Theory Statement.
     30  *
     31  * This file handles removing sections of faulty memory from use when the
     32  * user land FMA Diagnosis Engine requests that a page be removed or when
     33  * a CE or UE is detected by the hardware.
     34  *
     35  * In the bad old days, the kernel side of Page Retire did a lot of the work
     36  * on its own. Now, with the DE keeping track of errors, the kernel side is
     37  * rather simple minded on most platforms.
     38  *
     39  * Errors are all reflected to the DE, and after digesting the error and
     40  * looking at all previously reported errors, the DE decides what should
     41  * be done about the current error. If the DE wants a particular page to
     42  * be retired, then the kernel page retire code is invoked via an ioctl.
     43  * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
     44  * page retire to handle the error. Since page retire is just a simple
     45  * mechanism it doesn't need to differentiate between the different callers.
     46  *
     47  * The p_toxic field in the page_t is used to indicate which errors have
     48  * occurred and what action has been taken on a given page. Because errors are
     49  * reported without regard to the locked state of a page, no locks are used
     50  * to SET the error bits in p_toxic. However, in order to clear the error
     51  * bits, the page_t must be held exclusively locked.
     52  *
     53  * When page_retire() is called, it must be able to acquire locks, sleep, etc.
     54  * It must not be called from high-level interrupt context.
     55  *
     56  * Depending on how the requested page is being used at the time of the retire
     57  * request (and on the availability of sufficient system resources), the page
     58  * may be retired immediately, or just marked for retirement later. For
     59  * example, locked pages are marked, while free pages are retired. Multiple
     60  * requests may be made to retire the same page, although there is no need
     61  * to: once the p_toxic flags are set, the page will be retired as soon as it
     62  * can be exclusively locked.
     63  *
     64  * The retire mechanism is driven centrally out of page_unlock(). To expedite
     65  * the retirement of pages, further requests for SE_SHARED locks are denied
     66  * as long as a page retirement is pending. In addition, as long as pages are
     67  * pending retirement a background thread runs periodically trying to retire
     68  * those pages. Pages which could not be retired while the system is running
     69  * are scrubbed prior to rebooting to avoid latent errors on the next boot.
     70  *
     71  * UE pages without persistent errors are scrubbed and returned to service.
     72  * Recidivist pages, as well as FMA-directed requests for retirement, result
     73  * in the page being taken out of service. Once the decision is made to take
     74  * a page out of service, the page is cleared, hashed onto the retired_pages
     75  * vnode, marked as retired, and it is unlocked.  No other requesters (except
     76  * for unretire) are allowed to lock retired pages.
     77  *
     78  * The public routines return (sadly) 0 if they worked and a non-zero error
     79  * value if something went wrong. This is done for the ioctl side of the
     80  * world to allow errors to be reflected all the way out to user land. The
     81  * non-zero values are explained in comments atop each function.
     82  */
     83 
     84 /*
     85  * Things to fix:
     86  *
     87  * 	1. Trying to retire non-relocatable kvp pages may result in a
     88  *      quagmire. This is because seg_kmem() no longer keeps its pages locked,
     89  *      and calls page_lookup() in the free path; since kvp pages are modified
     90  *      and don't have a usable backing store, page_retire() can't do anything
     91  *      with them, and we'll keep denying the lock to seg_kmem_free() in a
     92  *      vicious cycle. To prevent that, we don't deny locks to kvp pages, and
     93  *      hence only try to retire a page from page_unlock() in the free path.
     94  *      Since most kernel pages are indefinitely held anyway, and don't
     95  *      participate in I/O, this is of little consequence.
     96  *
     97  *      2. Low memory situations will be interesting. If we don't have
     98  *      enough memory for page_relocate() to succeed, we won't be able to
     99  *      retire dirty pages; nobody will be able to push them out to disk
    100  *      either, since we aggressively deny the page lock. We could change
    101  *      fsflush so it can recognize this situation, grab the lock, and push
    102  *      the page out, where we'll catch it in the free path and retire it.
    103  *
    104  *	3. Beware of places that have code like this in them:
    105  *
    106  *		if (! page_tryupgrade(pp)) {
    107  *			page_unlock(pp);
    108  *			while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) {
    109  *				/ *NOTHING* /
    110  *			}
    111  *		}
    112  *		page_free(pp);
    113  *
    114  *	The problem is that pp can change identity right after the
    115  *	page_unlock() call.  In particular, page_retire() can step in
    116  *	there, change pp's identity, and hash pp onto the retired_vnode.
    117  *
    118  *	Of course, other functions besides page_retire() can have the
    119  *	same effect. A kmem reader can waltz by, set up a mapping to the
    120  *	page, and then unlock the page. Page_free() will then go castors
    121  *	up. So if anybody is doing this, it's already a bug.
    122  *
    123  *      4. mdboot()'s call into page_retire_mdboot() should probably be
    124  *      moved lower. Where the call is made now, we can get into trouble
    125  *      by scrubbing a kernel page that is then accessed later.
    126  */
    127 
    128 #include <sys/types.h>
    129 #include <sys/param.h>
    130 #include <sys/systm.h>
    131 #include <sys/mman.h>
    132 #include <sys/vnode.h>
    133 #include <sys/vfs_opreg.h>
    134 #include <sys/cmn_err.h>
    135 #include <sys/ksynch.h>
    136 #include <sys/thread.h>
    137 #include <sys/disp.h>
    138 #include <sys/ontrap.h>
    139 #include <sys/vmsystm.h>
    140 #include <sys/mem_config.h>
    141 #include <sys/atomic.h>
    142 #include <sys/callb.h>
    143 #include <vm/page.h>
    144 #include <vm/vm_dep.h>
    145 #include <vm/as.h>
    146 #include <vm/hat.h>
    147 
/*
 * vnode for all pages which are retired from the VM system;
 * page_retire_destroy() hashes each retired page onto it.
 */
vnode_t *retired_pages;

/*
 * Callback registered for PC_RETIRE with page_capture_register_callback()
 * in page_retire_init().
 */
static int page_retire_pp_finish(page_t *, void *, uint_t);
    154 
/*
 * Make a list of all of the pages that have been marked for retirement
 * but are not yet retired.  At system shutdown, we will scrub all of the
 * pages in the list in case there are outstanding UEs.  Then, we
 * cross-check this list against the number of pages that are yet to be
 * retired, and if we find inconsistencies, we scan every page_t in the
 * whole system looking for any pages that need to be scrubbed for UEs.
 * The background thread also uses this queue to determine which pages
 * it should keep trying to retire.
 *
 * The queue has a fixed capacity of PR_PENDING_QMAX entries; DEBUG
 * kernels use a smaller capacity than production kernels.
 */
#ifdef	DEBUG
#define	PR_PENDING_QMAX	32
#else	/* DEBUG */
#define	PR_PENDING_QMAX	256
#endif	/* DEBUG */
page_t		*pr_pending_q[PR_PENDING_QMAX];
kmutex_t	pr_q_mutex;	/* initialized in page_retire_init() */
    172 
/*
 * Page retire global kstats
 *
 * The field order must match the positional initializer of the
 * page_retire_kstat instance, and the number of fields determines the
 * ks_ndata handed to kstat_create() in page_retire_init().
 */
struct page_retire_kstat {
	kstat_named_t	pr_retired;	/* bumped in page_retire_destroy() */
	kstat_named_t	pr_requested;
	kstat_named_t	pr_requested_free;
	kstat_named_t	pr_enqueue_fail;
	kstat_named_t	pr_dequeue_fail;
	kstat_named_t	pr_pending;	/* see page_retire_pend_count() */
	kstat_named_t	pr_failed;
	kstat_named_t	pr_failed_kernel;
	kstat_named_t	pr_limit;	/* refreshed on every kstat read */
	kstat_named_t	pr_limit_exceeded; /* see page_retire_limit() */
	kstat_named_t	pr_fma;		/* retired with PR_FMA set */
	kstat_named_t	pr_mce;		/* retired with neither FMA nor UE */
	kstat_named_t	pr_ue;		/* retired with PR_UE (and no FMA) */
	kstat_named_t	pr_ue_cleared_retire;
	kstat_named_t	pr_ue_cleared_free;
	kstat_named_t	pr_ue_persistent;
	kstat_named_t	pr_unretired;
};
    195 
/*
 * The one instance of the page retire kstats.  The initializers are
 * positional, so each entry names the kstat exported for the field at the
 * same position in struct page_retire_kstat.  All values are 64-bit
 * unsigned counters.
 */
static struct page_retire_kstat page_retire_kstat = {
	{ "pages_retired",		KSTAT_DATA_UINT64},
	{ "pages_retire_request",	KSTAT_DATA_UINT64},
	{ "pages_retire_request_free",	KSTAT_DATA_UINT64},
	{ "pages_notenqueued", 		KSTAT_DATA_UINT64},
	{ "pages_notdequeued", 		KSTAT_DATA_UINT64},
	{ "pages_pending", 		KSTAT_DATA_UINT64},
	{ "pages_deferred",		KSTAT_DATA_UINT64},
	{ "pages_deferred_kernel",	KSTAT_DATA_UINT64},
	{ "pages_limit",		KSTAT_DATA_UINT64},
	{ "pages_limit_exceeded",	KSTAT_DATA_UINT64},
	{ "pages_fma",			KSTAT_DATA_UINT64},
	{ "pages_multiple_ce",		KSTAT_DATA_UINT64},
	{ "pages_ue",			KSTAT_DATA_UINT64},
	{ "pages_ue_cleared_retired",	KSTAT_DATA_UINT64},
	{ "pages_ue_cleared_freed",	KSTAT_DATA_UINT64},
	{ "pages_ue_persistent",	KSTAT_DATA_UINT64},
	{ "pages_unretired",		KSTAT_DATA_UINT64},
};
    215 
/* Handle returned by kstat_create() in page_retire_init(); NULL until then. */
static kstat_t  *page_retire_ksp = NULL;

/*
 * Atomically bump a page retire kstat up or down by one.  "stat" is the
 * name of a field of struct page_retire_kstat.
 */
#define	PR_INCR_KSTAT(stat)	\
	atomic_add_64(&(page_retire_kstat.stat.value.ui64), 1)
#define	PR_DECR_KSTAT(stat)	\
	atomic_add_64(&(page_retire_kstat.stat.value.ui64), -1)

/*
 * Shorthand for reading individual kstat values.  These are plain
 * (non-atomic) reads of the 64-bit counters.
 */
#define	PR_KSTAT_RETIRED_CE	(page_retire_kstat.pr_mce.value.ui64)
#define	PR_KSTAT_RETIRED_FMA	(page_retire_kstat.pr_fma.value.ui64)
#define	PR_KSTAT_RETIRED_NOTUE	(PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA)
#define	PR_KSTAT_PENDING	(page_retire_kstat.pr_pending.value.ui64)
#define	PR_KSTAT_EQFAIL		(page_retire_kstat.pr_enqueue_fail.value.ui64)
#define	PR_KSTAT_DQFAIL		(page_retire_kstat.pr_dequeue_fail.value.ui64)
    229 
/*
 * page retire kstats to list all retired pages
 *
 * These are installed as the ks_update and ks_snapshot handlers of the
 * "page_retire_list" raw kstat, with pr_list_kstat_mutex as its ks_lock.
 */
static int pr_list_kstat_update(kstat_t *ksp, int rw);
static int pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
kmutex_t pr_list_kstat_mutex;

/*
 * Limit the number of multiple CE page retires.
 * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
 * basis points, where 100 basis points equals one percent.
 *
 * PAGE_RETIRE_LIMIT converts the basis-point setting into a page count
 * based on physmem.
 */
#define	MCE_BPT	10
uint64_t	max_pages_retired_bps = MCE_BPT;
#define	PAGE_RETIRE_LIMIT	((physmem * max_pages_retired_bps) / 10000)
    245 
/*
 * Control over the verbosity of page retirement.
 *
 * A message is printed only when this value is greater than or equal to
 * the message's level, so:
 *
 * When set to zero (the default), no messages will be printed.
 * When set to one, only the level-one messages will be printed.
 * When set > one, all messages will be printed.
 *
 * A value of one is intended as a platform tunable for processors where
 * FMA's DE does not run (e.g., spitfire). Values > one are intended for
 * debugging only.
 */
int page_retire_messages = 0;

/*
 * Control whether or not we return scrubbed UE pages to service.
 * By default we do not since FMA wants to run its diagnostics first
 * and then ask us to unretire the page if it passes. Non-FMA platforms
 * may set this to zero so we will only retire recidivist pages. It should
 * not be changed by the user.
 */
int page_retire_first_ue = 1;

/*
 * Master enable for page retire. This prevents a CE or UE early in boot
 * from trying to retire a page before page_retire_init() has finished
 * setting things up. This is internal only and is not a tunable!
 * It is set to one as the last step of page_retire_init().
 */
static int pr_enable = 0;

/* Defined outside this file. */
extern struct vnode kvp;
    276 
#ifdef	DEBUG
/*
 * Code-path coverage counters for DEBUG kernels.  Each field is bumped by
 * PR_DEBUG(field) at some point in the page retire code.
 */
struct page_retire_debug {
	int prd_dup1;
	int prd_dup2;
	int prd_qdup;
	int prd_noaction;
	int prd_queued;
	int prd_notqueued;
	int prd_dequeue;
	int prd_top;
	int prd_locked;
	int prd_reloc;
	int prd_relocfail;
	int prd_mod;
	int prd_mod_late;
	int prd_kern;
	int prd_free;
	int prd_noreclaim;
	int prd_hashout;
	int prd_fma;
	int prd_uescrubbed;
	int prd_uenotscrubbed;
	int prd_mce;
	int prd_prlocked;
	int prd_prnotlocked;
	int prd_prretired;
	int prd_ulocked;
	int prd_unotretired;
	int prd_udestroy;
	int prd_uhashout;
	int prd_uunretired;
	int prd_unotlocked;
	int prd_checkhit;
	int prd_checkmiss_pend;
	int prd_checkmiss_noerr;
	int prd_tctop;
	int prd_tclocked;
	int prd_hunt;
	int prd_dohunt;
	int prd_earlyhunt;
	int prd_latehunt;
	int prd_nofreedemote;
	int prd_nodemote;
	int prd_demoted;
} pr_debug;

/* Non-atomic increment of one pr_debug counter. */
#define	PR_DEBUG(foo)	((pr_debug.foo)++)

/*
 * A type histogram. We record the incidence of the various toxic
 * flag combinations along with the interesting page attributes. The
 * goal is to get as many combinations as we can while driving all
 * pr_debug values nonzero (indicating we've exercised all possible
 * code paths across all possible page types). Not all combinations
 * will make sense -- e.g. PRT_MOD|PRT_KERNEL.
 *
 * pr_type offset bit encoding (when examining with a debugger).
 * The values below are those of the PRT_* index bits multiplied by four,
 * which matches byte offsets into pr_types[] if int is four bytes wide:
 *
 *    PRT_NAMED  - 0x4
 *    PRT_KERNEL - 0x8
 *    PRT_FREE   - 0x10
 *    PRT_MOD    - 0x20
 *    PRT_FMA    - 0x0
 *    PRT_MCE    - 0x40
 *    PRT_UE     - 0x80
 */

#define	PRT_NAMED	0x01
#define	PRT_KERNEL	0x02
#define	PRT_FREE	0x04
#define	PRT_MOD		0x08
#define	PRT_FMA		0x00	/* yes, this is not a mistake */
#define	PRT_MCE		0x10
#define	PRT_UE		0x20
#define	PRT_ALL		0x3F

/* One bucket for every possible combination of the PRT_* bits. */
int pr_types[PRT_ALL+1];

/*
 * Classify pp by its attributes and toxic bits and bump the matching
 * pr_types[] bucket.  Note that pp is evaluated several times.
 */
#define	PR_TYPES(pp)	{			\
	int whichtype = 0;			\
	if (pp->p_vnode)			\
		whichtype |= PRT_NAMED;		\
	if (PP_ISKAS(pp))			\
		whichtype |= PRT_KERNEL;	\
	if (PP_ISFREE(pp))			\
		whichtype |= PRT_FREE;		\
	if (hat_ismod(pp))			\
		whichtype |= PRT_MOD;		\
	if (pp->p_toxic & PR_UE)		\
		whichtype |= PRT_UE;		\
	if (pp->p_toxic & PR_MCE)		\
		whichtype |= PRT_MCE;		\
	pr_types[whichtype]++;			\
}

/*
 * Call counters and masks used with MTBF() below.
 * NOTE(review): presumably these drive artificial failure injection in
 * the reclaim, relocate and retire paths -- confirm against the callers.
 */
int recl_calls;
int recl_mtbf = 3;
int reloc_calls;
int reloc_mtbf = 7;
int pr_calls;
int pr_mtbf = 15;

/*
 * Pre-increment the counter v and evaluate to zero whenever every bit of
 * the mask f is set in the new value of v; otherwise evaluate to nonzero.
 */
#define	MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

/* Non-DEBUG kernels compile the instrumentation away; MTBF never fails. */
#define	PR_DEBUG(foo)	/* nothing */
#define	PR_TYPES(foo)	/* nothing */
#define	MTBF(v, f)	(1)

#endif	/* DEBUG */
    388 
    389 /*
    390  * page_retire_done() - completion processing
    391  *
    392  * Used by the page_retire code for common completion processing.
    393  * It keeps track of how many times a given result has happened,
    394  * and writes out an occasional message.
    395  *
    396  * May be called with a NULL pp (PRD_INVALID_PA case).
    397  */
    398 #define	PRD_INVALID_KEY		-1
    399 #define	PRD_SUCCESS		0
    400 #define	PRD_PENDING		1
    401 #define	PRD_FAILED		2
    402 #define	PRD_DUPLICATE		3
    403 #define	PRD_INVALID_PA		4
    404 #define	PRD_LIMIT		5
    405 #define	PRD_UE_SCRUBBED		6
    406 #define	PRD_UNR_SUCCESS		7
    407 #define	PRD_UNR_CANTLOCK	8
    408 #define	PRD_UNR_NOT		9
    409 
    410 typedef struct page_retire_op {
    411 	int	pr_key;		/* one of the PRD_* defines from above */
    412 	int	pr_count;	/* How many times this has happened */
    413 	int	pr_retval;	/* return value */
    414 	int	pr_msglvl;	/* message level - when to print */
    415 	char	*pr_message;	/* Cryptic message for field service */
    416 } page_retire_op_t;
    417 
    418 static page_retire_op_t page_retire_ops[] = {
    419 	/* key			count	retval	msglvl	message */
    420 	{PRD_SUCCESS,		0,	0,	1,
    421 		"Page 0x%08x.%08x removed from service"},
    422 	{PRD_PENDING,		0,	EAGAIN,	2,
    423 		"Page 0x%08x.%08x will be retired on free"},
    424 	{PRD_FAILED,		0,	EAGAIN,	0, NULL},
    425 	{PRD_DUPLICATE,		0,	EIO,	2,
    426 		"Page 0x%08x.%08x already retired or pending"},
    427 	{PRD_INVALID_PA,	0,	EINVAL, 2,
    428 		"PA 0x%08x.%08x is not a relocatable page"},
    429 	{PRD_LIMIT,		0,	0,	1,
    430 		"Page 0x%08x.%08x not retired due to limit exceeded"},
    431 	{PRD_UE_SCRUBBED,	0,	0,	1,
    432 		"Previously reported error on page 0x%08x.%08x cleared"},
    433 	{PRD_UNR_SUCCESS,	0,	0,	1,
    434 		"Page 0x%08x.%08x returned to service"},
    435 	{PRD_UNR_CANTLOCK,	0,	EAGAIN,	2,
    436 		"Page 0x%08x.%08x could not be unretired"},
    437 	{PRD_UNR_NOT,		0,	EIO,	2,
    438 		"Page 0x%08x.%08x is not retired"},
    439 	{PRD_INVALID_KEY,	0,	0,	0, NULL} /* MUST BE LAST! */
    440 };
    441 
    442 /*
    443  * print a message if page_retire_messages is true.
    444  */
    445 #define	PR_MESSAGE(debuglvl, msglvl, msg, pa)				\
    446 {									\
    447 	uint64_t p = (uint64_t)pa;					\
    448 	if (page_retire_messages >= msglvl && msg != NULL) {		\
    449 		cmn_err(debuglvl, msg,					\
    450 		    (uint32_t)(p >> 32), (uint32_t)p);			\
    451 	}								\
    452 }
    453 
/*
 * Atomically OR the given bits into pp->p_toxic.
 *
 * Note that multiple bits may be set in a single settoxic operation.
 * May be called without the page locked.
 */
void
page_settoxic(page_t *pp, uchar_t bits)
{
	atomic_or_8(&pp->p_toxic, bits);
}
    463 
    464 /*
    465  * Note that multiple bits may cleared in a single clrtoxic operation.
    466  * Must be called with the page exclusively locked to prevent races which
    467  * may attempt to retire a page without any toxic bits set.
    468  * Note that the PR_CAPTURE bit can be cleared without the exclusive lock
    469  * being held as there is a separate mutex which protects that bit.
    470  */
    471 void
    472 page_clrtoxic(page_t *pp, uchar_t bits)
    473 {
    474 	ASSERT((bits & PR_CAPTURE) || PAGE_EXCL(pp));
    475 	atomic_and_8(&pp->p_toxic, ~bits);
    476 }
    477 
    478 /*
    479  * Prints any page retire messages to the user, and decides what
    480  * error code is appropriate for the condition reported.
    481  */
    482 static int
    483 page_retire_done(page_t *pp, int code)
    484 {
    485 	page_retire_op_t *prop;
    486 	uint64_t	pa = 0;
    487 	int		i;
    488 
    489 	if (pp != NULL) {
    490 		pa = mmu_ptob((uint64_t)pp->p_pagenum);
    491 	}
    492 
    493 	prop = NULL;
    494 	for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
    495 		if (page_retire_ops[i].pr_key == code) {
    496 			prop = &page_retire_ops[i];
    497 			break;
    498 		}
    499 	}
    500 
    501 #ifdef	DEBUG
    502 	if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
    503 		cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
    504 	}
    505 #endif
    506 
    507 	ASSERT(prop->pr_key == code);
    508 
    509 	prop->pr_count++;
    510 
    511 	PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
    512 	if (pp != NULL) {
    513 		page_settoxic(pp, PR_MSG);
    514 	}
    515 
    516 	return (prop->pr_retval);
    517 }
    518 
    519 /*
    520  * Act like page_destroy(), but instead of freeing the page, hash it onto
    521  * the retired_pages vnode, and mark it retired.
    522  *
    523  * For fun, we try to scrub the page until it's squeaky clean.
    524  * availrmem is adjusted here.
    525  */
    526 static void
    527 page_retire_destroy(page_t *pp)
    528 {
    529 	u_offset_t off = (u_offset_t)((uintptr_t)pp);
    530 
    531 	ASSERT(PAGE_EXCL(pp));
    532 	ASSERT(!PP_ISFREE(pp));
    533 	ASSERT(pp->p_szc == 0);
    534 	ASSERT(!hat_page_is_mapped(pp));
    535 	ASSERT(!pp->p_vnode);
    536 
    537 	page_clr_all_props(pp);
    538 	pagescrub(pp, 0, MMU_PAGESIZE);
    539 
    540 	pp->p_next = NULL;
    541 	pp->p_prev = NULL;
    542 	if (page_hashin(pp, retired_pages, off, NULL) == 0) {
    543 		cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
    544 	}
    545 
    546 	page_settoxic(pp, PR_RETIRED);
    547 	PR_INCR_KSTAT(pr_retired);
    548 
    549 	if (pp->p_toxic & PR_FMA) {
    550 		PR_INCR_KSTAT(pr_fma);
    551 	} else if (pp->p_toxic & PR_UE) {
    552 		PR_INCR_KSTAT(pr_ue);
    553 	} else {
    554 		PR_INCR_KSTAT(pr_mce);
    555 	}
    556 
    557 	mutex_enter(&freemem_lock);
    558 	availrmem--;
    559 	mutex_exit(&freemem_lock);
    560 
    561 	page_unlock(pp);
    562 }
    563 
    564 /*
    565  * Check whether the number of pages which have been retired already exceeds
    566  * the maximum allowable percentage of memory which may be retired.
    567  *
    568  * Returns 1 if the limit has been exceeded.
    569  */
    570 static int
    571 page_retire_limit(void)
    572 {
    573 	if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) {
    574 		PR_INCR_KSTAT(pr_limit_exceeded);
    575 		return (1);
    576 	}
    577 
    578 	return (0);
    579 }
    580 
    581 #define	MSG_DM	"Data Mismatch occurred at PA 0x%08x.%08x"		\
    582 	"[ 0x%x != 0x%x ] while attempting to clear previously "	\
    583 	"reported error; page removed from service"
    584 
    585 #define	MSG_UE	"Uncorrectable Error occurred at PA 0x%08x.%08x while "	\
    586 	"attempting to clear previously reported error; page removed "	\
    587 	"from service"
    588 
    589 /*
    590  * Attempt to clear a UE from a page.
    591  * Returns 1 if the error has been successfully cleared.
    592  */
    593 static int
    594 page_clear_transient_ue(page_t *pp)
    595 {
    596 	caddr_t		kaddr;
    597 	uint8_t		rb, wb;
    598 	uint64_t	pa;
    599 	uint32_t	pa_hi, pa_lo;
    600 	on_trap_data_t	otd;
    601 	int		errors = 0;
    602 	int		i;
    603 
    604 	ASSERT(PAGE_EXCL(pp));
    605 	ASSERT(PP_PR_REQ(pp));
    606 	ASSERT(pp->p_szc == 0);
    607 	ASSERT(!hat_page_is_mapped(pp));
    608 
    609 	/*
    610 	 * Clear the page and attempt to clear the UE.  If we trap
    611 	 * on the next access to the page, we know the UE has recurred.
    612 	 */
    613 	pagescrub(pp, 0, PAGESIZE);
    614 
    615 	/*
    616 	 * Map the page and write a bunch of bit patterns to compare
    617 	 * what we wrote with what we read back.  This isn't a perfect
    618 	 * test but it should be good enough to catch most of the
    619 	 * recurring UEs. If this fails to catch a recurrent UE, we'll
    620 	 * retire the page the next time we see a UE on the page.
    621 	 */
    622 	kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);
    623 
    624 	pa = ptob((uint64_t)page_pptonum(pp));
    625 	pa_hi = (uint32_t)(pa >> 32);
    626 	pa_lo = (uint32_t)pa;
    627 
    628 	/*
    629 	 * Fill the page with each (0x00 - 0xFF] bit pattern, flushing
    630 	 * the cache in between reading and writing.  We do this under
    631 	 * on_trap() protection to avoid recursion.
    632 	 */
    633 	if (on_trap(&otd, OT_DATA_EC)) {
    634 		PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
    635 		errors = 1;
    636 	} else {
    637 		for (wb = 0xff; wb > 0; wb--) {
    638 			for (i = 0; i < PAGESIZE; i++) {
    639 				kaddr[i] = wb;
    640 			}
    641 
    642 			sync_data_memory(kaddr, PAGESIZE);
    643 
    644 			for (i = 0; i < PAGESIZE; i++) {
    645 				rb = kaddr[i];
    646 				if (rb != wb) {
    647 					/*
    648 					 * We had a mismatch without a trap.
    649 					 * Uh-oh. Something is really wrong
    650 					 * with this system.
    651 					 */
    652 					if (page_retire_messages) {
    653 						cmn_err(CE_WARN, MSG_DM,
    654 						    pa_hi, pa_lo, rb, wb);
    655 					}
    656 					errors = 1;
    657 					goto out;	/* double break */
    658 				}
    659 			}
    660 		}
    661 	}
    662 out:
    663 	no_trap();
    664 	ppmapout(kaddr);
    665 
    666 	return (errors ? 0 : 1);
    667 }
    668 
    669 /*
    670  * Try to clear a page_t with a single UE. If the UE was transient, it is
    671  * returned to service, and we return 1. Otherwise we return 0 meaning
    672  * that further processing is required to retire the page.
    673  */
    674 static int
    675 page_retire_transient_ue(page_t *pp)
    676 {
    677 	ASSERT(PAGE_EXCL(pp));
    678 	ASSERT(!hat_page_is_mapped(pp));
    679 
    680 	/*
    681 	 * If this page is a repeat offender, retire him under the
    682 	 * "two strikes and you're out" rule. The caller is responsible
    683 	 * for scrubbing the page to try to clear the error.
    684 	 */
    685 	if (pp->p_toxic & PR_UE_SCRUBBED) {
    686 		PR_INCR_KSTAT(pr_ue_persistent);
    687 		return (0);
    688 	}
    689 
    690 	if (page_clear_transient_ue(pp)) {
    691 		/*
    692 		 * We set the PR_SCRUBBED_UE bit; if we ever see this
    693 		 * page again, we will retire it, no questions asked.
    694 		 */
    695 		page_settoxic(pp, PR_UE_SCRUBBED);
    696 
    697 		if (page_retire_first_ue) {
    698 			PR_INCR_KSTAT(pr_ue_cleared_retire);
    699 			return (0);
    700 		} else {
    701 			PR_INCR_KSTAT(pr_ue_cleared_free);
    702 
    703 			page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG);
    704 
    705 			/* LINTED: CONSTCOND */
    706 			VN_DISPOSE(pp, B_FREE, 1, kcred);
    707 			return (1);
    708 		}
    709 	}
    710 
    711 	PR_INCR_KSTAT(pr_ue_persistent);
    712 	return (0);
    713 }
    714 
    715 /*
    716  * Update the statistics dynamically when our kstat is read.
    717  */
    718 static int
    719 page_retire_kstat_update(kstat_t *ksp, int rw)
    720 {
    721 	struct page_retire_kstat *pr;
    722 
    723 	if (ksp == NULL)
    724 	    return (EINVAL);
    725 
    726 	switch (rw) {
    727 
    728 	case KSTAT_READ:
    729 		pr = (struct page_retire_kstat *)ksp->ks_data;
    730 		ASSERT(pr == &page_retire_kstat);
    731 		pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
    732 		return (0);
    733 
    734 	case KSTAT_WRITE:
    735 		return (EACCES);
    736 
    737 	default:
    738 		return (EINVAL);
    739 	}
    740 	/*NOTREACHED*/
    741 }
    742 
    743 static int
    744 pr_list_kstat_update(kstat_t *ksp, int rw)
    745 {
    746 	uint_t count;
    747 	page_t *pp;
    748 	kmutex_t *vphm;
    749 
    750 	if (rw == KSTAT_WRITE)
    751 		return (EACCES);
    752 
    753 	vphm = page_vnode_mutex(retired_pages);
    754 	mutex_enter(vphm);
    755 	/* Needs to be under a lock so that for loop will work right */
    756 	if (retired_pages->v_pages == NULL) {
    757 		mutex_exit(vphm);
    758 		ksp->ks_ndata = 0;
    759 		ksp->ks_data_size = 0;
    760 		return (0);
    761 	}
    762 
    763 	count = 1;
    764 	for (pp = retired_pages->v_pages->p_vpnext;
    765 	    pp != retired_pages->v_pages; pp = pp->p_vpnext) {
    766 		count++;
    767 	}
    768 	mutex_exit(vphm);
    769 
    770 	ksp->ks_ndata = count;
    771 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
    772 
    773 	return (0);
    774 }
    775 
    776 /*
    777  * all spans will be pagesize and no coalescing will be done with the
    778  * list produced.
    779  */
    780 static int
    781 pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
    782 {
    783 	kmutex_t *vphm;
    784 	page_t *pp;
    785 	struct memunit {
    786 		uint64_t address;
    787 		uint64_t size;
    788 	} *kspmem;
    789 
    790 	if (rw == KSTAT_WRITE)
    791 		return (EACCES);
    792 
    793 	ksp->ks_snaptime = gethrtime();
    794 
    795 	kspmem = (struct memunit *)buf;
    796 
    797 	vphm = page_vnode_mutex(retired_pages);
    798 	mutex_enter(vphm);
    799 	pp = retired_pages->v_pages;
    800 	if (((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) ||
    801 	    (pp == NULL)) {
    802 		mutex_exit(vphm);
    803 		return (0);
    804 	}
    805 	kspmem->address = ptob(pp->p_pagenum);
    806 	kspmem->size = PAGESIZE;
    807 	kspmem++;
    808 	for (pp = pp->p_vpnext; pp != retired_pages->v_pages;
    809 	    pp = pp->p_vpnext, kspmem++) {
    810 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
    811 			break;
    812 		kspmem->address = ptob(pp->p_pagenum);
    813 		kspmem->size = PAGESIZE;
    814 	}
    815 	mutex_exit(vphm);
    816 
    817 	return (0);
    818 }
    819 
    820 /*
    821  * page_retire_pend_count -- helper function for page_capture_thread,
    822  * returns the number of pages pending retirement.
    823  */
    824 uint64_t
    825 page_retire_pend_count(void)
    826 {
    827 	return (PR_KSTAT_PENDING);
    828 }
    829 
    830 void
    831 page_retire_incr_pend_count(void)
    832 {
    833 	PR_INCR_KSTAT(pr_pending);
    834 }
    835 
    836 void
    837 page_retire_decr_pend_count(void)
    838 {
    839 	PR_DECR_KSTAT(pr_pending);
    840 }
    841 
    842 /*
    843  * Initialize the page retire mechanism:
    844  *
    845  *   - Establish the correctable error retire limit.
    846  *   - Initialize locks.
    847  *   - Build the retired_pages vnode.
    848  *   - Set up the kstats.
    849  *   - Fire off the background thread.
    850  *   - Tell page_retire() it's OK to start retiring pages.
    851  */
    852 void
    853 page_retire_init(void)
    854 {
    855 	const fs_operation_def_t retired_vnodeops_template[] = {
    856 		{ NULL, NULL }
    857 	};
    858 	struct vnodeops *vops;
    859 	kstat_t *ksp;
    860 
    861 	const uint_t page_retire_ndata =
    862 	    sizeof (page_retire_kstat) / sizeof (kstat_named_t);
    863 
    864 	ASSERT(page_retire_ksp == NULL);
    865 
    866 	if (max_pages_retired_bps <= 0) {
    867 		max_pages_retired_bps = MCE_BPT;
    868 	}
    869 
    870 	mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL);
    871 
    872 	retired_pages = vn_alloc(KM_SLEEP);
    873 	if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) {
    874 		cmn_err(CE_PANIC,
    875 		    "page_retired_init: can't make retired vnodeops");
    876 	}
    877 	vn_setops(retired_pages, vops);
    878 
    879 	if ((page_retire_ksp = kstat_create("unix", 0, "page_retire",
    880 	    "misc", KSTAT_TYPE_NAMED, page_retire_ndata,
    881 	    KSTAT_FLAG_VIRTUAL)) == NULL) {
    882 		cmn_err(CE_WARN, "kstat_create for page_retire failed");
    883 	} else {
    884 		page_retire_ksp->ks_data = (void *)&page_retire_kstat;
    885 		page_retire_ksp->ks_update = page_retire_kstat_update;
    886 		kstat_install(page_retire_ksp);
    887 	}
    888 
    889 	mutex_init(&pr_list_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
    890 	ksp = kstat_create("unix", 0, "page_retire_list", "misc",
    891 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
    892 	if (ksp != NULL) {
    893 		ksp->ks_update = pr_list_kstat_update;
    894 		ksp->ks_snapshot = pr_list_kstat_snapshot;
    895 		ksp->ks_lock = &pr_list_kstat_mutex;
    896 		kstat_install(ksp);
    897 	}
    898 
    899 	page_capture_register_callback(PC_RETIRE, -1, page_retire_pp_finish);
    900 	pr_enable = 1;
    901 }
    902 
    903 /*
    904  * page_retire_hunt() callback for the retire thread.
    905  */
    906 static void
    907 page_retire_thread_cb(page_t *pp)
    908 {
    909 	PR_DEBUG(prd_tctop);
    910 	if (!PP_ISKAS(pp) && page_trylock(pp, SE_EXCL)) {
    911 		PR_DEBUG(prd_tclocked);
    912 		page_unlock(pp);
    913 	}
    914 }
    915 
    916 /*
    917  * page_retire_hunt() callback for mdboot().
    918  *
    919  * It is necessary to scrub any failing pages prior to reboot in order to
    920  * prevent a latent error trap from occurring on the next boot.
    921  */
    922 void
    923 page_retire_mdboot_cb(page_t *pp)
    924 {
    925 	/*
    926 	 * Don't scrub the kernel, since we might still need it, unless
    927 	 * we have UEs on the page, in which case we have nothing to lose.
    928 	 */
    929 	if (!PP_ISKAS(pp) || PP_TOXIC(pp)) {
    930 		pp->p_selock = -1;	/* pacify ASSERTs */
    931 		PP_CLRFREE(pp);
    932 		pagescrub(pp, 0, PAGESIZE);
    933 		pp->p_selock = 0;
    934 	}
    935 	pp->p_toxic = 0;
    936 }
    937 
    938 
    939 /*
    940  * Callback used by page_trycapture() to finish off retiring a page.
    941  * The page has already been cleaned and we've been given sole access to
    942  * it.
    943  * Always returns 0 to indicate that callback succeded as the callback never
    944  * fails to finish retiring the given page.
    945  */
    946 /*ARGSUSED*/
    947 static int
    948 page_retire_pp_finish(page_t *pp, void *notused, uint_t flags)
    949 {
    950 	int		toxic;
    951 
    952 	ASSERT(PAGE_EXCL(pp));
    953 	ASSERT(pp->p_iolock_state == 0);
    954 	ASSERT(pp->p_szc == 0);
    955 
    956 	toxic = pp->p_toxic;
    957 
    958 	/*
    959 	 * The problem page is locked, demoted, unmapped, not free,
    960 	 * hashed out, and not COW or mlocked (whew!).
    961 	 *
    962 	 * Now we select our ammunition, take it around back, and shoot it.
    963 	 */
    964 	if (toxic & PR_UE) {
    965 ue_error:
    966 		if (page_retire_transient_ue(pp)) {
    967 			PR_DEBUG(prd_uescrubbed);
    968 			(void) page_retire_done(pp, PRD_UE_SCRUBBED);
    969 		} else {
    970 			PR_DEBUG(prd_uenotscrubbed);
    971 			page_retire_destroy(pp);
    972 			(void) page_retire_done(pp, PRD_SUCCESS);
    973 		}
    974 		return (0);
    975 	} else if (toxic & PR_FMA) {
    976 		PR_DEBUG(prd_fma);
    977 		page_retire_destroy(pp);
    978 		(void) page_retire_done(pp, PRD_SUCCESS);
    979 		return (0);
    980 	} else if (toxic & PR_MCE) {
    981 		PR_DEBUG(prd_mce);
    982 		page_retire_destroy(pp);
    983 		(void) page_retire_done(pp, PRD_SUCCESS);
    984 		return (0);
    985 	}
    986 
    987 	/*
    988 	 * When page_retire_first_ue is set to zero and a UE occurs which is
    989 	 * transient, it's possible that we clear some flags set by a second
    990 	 * UE error on the page which occurs while the first is currently being
    991 	 * handled and thus we need to handle the case where none of the above
    992 	 * are set.  In this instance, PR_UE_SCRUBBED should be set and thus
    993 	 * we should execute the UE code above.
    994 	 */
    995 	if (toxic & PR_UE_SCRUBBED) {
    996 		goto ue_error;
    997 	}
    998 
    999 	/*
   1000 	 * It's impossible to get here.
   1001 	 */
   1002 	panic("bad toxic flags 0x%x in page_retire_pp_finish\n", toxic);
   1003 	return (0);
   1004 }
   1005 
   1006 /*
   1007  * page_retire() - the front door in to retire a page.
   1008  *
   1009  * Ideally, page_retire() would instantly retire the requested page.
   1010  * Unfortunately, some pages are locked or otherwise tied up and cannot be
   1011  * retired right away.  We use the page capture logic to deal with this
   1012  * situation as it will continuously try to retire the page in the background
   1013  * if the first attempt fails.  Success is determined by looking to see whether
   1014  * the page has been retired after the page_trycapture() attempt.
   1015  *
   1016  * Returns:
   1017  *
   1018  *   - 0 on success,
   1019  *   - EINVAL when the PA is whacko,
   1020  *   - EIO if the page is already retired or already pending retirement, or
   1021  *   - EAGAIN if the page could not be _immediately_ retired but is pending.
   1022  */
   1023 int
   1024 page_retire(uint64_t pa, uchar_t reason)
   1025 {
   1026 	page_t	*pp;
   1027 
   1028 	ASSERT(reason & PR_REASONS);		/* there must be a reason */
   1029 	ASSERT(!(reason & ~PR_REASONS));	/* but no other bits */
   1030 
   1031 	pp = page_numtopp_nolock(mmu_btop(pa));
   1032 	if (pp == NULL) {
   1033 		PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
   1034 		    " page 0x%08x.%08x; page is not relocatable memory", pa);
   1035 		return (page_retire_done(pp, PRD_INVALID_PA));
   1036 	}
   1037 	if (PP_RETIRED(pp)) {
   1038 		PR_DEBUG(prd_dup1);
   1039 		return (page_retire_done(pp, PRD_DUPLICATE));
   1040 	}
   1041 
   1042 	if ((reason & PR_UE) && !PP_TOXIC(pp)) {
   1043 		PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
   1044 		    " page 0x%08x.%08x", pa);
   1045 	} else if (PP_PR_REQ(pp)) {
   1046 		PR_DEBUG(prd_dup2);
   1047 		return (page_retire_done(pp, PRD_DUPLICATE));
   1048 	} else {
   1049 		PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
   1050 		    " page 0x%08x.%08x", pa);
   1051 	}
   1052 
   1053 	/* Avoid setting toxic bits in the first place */
   1054 	if ((reason & (PR_FMA | PR_MCE)) && !(reason & PR_UE) &&
   1055 	    page_retire_limit()) {
   1056 		return (page_retire_done(pp, PRD_LIMIT));
   1057 	}
   1058 
   1059 	if (MTBF(pr_calls, pr_mtbf)) {
   1060 		page_settoxic(pp, reason);
   1061 		if (page_trycapture(pp, 0, CAPTURE_RETIRE, NULL) == 0) {
   1062 			PR_DEBUG(prd_prlocked);
   1063 		} else {
   1064 			PR_DEBUG(prd_prnotlocked);
   1065 		}
   1066 	} else {
   1067 		PR_DEBUG(prd_prnotlocked);
   1068 	}
   1069 
   1070 	if (PP_RETIRED(pp)) {
   1071 		PR_DEBUG(prd_prretired);
   1072 		return (0);
   1073 	} else {
   1074 		cv_signal(&pc_cv);
   1075 		PR_INCR_KSTAT(pr_failed);
   1076 
   1077 		if (pp->p_toxic & PR_MSG) {
   1078 			return (page_retire_done(pp, PRD_FAILED));
   1079 		} else {
   1080 			return (page_retire_done(pp, PRD_PENDING));
   1081 		}
   1082 	}
   1083 }
   1084 
   1085 /*
 * Take a retired page off the retired-pages vnode and, depending on "flags",
 * clear the toxic flags.  With PR_UNR_FREE the page is locked here and put
 * back on the freelist.  With PR_UNR_CLEAN the caller already holds the
 * SE_EXCL lock, so we simply unretire it and don't do anything else with it.
   1090  *
   1091  * Any unretire messages are printed from this routine.
   1092  *
   1093  * Returns 0 if page pp was unretired; else an error code.
   1094  *
   1095  * If flags is:
   1096  *	PR_UNR_FREE - lock the page, clear the toxic flags and free it
   1097  *	    to the freelist.
   1098  *	PR_UNR_TEMP - lock the page, unretire it, leave the toxic
   1099  *	    bits set as is and return it to the caller.
   1100  *	PR_UNR_CLEAN - page is SE_EXCL locked, unretire it, clear the
   1101  *	    toxic flags and return it to caller as is.
   1102  */
   1103 int
   1104 page_unretire_pp(page_t *pp, int flags)
   1105 {
   1106 	/*
   1107 	 * To be retired, a page has to be hashed onto the retired_pages vnode
   1108 	 * and have PR_RETIRED set in p_toxic.
   1109 	 */
   1110 	if (flags == PR_UNR_CLEAN ||
   1111 	    page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
   1112 		ASSERT(PAGE_EXCL(pp));
   1113 		PR_DEBUG(prd_ulocked);
   1114 		if (!PP_RETIRED(pp)) {
   1115 			PR_DEBUG(prd_unotretired);
   1116 			page_unlock(pp);
   1117 			return (page_retire_done(pp, PRD_UNR_NOT));
   1118 		}
   1119 
   1120 		PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
   1121 		    " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum));
   1122 		if (pp->p_toxic & PR_FMA) {
   1123 			PR_DECR_KSTAT(pr_fma);
   1124 		} else if (pp->p_toxic & PR_UE) {
   1125 			PR_DECR_KSTAT(pr_ue);
   1126 		} else {
   1127 			PR_DECR_KSTAT(pr_mce);
   1128 		}
   1129 
   1130 		if (flags == PR_UNR_TEMP)
   1131 			page_clrtoxic(pp, PR_RETIRED);
   1132 		else
   1133 			page_clrtoxic(pp, PR_TOXICFLAGS);
   1134 
   1135 		if (flags == PR_UNR_FREE) {
   1136 			PR_DEBUG(prd_udestroy);