Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
     27 /*	All Rights Reserved   */
     28 
     29 /*
     30  * Portions of this source code were derived from Berkeley 4.3 BSD
     31  * under license from the Regents of the University of California.
     32  */
     33 
     34 #pragma ident	"@(#)vm_pagelist.c	1.46	07/12/19 SMI"
     35 
     36 /*
     37  * This file contains common functions to access and manage the page lists.
     38  * Many of these routines originated from platform dependent modules
     39  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
     40  * a platform independent manner.
     41  *
     42  * vm/vm_dep.h provides for platform specific support.
     43  */
     44 
     45 #include <sys/types.h>
     46 #include <sys/debug.h>
     47 #include <sys/cmn_err.h>
     48 #include <sys/systm.h>
     49 #include <sys/atomic.h>
     50 #include <sys/sysmacros.h>
     51 #include <vm/as.h>
     52 #include <vm/page.h>
     53 #include <vm/seg_kmem.h>
     54 #include <vm/seg_vn.h>
     55 #include <sys/vmsystm.h>
     56 #include <sys/memnode.h>
     57 #include <vm/vm_dep.h>
     58 #include <sys/lgrp.h>
     59 #include <sys/mem_config.h>
     60 #include <sys/callb.h>
     61 #include <sys/mem_cage.h>
     62 #include <sys/sdt.h>
     63 
extern uint_t	vac_colors;

/* Cap on the alignment requested for vm_cpu_data0 via #pragma align below. */
#define	MAX_PRAGMA_ALIGN	128

/*
 * vm_cpu_data0 for the boot cpu before kmem is initialized.
 * cpu_vm_data_init() points CPU0's cpu_vm_data at this buffer instead of
 * allocating one.  The buffer is aligned to L2CACHE_ALIGN_MAX, or to
 * MAX_PRAGMA_ALIGN when L2CACHE_ALIGN_MAX is larger than that.
 */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
     76 
/*
 * Number of page colors equivalent to the requested color in the page_get
 * routines.  If set, keeps large pages intact longer and keeps MPO
 * allocation from the local mnode in favor of acquiring the 'correct' page
 * color from a demoted large page or from a remote mnode.
 */
uint_t	colorequiv;

/*
 * Color equivalency mask for each page size.
 * Mask is computed based on cpu L2$ way sizes and the colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of the color to ignore
 * (only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];

/*
 * If set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available when page_trylock_contig_pages
 * can be more selective.
 */

int	ptcpthreshold;
    104 
/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Enabled by default via pgcplimitsearch (initialized to 1 below).
 *
 * pgcpfailcnt[] holds one failure count per page size code and is bounded
 * by PGCPFAILMAX (>= 1/2 of installed memory). When the bound is reached,
 * pgcpfailcnt[] is reset to 1/2 of this upper bound (see SETPGCPFAILCNT).
 * This upper bound range guarantees:
 *    - all large page 'slots' will be searched over time
 *    - the minimum (1) large page candidates considered on each pgcp call
 *    - count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;
    118 
    119 #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
    120 #define	SETPGCPFAILCNT(szc)						\
    121 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
    122 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
    123 
#ifdef VM_STATS
/* VM statistics; only present when built with VM_STATS. */
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

/*
 * LPGCREATE is the initial value of pg_lpgcreate_nocage:
 * 0 on sparc, 1 everywhere else.
 */
#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)
    156 
/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;		/* total over all colors */
	pgcnt_t	*pcc_color_free;	/* array indexed by page color */
} pcc_info_t;
    168 
/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g).
 * The storage behind the pointers is laid out by page_ctrs_alloc(), which
 * only assigns entries for region sizes 1 and above.
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
    179 
/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r).
 * The total is summed over all NPC_MUTEX slices of page_ctrs_cands.
 *
 * val must be a modifiable lvalue.  The macro declares its own local `i',
 * so the arguments must not refer to a caller variable of that name.
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c).
 * Asserts that c is a valid color for region size r.
 *
 * val must be a modifiable lvalue.  The macro declares its own local `i',
 * so the arguments must not refer to a caller variable of that name.
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}
    205 
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 *
 * Each ctr_mutex[] entry points at an array of per-mnode mutexes set up by
 * page_ctrs_alloc(); a lock is selected as ctr_mutex[lock index][mnode].
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

/*
 * Lock index for pp: the page number shifted down by the shift of the
 * largest page size, masked with NPC_MUTEX - 1 (the mask presumes NPC_MUTEX
 * is a power of two -- confirm against its definition).
 */
#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

/* All-ones sentinels for "no color" / "no mask". */
#define	INVALID_COLOR 0xffffffff
#define	INVALID_MASK  0xffffffff
    221 
/*
 * Local function prototypes.
 * Only page_trylock_cons() is declared static; the others have external
 * linkage.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);
    238 
    239 /*
    240  * The page_counters array below is used to keep track of free contiguous
    241  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
    242  * This contains an array of counters, the size of the array, a shift value
    243  * used to convert a pagenum into a counter array index or vice versa, as
    244  * well as a cache of the last successful index to be promoted to a larger
    245  * page size.  As an optimization, we keep track of the last successful index
    246  * to be promoted per page color for the given size region, and this is
    247  * allocated dynamically based upon the number of colors for a given
    248  * region size.
    249  *
    250  * Conceptually, the page counters are represented as:
    251  *
    252  *	page_counters[region_size][mnode]
    253  *
    254  *	region_size:	size code of a candidate larger page made up
    255  *			of contiguous free smaller pages.
    256  *
    257  *	page_counters[region_size][mnode].hpm_counters[index]:
    258  *		represents how many (region_size - 1) pages either
    259  *		exist or can be created within the given index range.
    260  *
    261  * Let's look at a sparc example:
    262  *	If we want to create a free 512k page, we look at region_size 2
    263  *	for the mnode we want.  We calculate the index and look at a specific
    264  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
    265  *	this location, it means that 8 64k pages either exist or can be created
    266  *	from 8K pages in order to make a single free 512k page at the given
    267  *	index.  Note that when a region is full, it will contribute to the
    268  *	counts in the region above it.  Thus we will not know what page
    269  *	size the free pages will be which can be promoted to this new free
    270  *	page unless we look at all regions below the current region.
    271  */
    272 
/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 *				(one array of per-color indices per mrange)
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
} hw_page_map_t;
    292 
/*
 * One array of per-mnode hw_page_map_t per region size code, set up by
 * page_ctrs_alloc().
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached values of MNODE_RANGE_CNT(mnode) and MNODE_MAX_MRANGE(mnode),
 * filled in by page_ctrs_sz().
 * MNODE_RANGE_CNT is a function call in x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];
    304 
/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 *
 * PNUM_TO_IDX and IDX_TO_PNUM (below) are the exception: they convert
 * between a page frame number and an hpm_counters index using hpm_base and
 * hpm_shift, and yield values rather than lvalues.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
    339 
/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 *
 * One lock per possible memory node; all of them are initialized at the end
 * of page_ctrs_alloc().
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
    350 
    351 
    352 /*
    353  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
    354  */
    355 void
    356 cpu_vm_data_init(struct cpu *cp)
    357 {
    358 	if (cp == CPU0) {
    359 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
    360 	} else {
    361 		void	*kmptr;
    362 		int	align;
    363 		size_t	sz;
    364 
    365 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
    366 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
    367 		kmptr = kmem_zalloc(sz, KM_SLEEP);
    368 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
    369 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
    370 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
    371 	}
    372 }
    373 
    374 /*
    375  * free cpu_vm_data
    376  */
    377 void
    378 cpu_vm_data_destroy(struct cpu *cp)
    379 {
    380 	if (cp->cpu_seqid && cp->cpu_vm_data) {
    381 		ASSERT(cp != CPU0);
    382 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
    383 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
    384 	}
    385 	cp->cpu_vm_data = NULL;
    386 }
    387 
    388 
    389 /*
    390  * page size to page size code
    391  */
    392 int
    393 page_szc(size_t pagesize)
    394 {
    395 	int	i = 0;
    396 
    397 	while (hw_page_array[i].hp_size) {
    398 		if (pagesize == hw_page_array[i].hp_size)
    399 			return (i);
    400 		i++;
    401 	}
    402 	return (-1);
    403 }
    404 
    405 /*
    406  * page size to page size code with the restriction that it be a supported
    407  * user page size.  If it's not a supported user page size, -1 will be returned.
    408  */
    409 int
    410 page_szc_user_filtered(size_t pagesize)
    411 {
    412 	int szc = page_szc(pagesize);
    413 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
    414 		return (szc);
    415 	}
    416 	return (-1);
    417 }
    418 
    419 /*
    420  * Return how many page sizes are available for the user to use.  This is
    421  * what the hardware supports and not based upon how the OS implements the
    422  * support of different page sizes.
    423  *
    424  * If legacy is non-zero, return the number of pagesizes available to legacy
    425  * applications. The number of legacy page sizes might be less than the
    426  * exported user page sizes. This is to prevent legacy applications that
    427  * use the largest page size returned from getpagesizes(3c) from inadvertantly
    428  * using the 'new' large pagesizes.
    429  */
    430 uint_t
    431 page_num_user_pagesizes(int legacy)
    432 {
    433 	if (legacy)
    434 		return (mmu_legacy_page_sizes);
    435 	return (mmu_exported_page_sizes);
    436 }
    437 
    438 uint_t
    439 page_num_pagesizes(void)
    440 {
    441 	return (mmu_page_sizes);
    442 }
    443 
    444 /*
    445  * returns the count of the number of base pagesize pages associated with szc
    446  */
    447 pgcnt_t
    448 page_get_pagecnt(uint_t szc)
    449 {
    450 	if (szc >= mmu_page_sizes)
    451 		panic("page_get_pagecnt: out of range %d", szc);
    452 	return (hw_page_array[szc].hp_pgcnt);
    453 }
    454 
    455 size_t
    456 page_get_pagesize(uint_t szc)
    457 {
    458 	if (szc >= mmu_page_sizes)
    459 		panic("page_get_pagesize: out of range %d", szc);
    460 	return (hw_page_array[szc].hp_size);
    461 }
    462 
    463 /*
    464  * Return the size of a page based upon the index passed in.  An index of
    465  * zero refers to the smallest page size in the system, and as index increases
    466  * it refers to the next larger supported page size in the system.
    467  * Note that szc and userszc may not be the same due to unsupported szc's on
    468  * some systems.
    469  */
    470 size_t
    471 page_get_user_pagesize(uint_t userszc)
    472 {
    473 	uint_t szc = USERSZC_2_SZC(userszc);
    474 
    475 	if (szc >= mmu_page_sizes)
    476 		panic("page_get_user_pagesize: out of range %d", szc);
    477 	return (hw_page_array[szc].hp_size);
    478 }
    479 
    480 uint_t
    481 page_get_shift(uint_t szc)
    482 {
    483 	if (szc >= mmu_page_sizes)
    484 		panic("page_get_shift: out of range %d", szc);
    485 	return (PAGE_GET_SHIFT(szc));
    486 }
    487 
    488 uint_t
    489 page_get_pagecolors(uint_t szc)
    490 {
    491 	if (szc >= mmu_page_sizes)
    492 		panic("page_get_pagecolors: out of range %d", szc);
    493 	return (PAGE_GET_PAGECOLORS(szc));
    494 }
    495 
    496 /*
    497  * this assigns the desired equivalent color after a split
    498  */
    499 uint_t
    500 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
    501     uint_t ncolor, uint_t ceq_mask)
    502 {
    503 	ASSERT(nszc > szc);
    504 	ASSERT(szc < mmu_page_sizes);
    505 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
    506 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
    507 
    508 	color &= ceq_mask;
    509 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
    510 	return (color | (ncolor & ~ceq_mask));
    511 }
    512 
/*
 * The interleaved_mnodes flag is set when mnodes overlap in
 * the physbase..physmax range, but have disjoint slices.
 * In this case hpm_counters is shared by all mnodes.
 * This flag is set dynamically by the platform; it defaults to 0
 * (no sharing).
 */
int interleaved_mnodes = 0;
    520 
    521 /*
    522  * Called by startup().
    523  * Size up the per page size free list counters based on physmax
    524  * of each node and max_mem_nodes.
    525  *
    526  * If interleaved_mnodes is set we need to find the first mnode that
    527  * exists. hpm_counters for the first mnode will then be shared by
    528  * all other mnodes. If interleaved_mnodes is not set, just set
    529  * first=mnode each time. That means there will be no sharing.
    530  */
    531 size_t
    532 page_ctrs_sz(void)
    533 {
    534 	int	r;		/* region size */
    535 	int	mnode;
    536 	int	firstmn;	/* first mnode that exists */
    537 	int	nranges;
    538 	pfn_t	physbase;
    539 	pfn_t	physmax;
    540 	uint_t	ctrs_sz = 0;
    541 	int 	i;
    542 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
    543 
    544 	/*
    545 	 * We need to determine how many page colors there are for each
    546 	 * page size in order to allocate memory for any color specific
    547 	 * arrays.
    548 	 */
    549 	for (i = 0; i < mmu_page_sizes; i++) {
    550 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
    551 	}
    552 
    553 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
    554 
    555 		pgcnt_t r_pgcnt;
    556 		pfn_t   r_base;
    557 		pgcnt_t r_align;
    558 
    559 		if (mem_node_config[mnode].exists == 0)
    560 			continue;
    561 
    562 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
    563 		nranges = MNODE_RANGE_CNT(mnode);
    564 		mnode_nranges[mnode] = nranges;
    565 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
    566 
    567 		/*
    568 		 * determine size needed for page counter arrays with
    569 		 * base aligned to large page size.
    570 		 */
    571 		for (r = 1; r < mmu_page_sizes; r++) {
    572 			/* add in space for hpm_color_current */
    573 			ctrs_sz += sizeof (size_t) *
    574 			    colors_per_szc[r] * nranges;
    575 
    576 			if (firstmn != mnode)
    577 				continue;
    578 
    579 			/* add in space for hpm_counters */
    580 			r_align = page_get_pagecnt(r);
    581 			r_base = physbase;
    582 			r_base &= ~(r_align - 1);
    583 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
    584 
    585 			/*
    586 			 * Round up to always allocate on pointer sized
    587 			 * boundaries.
    588 			 */
    589 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
    590 			    sizeof (hpmctr_t *));
    591 		}
    592 	}
    593 
    594 	for (r = 1; r < mmu_page_sizes; r++) {
    595 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
    596 	}
    597 
    598 	/* add in space for page_ctrs_cands and pcc_color_free */
    599 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
    600 	    mmu_page_sizes * NPC_MUTEX;
    601 
    602 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
    603 
    604 		if (mem_node_config[mnode].exists == 0)
    605 			continue;
    606 
    607 		nranges = mnode_nranges[mnode];
    608 		ctrs_sz += sizeof (pcc_info_t) * nranges *
    609 		    mmu_page_sizes * NPC_MUTEX;
    610 		for (r = 1; r < mmu_page_sizes; r++) {
    611 			ctrs_sz += sizeof (pgcnt_t) * nranges *
    612 			    colors_per_szc[r] * NPC_MUTEX;
    613 		}
    614 	}
    615 
    616 	/* ctr_mutex */
    617 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
    618 
    619 	/* size for page list counts */
    620 	PLCNT_SZ(ctrs_sz);
    621 
    622 	/*
    623 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
    624 	 * address of the counters to ecache_alignsize boundary for every
    625 	 * memory node.
    626 	 */
    627 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
    628 }
    629 
/*
 * Carve the page counter data structures out of the memory starting at
 * alloc_base and return the first address past what was used.
 *
 * In order, this lays out: the per region size hw_page_map_t arrays
 * (page_counters), the page_ctrs_cands pointer arrays with their pcc_info_t
 * and pcc_color_free storage, the ctr_mutex arrays, the page list counts
 * (PLCNT_INIT), and then for every existing mnode the hpm_color_current
 * arrays and hpm_counters.  Finally all page_ctrs_rwlock[] entries are
 * initialized.
 *
 * Relies on mnode_nranges[], which page_ctrs_sz() fills in, so
 * page_ctrs_sz() is expected to have run first.
 */
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	mrange, nranges;
	int	r;		/* region size */
	int	i;
	int	firstmn;	/* first mnode that exists */
	pfn_t	physbase;
	pfn_t	physmax;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	/* one hw_page_map_t per mnode for every region size above 0 */
	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands and pcc_color_free array */
	for (i = 0; i < NPC_MUTEX; i++) {
		for (r = 1; r < mmu_page_sizes; r++) {

			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				pcc_info_t *pi;

				if (mem_node_config[mnode].exists == 0)
					continue;

				nranges = mnode_nranges[mnode];

				/* one pcc_info_t per mrange of this mnode */
				pi = (pcc_info_t *)alloc_base;
				alloc_base += sizeof (pcc_info_t) * nranges;
				page_ctrs_cands[i][r][mnode] = pi;

				/* followed by a per-color array per mrange */
				for (mrange = 0; mrange < nranges; mrange++) {
					pi->pcc_color_free =
					    (pgcnt_t *)alloc_base;
					alloc_base += sizeof (pgcnt_t) *
					    colors_per_szc[r];
					pi++;
				}
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;
		/* note: shadows the function-scope nranges */
		int	nranges = mnode_nranges[mnode];

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			/* one hpm_color_current array per mrange */
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			/*
			 * Seed the current index of every color with the
			 * index of the pfn that PAGE_NEXT_PFN_FOR_COLOR
			 * yields starting from r_base, or 0 if that index
			 * falls outside the counter array.
			 */
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				/* note: shadows the function-scope mrange */
				int mrange;
				MEM_NODE_ITERATOR_DECL(it);

				MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
				ASSERT(pfnum != (pfn_t)-1);
				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
				    color_mask, color_mask, &it);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				idx = (idx >= r_pgcnt) ? 0 : idx;
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}

			/* hpm_counters may be shared by all mnodes */
			if (firstmn == mnode) {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    (hpmctr_t *)alloc_base;
				alloc_base +=
				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
				    sizeof (hpmctr_t *));
			} else {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    PAGE_COUNTERS_COUNTERS(firstmn, r);
			}

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}
    788 
    789 /*
    790  * Functions to adjust region counters for each size free list.
    791  * Caller is responsible to acquire the ctr_mutex lock if necessary and
    792  * thus can be called during startup without locks.
    793  */
    794 /* ARGSUSED */
    795 void
    796 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
    797 {
    798 	ssize_t		r;	/* region size */
    799 	ssize_t		idx;
    800 	pfn_t		pfnum;
    801 	int		lckidx;
    802 
    803 	ASSERT(mnode == PP_2_MEM_NODE(pp));
    804 	ASSERT(mtype == PP_2_MTYPE(pp));
    805 
    806 	ASSERT(pp->p_szc < mmu_page_sizes);
    807 
    808 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
    809 
    810 	/* no counter update needed for largest page size */
    811 	if (pp->p_szc >= mmu_page_sizes - 1) {
    812 		return;
    813 	}
    814 
    815 	r = pp->p_szc + 1;
    816 	pfnum = pp->p_pagenum;
    817 	lckidx = PP_CTR_LOCK_INDX(pp);
    818 
    819 	/*
    820 	 * Increment the count of free pages for the current
    821 	 * region. Continue looping up in region size incrementing
    822 	 * count if the preceeding region is full.
    823 	 */
    824 	while (r < mmu_page_sizes) {
    825 		idx = PNUM_TO_IDX(mnode, r, pfnum);
    826 
    827 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
    828 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
    829 
    830 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
    831 			break;
    832 		} else {
    833 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
    834 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
    835 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
    836 
    837 			cand->pcc_pages_free++;
    838 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
    839 		}
    840 		r++;
    841 	}
    842 }
    843 
    844 void
    845 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
    846 {
    847 	int		lckidx = PP_CTR_LOCK_INDX(pp);
    848 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
    849 
    850 	mutex_enter(lock);
    851 	page_ctr_add_internal(mnode, mtype, pp, flags);
    852 	mutex_exit(lock);
    853 }
    854 
    855 /*
    856  * Unlocked worker for page_ctr_sub(), which calls this with the ctr_mutex
    857  * entry for pp held: drop pp from the free page accounting of mnode.
    858  */
    859 void
    860 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
    861 {
    862 	pcc_info_t	*cand;
    863 	pfn_t		pfnum;
    864 	ssize_t		idx;
    865 	ssize_t		r;	/* region size */
    866 	int		root_mtype, lckidx;
    867 
    868 	ASSERT(mnode == PP_2_MEM_NODE(pp));
    869 	ASSERT(mtype == PP_2_MTYPE(pp));
    870 	ASSERT(pp->p_szc < mmu_page_sizes);
    871 
    872 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
    873 
    874 	/* the largest page size has no region counters to update */
    875 	if (pp->p_szc >= mmu_page_sizes - 1)
    876 		return;
    877 
    878 	lckidx = PP_CTR_LOCK_INDX(pp);
    879 	pfnum = pp->p_pagenum;
    880 
    881 	/*
    882 	 * Decrement the free count of each enclosing region, smallest
    883 	 * first. Stop at the first region that was not full beforehand;
    884 	 * a region that was full also leaves the page_ctrs_cands totals.
    885 	 */
    886 	for (r = pp->p_szc + 1; r < mmu_page_sizes; r++) {
    887 		idx = PNUM_TO_IDX(mnode, r, pfnum);
    888 
    889 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
    890 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
    891 
    892 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1)
    893 			break;
    894 
    895 		root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
    896 		cand = &page_ctrs_cands[lckidx][r][mnode]
    897 		    [MTYPE_2_MRANGE(mnode, root_mtype)];
    898 
    899 		ASSERT(cand->pcc_pages_free != 0);
    900 		ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
    901 
    902 		cand->pcc_pages_free--;
    903 		cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
    904 	}
    905 }
    906 
    907 /* Locking wrapper: run page_ctr_sub_internal() under pp's ctr_mutex. */
    908 void
    909 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
    910 {
    911 	kmutex_t	*ctr_lock = &ctr_mutex[PP_CTR_LOCK_INDX(pp)][mnode];
    912 
    913 	mutex_enter(ctr_lock);
    914 	page_ctr_sub_internal(mnode, mtype, pp, flags);
    915 	mutex_exit(ctr_lock);
    916 }
    917 
    918 /*
    919  * Adjust page counters following a memory attach, since typically the
    920  * size of the array needs to change, and the PFN to counter index
    921  * mapping needs to change.
    922  *
    923  * It is possible this mnode did not exist at startup. In that case
    924  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
    925  * to change (a theoretical possibility on x86), which means pcc_color_free
    926  * arrays must be extended.
    927  */
    928 uint_t
    929 page_ctrs_adjust(int mnode)
    930 {
    931 	pgcnt_t npgs;
    932 	int	r;		/* region size */
    933 	int	i;
    934 	size_t	pcsz, old_csz;
    935 	hpmctr_t *new_ctr, *old_ctr;
    936 	pfn_t	oldbase, newbase;
    937 	pfn_t	physbase, physmax;
    938 	size_t	old_npgs;
    939 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
    940 	size_t	size_cache[MMU_PAGE_SIZES];
    941 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
    942 	size_t	*old_color_array[MAX_MNODE_MRANGES];
    943 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
    944 	pcc_info_t **cands_cache;
    945 	pcc_info_t *old_pi, *pi;
    946 	pgcnt_t *pgcntp;
    947 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
    948 	int cands_cache_nranges;
    949 	int old_maxmrange, new_maxmrange;
    950 	int rc = 0;
    951 
    952 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
    953 	    MMU_PAGE_SIZES, KM_NOSLEEP);
    954 	if (cands_cache == NULL)
    955 		return (ENOMEM);
    956 
    957 	i = -1;
    958 	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
    959 
    960 	newbase = physbase & ~PC_BASE_ALIGN_MASK;
    961 	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
    962 
    963 	/* prepare to free non-null pointers on the way out */
    964 	cands_cache_nranges = nranges;
    965 	bzero(ctr_cache, sizeof (ctr_cache));
    966 	bzero(color_cache, sizeof (color_cache));
    967 
    968 	/*
    969 	 * We need to determine how many page colors there are for each
    970 	 * page size in order to allocate memory for any color specific
    971 	 * arrays.
    972 	 */
    973 	for (r = 0; r < mmu_page_sizes; r++) {
    974 		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
    975 	}
    976 
    977 	/*
    978 	 * Preallocate all of the new hpm_counters arrays as we can't
    979 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
    980 	 * If we can't allocate all of the arrays, undo our work so far
    981 	 * and return failure.
    982 	 */
    983 	for (r = 1; r < mmu_page_sizes; r++) {
    984 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
    985 		size_cache[r] = pcsz;
    986 		ctr_cache[r] = kmem_zalloc(pcsz *
    987 		    sizeof (hpmctr_t), KM_NOSLEEP);
    988 		if (ctr_cache[r] == NULL) {
    989 			rc = ENOMEM;
    990 			goto cleanup;
    991 		}
    992 	}
    993 
    994 	/*
    995 	 * Preallocate all of the new color current arrays as we can't
    996 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
    997 	 * If we can't allocate all of the arrays, undo our work so far
    998 	 * and return failure.
    999 	 */
   1000 	for (r = 1; r < mmu_page_sizes; r++) {
   1001 		for (mrange = 0; mrange < nranges; mrange++) {
   1002 			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
   1003 			    colors_per_szc[r], KM_NOSLEEP);
   1004 			if (color_cache[r][mrange] == NULL) {
   1005 				rc = ENOMEM;
   1006 				goto cleanup;
   1007 			}
   1008 		}
   1009 	}
   1010 
   1011 	/*
   1012 	 * Preallocate all of the new pcc_info_t arrays as we can't
   1013 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
   1014 	 * If we can't allocate all of the arrays, undo our work so far
   1015 	 * and return failure.
   1016 	 */
   1017 	for (r = 1; r < mmu_page_sizes; r++) {
   1018 		for (i = 0; i < NPC_MUTEX; i++) {
   1019 			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
   1020 			    KM_NOSLEEP);
   1021 			if (pi == NULL) {
   1022 				rc = ENOMEM;
   1023 				goto cleanup;
   1024 			}
   1025 			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
   1026 
   1027 			for (mrange = 0; mrange < nranges; mrange++, pi++) {
   1028 				pgcntp = kmem_zalloc(colors_per_szc[r] *
   1029 				    sizeof (pgcnt_t), KM_NOSLEEP);
   1030 				if (pgcntp == NULL) {
   1031 					rc = ENOMEM;
   1032 					goto cleanup;
   1033 				}
   1034 				pi->pcc_color_free = pgcntp;
   1035 			}
   1036 		}
   1037 	}
   1038 
   1039 	/*
   1040 	 * Grab the write lock to prevent others from walking these arrays
   1041 	 * while we are modifying them.
   1042 	 */
   1043 	PAGE_CTRS_WRITE_LOCK(mnode);
   1044 
   1045 	old_nranges = mnode_nranges[mnode];
   1046 	cands_cache_nranges = old_nranges;
   1047 	mnode_nranges[mnode] = nranges;
   1048 	old_maxmrange = mnode_maxmrange[mnode];
   1049 	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
   1050 	new_maxmrange = mnode_maxmrange[mnode];
   1051 
   1052 	for (r = 1; r < mmu_page_sizes; r++) {
   1053 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
   1054 		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
   1055 		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
   1056 		oldbase = PAGE_COUNTERS_BASE(mnode, r);
   1057 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
   1058 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
   1059 			old_color_array[mrange] =
   1060 			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
   1061 			    r, mrange);
   1062 		}
   1063 
   1064 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
   1065 		new_ctr = ctr_cache[r];
   1066 		ctr_cache[r] = NULL;
   1067 		if (old_ctr != NULL &&
   1068 		    (oldbase + old_npgs > newbase) &&
   1069 		    (newbase + npgs > oldbase)) {
   1070 			/*
   1071 			 * Map the intersection of the old and new
   1072 			 * counters into the new array.
   1073 			 */
   1074 			size_t offset;
   1075 			if (newbase > oldbase) {
   1076 				offset = (newbase - oldbase) >>
   1077 				    PAGE_COUNTERS_SHIFT(mnode, r);
   1078 				bcopy(old_ctr + offset, new_ctr,
   1079 				    MIN(pcsz, (old_csz - offset)) *
   1080 				    sizeof (hpmctr_t));
   1081 			} else {
   1082 				offset = (oldbase - newbase) >>
   1083 				    PAGE_COUNTERS_SHIFT(mnode, r);
   1084 				bcopy(old_ctr, new_ctr + offset,
   1085 				    MIN(pcsz - offset, old_csz) *
   1086 				    sizeof (hpmctr_t));
   1087 			}
   1088 		}
   1089 
   1090 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
   1091 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
   1092 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
   1093 
   1094 		/* update shared hpm_counters in other mnodes */
   1095 		if (interleaved_mnodes) {
   1096 			for (i = 0; i < max_mem_nodes; i++) {
   1097 				if (i == mnode)
   1098 					continue;
   1099 				if (mem_node_config[i].exists == 0)
   1100 					continue;
   1101 				ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr);
   1102 				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
   1103 				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
   1104 				PAGE_COUNTERS_BASE(i, r) = newbase;
   1105 			}
   1106 		}
   1107 
   1108 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
   1109 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
   1110 			    color_cache[r][mrange];
   1111 			color_cache[r][mrange] = NULL;
   1112 		}
   1113 		/*
   1114 		 * for now, just reset on these events as it's probably
   1115 		 * not worthwhile to try and optimize this.
   1116 		 */
   1117 		for (i = 0; i < colors_per_szc[r]; i++) {
   1118 			uint_t color_mask = colors_per_szc</