/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident "@(#)vm_pagelist.c 1.46 07/12/19 SMI"

/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
43 */ 44 45 #include <sys/types.h> 46 #include <sys/debug.h> 47 #include <sys/cmn_err.h> 48 #include <sys/systm.h> 49 #include <sys/atomic.h> 50 #include <sys/sysmacros.h> 51 #include <vm/as.h> 52 #include <vm/page.h> 53 #include <vm/seg_kmem.h> 54 #include <vm/seg_vn.h> 55 #include <sys/vmsystm.h> 56 #include <sys/memnode.h> 57 #include <vm/vm_dep.h> 58 #include <sys/lgrp.h> 59 #include <sys/mem_config.h> 60 #include <sys/callb.h> 61 #include <sys/mem_cage.h> 62 #include <sys/sdt.h> 63 64 extern uint_t vac_colors; 65 66 #define MAX_PRAGMA_ALIGN 128 67 68 /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 69 70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 71 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 72 #else 73 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 74 #endif 75 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 76 77 /* 78 * number of page colors equivalent to reqested color in page_get routines. 79 * If set, keeps large pages intact longer and keeps MPO allocation 80 * from the local mnode in favor of acquiring the 'correct' page color from 81 * a demoted large page or from a remote mnode. 82 */ 83 uint_t colorequiv; 84 85 /* 86 * color equivalency mask for each page size. 87 * Mask is computed based on cpu L2$ way sizes and colorequiv global. 88 * High 4 bits determine the number of high order bits of the color to ignore. 89 * Low 4 bits determines number of low order bits of color to ignore (it's only 90 * relevant for hashed index based page coloring). 91 */ 92 uchar_t colorequivszc[MMU_PAGE_SIZES]; 93 94 /* 95 * if set, specifies the percentage of large pages that are free from within 96 * a large page region before attempting to lock those pages for 97 * page_get_contig_pages processing. 98 * 99 * Should be turned on when kpr is available when page_trylock_contig_pages 100 * can be more selective. 101 */ 102 103 int ptcpthreshold; 104 105 /* 106 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 
107 * Enabled by default via pgcplimitsearch. 108 * 109 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 110 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 111 * bound. This upper bound range guarantees: 112 * - all large page 'slots' will be searched over time 113 * - the minimum (1) large page candidates considered on each pgcp call 114 * - count doesn't wrap around to 0 115 */ 116 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES]; 117 int pgcplimitsearch = 1; 118 119 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1)) 120 #define SETPGCPFAILCNT(szc) \ 121 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \ 122 pgcpfailcnt[szc] = PGCPFAILMAX / 2; 123 124 #ifdef VM_STATS 125 struct vmm_vmstats_str vmm_vmstats; 126 127 #endif /* VM_STATS */ 128 129 #if defined(__sparc) 130 #define LPGCREATE 0 131 #else 132 /* enable page_get_contig_pages */ 133 #define LPGCREATE 1 134 #endif 135 136 int pg_contig_disable; 137 int pg_lpgcreate_nocage = LPGCREATE; 138 139 /* 140 * page_freelist_split pfn flag to signify no hi pfn requirement. 141 */ 142 #define PFNNULL 0 143 144 /* Flags involved in promotion and demotion routines */ 145 #define PC_FREE 0x1 /* put page on freelist */ 146 #define PC_ALLOC 0x2 /* return page for allocation */ 147 148 /* 149 * Flag for page_demote to be used with PC_FREE to denote that we don't care 150 * what the color is as the color parameter to the function is ignored. 151 */ 152 #define PC_NO_COLOR (-1) 153 154 /* mtype value for page_promote to use when mtype does not matter */ 155 #define PC_MTYPE_ANY (-1) 156 157 /* 158 * page counters candidates info 159 * See page_ctrs_cands comment below for more details. 
160 * fields are as follows: 161 * pcc_pages_free: # pages which freelist coalesce can create 162 * pcc_color_free: pointer to page free counts per color 163 */ 164 typedef struct pcc_info { 165 pgcnt_t pcc_pages_free; 166 pgcnt_t *pcc_color_free; 167 } pcc_info_t; 168 169 /* 170 * On big machines it can take a long time to check page_counters 171 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically 172 * updated sum of all elements of the corresponding page_counters arrays. 173 * page_freelist_coalesce() searches page_counters only if an appropriate 174 * element of page_ctrs_cands array is greater than 0. 175 * 176 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g) 177 */ 178 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; 179 180 /* 181 * Return in val the total number of free pages which can be created 182 * for the given mnode (m), mrange (g), and region size (r) 183 */ 184 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \ 185 int i; \ 186 val = 0; \ 187 for (i = 0; i < NPC_MUTEX; i++) { \ 188 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \ 189 } \ 190 } 191 192 /* 193 * Return in val the total number of free pages which can be created 194 * for the given mnode (m), mrange (g), region size (r), and color (c) 195 */ 196 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \ 197 int i; \ 198 val = 0; \ 199 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \ 200 for (i = 0; i < NPC_MUTEX; i++) { \ 201 val += \ 202 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \ 203 } \ 204 } 205 206 /* 207 * We can only allow a single thread to update a counter within the physical 208 * range of the largest supported page size. That is the finest granularity 209 * possible since the counter values are dependent on each other 210 * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the 211 * ctr_mutex lock index for a particular physical range. 
212 */ 213 static kmutex_t *ctr_mutex[NPC_MUTEX]; 214 215 #define PP_CTR_LOCK_INDX(pp) \ 216 (((pp)->p_pagenum >> \ 217 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 218 219 #define INVALID_COLOR 0xffffffff 220 #define INVALID_MASK 0xffffffff 221 222 /* 223 * Local functions prototypes. 224 */ 225 226 void page_ctr_add(int, int, page_t *, int); 227 void page_ctr_add_internal(int, int, page_t *, int); 228 void page_ctr_sub(int, int, page_t *, int); 229 void page_ctr_sub_internal(int, int, page_t *, int); 230 void page_freelist_lock(int); 231 void page_freelist_unlock(int); 232 page_t *page_promote(int, pfn_t, uchar_t, int, int); 233 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int); 234 page_t *page_freelist_split(uchar_t, 235 uint_t, int, int, pfn_t, page_list_walker_t *); 236 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 237 static int page_trylock_cons(page_t *pp, se_t se); 238 239 /* 240 * The page_counters array below is used to keep track of free contiguous 241 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 242 * This contains an array of counters, the size of the array, a shift value 243 * used to convert a pagenum into a counter array index or vice versa, as 244 * well as a cache of the last successful index to be promoted to a larger 245 * page size. As an optimization, we keep track of the last successful index 246 * to be promoted per page color for the given size region, and this is 247 * allocated dynamically based upon the number of colors for a given 248 * region size. 249 * 250 * Conceptually, the page counters are represented as: 251 * 252 * page_counters[region_size][mnode] 253 * 254 * region_size: size code of a candidate larger page made up 255 * of contiguous free smaller pages. 256 * 257 * page_counters[region_size][mnode].hpm_counters[index]: 258 * represents how many (region_size - 1) pages either 259 * exist or can be created within the given index range. 
260 * 261 * Let's look at a sparc example: 262 * If we want to create a free 512k page, we look at region_size 2 263 * for the mnode we want. We calculate the index and look at a specific 264 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 265 * this location, it means that 8 64k pages either exist or can be created 266 * from 8K pages in order to make a single free 512k page at the given 267 * index. Note that when a region is full, it will contribute to the 268 * counts in the region above it. Thus we will not know what page 269 * size the free pages will be which can be promoted to this new free 270 * page unless we look at all regions below the current region. 271 */ 272 273 /* 274 * Note: hpmctr_t is defined in platform vm_dep.h 275 * hw_page_map_t contains all the information needed for the page_counters 276 * logic. The fields are as follows: 277 * 278 * hpm_counters: dynamically allocated array to hold counter data 279 * hpm_entries: entries in hpm_counters 280 * hpm_shift: shift for pnum/array index conv 281 * hpm_base: PFN mapped to counter index 0 282 * hpm_color_current: last index in counter array for this color at 283 * which we successfully created a large page 284 */ 285 typedef struct hw_page_map { 286 hpmctr_t *hpm_counters; 287 size_t hpm_entries; 288 int hpm_shift; 289 pfn_t hpm_base; 290 size_t *hpm_color_current[MAX_MNODE_MRANGES]; 291 } hw_page_map_t; 292 293 /* 294 * Element zero is not used, but is allocated for convenience. 295 */ 296 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 297 298 /* 299 * Cached value of MNODE_RANGE_CNT(mnode). 300 * This is a function call in x86. 301 */ 302 static int mnode_nranges[MAX_MEM_NODES]; 303 static int mnode_maxmrange[MAX_MEM_NODES]; 304 305 /* 306 * The following macros are convenient ways to get access to the individual 307 * elements of the page_counters arrays. They can be used on both 308 * the left side and right side of equations. 
309 */ 310 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 311 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 312 313 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 314 (page_counters[(rg_szc)][(mnode)].hpm_counters) 315 316 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 317 (page_counters[(rg_szc)][(mnode)].hpm_shift) 318 319 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 320 (page_counters[(rg_szc)][(mnode)].hpm_entries) 321 322 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 323 (page_counters[(rg_szc)][(mnode)].hpm_base) 324 325 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \ 326 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)]) 327 328 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \ 329 (page_counters[(rg_szc)][(mnode)]. \ 330 hpm_color_current[(mrange)][(color)]) 331 332 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 333 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 334 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 335 336 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 337 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 338 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 339 340 /* 341 * Protects the hpm_counters and hpm_color_current memory from changing while 342 * looking at page counters information. 343 * Grab the write lock to modify what these fields point at. 344 * Grab the read lock to prevent any pointers from changing. 345 * The write lock can not be held during memory allocation due to a possible 346 * recursion deadlock with trying to grab the read lock while the 347 * write lock is already held. 348 */ 349 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 350 351 352 /* 353 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 354 */ 355 void 356 cpu_vm_data_init(struct cpu *cp) 357 { 358 if (cp == CPU0) { 359 cp->cpu_vm_data = (void *)&vm_cpu_data0; 360 } else { 361 void *kmptr; 362 int align; 363 size_t sz; 364 365 align = (L2CACHE_ALIGN) ? 
L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 366 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 367 kmptr = kmem_zalloc(sz, KM_SLEEP); 368 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 369 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 370 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 371 } 372 } 373 374 /* 375 * free cpu_vm_data 376 */ 377 void 378 cpu_vm_data_destroy(struct cpu *cp) 379 { 380 if (cp->cpu_seqid && cp->cpu_vm_data) { 381 ASSERT(cp != CPU0); 382 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 383 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 384 } 385 cp->cpu_vm_data = NULL; 386 } 387 388 389 /* 390 * page size to page size code 391 */ 392 int 393 page_szc(size_t pagesize) 394 { 395 int i = 0; 396 397 while (hw_page_array[i].hp_size) { 398 if (pagesize == hw_page_array[i].hp_size) 399 return (i); 400 i++; 401 } 402 return (-1); 403 } 404 405 /* 406 * page size to page size code with the restriction that it be a supported 407 * user page size. If it's not a supported user page size, -1 will be returned. 408 */ 409 int 410 page_szc_user_filtered(size_t pagesize) 411 { 412 int szc = page_szc(pagesize); 413 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 414 return (szc); 415 } 416 return (-1); 417 } 418 419 /* 420 * Return how many page sizes are available for the user to use. This is 421 * what the hardware supports and not based upon how the OS implements the 422 * support of different page sizes. 423 * 424 * If legacy is non-zero, return the number of pagesizes available to legacy 425 * applications. The number of legacy page sizes might be less than the 426 * exported user page sizes. This is to prevent legacy applications that 427 * use the largest page size returned from getpagesizes(3c) from inadvertantly 428 * using the 'new' large pagesizes. 
429 */ 430 uint_t 431 page_num_user_pagesizes(int legacy) 432 { 433 if (legacy) 434 return (mmu_legacy_page_sizes); 435 return (mmu_exported_page_sizes); 436 } 437 438 uint_t 439 page_num_pagesizes(void) 440 { 441 return (mmu_page_sizes); 442 } 443 444 /* 445 * returns the count of the number of base pagesize pages associated with szc 446 */ 447 pgcnt_t 448 page_get_pagecnt(uint_t szc) 449 { 450 if (szc >= mmu_page_sizes) 451 panic("page_get_pagecnt: out of range %d", szc); 452 return (hw_page_array[szc].hp_pgcnt); 453 } 454 455 size_t 456 page_get_pagesize(uint_t szc) 457 { 458 if (szc >= mmu_page_sizes) 459 panic("page_get_pagesize: out of range %d", szc); 460 return (hw_page_array[szc].hp_size); 461 } 462 463 /* 464 * Return the size of a page based upon the index passed in. An index of 465 * zero refers to the smallest page size in the system, and as index increases 466 * it refers to the next larger supported page size in the system. 467 * Note that szc and userszc may not be the same due to unsupported szc's on 468 * some systems. 
469 */ 470 size_t 471 page_get_user_pagesize(uint_t userszc) 472 { 473 uint_t szc = USERSZC_2_SZC(userszc); 474 475 if (szc >= mmu_page_sizes) 476 panic("page_get_user_pagesize: out of range %d", szc); 477 return (hw_page_array[szc].hp_size); 478 } 479 480 uint_t 481 page_get_shift(uint_t szc) 482 { 483 if (szc >= mmu_page_sizes) 484 panic("page_get_shift: out of range %d", szc); 485 return (PAGE_GET_SHIFT(szc)); 486 } 487 488 uint_t 489 page_get_pagecolors(uint_t szc) 490 { 491 if (szc >= mmu_page_sizes) 492 panic("page_get_pagecolors: out of range %d", szc); 493 return (PAGE_GET_PAGECOLORS(szc)); 494 } 495 496 /* 497 * this assigns the desired equivalent color after a split 498 */ 499 uint_t 500 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 501 uint_t ncolor, uint_t ceq_mask) 502 { 503 ASSERT(nszc > szc); 504 ASSERT(szc < mmu_page_sizes); 505 ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 506 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 507 508 color &= ceq_mask; 509 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 510 return (color | (ncolor & ~ceq_mask)); 511 } 512 513 /* 514 * The interleaved_mnodes flag is set when mnodes overlap in 515 * the physbase..physmax range, but have disjoint slices. 516 * In this case hpm_counters is shared by all mnodes. 517 * This flag is set dynamically by the platform. 518 */ 519 int interleaved_mnodes = 0; 520 521 /* 522 * Called by startup(). 523 * Size up the per page size free list counters based on physmax 524 * of each node and max_mem_nodes. 525 * 526 * If interleaved_mnodes is set we need to find the first mnode that 527 * exists. hpm_counters for the first mnode will then be shared by 528 * all other mnodes. If interleaved_mnodes is not set, just set 529 * first=mnode each time. That means there will be no sharing. 
530 */ 531 size_t 532 page_ctrs_sz(void) 533 { 534 int r; /* region size */ 535 int mnode; 536 int firstmn; /* first mnode that exists */ 537 int nranges; 538 pfn_t physbase; 539 pfn_t physmax; 540 uint_t ctrs_sz = 0; 541 int i; 542 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 543 544 /* 545 * We need to determine how many page colors there are for each 546 * page size in order to allocate memory for any color specific 547 * arrays. 548 */ 549 for (i = 0; i < mmu_page_sizes; i++) { 550 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 551 } 552 553 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 554 555 pgcnt_t r_pgcnt; 556 pfn_t r_base; 557 pgcnt_t r_align; 558 559 if (mem_node_config[mnode].exists == 0) 560 continue; 561 562 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 563 nranges = MNODE_RANGE_CNT(mnode); 564 mnode_nranges[mnode] = nranges; 565 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 566 567 /* 568 * determine size needed for page counter arrays with 569 * base aligned to large page size. 570 */ 571 for (r = 1; r < mmu_page_sizes; r++) { 572 /* add in space for hpm_color_current */ 573 ctrs_sz += sizeof (size_t) * 574 colors_per_szc[r] * nranges; 575 576 if (firstmn != mnode) 577 continue; 578 579 /* add in space for hpm_counters */ 580 r_align = page_get_pagecnt(r); 581 r_base = physbase; 582 r_base &= ~(r_align - 1); 583 r_pgcnt = howmany(physmax - r_base + 1, r_align); 584 585 /* 586 * Round up to always allocate on pointer sized 587 * boundaries. 
588 */ 589 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 590 sizeof (hpmctr_t *)); 591 } 592 } 593 594 for (r = 1; r < mmu_page_sizes; r++) { 595 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 596 } 597 598 /* add in space for page_ctrs_cands and pcc_color_free */ 599 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes * 600 mmu_page_sizes * NPC_MUTEX; 601 602 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 603 604 if (mem_node_config[mnode].exists == 0) 605 continue; 606 607 nranges = mnode_nranges[mnode]; 608 ctrs_sz += sizeof (pcc_info_t) * nranges * 609 mmu_page_sizes * NPC_MUTEX; 610 for (r = 1; r < mmu_page_sizes; r++) { 611 ctrs_sz += sizeof (pgcnt_t) * nranges * 612 colors_per_szc[r] * NPC_MUTEX; 613 } 614 } 615 616 /* ctr_mutex */ 617 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 618 619 /* size for page list counts */ 620 PLCNT_SZ(ctrs_sz); 621 622 /* 623 * add some slop for roundups. page_ctrs_alloc will roundup the start 624 * address of the counters to ecache_alignsize boundary for every 625 * memory node. 626 */ 627 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 628 } 629 630 caddr_t 631 page_ctrs_alloc(caddr_t alloc_base) 632 { 633 int mnode; 634 int mrange, nranges; 635 int r; /* region size */ 636 int i; 637 int firstmn; /* first mnode that exists */ 638 pfn_t physbase; 639 pfn_t physmax; 640 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 641 642 /* 643 * We need to determine how many page colors there are for each 644 * page size in order to allocate memory for any color specific 645 * arrays. 
646 */ 647 for (i = 0; i < mmu_page_sizes; i++) { 648 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 649 } 650 651 for (r = 1; r < mmu_page_sizes; r++) { 652 page_counters[r] = (hw_page_map_t *)alloc_base; 653 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 654 } 655 656 /* page_ctrs_cands and pcc_color_free array */ 657 for (i = 0; i < NPC_MUTEX; i++) { 658 for (r = 1; r < mmu_page_sizes; r++) { 659 660 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base; 661 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes; 662 663 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 664 pcc_info_t *pi; 665 666 if (mem_node_config[mnode].exists == 0) 667 continue; 668 669 nranges = mnode_nranges[mnode]; 670 671 pi = (pcc_info_t *)alloc_base; 672 alloc_base += sizeof (pcc_info_t) * nranges; 673 page_ctrs_cands[i][r][mnode] = pi; 674 675 for (mrange = 0; mrange < nranges; mrange++) { 676 pi->pcc_color_free = 677 (pgcnt_t *)alloc_base; 678 alloc_base += sizeof (pgcnt_t) * 679 colors_per_szc[r]; 680 pi++; 681 } 682 } 683 } 684 } 685 686 /* ctr_mutex */ 687 for (i = 0; i < NPC_MUTEX; i++) { 688 ctr_mutex[i] = (kmutex_t *)alloc_base; 689 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 690 } 691 692 /* initialize page list counts */ 693 PLCNT_INIT(alloc_base); 694 695 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 696 697 pgcnt_t r_pgcnt; 698 pfn_t r_base; 699 pgcnt_t r_align; 700 int r_shift; 701 int nranges = mnode_nranges[mnode]; 702 703 if (mem_node_config[mnode].exists == 0) 704 continue; 705 706 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 707 708 for (r = 1; r < mmu_page_sizes; r++) { 709 /* 710 * the page_counters base has to be aligned to the 711 * page count of page size code r otherwise the counts 712 * will cross large page boundaries. 
713 */ 714 r_align = page_get_pagecnt(r); 715 r_base = physbase; 716 /* base needs to be aligned - lower to aligned value */ 717 r_base &= ~(r_align - 1); 718 r_pgcnt = howmany(physmax - r_base + 1, r_align); 719 r_shift = PAGE_BSZS_SHIFT(r); 720 721 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 722 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 723 PAGE_COUNTERS_BASE(mnode, r) = r_base; 724 for (mrange = 0; mrange < nranges; mrange++) { 725 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 726 r, mrange) = (size_t *)alloc_base; 727 alloc_base += sizeof (size_t) * 728 colors_per_szc[r]; 729 } 730 for (i = 0; i < colors_per_szc[r]; i++) { 731 uint_t color_mask = colors_per_szc[r] - 1; 732 pfn_t pfnum = r_base; 733 size_t idx; 734 int mrange; 735 MEM_NODE_ITERATOR_DECL(it); 736 737 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 738 ASSERT(pfnum != (pfn_t)-1); 739 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 740 color_mask, color_mask, &it); 741 idx = PNUM_TO_IDX(mnode, r, pfnum); 742 idx = (idx >= r_pgcnt) ? 0 : idx; 743 for (mrange = 0; mrange < nranges; mrange++) { 744 PAGE_COUNTERS_CURRENT_COLOR(mnode, 745 r, i, mrange) = idx; 746 } 747 } 748 749 /* hpm_counters may be shared by all mnodes */ 750 if (firstmn == mnode) { 751 PAGE_COUNTERS_COUNTERS(mnode, r) = 752 (hpmctr_t *)alloc_base; 753 alloc_base += 754 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 755 sizeof (hpmctr_t *)); 756 } else { 757 PAGE_COUNTERS_COUNTERS(mnode, r) = 758 PAGE_COUNTERS_COUNTERS(firstmn, r); 759 } 760 761 /* 762 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 763 * satisfy the identity requirement. 764 * We should be able to go from one to the other 765 * and get consistent values. 766 */ 767 ASSERT(PNUM_TO_IDX(mnode, r, 768 (IDX_TO_PNUM(mnode, r, 0))) == 0); 769 ASSERT(IDX_TO_PNUM(mnode, r, 770 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 771 } 772 /* 773 * Roundup the start address of the page_counters to 774 * cache aligned boundary for every memory node. 775 * page_ctrs_sz() has added some slop for these roundups. 
776 */ 777 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 778 L2CACHE_ALIGN); 779 } 780 781 /* Initialize other page counter specific data structures. */ 782 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 783 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 784 } 785 786 return (alloc_base); 787 } 788 789 /* 790 * Functions to adjust region counters for each size free list. 791 * Caller is responsible to acquire the ctr_mutex lock if necessary and 792 * thus can be called during startup without locks. 793 */ 794 /* ARGSUSED */ 795 void 796 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags) 797 { 798 ssize_t r; /* region size */ 799 ssize_t idx; 800 pfn_t pfnum; 801 int lckidx; 802 803 ASSERT(mnode == PP_2_MEM_NODE(pp)); 804 ASSERT(mtype == PP_2_MTYPE(pp)); 805 806 ASSERT(pp->p_szc < mmu_page_sizes); 807 808 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 809 810 /* no counter update needed for largest page size */ 811 if (pp->p_szc >= mmu_page_sizes - 1) { 812 return; 813 } 814 815 r = pp->p_szc + 1; 816 pfnum = pp->p_pagenum; 817 lckidx = PP_CTR_LOCK_INDX(pp); 818 819 /* 820 * Increment the count of free pages for the current 821 * region. Continue looping up in region size incrementing 822 * count if the preceeding region is full. 
823 */ 824 while (r < mmu_page_sizes) { 825 idx = PNUM_TO_IDX(mnode, r, pfnum); 826 827 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 828 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 829 830 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) { 831 break; 832 } else { 833 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 834 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 835 [MTYPE_2_MRANGE(mnode, root_mtype)]; 836 837 cand->pcc_pages_free++; 838 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 839 } 840 r++; 841 } 842 } 843 844 void 845 page_ctr_add(int mnode, int mtype, page_t *pp, int flags) 846 { 847 int lckidx = PP_CTR_LOCK_INDX(pp); 848 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 849 850 mutex_enter(lock); 851 page_ctr_add_internal(mnode, mtype, pp, flags); 852 mutex_exit(lock); 853 } 854 855 void 856 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags) 857 { 858 int lckidx; 859 ssize_t r; /* region size */ 860 ssize_t idx; 861 pfn_t pfnum; 862 863 ASSERT(mnode == PP_2_MEM_NODE(pp)); 864 ASSERT(mtype == PP_2_MTYPE(pp)); 865 866 ASSERT(pp->p_szc < mmu_page_sizes); 867 868 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags); 869 870 /* no counter update needed for largest page size */ 871 if (pp->p_szc >= mmu_page_sizes - 1) { 872 return; 873 } 874 875 r = pp->p_szc + 1; 876 pfnum = pp->p_pagenum; 877 lckidx = PP_CTR_LOCK_INDX(pp); 878 879 /* 880 * Decrement the count of free pages for the current 881 * region. Continue looping up in region size decrementing 882 * count if the preceeding region was full. 
883 */ 884 while (r < mmu_page_sizes) { 885 idx = PNUM_TO_IDX(mnode, r, pfnum); 886 887 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 888 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 889 890 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 891 break; 892 } else { 893 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 894 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 895 [MTYPE_2_MRANGE(mnode, root_mtype)]; 896 897 ASSERT(cand->pcc_pages_free != 0); 898 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 899 900 cand->pcc_pages_free--; 901 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 902 } 903 r++; 904 } 905 } 906 907 void 908 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags) 909 { 910 int lckidx = PP_CTR_LOCK_INDX(pp); 911 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 912 913 mutex_enter(lock); 914 page_ctr_sub_internal(mnode, mtype, pp, flags); 915 mutex_exit(lock); 916 } 917 918 /* 919 * Adjust page counters following a memory attach, since typically the 920 * size of the array needs to change, and the PFN to counter index 921 * mapping needs to change. 922 * 923 * It is possible this mnode did not exist at startup. In that case 924 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges 925 * to change (a theoretical possibility on x86), which means pcc_color_free 926 * arrays must be extended. 
927 */ 928 uint_t 929 page_ctrs_adjust(int mnode) 930 { 931 pgcnt_t npgs; 932 int r; /* region size */ 933 int i; 934 size_t pcsz, old_csz; 935 hpmctr_t *new_ctr, *old_ctr; 936 pfn_t oldbase, newbase; 937 pfn_t physbase, physmax; 938 size_t old_npgs; 939 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 940 size_t size_cache[MMU_PAGE_SIZES]; 941 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 942 size_t *old_color_array[MAX_MNODE_MRANGES]; 943 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 944 pcc_info_t **cands_cache; 945 pcc_info_t *old_pi, *pi; 946 pgcnt_t *pgcntp; 947 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode); 948 int cands_cache_nranges; 949 int old_maxmrange, new_maxmrange; 950 int rc = 0; 951 952 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * 953 MMU_PAGE_SIZES, KM_NOSLEEP); 954 if (cands_cache == NULL) 955 return (ENOMEM); 956 957 i = -1; 958 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); 959 960 newbase = physbase & ~PC_BASE_ALIGN_MASK; 961 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; 962 963 /* prepare to free non-null pointers on the way out */ 964 cands_cache_nranges = nranges; 965 bzero(ctr_cache, sizeof (ctr_cache)); 966 bzero(color_cache, sizeof (color_cache)); 967 968 /* 969 * We need to determine how many page colors there are for each 970 * page size in order to allocate memory for any color specific 971 * arrays. 972 */ 973 for (r = 0; r < mmu_page_sizes; r++) { 974 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r); 975 } 976 977 /* 978 * Preallocate all of the new hpm_counters arrays as we can't 979 * hold the page_ctrs_rwlock as a writer and allocate memory. 980 * If we can't allocate all of the arrays, undo our work so far 981 * and return failure. 
982 */ 983 for (r = 1; r < mmu_page_sizes; r++) { 984 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 985 size_cache[r] = pcsz; 986 ctr_cache[r] = kmem_zalloc(pcsz * 987 sizeof (hpmctr_t), KM_NOSLEEP); 988 if (ctr_cache[r] == NULL) { 989 rc = ENOMEM; 990 goto cleanup; 991 } 992 } 993 994 /* 995 * Preallocate all of the new color current arrays as we can't 996 * hold the page_ctrs_rwlock as a writer and allocate memory. 997 * If we can't allocate all of the arrays, undo our work so far 998 * and return failure. 999 */ 1000 for (r = 1; r < mmu_page_sizes; r++) { 1001 for (mrange = 0; mrange < nranges; mrange++) { 1002 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) * 1003 colors_per_szc[r], KM_NOSLEEP); 1004 if (color_cache[r][mrange] == NULL) { 1005 rc = ENOMEM; 1006 goto cleanup; 1007 } 1008 } 1009 } 1010 1011 /* 1012 * Preallocate all of the new pcc_info_t arrays as we can't 1013 * hold the page_ctrs_rwlock as a writer and allocate memory. 1014 * If we can't allocate all of the arrays, undo our work so far 1015 * and return failure. 1016 */ 1017 for (r = 1; r < mmu_page_sizes; r++) { 1018 for (i = 0; i < NPC_MUTEX; i++) { 1019 pi = kmem_zalloc(nranges * sizeof (pcc_info_t), 1020 KM_NOSLEEP); 1021 if (pi == NULL) { 1022 rc = ENOMEM; 1023 goto cleanup; 1024 } 1025 cands_cache[i * MMU_PAGE_SIZES + r] = pi; 1026 1027 for (mrange = 0; mrange < nranges; mrange++, pi++) { 1028 pgcntp = kmem_zalloc(colors_per_szc[r] * 1029 sizeof (pgcnt_t), KM_NOSLEEP); 1030 if (pgcntp == NULL) { 1031 rc = ENOMEM; 1032 goto cleanup; 1033 } 1034 pi->pcc_color_free = pgcntp; 1035 } 1036 } 1037 } 1038 1039 /* 1040 * Grab the write lock to prevent others from walking these arrays 1041 * while we are modifying them. 
1042 */ 1043 PAGE_CTRS_WRITE_LOCK(mnode); 1044 1045 old_nranges = mnode_nranges[mnode]; 1046 cands_cache_nranges = old_nranges; 1047 mnode_nranges[mnode] = nranges; 1048 old_maxmrange = mnode_maxmrange[mnode]; 1049 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 1050 new_maxmrange = mnode_maxmrange[mnode]; 1051 1052 for (r = 1; r < mmu_page_sizes; r++) { 1053 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 1054 old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); 1055 old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); 1056 oldbase = PAGE_COUNTERS_BASE(mnode, r); 1057 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); 1058 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1059 old_color_array[mrange] = 1060 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 1061 r, mrange); 1062 } 1063 1064 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 1065 new_ctr = ctr_cache[r]; 1066 ctr_cache[r] = NULL; 1067 if (old_ctr != NULL && 1068 (oldbase + old_npgs > newbase) && 1069 (newbase + npgs > oldbase)) { 1070 /* 1071 * Map the intersection of the old and new 1072 * counters into the new array. 
1073 */ 1074 size_t offset; 1075 if (newbase > oldbase) { 1076 offset = (newbase - oldbase) >> 1077 PAGE_COUNTERS_SHIFT(mnode, r); 1078 bcopy(old_ctr + offset, new_ctr, 1079 MIN(pcsz, (old_csz - offset)) * 1080 sizeof (hpmctr_t)); 1081 } else { 1082 offset = (oldbase - newbase) >> 1083 PAGE_COUNTERS_SHIFT(mnode, r); 1084 bcopy(old_ctr, new_ctr + offset, 1085 MIN(pcsz - offset, old_csz) * 1086 sizeof (hpmctr_t)); 1087 } 1088 } 1089 1090 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 1091 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 1092 PAGE_COUNTERS_BASE(mnode, r) = newbase; 1093 1094 /* update shared hpm_counters in other mnodes */ 1095 if (interleaved_mnodes) { 1096 for (i = 0; i < max_mem_nodes; i++) { 1097 if (i == mnode) 1098 continue; 1099 if (mem_node_config[i].exists == 0) 1100 continue; 1101 ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr); 1102 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; 1103 PAGE_COUNTERS_ENTRIES(i, r) = pcsz; 1104 PAGE_COUNTERS_BASE(i, r) = newbase; 1105 } 1106 } 1107 1108 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1109 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = 1110 color_cache[r][mrange]; 1111 color_cache[r][mrange] = NULL; 1112 } 1113 /* 1114 * for now, just reset on these events as it's probably 1115 * not worthwhile to try and optimize this. 1116 */ 1117 for (i = 0; i < colors_per_szc[r]; i++) { 1118 uint_t color_mask = colors_per_szc