1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "@(#)mem_config.c 1.114 07/10/25 SMI" 27 28 #include <sys/types.h> 29 #include <sys/cmn_err.h> 30 #include <sys/vmem.h> 31 #include <sys/kmem.h> 32 #include <sys/systm.h> 33 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 34 #include <sys/errno.h> 35 #include <sys/memnode.h> 36 #include <sys/memlist.h> 37 #include <sys/memlist_impl.h> 38 #include <sys/tuneable.h> 39 #include <sys/proc.h> 40 #include <sys/disp.h> 41 #include <sys/debug.h> 42 #include <sys/vm.h> 43 #include <sys/callb.h> 44 #include <sys/memlist_plat.h> /* for installed_top_size() */ 45 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 46 #include <sys/dumphdr.h> /* for dump_resize() */ 47 #include <sys/atomic.h> /* for use in stats collection */ 48 #include <sys/rwlock.h> 49 #include <sys/cpuvar.h> 50 #include <vm/seg_kmem.h> 51 #include <vm/seg_kpm.h> 52 #include <vm/page.h> 53 #include <vm/vm_dep.h> 54 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 55 #include <sys/sunddi.h> 56 #include <sys/mem_config.h> 57 #include <sys/mem_cage.h> 58 #include <sys/lgrp.h> 59 #include <sys/ddi.h> 60 #include <sys/modctl.h> 61 62 extern struct memlist *phys_avail; 63 64 extern void mem_node_add(pfn_t, pfn_t); 65 extern void mem_node_del(pfn_t, pfn_t); 66 67 extern uint_t page_ctrs_adjust(int); 68 static void kphysm_setup_post_add(pgcnt_t); 69 static int kphysm_setup_pre_del(pgcnt_t); 70 static void kphysm_setup_post_del(pgcnt_t, int); 71 72 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 73 74 static int delspan_reserve(pfn_t, pgcnt_t); 75 static void delspan_unreserve(pfn_t, pgcnt_t); 76 77 static kmutex_t memseg_lists_lock; 78 static struct memseg *memseg_va_avail; 79 static struct memseg *memseg_delete_junk; 80 static struct memseg *memseg_edit_junk; 81 void memseg_remap_init(void); 82 static void memseg_remap_to_dummy(caddr_t, pgcnt_t); 83 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 84 static struct memseg *memseg_reuse(pgcnt_t); 85 86 static struct kmem_cache *memseg_cache; 87 88 /* 89 * Add a chunk of memory to the system. page_t's for this memory 90 * are allocated in the first few pages of the chunk. 91 * base: starting PAGESIZE page of new memory. 92 * npgs: length in PAGESIZE pages. 93 * 94 * Adding mem this way doesn't increase the size of the hash tables; 95 * growing them would be too hard. This should be OK, but adding memory 96 * dynamically most likely means more hash misses, since the tables will 97 * be smaller than they otherwise would be. 98 */ 99 int 100 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 101 { 102 page_t *pp; 103 page_t *opp, *oepp; 104 struct memseg *seg; 105 uint64_t avmem; 106 pfn_t pfn; 107 pfn_t pt_base = base; 108 pgcnt_t tpgs = npgs; 109 pgcnt_t metapgs; 110 int exhausted; 111 pfn_t pnum; 112 int mnode; 113 caddr_t vaddr; 114 int reuse; 115 int mlret; 116 void *mapva; 117 pgcnt_t nkpmpgs = 0; 118 offset_t kpm_pages_off; 119 120 cmn_err(CE_CONT, 121 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 122 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 123 124 /* 125 * Add this span in the delete list to prevent interactions. 126 */ 127 if (!delspan_reserve(base, npgs)) { 128 return (KPHYSM_ESPAN); 129 } 130 /* 131 * Check to see if any of the memory span has been added 132 * by trying an add to the installed memory list. This 133 * forms the interlocking process for add. 134 */ 135 136 memlist_write_lock(); 137 138 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 139 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 140 141 if (mlret == MEML_SPANOP_OK) 142 installed_top_size(phys_install, &physmax, &physinstalled); 143 144 memlist_write_unlock(); 145 146 if (mlret != MEML_SPANOP_OK) { 147 if (mlret == MEML_SPANOP_EALLOC) { 148 delspan_unreserve(pt_base, tpgs); 149 return (KPHYSM_ERESOURCE); 150 } else 151 if (mlret == MEML_SPANOP_ESPAN) { 152 delspan_unreserve(pt_base, tpgs); 153 return (KPHYSM_ESPAN); 154 } else { 155 delspan_unreserve(pt_base, tpgs); 156 return (KPHYSM_ERESOURCE); 157 } 158 } 159 160 /* 161 * We store the page_t's for this new memory in the first 162 * few pages of the chunk. Here, we go and get'em ... 163 */ 164 165 /* 166 * The expression after the '-' gives the number of pages 167 * that will fit in the new memory based on a requirement 168 * of (PAGESIZE + sizeof (page_t)) bytes per page. 169 */ 170 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 171 (PAGESIZE + sizeof (page_t))); 172 173 npgs -= metapgs; 174 base += metapgs; 175 176 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 177 178 exhausted = (metapgs == 0 || npgs == 0); 179 180 if (kpm_enable && !exhausted) { 181 pgcnt_t start, end, nkpmpgs_prelim; 182 size_t ptsz; 183 184 /* 185 * A viable kpm large page mapping must not overlap two 186 * dynamic memsegs. Therefore the total size is checked 187 * to be at least kpm_pgsz and also whether start and end 188 * points are at least kpm_pgsz aligned. 189 */ 190 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 191 pmodkpmp(base + npgs)) { 192 193 kphysm_addmem_error_undospan(pt_base, tpgs); 194 195 /* 196 * There is no specific error code for violating 197 * kpm granularity constraints. 198 */ 199 return (KPHYSM_ENOTVIABLE); 200 } 201 202 start = kpmptop(ptokpmp(base)); 203 end = kpmptop(ptokpmp(base + npgs)); 204 nkpmpgs_prelim = ptokpmp(end - start); 205 ptsz = npgs * sizeof (page_t); 206 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 207 exhausted = (tpgs <= metapgs); 208 if (!exhausted) { 209 npgs = tpgs - metapgs; 210 base = pt_base + metapgs; 211 212 /* final nkpmpgs */ 213 start = kpmptop(ptokpmp(base)); 214 nkpmpgs = ptokpmp(end - start); 215 kpm_pages_off = ptsz + 216 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 217 } 218 } 219 220 /* 221 * Is memory area supplied too small? 222 */ 223 if (exhausted) { 224 kphysm_addmem_error_undospan(pt_base, tpgs); 225 226 /* 227 * There is no specific error code for 'too small'. 228 */ 229 return (KPHYSM_ERESOURCE); 230 } 231 232 /* 233 * We may re-use a previously allocated VA space for the page_ts 234 * eventually, but we need to initialize and lock the pages first. 235 */ 236 237 /* 238 * Get an address in the kernel address map, map 239 * the page_t pages and see if we can touch them. 240 */ 241 242 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP); 243 if (mapva == NULL) { 244 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 245 " Can't allocate VA for page_ts"); 246 247 kphysm_addmem_error_undospan(pt_base, tpgs); 248 249 return (KPHYSM_ERESOURCE); 250 } 251 pp = mapva; 252 253 if (physmax < (pt_base + tpgs)) 254 physmax = (pt_base + tpgs); 255 256 /* 257 * In the remapping code we map one page at a time so we must do 258 * the same here to match mapping sizes. 259 */ 260 pfn = pt_base; 261 vaddr = (caddr_t)pp; 262 for (pnum = 0; pnum < metapgs; pnum++) { 263 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 264 PROT_READ | PROT_WRITE, 265 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 266 pfn++; 267 vaddr += ptob(1); 268 } 269 270 if (ddi_peek32((dev_info_t *)NULL, 271 (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) { 272 273 cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:" 274 " Can't access pp array at 0x%p [phys 0x%lx]", 275 (void *)pp, pt_base); 276 277 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 278 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 279 280 vmem_free(heap_arena, mapva, ptob(metapgs)); 281 282 kphysm_addmem_error_undospan(pt_base, tpgs); 283 284 return (KPHYSM_EFAULT); 285 } 286 287 /* 288 * Add this memory slice to its memory node translation. 289 * 290 * Note that right now, each node may have only one slice; 291 * this may change with COD or in larger SSM systems with 292 * nested latency groups, so we must not assume that the 293 * node does not yet exist. 294 */ 295 pnum = base + npgs - 1; 296 mem_node_add_slice(base, pnum); 297 298 /* 299 * Allocate or resize page counters as necessary to accommodate 300 * the increase in memory pages. 301 */ 302 mnode = PFN_2_MEM_NODE(pnum); 303 if (page_ctrs_adjust(mnode) != 0) { 304 305 mem_node_pre_del_slice(base, pnum); 306 mem_node_post_del_slice(base, pnum, 0); 307 308 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 309 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 310 311 vmem_free(heap_arena, mapva, ptob(metapgs)); 312 313 kphysm_addmem_error_undospan(pt_base, tpgs); 314 315 return (KPHYSM_ERESOURCE); 316 } 317 318 /* 319 * Update the phys_avail memory list. 320 * The phys_install list was done at the start. 321 */ 322 323 memlist_write_lock(); 324 325 mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT, 326 (uint64_t)(npgs) << PAGESHIFT, &phys_avail); 327 ASSERT(mlret == MEML_SPANOP_OK); 328 329 memlist_write_unlock(); 330 331 /* See if we can find a memseg to re-use. */ 332 seg = memseg_reuse(metapgs); 333 334 reuse = (seg != NULL); 335 336 /* 337 * Initialize the memseg structure representing this memory 338 * and add it to the existing list of memsegs. Do some basic 339 * initialization and add the memory to the system. 340 * In order to prevent lock deadlocks, the add_physmem() 341 * code is repeated here, but split into several stages. 342 */ 343 if (seg == NULL) { 344 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 345 bzero(seg, sizeof (struct memseg)); 346 seg->msegflags = MEMSEG_DYNAMIC; 347 seg->pages = pp; 348 } else { 349 /*EMPTY*/ 350 ASSERT(seg->msegflags & MEMSEG_DYNAMIC); 351 } 352 353 seg->epages = seg->pages + npgs; 354 seg->pages_base = base; 355 seg->pages_end = base + npgs; 356 357 /* 358 * Initialize metadata. The page_ts are set to locked state 359 * ready to be freed. 360 */ 361 bzero((caddr_t)pp, ptob(metapgs)); 362 363 pfn = seg->pages_base; 364 /* Save the original pp base in case we reuse a memseg. */ 365 opp = pp; 366 oepp = opp + npgs; 367 for (pp = opp; pp < oepp; pp++) { 368 pp->p_pagenum = pfn; 369 pfn++; 370 page_iolock_init(pp); 371 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 372 continue; 373 pp->p_offset = (u_offset_t)-1; 374 } 375 376 if (reuse) { 377 /* Remap our page_ts to the re-used memseg VA space. */ 378 pfn = pt_base; 379 vaddr = (caddr_t)seg->pages; 380 for (pnum = 0; pnum < metapgs; pnum++) { 381 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 382 PROT_READ | PROT_WRITE, 383 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST); 384 pfn++; 385 vaddr += ptob(1); 386 } 387 388 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs), 389 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 390 391 vmem_free(heap_arena, mapva, ptob(metapgs)); 392 } 393 394 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off); 395 396 memsegs_lock(1); 397 398 /* 399 * The new memseg is inserted at the beginning of the list. 400 * Not only does this save searching for the tail, but in the 401 * case of a re-used memseg, it solves the problem of what 402 * happens of some process has still got a pointer to the 403 * memseg and follows the next pointer to continue traversing 404 * the memsegs list. 405 */ 406 407 hat_kpm_addmem_mseg_insert(seg); 408 409 seg->next = memsegs; 410 membar_producer(); 411 412 hat_kpm_addmem_memsegs_update(seg); 413 414 memsegs = seg; 415 416 build_pfn_hash(); 417 418 total_pages += npgs; 419 420 /* 421 * Recalculate the paging parameters now total_pages has changed. 422 * This will also cause the clock hands to be reset before next use. 423 */ 424 setupclock(1); 425 426 memsegs_unlock(1); 427 428 PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs); 429 430 /* 431 * Free the pages outside the lock to avoid locking loops. 432 */ 433 for (pp = seg->pages; pp < seg->epages; pp++) { 434 page_free(pp, 1); 435 } 436 437 /* 438 * Now that we've updated the appropriate memory lists we 439 * need to reset a number of globals, since we've increased memory. 440 * Several have already been updated for us as noted above. The 441 * globals we're interested in at this point are: 442 * physmax - highest page frame number. 443 * physinstalled - number of pages currently installed (done earlier) 444 * maxmem - max free pages in the system 445 * physmem - physical memory pages available 446 * availrmem - real memory available 447 */ 448 449 mutex_enter(&freemem_lock); 450 maxmem += npgs; 451 physmem += npgs; 452 availrmem += npgs; 453 availrmem_initial += npgs; 454 455 mutex_exit(&freemem_lock); 456 457 dump_resize(); 458 459 page_freelist_coalesce_all(mnode); 460 461 kphysm_setup_post_add(npgs); 462 463 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK " 464 "(0x%" PRIx64 ")\n", 465 physinstalled << (PAGESHIFT - 10), 466 (uint64_t)physinstalled << PAGESHIFT); 467 468 avmem = (uint64_t)freemem << PAGESHIFT; 469 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: " 470 "avail mem = %" PRId64 "\n", avmem); 471 472 /* 473 * Update lgroup generation number on single lgroup systems 474 */ 475 if (nlgrps == 1) 476 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 477 478 delspan_unreserve(pt_base, tpgs); 479 return (KPHYSM_OK); /* Successfully added system memory */ 480 481 } 482 483 /* 484 * There are various error conditions in kphysm_add_memory_dynamic() 485 * which require a rollback of already changed global state. 486 */ 487 static void 488 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs) 489 { 490 int mlret; 491 492 /* Unreserve memory span. */ 493 memlist_write_lock(); 494 495 mlret = memlist_delete_span( 496 (uint64_t)(pt_base) << PAGESHIFT, 497 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 498 499 ASSERT(mlret == MEML_SPANOP_OK); 500 phys_install_has_changed(); 501 installed_top_size(phys_install, &physmax, &physinstalled); 502 503 memlist_write_unlock(); 504 delspan_unreserve(pt_base, tpgs); 505 } 506 507 /* 508 * Only return an available memseg of exactly the right size. 509 * When the meta data area has it's own virtual address space 510 * we will need to manage this more carefully and do best fit 511 * allocations, possibly splitting an available area. 512 */ 513 static struct memseg * 514 memseg_reuse(pgcnt_t metapgs) 515 { 516 struct memseg **segpp, *seg; 517 518 mutex_enter(&memseg_lists_lock); 519 520 segpp = &memseg_va_avail; 521 for (; (seg = *segpp) != NULL; segpp = &seg->lnext) { 522 caddr_t end; 523 524 if (kpm_enable) 525 end = hat_kpm_mseg_reuse(seg); 526 else 527 end = (caddr_t)seg->epages; 528 529 if (btopr(end - (caddr_t)seg->pages) == metapgs) { 530 *segpp = seg->lnext; 531 seg->lnext = NULL; 532 break; 533 } 534 } 535 mutex_exit(&memseg_lists_lock); 536 537 return (seg); 538 } 539 540 static uint_t handle_gen; 541 542 struct memdelspan { 543 struct memdelspan *mds_next; 544 pfn_t mds_base; 545 pgcnt_t mds_npgs; 546 uint_t *mds_bitmap; 547 uint_t *mds_bitmap_retired; 548 }; 549 550 #define NBPBMW (sizeof (uint_t) * NBBY) 551 #define MDS_BITMAPBYTES(MDSP) \ 552 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 553 554 struct transit_list { 555 struct transit_list *trl_next; 556 struct memdelspan *trl_spans; 557 int trl_collect; 558 }; 559 560 struct transit_list_head { 561 kmutex_t trh_lock; 562 struct transit_list *trh_head; 563 }; 564 565 static struct transit_list_head transit_list_head; 566 567 struct mem_handle; 568 static void transit_list_collect(struct mem_handle *, int); 569 static void transit_list_insert(struct transit_list *); 570 static void transit_list_remove(struct transit_list *); 571 572 #ifdef DEBUG 573 #define MEM_DEL_STATS 574 #endif /* DEBUG */ 575 576 #ifdef MEM_DEL_STATS 577 static int mem_del_stat_print = 0; 578 struct mem_del_stat { 579 uint_t nloop; 580 uint_t need_free; 581 uint_t free_loop; 582 uint_t free_low; 583 uint_t free_failed; 584 uint_t ncheck; 585 uint_t nopaget; 586 uint_t lockfail; 587 uint_t nfree; 588 uint_t nreloc; 589 uint_t nrelocfail; 590 uint_t already_done; 591 uint_t first_notfree; 592 uint_t npplocked; 593 uint_t nlockreloc; 594 uint_t nnorepl; 595 uint_t nmodreloc; 596 uint_t ndestroy; 597 uint_t nputpage; 598 uint_t nnoreclaim; 599 uint_t ndelay; 600 uint_t demotefail; 601 uint64_t nticks_total; 602 uint64_t nticks_pgrp; 603 uint_t retired; 604 uint_t toxic; 605 uint_t failing; 606 uint_t modtoxic; 607 uint_t npplkdtoxic; 608 uint_t gptlmodfail; 609 uint_t gptllckfail; 610 }; 611 /* 612 * The stat values are only incremented in the delete thread 613 * so no locking or atomic required. 614 */ 615 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 616 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 617 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 618 static void mem_del_stat_print_func(struct mem_handle *); 619 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 620 #else /* MEM_DEL_STATS */ 621 #define MDSTAT_INCR(MHP, FLD) 622 #define MDSTAT_TOTAL(MHP, ntck) 623 #define MDSTAT_PGRP(MHP, ntck) 624 #define MDSTAT_PRINT(MHP) 625 #endif /* MEM_DEL_STATS */ 626 627 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 628 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 629 630 /* 631 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 632 * The mutex may not be required for other fields, dependent on mh_state. 633 */ 634 struct mem_handle { 635 kmutex_t mh_mutex; 636 struct mem_handle *mh_next; 637 memhandle_t mh_exthandle; 638 mhnd_state_t mh_state; 639 struct transit_list mh_transit; 640 pgcnt_t mh_phys_pages; 641 pgcnt_t mh_vm_pages; 642 pgcnt_t mh_hold_todo; 643 void (*mh_delete_complete)(void *, int error); 644 void *mh_delete_complete_arg; 645 volatile uint_t mh_cancel; 646 volatile uint_t mh_dr_aio_cleanup_cancel; 647 volatile uint_t mh_aio_cleanup_done; 648 kcondvar_t mh_cv; 649 kthread_id_t mh_thread_id; 650 page_t *mh_deleted; /* link through p_next */ 651 #ifdef MEM_DEL_STATS 652 struct mem_del_stat mh_delstat; 653 #endif /* MEM_DEL_STATS */ 654 }; 655 656 static struct mem_handle *mem_handle_head; 657 static kmutex_t mem_handle_list_mutex; 658 659 static struct mem_handle * 660 kphysm_allocate_mem_handle() 661 { 662 struct mem_handle *mhp; 663 664 mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP); 665 mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL); 666 mutex_enter(&mem_handle_list_mutex); 667 mutex_enter(&mhp->mh_mutex); 668 /* handle_gen is protected by list mutex. */ 669 mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen); 670 mhp->mh_next = mem_handle_head; 671 mem_handle_head = mhp; 672 mutex_exit(&mem_handle_list_mutex); 673 674 return (mhp); 675 } 676 677 static void 678 kphysm_free_mem_handle(struct mem_handle *mhp) 679 { 680 struct mem_handle **mhpp; 681 682 ASSERT(mutex_owned(&mhp->mh_mutex)); 683 ASSERT(mhp->mh_state == MHND_FREE); 684 /* 685 * Exit the mutex to preserve locking order. This is OK 686 * here as once in the FREE state, the handle cannot 687 * be found by a lookup. 688 */ 689 mutex_exit(&mhp->mh_mutex); 690 691 mutex_enter(&mem_handle_list_mutex); 692 mhpp = &mem_handle_head; 693 while (*mhpp != NULL && *mhpp != mhp) 694 mhpp = &(*mhpp)->mh_next; 695 ASSERT(*mhpp == mhp); 696 /* 697 * No need to lock the handle (mh_mutex) as only 698 * mh_next changing and this is the only thread that 699 * can be referncing mhp. 700 */ 701 *mhpp = mhp->mh_next; 702 mutex_exit(&mem_handle_list_mutex); 703 704 mutex_destroy(&mhp->mh_mutex); 705 kmem_free(mhp, sizeof (struct mem_handle)); 706 } 707 708 /* 709 * This function finds the internal mem_handle corresponding to an 710 * external handle and returns it with the mh_mutex held. 711 */ 712 static struct mem_handle * 713 kphysm_lookup_mem_handle(memhandle_t handle) 714 { 715 struct mem_handle *mhp; 716 717 mutex_enter(&mem_handle_list_mutex); 718 for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) { 719 if (mhp->mh_exthandle == handle) { 720 mutex_enter(&mhp->mh_mutex); 721 /* 722 * The state of the handle could have been changed 723 * by kphysm_del_release() while waiting for mh_mutex. 724 */ 725 if (mhp->mh_state == MHND_FREE) { 726 mutex_exit(&mhp->mh_mutex); 727 continue; 728 } 729 break; 730 } 731 } 732 mutex_exit(&mem_handle_list_mutex); 733 return (mhp); 734 } 735 736 int 737 kphysm_del_gethandle(memhandle_t *xmhp) 738 { 739 struct mem_handle *mhp; 740 741 mhp = kphysm_allocate_mem_handle(); 742 /* 743 * The handle is allocated using KM_SLEEP, so cannot fail. 744 * If the implementation is changed, the correct error to return 745 * here would be KPHYSM_ENOHANDLES. 746 */ 747 ASSERT(mhp->mh_state == MHND_FREE); 748 mhp->mh_state = MHND_INIT; 749 *xmhp = mhp->mh_exthandle; 750 mutex_exit(&mhp->mh_mutex); 751 return (KPHYSM_OK); 752 } 753 754 static int 755 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 756 { 757 pfn_t e1, e2; 758 759 e1 = b1 + l1; 760 e2 = b2 + l2; 761 762 return (!(b2 >= e1 || b1 >= e2)); 763 } 764 765 static int can_remove_pgs(pgcnt_t); 766 767 static struct memdelspan * 768 span_to_install(pfn_t base, pgcnt_t npgs) 769 { 770 struct memdelspan *mdsp; 771 struct memdelspan *mdsp_new; 772 uint64_t address, size, thislen; 773 struct memlist *mlp; 774 775 mdsp_new = NULL; 776 777 address = (uint64_t)base << PAGESHIFT; 778 size = (uint64_t)npgs << PAGESHIFT; 779 while (size != 0) { 780 memlist_read_lock(); 781 for (mlp = phys_install; mlp != NULL; mlp = mlp->next) { 782 if (address >= (mlp->address + mlp->size)) 783 continue; 784 if ((address + size) > mlp->address) 785 break; 786 } 787 if (mlp == NULL) { 788 address += size; 789 size = 0; 790 thislen = 0; 791 } else { 792 if (address < mlp->address) { 793 size -= (mlp->address - address); 794 address = mlp->address; 795 } 796 ASSERT(address >= mlp->address); 797 if ((address + size) > (mlp->address + mlp->size)) { 798 thislen = mlp->size - (address - mlp->address); 799 } else { 800 thislen = size; 801 } 802 } 803 memlist_read_unlock(); 804 /* TODO: phys_install could change now */ 805 if (thislen == 0) 806 continue; 807 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 808 mdsp->mds_base = btop(address); 809 mdsp->mds_npgs = btop(thislen); 810 mdsp->mds_next = mdsp_new; 811 mdsp_new = mdsp; 812 address += thislen; 813 size -= thislen; 814 } 815 return (mdsp_new); 816 } 817 818 static void 819 free_delspans(struct memdelspan *mdsp) 820 { 821 struct memdelspan *amdsp; 822 823 while ((amdsp = mdsp) != NULL) { 824 mdsp = amdsp->mds_next; 825 kmem_free(amdsp, sizeof (struct memdelspan)); 826 } 827 } 828 829 /* 830 * Concatenate lists. No list ordering is required. 831 */ 832 833 static void 834 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 835 { 836 while (*mdspp != NULL) 837 mdspp = &(*mdspp)->mds_next; 838 839 *mdspp = mdsp; 840 } 841 842 /* 843 * Given a new list of delspans, check there is no overlap with 844 * all existing span activity (add or delete) and then concatenate 845 * the new spans to the given list. 846 * Return 1 for OK, 0 if overlapping. 847 */ 848 static int 849 delspan_insert( 850 struct transit_list *my_tlp, 851 struct memdelspan *mdsp_new) 852 { 853 struct transit_list_head *trh; 854 struct transit_list *tlp; 855 int ret; 856 857 trh = &transit_list_head; 858 859 ASSERT(my_tlp != NULL); 860 ASSERT(mdsp_new != NULL); 861 862 ret = 1; 863 mutex_enter(&trh->trh_lock); 864 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 865 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 866 struct memdelspan *mdsp; 867 868 for (mdsp = tlp->trl_spans; mdsp != NULL; 869 mdsp = mdsp->mds_next) { 870 struct memdelspan *nmdsp; 871 872 for (nmdsp = mdsp_new; nmdsp != NULL; 873 nmdsp = nmdsp->mds_next) { 874 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 875 nmdsp->mds_base, nmdsp->mds_npgs)) { 876 ret = 0; 877 goto done; 878 } 879 } 880 } 881 } 882 done: 883 if (ret != 0) { 884 if (my_tlp->trl_spans == NULL) 885 transit_list_insert(my_tlp); 886 delspan_concat(&my_tlp->trl_spans, mdsp_new); 887 } 888 mutex_exit(&trh->trh_lock); 889 return (ret); 890 } 891 892 static void 893 delspan_remove( 894 struct transit_list *my_tlp, 895 pfn_t base, 896 pgcnt_t npgs) 897 { 898 struct transit_list_head *trh; 899 struct memdelspan *mdsp; 900 901 trh = &transit_list_head; 902 903 ASSERT(my_tlp != NULL); 904 905 mutex_enter(&trh->trh_lock); 906 if ((mdsp = my_tlp->trl_spans) != NULL) { 907 if (npgs == 0) { 908 my_tlp->trl_spans = NULL; 909 free_delspans(mdsp); 910 transit_list_remove(my_tlp); 911 } else { 912 struct memdelspan **prv; 913 914 prv = &my_tlp->trl_spans; 915 while (mdsp != NULL) { 916 pfn_t p_end; 917 918 p_end = mdsp->mds_base + mdsp->mds_npgs; 919 if (mdsp->mds_base >= base && 920 p_end <= (base + npgs)) { 921 *prv = mdsp->mds_next; 922 mdsp->mds_next = NULL; 923 free_delspans(mdsp); 924 } else { 925 prv = &mdsp->mds_next; 926 } 927 mdsp = *prv; 928 } 929 if (my_tlp->trl_spans == NULL) 930 transit_list_remove(my_tlp); 931 } 932 } 933 mutex_exit(&trh->trh_lock); 934 } 935 936 /* 937 * Reserve interface for add to stop delete before add finished. 938 * This list is only accessed through the delspan_insert/remove 939 * functions and so is fully protected by the mutex in struct transit_list. 940 */ 941 942 static struct transit_list reserve_transit; 943 944 static int 945 delspan_reserve(pfn_t base, pgcnt_t npgs) 946 { 947 struct memdelspan *mdsp; 948 int ret; 949 950 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 951 mdsp->mds_base = base; 952 mdsp->mds_npgs = npgs; 953 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 954 free_delspans(mdsp); 955 } 956 return (ret); 957 } 958 959 static void 960 delspan_unreserve(pfn_t base, pgcnt_t npgs) 961 { 962 delspan_remove(&reserve_transit, base, npgs); 963 } 964 965 /* 966 * Return whether memseg was created by kphysm_add_memory_dynamic(). 967 * If this is the case and startp non zero, return also the start pfn 968 * of the meta data via startp. 969 */ 970 static int 971 memseg_is_dynamic(struct memseg *seg, pfn_t *startp) 972 { 973 pfn_t pt_start; 974 975 if ((seg->msegflags & MEMSEG_DYNAMIC) == 0) 976 return (0); 977 978 /* Meta data is required to be at the beginning */ 979 ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base); 980 981 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 982 if (startp != NULL) 983 *startp = pt_start; 984 985 return (1); 986 } 987 988 int 989 kphysm_del_span( 990 memhandle_t handle, 991 pfn_t base, 992 pgcnt_t npgs) 993 { 994 struct mem_handle *mhp; 995 struct memseg *seg; 996 struct memdelspan *mdsp; 997 struct memdelspan *mdsp_new; 998 pgcnt_t phys_pages, vm_pages; 999 pfn_t p_end; 1000 page_t *pp; 1001 int ret; 1002 1003 mhp = kphysm_lookup_mem_handle(handle); 1004 if (mhp == NULL) { 1005 return (KPHYSM_EHANDLE); 1006 } 1007 if (mhp->mh_state != MHND_INIT) { 1008 mutex_exit(&mhp->mh_mutex); 1009 return (KPHYSM_ESEQUENCE); 1010 } 1011 1012 /* 1013 * Intersect the span with the installed memory list (phys_install). 1014 */ 1015 mdsp_new = span_to_install(base, npgs); 1016 if (mdsp_new == NULL) { 1017 /* 1018 * No physical memory in this range. Is this an 1019 * error? If an attempt to start the delete is made 1020 * for OK returns from del_span such as this, start will 1021 * return an error. 1022 * Could return KPHYSM_ENOWORK. 1023 */ 1024 /* 1025 * It is assumed that there are no error returns 1026 * from span_to_install() due to kmem_alloc failure. 1027 */ 1028 mutex_exit(&mhp->mh_mutex); 1029 return (KPHYSM_OK); 1030 } 1031 /* 1032 * Does this span overlap an existing span? 1033 */ 1034 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1035 /* 1036 * Differentiate between already on list for this handle 1037 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 1038 */ 1039 ret = KPHYSM_EBUSY; 1040 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1041 mdsp = mdsp->mds_next) { 1042 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1043 base, npgs)) { 1044 ret = KPHYSM_EDUP; 1045 break; 1046 } 1047 } 1048 mutex_exit(&mhp->mh_mutex); 1049 free_delspans(mdsp_new); 1050 return (ret); 1051 } 1052 /* 1053 * At this point the spans in mdsp_new have been inserted into the 1054 * list of spans for this handle and thereby to the global list of 1055 * spans being processed. Each of these spans must now be checked 1056 * for relocatability. As a side-effect segments in the memseg list 1057 * may be split. 1058 * 1059 * Note that mdsp_new can no longer be used as it is now part of 1060 * a larger list. Select elements of this larger list based 1061 * on base and npgs. 1062 */ 1063 restart: 1064 phys_pages = 0; 1065 vm_pages = 0; 1066 ret = KPHYSM_OK; 1067 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1068 mdsp = mdsp->mds_next) { 1069 pgcnt_t pages_checked; 1070 1071 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1072 continue; 1073 } 1074 p_end = mdsp->mds_base + mdsp->mds_npgs; 1075 /* 1076 * The pages_checked count is a hack. All pages should be 1077 * checked for relocatability. Those not covered by memsegs 1078 * should be tested with arch_kphysm_del_span_ok(). 1079 */ 1080 pages_checked = 0; 1081 for (seg = memsegs; seg; seg = seg->next) { 1082 pfn_t mseg_start; 1083 1084 if (seg->pages_base >= p_end || 1085 seg->pages_end <= mdsp->mds_base) { 1086 /* Span and memseg don't overlap. */ 1087 continue; 1088 } 1089 /* Check that segment is suitable for delete. */ 1090 if (memseg_is_dynamic(seg, &mseg_start)) { 1091 /* 1092 * Can only delete whole added segments 1093 * for the moment. 1094 * Check that this is completely within the 1095 * span. 1096