Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)mem_config.c	1.114	07/10/25 SMI"
     27 
     28 #include <sys/types.h>
     29 #include <sys/cmn_err.h>
     30 #include <sys/vmem.h>
     31 #include <sys/kmem.h>
     32 #include <sys/systm.h>
     33 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
     34 #include <sys/errno.h>
     35 #include <sys/memnode.h>
     36 #include <sys/memlist.h>
     37 #include <sys/memlist_impl.h>
     38 #include <sys/tuneable.h>
     39 #include <sys/proc.h>
     40 #include <sys/disp.h>
     41 #include <sys/debug.h>
     42 #include <sys/vm.h>
     43 #include <sys/callb.h>
     44 #include <sys/memlist_plat.h>	/* for installed_top_size() */
     45 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
     46 #include <sys/dumphdr.h>	/* for dump_resize() */
     47 #include <sys/atomic.h>		/* for use in stats collection */
     48 #include <sys/rwlock.h>
     49 #include <sys/cpuvar.h>
     50 #include <vm/seg_kmem.h>
     51 #include <vm/seg_kpm.h>
     52 #include <vm/page.h>
     53 #include <vm/vm_dep.h>
     54 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
     55 #include <sys/sunddi.h>
     56 #include <sys/mem_config.h>
     57 #include <sys/mem_cage.h>
     58 #include <sys/lgrp.h>
     59 #include <sys/ddi.h>
     60 #include <sys/modctl.h>
     61 
/* Memory list that kphysm_add_memory_dynamic() extends with usable pages. */
extern struct memlist *phys_avail;

extern void mem_node_add(pfn_t, pfn_t);
extern void mem_node_del(pfn_t, pfn_t);

extern uint_t page_ctrs_adjust(int);

/* Hooks around add/delete; their definitions appear later in this file. */
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);

static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);

/*
 * Span reservation used by the add path so that a span being added is
 * recorded alongside spans that are being deleted.
 */
static int delspan_reserve(pfn_t, pgcnt_t);
static void delspan_unreserve(pfn_t, pgcnt_t);

/* Held by memseg_reuse() while it walks and edits memseg_va_avail. */
static kmutex_t memseg_lists_lock;
/* Memsegs (linked through lnext) that memseg_reuse() may hand out again. */
static struct memseg *memseg_va_avail;
/* NOTE(review): presumably also guarded by memseg_lists_lock -- confirm. */
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(caddr_t, pgcnt_t);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

/* kmem cache from which kphysm_add_memory_dynamic() allocates memsegs. */
static struct kmem_cache *memseg_cache;
     87 
     88 /*
     89  * Add a chunk of memory to the system.  page_t's for this memory
     90  * are allocated in the first few pages of the chunk.
     91  * base: starting PAGESIZE page of new memory.
     92  * npgs: length in PAGESIZE pages.
     93  *
     94  * Adding mem this way doesn't increase the size of the hash tables;
     95  * growing them would be too hard.  This should be OK, but adding memory
     96  * dynamically most likely means more hash misses, since the tables will
     97  * be smaller than they otherwise would be.
     98  */
     99 int
    100 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
    101 {
    102 	page_t		*pp;
    103 	page_t		*opp, *oepp;
    104 	struct memseg	*seg;
    105 	uint64_t	avmem;
    106 	pfn_t		pfn;
    107 	pfn_t		pt_base = base;
    108 	pgcnt_t		tpgs = npgs;
    109 	pgcnt_t		metapgs;
    110 	int		exhausted;
    111 	pfn_t		pnum;
    112 	int		mnode;
    113 	caddr_t		vaddr;
    114 	int		reuse;
    115 	int		mlret;
    116 	void		*mapva;
    117 	pgcnt_t		nkpmpgs = 0;
    118 	offset_t	kpm_pages_off;
    119 
    120 	cmn_err(CE_CONT,
    121 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
    122 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
    123 
    124 	/*
    125 	 * Add this span in the delete list to prevent interactions.
    126 	 */
    127 	if (!delspan_reserve(base, npgs)) {
    128 		return (KPHYSM_ESPAN);
    129 	}
    130 	/*
    131 	 * Check to see if any of the memory span has been added
    132 	 * by trying an add to the installed memory list. This
    133 	 * forms the interlocking process for add.
    134 	 */
    135 
    136 	memlist_write_lock();
    137 
    138 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
    139 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
    140 
    141 	if (mlret == MEML_SPANOP_OK)
    142 		installed_top_size(phys_install, &physmax, &physinstalled);
    143 
    144 	memlist_write_unlock();
    145 
    146 	if (mlret != MEML_SPANOP_OK) {
    147 		if (mlret == MEML_SPANOP_EALLOC) {
    148 			delspan_unreserve(pt_base, tpgs);
    149 			return (KPHYSM_ERESOURCE);
    150 		} else
    151 		if (mlret == MEML_SPANOP_ESPAN) {
    152 			delspan_unreserve(pt_base, tpgs);
    153 			return (KPHYSM_ESPAN);
    154 		} else {
    155 			delspan_unreserve(pt_base, tpgs);
    156 			return (KPHYSM_ERESOURCE);
    157 		}
    158 	}
    159 
    160 	/*
    161 	 * We store the page_t's for this new memory in the first
    162 	 * few pages of the chunk. Here, we go and get'em ...
    163 	 */
    164 
    165 	/*
    166 	 * The expression after the '-' gives the number of pages
    167 	 * that will fit in the new memory based on a requirement
    168 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
    169 	 */
    170 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
    171 	    (PAGESIZE + sizeof (page_t)));
    172 
    173 	npgs -= metapgs;
    174 	base += metapgs;
    175 
    176 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
    177 
    178 	exhausted = (metapgs == 0 || npgs == 0);
    179 
    180 	if (kpm_enable && !exhausted) {
    181 		pgcnt_t start, end, nkpmpgs_prelim;
    182 		size_t	ptsz;
    183 
    184 		/*
    185 		 * A viable kpm large page mapping must not overlap two
    186 		 * dynamic memsegs. Therefore the total size is checked
    187 		 * to be at least kpm_pgsz and also whether start and end
    188 		 * points are at least kpm_pgsz aligned.
    189 		 */
    190 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
    191 		    pmodkpmp(base + npgs)) {
    192 
    193 			kphysm_addmem_error_undospan(pt_base, tpgs);
    194 
    195 			/*
    196 			 * There is no specific error code for violating
    197 			 * kpm granularity constraints.
    198 			 */
    199 			return (KPHYSM_ENOTVIABLE);
    200 		}
    201 
    202 		start = kpmptop(ptokpmp(base));
    203 		end = kpmptop(ptokpmp(base + npgs));
    204 		nkpmpgs_prelim = ptokpmp(end - start);
    205 		ptsz = npgs * sizeof (page_t);
    206 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
    207 		exhausted = (tpgs <= metapgs);
    208 		if (!exhausted) {
    209 			npgs = tpgs - metapgs;
    210 			base = pt_base + metapgs;
    211 
    212 			/* final nkpmpgs */
    213 			start = kpmptop(ptokpmp(base));
    214 			nkpmpgs = ptokpmp(end - start);
    215 			kpm_pages_off = ptsz +
    216 				(nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
    217 		}
    218 	}
    219 
    220 	/*
    221 	 * Is memory area supplied too small?
    222 	 */
    223 	if (exhausted) {
    224 		kphysm_addmem_error_undospan(pt_base, tpgs);
    225 
    226 		/*
    227 		 * There is no specific error code for 'too small'.
    228 		 */
    229 		return (KPHYSM_ERESOURCE);
    230 	}
    231 
    232 	/*
    233 	 * We may re-use a previously allocated VA space for the page_ts
    234 	 * eventually, but we need to initialize and lock the pages first.
    235 	 */
    236 
    237 	/*
    238 	 * Get an address in the kernel address map, map
    239 	 * the page_t pages and see if we can touch them.
    240 	 */
    241 
    242 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
    243 	if (mapva == NULL) {
    244 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
    245 		    " Can't allocate VA for page_ts");
    246 
    247 		kphysm_addmem_error_undospan(pt_base, tpgs);
    248 
    249 		return (KPHYSM_ERESOURCE);
    250 	}
    251 	pp = mapva;
    252 
    253 	if (physmax < (pt_base + tpgs))
    254 		physmax = (pt_base + tpgs);
    255 
    256 	/*
    257 	 * In the remapping code we map one page at a time so we must do
    258 	 * the same here to match mapping sizes.
    259 	 */
    260 	pfn = pt_base;
    261 	vaddr = (caddr_t)pp;
    262 	for (pnum = 0; pnum < metapgs; pnum++) {
    263 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
    264 		    PROT_READ | PROT_WRITE,
    265 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
    266 		pfn++;
    267 		vaddr += ptob(1);
    268 	}
    269 
    270 	if (ddi_peek32((dev_info_t *)NULL,
    271 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
    272 
    273 		cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:"
    274 		    " Can't access pp array at 0x%p [phys 0x%lx]",
    275 		    (void *)pp, pt_base);
    276 
    277 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
    278 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
    279 
    280 		vmem_free(heap_arena, mapva, ptob(metapgs));
    281 
    282 		kphysm_addmem_error_undospan(pt_base, tpgs);
    283 
    284 		return (KPHYSM_EFAULT);
    285 	}
    286 
    287 	/*
    288 	 * Add this memory slice to its memory node translation.
    289 	 *
    290 	 * Note that right now, each node may have only one slice;
    291 	 * this may change with COD or in larger SSM systems with
    292 	 * nested latency groups, so we must not assume that the
    293 	 * node does not yet exist.
    294 	 */
    295 	pnum = base + npgs - 1;
    296 	mem_node_add_slice(base, pnum);
    297 
    298 	/*
    299 	 * Allocate or resize page counters as necessary to accommodate
    300 	 * the increase in memory pages.
    301 	 */
    302 	mnode = PFN_2_MEM_NODE(pnum);
    303 	if (page_ctrs_adjust(mnode) != 0) {
    304 
    305 		mem_node_pre_del_slice(base, pnum);
    306 		mem_node_post_del_slice(base, pnum, 0);
    307 
    308 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
    309 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
    310 
    311 		vmem_free(heap_arena, mapva, ptob(metapgs));
    312 
    313 		kphysm_addmem_error_undospan(pt_base, tpgs);
    314 
    315 		return (KPHYSM_ERESOURCE);
    316 	}
    317 
    318 	/*
    319 	 * Update the phys_avail memory list.
    320 	 * The phys_install list was done at the start.
    321 	 */
    322 
    323 	memlist_write_lock();
    324 
    325 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
    326 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
    327 	ASSERT(mlret == MEML_SPANOP_OK);
    328 
    329 	memlist_write_unlock();
    330 
    331 	/* See if we can find a memseg to re-use. */
    332 	seg = memseg_reuse(metapgs);
    333 
    334 	reuse = (seg != NULL);
    335 
    336 	/*
    337 	 * Initialize the memseg structure representing this memory
    338 	 * and add it to the existing list of memsegs. Do some basic
    339 	 * initialization and add the memory to the system.
    340 	 * In order to prevent lock deadlocks, the add_physmem()
    341 	 * code is repeated here, but split into several stages.
    342 	 */
    343 	if (seg == NULL) {
    344 		seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
    345 		bzero(seg, sizeof (struct memseg));
    346 		seg->msegflags = MEMSEG_DYNAMIC;
    347 		seg->pages = pp;
    348 	} else {
    349 		/*EMPTY*/
    350 		ASSERT(seg->msegflags & MEMSEG_DYNAMIC);
    351 	}
    352 
    353 	seg->epages = seg->pages + npgs;
    354 	seg->pages_base = base;
    355 	seg->pages_end = base + npgs;
    356 
    357 	/*
    358 	 * Initialize metadata. The page_ts are set to locked state
    359 	 * ready to be freed.
    360 	 */
    361 	bzero((caddr_t)pp, ptob(metapgs));
    362 
    363 	pfn = seg->pages_base;
    364 	/* Save the original pp base in case we reuse a memseg. */
    365 	opp = pp;
    366 	oepp = opp + npgs;
    367 	for (pp = opp; pp < oepp; pp++) {
    368 		pp->p_pagenum = pfn;
    369 		pfn++;
    370 		page_iolock_init(pp);
    371 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
    372 			continue;
    373 		pp->p_offset = (u_offset_t)-1;
    374 	}
    375 
    376 	if (reuse) {
    377 		/* Remap our page_ts to the re-used memseg VA space. */
    378 		pfn = pt_base;
    379 		vaddr = (caddr_t)seg->pages;
    380 		for (pnum = 0; pnum < metapgs; pnum++) {
    381 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
    382 			    PROT_READ | PROT_WRITE,
    383 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
    384 			pfn++;
    385 			vaddr += ptob(1);
    386 		}
    387 
    388 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
    389 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
    390 
    391 		vmem_free(heap_arena, mapva, ptob(metapgs));
    392 	}
    393 
    394 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
    395 
    396 	memsegs_lock(1);
    397 
    398 	/*
    399 	 * The new memseg is inserted at the beginning of the list.
    400 	 * Not only does this save searching for the tail, but in the
    401 	 * case of a re-used memseg, it solves the problem of what
    402 	 * happens of some process has still got a pointer to the
    403 	 * memseg and follows the next pointer to continue traversing
    404 	 * the memsegs list.
    405 	 */
    406 
    407 	hat_kpm_addmem_mseg_insert(seg);
    408 
    409 	seg->next = memsegs;
    410 	membar_producer();
    411 
    412 	hat_kpm_addmem_memsegs_update(seg);
    413 
    414 	memsegs = seg;
    415 
    416 	build_pfn_hash();
    417 
    418 	total_pages += npgs;
    419 
    420 	/*
    421 	 * Recalculate the paging parameters now total_pages has changed.
    422 	 * This will also cause the clock hands to be reset before next use.
    423 	 */
    424 	setupclock(1);
    425 
    426 	memsegs_unlock(1);
    427 
    428 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
    429 
    430 	/*
    431 	 * Free the pages outside the lock to avoid locking loops.
    432 	 */
    433 	for (pp = seg->pages; pp < seg->epages; pp++) {
    434 		page_free(pp, 1);
    435 	}
    436 
    437 	/*
    438 	 * Now that we've updated the appropriate memory lists we
    439 	 * need to reset a number of globals, since we've increased memory.
    440 	 * Several have already been updated for us as noted above. The
    441 	 * globals we're interested in at this point are:
    442 	 *   physmax - highest page frame number.
    443 	 *   physinstalled - number of pages currently installed (done earlier)
    444 	 *   maxmem - max free pages in the system
    445 	 *   physmem - physical memory pages available
    446 	 *   availrmem - real memory available
    447 	 */
    448 
    449 	mutex_enter(&freemem_lock);
    450 	maxmem += npgs;
    451 	physmem += npgs;
    452 	availrmem += npgs;
    453 	availrmem_initial += npgs;
    454 
    455 	mutex_exit(&freemem_lock);
    456 
    457 	dump_resize();
    458 
    459 	page_freelist_coalesce_all(mnode);
    460 
    461 	kphysm_setup_post_add(npgs);
    462 
    463 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
    464 	    "(0x%" PRIx64 ")\n",
    465 	    physinstalled << (PAGESHIFT - 10),
    466 	    (uint64_t)physinstalled << PAGESHIFT);
    467 
    468 	avmem = (uint64_t)freemem << PAGESHIFT;
    469 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
    470 	    "avail mem = %" PRId64 "\n", avmem);
    471 
    472 	/*
    473 	 * Update lgroup generation number on single lgroup systems
    474 	 */
    475 	if (nlgrps == 1)
    476 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
    477 
    478 	delspan_unreserve(pt_base, tpgs);
    479 	return (KPHYSM_OK);		/* Successfully added system memory */
    480 
    481 }
    482 
    483 /*
    484  * There are various error conditions in kphysm_add_memory_dynamic()
    485  * which require a rollback of already changed global state.
    486  */
    487 static void
    488 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
    489 {
    490 	int mlret;
    491 
    492 	/* Unreserve memory span. */
    493 	memlist_write_lock();
    494 
    495 	mlret = memlist_delete_span(
    496 	    (uint64_t)(pt_base) << PAGESHIFT,
    497 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
    498 
    499 	ASSERT(mlret == MEML_SPANOP_OK);
    500 	phys_install_has_changed();
    501 	installed_top_size(phys_install, &physmax, &physinstalled);
    502 
    503 	memlist_write_unlock();
    504 	delspan_unreserve(pt_base, tpgs);
    505 }
    506 
    507 /*
    508  * Only return an available memseg of exactly the right size.
    509  * When the meta data area has it's own virtual address space
    510  * we will need to manage this more carefully and do best fit
    511  * allocations, possibly splitting an available area.
    512  */
    513 static struct memseg *
    514 memseg_reuse(pgcnt_t metapgs)
    515 {
    516 	struct memseg **segpp, *seg;
    517 
    518 	mutex_enter(&memseg_lists_lock);
    519 
    520 	segpp = &memseg_va_avail;
    521 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
    522 		caddr_t end;
    523 
    524 		if (kpm_enable)
    525 			end = hat_kpm_mseg_reuse(seg);
    526 		else
    527 			end = (caddr_t)seg->epages;
    528 
    529 		if (btopr(end - (caddr_t)seg->pages) == metapgs) {
    530 			*segpp = seg->lnext;
    531 			seg->lnext = NULL;
    532 			break;
    533 		}
    534 	}
    535 	mutex_exit(&memseg_lists_lock);
    536 
    537 	return (seg);
    538 }
    539 
/*
 * Source of external handle values; incremented under
 * mem_handle_list_mutex by kphysm_allocate_mem_handle().
 */
static uint_t handle_gen;

/*
 * One contiguous run of page frames taking part in an add or delete.
 * Spans are chained through mds_next on a transit_list.
 */
struct memdelspan {
	struct memdelspan *mds_next;	/* next span on the same list */
	pfn_t		mds_base;	/* first page frame of the span */
	pgcnt_t		mds_npgs;	/* number of pages in the span */
	/* NOTE(review): presumably per-page bitmaps, MDS_BITMAPBYTES each. */
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

/* Number of bits in one bitmap word (a uint_t). */
#define	NBPBMW		(sizeof (uint_t) * NBBY)
/* Bytes needed for mds_npgs bits, rounded up to whole bitmap words. */
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))
    553 
/*
 * A list of spans belonging to one add or delete operation.  All
 * transit lists with spans are chained from transit_list_head so that
 * delspan_insert() can check a new span against every one of them.
 */
struct transit_list {
	struct transit_list	*trl_next;	/* next list on trh_head */
	struct memdelspan	*trl_spans;	/* spans owned by this list */
	int			trl_collect;
};

/* Anchor for all transit lists; trh_lock guards the lists and spans. */
struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;

struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);
    571 
/* Delete statistics are compiled in for DEBUG kernels only. */
#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
/*
 * Event counters kept per mem_handle (see mh_delstat) and updated only
 * through the MDSTAT_* macros below.
 */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
/* Bump counter FLD of the handle MHP by one. */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
/* Accumulate ntck into the two tick totals. */
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
/* Without MEM_DEL_STATS every statistics macro expands to nothing. */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */
    626 
/*
 * Life cycle states of a mem_handle.  MHND_FREE is zero, so a handle
 * fresh from kmem_zalloc() starts out in that state.
 */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * Internal representation of a memory delete handle.
 *
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;	/* link on mem_handle_head */
	memhandle_t	mh_exthandle;	/* value handed out to callers */
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;	/* spans registered for delete */
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

/* All allocated handles, newest first; guarded by mem_handle_list_mutex. */
static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;
    658 
    659 static struct mem_handle *
    660 kphysm_allocate_mem_handle()
    661 {
    662 	struct mem_handle *mhp;
    663 
    664 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
    665 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
    666 	mutex_enter(&mem_handle_list_mutex);
    667 	mutex_enter(&mhp->mh_mutex);
    668 	/* handle_gen is protected by list mutex. */
    669 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
    670 	mhp->mh_next = mem_handle_head;
    671 	mem_handle_head = mhp;
    672 	mutex_exit(&mem_handle_list_mutex);
    673 
    674 	return (mhp);
    675 }
    676 
    677 static void
    678 kphysm_free_mem_handle(struct mem_handle *mhp)
    679 {
    680 	struct mem_handle **mhpp;
    681 
    682 	ASSERT(mutex_owned(&mhp->mh_mutex));
    683 	ASSERT(mhp->mh_state == MHND_FREE);
    684 	/*
    685 	 * Exit the mutex to preserve locking order. This is OK
    686 	 * here as once in the FREE state, the handle cannot
    687 	 * be found by a lookup.
    688 	 */
    689 	mutex_exit(&mhp->mh_mutex);
    690 
    691 	mutex_enter(&mem_handle_list_mutex);
    692 	mhpp = &mem_handle_head;
    693 	while (*mhpp != NULL && *mhpp != mhp)
    694 		mhpp = &(*mhpp)->mh_next;
    695 	ASSERT(*mhpp == mhp);
    696 	/*
    697 	 * No need to lock the handle (mh_mutex) as only
    698 	 * mh_next changing and this is the only thread that
    699 	 * can be referncing mhp.
    700 	 */
    701 	*mhpp = mhp->mh_next;
    702 	mutex_exit(&mem_handle_list_mutex);
    703 
    704 	mutex_destroy(&mhp->mh_mutex);
    705 	kmem_free(mhp, sizeof (struct mem_handle));
    706 }
    707 
    708 /*
    709  * This function finds the internal mem_handle corresponding to an
    710  * external handle and returns it with the mh_mutex held.
    711  */
    712 static struct mem_handle *
    713 kphysm_lookup_mem_handle(memhandle_t handle)
    714 {
    715 	struct mem_handle *mhp;
    716 
    717 	mutex_enter(&mem_handle_list_mutex);
    718 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
    719 		if (mhp->mh_exthandle == handle) {
    720 			mutex_enter(&mhp->mh_mutex);
    721 			/*
    722 			 * The state of the handle could have been changed
    723 			 * by kphysm_del_release() while waiting for mh_mutex.
    724 			 */
    725 			if (mhp->mh_state == MHND_FREE) {
    726 				mutex_exit(&mhp->mh_mutex);
    727 				continue;
    728 			}
    729 			break;
    730 		}
    731 	}
    732 	mutex_exit(&mem_handle_list_mutex);
    733 	return (mhp);
    734 }
    735 
    736 int
    737 kphysm_del_gethandle(memhandle_t *xmhp)
    738 {
    739 	struct mem_handle *mhp;
    740 
    741 	mhp = kphysm_allocate_mem_handle();
    742 	/*
    743 	 * The handle is allocated using KM_SLEEP, so cannot fail.
    744 	 * If the implementation is changed, the correct error to return
    745 	 * here would be KPHYSM_ENOHANDLES.
    746 	 */
    747 	ASSERT(mhp->mh_state == MHND_FREE);
    748 	mhp->mh_state = MHND_INIT;
    749 	*xmhp = mhp->mh_exthandle;
    750 	mutex_exit(&mhp->mh_mutex);
    751 	return (KPHYSM_OK);
    752 }
    753 
    754 static int
    755 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
    756 {
    757 	pfn_t e1, e2;
    758 
    759 	e1 = b1 + l1;
    760 	e2 = b2 + l2;
    761 
    762 	return (!(b2 >= e1 || b1 >= e2));
    763 }
    764 
/* Forward declaration; the definition appears later in this file. */
static int can_remove_pgs(pgcnt_t);
    766 
    767 static struct memdelspan *
    768 span_to_install(pfn_t base, pgcnt_t npgs)
    769 {
    770 	struct memdelspan *mdsp;
    771 	struct memdelspan *mdsp_new;
    772 	uint64_t address, size, thislen;
    773 	struct memlist *mlp;
    774 
    775 	mdsp_new = NULL;
    776 
    777 	address = (uint64_t)base << PAGESHIFT;
    778 	size = (uint64_t)npgs << PAGESHIFT;
    779 	while (size != 0) {
    780 		memlist_read_lock();
    781 		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
    782 			if (address >= (mlp->address + mlp->size))
    783 				continue;
    784 			if ((address + size) > mlp->address)
    785 				break;
    786 		}
    787 		if (mlp == NULL) {
    788 			address += size;
    789 			size = 0;
    790 			thislen = 0;
    791 		} else {
    792 			if (address < mlp->address) {
    793 				size -= (mlp->address - address);
    794 				address = mlp->address;
    795 			}
    796 			ASSERT(address >= mlp->address);
    797 			if ((address + size) > (mlp->address + mlp->size)) {
    798 				thislen = mlp->size - (address - mlp->address);
    799 			} else {
    800 				thislen = size;
    801 			}
    802 		}
    803 		memlist_read_unlock();
    804 		/* TODO: phys_install could change now */
    805 		if (thislen == 0)
    806 			continue;
    807 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
    808 		mdsp->mds_base = btop(address);
    809 		mdsp->mds_npgs = btop(thislen);
    810 		mdsp->mds_next = mdsp_new;
    811 		mdsp_new = mdsp;
    812 		address += thislen;
    813 		size -= thislen;
    814 	}
    815 	return (mdsp_new);
    816 }
    817 
    818 static void
    819 free_delspans(struct memdelspan *mdsp)
    820 {
    821 	struct memdelspan *amdsp;
    822 
    823 	while ((amdsp = mdsp) != NULL) {
    824 		mdsp = amdsp->mds_next;
    825 		kmem_free(amdsp, sizeof (struct memdelspan));
    826 	}
    827 }
    828 
    829 /*
    830  * Concatenate lists. No list ordering is required.
    831  */
    832 
    833 static void
    834 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
    835 {
    836 	while (*mdspp != NULL)
    837 		mdspp = &(*mdspp)->mds_next;
    838 
    839 	*mdspp = mdsp;
    840 }
    841 
    842 /*
    843  * Given a new list of delspans, check there is no overlap with
    844  * all existing span activity (add or delete) and then concatenate
    845  * the new spans to the given list.
    846  * Return 1 for OK, 0 if overlapping.
    847  */
    848 static int
    849 delspan_insert(
    850 	struct transit_list *my_tlp,
    851 	struct memdelspan *mdsp_new)
    852 {
    853 	struct transit_list_head *trh;
    854 	struct transit_list *tlp;
    855 	int ret;
    856 
    857 	trh = &transit_list_head;
    858 
    859 	ASSERT(my_tlp != NULL);
    860 	ASSERT(mdsp_new != NULL);
    861 
    862 	ret = 1;
    863 	mutex_enter(&trh->trh_lock);
    864 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
    865 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
    866 		struct memdelspan *mdsp;
    867 
    868 		for (mdsp = tlp->trl_spans; mdsp != NULL;
    869 		    mdsp = mdsp->mds_next) {
    870 			struct memdelspan *nmdsp;
    871 
    872 			for (nmdsp = mdsp_new; nmdsp != NULL;
    873 			    nmdsp = nmdsp->mds_next) {
    874 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
    875 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
    876 					ret = 0;
    877 					goto done;
    878 				}
    879 			}
    880 		}
    881 	}
    882 done:
    883 	if (ret != 0) {
    884 		if (my_tlp->trl_spans == NULL)
    885 			transit_list_insert(my_tlp);
    886 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
    887 	}
    888 	mutex_exit(&trh->trh_lock);
    889 	return (ret);
    890 }
    891 
    892 static void
    893 delspan_remove(
    894 	struct transit_list *my_tlp,
    895 	pfn_t base,
    896 	pgcnt_t npgs)
    897 {
    898 	struct transit_list_head *trh;
    899 	struct memdelspan *mdsp;
    900 
    901 	trh = &transit_list_head;
    902 
    903 	ASSERT(my_tlp != NULL);
    904 
    905 	mutex_enter(&trh->trh_lock);
    906 	if ((mdsp = my_tlp->trl_spans) != NULL) {
    907 		if (npgs == 0) {
    908 			my_tlp->trl_spans = NULL;
    909 			free_delspans(mdsp);
    910 			transit_list_remove(my_tlp);
    911 		} else {
    912 			struct memdelspan **prv;
    913 
    914 			prv = &my_tlp->trl_spans;
    915 			while (mdsp != NULL) {
    916 				pfn_t p_end;
    917 
    918 				p_end = mdsp->mds_base + mdsp->mds_npgs;
    919 				if (mdsp->mds_base >= base &&
    920 				    p_end <= (base + npgs)) {
    921 					*prv = mdsp->mds_next;
    922 					mdsp->mds_next = NULL;
    923 					free_delspans(mdsp);
    924 				} else {
    925 					prv = &mdsp->mds_next;
    926 				}
    927 				mdsp = *prv;
    928 			}
    929 			if (my_tlp->trl_spans == NULL)
    930 				transit_list_remove(my_tlp);
    931 		}
    932 	}
    933 	mutex_exit(&trh->trh_lock);
    934 }
    935 
    936 /*
    937  * Reserve interface for add to stop delete before add finished.
    938  * This list is only accessed through the delspan_insert/remove
    939  * functions and so is fully protected by the mutex in struct transit_list.
    940  */
    941 
    942 static struct transit_list reserve_transit;
    943 
    944 static int
    945 delspan_reserve(pfn_t base, pgcnt_t npgs)
    946 {
    947 	struct memdelspan *mdsp;
    948 	int ret;
    949 
    950 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
    951 	mdsp->mds_base = base;
    952 	mdsp->mds_npgs = npgs;
    953 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
    954 		free_delspans(mdsp);
    955 	}
    956 	return (ret);
    957 }
    958 
    959 static void
    960 delspan_unreserve(pfn_t base, pgcnt_t npgs)
    961 {
    962 	delspan_remove(&reserve_transit, base, npgs);
    963 }
    964 
    965 /*
    966  * Return whether memseg was created by kphysm_add_memory_dynamic().
    967  * If this is the case and startp non zero, return also the start pfn
    968  * of the meta data via startp.
    969  */
    970 static int
    971 memseg_is_dynamic(struct memseg *seg, pfn_t *startp)
    972 {
    973 	pfn_t		pt_start;
    974 
    975 	if ((seg->msegflags & MEMSEG_DYNAMIC) == 0)
    976 		return (0);
    977 
    978 	/* Meta data is required to be at the beginning */
    979 	ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base);
    980 
    981 	pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
    982 	if (startp != NULL)
    983 		*startp = pt_start;
    984 
    985 	return (1);
    986 }
    987 
    988 int
    989 kphysm_del_span(
    990 	memhandle_t handle,
    991 	pfn_t base,
    992 	pgcnt_t npgs)
    993 {
    994 	struct mem_handle *mhp;
    995 	struct memseg *seg;
    996 	struct memdelspan *mdsp;
    997 	struct memdelspan *mdsp_new;
    998 	pgcnt_t phys_pages, vm_pages;
    999 	pfn_t p_end;
   1000 	page_t *pp;
   1001 	int ret;
   1002 
   1003 	mhp = kphysm_lookup_mem_handle(handle);
   1004 	if (mhp == NULL) {
   1005 		return (KPHYSM_EHANDLE);
   1006 	}
   1007 	if (mhp->mh_state != MHND_INIT) {
   1008 		mutex_exit(&mhp->mh_mutex);
   1009 		return (KPHYSM_ESEQUENCE);
   1010 	}
   1011 
   1012 	/*
   1013 	 * Intersect the span with the installed memory list (phys_install).
   1014 	 */
   1015 	mdsp_new = span_to_install(base, npgs);
   1016 	if (mdsp_new == NULL) {
   1017 		/*
   1018 		 * No physical memory in this range. Is this an
   1019 		 * error? If an attempt to start the delete is made
   1020 		 * for OK returns from del_span such as this, start will
   1021 		 * return an error.
   1022 		 * Could return KPHYSM_ENOWORK.
   1023 		 */
   1024 		/*
   1025 		 * It is assumed that there are no error returns
   1026 		 * from span_to_install() due to kmem_alloc failure.
   1027 		 */
   1028 		mutex_exit(&mhp->mh_mutex);
   1029 		return (KPHYSM_OK);
   1030 	}
   1031 	/*
   1032 	 * Does this span overlap an existing span?
   1033 	 */
   1034 	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
   1035 		/*
   1036 		 * Differentiate between already on list for this handle
   1037 		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
   1038 		 */
   1039 		ret = KPHYSM_EBUSY;
   1040 		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   1041 		    mdsp = mdsp->mds_next) {
   1042 			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
   1043 			    base, npgs)) {
   1044 				ret = KPHYSM_EDUP;
   1045 				break;
   1046 			}
   1047 		}
   1048 		mutex_exit(&mhp->mh_mutex);
   1049 		free_delspans(mdsp_new);
   1050 		return (ret);
   1051 	}
   1052 	/*
   1053 	 * At this point the spans in mdsp_new have been inserted into the
   1054 	 * list of spans for this handle and thereby to the global list of
   1055 	 * spans being processed. Each of these spans must now be checked
   1056 	 * for relocatability. As a side-effect segments in the memseg list
   1057 	 * may be split.
   1058 	 *
   1059 	 * Note that mdsp_new can no longer be used as it is now part of
   1060 	 * a larger list. Select elements of this larger list based
   1061 	 * on base and npgs.
   1062 	 */
   1063 restart:
   1064 	phys_pages = 0;
   1065 	vm_pages = 0;
   1066 	ret = KPHYSM_OK;
   1067 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   1068 	    mdsp = mdsp->mds_next) {
   1069 		pgcnt_t pages_checked;
   1070 
   1071 		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
   1072 			continue;
   1073 		}
   1074 		p_end = mdsp->mds_base + mdsp->mds_npgs;
   1075 		/*
   1076 		 * The pages_checked count is a hack. All pages should be
   1077 		 * checked for relocatability. Those not covered by memsegs
   1078 		 * should be tested with arch_kphysm_del_span_ok().
   1079 		 */
   1080 		pages_checked = 0;
   1081 		for (seg = memsegs; seg; seg = seg->next) {
   1082 			pfn_t mseg_start;
   1083 
   1084 			if (seg->pages_base >= p_end ||
   1085 			    seg->pages_end <= mdsp->mds_base) {
   1086 				/* Span and memseg don't overlap. */
   1087 				continue;
   1088 			}
   1089 			/* Check that segment is suitable for delete. */
   1090 			if (memseg_is_dynamic(seg, &mseg_start)) {
   1091 				/*
   1092 				 * Can only delete whole added segments
   1093 				 * for the moment.
   1094 				 * Check that this is completely within the
   1095 				 * span.
   1096 </