Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/types.h>
     27 #include <sys/cmn_err.h>
     28 #include <sys/vmem.h>
     29 #include <sys/kmem.h>
     30 #include <sys/systm.h>
     31 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
     32 #include <sys/errno.h>
     33 #include <sys/memnode.h>
     34 #include <sys/memlist.h>
     35 #include <sys/memlist_impl.h>
     36 #include <sys/tuneable.h>
     37 #include <sys/proc.h>
     38 #include <sys/disp.h>
     39 #include <sys/debug.h>
     40 #include <sys/vm.h>
     41 #include <sys/callb.h>
     42 #include <sys/memlist_plat.h>	/* for installed_top_size() */
     43 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
     44 #include <sys/dumphdr.h>	/* for dump_resize() */
     45 #include <sys/atomic.h>		/* for use in stats collection */
     46 #include <sys/rwlock.h>
     47 #include <sys/cpuvar.h>
     48 #include <vm/seg_kmem.h>
     49 #include <vm/seg_kpm.h>
     50 #include <vm/page.h>
     51 #include <vm/vm_dep.h>
     52 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
     53 #include <sys/sunddi.h>
     54 #include <sys/mem_config.h>
     55 #include <sys/mem_cage.h>
     56 #include <sys/lgrp.h>
     57 #include <sys/ddi.h>
     58 #include <sys/modctl.h>
     59 
/*
 * Memory list to which kphysm_add_memory_dynamic() adds the usable
 * (non-metadata) part of newly added memory.
 */
extern struct memlist *phys_avail;

extern void mem_node_add(pfn_t, pfn_t);
extern void mem_node_del(pfn_t, pfn_t);

extern uint_t page_ctrs_adjust(int);
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);

static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);

/*
 * kphysm_add_memory_dynamic() reserves the span being added with
 * delspan_reserve() (a zero return is treated as a conflict) and drops
 * the reservation with delspan_unreserve() on every exit path.
 */
static int delspan_reserve(pfn_t, pgcnt_t);
static void delspan_unreserve(pfn_t, pgcnt_t);

/* Held by memseg_reuse() while it walks and edits memseg_va_avail. */
kmutex_t memseg_lists_lock;
/* List (linked through lnext) searched by memseg_reuse(). */
struct memseg *memseg_va_avail;
struct memseg *memseg_alloc(void);
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(struct memseg *);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

static struct kmem_cache *memseg_cache;
     86 
/*
 * Interfaces to manage externally allocated
 * page_t memory (metadata) for a memseg.
 *
 * The four memseg_*_meta functions are declared weak.
 * NOTE(review): presumably so that platforms which do not implement
 * them can leave them undefined -- confirm against the platform code.
 */
#pragma weak	memseg_alloc_meta
#pragma weak	memseg_free_meta
#pragma weak	memseg_get_metapfn
#pragma weak	memseg_remap_meta

extern int ppvm_enable;
/*
 * page_t table indexed by pfn; kphysm_add_memory_dynamic() computes
 * the page_t base of a metadata-allocated memseg as ppvm_base + pfn.
 */
extern page_t *ppvm_base;
extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *);
extern void memseg_free_meta(void *, pgcnt_t);
extern pfn_t memseg_get_metapfn(void *, pgcnt_t);
extern void memseg_remap_meta(struct memseg *);
static int memseg_is_dynamic(struct memseg *);
static int memseg_includes_meta(struct memseg *);
static pfn_t memseg_get_start(struct memseg *);
static void memseg_cpu_vm_flush(void);

/*
 * When non-zero, kphysm_add_memory_dynamic() first tries
 * memseg_alloc_meta() for the page_t storage and only falls back to
 * carving it out of the incoming memory if that fails.
 */
int meta_alloc_enable;
    108 
    109 /*
    110  * Add a chunk of memory to the system.
    111  * base: starting PAGESIZE page of new memory.
    112  * npgs: length in PAGESIZE pages.
    113  *
    114  * Adding mem this way doesn't increase the size of the hash tables;
    115  * growing them would be too hard.  This should be OK, but adding memory
    116  * dynamically most likely means more hash misses, since the tables will
    117  * be smaller than they otherwise would be.
    118  */
    119 #ifdef	DEBUG
    120 static int memseg_debug;
    121 #define	MEMSEG_DEBUG(args...) if (memseg_debug) printf(args)
    122 #else
    123 #define	MEMSEG_DEBUG(...)
    124 #endif
    125 
    126 int
    127 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
    128 {
    129 	page_t *pp;
    130 	page_t		*opp, *oepp, *segpp;
    131 	struct memseg	*seg;
    132 	uint64_t	avmem;
    133 	pfn_t		pfn;
    134 	pfn_t		pt_base = base;
    135 	pgcnt_t		tpgs = npgs;
    136 	pgcnt_t		metapgs = 0;
    137 	int		exhausted;
    138 	pfn_t		pnum;
    139 	int		mnode;
    140 	caddr_t		vaddr;
    141 	int		reuse;
    142 	int		mlret;
    143 	int		rv;
    144 	int		flags;
    145 	int		meta_alloc = 0;
    146 	void		*mapva;
    147 	void		*metabase = (void *)base;
    148 	pgcnt_t		nkpmpgs = 0;
    149 	offset_t	kpm_pages_off;
    150 
    151 	cmn_err(CE_CONT,
    152 	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
    153 	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);
    154 
    155 	/*
    156 	 * Add this span in the delete list to prevent interactions.
    157 	 */
    158 	if (!delspan_reserve(base, npgs)) {
    159 		return (KPHYSM_ESPAN);
    160 	}
    161 	/*
    162 	 * Check to see if any of the memory span has been added
    163 	 * by trying an add to the installed memory list. This
    164 	 * forms the interlocking process for add.
    165 	 */
    166 
    167 	memlist_write_lock();
    168 
    169 	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
    170 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
    171 
    172 	if (mlret == MEML_SPANOP_OK)
    173 		installed_top_size(phys_install, &physmax, &physinstalled);
    174 
    175 	memlist_write_unlock();
    176 
    177 	if (mlret != MEML_SPANOP_OK) {
    178 		if (mlret == MEML_SPANOP_EALLOC) {
    179 			delspan_unreserve(pt_base, tpgs);
    180 			return (KPHYSM_ERESOURCE);
    181 		} else if (mlret == MEML_SPANOP_ESPAN) {
    182 			delspan_unreserve(pt_base, tpgs);
    183 			return (KPHYSM_ESPAN);
    184 		} else {
    185 			delspan_unreserve(pt_base, tpgs);
    186 			return (KPHYSM_ERESOURCE);
    187 		}
    188 	}
    189 
    190 	if (meta_alloc_enable) {
    191 		/*
    192 		 * Allocate the page_t's from existing memory;
    193 		 * if that fails, allocate from the incoming memory.
    194 		 */
    195 		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
    196 		if (rv == KPHYSM_OK) {
    197 			ASSERT(metapgs);
    198 			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
    199 			meta_alloc = 1;
    200 			goto mapalloc;
    201 		}
    202 	}
    203 
    204 	/*
    205 	 * We store the page_t's for this new memory in the first
    206 	 * few pages of the chunk. Here, we go and get'em ...
    207 	 */
    208 
    209 	/*
    210 	 * The expression after the '-' gives the number of pages
    211 	 * that will fit in the new memory based on a requirement
    212 	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
    213 	 */
    214 	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
    215 	    (PAGESIZE + sizeof (page_t)));
    216 
    217 	npgs -= metapgs;
    218 	base += metapgs;
    219 
    220 	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
    221 
    222 	exhausted = (metapgs == 0 || npgs == 0);
    223 
    224 	if (kpm_enable && !exhausted) {
    225 		pgcnt_t start, end, nkpmpgs_prelim;
    226 		size_t	ptsz;
    227 
    228 		/*
    229 		 * A viable kpm large page mapping must not overlap two
    230 		 * dynamic memsegs. Therefore the total size is checked
    231 		 * to be at least kpm_pgsz and also whether start and end
    232 		 * points are at least kpm_pgsz aligned.
    233 		 */
    234 		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
    235 		    pmodkpmp(base + npgs)) {
    236 
    237 			kphysm_addmem_error_undospan(pt_base, tpgs);
    238 
    239 			/*
    240 			 * There is no specific error code for violating
    241 			 * kpm granularity constraints.
    242 			 */
    243 			return (KPHYSM_ENOTVIABLE);
    244 		}
    245 
    246 		start = kpmptop(ptokpmp(base));
    247 		end = kpmptop(ptokpmp(base + npgs));
    248 		nkpmpgs_prelim = ptokpmp(end - start);
    249 		ptsz = npgs * sizeof (page_t);
    250 		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
    251 		exhausted = (tpgs <= metapgs);
    252 		if (!exhausted) {
    253 			npgs = tpgs - metapgs;
    254 			base = pt_base + metapgs;
    255 
    256 			/* final nkpmpgs */
    257 			start = kpmptop(ptokpmp(base));
    258 			nkpmpgs = ptokpmp(end - start);
    259 			kpm_pages_off = ptsz +
    260 			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
    261 		}
    262 	}
    263 
    264 	/*
    265 	 * Is memory area supplied too small?
    266 	 */
    267 	if (exhausted) {
    268 		kphysm_addmem_error_undospan(pt_base, tpgs);
    269 		/*
    270 		 * There is no specific error code for 'too small'.
    271 		 */
    272 		return (KPHYSM_ERESOURCE);
    273 	}
    274 
    275 mapalloc:
    276 	/*
    277 	 * We may re-use a previously allocated VA space for the page_ts
    278 	 * eventually, but we need to initialize and lock the pages first.
    279 	 */
    280 
    281 	/*
    282 	 * Get an address in the kernel address map, map
    283 	 * the page_t pages and see if we can touch them.
    284 	 */
    285 
    286 	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
    287 	if (mapva == NULL) {
    288 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
    289 		    " Can't allocate VA for page_ts");
    290 
    291 		if (meta_alloc)
    292 			memseg_free_meta(metabase, metapgs);
    293 		kphysm_addmem_error_undospan(pt_base, tpgs);
    294 
    295 		return (KPHYSM_ERESOURCE);
    296 	}
    297 	pp = mapva;
    298 
    299 	if (physmax < (pt_base + tpgs))
    300 		physmax = (pt_base + tpgs);
    301 
    302 	/*
    303 	 * In the remapping code we map one page at a time so we must do
    304 	 * the same here to match mapping sizes.
    305 	 */
    306 	pfn = pt_base;
    307 	vaddr = (caddr_t)pp;
    308 	for (pnum = 0; pnum < metapgs; pnum++) {
    309 		if (meta_alloc)
    310 			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
    311 		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
    312 		    PROT_READ | PROT_WRITE,
    313 		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
    314 		pfn++;
    315 		vaddr += ptob(1);
    316 	}
    317 
    318 	if (ddi_peek32((dev_info_t *)NULL,
    319 	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {
    320 
    321 		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
    322 		    " Can't access pp array at 0x%p [phys 0x%lx]",
    323 		    (void *)pp, pt_base);
    324 
    325 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
    326 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
    327 
    328 		vmem_free(heap_arena, mapva, ptob(metapgs));
    329 		if (meta_alloc)
    330 			memseg_free_meta(metabase, metapgs);
    331 		kphysm_addmem_error_undospan(pt_base, tpgs);
    332 
    333 		return (KPHYSM_EFAULT);
    334 	}
    335 
    336 	/*
    337 	 * Add this memory slice to its memory node translation.
    338 	 *
    339 	 * Note that right now, each node may have only one slice;
    340 	 * this may change with COD or in larger SSM systems with
    341 	 * nested latency groups, so we must not assume that the
    342 	 * node does not yet exist.
    343 	 */
    344 	pnum = pt_base + tpgs - 1;
    345 	mem_node_add_range(pt_base, pnum);
    346 
    347 	/*
    348 	 * Allocate or resize page counters as necessary to accommodate
    349 	 * the increase in memory pages.
    350 	 */
    351 	mnode = PFN_2_MEM_NODE(pnum);
    352 	PAGE_CTRS_ADJUST(base, npgs, rv);
    353 	if (rv) {
    354 
    355 		mem_node_del_range(pt_base, pnum);
    356 
    357 		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
    358 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
    359 
    360 		vmem_free(heap_arena, mapva, ptob(metapgs));
    361 		if (meta_alloc)
    362 			memseg_free_meta(metabase, metapgs);
    363 		kphysm_addmem_error_undospan(pt_base, tpgs);
    364 
    365 		return (KPHYSM_ERESOURCE);
    366 	}
    367 
    368 	/*
    369 	 * Update the phys_avail memory list.
    370 	 * The phys_install list was done at the start.
    371 	 */
    372 
    373 	memlist_write_lock();
    374 
    375 	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
    376 	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
    377 	ASSERT(mlret == MEML_SPANOP_OK);
    378 
    379 	memlist_write_unlock();
    380 
    381 	/* See if we can find a memseg to re-use. */
    382 	if (meta_alloc) {
    383 		seg = memseg_reuse(0);
    384 		reuse = 1;	/* force unmapping of temp mapva */
    385 		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
    386 		/*
    387 		 * There is a 1:1 fixed relationship between a pfn
    388 		 * and a page_t VA.  The pfn is used as an index into
    389 		 * the ppvm_base page_t table in order to calculate
    390 		 * the page_t base address for a given pfn range.
    391 		 */
    392 		segpp = ppvm_base + base;
    393 	} else {
    394 		seg = memseg_reuse(metapgs);
    395 		reuse = (seg != NULL);
    396 		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
    397 		segpp = pp;
    398 	}
    399 
    400 	/*
    401 	 * Initialize the memseg structure representing this memory
    402 	 * and add it to the existing list of memsegs. Do some basic
    403 	 * initialization and add the memory to the system.
    404 	 * In order to prevent lock deadlocks, the add_physmem()
    405 	 * code is repeated here, but split into several stages.
    406 	 *
    407 	 * If a memseg is reused, invalidate memseg pointers in
    408 	 * all cpu vm caches.  We need to do this this since the check
    409 	 * 	pp >= seg->pages && pp < seg->epages
    410 	 * used in various places is not atomic and so the first compare
    411 	 * can happen before reuse and the second compare after reuse.
    412 	 * The invalidation ensures that a memseg is not deferenced while
    413 	 * it's page/pfn pointers are changing.
    414 	 */
    415 	if (seg == NULL) {
    416 		seg = memseg_alloc();
    417 		ASSERT(seg != NULL);
    418 		seg->msegflags = flags;
    419 		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
    420 		    (void *)seg, (void *)(seg->pages));
    421 		seg->pages = segpp;
    422 	} else {
    423 		ASSERT(seg->msegflags == flags);
    424 		ASSERT(seg->pages_base == seg->pages_end);
    425 		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
    426 		    (void *)seg, (void *)(seg->pages));
    427 		if (meta_alloc) {
    428 			memseg_cpu_vm_flush();
    429 			seg->pages = segpp;
    430 		}
    431 	}
    432 
    433 	seg->epages = seg->pages + npgs;
    434 	seg->pages_base = base;
    435 	seg->pages_end = base + npgs;
    436 
    437 	/*
    438 	 * Initialize metadata. The page_ts are set to locked state
    439 	 * ready to be freed.
    440 	 */
    441 	bzero((caddr_t)pp, ptob(metapgs));
    442 
    443 	pfn = seg->pages_base;
    444 	/* Save the original pp base in case we reuse a memseg. */
    445 	opp = pp;
    446 	oepp = opp + npgs;
    447 	for (pp = opp; pp < oepp; pp++) {
    448 		pp->p_pagenum = pfn;
    449 		pfn++;
    450 		page_iolock_init(pp);
    451 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
    452 			continue;
    453 		pp->p_offset = (u_offset_t)-1;
    454 	}
    455 
    456 	if (reuse) {
    457 		/* Remap our page_ts to the re-used memseg VA space. */
    458 		pfn = pt_base;
    459 		vaddr = (caddr_t)seg->pages;
    460 		for (pnum = 0; pnum < metapgs; pnum++) {
    461 			if (meta_alloc)
    462 				pfn = memseg_get_metapfn(metabase,
    463 				    (pgcnt_t)pnum);
    464 			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
    465 			    PROT_READ | PROT_WRITE,
    466 			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
    467 			pfn++;
    468 			vaddr += ptob(1);
    469 		}
    470 
    471 		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
    472 		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);
    473 
    474 		vmem_free(heap_arena, mapva, ptob(metapgs));
    475 	}
    476 
    477 	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);
    478 
    479 	memsegs_lock(1);
    480 
    481 	/*
    482 	 * The new memseg is inserted at the beginning of the list.
    483 	 * Not only does this save searching for the tail, but in the
    484 	 * case of a re-used memseg, it solves the problem of what
    485 	 * happens if some process has still got a pointer to the
    486 	 * memseg and follows the next pointer to continue traversing
    487 	 * the memsegs list.
    488 	 */
    489 
    490 	hat_kpm_addmem_mseg_insert(seg);
    491 
    492 	seg->next = memsegs;
    493 	membar_producer();
    494 
    495 	hat_kpm_addmem_memsegs_update(seg);
    496 
    497 	memsegs = seg;
    498 
    499 	build_pfn_hash();
    500 
    501 	total_pages += npgs;
    502 
    503 	/*
    504 	 * Recalculate the paging parameters now total_pages has changed.
    505 	 * This will also cause the clock hands to be reset before next use.
    506 	 */
    507 	setupclock(1);
    508 
    509 	memsegs_unlock(1);
    510 
    511 	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);
    512 
    513 	/*
    514 	 * Free the pages outside the lock to avoid locking loops.
    515 	 */
    516 	for (pp = seg->pages; pp < seg->epages; pp++) {
    517 		page_free(pp, 1);
    518 	}
    519 
    520 	/*
    521 	 * Now that we've updated the appropriate memory lists we
    522 	 * need to reset a number of globals, since we've increased memory.
    523 	 * Several have already been updated for us as noted above. The
    524 	 * globals we're interested in at this point are:
    525 	 *   physmax - highest page frame number.
    526 	 *   physinstalled - number of pages currently installed (done earlier)
    527 	 *   maxmem - max free pages in the system
    528 	 *   physmem - physical memory pages available
    529 	 *   availrmem - real memory available
    530 	 */
    531 
    532 	mutex_enter(&freemem_lock);
    533 	maxmem += npgs;
    534 	physmem += npgs;
    535 	availrmem += npgs;
    536 	availrmem_initial += npgs;
    537 
    538 	mutex_exit(&freemem_lock);
    539 
    540 	dump_resize();
    541 
    542 	page_freelist_coalesce_all(mnode);
    543 
    544 	kphysm_setup_post_add(npgs);
    545 
    546 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
    547 	    "(0x%" PRIx64 ")\n",
    548 	    physinstalled << (PAGESHIFT - 10),
    549 	    (uint64_t)physinstalled << PAGESHIFT);
    550 
    551 	avmem = (uint64_t)freemem << PAGESHIFT;
    552 	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
    553 	    "avail mem = %" PRId64 "\n", avmem);
    554 
    555 	/*
    556 	 * Update lgroup generation number on single lgroup systems
    557 	 */
    558 	if (nlgrps == 1)
    559 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
    560 
    561 	delspan_unreserve(pt_base, tpgs);
    562 	return (KPHYSM_OK);		/* Successfully added system memory */
    563 
    564 }
    565 
    566 /*
    567  * There are various error conditions in kphysm_add_memory_dynamic()
    568  * which require a rollback of already changed global state.
    569  */
    570 static void
    571 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
    572 {
    573 	int mlret;
    574 
    575 	/* Unreserve memory span. */
    576 	memlist_write_lock();
    577 
    578 	mlret = memlist_delete_span(
    579 	    (uint64_t)(pt_base) << PAGESHIFT,
    580 	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);
    581 
    582 	ASSERT(mlret == MEML_SPANOP_OK);
    583 	phys_install_has_changed();
    584 	installed_top_size(phys_install, &physmax, &physinstalled);
    585 
    586 	memlist_write_unlock();
    587 	delspan_unreserve(pt_base, tpgs);
    588 }
    589 
    590 /*
    591  * Only return an available memseg of exactly the right size
    592  * if size is required.
    593  * When the meta data area has it's own virtual address space
    594  * we will need to manage this more carefully and do best fit
    595  * allocations, possibly splitting an available area.
    596  */
    597 struct memseg *
    598 memseg_reuse(pgcnt_t metapgs)
    599 {
    600 	int type;
    601 	struct memseg **segpp, *seg;
    602 
    603 	mutex_enter(&memseg_lists_lock);
    604 
    605 	segpp = &memseg_va_avail;
    606 	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
    607 		caddr_t end;
    608 
    609 		/*
    610 		 * Make sure we are reusing the right segment type.
    611 		 */
    612 		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;
    613 
    614 		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
    615 		    != type)
    616 			continue;
    617 
    618 		if (kpm_enable)
    619 			end = hat_kpm_mseg_reuse(seg);
    620 		else
    621 			end = (caddr_t)seg->epages;
    622 
    623 		/*
    624 		 * Check for the right size if it is provided.
    625 		 */
    626 		if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
    627 			*segpp = seg->lnext;
    628 			seg->lnext = NULL;
    629 			break;
    630 		}
    631 	}
    632 	mutex_exit(&memseg_lists_lock);
    633 
    634 	return (seg);
    635 }
    636 
/*
 * Source of external handle values; incremented under
 * mem_handle_list_mutex in kphysm_allocate_mem_handle().
 */
static uint_t handle_gen;

/*
 * One span of pages, [mds_base, mds_base + mds_npgs), on a singly
 * linked list (mds_next).
 */
struct memdelspan {
	struct memdelspan *mds_next;
	pfn_t		mds_base;
	pgcnt_t		mds_npgs;
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

/* Number of bits in one bitmap word (uint_t). */
#define	NBPBMW		(sizeof (uint_t) * NBBY)
/*
 * Size in bytes of a bitmap holding one bit per page of the span,
 * rounded up to a whole number of uint_t words.
 */
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))

/*
 * A list of spans belonging to one add or delete operation; linked
 * onto the global transit list through trl_next.
 */
struct transit_list {
	struct transit_list	*trl_next;
	struct memdelspan	*trl_spans;
	int			trl_collect;
};

/*
 * Head of the global transit list.  trh_lock is held while the list
 * and the span lists hanging off it are walked or changed.
 */
struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;

struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);
    668 
/*
 * Memory delete statistics are compiled in for DEBUG builds only.
 * Without MEM_DEL_STATS every MDSTAT_*() macro expands to nothing.
 */
#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
/*
 * Per-handle counters (embedded in struct mem_handle as mh_delstat).
 * The meaning of each counter is defined by its MDSTAT_INCR() call
 * sites.
 */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
/* Increment counter FLD of handle MHP. */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
/* Add ntck to the nticks_total / nticks_pgrp accumulators of MHP. */
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */
    723 
/*
 * Handle states.  MHND_FREE is 0, so a zero-filled handle (as returned
 * by kphysm_allocate_mem_handle()) starts out FREE.  Handles in the
 * FREE state are skipped by kphysm_lookup_mem_handle().
 */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 *
 * mh_next links the handle onto the list headed by mem_handle_head;
 * mh_exthandle is the value handed out to callers and used to look the
 * handle up again.
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;
	memhandle_t	mh_exthandle;
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

/*
 * List of all handles, and the mutex protecting the list and
 * handle_gen.  Where both are held, mem_handle_list_mutex is taken
 * before a handle's mh_mutex.
 */
static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;
    755 
    756 static struct mem_handle *
    757 kphysm_allocate_mem_handle()
    758 {
    759 	struct mem_handle *mhp;
    760 
    761 	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
    762 	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
    763 	mutex_enter(&mem_handle_list_mutex);
    764 	mutex_enter(&mhp->mh_mutex);
    765 	/* handle_gen is protected by list mutex. */
    766 	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
    767 	mhp->mh_next = mem_handle_head;
    768 	mem_handle_head = mhp;
    769 	mutex_exit(&mem_handle_list_mutex);
    770 
    771 	return (mhp);
    772 }
    773 
    774 static void
    775 kphysm_free_mem_handle(struct mem_handle *mhp)
    776 {
    777 	struct mem_handle **mhpp;
    778 
    779 	ASSERT(mutex_owned(&mhp->mh_mutex));
    780 	ASSERT(mhp->mh_state == MHND_FREE);
    781 	/*
    782 	 * Exit the mutex to preserve locking order. This is OK
    783 	 * here as once in the FREE state, the handle cannot
    784 	 * be found by a lookup.
    785 	 */
    786 	mutex_exit(&mhp->mh_mutex);
    787 
    788 	mutex_enter(&mem_handle_list_mutex);
    789 	mhpp = &mem_handle_head;
    790 	while (*mhpp != NULL && *mhpp != mhp)
    791 		mhpp = &(*mhpp)->mh_next;
    792 	ASSERT(*mhpp == mhp);
    793 	/*
    794 	 * No need to lock the handle (mh_mutex) as only
    795 	 * mh_next changing and this is the only thread that
    796 	 * can be referncing mhp.
    797 	 */
    798 	*mhpp = mhp->mh_next;
    799 	mutex_exit(&mem_handle_list_mutex);
    800 
    801 	mutex_destroy(&mhp->mh_mutex);
    802 	kmem_free(mhp, sizeof (struct mem_handle));
    803 }
    804 
    805 /*
    806  * This function finds the internal mem_handle corresponding to an
    807  * external handle and returns it with the mh_mutex held.
    808  */
    809 static struct mem_handle *
    810 kphysm_lookup_mem_handle(memhandle_t handle)
    811 {
    812 	struct mem_handle *mhp;
    813 
    814 	mutex_enter(&mem_handle_list_mutex);
    815 	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
    816 		if (mhp->mh_exthandle == handle) {
    817 			mutex_enter(&mhp->mh_mutex);
    818 			/*
    819 			 * The state of the handle could have been changed
    820 			 * by kphysm_del_release() while waiting for mh_mutex.
    821 			 */
    822 			if (mhp->mh_state == MHND_FREE) {
    823 				mutex_exit(&mhp->mh_mutex);
    824 				continue;
    825 			}
    826 			break;
    827 		}
    828 	}
    829 	mutex_exit(&mem_handle_list_mutex);
    830 	return (mhp);
    831 }
    832 
    833 int
    834 kphysm_del_gethandle(memhandle_t *xmhp)
    835 {
    836 	struct mem_handle *mhp;
    837 
    838 	mhp = kphysm_allocate_mem_handle();
    839 	/*
    840 	 * The handle is allocated using KM_SLEEP, so cannot fail.
    841 	 * If the implementation is changed, the correct error to return
    842 	 * here would be KPHYSM_ENOHANDLES.
    843 	 */
    844 	ASSERT(mhp->mh_state == MHND_FREE);
    845 	mhp->mh_state = MHND_INIT;
    846 	*xmhp = mhp->mh_exthandle;
    847 	mutex_exit(&mhp->mh_mutex);
    848 	return (KPHYSM_OK);
    849 }
    850 
    851 static int
    852 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
    853 {
    854 	pfn_t e1, e2;
    855 
    856 	e1 = b1 + l1;
    857 	e2 = b2 + l2;
    858 
    859 	return (!(b2 >= e1 || b1 >= e2));
    860 }
    861 
    862 static int can_remove_pgs(pgcnt_t);
    863 
    864 static struct memdelspan *
    865 span_to_install(pfn_t base, pgcnt_t npgs)
    866 {
    867 	struct memdelspan *mdsp;
    868 	struct memdelspan *mdsp_new;
    869 	uint64_t address, size, thislen;
    870 	struct memlist *mlp;
    871 
    872 	mdsp_new = NULL;
    873 
    874 	address = (uint64_t)base << PAGESHIFT;
    875 	size = (uint64_t)npgs << PAGESHIFT;
    876 	while (size != 0) {
    877 		memlist_read_lock();
    878 		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
    879 			if (address >= (mlp->address + mlp->size))
    880 				continue;
    881 			if ((address + size) > mlp->address)
    882 				break;
    883 		}
    884 		if (mlp == NULL) {
    885 			address += size;
    886 			size = 0;
    887 			thislen = 0;
    888 		} else {
    889 			if (address < mlp->address) {
    890 				size -= (mlp->address - address);
    891 				address = mlp->address;
    892 			}
    893 			ASSERT(address >= mlp->address);
    894 			if ((address + size) > (mlp->address + mlp->size)) {
    895 				thislen = mlp->size - (address - mlp->address);
    896 			} else {
    897 				thislen = size;
    898 			}
    899 		}
    900 		memlist_read_unlock();
    901 		/* TODO: phys_install could change now */
    902 		if (thislen == 0)
    903 			continue;
    904 		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
    905 		mdsp->mds_base = btop(address);
    906 		mdsp->mds_npgs = btop(thislen);
    907 		mdsp->mds_next = mdsp_new;
    908 		mdsp_new = mdsp;
    909 		address += thislen;
    910 		size -= thislen;
    911 	}
    912 	return (mdsp_new);
    913 }
    914 
    915 static void
    916 free_delspans(struct memdelspan *mdsp)
    917 {
    918 	struct memdelspan *amdsp;
    919 
    920 	while ((amdsp = mdsp) != NULL) {
    921 		mdsp = amdsp->mds_next;
    922 		kmem_free(amdsp, sizeof (struct memdelspan));
    923 	}
    924 }
    925 
    926 /*
    927  * Concatenate lists. No list ordering is required.
    928  */
    929 
    930 static void
    931 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
    932 {
    933 	while (*mdspp != NULL)
    934 		mdspp = &(*mdspp)->mds_next;
    935 
    936 	*mdspp = mdsp;
    937 }
    938 
    939 /*
    940  * Given a new list of delspans, check there is no overlap with
    941  * all existing span activity (add or delete) and then concatenate
    942  * the new spans to the given list.
    943  * Return 1 for OK, 0 if overlapping.
    944  */
    945 static int
    946 delspan_insert(
    947 	struct transit_list *my_tlp,
    948 	struct memdelspan *mdsp_new)
    949 {
    950 	struct transit_list_head *trh;
    951 	struct transit_list *tlp;
    952 	int ret;
    953 
    954 	trh = &transit_list_head;
    955 
    956 	ASSERT(my_tlp != NULL);
    957 	ASSERT(mdsp_new != NULL);
    958 
    959 	ret = 1;
    960 	mutex_enter(&trh->trh_lock);
    961 	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
    962 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
    963 		struct memdelspan *mdsp;
    964 
    965 		for (mdsp = tlp->trl_spans; mdsp != NULL;
    966 		    mdsp = mdsp->mds_next) {
    967 			struct memdelspan *nmdsp;
    968 
    969 			for (nmdsp = mdsp_new; nmdsp != NULL;
    970 			    nmdsp = nmdsp->mds_next) {
    971 				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
    972 				    nmdsp->mds_base, nmdsp->mds_npgs)) {
    973 					ret = 0;
    974 					goto done;
    975 				}
    976 			}
    977 		}
    978 	}
    979 done:
    980 	if (ret != 0) {
    981 		if (my_tlp->trl_spans == NULL)
    982 			transit_list_insert(my_tlp);
    983 		delspan_concat(&my_tlp->trl_spans, mdsp_new);
    984 	}
    985 	mutex_exit(&trh->trh_lock);
    986 	return (ret);
    987 }
    988 
    989 static void
    990 delspan_remove(
    991 	struct transit_list *my_tlp,
    992 	pfn_t base,
    993 	pgcnt_t npgs)
    994 {
    995 	struct transit_list_head *trh;
    996 	struct memdelspan *mdsp;
    997 
    998 	trh = &transit_list_head;
    999 
   1000 	ASSERT(my_tlp != NULL);
   1001 
   1002 	mutex_enter(&trh->trh_lock);
   1003 	if ((mdsp = my_tlp->trl_spans) != NULL) {
   1004 		if (npgs == 0) {
   1005 			my_tlp->trl_spans = NULL;
   1006 			free_delspans(mdsp);
   1007 			transit_list_remove(my_tlp);
   1008 		} else {
   1009 			struct memdelspan **prv;
   1010 
   1011 			prv = &my_tlp->trl_spans;
   1012 			while (mdsp != NULL) {
   1013 				pfn_t p_end;
   1014 
   1015 				p_end = mdsp->mds_base + mdsp->mds_npgs;
   1016 				if (mdsp->mds_base >= base &&
   1017 				    p_end <= (base + npgs)) {
   1018 					*prv = mdsp->mds_next;
   1019 					mdsp->mds_next = NULL;
   1020 					free_delspans(mdsp);
   1021 				} else {
   1022 					prv = &mdsp->mds_next;
   1023 				}
   1024 				mdsp = *prv;
   1025 			}
   1026 			if (my_tlp->trl_spans == NULL)
   1027 				transit_list_remove(my_tlp);
   1028 		}
   1029 	}
   1030 	mutex_exit(&trh->trh_lock);
   1031 }
   1032 
/*
 * Reserve interface for add to stop delete before add finished.
 * This list is only accessed through the delspan_insert/remove
 * functions and so is fully protected by the mutex (trh_lock) in
 * struct transit_list_head.
 */
   1038 
   1039 static struct transit_list reserve_transit;
   1040 
   1041 static int
   1042 delspan_reserve(pfn_t base, pgcnt_t npgs)
   1043 {
   1044 	struct memdelspan *mdsp;
   1045 	int ret;
   1046 
   1047 	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
   1048 	mdsp->mds_base = base;
   1049 	mdsp->mds_npgs = npgs;
   1050 	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
   1051 		free_delspans(mdsp);
   1052 	}
   1053 	return (ret);
   1054 }
   1055 
   1056 static void
   1057 delspan_unreserve(pfn_t base, pgcnt_t npgs)
   1058 {
   1059 	delspan_remove(&reserve_transit, base, npgs);
   1060 }
   1061 
   1062 /*
   1063  * Return whether memseg was created by kphysm_add_memory_dynamic().
   1064  */
   1065 static int
   1066 memseg_is_dynamic(struct memseg *seg)
   1067 {
   1068 	return (seg->msegflags & MEMSEG_DYNAMIC);
   1069 }
   1070 
/*
 * Add the span [base, base + npgs) to the set of spans to be deleted
 * under the given delete handle.
 *
 * The span is first intersected with phys_install and the resulting
 * pieces are put on the handle's transit list.  Each piece is then
 * checked against the memseg list: memsegs that extend beyond a piece
 * are split (or cause a failure if they include their own metadata),
 * and every page of each covered memseg is tested with PP_ISNORELOC().
 * On success the handle's mh_phys_pages and mh_vm_pages totals are
 * increased; on failure the spans added by this call are removed again.
 *
 * Returns:
 *	KPHYSM_OK		span accepted, or no installed memory in
 *				the range
 *	KPHYSM_EHANDLE		handle lookup failed
 *	KPHYSM_ESEQUENCE	handle is not in state MHND_INIT
 *	KPHYSM_EDUP		span overlaps one already on this handle
 *	KPHYSM_EBUSY		span overlaps other span activity, or a
 *				memseg that includes its metadata is not
 *				completely within the span
 *	KPHYSM_ERESOURCE	a required memseg split failed
 *	KPHYSM_ENONRELOC	a non-relocatable page was found, or the
 *				span is not fully covered by memsegs
 */
int
kphysm_del_span(
	memhandle_t handle,
	pfn_t base,
	pgcnt_t npgs)
{
	struct mem_handle *mhp;
	struct memseg *seg;
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	pgcnt_t phys_pages, vm_pages;
	pfn_t p_end;
	page_t *pp;
	int ret;

	/*
	 * NOTE(review): every return path after a successful lookup
	 * drops mh_mutex, so the lookup presumably returns with that
	 * mutex held - confirm against kphysm_lookup_mem_handle().
	 */
	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	/* Spans may only be added before the delete has been started. */
	if (mhp->mh_state != MHND_INIT) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	}

	/*
	 * Intersect the span with the installed memory list (phys_install).
	 */
	mdsp_new = span_to_install(base, npgs);
	if (mdsp_new == NULL) {
		/*
		 * No physical memory in this range. Is this an
		 * error? If an attempt to start the delete is made
		 * for OK returns from del_span such as this, start will
		 * return an error.
		 * Could return KPHYSM_ENOWORK.
		 */
		/*
		 * It is assumed that there are no error returns
		 * from span_to_install() due to kmem_alloc failure.
		 */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_OK);
	}
	/*
	 * Does this span overlap an existing span?
	 */
	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
		/*
		 * Differentiate between already on list for this handle
		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
		 */
		ret = KPHYSM_EBUSY;
		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
			    base, npgs)) {
				ret = KPHYSM_EDUP;
				break;
			}
		}
		mutex_exit(&mhp->mh_mutex);
		/* The insert was refused, so the new spans are still ours. */
		free_delspans(mdsp_new);
		return (ret);
	}
	/*
	 * At this point the spans in mdsp_new have been inserted into the
	 * list of spans for this handle and thereby to the global list of
	 * spans being processed. Each of these spans must now be checked
	 * for relocatability. As a side-effect segments in the memseg list
	 * may be split.
	 *
	 * Note that mdsp_new can no longer be used as it is now part of
	 * a larger list. Select elements of this larger list based
	 * on base and npgs.
	 */
restart:
	/* The totals are recomputed from scratch after every restart. */
	phys_pages = 0;
	vm_pages = 0;
	ret = KPHYSM_OK;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pgcnt_t pages_checked;

		/* Only look at the spans that belong to this request. */
		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
			continue;
		}
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		/*
		 * The pages_checked count is a hack. All pages should be
		 * checked for relocatability. Those not covered by memsegs
		 * should be tested with arch_kphysm_del_span_ok().
		 */
		pages_checked = 0;
		for (seg = memsegs; seg; seg = seg->next) {
			pfn_t mseg_start;

			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				continue;
			}
			mseg_start = memseg_get_start(seg);
			/* Check that segment is suitable for delete. */
			if (memseg_includes_meta(seg)) {
				/*
				 * Check that this segment is completely
				 * within the span.
				 */
				if (mseg_start < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					ret = KPHYSM_EBUSY;
					break;
				}
				pages_checked += seg->pages_end - mseg_start;
			} else {
				/*
				 * If this segment is larger than the span,
				 * try to split it. After the split, it
				 * is necessary to restart.
				 */
				if (seg->pages_base < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					pfn_t abase;
					pgcnt_t anpgs;
					int s_ret;

					/* Split required.  */
					/*
					 * abase/anpgs describe the part of
					 * the memseg that lies inside the
					 * span.
					 */
					if (mdsp->mds_base < seg->pages_base)
						abase = seg->pages_base;
					else
						abase = mdsp->mds_base;
					if (p_end > seg->pages_end)
						anpgs = seg->pages_end - abase;
					else
						anpgs = p_end - abase;
					s_ret = kphysm_split_memseg(abase,
					    anpgs);
					if (s_ret == 0) {
						/* Split failed. */
						ret = KPHYSM_ERESOURCE;
						break;
					}
					goto restart;
				}
				pages_checked +=
				    seg->pages_end - seg->pages_base;
			}
			/*
			 * The memseg is wholly within the delete span.
			 * The individual pages can now be checked.
			 */
			/* Cage test. */
			for (pp = seg->pages; pp < seg->epages; pp++) {
				if (PP_ISNORELOC(pp)) {
					ret = KPHYSM_ENONRELOC;
					break;
				}
			}
			if (ret != KPHYSM_OK) {
				break;
			}
			phys_pages += (seg->pages_end - mseg_start);
			vm_pages += MSEG_NPAGES(seg);
		}
		if (ret != KPHYSM_OK)
			break;
		/* Any part of the span not covered by a memseg is refused. */
		if (pages_checked != mdsp->mds_npgs) {
			ret = KPHYSM_ENONRELOC;
			break;
		}
	}

	if (ret == KPHYSM_OK) {
		mhp->mh_phys_pages += phys_pages;
		mhp->mh_vm_pages += vm_pages;
	} else {
		/*
		 * Keep holding the mh_mutex to prevent it going away.
		 */
		delspan_remove(&mhp->mh_transit, base, npgs);
	}
	mutex_exit(&mhp->mh_mutex);
	return (ret);
}
   1255 
/*
 * Report on the relocatability of the span [base, base + npgs) without
 * changing any delete state.
 *
 * The span is intersected with phys_install and each resulting piece is
 * walked from low to high pfn.  Areas not covered by a memseg are
 * checked with arch_kphysm_del_span_ok(); pages covered by a memseg are
 * checked individually with PP_ISNORELOC().
 *
 * On return *mqp holds:
 *	phys_pages		total pages of installed memory in the span
 *	managed			number of pages examined through a page_t
 *	nonrelocatable		number of pages found non-relocatable
 *	first_nonrelocatable	pfn of the first such page (0 if none)
 *	last_nonrelocatable	pfn of the last such page (0 if none)
 *
 * Always returns KPHYSM_OK.
 */
int
kphysm_del_span_query(
	pfn_t base,
	pgcnt_t npgs,
	memquery_t *mqp)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	int done_first_nonreloc;

	mqp->phys_pages = 0;
	mqp->managed = 0;
	mqp->nonrelocatable = 0;
	mqp->first_nonrelocatable = 0;
	mqp->last_nonrelocatable = 0;

	mdsp_new = span_to_install(base, npgs);
	/*
	 * It is OK to proceed here if mdsp_new == NULL.
	 */
	done_first_nonreloc = 0;
	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
		pfn_t sbase;
		pgcnt_t snpgs;

		mqp->phys_pages += mdsp->mds_npgs;
		/*
		 * sbase/snpgs track the part of this span that has not
		 * been accounted for yet.
		 */
		sbase = mdsp->mds_base;
		snpgs = mdsp->mds_npgs;
		while (snpgs != 0) {
			struct memseg *lseg, *seg;
			pfn_t p_end;
			page_t *pp;
			pfn_t mseg_start;

			p_end = sbase + snpgs;
			/*
			 * Find the lowest addressed memseg that starts
			 * after sbase and account for it.
			 * This is to catch dynamic memsegs whose start
			 * is hidden.
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				mseg_start = memseg_get_start(seg);
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 */
			/* mseg_start is only read here when seg != NULL. */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				/*
				 * Page-by-page test of the gap; this loop
				 * is skipped when the coarse test above
				 * has already advanced sbase to a_end.
				 */
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						/* Span ends inside that area. */
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	/* The span list was only needed for the duration of the query. */
	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}
   1407 
   1408 /*
   1409  * This release function can be called at any stage as follows:
   1410  *	_gethandle only called
   1411  *	_span(s) only called
   1412  *	_start called but failed
   1413  *	delete thread exited
   1414  */
   1415 int
   1416 kphysm_del_release(memhandle_t handle)
   1417 {
   1418 	struct mem_handle *mhp;
   1419 
   1420 	mhp = kphysm_lookup_mem_handle(handle);
   1421 	if (mhp == NULL) {
   1422 		return (KPHYSM_EHANDLE);
   1423 	}
   1424 	switch (mhp->mh_state) {
   1425 	case MHND_STARTING:
   1426 	case MHND_RUNNING:
   1427 		mutex_exit(&mhp->mh_mutex);
   1428 		return (KPHYSM_ENOTFINISHED);
   1429 	case MHND_FREE:
   1430 		ASSERT(mhp->mh_state != MHND_FREE);
   1431 		mutex_exit(&mhp->mh_mutex);
   1432 		return (KPHYSM_EHANDLE);
   1433 	case MHND_INIT:
   1434 		break;
   1435 	case MHND_DONE:
   1436 		break;
   1437 	case MHND_RELEASE:
   1438 		mutex_exit(&mhp->mh_mutex);
   1439 		return (KPHYSM_ESEQUENCE);
   1440 	default:
   1441 #ifdef DEBUG
   1442 		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
   1443 		    (void *)mhp, mhp->mh_state);
   1444 #endif /* DEBUG */
   1445 		mutex_exit(&mhp->mh_mutex);
   1446 		return (KPHYSM_EHANDLE);
   1447 	}
   1448 	/*
   1449 	 * Set state so that we can wait if necessary.
   1450 	 * Also this means that we have read/write access to all
   1451 	 * fields except mh_exthandle and mh_state.
   1452 	 */
   1453 	mhp->mh_state = MHND_RELEASE;
   1454 	/*
   1455 	 * The mem_handle cannot be de-allocated by any other operation
   1456 	 * now, so no need to hold mh_mutex.
   1457 	 */
   1458 	mutex_exit(&mhp->mh_mutex);
   1459 
   1460 	delspan_remove(&mhp->mh_transit, 0, 0);
   1461 	mhp->mh_phys_pages = 0;
   1462 	mhp->mh_vm_pages = 0;
   1463 	mhp->mh_hold_todo = 0;
   1464 	mhp->mh_delete_complete = NULL;
   1465 	mhp->mh_delete_complete_arg = NULL;
   1466 	mhp->mh_cancel = 0;
   1467 
   1468 	mutex_enter(&mhp->mh_mutex);
   1469 	ASSERT(mhp->mh_state == MHND_RELEASE);
   1470 	mhp->mh_state = MHND_FREE;
   1471 
   1472 	kphysm_free_mem_handle(mhp);
   1473 
   1474 	return (KPHYSM_OK);
   1475 }
   1476 
   1477 /*
   1478  * This cancel function can only be called with the thread running.
   1479  */
   1480 int
   1481 kphysm_del_cancel(memhandle_t handle)
   1482 {
   1483 	struct mem_handle *mhp;
   1484 
   1485 	mhp = kphysm_lookup_mem_handle(handle);
   1486 	if (mhp == NULL) {
   1487 		return (KPHYSM_EHANDLE);
   1488 	}
   1489 	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
   1490 		mutex_exit(&mhp->mh_mutex);
   1491 		return (KPHYSM_ENOTRUNNING);
   1492 	}
   1493 	/*
   1494 	 * Set the cancel flag and wake the delete thread up.
   1495 	 * The thread may be waiting on I/O, so the effect of the cancel
   1496 	 * may be delayed.
   1497 	 */
   1498 	if (mhp->mh_cancel == 0) {
   1499 		mhp->mh_cancel = KPHYSM_ECANCELLED;
   1500 		cv_signal(&mhp->mh_cv);
   1501 	}
   1502 	mutex_exit(&mhp->mh_mutex);
   1503 	return (KPHYSM_OK);
   1504 }
   1505 
   1506 int
   1507 kphysm_del_status(
   1508 	memhandle_t handle,
   1509 	memdelstat_t *mdstp)
   1510 {
   1511 	struct mem_handle *mhp;
   1512 
   1513 	mhp = kphysm_lookup_mem_handle(handle);
   1514 	if (mhp == NULL) {
   1515 		return (KPHYSM_EHANDLE);
   1516 	}
   1517 	/*
   1518 	 * Calling kphysm_del_status() is allowed before the delete
   1519 	 * is started to allow for status display.
   1520 	 */
   1521 	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
   1522 	    mhp->mh_state != MHND_RUNNING) {
   1523 		mutex_exit(&mhp->mh_mutex);
   1524 		return (KPHYSM_ENOTRUNNING);
   1525 	}
   1526 	mdstp->phys_pages = mhp->mh_phys_pages;
   1527 	mdstp->managed = mhp->mh_vm_pages;
   1528 	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
   1529 	mutex_exit(&mhp->mh_mutex);
   1530 	return (KPHYSM_OK);
   1531 }
   1532 
   1533 static int mem_delete_additional_pages = 100;
   1534 
   1535 static int
   1536 can_remove_pgs(pgcnt_t npgs)
   1537 {
   1538 	/*
   1539 	 * If all pageable pages were paged out, freemem would
   1540 	 * equal availrmem.  There is a minimum requirement for
   1541 	 * availrmem.
   1542 	 */
   1543 	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
   1544 	    < npgs)
   1545 		return (0);
   1546 	/* TODO: check swap space, etc. */
   1547 	return (1);
   1548 }
   1549 
   1550 static int
   1551 get_availrmem(pgcnt_t npgs)
   1552 {
   1553 	int ret;
   1554 
   1555 	mutex_enter(&freemem_lock);
   1556 	ret = can_remove_pgs(npgs);
   1557 	if (ret != 0)
   1558 		availrmem -= npgs;
   1559 	mutex_exit(&freemem_lock);
   1560 	return (ret);
   1561 }
   1562 
   1563 static void
   1564 put_availrmem(pgcnt_t npgs)
   1565 {
   1566 	mutex_enter(&freemem_lock);
   1567 	availrmem += npgs;
   1568 	mutex_exit(&freemem_lock);
   1569 }
   1570 
/*
 * freemem_incr is the largest number of pages that delthr_get_freemem()
 * asks for in one request; FREEMEM_INCR is its initial value.
 */
#define	FREEMEM_INCR	100
static pgcnt_t freemem_incr = FREEMEM_INCR;
/*
 * Timeout, in ticks, used by delthr_get_freemem() while waiting for
 * free memory: hz / DEL_FREE_WAIT_FRAC, rounded up.
 * NOTE(review): hz is presumably the number of clock ticks per second,
 * which would make this a quarter of a second - confirm.
 */
#define	DEL_FREE_WAIT_FRAC	4
#define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)

/*
 * A shorter interval, hz / DEL_BUSY_WAIT_FRAC rounded up.
 */
#define	DEL_BUSY_WAIT_FRAC	20
#define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)

/* Forward declarations for routines defined later in this file. */
static void kphysm_del_cleanup(struct mem_handle *);

static void page_delete_collect(page_t *, struct mem_handle *);
   1582 
   1583 static pgcnt_t
   1584 delthr_get_freemem(struct mem_handle *mhp)
   1585 {
   1586 	pgcnt_t free_get;
   1587 	int ret;
   1588 
   1589 	ASSERT(MUTEX_HELD(&mhp->mh_mutex));
   1590 
   1591 	MDSTAT_INCR(mhp, need_free);
   1592 	/*
   1593 	 * Get up to freemem_incr pages.
   1594 	 */
   1595 	free_get = freemem_incr;
   1596 	if (free_get > mhp->mh_hold_todo)
   1597 		free_get = mhp->mh_hold_todo;
   1598 	/*
   1599 	 * Take free_get pages away from freemem,
   1600 	 * waiting if necessary.
   1601 	 */
   1602 
   1603 	while (!mhp->mh_cancel) {
   1604 		mutex_exit(&mhp->mh_mutex);
   1605 		MDSTAT_INCR(mhp, free_loop);
   1606 		/*
   1607 		 * Duplicate test from page_create_throttle()
   1608 		 * but don't override with !PG_WAIT.
   1609 		 */
   1610 		if (freemem < (free_get + throttlefree)) {
   1611 			MDSTAT_INCR(mhp, free_low);
   1612 			ret = 0;
   1613 		} else {
   1614 			ret = page_create_wait(free_get, 0);
   1615 			if (ret == 0) {
   1616 				/* EMPTY */
   1617 				MDSTAT_INCR(mhp, free_failed);
   1618 			}
   1619 		}
   1620 		if (ret != 0) {
   1621 			mutex_enter(&mhp->mh_mutex);
   1622 			return (free_get);
   1623 		}
   1624 
   1625 		/*
   1626 		 * Put pressure on pageout.
   1627 		 */
   1628 		page_needfree(free_get);
   1629 		cv_signal(&proc_pageout->p_cv);
   1630 
   1631 		mutex_enter(&mhp->mh_mutex);
   1632 		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
   1633 		    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
   1634 		mutex_exit(&mhp->mh_mutex);
   1635 		page_needfree(-(spgcnt_t)free_get);
   1636 
   1637 		mutex_enter(&mhp->mh_mutex);
   1638 	}
   1639 	return (0);
   1640 }
   1641 
   1642 #define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
   1643 #define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
   1644 /*
   1645  * This function is run as a helper thread for delete_memory_thread.
   1646  * It is needed in order to force kaio cleanup, so that pages used in kaio
   1647  * will be unlocked and subsequently relocated by delete_memory_thread.
   1648  * The address of the delete_memory_threads's mem_handle is passed in to
   1649  * this thread function, and is used to set the mh_aio_cleanup_done member
   1650  * prior to calling thread_exit().
   1651  */
   1652 static void
   1653 dr_aio_cleanup_thread(caddr_t amhp)
   1654 {
   1655 	proc_t *procp;
   1656 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
   1657 	int cleaned;
   1658 	int n = 0;
   1659 	struct mem_handle *mhp;
   1660 	volatile uint_t *pcancel;
   1661 
   1662 	mhp = (struct mem_handle *)amhp;
   1663 	ASSERT(mhp != NULL);
   1664 	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
   1665 	if (modload("sys", "kaio") == -1) {
   1666 		mhp->mh_aio_cleanup_done = 1;
   1667 		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
   1668 		thread_exit();
   1669 	}
   1670 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
   1671 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
   1672 	if (aio_cleanup_dr_delete_memory == NULL) {
   1673 		mhp->mh_aio_cleanup_done = 1;
   1674 		cmn_err(CE_WARN,
   1675 	    "aio_cleanup_dr_delete_memory not found in kaio");
   1676 		thread_exit();
   1677 	}
   1678 	do {
   1679 		cleaned = 0;
   1680 		mutex_enter(&pidlock);
   1681 		for (procp = practive; (*pcancel == 0) && (procp != NULL);
   1682 		    procp = procp->p_next) {
   1683 			mutex_enter(&procp->p_lock);
   1684 			if (procp->p_aio != NULL) {
   1685 				/* cleanup proc's outstanding kaio */
   1686 				cleaned +=
   1687 				    (*aio_cleanup_dr_delete_memory)(procp);
   1688 			}
   1689 			mutex_exit(&procp->p_lock);
   1690 		}
   1691 		mutex_exit(&pidlock);
   1692 		if ((*pcancel == 0) &&
   1693 		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
   1694 			/* delay a bit before retrying all procs again */
   1695 			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
   1696 			n = 0;
   1697 		}
   1698 	} while (*pcancel == 0);
   1699 	mhp->mh_aio_cleanup_done = 1;
   1700 	thread_exit();
   1701 }
   1702 
   1703 static void
   1704 delete_memory_thread(caddr_t amhp)
   1705 {
   1706 	struct mem_handle *mhp;
   1707 	struct memdelspan *mdsp;
   1708 	callb_cpr_t cprinfo;
   1709 	page_t *pp_targ;
   1710 	spgcnt_t freemem_left;
   1711 	void (*del_complete_funcp)(void *, int error);
   1712 	void *del_complete_arg;
   1713 	int comp_code;
   1714 	int ret;
   1715 	int first_scan;
   1716 	uint_t szc;
   1717 #ifdef MEM_DEL_STATS
   1718 	uint64_t start_total, ntick_total;
   1719 	uint64_t start_pgrp, ntick_pgrp;
   1720 #endif /* MEM_DEL_STATS */
   1721 
   1722 	mhp = (struct mem_handle *)amhp;
   1723 
   1724 #ifdef MEM_DEL_STATS
   1725 	start_total = ddi_get_lbolt();
   1726 #endif /* MEM_DEL_STATS */
   1727 
   1728 	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
   1729 	    callb_generic_cpr, "memdel");
   1730 
   1731 	mutex_enter(&mhp->mh_mutex);
   1732 	ASSERT(mhp->mh_state == MHND_STARTING);
   1733 
   1734 	mhp->mh_state = MHND_RUNNING;
   1735 	mhp->mh_thread_id = curthread;
   1736 
   1737 	mhp->mh_hold_todo = mhp->mh_vm_pages;
   1738 	mutex_exit(&mhp->mh_mutex);
   1739 
   1740 	/* Allocate the remap pages now, if necessary. */
   1741 	memseg_remap_init();
   1742 
   1743 	/*
   1744 	 * Subtract from availrmem now if possible as availrmem
   1745 	 * may not be available by the end of the delete.
   1746 	 */
   1747 	if (!get_availrmem(mhp->mh_vm_pages)) {
   1748 		comp_code = KPHYSM_ENOTVIABLE;
   1749 		mutex_enter(&mhp->mh_mutex);
   1750 		goto early_exit;
   1751 	}
   1752 
   1753 	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);
   1754 
   1755 	mutex_enter(&mhp->mh_mutex);
   1756 
   1757 	if (ret != 0) {
   1758 		mhp->mh_cancel = KPHYSM_EREFUSED;
   1759 		goto refused;
   1760 	}
   1761 
   1762 	transit_list_collect(mhp, 1);
   1763 
   1764 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   1765 	    mdsp = mdsp->mds_next) {
   1766 		ASSERT(mdsp->mds_bitmap == NULL);
   1767 		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
   1768 		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
   1769 		    KM_SLEEP);
   1770 	}
   1771 
   1772 	first_scan = 1;
   1773 	freemem_left = 0;
   1774 	/*
   1775 	 * Start dr_aio_cleanup_thread, which periodically iterates
   1776 	 * through the process list and invokes aio cleanup.  This
   1777 	 * is needed in order to avoid a deadly embrace between the
   1778 	 * delete_memory_thread (waiting on writer lock for page, with the
   1779 	 * exclusive-wanted bit set), kaio read request threads (waiting for a
   1780 	 * reader lock on the same page that is wanted by the
   1781 	 * delete_memory_thread), and threads waiting for kaio completion
   1782 	 * (blocked on spt_amp->lock).
   1783 	 */
   1784 	mhp->mh_dr_aio_cleanup_cancel = 0;
   1785 	mhp->mh_aio_cleanup_done = 0;
   1786 	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
   1787 	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
   1788 	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
   1789 		pgcnt_t collected;
   1790 
   1791 		MDSTAT_INCR(mhp, nloop);
   1792 		collected = 0;
   1793 		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
   1794 		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
   1795 			pfn_t pfn, p_end;
   1796 
   1797 			p_end = mdsp->mds_base + mdsp->mds_npgs;
   1798 			for (pfn = mdsp->mds_base; (pfn < p_end) &&
   1799 			    (mhp->mh_cancel == 0); pfn++) {
   1800 				page_t *pp, *tpp, *tpp_targ;
   1801 				pgcnt_t bit;
   1802 				struct vnode *vp;
   1803 				u_offset_t offset;
   1804 				int mod, result;
   1805 				spgcnt_t pgcnt;
   1806 
   1807 				bit = pfn - mdsp->mds_base;
   1808 				if ((mdsp->mds_bitmap[bit / NBPBMW] &
   1809 				    (1 << (bit % NBPBMW))) != 0) {
   1810 					MDSTAT_INCR(mhp, already_done);
   1811 					continue;
   1812 				}
   1813 				if (freemem_left == 0) {
   1814 					freemem_left += delthr_get_freemem(mhp);
   1815 					if (freemem_left == 0)
   1816 						break;
   1817 				}
   1818 
   1819 				/*
   1820 				 * Release mh_mutex - some of this
   1821 				 * stuff takes some time (eg PUTPAGE).
   1822 				 */
   1823 
   1824 				mutex_exit(&mhp->mh_mutex);
   1825 				MDSTAT_INCR(mhp, ncheck);
   1826 
   1827 				pp = page_numtopp_nolock(pfn);
   1828 				if (pp == NULL) {
   1829 					/*
   1830 					 * Not covered by a page_t - will
   1831 					 * be dealt with elsewhere.
   1832 					 */
   1833 					MDSTAT_INCR(mhp, nopaget);
   1834 					mutex_enter(&mhp->mh_mutex);
   1835 					mdsp->mds_bitmap[bit / NBPBMW] |=
   1836 					    (1 << (bit % NBPBMW));
   1837 					continue;
   1838 				}
   1839 
   1840 				if (!page_try_reclaim_lock(pp, SE_EXCL,
   1841 				    SE_EXCL_WANTED | SE_RETIRED)) {
   1842 					/*
   1843 					 * Page in use elsewhere.  Skip it.
   1844 					 */
   1845 					MDSTAT_INCR(mhp, lockfail);
   1846 					mutex_enter(&mhp->mh_mutex);
   1847 					continue;
   1848 				}
   1849 				/*
   1850 				 * See if the cage expanded into the delete.
   1851 				 * This can happen as we have to allow the
   1852 				 * cage to expand.
   1853 				 */
   1854 				if (PP_ISNORELOC(pp)) {
   1855 					page_unlock(pp);
   1856 					mutex_enter(&mhp->mh_mutex);
   1857 					mhp->mh_cancel = KPHYSM_ENONRELOC;
   1858 					break;
   1859 				}
   1860 				if (PP_RETIRED(pp)) {
   1861 					/*
   1862 					 * Page has been retired and is
   1863 					 * not part of the cage so we
   1864 					 * can now do the accounting for
   1865 					 * it.
   1866 					 */
   1867 					MDSTAT_INCR(mhp, retired);
   1868 					mutex_enter(&mhp->mh_mutex);
   1869 					mdsp->mds_bitmap[bit / NBPBMW]
   1870 					    |= (1 << (bit % NBPBMW));
   1871 					mdsp->mds_bitmap_retired[bit /
   1872 					    NBPBMW] |=
   1873 					    (1 << (bit % NBPBMW));
   1874 					mhp->mh_hold_todo--;
   1875 					continue;
   1876 				}
   1877 				ASSERT(freemem_left != 0);
   1878 				if (PP_ISFREE(pp)) {
   1879 					/*
   1880 					 * Like page_reclaim() only 'freemem'
   1881 					 * processing is already done.
   1882 					 */
   1883 					MDSTAT_INCR(mhp, nfree);
   1884 				free_page_collect:
   1885 					if (PP_ISAGED(pp)) {
   1886 						page_list_sub(pp,
   1887 						    PG_FREE_LIST);
   1888 					} else {
   1889 						page_list_sub(pp,
   1890 						    PG_CACHE_LIST);
   1891 					}
   1892 					PP_CLRFREE(pp);
   1893 					PP_CLRAGED(pp);
   1894 					collected++;
   1895 					mutex_enter(&mhp->mh_mutex);
   1896 					page_delete_collect(pp, mhp);
   1897 					mdsp->mds_bitmap[bit / NBPBMW] |=
   1898 					    (1 << (bit % NBPBMW));
   1899 					freemem_left--;
   1900 					continue;
   1901 				}
   1902 				ASSERT(pp->p_vnode != NULL);
   1903 				if (first_scan) {
   1904 					MDSTAT_INCR(mhp, first_notfree);
   1905 					page_unlock(pp);
   1906 					mutex_enter(&mhp->mh_mutex);
   1907 					continue;
   1908 				}
   1909 				/*
   1910 				 * Keep stats on pages encountered that
   1911 				 * are marked for retirement.
   1912 				 */
   1913 				if (PP_TOXIC(pp)) {
   1914 					MDSTAT_INCR(mhp, toxic);
   1915 				} else if (PP_PR_REQ(pp)) {
   1916 					MDSTAT_INCR(mhp, failing);
   1917 				}
   1918 				/*
   1919 				 * In certain cases below, special exceptions
   1920 				 * are made for pages that are toxic.  This
   1921 				 * is because the current meaning of toxic
   1922 				 * is that an uncorrectable error has been
   1923 				 * previously associated with the page.
   1924 				 */
   1925 				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
   1926 					if (!PP_TOXIC(pp)) {
   1927 						/*
   1928 						 * Must relocate locked in
   1929 						 * memory pages.
   1930 						 */
   1931 #ifdef MEM_DEL_STATS
   1932 						start_pgrp = ddi_get_lbolt();
   1933 #endif /* MEM_DEL_STATS */
   1934 						/*
   1935 						 * Lock all constituent pages
   1936 						 * of a large page to ensure
   1937 						 * that p_szc won't change.
   1938 						 */
   1939 						if (!group_page_trylock(pp,
   1940 						    SE_EXCL)) {
   1941 							MDSTAT_INCR(mhp,
   1942 							    gptllckfail);
   1943 							page_unlock(pp);
   1944 							mutex_enter(
   1945 							    &mhp->mh_mutex);
   1946 							continue;
   1947 						}
   1948 						MDSTAT_INCR(mhp, npplocked);
   1949 						pp_targ =
   1950 						    page_get_replacement_page(
   1951 						    pp, NULL, 0);
   1952 						if (pp_targ != NULL) {
   1953 #ifdef MEM_DEL_STATS
   1954 							ntick_pgrp =
   1955 							    (uint64_t)
   1956 							    ddi_get_lbolt() -
   1957 							    start_pgrp;
   1958 #endif /* MEM_DEL_STATS */
   1959 							MDSTAT_PGRP(mhp,
   1960 							    ntick_pgrp);
   1961 							MDSTAT_INCR(mhp,
   1962 							    nlockreloc);
   1963 							goto reloc;
   1964 						}
   1965 						group_page_unlock(pp);
   1966 						page_unlock(pp);
   1967 #ifdef MEM_DEL_STATS
   1968 						ntick_pgrp =
   1969 						    (uint64_t)ddi_get_lbolt() -
   1970 						    start_pgrp;
   1971 #endif /* MEM_DEL_STATS */
   1972 						MDSTAT_PGRP(mhp, ntick_pgrp);
   1973 						MDSTAT_INCR(mhp, nnorepl);
   1974 						mutex_enter(&mhp->mh_mutex);
   1975 						continue;
   1976 					} else {
   1977 						/*
   1978 						 * Cannot do anything about
   1979 						 * this page because it is
   1980 						 * toxic.
   1981 						 */
   1982 						MDSTAT_INCR(mhp, npplkdtoxic);
   1983 						page_unlock(pp);
   1984 						mutex_enter(&mhp->mh_mutex);
   1985 						continue;
   1986 					}
   1987 				}
   1988 				/*
   1989 				 * Unload the mappings and check if mod bit
   1990 				 * is set.
   1991 				 */
   1992 				ASSERT(!PP_ISKAS(pp));
   1993 				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
   1994 				mod = hat_ismod(pp);
   1995 
   1996 #ifdef MEM_DEL_STATS
   1997 				start_pgrp = ddi_get_lbolt();
   1998 #endif /* MEM_DEL_STATS */
   1999 				if (mod && !PP_TOXIC(pp)) {
   2000 					/*
   2001 					 * Lock all constituent pages
   2002 					 * of a large page to ensure
   2003 					 * that p_szc won't change.
   2004 					 */
   2005 					if (!group_page_trylock(pp, SE_EXCL)) {
   2006 						MDSTAT_INCR(mhp, gptlmodfail);
   2007 						page_unlock(pp);
   2008 						mutex_enter(&mhp->mh_mutex);
   2009 						continue;
   2010 					}
   2011 					pp_targ = page_get_replacement_page(pp,
   2012 					    NULL, 0);
   2013 					if (pp_targ != NULL) {
   2014 						MDSTAT_INCR(mhp, nmodreloc);
   2015 #ifdef MEM_DEL_STATS
   2016 						ntick_pgrp =
   2017 						    (uint64_t)ddi_get_lbolt() -
   2018 						    start_pgrp;
   2019 #endif /* MEM_DEL_STATS */
   2020 						MDSTAT_PGRP(mhp, ntick_pgrp);
   2021 						goto reloc;
   2022 					}
   2023 					group_page_unlock(pp);
   2024 				}
   2025 
   2026 				if (!page_try_demote_pages(pp)) {
   2027 					MDSTAT_INCR(mhp, demotefail);
   2028 					page_unlock(pp);
   2029 #ifdef MEM_DEL_STATS
   2030 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2031 					    start_pgrp;
   2032 #endif /* MEM_DEL_STATS */
   2033 					MDSTAT_PGRP(mhp, ntick_pgrp);
   2034 					mutex_enter(&mhp->mh_mutex);
   2035 					continue;
   2036 				}
   2037 
   2038 				/*
   2039 				 * Regular 'page-out'.
   2040 				 */
   2041 				if (!mod) {
   2042 					MDSTAT_INCR(mhp, ndestroy);
   2043 					page_destroy(pp, 1);
   2044 					/*
   2045 					 * page_destroy was called with
   2046 					 * dontfree. As long as p_lckcnt
   2047 					 * and p_cowcnt are both zero, the
   2048 					 * only additional action of
   2049 					 * page_destroy with !dontfree is to
   2050 					 * call page_free, so we can collect
   2051 					 * the page here.
   2052 					 */
   2053 					collected++;
   2054 #ifdef MEM_DEL_STATS
   2055 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2056 					    start_pgrp;
   2057 #endif /* MEM_DEL_STATS */
   2058 					MDSTAT_PGRP(mhp, ntick_pgrp);
   2059 					mutex_enter(&mhp->mh_mutex);
   2060 					page_delete_collect(pp, mhp);
   2061 					mdsp->mds_bitmap[bit / NBPBMW] |=
   2062 					    (1 << (bit % NBPBMW));
   2063 					continue;
   2064 				}
   2065 				/*
   2066 				 * The page is toxic and the mod bit is
   2067 				 * set, we cannot do anything here to deal
   2068 				 * with it.
   2069 				 */
   2070 				if (PP_TOXIC(pp)) {
   2071 					page_unlock(pp);
   2072 #ifdef MEM_DEL_STATS
   2073 					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2074 					    start_pgrp;
   2075 #endif /* MEM_DEL_STATS */
   2076 					MDSTAT_PGRP(mhp, ntick_pgrp);
   2077 					MDSTAT_INCR(mhp, modtoxic);
   2078 					mutex_enter(&mhp->mh_mutex);
   2079 					continue;
   2080 				}
   2081 				MDSTAT_INCR(mhp, nputpage);
   2082 				vp = pp->p_vnode;
   2083 				offset = pp->p_offset;
   2084 				VN_HOLD(vp);
   2085 				page_unlock(pp);
   2086 				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
   2087 				    B_INVAL|B_FORCE, kcred, NULL);
   2088 				VN_RELE(vp);
   2089 #ifdef MEM_DEL_STATS
   2090 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2091 				    start_pgrp;
   2092 #endif /* MEM_DEL_STATS */
   2093 				MDSTAT_PGRP(mhp, ntick_pgrp);
   2094 				/*
   2095 				 * Try to get the page back immediately
   2096 				 * so that it can be collected.
   2097 				 */
   2098 				pp = page_numtopp_nolock(pfn);
   2099 				if (pp == NULL) {
   2100 					MDSTAT_INCR(mhp, nnoreclaim);
   2101 					/*
   2102 					 * This should not happen as this
   2103 					 * thread is deleting the page.
   2104 					 * If this code is generalized, this
   2105 					 * becomes a reality.
   2106 					 */
   2107 #ifdef DEBUG
   2108 					cmn_err(CE_WARN,
   2109 					    "delete_memory_thread(0x%p) "
   2110 					    "pfn 0x%lx has no page_t",
   2111 					    (void *)mhp, pfn);
   2112 #endif /* DEBUG */
   2113 					mutex_enter(&mhp->mh_mutex);
   2114 					continue;
   2115 				}
   2116 				if (page_try_reclaim_lock(pp, SE_EXCL,
   2117 				    SE_EXCL_WANTED | SE_RETIRED)) {
   2118 					if (PP_ISFREE(pp)) {
   2119 						goto free_page_collect;
   2120 					}
   2121 					page_unlock(pp);
   2122 				}
   2123 				MDSTAT_INCR(mhp, nnoreclaim);
   2124 				mutex_enter(&mhp->mh_mutex);
   2125 				continue;
   2126 
   2127 			reloc:
   2128 				/*
   2129 				 * Got some freemem and a target
   2130 				 * page, so move the data to avoid
   2131 				 * I/O and lock problems.
   2132 				 */
   2133 				ASSERT(!page_iolock_assert(pp));
   2134 				MDSTAT_INCR(mhp, nreloc);
   2135 				/*
   2136 				 * page_relocate() will return pgcnt: the
   2137 				 * number of consecutive pages relocated.
   2138 				 * If it is successful, pp will be a
   2139 				 * linked list of the page structs that
   2140 				 * were relocated. If page_relocate() is
   2141 				 * unsuccessful, pp will be unmodified.
   2142 				 */
   2143 #ifdef MEM_DEL_STATS
   2144 				start_pgrp = ddi_get_lbolt();
   2145 #endif /* MEM_DEL_STATS */
   2146 				result = page_relocate(&pp, &pp_targ, 0, 0,
   2147 				    &pgcnt, NULL);
   2148 #ifdef MEM_DEL_STATS
   2149 				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
   2150 				    start_pgrp;
   2151 #endif /* MEM_DEL_STATS */
   2152 				MDSTAT_PGRP(mhp, ntick_pgrp);
   2153 				if (result != 0) {
   2154 					MDSTAT_INCR(mhp, nrelocfail);
   2155 					/*
   2156 					 * We did not succeed. We need
   2157 					 * to give the pp_targ pages back.
   2158 					 * page_free(pp_targ, 1) without
   2159 					 * the freemem accounting.
   2160 					 */
   2161 					group_page_unlock(pp);
   2162 					page_free_replacement_page(pp_targ);
   2163 					page_unlock(pp);
   2164 					mutex_enter(&mhp->mh_mutex);
   2165 					continue;
   2166 				}
   2167 
   2168 				/*
   2169 				 * We will then collect pgcnt pages.
   2170 				 */
   2171 				ASSERT(pgcnt > 0);
   2172 				mutex_enter(&mhp->mh_mutex);
   2173 				/*
   2174 				 * We need to make sure freemem_left is
   2175 				 * large enough.
   2176 				 */
   2177 				while ((freemem_left < pgcnt) &&
   2178 				    (!mhp->mh_cancel)) {
   2179 					freemem_left +=
   2180 					    delthr_get_freemem(mhp);
   2181 				}
   2182 
   2183 				/*
   2184 				 * Do not proceed if mh_cancel is set.
   2185 				 */
   2186 				if (mhp->mh_cancel) {
   2187 					while (pp_targ != NULL) {
   2188 						/*
   2189 						 * Unlink and unlock each page.
   2190 						 */
   2191 						tpp_targ = pp_targ;
   2192 						page_sub(&pp_targ, tpp_targ);
   2193 						page_unlock(tpp_targ);
   2194 					}
   2195 					/*
   2196 					 * We need to give the pp pages back.
   2197 					 * page_free(pp, 1) without the
   2198 					 * freemem accounting.
   2199 					 */
   2200 					page_free_replacement_page(pp);
   2201 					break;
   2202 				}
   2203 
   2204 				/* Now remove pgcnt from freemem_left */
   2205 				freemem_left -= pgcnt;
   2206 				ASSERT(freemem_left >= 0);
   2207 				szc = pp->p_szc;
   2208 				while (pp != NULL) {
   2209 					/*
   2210 					 * pp and pp_targ were passed back as
   2211 					 * a linked list of pages.
   2212 					 * Unlink and unlock each page.
   2213 					 */
   2214 					tpp_targ = pp_targ;
   2215 					page_sub(&pp_targ, tpp_targ);
   2216 					page_unlock(tpp_targ);
   2217 					/*
   2218 					 * The original page is now free
   2219 					 * so remove it from the linked
   2220 					 * list and collect it.
   2221 					 */
   2222 					tpp = pp;
   2223 					page_sub(&pp, tpp);
   2224 					pfn = page_pptonum(tpp);
   2225 					collected++;
   2226 					ASSERT(PAGE_EXCL(tpp));
   2227 					ASSERT(tpp->p_vnode == NULL);
   2228 					ASSERT(!hat_page_is_mapped(tpp));
   2229 					ASSERT(tpp->p_szc == szc);
   2230 					tpp->p_szc = 0;
   2231 					page_delete_collect(tpp, mhp);
   2232 					bit = pfn - mdsp->mds_base;
   2233 					mdsp->mds_bitmap[bit / NBPBMW] |=
   2234 					    (1 << (bit % NBPBMW));
   2235 				}
   2236 				ASSERT(pp_targ == NULL);
   2237 			}
   2238 		}
   2239 		first_scan = 0;
   2240 		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
   2241 		    (collected == 0)) {
   2242 			/*
   2243 			 * This code is needed as we cannot wait
   2244 			 * for a page to be locked OR the delete to
   2245 			 * be cancelled.  Also, we must delay so
   2246 			 * that other threads get a chance to run
   2247 			 * on our cpu, otherwise page locks may be
   2248 			 * held indefinitely by those threads.
   2249 			 */
   2250 			MDSTAT_INCR(mhp, ndelay);
   2251 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
   2252 			(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
   2253 			    DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
   2254 			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
   2255 		}
   2256 	}
   2257 	/* stop the dr aio cleanup thread */
   2258 	mhp->mh_dr_aio_cleanup_cancel = 1;
   2259 	transit_list_collect(mhp, 0);
   2260 	if (freemem_left != 0) {
   2261 		/* Return any surplus. */
   2262 		page_create_putback(freemem_left);
   2263 		freemem_left = 0;
   2264 	}
   2265 #ifdef MEM_DEL_STATS
   2266 	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
   2267 #endif /* MEM_DEL_STATS */
   2268 	MDSTAT_TOTAL(mhp, ntick_total);
   2269 	MDSTAT_PRINT(mhp);
   2270 
   2271 	/*
   2272 	 * If the memory delete was cancelled, exclusive-wanted bits must
   2273 	 * be cleared. If there are retired pages being deleted, they need
   2274 	 * to be unretired.
   2275 	 */
   2276 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   2277 	    mdsp = mdsp->mds_next) {
   2278 		pfn_t pfn, p_end;
   2279 
   2280 		p_end = mdsp->mds_base + mdsp->mds_npgs;
   2281 		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
   2282 			page_t *pp;
   2283 			pgcnt_t bit;
   2284 
   2285 			bit = pfn - mdsp->mds_base;
   2286 			if (mhp->mh_cancel) {
   2287 				pp = page_numtopp_nolock(pfn);
   2288 				if (pp != NULL) {
   2289 					if ((mdsp->mds_bitmap[bit / NBPBMW] &
   2290 					    (1 << (bit % NBPBMW))) == 0) {
   2291 						page_lock_clr_exclwanted(pp);
   2292 					}
   2293 				}
   2294 			} else {
   2295 				pp = NULL;
   2296 			}
   2297 			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
   2298 			    (1 << (bit % NBPBMW))) != 0) {
   2299 				/* do we already have pp? */
   2300 				if (pp == NULL) {
   2301 					pp = page_numtopp_nolock(pfn);
   2302 				}
   2303 				ASSERT(pp != NULL);
   2304 				ASSERT(PP_RETIRED(pp));
   2305 				if (mhp->mh_cancel != 0) {
   2306 					page_unlock(pp);
   2307 					/*
   2308 					 * To satisfy ASSERT below in
   2309 					 * cancel code.
   2310 					 */
   2311 					mhp->mh_hold_todo++;
   2312 				} else {
   2313 					(void) page_unretire_pp(pp,
   2314 					    PR_UNR_CLEAN);
   2315 				}
   2316 			}
   2317 		}
   2318 	}
   2319 	/*
   2320 	 * Free retired page bitmap and collected page bitmap
   2321 	 */
   2322 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   2323 	    mdsp = mdsp->mds_next) {
   2324 		ASSERT(mdsp->mds_bitmap_retired != NULL);
   2325 		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
   2326 		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
   2327 		ASSERT(mdsp->mds_bitmap != NULL);
   2328 		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
   2329 		mdsp->mds_bitmap = NULL;	/* Paranoia. */
   2330 	}
   2331 
   2332 	/* wait for our dr aio cancel thread to exit */
   2333 	while (!(mhp->mh_aio_cleanup_done)) {
   2334 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
   2335 		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
   2336 		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
   2337 	}
   2338 refused:
   2339 	if (mhp->mh_cancel != 0) {
   2340 		page_t *pp;
   2341 
   2342 		comp_code = mhp->mh_cancel;
   2343 		/*
   2344 		 * Go through list of deleted pages (mh_deleted) freeing
   2345 		 * them.
   2346 		 */
   2347 		while ((pp = mhp->mh_deleted) != NULL) {
   2348 			mhp->mh_deleted = pp->p_next;
   2349 			mhp->mh_hold_todo++;
   2350 			mutex_exit(&mhp->mh_mutex);
   2351 			/* Restore p_next. */
   2352 			pp->p_next = pp->p_prev;
   2353 			if (PP_ISFREE(pp)) {
   2354 				cmn_err(CE_PANIC,
   2355 				    "page %p is free",
   2356 				    (void *)pp);
   2357 			}
   2358 			page_free(pp, 1);
   2359 			mutex_enter(&mhp->mh_mutex);
   2360 		}
   2361 		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);
   2362 
   2363 		mutex_exit(&mhp->mh_mutex);
   2364 		put_availrmem(mhp->mh_vm_pages);
   2365 		mutex_enter(&mhp->mh_mutex);
   2366 
   2367 		goto t_exit;
   2368 	}
   2369 
   2370 	/*
   2371 	 * All the pages are no longer in use and are exclusively locked.
   2372 	 */
   2373 
   2374 	mhp->mh_deleted = NULL;
   2375 
   2376 	kphysm_del_cleanup(mhp);
   2377 
   2378 	/*
   2379 	 * mem_node_del_range needs to be after kphysm_del_cleanup so
   2380 	 * that the mem_node_config[] will remain intact for the cleanup.
   2381 	 */
   2382 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   2383 	    mdsp = mdsp->mds_next) {
   2384 		mem_node_del_range(mdsp->mds_base,
   2385 		    mdsp->mds_base + mdsp->mds_npgs - 1);
   2386 	}
   2387 
   2388 	comp_code = KPHYSM_OK;
   2389 
   2390 t_exit:
   2391 	mutex_exit(&mhp->mh_mutex);
   2392 	kphysm_setup_post_del(mhp->mh_vm_pages,
   2393 	    (comp_code == KPHYSM_OK) ? 0 : 1);
   2394 	mutex_enter(&mhp->mh_mutex);
   2395 
   2396 early_exit:
   2397 	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
   2398 	mhp->mh_state = MHND_DONE;
   2399 	del_complete_funcp = mhp->mh_delete_complete;
   2400 	del_complete_arg = mhp->mh_delete_complete_arg;
   2401 	CALLB_CPR_EXIT(&cprinfo);
   2402 	(*del_complete_funcp)(del_complete_arg, comp_code);
   2403 	thread_exit();
   2404 	/*NOTREACHED*/
   2405 }
   2406 
   2407 /*
   2408  * Start the delete of the memory from the system.
   2409  */
   2410 int
   2411 kphysm_del_start(
   2412 	memhandle_t handle,
   2413 	void (*complete)(void *, int),
   2414 	void *complete_arg)
   2415 {
   2416 	struct mem_handle *mhp;
   2417 
   2418 	mhp = kphysm_lookup_mem_handle(handle);
   2419 	if (mhp == NULL) {
   2420 		return (KPHYSM_EHANDLE);
   2421 	}
   2422 	switch (mhp->mh_state) {
   2423 	case MHND_FREE:
   2424 		ASSERT(mhp->mh_state != MHND_FREE);
   2425 		mutex_exit(&mhp->mh_mutex);
   2426 		return (KPHYSM_EHANDLE);
   2427 	case MHND_INIT:
   2428 		break;
   2429 	case MHND_STARTING:
   2430 	case MHND_RUNNING:
   2431 		mutex_exit(&mhp->mh_mutex);
   2432 		return (KPHYSM_ESEQUENCE);
   2433 	case MHND_DONE:
   2434 		mutex_exit(&mhp->mh_mutex);
   2435 		return (KPHYSM_ESEQUENCE);
   2436 	case MHND_RELEASE:
   2437 		mutex_exit(&mhp->mh_mutex);
   2438 		return (KPHYSM_ESEQUENCE);
   2439 	default:
   2440 #ifdef DEBUG
   2441 		cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d",
   2442 		    (void *)mhp, mhp->mh_state);
   2443 #endif /* DEBUG */
   2444 		mutex_exit(&mhp->mh_mutex);
   2445 		return (KPHYSM_EHANDLE);
   2446 	}
   2447 
   2448 	if (mhp->mh_transit.trl_spans == NULL) {
   2449 		mutex_exit(&mhp->mh_mutex);
   2450 		return (KPHYSM_ENOWORK);
   2451 	}
   2452 
   2453 	ASSERT(complete != NULL);
   2454 	mhp->mh_delete_complete = complete;
   2455 	mhp->mh_delete_complete_arg = complete_arg;
   2456 	mhp->mh_state = MHND_STARTING;
   2457 	/*
   2458 	 * Release the mutex in case thread_create sleeps.
   2459 	 */
   2460 	mutex_exit(&mhp->mh_mutex);
   2461 
   2462 	/*
   2463 	 * The "obvious" process for this thread is pageout (proc_pageout)
   2464 	 * but this gives the thread too much power over freemem
   2465 	 * which results in freemem starvation.
   2466 	 */
   2467 	(void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0,
   2468 	    TS_RUN, maxclsyspri - 1);
   2469 
   2470 	return (KPHYSM_OK);
   2471 }
   2472 
   2473 static kmutex_t pp_dummy_lock;		/* Protects init. of pp_dummy. */
   2474 static caddr_t pp_dummy;	/* Dummy page_t area; left non-NULL as the flag that init is done. */
   2475 static pgcnt_t pp_dummy_npages;	/* Number of pages making up the dummy area. */
   2476 static pfn_t *pp_dummy_pfn;	/* Array of dummy pfns, one entry per dummy page. */
   2477 
   2478 static void
   2479 memseg_remap_init_pages(page_t *pages, page_t *epages)
   2480 {
   2481 	page_t *pp;
   2482 
   2483 	for (pp = pages; pp < epages; pp++) {
   2484 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
   2485 		pp->p_offset = (u_offset_t)-1;
   2486 		page_iolock_init(pp);
   2487 		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
   2488 			continue;
   2489 		page_lock_delete(pp);
   2490 	}
   2491 }
   2492 
   2493 void
   2494 memseg_remap_init()
   2495 {
   2496 	mutex_enter(&pp_dummy_lock);
   2497 	if (pp_dummy == NULL) {
   2498 		uint_t dpages;
   2499 		int i;
   2500 
   2501 		/*
   2502 		 * dpages starts off as the size of the structure and
   2503 		 * ends up as the minimum number of pages that will
   2504 		 * hold a whole number of page_t structures.
   2505 		 */
   2506 		dpages = sizeof (page_t);
   2507 		ASSERT(dpages != 0);
   2508 		ASSERT(dpages <= MMU_PAGESIZE);
   2509 
   2510 		while ((dpages & 1) == 0)
   2511 			dpages >>= 1;
   2512 
   2513 		pp_dummy_npages = dpages;
   2514 		/*
   2515 		 * Allocate pp_dummy pages directly from static_arena,
   2516 		 * since these are whole page allocations and are
   2517 		 * referenced by physical address.  This also has the
   2518 		 * nice fringe benefit of hiding the memory from
   2519 		 * ::findleaks since it doesn't deal well with allocated
   2520 		 * kernel heap memory that doesn't have any mappings.
   2521 		 */
   2522 		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
   2523 		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
   2524 		bzero(pp_dummy, ptob(pp_dummy_npages));
   2525 		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
   2526 		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
   2527 		    pp_dummy_npages, KM_SLEEP);
   2528 		for (i = 0; i < pp_dummy_npages; i++) {
   2529 			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
   2530 			    &pp_dummy[MMU_PAGESIZE * i]);
   2531 			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
   2532 		}
   2533 		/*
   2534 		 * Initialize the page_t's to a known 'deleted' state
   2535 		 * that matches the state of deleted pages.
   2536 		 */
   2537 		memseg_remap_init_pages((page_t *)pp_dummy,
   2538 		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
   2539 		/* Remove kmem mappings for the pages for safety. */
   2540 		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
   2541 		    HAT_UNLOAD_UNLOCK);
   2542 		/* Leave pp_dummy pointer set as flag that init is done. */
   2543 	}
   2544 	mutex_exit(&pp_dummy_lock);
   2545 }
   2546 
   2547 /*
   2548  * Remap a page-aglined range of page_t's to dummy pages.
   2549  */
   2550 void
   2551 remap_to_dummy(caddr_t va, pgcnt_t metapgs)
   2552 {
   2553 	int phase;
   2554 
   2555 	ASSERT(IS_P2ALIGNED((uint64_t)va, PAGESIZE));
   2556 
   2557 	/*
   2558 	 * We may start remapping at a non-zero page offset
   2559 	 * within the dummy pages since the low/high ends
   2560 	 * of the outgoing pp's could be shared by other
   2561 	 * memsegs (see memseg_remap_meta).
   2562 	 */
   2563 	phase = btop((uint64_t)va) % pp_dummy_npages;
   2564 	ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);
   2565 
   2566 	while (metapgs != 0) {
   2567 		pgcnt_t n;
   2568 		int i, j;
   2569 
   2570 		n = pp_dummy_npages;
   2571 		if (n > metapgs)
   2572 			n = metapgs;
   2573 		for (i = 0; i < n; i++) {
   2574 			j = (i + phase) % pp_dummy_npages;
   2575 			hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
   2576 			    PROT_READ,
   2577 			    HAT_LOAD | HAT_LOAD_NOCONSIST |
   2578 			    HAT_LOAD_REMAP);
   2579 			va += ptob(1);
   2580 		}
   2581 		metapgs -= n;
   2582 	}
   2583 }
   2584 
   2585 static void
   2586 memseg_remap_to_dummy(struct memseg *seg)
   2587 {
   2588 	caddr_t pp;
   2589 	pgcnt_t metapgs;
   2590 
   2591 	ASSERT(memseg_is_dynamic(seg));
   2592 	ASSERT(pp_dummy != NULL);
   2593 
   2594 
   2595 	if (!memseg_includes_meta(seg)) {
   2596 		memseg_remap_meta(seg);
   2597 		return;
   2598 	}
   2599 
   2600 	pp = (caddr_t)seg->pages;
   2601 	metapgs = seg->pages_base - memseg_get_start(seg);
   2602 	ASSERT(metapgs != 0);
   2603 
   2604 	seg->pages_end = seg->pages_base;
   2605 
   2606 	remap_to_dummy(pp, metapgs);
   2607 }
   2608 
   2609 /*
   2610  * Transition all the deleted pages to the deleted state so that
   2611  * page_lock will not wait. The page_lock_delete call will
   2612  * also wake up any waiters.
   2613  */
   2614 static void
   2615 memseg_lock_delete_all(struct memseg *seg)
   2616 {
   2617 	page_t *pp;
   2618 
   2619 	for (pp = seg->pages; pp < seg->epages; pp++) {
   2620 		pp->p_pagenum = PFN_INVALID;	/* XXXX */
   2621 		page_lock_delete(pp);
   2622 	}
   2623 }
   2624 
   2625 static void
   2626 kphysm_del_cleanup(struct mem_handle *mhp)
   2627 {
   2628 	struct memdelspan	*mdsp;
   2629 	struct memseg		*seg;
   2630 	struct memseg   	**segpp;
   2631 	struct memseg		*seglist;
   2632 	pfn_t			p_end;
   2633 	uint64_t		avmem;
   2634 	pgcnt_t			avpgs;
   2635 	pgcnt_t			npgs;
   2636 
   2637 	avpgs = mhp->mh_vm_pages;
   2638 
   2639 	memsegs_lock(1);
   2640 
   2641 	/*
   2642 	 * remove from main segment list.
   2643 	 */
   2644 	npgs = 0;
   2645 	seglist = NULL;
   2646 	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
   2647 	    mdsp = mdsp->mds_next) {
   2648 		p_end = mdsp->mds_base + mdsp->mds_npgs;
   2649 		for (segpp = &memsegs; (seg = *segpp) != NULL; ) {
   2650 			if (seg->pages_base >= p_end ||
   2651 			    seg->pages_end <= mdsp->mds_base) {
   2652 				/* Span and memseg don't overlap. */
   2653 				segpp = &((*segpp)->next);
   2654 				continue;
   2655 			}
   2656 			ASSERT(seg->pages_base >= mdsp->mds_base);
   2657 			ASSERT(seg->pages_end <= p_end);
   2658 
   2659 			PLCNT_MODIFY_MAX(seg->pages_base,
   2660 			    seg->pages_base - seg->pages_end);
   2661 
   2662 			/* Hide the memseg from future scans. */
   2663 			hat_kpm_delmem_mseg_update(seg, segpp);
   2664 			*segpp = seg->next;
   2665 			membar_producer();	/* TODO: Needed? */
   2666 			npgs += MSEG_NPAGES(seg);
   2667 
   2668 			/*
   2669 			 * Leave the deleted segment's next pointer intact
   2670 			 * in case a memsegs scanning loop is walking this
   2671 			 * segment concurrently.
   2672 			 */
   2673 			seg->lnext = seglist;
   2674 			seglist = seg;
   2675 		}
   2676 	}
   2677 
   2678 	build_pfn_hash();
   2679 
   2680 	ASSERT(npgs < total_pages);
   2681 	total_pages -= npgs;
   2682 
   2683 	/*
   2684 	 * Recalculate the paging parameters now total_pages has changed.
   2685 	 * This will also cause the clock hands to be reset before next use.
   2686 	 */
   2687 	setupclock(1);
   2688 
   2689 	memsegs_unlock(1);
   2690 
   2691 	mutex_exit(&mhp->mh_mutex);
   2692 
   2693 	while ((seg = seglist) != NULL) {
   2694 		pfn_t mseg_start;
   2695 		pfn_t mseg_base, mseg_end;
   2696 		pgcnt_t mseg_npgs;
   2697 		int mlret;
   2698 
   2699 		seglist = seg->lnext;
   2700 
   2701 		/*
   2702 		 * Put the page_t's into the deleted state to stop
   2703 		 * cv_wait()s on the pages. When we remap, the dummy
   2704 		 * page_t's will be in the same state.
   2705 		 */
   2706 		memseg_lock_delete_all(seg);
   2707 		/*
   2708 		 * Collect up information based on pages_base and pages_end
   2709 		 * early so that we can flag early that the memseg has been
   2710 		 * deleted by setting pages_end == pages_base.
   2711 		 */
   2712 		mseg_base = seg->pages_base;
   2713 		mseg_end = seg->pages_end;
   2714 		mseg_npgs = MSEG_NPAGES(seg);
   2715 		mseg_start = memseg_get_start(seg);
   2716 
   2717 		if (memseg_is_dynamic(seg)) {
   2718 			/* Remap the meta data to our special dummy area. */
   2719 			memseg_remap_to_dummy(seg);
   2720 
   2721 			mutex_enter(&memseg_lists_lock);
   2722 			seg->lnext = memseg_va_avail;
   2723 			memseg_va_avail = seg;
   2724 			mutex_exit(&memseg_lists_lock);
   2725 		} else {
   2726 			/*
   2727 			 * For memory whose page_ts were allocated
   2728 			 * at boot, we need to find a new use for
   2729 			 * the page_t memory.
   2730 			 * For the moment, just leak it.
   2731 			 * (It is held in the memseg_delete_junk list.)
   2732 			 */
   2733 			seg->pages_end = seg->pages_base;
   2734 
   2735 			mutex_enter(&memseg_lists_lock);
   2736 			seg->lnext = memseg_delete_junk;
   2737 			memseg_delete_junk = seg;
   2738 			mutex_exit(&memseg_lists_lock);
   2739 		}
   2740 
   2741 		/* Must not use seg now as it could be re-used. */
   2742 
   2743 		memlist_write_lock();
   2744 
   2745 		mlret = memlist_delete_span(
   2746 		    (uint64_t)(mseg_base) << PAGESHIFT,
   2747 		    (uint64_t)(mseg_npgs) << PAGESHIFT,
   2748 		    &phys_avail);
   2749 		ASSERT(mlret == MEML_SPANOP_OK);
   2750 
   2751 		mlret = memlist_delete_span(
   2752 		    (uint64_t)(mseg_start) << PAGESHIFT,
   2753 		    (uint64_t)(mseg_end - mseg_start) <<
   2754 		    PAGESHIFT,
   2755 		    &phys_install);
   2756 		ASSERT(mlret == MEML_SPANOP_OK);
   2757 		phys_install_has_changed();
   2758 
   2759 		memlist_write_unlock();
   2760 	}
   2761 
   2762 	memlist_read_lock();
   2763 	installed_top_size(phys_install, &physmax, &physinstalled);
   2764 	memlist_read_unlock();
   2765 
   2766 	mutex_enter(&freemem_lock);
   2767 	maxmem -= avpgs;
   2768 	physmem -= avpgs;
   2769 	/* availrmem is adjusted during the delete. */
   2770 	availrmem_initial -= avpgs;
   2771 
   2772 	mutex_exit(&freemem_lock);
   2773 
   2774 	dump_resize();
   2775 
   2776 	cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK "
   2777 	    "(0x%" PRIx64 ")\n",
   2778 	    physinstalled << (PAGESHIFT - 10),
   2779 	    (uint64_t)physinstalled << PAGESHIFT);
   2780 
   2781 	avmem = (uint64_t)freemem << PAGESHIFT;
   2782 	cmn_err(CE_CONT, "?kphysm_delete: "
   2783 	    "avail mem = %" PRId64 "\n", avmem);
   2784 
   2785 	/*
   2786 	 * Update lgroup generation number on single lgroup systems
   2787 	 */
   2788 	if (nlgrps == 1)
   2789 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
   2790 
   2791 	/* Successfully deleted system memory */
   2792 	mutex_enter(&mhp->mh_mutex);
   2793 }
   2794 
   2795 static uint_t mdel_nullvp_waiter;
   2796 
   2797 static void
   2798 page_delete_collect(
   2799 	page_t *pp,
   2800 	struct mem_handle *mhp)
   2801 {
   2802 	if (pp->p_vnode) {
   2803 		page_hashout(pp, (kmutex_t *)NULL);
   2804 		/* do not do PP_SETAGED(pp); */
   2805 	} else {
   2806 		kmutex_t *sep;
   2807 
   2808 		sep = page_se_mutex(pp);
   2809 		mutex_enter(sep);
   2810 		if (CV_HAS_WAITERS(&pp->p_cv)) {
   2811 			mdel_nullvp_waiter++;
   2812 			cv_broadcast(&pp->p_cv);
   2813 		}
   2814 		mutex_exit(sep);
   2815 	}
   2816 	ASSERT(pp->p_next == pp->p_prev);
   2817 	ASSERT(pp->p_next == NULL || pp->p_next == pp);
   2818 	pp->p_next = mhp->mh_deleted;
   2819 	mhp->mh_deleted = pp;
   2820 	ASSERT(mhp->mh_hold_todo != 0);
   2821 	mhp->mh_hold_todo--;
   2822 }
   2823 
   2824 static void
   2825 transit_list_collect(struct mem_handle *mhp, int v)
   2826 {
   2827 	struct transit_list_head *trh;
   2828 
   2829 	trh = &transit_list_head;
   2830 	mutex_enter(&trh->trh_lock);
   2831 	mhp->mh_transit.trl_collect = v;
   2832 	mutex_exit(&trh->trh_lock);
   2833 }
   2834 
   2835 static void
   2836 transit_list_insert(struct transit_list *tlp)
   2837 {
   2838 	struct transit_list_head *trh;
   2839 
   2840 	trh = &transit_list_head;
   2841 	ASSERT(MUTEX_HELD(&trh->trh_lock));
   2842 	tlp->trl_next = trh->trh_head;
   2843 	trh->trh_head = tlp;
   2844 }
   2845 
   2846 static void
   2847 transit_list_remove(struct transit_list *tlp)
   2848 {
   2849 	struct transit_list_head *trh;
   2850 	struct transit_list **tlpp;
   2851 
   2852 	trh = &transit_list_head;
   2853 	tlpp = &trh->trh_head;
   2854 	ASSERT(MUTEX_HELD(&trh->trh_lock));
   2855 	while (*tlpp != NULL && *tlpp != tlp)
   2856 		tlpp = &(*tlpp)->trl_next;
   2857 	ASSERT(*tlpp != NULL);
   2858 	if (*tlpp == tlp)
   2859 		*tlpp = tlp->trl_next;
   2860 	tlp->trl_next = NULL;
   2861 }
   2862 
   2863 static struct transit_list *
   2864 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum)
   2865 {
   2866 	struct transit_list *tlp;
   2867 
   2868 	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
   2869 		struct memdelspan *mdsp;
   2870 
   2871 		for (mdsp = tlp->trl_spans; mdsp != NULL;
   2872 		    mdsp = mdsp->mds_next) {
   2873 			if (pfnum >= mdsp->mds_base &&
   2874 			    pfnum < (mdsp->mds_base + mdsp->mds_npgs)) {
   2875 				return (tlp);
   2876 			}
   2877 		}
   2878 	}
   2879 	return (NULL);
   2880 }
   2881 
   2882 int
   2883 pfn_is_being_deleted(pfn_t pfnum)
   2884 {
   2885 	struct transit_list_head *trh;
   2886 	struct transit_list *tlp;
   2887 	int ret;
   2888 
   2889 	trh = &transit_list_head;
   2890 	if (trh->trh_head == NULL)
   2891 		return (0);
   2892 
   2893 	mutex_enter(&trh->trh_lock);
   2894 	tlp = pfnum_to_transit_list(trh, pfnum);
   2895 	ret = (tlp != NULL && tlp->trl_collect);
   2896 	mutex_exit(&trh->trh_lock);
   2897 
   2898 	return (ret);
   2899 }
   2900 
   #ifdef MEM_DEL_STATS
   extern int hz;

   /*
    * Print the delete statistics gathered in mhp->mh_delstat.  Nothing is
    * printed unless mem_del_stat_print is non-zero.  The heading shows the
    * base and page count of the handle's first span.
    */
   static void
   mem_del_stat_print_func(struct mem_handle *mhp)
   {
   	uint64_t secs;

   	if (!mem_del_stat_print)
   		return;

   	printf("memory delete loop %x/%x, statistics%s\n",
   	    (uint_t)mhp->mh_transit.trl_spans->mds_base,
   	    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
   	    (mhp->mh_cancel ? " (cancelled)" : ""));
   	printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
   	printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
   	printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
   	printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
   	printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
   	printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
   	printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
   	printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
   	printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
   	printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
   	printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
   	printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
   	printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
   	printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
   	printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
   	printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
   	printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
   	printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
   	printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
   	printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
   	printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
   	printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
   	printf("\t%8u retired\n", mhp->mh_delstat.retired);
   	printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
   	printf("\t%8u failing\n", mhp->mh_delstat.failing);
   	printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
   	printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
   	printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
   	printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);

   	/* Tick totals are also shown converted to minutes and seconds. */
   	secs = mhp->mh_delstat.nticks_total / hz;
   	printf(
   	    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
   	    mhp->mh_delstat.nticks_total, secs / 60, secs % 60);

   	secs = mhp->mh_delstat.nticks_pgrp / hz;
   	printf(
   	    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
   	    mhp->mh_delstat.nticks_pgrp, secs / 60, secs % 60);
   }
   #endif /* MEM_DEL_STATS */
   2954 
/*
 * One registration in the memory-configuration callback table: the
 * caller's vector of callback functions together with the opaque argument
 * that is handed back as the first parameter of each callback. A slot
 * whose vec is NULL is unused.
 */
struct mem_callback {
	kphysm_setup_vector_t	*vec;
	void			*arg;
};

/* Capacity of the mem_callbacks[] table. */
#define	NMEMCALLBACKS		100

/*
 * mem_callbacks[] holds the registrations. nmemcallbacks bounds every
 * walk of the table: slots at or beyond it are never examined, while
 * slots below it may still be empty (NULL vec) and are re-used by later
 * registrations. mem_callback_rwlock is taken as writer to register or
 * unregister and as reader while the callbacks are being invoked.
 */
static struct mem_callback mem_callbacks[NMEMCALLBACKS];
static uint_t nmemcallbacks;
static krwlock_t mem_callback_rwlock;
   2965 
   2966 int
   2967 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
   2968 {
   2969 	uint_t i, found;
   2970 
   2971 	/*
   2972 	 * This test will become more complicated when the version must
   2973 	 * change.
   2974 	 */
   2975 	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
   2976 		return (EINVAL);
   2977 
   2978 	if (vec->post_add == NULL || vec->pre_del == NULL ||
   2979 	    vec->post_del == NULL)
   2980 		return (EINVAL);
   2981 
   2982 	rw_enter(&mem_callback_rwlock, RW_WRITER);
   2983 	for (i = 0, found = 0; i < nmemcallbacks; i++) {
   2984 		if (mem_callbacks[i].vec == NULL && found == 0)
   2985 			found = i + 1;
   2986 		if (mem_callbacks[i].vec == vec &&
   2987 		    mem_callbacks[i].arg == arg) {
   2988 #ifdef DEBUG
   2989 			/* Catch this in DEBUG kernels. */
   2990 			cmn_err(CE_WARN, "kphysm_setup_func_register"
   2991 			    "(0x%p, 0x%p) duplicate registration from 0x%p",
   2992 			    (void *)vec, arg, (void *)caller());
   2993 #endif /* DEBUG */
   2994 			rw_exit(&mem_callback_rwlock);
   2995 			return (EEXIST);
   2996 		}
   2997 	}
   2998 	if (found != 0) {
   2999 		i = found - 1;
   3000 	} else {
   3001 		ASSERT(nmemcallbacks < NMEMCALLBACKS);
   3002 		if (nmemcallbacks == NMEMCALLBACKS) {
   3003 			rw_exit(&mem_callback_rwlock);
   3004 			return (ENOMEM);
   3005 		}
   3006 		i = nmemcallbacks++;
   3007 	}
   3008 	mem_callbacks[i].vec = vec;
   3009 	mem_callbacks[i].arg = arg;
   3010 	rw_exit(&mem_callback_rwlock);
   3011 	return (0);
   3012 }
   3013 
   3014 void
   3015 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
   3016 {
   3017 	uint_t i;
   3018 
   3019 	rw_enter(&mem_callback_rwlock, RW_WRITER);
   3020 	for (i = 0; i < nmemcallbacks; i++) {
   3021 		if (mem_callbacks[i].vec == vec &&
   3022 		    mem_callbacks[i].arg == arg) {
   3023 			mem_callbacks[i].vec = NULL;
   3024 			mem_callbacks[i].arg = NULL;
   3025 			if (i == (nmemcallbacks - 1))
   3026 				nmemcallbacks--;
   3027 			break;
   3028 		}
   3029 	}
   3030 	rw_exit(&mem_callback_rwlock);
   3031 }
   3032 
   3033 static void
   3034 kphysm_setup_post_add(pgcnt_t delta_pages)
   3035 {
   3036 	uint_t i;
   3037 
   3038 	rw_enter(&mem_callback_rwlock, RW_READER);
   3039 	for (i = 0; i < nmemcallbacks; i++) {
   3040 		if (mem_callbacks[i].vec != NULL) {
   3041 			(*mem_callbacks[i].vec->post_add)
   3042 			    (mem_callbacks[i].arg, delta_pages);
   3043 		}
   3044 	}
   3045 	rw_exit(&mem_callback_rwlock);
   3046 }
   3047 
   3048 /*
   3049  * Note the locking between pre_del and post_del: The reader lock is held
   3050  * between the two calls to stop the set of functions from changing.
   3051  */
   3052 
   3053 static int
   3054 kphysm_setup_pre_del(pgcnt_t delta_pages)
   3055 {
   3056 	uint_t i;
   3057 	int ret;
   3058 	int aret;
   3059 
   3060 	ret = 0;
   3061 	rw_enter(&mem_callback_rwlock, RW_READER);
   3062 	for (i = 0; i < nmemcallbacks; i++) {
   3063 		if (mem_callbacks[i].vec != NULL) {
   3064 			aret = (*mem_callbacks[i].vec->pre_del)
   3065 			    (mem_callbacks[i].arg, delta_pages);
   3066 			ret |= aret;
   3067 		}
   3068 	}
   3069 
   3070 	return (ret);
   3071 }
   3072 
   3073 static void
   3074 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled)
   3075 {
   3076 	uint_t i;
   3077 
   3078 	for (i = 0; i < nmemcallbacks; i++) {
   3079 		if (mem_callbacks[i].vec != NULL) {
   3080 			(*mem_callbacks[i].vec->post_del)
   3081 			    (mem_callbacks[i].arg, delta_pages, cancelled);
   3082 		}
   3083 	}
   3084 	rw_exit(&mem_callback_rwlock);
   3085 }
   3086 
   3087 static int
   3088 kphysm_split_memseg(
   3089 	pfn_t base,
   3090 	pgcnt_t npgs)
   3091 {
   3092 	struct memseg *seg;
   3093 	struct memseg **segpp;
   3094 	pgcnt_t size_low, size_high;
   3095 	struct memseg *seg_low, *seg_mid, *seg_high;
   3096 
   3097 	/*
   3098 	 * Lock the memsegs list against other updates now
   3099 	 */
   3100 	memsegs_lock(1);
   3101 
   3102 	/*
   3103 	 * Find boot time memseg that wholly covers this area.
   3104 	 */
   3105 
   3106 	/* First find the memseg with page 'base' in it. */
   3107 	for (segpp = &memsegs; (seg = *segpp) != NULL;
   3108 	    segpp = &((*segpp)->next)) {
   3109 		if (base >= seg->pages_base && base < seg->pages_end)
   3110 			break;
   3111 	}
   3112 	if (seg == NULL) {
   3113 		memsegs_unlock(1);
   3114 		return (0);
   3115 	}
   3116 	if (memseg_includes_meta(seg)) {
   3117 		memsegs_unlock(1);
   3118 		return (0);
   3119 	}
   3120 	if ((base + npgs) > seg->pages_end) {
   3121 		memsegs_unlock(1);
   3122 		return (0);
   3123 	}
   3124 
   3125 	/*
   3126 	 * Work out the size of the two segments that will
   3127 	 * surround the new segment, one for low address
   3128 	 * and one for high.
   3129 	 */
   3130 	ASSERT(base >= seg->pages_base);
   3131 	size_low = base - seg->pages_base;
   3132 	ASSERT(seg->pages_end >= (base + npgs));
   3133 	size_high = seg->pages_end - (base + npgs);
   3134 
   3135 	/*
   3136 	 * Sanity check.
   3137 	 */
   3138 	if ((size_low + size_high) == 0) {
   3139 		memsegs_unlock(1);
   3140 		return (0);
   3141 	}
   3142 
   3143 	/*
   3144 	 * Allocate the new structures. The old memseg will not be freed
   3145 	 * as there may be a reference to it.
   3146 	 */
   3147 	seg_low = NULL;
   3148 	seg_high = NULL;
   3149 
   3150 	if (size_low != 0)
   3151 		seg_low = memseg_alloc();
   3152 
   3153 	seg_mid = memseg_alloc();
   3154 
   3155 	if (size_high != 0)
   3156 		seg_high = memseg_alloc();
   3157 
   3158 	/*
   3159 	 * All allocation done now.
   3160 	 */
   3161 	if (size_low != 0) {
   3162 		seg_low->pages = seg->pages;
   3163 		seg_low->epages = seg_low->pages + size_low;
   3164 		seg_low->pages_base = seg->pages_base;
   3165 		seg_low->pages_end = seg_low->pages_base + size_low;
   3166 		seg_low->next = seg_mid;
   3167 		seg_low->msegflags = seg->msegflags;
   3168 	}
   3169 	if (size_high != 0) {
   3170 		seg_high->pages = seg->epages - size_high;
   3171 		seg_high->epages = seg_high->pages + size_high;
   3172 		seg_high->pages_base = seg->pages_end - size_high;
   3173 		seg_high->pages_end = seg_high->pages_base + size_high;
   3174 		seg_high->next = seg->next;
   3175 		seg_high->msegflags = seg->msegflags;
   3176 	}
   3177 
   3178 	seg_mid->pages = seg->pages + size_low;
   3179 	seg_mid->pages_base = seg->pages_base + size_low;
   3180 	seg_mid->epages = seg->epages - size_high;
   3181 	seg_mid->pages_end = seg->pages_end - size_high;
   3182 	seg_mid->next = (seg_high != NULL) ? seg_high : seg->next;
   3183 	seg_mid->msegflags = seg->msegflags;
   3184 
   3185 	/*
   3186 	 * Update hat_kpm specific info of all involved memsegs and
   3187 	 * allow hat_kpm specific global chain updates.
   3188 	 */
   3189 	hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high);
   3190 
   3191 	/*
   3192 	 * At this point we have two equivalent memseg sub-chains,
   3193 	 * seg and seg_low/seg_mid/seg_high, which both chain on to
   3194 	 * the same place in the global chain. By re-writing the pointer
   3195 	 * in the previous element we switch atomically from using the old
   3196 	 * (seg) to the new.
   3197 	 */
   3198 	*segpp = (seg_low != NULL) ? seg_low : seg_mid;
   3199 
   3200 	membar_enter();
   3201 
   3202 	build_pfn_hash();
   3203 	memsegs_unlock(1);
   3204 
   3205 	/*
   3206 	 * We leave the old segment, 'seg', intact as there may be
   3207 	 * references to it. Also, as the value of total_pages has not
   3208 	 * changed and the memsegs list is effectively the same when
   3209 	 * accessed via the old or the new pointer, we do not have to
   3210 	 * cause pageout_scanner() to re-evaluate its hand pointers.
   3211 	 *
   3212 	 * We currently do not re-use or reclaim the page_t memory.
   3213 	 * If we do, then this may have to change.
   3214 	 */
   3215 
   3216 	mutex_enter(&memseg_lists_lock);
   3217 	seg->lnext = memseg_edit_junk;
   3218 	memseg_edit_junk = seg;
   3219 	mutex_exit(&memseg_lists_lock);
   3220 
   3221 	return (1);
   3222 }
   3223 
   3224 /*
   3225  * The sfmmu hat layer (e.g.) accesses some parts of the memseg
   3226  * structure using physical addresses. Therefore a kmem_cache is
   3227  * used with KMC_NOHASH to avoid page crossings within a memseg
   3228  * structure. KMC_NOHASH requires that no external (outside of
   3229  * slab) information is allowed. This, in turn, implies that the
   3230  * cache's slabsize must be exactly a single page, since per-slab
   3231  * information (e.g. the freelist for the slab) is kept at the
   3232  * end of the slab, where it is easy to locate. Should be changed
   3233  * when a more obvious kmem_cache interface/flag will become
   3234  * available.
   3235  */
   3236 void
   3237 mem_config_init()
   3238 {
   3239 	memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg),
   3240 	    0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
   3241 }
   3242 
   3243 struct memseg *
   3244 memseg_alloc()
   3245 {
   3246 	struct memseg *seg;
   3247 
   3248 	seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
   3249 	bzero(seg, sizeof (struct memseg));
   3250 
   3251 	return (seg);
   3252 }
   3253 
   3254 /*
   3255  * Return whether the page_t memory for this memseg
   3256  * is included in the memseg itself.
   3257  */
   3258 static int
   3259 memseg_includes_meta(struct memseg *seg)
   3260 {
   3261 	return (seg->msegflags & MEMSEG_META_INCL);
   3262 }
   3263 
   3264 pfn_t
   3265 memseg_get_start(struct memseg *seg)
   3266 {
   3267 	pfn_t		pt_start;
   3268 
   3269 	if (memseg_includes_meta(seg)) {
   3270 		pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
   3271 
   3272 		/* Meta data is required to be at the beginning */
   3273 		ASSERT(pt_start < seg->pages_base);
   3274 	} else
   3275 		pt_start = seg->pages_base;
   3276 
   3277 	return (pt_start);
   3278 }
   3279 
   3280 /*
   3281  * Invalidate memseg pointers in cpu private vm data caches.
   3282  */
   3283 static void
   3284 memseg_cpu_vm_flush()
   3285 {
   3286 	cpu_t *cp;
   3287 	vm_cpu_data_t *vc;
   3288 
   3289 	mutex_enter(&cpu_lock);
   3290 	pause_cpus(NULL);
   3291 
   3292 	cp = cpu_list;
   3293 	do {
   3294 		vc = cp->cpu_vm_data;
   3295 		vc->vc_pnum_memseg = NULL;
   3296 		vc->vc_pnext_memseg = NULL;
   3297 
   3298 	} while ((cp = cp->cpu_next) != cpu_list);
   3299 
   3300 	start_cpus();
   3301 	mutex_exit(&cpu_lock);
   3302 }
   3303