Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #pragma ident	"@(#)vm_seg.c	1.80	07/01/03 SMI"
     40 
     41 /*
     42  * VM - segment management.
     43  */
     44 
     45 #include <sys/types.h>
     46 #include <sys/inttypes.h>
     47 #include <sys/t_lock.h>
     48 #include <sys/param.h>
     49 #include <sys/systm.h>
     50 #include <sys/kmem.h>
     51 #include <sys/vmsystm.h>
     52 #include <sys/debug.h>
     53 #include <sys/cmn_err.h>
     54 #include <sys/callb.h>
     55 #include <sys/mem_config.h>
     56 #include <sys/mman.h>
     57 
     58 #include <vm/hat.h>
     59 #include <vm/as.h>
     60 #include <vm/seg.h>
     61 #include <vm/seg_kmem.h>
     62 #include <vm/seg_spt.h>
     63 #include <vm/seg_vn.h>
/*
 * Named kstat counters for segment advice (the MADV_FREE hit and miss
 * counts).  seg_init() publishes this table as a virtual named kstat
 * ("unix", instance 0, name "segadvstat", class "vm").
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
};

/*
 * The same table viewed as a flat array of kstat_named_t, and the number
 * of entries in it.  These are the values seg_init() passes to
 * kstat_create() and stores in ks_data.
 */
kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
     74 
     75 /* #define	PDEBUG */
     76 #if defined(PDEBUG) || defined(lint) || defined(__lint)
     77 int pdebug = 0;
     78 #else
     79 #define	pdebug		0
     80 #endif	/* PDEBUG */
     81 
     82 #define	PPRINTF				if (pdebug) printf
     83 #define	PPRINT(x)			PPRINTF(x)
     84 #define	PPRINT1(x, a)			PPRINTF(x, a)
     85 #define	PPRINT2(x, a, b)		PPRINTF(x, a, b)
     86 #define	PPRINT3(x, a, b, c)		PPRINTF(x, a, b, c)
     87 #define	PPRINT4(x, a, b, c, d)		PPRINTF(x, a, b, c, d)
     88 #define	PPRINT5(x, a, b, c, d, e)	PPRINTF(x, a, b, c, d, e)
     89 
     90 #define	P_HASHMASK		(p_hashsize - 1)
     91 #define	P_BASESHIFT		6
     92 
/*
 * Entry in the segment page (pagelock) cache: one cached
 * (segment, address, length, rw) range together with the shadow page list
 * that was handed to seg_pinsert() for it.
 *
 * p_hnext and p_hprev must remain the first two members, in this order.
 * The bucket header (struct seg_phash) begins with the same two pointers
 * and is cast to a struct seg_pcache * so that it can serve as the
 * sentinel of the circular bucket list.
 */
struct seg_pcache {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;	/* also links the purge lists */
	int		p_active;	/* active count: lookups/insert not */
					/* yet released by seg_pinactive() */
	int		p_ref;		/* ref bit: set on release, cleared */
					/* by the seg_ppurge_all() scan */
	size_t		p_len;		/* segment length */
	caddr_t		p_addr;		/* base address */
	struct seg 	*p_seg;		/* segment */
	struct page	**p_pp;		/* pp shadow list */
	enum seg_rw	p_rw;		/* rw */
	uint_t		p_flags;	/* bit flags (SEGP_*) from seg_pinsert() */
	/* reclaim routine supplied by the caller of seg_pinsert() */
	int		(*p_callback)(struct seg *, caddr_t, size_t,
			    struct page **, enum seg_rw);
};
    110 
/*
 * Hash bucket header.  The first two members mirror the start of
 * struct seg_pcache because the header itself is cast to a
 * struct seg_pcache * and used as the sentinel of a circular, doubly
 * linked list; an empty bucket points at itself (see seg_pinit()).
 */
struct seg_phash {
	struct seg_pcache *p_hnext;	/* list for hashed blocks */
	struct seg_pcache *p_hprev;
	int p_qlen;			/* Q length */
	kmutex_t p_hmutex;		/* protects hash bucket */
};
    117 
/*
 * Pagelock cache tunables and state.
 */
static int seg_preap_time = 20;	/* reclaim every 20 secs */
static int seg_pmaxqlen = 5;	/* max Q length in hash list; seg_pinit() */
				/* raises it on large-memory systems */
static int seg_ppcount = 5;	/* max # of purges per reclaim interval */
static int seg_plazy = 1;	/* if 1, pages are cached after pageunlock */
static pgcnt_t seg_pwindow;	/* max # of pages that can be cached */
static pgcnt_t seg_plocked;	/* # of pages which are cached by pagelock */
static pgcnt_t seg_plocked_window; /* # pages from window, i.e. cached */
				/* without SEGP_FORCE_WIRED */
int seg_preapahead;		/* not referenced by the code in this file */

static uint_t seg_pdisable = 0;	/* if not 0, caching temporarily disabled */

static int seg_pupdate_active = 1;	/* background reclaim thread */
/*
 * Reap interval in ticks once seg_pasync_thread() has started.  Before
 * that, a non-zero value is multiplied by hz there, so a preset value is
 * treated as a number of seconds; zero selects seg_preap_time.
 */
static clock_t seg_preap_interval;	/* reap interval in ticks */

/*
 * Locks.  In this file seg_pcache is taken around updates of seg_pdisable
 * and around the one-time setup in seg_pinit(); the per-bucket p_hmutex
 * protects the hash lists themselves.  seg_pmem is held while seg_plocked
 * and seg_plocked_window are modified.
 */
static kmutex_t seg_pcache;	/* protects the whole pagelock cache */
static kmutex_t seg_pmem;	/* protects window counter */
/* posted by seg_preap()/seg_pupdate(), waited on by seg_pasync_thread() */
static ksema_t seg_psaync_sem;	/* sema for reclaim thread */
static struct seg_phash *p_hashtab;	/* array of p_hashsize buckets */
static int p_hashsize = 0;	/* 0: let seg_pinit() choose from physmem */
    137 
    138 #define	p_hash(seg) \
    139 	(P_HASHMASK & \
    140 	((uintptr_t)(seg) >> P_BASESHIFT))
    141 
    142 #define	p_match(pcp, seg, addr, len, rw) \
    143 	(((pcp)->p_seg == (seg) && \
    144 	(pcp)->p_addr == (addr) && \
    145 	(pcp)->p_rw == (rw) && \
    146 	(pcp)->p_len == (len)) ? 1 : 0)
    147 
    148 #define	p_match_pp(pcp, seg, addr, len, pp, rw) \
    149 	(((pcp)->p_seg == (seg) && \
    150 	(pcp)->p_addr == (addr) && \
    151 	(pcp)->p_pp == (pp) && \
    152 	(pcp)->p_rw == (rw) && \
    153 	(pcp)->p_len == (len)) ? 1 : 0)
    154 
    155 
    156 /*
    157  * lookup an address range in pagelock cache. Return shadow list
    158  * and bump up active count.
    159  */
    160 struct page **
    161 seg_plookup(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw)
    162 {
    163 	struct seg_pcache *pcp;
    164 	struct seg_phash *hp;
    165 
    166 	/*
    167 	 * Skip pagelock cache, while DR is in progress or
    168 	 * seg_pcache is off.
    169 	 */
    170 	if (seg_pdisable || seg_plazy == 0) {
    171 		return (NULL);
    172 	}
    173 
    174 	hp = &p_hashtab[p_hash(seg)];
    175 	mutex_enter(&hp->p_hmutex);
    176 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
    177 	    pcp = pcp->p_hnext) {
    178 		if (p_match(pcp, seg, addr, len, rw)) {
    179 			pcp->p_active++;
    180 			mutex_exit(&hp->p_hmutex);
    181 
    182 			PPRINT5("seg_plookup hit: seg %p, addr %p, "
    183 			    "len %lx, count %d, pplist %p \n",
    184 			    (void *)seg, (void *)addr, len, pcp->p_active,
    185 			    (void *)pcp->p_pp);
    186 
    187 			return (pcp->p_pp);
    188 		}
    189 	}
    190 	mutex_exit(&hp->p_hmutex);
    191 
    192 	PPRINT("seg_plookup miss:\n");
    193 
    194 	return (NULL);
    195 }
    196 
    197 /*
    198  * mark address range inactive. If the cache is off or the address
    199  * range is not in the cache we call the segment driver to reclaim
    200  * the pages. Otherwise just decrement active count and set ref bit.
    201  */
    202 void
    203 seg_pinactive(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
    204     enum seg_rw rw, int (*callback)(struct seg *, caddr_t, size_t,
    205     struct page **, enum seg_rw))
    206 {
    207 	struct seg_pcache *pcp;
    208 	struct seg_phash *hp;
    209 
    210 	if (seg_plazy == 0) {
    211 		(void) (*callback)(seg, addr, len, pp, rw);
    212 		return;
    213 	}
    214 	hp = &p_hashtab[p_hash(seg)];
    215 	mutex_enter(&hp->p_hmutex);
    216 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
    217 	    pcp = pcp->p_hnext) {
    218 		if (p_match_pp(pcp, seg, addr, len, pp, rw)) {
    219 			pcp->p_active--;
    220 			ASSERT(pcp->p_active >= 0);
    221 			if (pcp->p_active == 0 && seg_pdisable) {
    222 				int npages;
    223 
    224 				ASSERT(callback == pcp->p_callback);
    225 				/* free the entry */
    226 				hp->p_qlen--;
    227 				pcp->p_hprev->p_hnext = pcp->p_hnext;
    228 				pcp->p_hnext->p_hprev = pcp->p_hprev;
    229 				mutex_exit(&hp->p_hmutex);
    230 				npages = pcp->p_len >> PAGESHIFT;
    231 				mutex_enter(&seg_pmem);
    232 				seg_plocked -= npages;
    233 				if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
    234 					seg_plocked_window -= npages;
    235 				}
    236 				mutex_exit(&seg_pmem);
    237 				kmem_free(pcp, sizeof (struct seg_pcache));
    238 				goto out;
    239 			}
    240 			pcp->p_ref = 1;
    241 			mutex_exit(&hp->p_hmutex);
    242 			return;
    243 		}
    244 	}
    245 	mutex_exit(&hp->p_hmutex);
    246 out:
    247 	(void) (*callback)(seg, addr, len, pp, rw);
    248 }
    249 
    250 /*
    251  * The seg_pinsert_check() is used by segment drivers to predict whether
    252  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
    253  */
    254 
    255 int
    256 seg_pinsert_check(struct seg *seg, size_t len, uint_t flags)
    257 {
    258 	struct seg_phash *hp;
    259 
    260 	if (seg_plazy == 0) {
    261 		return (SEGP_FAIL);
    262 	}
    263 	if (seg_pdisable != 0) {
    264 		return (SEGP_FAIL);
    265 	}
    266 	ASSERT((len & PAGEOFFSET) == 0);
    267 	hp = &p_hashtab[p_hash(seg)];
    268 	if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
    269 		return (SEGP_FAIL);
    270 	}
    271 	/*
    272 	 * If the SEGP_FORCE_WIRED flag is set,
    273 	 * we skip the check for seg_pwindow.
    274 	 */
    275 	if ((flags & SEGP_FORCE_WIRED) == 0) {
    276 		pgcnt_t npages;
    277 
    278 		npages = len >> PAGESHIFT;
    279 		if ((seg_plocked_window + npages) > seg_pwindow) {
    280 			return (SEGP_FAIL);
    281 		}
    282 	}
    283 	return (SEGP_SUCCESS);
    284 }
    285 
    286 
    287 /*
    288  * insert address range with shadow list into pagelock cache. If
    289  * the cache is off or caching is temporarily disabled or the allowed
    290  * 'window' is exceeded - return SEGP_FAIL. Otherwise return
    291  * SEGP_SUCCESS.
    292  */
    293 int
    294 seg_pinsert(struct seg *seg, caddr_t addr, size_t len, struct page **pp,
    295     enum seg_rw rw, uint_t flags, int (*callback)(struct seg *, caddr_t,
    296     size_t, struct page **, enum seg_rw))
    297 {
    298 	struct seg_pcache *pcp;
    299 	struct seg_phash *hp;
    300 	pgcnt_t npages;
    301 
    302 	if (seg_plazy == 0) {
    303 		return (SEGP_FAIL);
    304 	}
    305 	if (seg_pdisable != 0) {
    306 		return (SEGP_FAIL);
    307 	}
    308 	ASSERT((len & PAGEOFFSET) == 0);
    309 	hp = &p_hashtab[p_hash(seg)];
    310 	if (hp->p_qlen > seg_pmaxqlen && (flags & SEGP_FORCE_WIRED) == 0) {
    311 		return (SEGP_FAIL);
    312 	}
    313 	npages = len >> PAGESHIFT;
    314 	mutex_enter(&seg_pmem);
    315 	/*
    316 	 * If the SEGP_FORCE_WIRED flag is set,
    317 	 * we skip the check for seg_pwindow.
    318 	 */
    319 	if ((flags & SEGP_FORCE_WIRED) == 0) {
    320 		seg_plocked_window += npages;
    321 		if (seg_plocked_window > seg_pwindow) {
    322 			seg_plocked_window -= npages;
    323 			mutex_exit(&seg_pmem);
    324 			return (SEGP_FAIL);
    325 		}
    326 	}
    327 	seg_plocked += npages;
    328 	mutex_exit(&seg_pmem);
    329 
    330 	pcp = kmem_alloc(sizeof (struct seg_pcache), KM_SLEEP);
    331 	pcp->p_seg = seg;
    332 	pcp->p_addr = addr;
    333 	pcp->p_len = len;
    334 	pcp->p_pp = pp;
    335 	pcp->p_rw = rw;
    336 	pcp->p_callback = callback;
    337 	pcp->p_active = 1;
    338 	pcp->p_flags = flags;
    339 
    340 	PPRINT4("seg_pinsert: seg %p, addr %p, len %lx, pplist %p\n",
    341 	    (void *)seg, (void *)addr, len, (void *)pp);
    342 
    343 	hp = &p_hashtab[p_hash(seg)];
    344 	mutex_enter(&hp->p_hmutex);
    345 	hp->p_qlen++;
    346 	pcp->p_hnext = hp->p_hnext;
    347 	pcp->p_hprev = (struct seg_pcache *)hp;
    348 	hp->p_hnext->p_hprev = pcp;
    349 	hp->p_hnext = pcp;
    350 	mutex_exit(&hp->p_hmutex);
    351 	return (SEGP_SUCCESS);
    352 }
    353 
/*
 * Purge all entries from the pagelock cache that are not active
 * and not recently used. Drop all locks and call through
 * the address space into the segment driver to reclaim
 * the pages. This makes sure we get the address space
 * and segment driver locking right.
 *
 * force == 0:	normal background reclaim. Only inactive entries that carry
 *		SEGP_ASYNC_FLUSH and whose ref bit is already clear are
 *		purged, and the scan stops once more than seg_ppcount
 *		entries have been collected.
 * force != 0:	ref bits and SEGP_ASYNC_FLUSH are ignored and the scan is
 *		not throttled; every inactive entry is a candidate.
 *
 * The work is done in two phases: entries are first unlinked from their
 * buckets (under the bucket lock) onto a private list, and only then, with
 * no bucket lock held, are the pages reclaimed and the entries freed.
 */
static void
seg_ppurge_all(int force)
{
	/* private list of unlinked entries, chained through p_hprev */
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	int purge_count = 0;		/* cumulative across all buckets */
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plazy == 0 || seg_plocked == 0) {
		return;
	}
	for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
		mutex_enter(&hp->p_hmutex);
		pcp = hp->p_hnext;

		/*
		 * While 'force' is set, seg_pasync_thread is not
		 * throttled.  This is to speedup flushing of seg_pcache
		 * in preparation for DR.
		 *
		 * In normal case, when 'force' is not set, we throttle
		 * seg_pasync_thread so that we don't spend all the
		 * time in purging the cache.
		 */
		while ((pcp != (struct seg_pcache *)hp) &&
				(force || (purge_count <= seg_ppcount))) {

			/*
			 * purge entries which are not active and
			 * have not been used recently and
			 * have the SEGP_ASYNC_FLUSH flag.
			 *
			 * In the 'force' case, we ignore the
			 * SEGP_ASYNC_FLUSH flag.
			 */
			if (!(pcp->p_flags & SEGP_ASYNC_FLUSH))
				pcp->p_ref = 1;
			if (force)
				pcp->p_ref = 0;
			if (!pcp->p_ref && !pcp->p_active) {
				struct as *as = pcp->p_seg->s_as;

				/*
				 * try to get the readers lock on the address
				 * space before taking out the cache element.
				 * This ensures as_pagereclaim() can actually
				 * call through the address space and free
				 * the pages. If we don't get the lock, just
				 * skip this entry. The pages will be reclaimed
				 * by the segment driver at unmap time.
				 *
				 * The lock taken here stays held until the
				 * entry is processed in the second loop below.
				 */
				if (AS_LOCK_TRYENTER(as, &as->a_lock,
				    RW_READER)) {
					hp->p_qlen--;
					pcp->p_hprev->p_hnext = pcp->p_hnext;
					pcp->p_hnext->p_hprev = pcp->p_hprev;
					/*
					 * Only p_hprev is reused as the list
					 * link; p_hnext is left intact so the
					 * scan can step past this entry.
					 */
					pcp->p_hprev = delcallb_list;
					delcallb_list = pcp;
					purge_count++;
				}
			} else {
				/*
				 * Entry survives this pass; clear the ref bit
				 * so it becomes a candidate next time unless
				 * it is referenced again in the meantime.
				 */
				pcp->p_ref = 0;
			}
			pcp = pcp->p_hnext;
		}
		mutex_exit(&hp->p_hmutex);
		/* throttle reached: leave the remaining buckets for later */
		if (!force && purge_count > seg_ppcount)
			break;
	}

	/*
	 * run the delayed callback list. We don't want to hold the
	 * cache lock during a call through the address space.
	 */
	while (delcallb_list != NULL) {
		struct as *as;

		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		as = pcp->p_seg->s_as;

		PPRINT4("seg_ppurge_all: purge seg %p, addr %p, len %lx, "
		    "pplist %p\n", (void *)pcp->p_seg, (void *)pcp->p_addr,
		    pcp->p_len, (void *)pcp->p_pp);

		as_pagereclaim(as, pcp->p_pp, pcp->p_addr,
		    pcp->p_len, pcp->p_rw);
		/* drop the reader lock acquired when the entry was unlinked */
		AS_LOCK_EXIT(as, &as->a_lock);
		npages += pcp->p_len >> PAGESHIFT;
		if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
			npages_window += pcp->p_len >> PAGESHIFT;
		}
		kmem_free(pcp, sizeof (struct seg_pcache));
	}
	/* settle the page accounting once, for everything purged above */
	mutex_enter(&seg_pmem);
	seg_plocked -= npages;
	seg_plocked_window -= npages_window;
	mutex_exit(&seg_pmem);
}
    465 
    466 /*
    467  * Remove cached pages for segment(s) entries from hashtable.
    468  * The segments are identified by a given clients callback
    469  * function.
    470  * This is useful for multiple seg's cached on behalf of
    471  * dummy segment (ISM/DISM) with common callback function.
    472  * The clients callback function may return status indicating
    473  * that the last seg's entry has been purged. In such a case
    474  * the seg_ppurge_seg() stops searching hashtable and exits.
    475  * Otherwise all hashtable entries are scanned.
    476  */
    477 void
    478 seg_ppurge_seg(int (*callback)(struct seg *, caddr_t, size_t,
    479     struct page **, enum seg_rw))
    480 {
    481 	struct seg_pcache *pcp, *npcp;
    482 	struct seg_phash *hp;
    483 	pgcnt_t npages = 0;
    484 	pgcnt_t npages_window = 0;
    485 	int	done = 0;
    486 
    487 	/*
    488 	 * if the cache if off or empty, return
    489 	 */
    490 	if (seg_plazy == 0 || seg_plocked == 0) {
    491 		return;
    492 	}
    493 	mutex_enter(&seg_pcache);
    494 	seg_pdisable++;
    495 	mutex_exit(&seg_pcache);
    496 
    497 	for (hp = p_hashtab; hp < &p_hashtab[p_hashsize]; hp++) {
    498 
    499 		mutex_enter(&hp->p_hmutex);
    500 		pcp = hp->p_hnext;
    501 		while (pcp != (struct seg_pcache *)hp) {
    502 
    503 			/*
    504 			 * purge entries which are not active
    505 			 */
    506 			npcp = pcp->p_hnext;
    507 			if (!pcp->p_active && pcp->p_callback == callback) {
    508 				hp->p_qlen--;
    509 				pcp->p_hprev->p_hnext = pcp->p_hnext;
    510 				pcp->p_hnext->p_hprev = pcp->p_hprev;
    511 
    512 				if ((*pcp->p_callback)(pcp->p_seg, pcp->p_addr,
    513 				    pcp->p_len, pcp->p_pp, pcp->p_rw)) {
    514 					done = 1;
    515 				}
    516 
    517 				npages += pcp->p_len >> PAGESHIFT;
    518 				if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
    519 					npages_window +=
    520 					    pcp->p_len >> PAGESHIFT;
    521 				}
    522 				kmem_free(pcp, sizeof (struct seg_pcache));
    523 			}
    524 			pcp = npcp;
    525 			if (done)
    526 				break;
    527 		}
    528 		mutex_exit(&hp->p_hmutex);
    529 		if (done)
    530 			break;
    531 	}
    532 
    533 	mutex_enter(&seg_pcache);
    534 	seg_pdisable--;
    535 	mutex_exit(&seg_pcache);
    536 
    537 	mutex_enter(&seg_pmem);
    538 	seg_plocked -= npages;
    539 	seg_plocked_window -= npages_window;
    540 	mutex_exit(&seg_pmem);
    541 }
    542 
    543 /*
    544  * purge all entries for a given segment. Since we
    545  * callback into the segment driver directly for page
    546  * reclaim the caller needs to hold the right locks.
    547  */
    548 void
    549 seg_ppurge(struct seg *seg)
    550 {
    551 	struct seg_pcache *delcallb_list = NULL;
    552 	struct seg_pcache *pcp;
    553 	struct seg_phash *hp;
    554 	pgcnt_t npages = 0;
    555 	pgcnt_t npages_window = 0;
    556 
    557 	if (seg_plazy == 0) {
    558 		return;
    559 	}
    560 	hp = &p_hashtab[p_hash(seg)];
    561 	mutex_enter(&hp->p_hmutex);
    562 	pcp = hp->p_hnext;
    563 	while (pcp != (struct seg_pcache *)hp) {
    564 		if (pcp->p_seg == seg) {
    565 			if (pcp->p_active) {
    566 				break;
    567 			}
    568 			hp->p_qlen--;
    569 			pcp->p_hprev->p_hnext = pcp->p_hnext;
    570 			pcp->p_hnext->p_hprev = pcp->p_hprev;
    571 			pcp->p_hprev = delcallb_list;
    572 			delcallb_list = pcp;
    573 		}
    574 		pcp = pcp->p_hnext;
    575 	}
    576 	mutex_exit(&hp->p_hmutex);
    577 	while (delcallb_list != NULL) {
    578 		pcp = delcallb_list;
    579 		delcallb_list = pcp->p_hprev;
    580 
    581 		PPRINT4("seg_ppurge: purge seg %p, addr %p, len %lx, "
    582 		    "pplist %p\n", (void *)seg, (void *)pcp->p_addr,
    583 		    pcp->p_len, (void *)pcp->p_pp);
    584 
    585 		ASSERT(seg == pcp->p_seg);
    586 		(void) (*pcp->p_callback)(seg, pcp->p_addr,
    587 		    pcp->p_len, pcp->p_pp, pcp->p_rw);
    588 		npages += pcp->p_len >> PAGESHIFT;
    589 		if ((pcp->p_flags & SEGP_FORCE_WIRED) == 0) {
    590 			npages_window += pcp->p_len >> PAGESHIFT;
    591 		}
    592 		kmem_free(pcp, sizeof (struct seg_pcache));
    593 	}
    594 	mutex_enter(&seg_pmem);
    595 	seg_plocked -= npages;
    596 	seg_plocked_window -= npages_window;
    597 	mutex_exit(&seg_pmem);
    598 }
    599 
    600 static void seg_pinit_mem_config(void);
    601 
    602 /*
    603  * setup the pagelock cache
    604  */
    605 static void
    606 seg_pinit(void)
    607 {
    608 	struct seg_phash *hp;
    609 	int i;
    610 	uint_t physmegs;
    611 
    612 	sema_init(&seg_psaync_sem, 0, NULL, SEMA_DEFAULT, NULL);
    613 
    614 	mutex_enter(&seg_pcache);
    615 	if (p_hashtab == NULL) {
    616 		physmegs = physmem >> (20 - PAGESHIFT);
    617 
    618 		/* If p_hashsize was not set in /etc/system ... */
    619 		if (p_hashsize == 0) {
    620 			/*
    621 			 * Choose p_hashsize based on physmem.
    622 			 */
    623 			if (physmegs < 64) {
    624 				p_hashsize = 64;
    625 			} else if (physmegs < 1024) {
    626 				p_hashsize = 1024;
    627 			} else if (physmegs < 10 * 1024) {
    628 				p_hashsize = 8192;
    629 			} else if (physmegs < 20 * 1024) {
    630 				p_hashsize = 2 * 8192;
    631 				seg_pmaxqlen = 16;
    632 			} else {
    633 				p_hashsize = 128 * 1024;
    634 				seg_pmaxqlen = 128;
    635 			}
    636 		}
    637 
    638 		p_hashtab = kmem_zalloc(
    639 			p_hashsize * sizeof (struct seg_phash), KM_SLEEP);
    640 		for (i = 0; i < p_hashsize; i++) {
    641 			hp = (struct seg_phash *)&p_hashtab[i];
    642 			hp->p_hnext = (struct seg_pcache *)hp;
    643 			hp->p_hprev = (struct seg_pcache *)hp;
    644 			mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
    645 		}
    646 		if (seg_pwindow == 0) {
    647 			if (physmegs < 24) {
    648 				/* don't use cache */
    649 				seg_plazy = 0;
    650 			} else if (physmegs < 64) {
    651 				seg_pwindow = physmem >> 5; /* 3% of memory */
    652 			} else if (physmegs < 10 * 1024) {
    653 				seg_pwindow = physmem >> 3; /* 12% of memory */
    654 			} else {
    655 				seg_pwindow = physmem >> 1;
    656 			}
    657 		}
    658 	}
    659 	mutex_exit(&seg_pcache);
    660 
    661 	seg_pinit_mem_config();
    662 }
    663 
    664 /*
    665  * called by pageout if memory is low
    666  */
    667 void
    668 seg_preap(void)
    669 {
    670 	/*
    671 	 * if the cache if off or empty, return
    672 	 */
    673 	if (seg_plocked == 0 || seg_plazy == 0) {
    674 		return;
    675 	}
    676 	sema_v(&seg_psaync_sem);
    677 }
    678 
    679 static void seg_pupdate(void *);
    680 
    681 /*
    682  * run as a backgroud thread and reclaim pagelock
    683  * pages which have not been used recently
    684  */
    685 void
    686 seg_pasync_thread(void)
    687 {
    688 	callb_cpr_t cpr_info;
    689 	kmutex_t pasync_lock;	/* just for CPR stuff */
    690 
    691 	mutex_init(&pasync_lock, NULL, MUTEX_DEFAULT, NULL);
    692 
    693 	CALLB_CPR_INIT(&cpr_info, &pasync_lock,
    694 		callb_generic_cpr, "seg_pasync");
    695 
    696 	if (seg_preap_interval == 0) {
    697 		seg_preap_interval = seg_preap_time * hz;
    698 	} else {
    699 		seg_preap_interval *= hz;
    700 	}
    701 	if (seg_plazy && seg_pupdate_active) {
    702 		(void) timeout(seg_pupdate, NULL, seg_preap_interval);
    703 	}
    704 
    705 	for (;;) {
    706 		mutex_enter(&pasync_lock);
    707 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
    708 		mutex_exit(&pasync_lock);
    709 		sema_p(&seg_psaync_sem);
    710 		mutex_enter(&pasync_lock);
    711 		CALLB_CPR_SAFE_END(&cpr_info, &pasync_lock);
    712 		mutex_exit(&pasync_lock);
    713 
    714 		seg_ppurge_all(0);
    715 	}
    716 }
    717 
    718 static void
    719 seg_pupdate(void *dummy)
    720 {
    721 	sema_v(&seg_psaync_sem);
    722 
    723 	if (seg_plazy && seg_pupdate_active) {
    724 		(void) timeout(seg_pupdate, dummy, seg_preap_interval);
    725 	}
    726 }
    727 
    728 static struct kmem_cache *seg_cache;
    729 
    730 /*
    731  * Initialize segment management data structures.
    732  */
    733 void
    734 seg_init(void)
    735 {
    736 	kstat_t *ksp;
    737 
    738 	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
    739 		0, NULL, NULL, NULL, NULL, NULL, 0);
    740 
    741 	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
    742 		segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
    743 	if (ksp) {
    744 		ksp->ks_data = (void *)segadvstat_ptr;
    745 		kstat_install(ksp);
    746 	}
    747 
    748 	seg_pinit();
    749 }
    750 
    751 /*
    752  * Allocate a segment to cover [base, base+size]
    753  * and attach it to the specified address space.
    754  */
    755 struct seg *
    756 seg_alloc(struct as *as, caddr_t base, size_t size)
    757 {
    758 	struct seg *new;
    759 	caddr_t segbase;
    760 	size_t segsize;
    761 
    762 	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
    763 	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
    764 	    (uintptr_t)segbase;
    765 
    766 	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
    767 		return ((struct seg *)NULL);	/* bad virtual addr range */
    768 
    769 	if (as != &kas &&
    770 	    valid_usr_range(segbase, segsize, 0, as,
    771 	    as->a_userlimit) != RANGE_OKAY)
    772 		return ((struct seg *)NULL);	/* bad virtual addr range */
    773 
    774 	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
    775 	new->s_ops = NULL;
    776 	new->s_data = NULL;
    777 	new->s_szc = 0;
    778 	new->s_flags = 0;
    779 	if (seg_attach(as, segbase, segsize, new) < 0) {
    780 		kmem_cache_free(seg_cache, new);
    781 		return ((struct seg *)NULL);
    782 	}
    783 	/* caller must fill in ops, data */
    784 	return (new);
    785 }
    786 
    787 /*
    788  * Attach a segment to the address space.  Used by seg_alloc()
    789  * and for kernel startup to attach to static segments.
    790  */
    791 int
    792 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
    793 {
    794 	seg->s_as = as;
    795 	seg->s_base = base;
    796 	seg->s_size = size;
    797 
    798 	/*
    799 	 * as_addseg() will add the segment at the appropraite point
    800 	 * in the list. It will return -1 if there is overlap with
    801 	 * an already existing segment.
    802 	 */
    803 	return (as_addseg(as, seg));
    804 }
    805 
    806 /*
    807  * Unmap a segment and free it from its associated address space.
    808  * This should be called by anybody who's finished with a whole segment's
    809  * mapping.  Just calls SEGOP_UNMAP() on the whole mapping .  It is the
    810  * responsibility of the segment driver to unlink the the segment
    811  * from the address space, and to free public and private data structures
    812  * associated with the segment.  (This is typically done by a call to
    813  * seg_free()).
    814  */
    815 void
    816 seg_unmap(struct seg *seg)
    817 {
    818 #ifdef DEBUG
    819 	int ret;
    820 #endif /* DEBUG */
    821 
    822 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
    823 
    824 	/* Shouldn't have called seg_unmap if mapping isn't yet established */
    825 	ASSERT(seg->s_data != NULL);
    826 
    827 	/* Unmap the whole mapping */
    828 #ifdef DEBUG
    829 	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
    830 	ASSERT(ret == 0);
    831 #else
    832 	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
    833 #endif /* DEBUG */
    834 }
    835 
    836 /*
    837  * Free the segment from its associated as. This should only be called
    838  * if a mapping to the segment has not yet been established (e.g., if
    839  * an error occurs in the middle of doing an as_map when the segment
    840  * has already been partially set up) or if it has already been deleted
    841  * (e.g., from a segment driver unmap routine if the unmap applies to the
    842  * entire segment). If the mapping is currently set up then seg_unmap() should
    843  * be called instead.
    844  */
    845 void
    846 seg_free(struct seg *seg)
    847 {
    848 	register struct as *as = seg->s_as;
    849 	struct seg *tseg = as_removeseg(as, seg);
    850 
    851 	ASSERT(tseg == seg);
    852 
    853 	/*
    854 	 * If the segment private data field is NULL,
    855 	 * then segment driver is not attached yet.
    856 	 */
    857 	if (seg->s_data != NULL)
    858 		SEGOP_FREE(seg);
    859 
    860 	kmem_cache_free(seg_cache, seg);
    861 }
    862 
    863 /*ARGSUSED*/
    864 static void
    865 seg_p_mem_config_post_add(
    866 	void *arg,
    867 	pgcnt_t delta_pages)
    868 {
    869 	/* Nothing to do. */
    870 }
    871 
    872 void
    873 seg_p_enable(void)
    874 {
    875 	mutex_enter(&seg_pcache);
    876 	ASSERT(seg_pdisable != 0);
    877 	seg_pdisable--;
    878 	mutex_exit(&seg_pcache);
    879 }
    880 
    881 /*
    882  * seg_p_disable - disables seg_pcache, and then attempts to empty the
    883  * cache.
    884  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
    885  * SEGP_FAIL if the cache could not be emptied.
    886  */
    887 int
    888 seg_p_disable(void)
    889 {
    890 	pgcnt_t	old_plocked;
    891 	int stall_count = 0;
    892 
    893 	mutex_enter(&seg_pcache);
    894 	seg_pdisable++;
    895 	ASSERT(seg_pdisable != 0);
    896 	mutex_exit(&seg_pcache);
    897 
    898 	/*
    899 	 * Attempt to empty the cache. Terminate if seg_plocked does not
    900 	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
    901 	 */
    902 	while (seg_plocked != 0) {
    903 		old_plocked = seg_plocked;
    904 		seg_ppurge_all(1);
    905 		if (seg_plocked == old_plocked) {
    906 			if (stall_count++ > SEGP_STALL_THRESHOLD) {
    907 				return (SEGP_FAIL);
    908 			}
    909 		} else
    910 			stall_count = 0;
    911 		if (seg_plocked != 0)
    912 			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
    913 	}
    914 	return (SEGP_SUCCESS);
    915 }
    916 
    917 /*
    918  * Attempt to purge seg_pcache.  May need to return before this has
    919  * completed to allow other pre_del callbacks to unlock pages. This is
    920  * ok because:
    921  *	1) The seg_pdisable flag has been set so at least we won't
    922  *	cache anymore locks and the locks we couldn't purge
    923  *	will not be held if they do get released by a subsequent
    924  *	pre-delete callback.
    925  *
    926  *	2) The rest of the memory delete thread processing does not
    927  *	depend on the changes made in this pre-delete callback. No
    928  *	panics will result, the worst that will happen is that the
    929  *	DR code will timeout and cancel the delete.
    930  */
    931 /*ARGSUSED*/
    932 static int
    933 seg_p_mem_config_pre_del(
    934 	void *arg,
    935 	pgcnt_t delta_pages)
    936 {
    937 	if (seg_p_disable() != SEGP_SUCCESS)
    938 		cmn_err(CE_NOTE,
    939 		    "!Pre-delete couldn't purge"" pagelock cache - continuing");
    940 	return (0);
    941 }
    942 
    943 /*ARGSUSED*/
    944 static void
    945 seg_p_mem_config_post_del(
    946 	void *arg,
    947 	pgcnt_t delta_pages,
    948 	int cancelled)
    949 {
    950 	seg_p_enable();
    951 }
    952 
/*
 * Memory-configuration callback vector for the pagelock cache; registered
 * by seg_pinit_mem_config().
 */
static kphysm_setup_vector_t seg_p_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	seg_p_mem_config_post_add,	/* no-op */
	seg_p_mem_config_pre_del,	/* disable and try to empty the cache */
	seg_p_mem_config_post_del,	/* re-enable the cache */
};
    959 
    960 static void
    961 seg_pinit_mem_config(void)
    962 {
    963 	int ret;
    964 
    965 	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
    966 	/*
    967 	 * Want to catch this in the debug kernel. At run time, if the
    968 	 * callbacks don't get run all will be OK as the disable just makes
    969 	 * it more likely that the pages can be collected.
    970 	 */
    971 	ASSERT(ret == 0);
    972 }
    973 
    974 extern struct seg_ops segvn_ops;
    975 extern struct seg_ops segspt_shmops;
    976 
    977 /*
    978  * Verify that segment is not a shared anonymous segment which reserves
    979  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
    980  * from one zone to another if any segments are shared.  This is because the
    981  * last process to exit will credit the swap reservation.  This could lead
    982  * to the swap being reserved by one zone, and credited to another.
    983  */
    984 boolean_t
    985 seg_can_change_zones(struct seg *seg)
    986 {
    987 	struct segvn_data *svd;
    988 
    989 	if (seg->s_ops == &segspt_shmops)
    990 		return (B_FALSE);
    991 
    992 	if (seg->s_ops == &segvn_ops) {
    993 		svd = (struct segvn_data *)seg->s_data;
    994 		if (svd->type == MAP_SHARED &&
    995 		    svd->amp != NULL &&
    996 		    svd->amp->swresv > 0)
    997 		return (B_FALSE);
    998 	}
    999 	return (B_TRUE);
   1000 }
   1001 
   1002 /*
   1003  * Return swap reserved by a segment backing a private mapping.
   1004  */
   1005 size_t
   1006 seg_swresv(struct seg *seg)
   1007 {
   1008 	struct segvn_data *svd;
   1009 	size_t swap = 0;
   1010 
   1011 	if (seg->s_ops == &segvn_ops) {
   1012 		svd = (struct segvn_data *)seg->s_data;
   1013 		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
   1014 			swap = svd->swresv;
   1015 	}
   1016 	return (swap);
   1017 }
   1018