Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #pragma ident	"@(#)vm_page.c	1.330	07/11/12 SMI"
     40 
     41 /*
     42  * VM - physical page management.
     43  */
     44 
     45 #include <sys/types.h>
     46 #include <sys/t_lock.h>
     47 #include <sys/param.h>
     48 #include <sys/systm.h>
     49 #include <sys/errno.h>
     50 #include <sys/time.h>
     51 #include <sys/vnode.h>
     52 #include <sys/vm.h>
     53 #include <sys/vtrace.h>
     54 #include <sys/swap.h>
     55 #include <sys/cmn_err.h>
     56 #include <sys/tuneable.h>
     57 #include <sys/sysmacros.h>
     58 #include <sys/cpuvar.h>
     59 #include <sys/callb.h>
     60 #include <sys/debug.h>
     61 #include <sys/tnf_probe.h>
     62 #include <sys/condvar_impl.h>
     63 #include <sys/mem_config.h>
     64 #include <sys/mem_cage.h>
     65 #include <sys/kmem.h>
     66 #include <sys/atomic.h>
     67 #include <sys/strlog.h>
     68 #include <sys/mman.h>
     69 #include <sys/ontrap.h>
     70 #include <sys/lgrp.h>
     71 #include <sys/vfs.h>
     72 
     73 #include <vm/hat.h>
     74 #include <vm/anon.h>
     75 #include <vm/page.h>
     76 #include <vm/seg.h>
     77 #include <vm/pvn.h>
     78 #include <vm/seg_kmem.h>
     79 #include <vm/vm_dep.h>
     80 #include <sys/vm_usage.h>
     81 #include <fs/fs_subr.h>
     82 #include <sys/ddi.h>
     83 #include <sys/modctl.h>
     84 
     85 static int nopageage = 0;
     86 
     87 static pgcnt_t max_page_get;	/* max page_get request size in pages */
     88 pgcnt_t total_pages = 0;	/* total number of pages (used by /proc) */
     89 
     90 /*
     91  * freemem_lock protects all freemem variables:
     92  * availrmem. Also this lock protects the globals which track the
     93  * availrmem changes for accurate kernel footprint calculation.
     94  * See below for an explanation of these
     95  * globals.
     96  */
     97 kmutex_t freemem_lock;
     98 pgcnt_t availrmem;
     99 pgcnt_t availrmem_initial;
    100 
    101 /*
    102  * These globals track availrmem changes to get a more accurate
    103  * estimate of tke kernel size. Historically pp_kernel is used for
    104  * kernel size and is based on availrmem. But availrmem is adjusted for
    105  * locked pages in the system not just for kernel locked pages.
    106  * These new counters will track the pages locked through segvn and
    107  * by explicit user locking.
    108  *
    109  * segvn_pages_locked : This keeps track on a global basis how many pages
    110  * are currently locked because of I/O.
    111  *
    112  * pages_locked : How many pages are locked because of user specified
    113  * locking through mlock or plock.
    114  *
    115  * pages_useclaim,pages_claimed : These two variables track the
    116  * claim adjustments because of the protection changes on a segvn segment.
    117  *
    118  * All these globals are protected by the same lock which protects availrmem.
    119  */
    120 pgcnt_t segvn_pages_locked;
    121 pgcnt_t pages_locked;
    122 pgcnt_t pages_useclaim;
    123 pgcnt_t pages_claimed;
    124 
    125 
    126 /*
    127  * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
    128  */
    129 static kmutex_t	new_freemem_lock;
    130 static uint_t	freemem_wait;	/* someone waiting for freemem */
    131 static kcondvar_t freemem_cv;
    132 
    133 /*
    134  * The logical page free list is maintained as two lists, the 'free'
    135  * and the 'cache' lists.
    136  * The free list contains those pages that should be reused first.
    137  *
    138  * The implementation of the lists is machine dependent.
    139  * page_get_freelist(), page_get_cachelist(),
    140  * page_list_sub(), and page_list_add()
    141  * form the interface to the machine dependent implementation.
    142  *
    143  * Pages with p_free set are on the cache list.
    144  * Pages with p_free and p_age set are on the free list,
    145  *
    146  * A page may be locked while on either list.
    147  */
    148 
    149 /*
    150  * free list accounting stuff.
    151  *
    152  *
    153  * Spread out the value for the number of pages on the
    154  * page free and page cache lists.  If there is just one
    155  * value, then it must be under just one lock.
    156  * The lock contention and cache traffic are a real bother.
    157  *
    158  * When we acquire and then drop a single pcf lock
    159  * we can start in the middle of the array of pcf structures.
    160  * If we acquire more than one pcf lock at a time, we need to
    161  * start at the front to avoid deadlocking.
    162  *
    163  * pcf_count holds the number of pages in each pool.
    164  *
    165  * pcf_block is set when page_create_get_something() has asked the
    166  * PSM page freelist and page cachelist routines without specifying
    167  * a color and nothing came back.  This is used to block anything
    168  * else from moving pages from one list to the other while the
    169  * lists are searched again.  If a page is freeed while pcf_block is
    170  * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
    171  * of clearning pcf_block, doing the wakeups, etc.
    172  */
    173 
    174 #if NCPU <= 4
    175 #define	PAD	2
    176 #define	PCF_FANOUT	4
    177 static	uint_t	pcf_mask = PCF_FANOUT - 1;
    178 #else
    179 #define	PAD	10
    180 #ifdef sun4v
    181 #define	PCF_FANOUT	32
    182 #else
    183 #define	PCF_FANOUT	128
    184 #endif
    185 static	uint_t	pcf_mask = PCF_FANOUT - 1;
    186 #endif
    187 
    188 struct pcf {
    189 	kmutex_t	pcf_lock;	/* protects the structure */
    190 	uint_t		pcf_count;	/* page count */
    191 	uint_t		pcf_wait;	/* number of waiters */
    192 	uint_t		pcf_block; 	/* pcgs flag to page_free() */
    193 	uint_t		pcf_reserve; 	/* pages freed after pcf_block set */
    194 	uint_t		pcf_fill[PAD];	/* to line up on the caches */
    195 };
    196 
    197 static struct	pcf	pcf[PCF_FANOUT];
    198 #define	PCF_INDEX()	((CPU->cpu_id) & (pcf_mask))
    199 
    200 kmutex_t	pcgs_lock;		/* serializes page_create_get_ */
    201 kmutex_t	pcgs_cagelock;		/* serializes NOSLEEP cage allocs */
    202 kmutex_t	pcgs_wait_lock;		/* used for delay in pcgs */
    203 static kcondvar_t	pcgs_cv;	/* cv for delay in pcgs */
    204 
    205 #ifdef VM_STATS
    206 
    207 /*
    208  * No locks, but so what, they are only statistics.
    209  */
    210 
    211 static struct page_tcnt {
    212 	int	pc_free_cache;		/* free's into cache list */
    213 	int	pc_free_dontneed;	/* free's with dontneed */
    214 	int	pc_free_pageout;	/* free's from pageout */
    215 	int	pc_free_free;		/* free's into free list */
    216 	int	pc_free_pages;		/* free's into large page free list */
    217 	int	pc_destroy_pages;	/* large page destroy's */
    218 	int	pc_get_cache;		/* get's from cache list */
    219 	int	pc_get_free;		/* get's from free list */
    220 	int	pc_reclaim;		/* reclaim's */
    221 	int	pc_abortfree;		/* abort's of free pages */
    222 	int	pc_find_hit;		/* find's that find page */
    223 	int	pc_find_miss;		/* find's that don't find page */
    224 	int	pc_destroy_free;	/* # of free pages destroyed */
    225 #define	PC_HASH_CNT	(4*PAGE_HASHAVELEN)
    226 	int	pc_find_hashlen[PC_HASH_CNT+1];
    227 	int	pc_addclaim_pages;
    228 	int	pc_subclaim_pages;
    229 	int	pc_free_replacement_page[2];
    230 	int	pc_try_demote_pages[6];
    231 	int	pc_demote_pages[2];
    232 } pagecnt;
    233 
    234 uint_t	hashin_count;
    235 uint_t	hashin_not_held;
    236 uint_t	hashin_already;
    237 
    238 uint_t	hashout_count;
    239 uint_t	hashout_not_held;
    240 
    241 uint_t	page_create_count;
    242 uint_t	page_create_not_enough;
    243 uint_t	page_create_not_enough_again;
    244 uint_t	page_create_zero;
    245 uint_t	page_create_hashout;
    246 uint_t	page_create_page_lock_failed;
    247 uint_t	page_create_trylock_failed;
    248 uint_t	page_create_found_one;
    249 uint_t	page_create_hashin_failed;
    250 uint_t	page_create_dropped_phm;
    251 
    252 uint_t	page_create_new;
    253 uint_t	page_create_exists;
    254 uint_t	page_create_putbacks;
    255 uint_t	page_create_overshoot;
    256 
    257 uint_t	page_reclaim_zero;
    258 uint_t	page_reclaim_zero_locked;
    259 
    260 uint_t	page_rename_exists;
    261 uint_t	page_rename_count;
    262 
    263 uint_t	page_lookup_cnt[20];
    264 uint_t	page_lookup_nowait_cnt[10];
    265 uint_t	page_find_cnt;
    266 uint_t	page_exists_cnt;
    267 uint_t	page_exists_forreal_cnt;
    268 uint_t	page_lookup_dev_cnt;
    269 uint_t	get_cachelist_cnt;
    270 uint_t	page_create_cnt[10];
    271 uint_t	alloc_pages[9];
    272 uint_t	page_exphcontg[19];
    273 uint_t  page_create_large_cnt[10];
    274 
    275 /*
    276  * Collects statistics.
    277  */
    278 #define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
    279 	uint_t	mylen = 0; \
    280 			\
    281 	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
    282 		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
    283 			break; \
    284 	} \
    285 	if ((pp) != NULL) \
    286 		pagecnt.pc_find_hit++; \
    287 	else \
    288 		pagecnt.pc_find_miss++; \
    289 	if (mylen > PC_HASH_CNT) \
    290 		mylen = PC_HASH_CNT; \
    291 	pagecnt.pc_find_hashlen[mylen]++; \
    292 }
    293 
    294 #else	/* VM_STATS */
    295 
    296 /*
    297  * Don't collect statistics
    298  */
    299 #define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
    300 	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
    301 		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
    302 			break; \
    303 	} \
    304 }
    305 
    306 #endif	/* VM_STATS */
    307 
    308 
    309 
    310 #ifdef DEBUG
    311 #define	MEMSEG_SEARCH_STATS
    312 #endif
    313 
    314 #ifdef MEMSEG_SEARCH_STATS
    315 struct memseg_stats {
    316     uint_t nsearch;
    317     uint_t nlastwon;
    318     uint_t nhashwon;
    319     uint_t nnotfound;
    320 } memseg_stats;
    321 
    322 #define	MEMSEG_STAT_INCR(v) \
    323 	atomic_add_32(&memseg_stats.v, 1)
    324 #else
    325 #define	MEMSEG_STAT_INCR(x)
    326 #endif
    327 
    328 struct memseg *memsegs;		/* list of memory segments */
    329 
    330 
    331 static void page_init_mem_config(void);
    332 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
    333 static void page_do_hashout(page_t *);
    334 static void page_capture_init();
    335 int page_capture_take_action(page_t *, uint_t, void *);
    336 
    337 static void page_demote_vp_pages(page_t *);
    338 
    339 /*
    340  * vm subsystem related initialization
    341  */
    342 void
    343 vm_init(void)
    344 {
    345 	boolean_t callb_vm_cpr(void *, int);
    346 
    347 	(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
    348 	page_init_mem_config();
    349 	page_retire_init();
    350 	vm_usage_init();
    351 	page_capture_init();
    352 }
    353 
    354 /*
    355  * This function is called at startup and when memory is added or deleted.
    356  */
    357 void
    358 init_pages_pp_maximum()
    359 {
    360 	static pgcnt_t p_min;
    361 	static pgcnt_t pages_pp_maximum_startup;
    362 	static pgcnt_t avrmem_delta;
    363 	static int init_done;
    364 	static int user_set;	/* true if set in /etc/system */
    365 
    366 	if (init_done == 0) {
    367 
    368 		/* If the user specified a value, save it */
    369 		if (pages_pp_maximum != 0) {
    370 			user_set = 1;
    371 			pages_pp_maximum_startup = pages_pp_maximum;
    372 		}
    373 
    374 		/*
    375 		 * Setting of pages_pp_maximum is based first time
    376 		 * on the value of availrmem just after the start-up
    377 		 * allocations. To preserve this relationship at run
    378 		 * time, use a delta from availrmem_initial.
    379 		 */
    380 		ASSERT(availrmem_initial >= availrmem);
    381 		avrmem_delta = availrmem_initial - availrmem;
    382 
    383 		/* The allowable floor of pages_pp_maximum */
    384 		p_min = tune.t_minarmem + 100;
    385 
    386 		/* Make sure we don't come through here again. */
    387 		init_done = 1;
    388 	}
    389 	/*
    390 	 * Determine pages_pp_maximum, the number of currently available
    391 	 * pages (availrmem) that can't be `locked'. If not set by
    392 	 * the user, we set it to 4% of the currently available memory
    393 	 * plus 4MB.
    394 	 * But we also insist that it be greater than tune.t_minarmem;
    395 	 * otherwise a process could lock down a lot of memory, get swapped
    396 	 * out, and never have enough to get swapped back in.
    397 	 */
    398 	if (user_set)
    399 		pages_pp_maximum = pages_pp_maximum_startup;
    400 	else
    401 		pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
    402 		    + btop(4 * 1024 * 1024);
    403 
    404 	if (pages_pp_maximum <= p_min) {
    405 		pages_pp_maximum = p_min;
    406 	}
    407 }
    408 
    409 void
    410 set_max_page_get(pgcnt_t target_total_pages)
    411 {
    412 	max_page_get = target_total_pages / 2;
    413 }
    414 
    415 static pgcnt_t pending_delete;
    416 
    417 /*ARGSUSED*/
    418 static void
    419 page_mem_config_post_add(
    420 	void *arg,
    421 	pgcnt_t delta_pages)
    422 {
    423 	set_max_page_get(total_pages - pending_delete);
    424 	init_pages_pp_maximum();
    425 }
    426 
    427 /*ARGSUSED*/
    428 static int
    429 page_mem_config_pre_del(
    430 	void *arg,
    431 	pgcnt_t delta_pages)
    432 {
    433 	pgcnt_t nv;
    434 
    435 	nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
    436 	set_max_page_get(total_pages - nv);
    437 	return (0);
    438 }
    439 
    440 /*ARGSUSED*/
    441 static void
    442 page_mem_config_post_del(
    443 	void *arg,
    444 	pgcnt_t delta_pages,
    445 	int cancelled)
    446 {
    447 	pgcnt_t nv;
    448 
    449 	nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
    450 	set_max_page_get(total_pages - nv);
    451 	if (!cancelled)
    452 		init_pages_pp_maximum();
    453 }
    454 
    455 static kphysm_setup_vector_t page_mem_config_vec = {
    456 	KPHYSM_SETUP_VECTOR_VERSION,
    457 	page_mem_config_post_add,
    458 	page_mem_config_pre_del,
    459 	page_mem_config_post_del,
    460 };
    461 
    462 static void
    463 page_init_mem_config(void)
    464 {
    465 	int ret;
    466 
    467 	ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
    468 	ASSERT(ret == 0);
    469 }
    470 
    471 /*
    472  * Evenly spread out the PCF counters for large free pages
    473  */
    474 static void
    475 page_free_large_ctr(pgcnt_t npages)
    476 {
    477 	static struct pcf	*p = pcf;
    478 	pgcnt_t			lump;
    479 
    480 	freemem += npages;
    481 
    482 	lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT;
    483 
    484 	while (npages > 0) {
    485 
    486 		ASSERT(!p->pcf_block);
    487 
    488 		if (lump < npages) {
    489 			p->pcf_count += (uint_t)lump;
    490 			npages -= lump;
    491 		} else {
    492 			p->pcf_count += (uint_t)npages;
    493 			npages = 0;
    494 		}
    495 
    496 		ASSERT(!p->pcf_wait);
    497 
    498 		if (++p > &pcf[PCF_FANOUT - 1])
    499 			p = pcf;
    500 	}
    501 
    502 	ASSERT(npages == 0);
    503 }
    504 
    505 /*
    506  * Add a physical chunk of memory to the system free lists during startup.
    507  * Platform specific startup() allocates the memory for the page structs.
    508  *
    509  * num	- number of page structures
    510  * base - page number (pfn) to be associated with the first page.
    511  *
    512  * Since we are doing this during startup (ie. single threaded), we will
    513  * use shortcut routines to avoid any locking overhead while putting all
    514  * these pages on the freelists.
    515  *
    516  * NOTE: Any changes performed to page_free(), must also be performed to
    517  *	 add_physmem() since this is how we initialize all page_t's at
    518  *	 boot time.
    519  */
    520 void
    521 add_physmem(
    522 	page_t	*pp,
    523 	pgcnt_t	num,
    524 	pfn_t	pnum)
    525 {
    526 	page_t	*root = NULL;
    527 	uint_t	szc = page_num_pagesizes() - 1;
    528 	pgcnt_t	large = page_get_pagecnt(szc);
    529 	pgcnt_t	cnt = 0;
    530 
    531 	TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
    532 	    "add_physmem:pp %p num %lu", pp, num);
    533 
    534 	/*
    535 	 * Arbitrarily limit the max page_get request
    536 	 * to 1/2 of the page structs we have.
    537 	 */
    538 	total_pages += num;
    539 	set_max_page_get(total_pages);
    540 
    541 	PLCNT_MODIFY_MAX(pnum, (long)num);
    542 
    543 	/*
    544 	 * The physical space for the pages array
    545 	 * representing ram pages has already been
    546 	 * allocated.  Here we initialize each lock
    547 	 * in the page structure, and put each on
    548 	 * the free list
    549 	 */
    550 	for (; num; pp++, pnum++, num--) {
    551 
    552 		/*
    553 		 * this needs to fill in the page number
    554 		 * and do any other arch specific initialization
    555 		 */
    556 		add_physmem_cb(pp, pnum);
    557 
    558 		pp->p_lckcnt = 0;
    559 		pp->p_cowcnt = 0;
    560 		pp->p_slckcnt = 0;
    561 
    562 		/*
    563 		 * Initialize the page lock as unlocked, since nobody
    564 		 * can see or access this page yet.
    565 		 */
    566 		pp->p_selock = 0;
    567 
    568 		/*
    569 		 * Initialize IO lock
    570 		 */
    571 		page_iolock_init(pp);
    572 
    573 		/*
    574 		 * initialize other fields in the page_t
    575 		 */
    576 		PP_SETFREE(pp);
    577 		page_clr_all_props(pp);
    578 		PP_SETAGED(pp);
    579 		pp->p_offset = (u_offset_t)-1;
    580 		pp->p_next = pp;
    581 		pp->p_prev = pp;
    582 
    583 		/*
    584 		 * Simple case: System doesn't support large pages.
    585 		 */
    586 		if (szc == 0) {
    587 			pp->p_szc = 0;
    588 			page_free_at_startup(pp);
    589 			continue;
    590 		}
    591 
    592 		/*
    593 		 * Handle unaligned pages, we collect them up onto
    594 		 * the root page until we have a full large page.
    595 		 */
    596 		if (!IS_P2ALIGNED(pnum, large)) {
    597 
    598 			/*
    599 			 * If not in a large page,
    600 			 * just free as small page.
    601 			 */
    602 			if (root == NULL) {
    603 				pp->p_szc = 0;
    604 				page_free_at_startup(pp);
    605 				continue;
    606 			}
    607 
    608 			/*
    609 			 * Link a constituent page into the large page.
    610 			 */
    611 			pp->p_szc = szc;
    612 			page_list_concat(&root, &pp);
    613 
    614 			/*
    615 			 * When large page is fully formed, free it.
    616 			 */
    617 			if (++cnt == large) {
    618 				page_free_large_ctr(cnt);
    619 				page_list_add_pages(root, PG_LIST_ISINIT);
    620 				root = NULL;
    621 				cnt = 0;
    622 			}
    623 			continue;
    624 		}
    625 
    626 		/*
    627 		 * At this point we have a page number which
    628 		 * is aligned. We assert that we aren't already
    629 		 * in a different large page.
    630 		 */
    631 		ASSERT(IS_P2ALIGNED(pnum, large));
    632 		ASSERT(root == NULL && cnt == 0);
    633 
    634 		/*
    635 		 * If insufficient number of pages left to form
    636 		 * a large page, just free the small page.
    637 		 */
    638 		if (num < large) {
    639 			pp->p_szc = 0;
    640 			page_free_at_startup(pp);
    641 			continue;
    642 		}
    643 
    644 		/*
    645 		 * Otherwise start a new large page.
    646 		 */
    647 		pp->p_szc = szc;
    648 		cnt++;
    649 		root = pp;
    650 	}
    651 	ASSERT(root == NULL && cnt == 0);
    652 }
    653 
    654 /*
    655  * Find a page representing the specified [vp, offset].
    656  * If we find the page but it is intransit coming in,
    657  * it will have an "exclusive" lock and we wait for
    658  * the i/o to complete.  A page found on the free list
    659  * is always reclaimed and then locked.  On success, the page
    660  * is locked, its data is valid and it isn't on the free
    661  * list, while a NULL is returned if the page doesn't exist.
    662  */
    663 page_t *
    664 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
    665 {
    666 	return (page_lookup_create(vp, off, se, NULL, NULL, 0));
    667 }
    668 
    669 /*
    670  * Find a page representing the specified [vp, offset].
    671  * We either return the one we found or, if passed in,
    672  * create one with identity of [vp, offset] of the
    673  * pre-allocated page. If we find existing page but it is
    674  * intransit coming in, it will have an "exclusive" lock
    675  * and we wait for the i/o to complete.  A page found on
    676  * the free list is always reclaimed and then locked.
    677  * On success, the page is locked, its data is valid and
    678  * it isn't on the free list, while a NULL is returned
    679  * if the page doesn't exist and newpp is NULL;
    680  */
    681 page_t *
    682 page_lookup_create(
    683 	vnode_t *vp,
    684 	u_offset_t off,
    685 	se_t se,
    686 	page_t *newpp,
    687 	spgcnt_t *nrelocp,
    688 	int flags)
    689 {
    690 	page_t		*pp;
    691 	kmutex_t	*phm;
    692 	ulong_t		index;
    693 	uint_t		hash_locked;
    694 	uint_t		es;
    695 
    696 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
    697 	VM_STAT_ADD(page_lookup_cnt[0]);
    698 	ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
    699 
    700 	/*
    701 	 * Acquire the appropriate page hash lock since
    702 	 * we have to search the hash list.  Pages that
    703 	 * hash to this list can't change identity while
    704 	 * this lock is held.
    705 	 */
    706 	hash_locked = 0;
    707 	index = PAGE_HASH_FUNC(vp, off);
    708 	phm = NULL;
    709 top:
    710 	PAGE_HASH_SEARCH(index, pp, vp, off);
    711 	if (pp != NULL) {
    712 		VM_STAT_ADD(page_lookup_cnt[1]);
    713 		es = (newpp != NULL) ? 1 : 0;
    714 		es |= flags;
    715 		if (!hash_locked) {
    716 			VM_STAT_ADD(page_lookup_cnt[2]);
    717 			if (!page_try_reclaim_lock(pp, se, es)) {
    718 				/*
    719 				 * On a miss, acquire the phm.  Then
    720 				 * next time, page_lock() will be called,
    721 				 * causing a wait if the page is busy.
    722 				 * just looping with page_trylock() would
    723 				 * get pretty boring.
    724 				 */
    725 				VM_STAT_ADD(page_lookup_cnt[3]);
    726 				phm = PAGE_HASH_MUTEX(index);
    727 				mutex_enter(phm);
    728 				hash_locked = 1;
    729 				goto top;
    730 			}
    731 		} else {
    732 			VM_STAT_ADD(page_lookup_cnt[4]);
    733 			if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
    734 				VM_STAT_ADD(page_lookup_cnt[5]);
    735 				goto top;
    736 			}
    737 		}
    738 
    739 		/*
    740 		 * Since `pp' is locked it can not change identity now.
    741 		 * Reconfirm we locked the correct page.
    742 		 *
    743 		 * Both the p_vnode and p_offset *must* be cast volatile
    744 		 * to force a reload of their values: The PAGE_HASH_SEARCH
    745 		 * macro will have stuffed p_vnode and p_offset into
    746 		 * registers before calling page_trylock(); another thread,
    747 		 * actually holding the hash lock, could have changed the
    748 		 * page's identity in memory, but our registers would not
    749 		 * be changed, fooling the reconfirmation.  If the hash
    750 		 * lock was held during the search, the casting would
    751 		 * not be needed.
    752 		 */
    753 		VM_STAT_ADD(page_lookup_cnt[6]);
    754 		if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
    755 		    ((volatile u_offset_t)(pp->p_offset) != off)) {
    756 			VM_STAT_ADD(page_lookup_cnt[7]);
    757 			if (hash_locked) {
    758 				panic("page_lookup_create: lost page %p",
    759 				    (void *)pp);
    760 				/*NOTREACHED*/
    761 			}
    762 			page_unlock(pp);
    763 			phm = PAGE_HASH_MUTEX(index);
    764 			mutex_enter(phm);
    765 			hash_locked = 1;
    766 			goto top;
    767 		}
    768 
    769 		/*
    770 		 * If page_trylock() was called, then pp may still be on
    771 		 * the cachelist (can't be on the free list, it would not
    772 		 * have been found in the search).  If it is on the
    773 		 * cachelist it must be pulled now. To pull the page from
    774 		 * the cachelist, it must be exclusively locked.
    775 		 *
    776 		 * The other big difference between page_trylock() and
    777 		 * page_lock(), is that page_lock() will pull the
    778 		 * page from whatever free list (the cache list in this
    779 		 * case) the page is on.  If page_trylock() was used
    780 		 * above, then we have to do the reclaim ourselves.
    781 		 */
    782 		if ((!hash_locked) && (PP_ISFREE(pp))) {
    783 			ASSERT(PP_ISAGED(pp) == 0);
    784 			VM_STAT_ADD(page_lookup_cnt[8]);
    785 
    786 			/*
    787 			 * page_relcaim will insure that we
    788 			 * have this page exclusively
    789 			 */
    790 
    791 			if (!page_reclaim(pp, NULL)) {
    792 				/*
    793 				 * Page_reclaim dropped whatever lock
    794 				 * we held.
    795 				 */
    796 				VM_STAT_ADD(page_lookup_cnt[9]);
    797 				phm = PAGE_HASH_MUTEX(index);
    798 				mutex_enter(phm);
    799 				hash_locked = 1;
    800 				goto top;
    801 			} else if (se == SE_SHARED && newpp == NULL) {
    802 				VM_STAT_ADD(page_lookup_cnt[10]);
    803 				page_downgrade(pp);
    804 			}
    805 		}
    806 
    807 		if (hash_locked) {
    808 			mutex_exit(phm);
    809 		}
    810 
    811 		if (newpp != NULL && pp->p_szc < newpp->p_szc &&
    812 		    PAGE_EXCL(pp) && nrelocp != NULL) {
    813 			ASSERT(nrelocp != NULL);
    814 			(void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
    815 			    NULL);
    816 			if (*nrelocp > 0) {
    817 				VM_STAT_COND_ADD(*nrelocp == 1,
    818 				    page_lookup_cnt[11]);
    819 				VM_STAT_COND_ADD(*nrelocp > 1,
    820 				    page_lookup_cnt[12]);
    821 				pp = newpp;
    822 				se = SE_EXCL;
    823 			} else {
    824 				if (se == SE_SHARED) {
    825 					page_downgrade(pp);
    826 				}
    827 				VM_STAT_ADD(page_lookup_cnt[13]);
    828 			}
    829 		} else if (newpp != NULL && nrelocp != NULL) {
    830 			if (PAGE_EXCL(pp) && se == SE_SHARED) {
    831 				page_downgrade(pp);
    832 			}
    833 			VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
    834 			    page_lookup_cnt[14]);
    835 			VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
    836 			    page_lookup_cnt[15]);
    837 			VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
    838 			    page_lookup_cnt[16]);
    839 		} else if (newpp != NULL && PAGE_EXCL(pp)) {
    840 			se = SE_EXCL;
    841 		}
    842 	} else if (!hash_locked) {
    843 		VM_STAT_ADD(page_lookup_cnt[17]);
    844 		phm = PAGE_HASH_MUTEX(index);
    845 		mutex_enter(phm);
    846 		hash_locked = 1;
    847 		goto top;
    848 	} else if (newpp != NULL) {
    849 		/*
    850 		 * If we have a preallocated page then
    851 		 * insert it now and basically behave like
    852 		 * page_create.
    853 		 */
    854 		VM_STAT_ADD(page_lookup_cnt[18]);
    855 		/*
    856 		 * Since we hold the page hash mutex and
    857 		 * just searched for this page, page_hashin
    858 		 * had better not fail.  If it does, that
    859 		 * means some thread did not follow the
    860 		 * page hash mutex rules.  Panic now and
    861 		 * get it over with.  As usual, go down
    862 		 * holding all the locks.
    863 		 */
    864 		ASSERT(MUTEX_HELD(phm));
    865 		if (!page_hashin(newpp, vp, off, phm)) {
    866 			ASSERT(MUTEX_HELD(phm));
    867 			panic("page_lookup_create: hashin failed %p %p %llx %p",
    868 			    (void *)newpp, (void *)vp, off, (void *)phm);
    869 			/*NOTREACHED*/
    870 		}
    871 		ASSERT(MUTEX_HELD(phm));
    872 		mutex_exit(phm);
    873 		phm = NULL;
    874 		page_set_props(newpp, P_REF);
    875 		page_io_lock(newpp);
    876 		pp = newpp;
    877 		se = SE_EXCL;
    878 	} else {
    879 		VM_STAT_ADD(page_lookup_cnt[19]);
    880 		mutex_exit(phm);
    881 	}
    882 
    883 	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
    884 
    885 	ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
    886 
    887 	return (pp);
    888 }
    889 
    890 /*
    891  * Search the hash list for the page representing the
    892  * specified [vp, offset] and return it locked.  Skip
    893  * free pages and pages that cannot be locked as requested.
    894  * Used while attempting to kluster pages.
    895  */
    896 page_t *
    897 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
    898 {
    899 	page_t		*pp;
    900 	kmutex_t	*phm;
    901 	ulong_t		index;
    902 	uint_t		locked;
    903 
    904 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
    905 	VM_STAT_ADD(page_lookup_nowait_cnt[0]);
    906 
    907 	index = PAGE_HASH_FUNC(vp, off);
    908 	PAGE_HASH_SEARCH(index, pp, vp, off);
    909 	locked = 0;
    910 	if (pp == NULL) {
    911 top:
    912 		VM_STAT_ADD(page_lookup_nowait_cnt[1]);
    913 		locked = 1;
    914 		phm = PAGE_HASH_MUTEX(index);
    915 		mutex_enter(phm);
    916 		PAGE_HASH_SEARCH(index, pp, vp, off);
    917 	}
    918 
    919 	if (pp == NULL || PP_ISFREE(pp)) {
    920 		VM_STAT_ADD(page_lookup_nowait_cnt[2]);
    921 		pp = NULL;
    922 	} else {
    923 		if (!page_trylock(pp, se)) {
    924 			VM_STAT_ADD(page_lookup_nowait_cnt[3]);
    925 			pp = NULL;
    926 		} else {
    927 			VM_STAT_ADD(page_lookup_nowait_cnt[4]);
    928 			/*
    929 			 * See the comment in page_lookup()
    930 			 */
    931 			if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
    932 			    ((u_offset_t)(pp->p_offset) != off)) {
    933 				VM_STAT_ADD(page_lookup_nowait_cnt[5]);
    934 				if (locked) {
    935 					panic("page_lookup_nowait %p",
    936 					    (void *)pp);
    937 					/*NOTREACHED*/
    938 				}
    939 				page_unlock(pp);
    940 				goto top;
    941 			}
    942 			if (PP_ISFREE(pp)) {
    943 				VM_STAT_ADD(page_lookup_nowait_cnt[6]);
    944 				page_unlock(pp);
    945 				pp = NULL;
    946 			}
    947 		}
    948 	}
    949 	if (locked) {
    950 		VM_STAT_ADD(page_lookup_nowait_cnt[7]);
    951 		mutex_exit(phm);
    952 	}
    953 
    954 	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
    955 
    956 	return (pp);
    957 }
    958 
    959 /*
    960  * Search the hash list for a page with the specified [vp, off]
    961  * that is known to exist and is already locked.  This routine
    962  * is typically used by segment SOFTUNLOCK routines.
    963  */
    964 page_t *
    965 page_find(vnode_t *vp, u_offset_t off)
    966 {
    967 	page_t		*pp;
    968 	kmutex_t	*phm;
    969 	ulong_t		index;
    970 
    971 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
    972 	VM_STAT_ADD(page_find_cnt);
    973 
    974 	index = PAGE_HASH_FUNC(vp, off);
    975 	phm = PAGE_HASH_MUTEX(index);
    976 
    977 	mutex_enter(phm);
    978 	PAGE_HASH_SEARCH(index, pp, vp, off);
    979 	mutex_exit(phm);
    980 
    981 	ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
    982 	return (pp);
    983 }
    984 
    985 /*
    986  * Determine whether a page with the specified [vp, off]
    987  * currently exists in the system.  Obviously this should
    988  * only be considered as a hint since nothing prevents the
    989  * page from disappearing or appearing immediately after
    990  * the return from this routine. Subsequently, we don't
    991  * even bother to lock the list.
    992  */
    993 page_t *
    994 page_exists(vnode_t *vp, u_offset_t off)
    995 {
    996 	page_t	*pp;
    997 	ulong_t		index;
    998 
    999 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
   1000 	VM_STAT_ADD(page_exists_cnt);
   1001 
   1002 	index = PAGE_HASH_FUNC(vp, off);
   1003 	PAGE_HASH_SEARCH(index, pp, vp, off);
   1004 
   1005 	return (pp);
   1006 }
   1007 
   1008 /*
   1009  * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
   1010  * page_size(szc)) range.  If they exist and ppa is not NULL, fill the ppa
   1011  * array with these pages locked SHARED.  If necessary, reclaim pages from
   1012  * the freelist.  Return 1 if contiguous pages exist and 0 otherwise.
   1013  *
   1014  * If we fail to lock pages still return 1 if pages exist and contiguous.
   1015  * But in this case return value is just a hint. ppa array won't be filled.
   1016  * Caller should initialize ppa[0] as NULL to distinguish return value.
   1017  *
   1018  * Returns 0 if pages don't exist or not physically contiguous.
   1019  *
   1020  * This routine doesn't work for anonymous(swapfs) pages.
   1021  */
   1022 int
   1023 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
   1024 {
   1025 	pgcnt_t pages;
   1026 	pfn_t pfn;
   1027 	page_t *rootpp;
   1028 	pgcnt_t i;
   1029 	pgcnt_t j;
   1030 	u_offset_t save_off = off;
   1031 	ulong_t index;
   1032 	kmutex_t *phm;
   1033 	page_t *pp;
   1034 	uint_t pszc;
   1035 	int loopcnt = 0;
   1036 
   1037 	ASSERT(szc != 0);
   1038 	ASSERT(vp != NULL);
   1039 	ASSERT(!IS_SWAPFSVP(vp));
   1040 	ASSERT(!VN_ISKAS(vp));
   1041 
   1042 again:
   1043 	if (++loopcnt > 3) {
   1044 		VM_STAT_ADD(page_exphcontg[0]);
   1045 		return (0);
   1046 	}
   1047 
   1048 	index = PAGE_HASH_FUNC(vp, off);
   1049 	phm = PAGE_HASH_MUTEX(index);
   1050 
   1051 	mutex_enter(phm);
   1052 	PAGE_HASH_SEARCH(index, pp, vp, off);
   1053 	mutex_exit(phm);
   1054 
   1055 	VM_STAT_ADD(page_exphcontg[1]);
   1056 
   1057 	if (pp == NULL) {
   1058 		VM_STAT_ADD(page_exphcontg[2]);
   1059 		return (0);
   1060 	}
   1061 
   1062 	pages = page_get_pagecnt(szc);
   1063 	rootpp = pp;
   1064 	pfn = rootpp->p_pagenum;
   1065 
   1066 	if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
   1067 		VM_STAT_ADD(page_exphcontg[3]);
   1068 		if (!page_trylock(pp, SE_SHARED)) {
   1069 			VM_STAT_ADD(page_exphcontg[4]);
   1070 			return (1);
   1071 		}
   1072 		if (pp->p_szc != pszc || pp->p_vnode != vp ||
   1073 		    pp->p_offset != off) {
   1074 			VM_STAT_ADD(page_exphcontg[5]);
   1075 			page_unlock(pp);
   1076 			off = save_off;
   1077 			goto again;
   1078 		}
   1079 		/*
   1080 		 * szc was non zero and vnode and offset matched after we
   1081 		 * locked the page it means it can't become free on us.
   1082 		 */
   1083 		ASSERT(!PP_ISFREE(pp));
   1084 		if (!IS_P2ALIGNED(pfn, pages)) {
   1085 			page_unlock(pp);
   1086 			return (0);
   1087 		}
   1088 		ppa[0] = pp;
   1089 		pp++;
   1090 		off += PAGESIZE;
   1091 		pfn++;
   1092 		for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
   1093 			if (!page_trylock(pp, SE_SHARED)) {
   1094 				VM_STAT_ADD(page_exphcontg[6]);
   1095 				pp--;
   1096 				while (i-- > 0) {
   1097 					page_unlock(pp);
   1098 					pp--;
   1099 				}
   1100 				ppa[0] = NULL;
   1101 				return (1);
   1102 			}
   1103 			if (pp->p_szc != pszc) {
   1104 				VM_STAT_ADD(page_exphcontg[7]);
   1105 				page_unlock(pp);
   1106 				pp--;
   1107 				while (i-- > 0) {
   1108 					page_unlock(pp);
   1109 					pp--;
   1110 				}
   1111 				ppa[0] = NULL;
   1112 				off = save_off;
   1113 				goto again;
   1114 			}
   1115 			/*
   1116 			 * szc the same as for previous already locked pages
   1117 			 * with right identity. Since this page had correct
   1118 			 * szc after we locked it can't get freed or destroyed
   1119 			 * and therefore must have the expected identity.
   1120 			 */
   1121 			ASSERT(!PP_ISFREE(pp));
   1122 			if (pp->p_vnode != vp ||
   1123 			    pp->p_offset != off) {
   1124 				panic("page_exists_physcontig: "
   1125 				    "large page identity doesn't match");
   1126 			}
   1127 			ppa[i] = pp;
   1128 			ASSERT(pp->p_pagenum == pfn);
   1129 		}
   1130 		VM_STAT_ADD(page_exphcontg[8]);
   1131 		ppa[pages] = NULL;
   1132 		return (1);
   1133 	} else if (pszc >= szc) {
   1134 		VM_STAT_ADD(page_exphcontg[9]);
   1135 		if (!IS_P2ALIGNED(pfn, pages)) {
   1136 			return (0);
   1137 		}
   1138 		return (1);
   1139 	}
   1140 
   1141 	if (!IS_P2ALIGNED(pfn, pages)) {
   1142 		VM_STAT_ADD(page_exphcontg[10]);
   1143 		return (0);
   1144 	}
   1145 
   1146 	if (page_numtomemseg_nolock(pfn) !=
   1147 	    page_numtomemseg_nolock(pfn + pages - 1)) {
   1148 		VM_STAT_ADD(page_exphcontg[11]);
   1149 		return (0);
   1150 	}
   1151 
   1152 	/*
   1153 	 * We loop up 4 times across pages to promote page size.
   1154 	 * We're extra cautious to promote page size atomically with respect
   1155 	 * to everybody else.  But we can probably optimize into 1 loop if
   1156 	 * this becomes an issue.
   1157 	 */
   1158 
   1159 	for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
   1160 		ASSERT(pp->p_pagenum == pfn);
   1161 		if (!page_trylock(pp, SE_EXCL)) {
   1162 			VM_STAT_ADD(page_exphcontg[12]);
   1163 			break;
   1164 		}
   1165 		if (pp->p_vnode != vp ||
   1166 		    pp->p_offset != off) {
   1167 			VM_STAT_ADD(page_exphcontg[13]);
   1168 			page_unlock(pp);
   1169 			break;
   1170 		}
   1171 		if (pp->p_szc >= szc) {
   1172 			ASSERT(i == 0);
   1173 			page_unlock(pp);
   1174 			off = save_off;
   1175 			goto again;
   1176 		}
   1177 	}
   1178 
   1179 	if (i != pages) {
   1180 		VM_STAT_ADD(page_exphcontg[14]);
   1181 		--pp;
   1182 		while (i-- > 0) {
   1183 			page_unlock(pp);
   1184 			--pp;
   1185 		}
   1186 		return (0);
   1187 	}
   1188 
   1189 	pp = rootpp;
   1190 	for (i = 0; i < pages; i++, pp