Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #include <sys/types.h>
     40 #include <sys/t_lock.h>
     41 #include <sys/param.h>
     42 #include <sys/buf.h>
     43 #include <sys/uio.h>
     44 #include <sys/proc.h>
     45 #include <sys/systm.h>
     46 #include <sys/mman.h>
     47 #include <sys/cred.h>
     48 #include <sys/vnode.h>
     49 #include <sys/vm.h>
     50 #include <sys/vmparam.h>
     51 #include <sys/vtrace.h>
     52 #include <sys/cmn_err.h>
     53 #include <sys/cpuvar.h>
     54 #include <sys/user.h>
     55 #include <sys/kmem.h>
     56 #include <sys/debug.h>
     57 #include <sys/callb.h>
     58 #include <sys/tnf_probe.h>
     59 #include <sys/mem_cage.h>
     60 #include <sys/time.h>
     61 
     62 #include <vm/hat.h>
     63 #include <vm/as.h>
     64 #include <vm/seg.h>
     65 #include <vm/page.h>
     66 #include <vm/pvn.h>
     67 #include <vm/seg_kmem.h>
     68 
     69 static int checkpage(page_t *, int);
     70 
     71 /*
     72  * The following parameters control operation of the page replacement
     73  * algorithm.  They are initialized to 0, and then computed at boot time
     74  * based on the size of the system.  If they are patched non-zero in
     75  * a loaded vmunix they are left alone and may thus be changed per system
     76  * using adb on the loaded system.
     77  */
/*
 * Bounds of the clock scan rate, in pages per second.  schedpaging()
 * interpolates between them based on free memory; setupclock() fills in
 * defaults when they are left at zero.
 */
pgcnt_t		slowscan = 0;
pgcnt_t		fastscan = 0;

/* Distance, in pages, between the front and back clock hands. */
static pgcnt_t	handspreadpages = 0;
/* setupclock() never lets fastscan exceed looppages / loopfraction. */
static int	loopfraction = 2;
/* Size of the clock loop; setupclock() sets it to LOOPPAGES. */
static pgcnt_t	looppages;
/*
 * %CPU duty-cycle limits for the scanner; pageout_scanner() converts them
 * into min_pageout_ticks and max_pageout_ticks.
 */
static int	min_percent_cpu = 4;
static int	max_percent_cpu = 80;
/* Caps applied by setupclock() when it computes fastscan and slowscan. */
static pgcnt_t	maxfastscan = 0;
static pgcnt_t	maxslowscan = 100;

/*
 * Paging thresholds and limits.  A value of zero means "not set";
 * setupclock() then computes a default (see the comments there for the
 * meaning of each one).  needfree is not set in setupclock();
 * schedpaging() adds it to lotsfree when deciding whether memory is short.
 */
pgcnt_t	maxpgio = 0;
pgcnt_t	minfree = 0;
pgcnt_t	desfree = 0;
pgcnt_t	lotsfree = 0;
pgcnt_t	needfree = 0;
pgcnt_t	throttlefree = 0;
pgcnt_t	pageout_reserve = 0;

/*
 * deficit: subtracted from freemem by schedpaging() when it estimates
 *     available memory.
 * nscan:   pages examined by the scanner in its current pass; reset to
 *     zero by schedpaging().
 * desscan: pages the scanner should examine in its next pass; computed
 *     by schedpaging().
 */
pgcnt_t	deficit;
pgcnt_t	nscan;
pgcnt_t	desscan;
    100 
    101 /*
    102  * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks
    103  * are the number of ticks in each wakeup cycle that gives the
    104  * equivalent of some underlying %CPU duty cycle.
    105  * When RATETOSCHEDPAGING is 4,  and hz is 100, pageout_scanner is
    106  * awakened every 25 clock ticks.  So, converting from %CPU to ticks
    107  * per wakeup cycle would be x% of 25, that is (x * 100) / 25.
    108  * So, for example, 4% == 1 tick and 80% == 20 ticks.
    109  *
    110  * min_pageout_ticks:
    111  *     ticks/wakeup equivalent of min_percent_cpu.
    112  *
    113  * max_pageout_ticks:
    114  *     ticks/wakeup equivalent of max_percent_cpu.
    115  *
    116  * pageout_ticks:
    117  *     Number of clock ticks budgeted for each wakeup cycle.
    118  *     Computed each time around by schedpaging().
    119  *     Varies between min_pageout_ticks .. max_pageout_ticks,
    120  *     depending on memory pressure.
    121  *
    122  * pageout_lbolt:
    123  *     Timestamp of the last time pageout_scanner woke up and started
    124  *     (or resumed) scanning for not recently referenced pages.
    125  */
    126 
static clock_t	min_pageout_ticks;
static clock_t	max_pageout_ticks;
static clock_t	pageout_ticks;
static clock_t	pageout_lbolt;

/*
 * Set by setupclock() when it is called with recalc != 0.  pageout_scanner()
 * clears it on its next wakeup and repositions both clock hands.
 */
static uint_t	reset_hands;

/*
 * pageout_scanner() compares its elapsed ticks against pageout_ticks only
 * when the low bits of its per-wakeup page counter all match this mask,
 * i.e. once every 1024 pages visited rather than on every page.
 */
#define	PAGES_POLL_MASK	1023
    135 
    136 /*
    137  * pageout_sample_lim:
    138  *     The limit on the number of samples needed to establish a value
    139  *     for new pageout parameters, fastscan, slowscan, and handspreadpages.
    140  *
    141  * pageout_sample_cnt:
    142  *     Current sample number.  Once the sample gets large enough,
    143  *     set new values for handspreadpages, fastscan and slowscan.
    144  *
    145  * pageout_sample_pages:
    146  *     The accumulated number of pages scanned during sampling.
    147  *
 * pageout_sample_etime:
 *     The accumulated elapsed time for the sample, kept as an hrtime_t.
    150  *
    151  * pageout_rate:
    152  *     Rate in pages/nanosecond, computed at the end of sampling.
    153  *
    154  * pageout_new_spread:
    155  *     The new value to use for fastscan and handspreadpages.
    156  *     Calculated after enough samples have been taken.
    157  */
    158 
/* Scan rates are stored in the same representation as hrtime_t. */
typedef hrtime_t hrrate_t;

static uint64_t	pageout_sample_lim = 4;
static uint64_t	pageout_sample_cnt = 0;
static pgcnt_t	pageout_sample_pages = 0;
static hrrate_t	pageout_rate = 0;
static pgcnt_t	pageout_new_spread = 0;

/*
 * pageout_cycle_ticks: lbolt ticks elapsed since pageout_lbolt, refreshed by
 *     pageout_scanner() each time it polls its CPU budget.
 * sample_start: gethrtime() value taken when a scan pass begins.
 * sample_end, pageout_sample_etime: presumably the matching end timestamp
 *     and the accumulated elapsed time of the sampling passes -- their
 *     updates are not in this part of the file; confirm against
 *     pageout_scanner().
 */
static clock_t	pageout_cycle_ticks;
static hrtime_t	sample_start, sample_end;
static hrtime_t	pageout_sample_etime = 0;
    170 
    171 /*
    172  * Record number of times a pageout_scanner wakeup cycle finished because it
    173  * timed out (exceeded its CPU budget), rather than because it visited
    174  * its budgeted number of pages.
    175  */
    176 uint64_t pageout_timeouts = 0;
    177 
    178 #ifdef VM_STATS
    179 static struct pageoutvmstats_str {
    180 	ulong_t	checkpage[3];
    181 } pageoutvmstats;
    182 #endif /* VM_STATS */
    183 
    184 /*
    185  * Threads waiting for free memory use this condition variable and lock until
    186  * memory becomes available.
    187  */
    188 kmutex_t	memavail_lock;
    189 kcondvar_t	memavail_cv;
    190 
    191 /*
    192  * The size of the clock loop.
    193  */
    194 #define	LOOPPAGES	total_pages
    195 
    196 /*
    197  * Set up the paging constants for the clock algorithm.
    198  * Called after the system is initialized and the amount of memory
    199  * and number of paging devices is known.
    200  *
    201  * lotsfree is 1/64 of memory, but at least 512K.
    202  * desfree is 1/2 of lotsfree.
    203  * minfree is 1/2 of desfree.
    204  *
    205  * Note: to revert to the paging algorithm of Solaris 2.4/2.5, set:
    206  *
    207  *	lotsfree = btop(512K)
    208  *	desfree = btop(200K)
    209  *	minfree = btop(100K)
    210  *	throttlefree = INT_MIN
    211  *	max_percent_cpu = 4
    212  */
    213 void
    214 setupclock(int recalc)
    215 {
    216 
    217 	static spgcnt_t init_lfree, init_dfree, init_mfree;
    218 	static spgcnt_t init_tfree, init_preserve, init_mpgio;
    219 	static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages;
    220 
    221 	looppages = LOOPPAGES;
    222 
    223 	/*
    224 	 * setupclock can now be called to recalculate the paging
    225 	 * parameters in the case of dynamic addition of memory.
    226 	 * So to make sure we make the proper calculations, if such a
    227 	 * situation should arise, we save away the initial values
    228 	 * of each parameter so we can recall them when needed. This
    229 	 * way we don't lose the settings an admin might have made
    230 	 * through the /etc/system file.
    231 	 */
    232 
    233 	if (!recalc) {
    234 		init_lfree = lotsfree;
    235 		init_dfree = desfree;
    236 		init_mfree = minfree;
    237 		init_tfree = throttlefree;
    238 		init_preserve = pageout_reserve;
    239 		init_mpgio = maxpgio;
    240 		init_mfscan = maxfastscan;
    241 		init_fscan = fastscan;
    242 		init_sscan = slowscan;
    243 		init_hspages = handspreadpages;
    244 	}
    245 
    246 	/*
    247 	 * Set up thresholds for paging:
    248 	 */
    249 
    250 	/*
    251 	 * Lotsfree is threshold where paging daemon turns on.
    252 	 */
    253 	if (init_lfree == 0 || init_lfree >= looppages)
    254 		lotsfree = MAX(looppages / 64, btop(512 * 1024));
    255 	else
    256 		lotsfree = init_lfree;
    257 
    258 	/*
    259 	 * Desfree is amount of memory desired free.
    260 	 * If less than this for extended period, start swapping.
    261 	 */
    262 	if (init_dfree == 0 || init_dfree >= lotsfree)
    263 		desfree = lotsfree / 2;
    264 	else
    265 		desfree = init_dfree;
    266 
    267 	/*
    268 	 * Minfree is minimal amount of free memory which is tolerable.
    269 	 */
    270 	if (init_mfree == 0 || init_mfree >= desfree)
    271 		minfree = desfree / 2;
    272 	else
    273 		minfree = init_mfree;
    274 
    275 	/*
    276 	 * Throttlefree is the point at which we start throttling
    277 	 * PG_WAIT requests until enough memory becomes available.
    278 	 */
    279 	if (init_tfree == 0 || init_tfree >= desfree)
    280 		throttlefree = minfree;
    281 	else
    282 		throttlefree = init_tfree;
    283 
    284 	/*
    285 	 * Pageout_reserve is the number of pages that we keep in
    286 	 * stock for pageout's own use.  Having a few such pages
    287 	 * provides insurance against system deadlock due to
    288 	 * pageout needing pages.  When freemem < pageout_reserve,
    289 	 * non-blocking allocations are denied to any threads
    290 	 * other than pageout and sched.  (At some point we might
    291 	 * want to consider a per-thread flag like T_PUSHING_PAGES
    292 	 * to indicate that a thread is part of the page-pushing
    293 	 * dance (e.g. an interrupt thread) and thus is entitled
    294 	 * to the same special dispensation we accord pageout.)
    295 	 */
    296 	if (init_preserve == 0 || init_preserve >= throttlefree)
    297 		pageout_reserve = throttlefree / 2;
    298 	else
    299 		pageout_reserve = init_preserve;
    300 
    301 	/*
    302 	 * Maxpgio thresholds how much paging is acceptable.
    303 	 * This figures that 2/3 busy on an arm is all that is
    304 	 * tolerable for paging.  We assume one operation per disk rev.
    305 	 *
    306 	 * XXX - Does not account for multiple swap devices.
    307 	 */
    308 	if (init_mpgio == 0)
    309 		maxpgio = (DISKRPM * 2) / 3;
    310 	else
    311 		maxpgio = init_mpgio;
    312 
    313 	/*
    314 	 * The clock scan rate varies between fastscan and slowscan
    315 	 * based on the amount of free memory available.  Fastscan
    316 	 * rate should be set based on the number pages that can be
    317 	 * scanned per sec using ~10% of processor time.  Since this
    318 	 * value depends on the processor, MMU, Mhz etc., it is
    319 	 * difficult to determine it in a generic manner for all
    320 	 * architectures.
    321 	 *
    322 	 * Instead of trying to determine the number of pages scanned
    323 	 * per sec for every processor, fastscan is set to be the smaller
    324 	 * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling
    325 	 * time is limited to ~4% of processor time.
    326 	 *
    327 	 * Setting fastscan to be 1/2 of memory allows pageout to scan
    328 	 * all of memory in ~2 secs.  This implies that user pages not
    329 	 * accessed within 1 sec (assuming, handspreadpages == fastscan)
    330 	 * can be reclaimed when free memory is very low.  Stealing pages
    331 	 * not accessed within 1 sec seems reasonable and ensures that
    332 	 * active user processes don't thrash.
    333 	 *
    334 	 * Smaller values of fastscan result in scanning fewer pages
    335 	 * every second and consequently pageout may not be able to free
    336 	 * sufficient memory to maintain the minimum threshold.  Larger
    337 	 * values of fastscan result in scanning a lot more pages which
    338 	 * could lead to thrashing and higher CPU usage.
    339 	 *
    340 	 * Fastscan needs to be limited to a maximum value and should not
    341 	 * scale with memory to prevent pageout from consuming too much
    342 	 * time for scanning on slow CPU's and avoid thrashing, as a
    343 	 * result of scanning too many pages, on faster CPU's.
    344 	 * The value of 64 Meg was chosen for MAXHANDSPREADPAGES
    345 	 * (the upper bound for fastscan) based on the average number
    346 	 * of pages that can potentially be scanned in ~1 sec (using ~4%
    347 	 * of the CPU) on some of the following machines that currently
    348 	 * run Solaris 2.x:
    349 	 *
    350 	 *			average memory scanned in ~1 sec
    351 	 *
    352 	 *	25 Mhz SS1+:		23 Meg
    353 	 *	LX:			37 Meg
    354 	 *	50 Mhz SC2000:		68 Meg
    355 	 *
    356 	 *	40 Mhz 486:		26 Meg
    357 	 *	66 Mhz 486:		42 Meg
    358 	 *
    359 	 * When free memory falls just below lotsfree, the scan rate
    360 	 * goes from 0 to slowscan (i.e., pageout starts running).  This
    361 	 * transition needs to be smooth and is achieved by ensuring that
    362 	 * pageout scans a small number of pages to satisfy the transient
    363 	 * memory demand.  This is set to not exceed 100 pages/sec (25 per
    364 	 * wakeup) since scanning that many pages has no noticible impact
    365 	 * on system performance.
    366 	 *
    367 	 * In addition to setting fastscan and slowscan, pageout is
    368 	 * limited to using ~4% of the CPU.  This results in increasing
    369 	 * the time taken to scan all of memory, which in turn means that
    370 	 * user processes have a better opportunity of preventing their
    371 	 * pages from being stolen.  This has a positive effect on
    372 	 * interactive and overall system performance when memory demand
    373 	 * is high.
    374 	 *
    375 	 * Thus, the rate at which pages are scanned for replacement will
    376 	 * vary linearly between slowscan and the number of pages that
    377 	 * can be scanned using ~4% of processor time instead of varying
    378 	 * linearly between slowscan and fastscan.
    379 	 *
    380 	 * Also, the processor time used by pageout will vary from ~1%
    381 	 * at slowscan to ~4% at fastscan instead of varying between
    382 	 * ~1% at slowscan and ~10% at fastscan.
    383 	 *
    384 	 * The values chosen for the various VM parameters (fastscan,
    385 	 * handspreadpages, etc) are not universally true for all machines,
    386 	 * but appear to be a good rule of thumb for the machines we've
    387 	 * tested.  They have the following ranges:
    388 	 *
    389 	 *	cpu speed:	20 to 70 Mhz
    390 	 *	page size:	4K to 8K
    391 	 *	memory size:	16M to 5G
    392 	 *	page scan rate:	4000 - 17400 4K pages per sec
    393 	 *
    394 	 * The values need to be re-examined for machines which don't
    395 	 * fall into the various ranges (e.g., slower or faster CPUs,
    396 	 * smaller or larger pagesizes etc) shown above.
    397 	 *
    398 	 * On an MP machine, pageout is often unable to maintain the
    399 	 * minimum paging thresholds under heavy load.  This is due to
    400 	 * the fact that user processes running on other CPU's can be
    401 	 * dirtying memory at a much faster pace than pageout can find
    402 	 * pages to free.  The memory demands could be met by enabling
    403 	 * more than one CPU to run the clock algorithm in such a manner
    404 	 * that the various clock hands don't overlap.  This also makes
    405 	 * it more difficult to determine the values for fastscan, slowscan
    406 	 * and handspreadpages.
    407 	 *
    408 	 * The swapper is currently used to free up memory when pageout
    409 	 * is unable to meet memory demands by swapping out processes.
    410 	 * In addition to freeing up memory, swapping also reduces the
    411 	 * demand for memory by preventing user processes from running
    412 	 * and thereby consuming memory.
    413 	 */
    414 	if (init_mfscan == 0) {
    415 		if (pageout_new_spread != 0)
    416 			maxfastscan = pageout_new_spread;
    417 		else
    418 			maxfastscan = MAXHANDSPREADPAGES;
    419 	} else {
    420 		maxfastscan = init_mfscan;
    421 	}
    422 	if (init_fscan == 0)
    423 		fastscan = MIN(looppages / loopfraction, maxfastscan);
    424 	else
    425 		fastscan = init_fscan;
    426 	if (fastscan > looppages / loopfraction)
    427 		fastscan = looppages / loopfraction;
    428 
    429 	/*
    430 	 * Set slow scan time to 1/10 the fast scan time, but
    431 	 * not to exceed maxslowscan.
    432 	 */
    433 	if (init_sscan == 0)
    434 		slowscan = MIN(fastscan / 10, maxslowscan);
    435 	else
    436 		slowscan = init_sscan;
    437 	if (slowscan > fastscan / 2)
    438 		slowscan = fastscan / 2;
    439 
    440 	/*
    441 	 * Handspreadpages is distance (in pages) between front and back
    442 	 * pageout daemon hands.  The amount of time to reclaim a page
    443 	 * once pageout examines it increases with this distance and
    444 	 * decreases as the scan rate rises. It must be < the amount
    445 	 * of pageable memory.
    446 	 *
    447 	 * Since pageout is limited to ~4% of the CPU, setting handspreadpages
    448 	 * to be "fastscan" results in the front hand being a few secs
    449 	 * (varies based on the processor speed) ahead of the back hand
    450 	 * at fastscan rates.  This distance can be further reduced, if
    451 	 * necessary, by increasing the processor time used by pageout
    452 	 * to be more than ~4% and preferrably not more than ~10%.
    453 	 *
    454 	 * As a result, user processes have a much better chance of
    455 	 * referencing their pages before the back hand examines them.
    456 	 * This also significantly lowers the number of reclaims from
    457 	 * the freelist since pageout does not end up freeing pages which
    458 	 * may be referenced a sec later.
    459 	 */
    460 	if (init_hspages == 0)
    461 		handspreadpages = fastscan;
    462 	else
    463 		handspreadpages = init_hspages;
    464 
    465 	/*
    466 	 * Make sure that back hand follows front hand by at least
    467 	 * 1/RATETOSCHEDPAGING seconds.  Without this test, it is possible
    468 	 * for the back hand to look at a page during the same wakeup of
    469 	 * the pageout daemon in which the front hand cleared its ref bit.
    470 	 */
    471 	if (handspreadpages >= looppages)
    472 		handspreadpages = looppages - 1;
    473 
    474 	/*
    475 	 * If we have been called to recalculate the parameters,
    476 	 * set a flag to re-evaluate the clock hand pointers.
    477 	 */
    478 	if (recalc)
    479 		reset_hands = 1;
    480 }
    481 
    482 /*
    483  * Pageout scheduling.
    484  *
    485  * Schedpaging controls the rate at which the page out daemon runs by
    486  * setting the global variables nscan and desscan RATETOSCHEDPAGING
    487  * times a second.  Nscan records the number of pages pageout has examined
    488  * in its current pass; schedpaging resets this value to zero each time
    489  * it runs.  Desscan records the number of pages pageout should examine
    490  * in its next pass; schedpaging sets this value based on the amount of
    491  * currently available memory.
    492  */
    493 
/* Number of times per second that schedpaging() reschedules itself. */
#define	RATETOSCHEDPAGING	4		/* hz that is */

/*
 * Held by pageout_scanner() except while it waits on proc_pageout->p_cv;
 * schedpaging() only updates the scan budget if mutex_tryenter() succeeds.
 */
static kmutex_t	pageout_mutex;	/* held while pageout or schedpaging running */

/*
 * Pool of available async pageout putpage requests.
 * push_req is the base of the array allocated by pageout(); every element
 * is on either req_freelist or push_list.
 */
static struct async_reqs *push_req;
static struct async_reqs *req_freelist;	/* available req structs */
static struct async_reqs *push_list;	/* pending reqs */
static kmutex_t push_lock;		/* protects req pool */
/*
 * pageout() waits here while push_list is empty or it has exceeded its
 * push limit for the current interval.
 */
static kcondvar_t push_cv;

static int async_list_size = 256;	/* number of async request structs */

static void pageout_scanner(void);

/*
 * If a page is being shared more than "po_share" times
 * then leave it alone- don't page it out.
 * schedpaging() halves po_share, down to MIN_PO_SHARE, each time it finds
 * enough free memory.  MAX_PO_SHARE is presumably the limit it may grow to;
 * the code that raises it is not in this part of the file.
 */
#define	MIN_PO_SHARE	(8)
#define	MAX_PO_SHARE	((MIN_PO_SHARE) << 24)
ulong_t	po_share = MIN_PO_SHARE;
    518 
    519 /*
    520  * Schedule rate for paging.
    521  * Rate is linear interpolation between
    522  * slowscan with lotsfree and fastscan when out of memory.
    523  */
    524 static void
    525 schedpaging(void *arg)
    526 {
    527 	spgcnt_t vavail;
    528 
    529 	if (freemem < lotsfree + needfree + kmem_reapahead)
    530 		kmem_reap();
    531 
    532 	if (freemem < lotsfree + needfree)
    533 		seg_preap();
    534 
    535 	if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree))
    536 		kcage_cageout_wakeup();
    537 
    538 	if (mutex_tryenter(&pageout_mutex)) {
    539 		/* pageout() not running */
    540 		nscan = 0;
    541 		vavail = freemem - deficit;
    542 		if (pageout_new_spread != 0)
    543 			vavail -= needfree;
    544 		if (vavail < 0)
    545 			vavail = 0;
    546 		if (vavail > lotsfree)
    547 			vavail = lotsfree;
    548 
    549 		/*
    550 		 * Fix for 1161438 (CRS SPR# 73922).  All variables
    551 		 * in the original calculation for desscan were 32 bit signed
    552 		 * ints.  As freemem approaches 0x0 on a system with 1 Gig or
    553 		 * more of memory, the calculation can overflow.  When this
    554 		 * happens, desscan becomes negative and pageout_scanner()
    555 		 * stops paging out.
    556 		 */
    557 		if ((needfree) && (pageout_new_spread == 0)) {
    558 			/*
    559 			 * If we've not yet collected enough samples to
    560 			 * calculate a spread, use the old logic of kicking
    561 			 * into high gear anytime needfree is non-zero.
    562 			 */
    563 			desscan = fastscan / RATETOSCHEDPAGING;
    564 		} else {
    565 			/*
    566 			 * Once we've calculated a spread based on system
    567 			 * memory and usage, just treat needfree as another
    568 			 * form of deficit.
    569 			 */
    570 			spgcnt_t faststmp, slowstmp, result;
    571 
    572 			slowstmp = slowscan * vavail;
    573 			faststmp = fastscan * (lotsfree - vavail);
    574 			result = (slowstmp + faststmp) /
    575 			    nz(lotsfree) / RATETOSCHEDPAGING;
    576 			desscan = (pgcnt_t)result;
    577 		}
    578 
    579 		pageout_ticks = min_pageout_ticks + (lotsfree - vavail) *
    580 		    (max_pageout_ticks - min_pageout_ticks) / nz(lotsfree);
    581 
    582 		if (freemem < lotsfree + needfree ||
    583 		    pageout_sample_cnt < pageout_sample_lim) {
    584 			TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
    585 			    "pageout_cv_signal:freemem %ld", freemem);
    586 			cv_signal(&proc_pageout->p_cv);
    587 		} else {
    588 			/*
    589 			 * There are enough free pages, no need to
    590 			 * kick the scanner thread.  And next time
    591 			 * around, keep more of the `highly shared'
    592 			 * pages.
    593 			 */
    594 			cv_signal_pageout();
    595 			if (po_share > MIN_PO_SHARE) {
    596 				po_share >>= 1;
    597 			}
    598 		}
    599 		mutex_exit(&pageout_mutex);
    600 	}
    601 
    602 	/*
    603 	 * Signal threads waiting for available memory.
    604 	 * NOTE: usually we need to grab memavail_lock before cv_broadcast, but
    605 	 * in this case it is not needed - the waiters will be waken up during
    606 	 * the next invocation of this function.
    607 	 */
    608 	if (kmem_avail() > 0)
    609 		cv_broadcast(&memavail_cv);
    610 
    611 	(void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING);
    612 }
    613 
/*
 * Count of VOP_PUTPAGE calls that returned 0; pageout() zeroes it each
 * time it returns from waiting on push_cv.
 */
pgcnt_t		pushes;
ulong_t		push_list_size;		/* # of requests on pageout queue */

/* Second argument to checkpage(): which clock hand is visiting the page. */
#define	FRONT	1
#define	BACK	2

/* pageout_scanner() goes straight back to sleep while this is zero. */
int dopageout = 1;	/* must be non-zero to turn page stealing on */
    621 
    622 /*
    623  * The page out daemon, which runs as process 2.
    624  *
    625  * As long as there are at least lotsfree pages,
    626  * this process is not run.  When the number of free
    627  * pages stays in the range desfree to lotsfree,
    628  * this daemon runs through the pages in the loop
    629  * at a rate determined in schedpaging().  Pageout manages
    630  * two hands on the clock.  The front hand moves through
    631  * memory, clearing the reference bit,
    632  * and stealing pages from procs that are over maxrss.
    633  * The back hand travels a distance behind the front hand,
    634  * freeing the pages that have not been referenced in the time
    635  * since the front hand passed.  If modified, they are pushed to
    636  * swap before being freed.
    637  *
    638  * There are 2 threads that act on behalf of the pageout process.
    639  * One thread scans pages (pageout_scanner) and frees them up if
    640  * they don't require any VOP_PUTPAGE operation. If a page must be
    641  * written back to its backing store, the request is put on a list
    642  * and the other (pageout) thread is signaled. The pageout thread
    643  * grabs VOP_PUTPAGE requests from the list, and processes them.
    644  * Some filesystems may require resources for the VOP_PUTPAGE
    645  * operations (like memory) and hence can block the pageout
    646  * thread, but the scanner thread can still operate. There is still
    647  * no guarantee that memory deadlocks cannot occur.
    648  *
    649  * For now, this thing is in very rough form.
    650  */
    651 void
    652 pageout()
    653 {
    654 	struct async_reqs *arg;
    655 	pri_t pageout_pri;
    656 	int i;
    657 	pgcnt_t max_pushes;
    658 	callb_cpr_t cprinfo;
    659 
    660 	proc_pageout = ttoproc(curthread);
    661 	proc_pageout->p_cstime = 0;
    662 	proc_pageout->p_stime =  0;
    663 	proc_pageout->p_cutime =  0;
    664 	proc_pageout->p_utime = 0;
    665 	bcopy("pageout", PTOU(curproc)->u_psargs, 8);
    666 	bcopy("pageout", PTOU(curproc)->u_comm, 7);
    667 
    668 	/*
    669 	 * Create pageout scanner thread
    670 	 */
    671 	mutex_init(&pageout_mutex, NULL, MUTEX_DEFAULT, NULL);
    672 	mutex_init(&push_lock, NULL, MUTEX_DEFAULT, NULL);
    673 
    674 	/*
    675 	 * Allocate and initialize the async request structures
    676 	 * for pageout.
    677 	 */
    678 	push_req = (struct async_reqs *)
    679 	    kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP);
    680 
    681 	req_freelist = push_req;
    682 	for (i = 0; i < async_list_size - 1; i++)
    683 		push_req[i].a_next = &push_req[i + 1];
    684 
    685 	pageout_pri = curthread->t_pri;
    686 
    687 	/* Create the pageout scanner thread. */
    688 	(void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN,
    689 	    pageout_pri - 1);
    690 
    691 	/*
    692 	 * kick off pageout scheduler.
    693 	 */
    694 	schedpaging(NULL);
    695 
    696 	/*
    697 	 * Create kernel cage thread.
    698 	 * The kernel cage thread is started under the pageout process
    699 	 * to take advantage of the less restricted page allocation
    700 	 * in page_create_throttle().
    701 	 */
    702 	kcage_cageout_init();
    703 
    704 	/*
    705 	 * Limit pushes to avoid saturating pageout devices.
    706 	 */
    707 	max_pushes = maxpgio / RATETOSCHEDPAGING;
    708 	CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout");
    709 
    710 	for (;;) {
    711 		mutex_enter(&push_lock);
    712 
    713 		while ((arg = push_list) == NULL || pushes > max_pushes) {
    714 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
    715 			cv_wait(&push_cv, &push_lock);
    716 			pushes = 0;
    717 			CALLB_CPR_SAFE_END(&cprinfo, &push_lock);
    718 		}
    719 		push_list = arg->a_next;
    720 		arg->a_next = NULL;
    721 		mutex_exit(&push_lock);
    722 
    723 		if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off,
    724 		    arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) {
    725 			pushes++;
    726 		}
    727 
    728 		/* vp held by checkpage() */
    729 		VN_RELE(arg->a_vp);
    730 
    731 		mutex_enter(&push_lock);
    732 		arg->a_next = req_freelist;	/* back on freelist */
    733 		req_freelist = arg;
    734 		push_list_size--;
    735 		mutex_exit(&push_lock);
    736 	}
    737 }
    738 
    739 /*
    740  * Kernel thread that scans pages looking for ones to free
    741  */
    742 static void
    743 pageout_scanner(void)
    744 {
    745 	struct page *fronthand, *backhand;
    746 	uint_t count;
    747 	callb_cpr_t cprinfo;
    748 	pgcnt_t	nscan_limit;
    749 	pgcnt_t	pcount;
    750 
    751 	CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan");
    752 	mutex_enter(&pageout_mutex);
    753 
    754 	/*
    755 	 * The restart case does not attempt to point the hands at roughly
    756 	 * the right point on the assumption that after one circuit things
    757 	 * will have settled down - and restarts shouldn't be that often.
    758 	 */
    759 
    760 	/*
    761 	 * Set the two clock hands to be separated by a reasonable amount,
    762 	 * but no more than 360 degrees apart.
    763 	 */
    764 	backhand = page_first();
    765 	if (handspreadpages >= total_pages)
    766 		fronthand = page_nextn(backhand, total_pages - 1);
    767 	else
    768 		fronthand = page_nextn(backhand, handspreadpages);
    769 
    770 	min_pageout_ticks = MAX(1,
    771 	    ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING);
    772 	max_pageout_ticks = MAX(min_pageout_ticks,
    773 	    ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING);
    774 
    775 loop:
    776 	cv_signal_pageout();
    777 
    778 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
    779 	cv_wait(&proc_pageout->p_cv, &pageout_mutex);
    780 	CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex);
    781 
    782 	if (!dopageout)
    783 		goto loop;
    784 
    785 	if (reset_hands) {
    786 		reset_hands = 0;
    787 
    788 		backhand = page_first();
    789 		if (handspreadpages >= total_pages)
    790 			fronthand = page_nextn(backhand, total_pages - 1);
    791 		else
    792 			fronthand = page_nextn(backhand, handspreadpages);
    793 	}
    794 
    795 	CPU_STATS_ADDQ(CPU, vm, pgrrun, 1);
    796 	count = 0;
    797 
    798 	TRACE_4(TR_FAC_VM, TR_PAGEOUT_START,
    799 	    "pageout_start:freemem %ld lotsfree %ld nscan %ld desscan %ld",
    800 	    freemem, lotsfree, nscan, desscan);
    801 
    802 	/* Kernel probe */
    803 	TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */,
    804 	    tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree);
    805 
    806 	pcount = 0;
    807 	if (pageout_sample_cnt < pageout_sample_lim) {
    808 		nscan_limit = total_pages;
    809 	} else {
    810 		nscan_limit = desscan;
    811 	}
    812 	pageout_lbolt = ddi_get_lbolt();
    813 	sample_start = gethrtime();
    814 
    815 	/*
    816 	 * Scan the appropriate number of pages for a single duty cycle.
    817 	 * However, stop scanning as soon as there is enough free memory.
    818 	 * For a short while, we will be sampling the performance of the
    819 	 * scanner and need to keep running just to get sample data, in
    820 	 * which case we keep going and don't pay attention to whether
    821 	 * or not there is enough free memory.
    822 	 */
    823 
    824 	while (nscan < nscan_limit && (freemem < lotsfree + needfree ||
    825 	    pageout_sample_cnt < pageout_sample_lim)) {
    826 		int rvfront, rvback;
    827 
    828 		/*
    829 		 * Check to see if we have exceeded our %CPU budget
    830 		 * for this wakeup, but not on every single page visited,
    831 		 * just every once in a while.
    832 		 */
    833 		if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) {
    834 			pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt;
    835 			if (pageout_cycle_ticks >= pageout_ticks) {
    836 				++pageout_timeouts;
    837 				break;
    838 			}
    839 		}
    840 
    841 		/*
    842 		 * If checkpage manages to add a page to the free list,
    843 		 * we give ourselves another couple of trips around the loop.
    844 		 */
    845 		if ((rvfront = checkpage(fronthand, FRONT)) == 1)
    846 			count = 0;
    847 		if ((rvback = checkpage(backhand, BACK)) == 1)
    848 			count = 0;
    849 
    850 		++pcount;
    851 
    852 		/*
    853 		 * protected by pageout_mutex instead of cpu_stat_lock
    854 		 */
    855 		CPU_STATS_ADDQ(CPU, vm, scan, 1);
    856 
    857 		/*
    858 		 * Don't include ineligible pages in the number scanned.
    859 		 */
    860 		if (rvfront != -1 || rvback != -1)
    861 			nscan++;
    862 
    863 		backhand = page_next(backhand);
    864 
    865 		/*
    866 		 * backhand update and wraparound check are done separately
    867 		 * because lint barks when it finds an empty "if" body
    868 		 */
    869 
    870 		if ((fronthand = page_next(fronthand)) == page_first())	{
    871 			TRACE_2(TR_FAC_VM, TR_PAGEOUT_HAND_WRAP,
    872 			    "pageout_hand_wrap:freemem %ld whichhand %d",
    873 			    freemem, FRONT);
    874 
    875 			/*
    876 			 * protected by pageout_mutex instead of cpu_stat_lock
    877 			 */
    878 			CPU_STATS_ADDQ(CPU, vm, rev, 1);
    879 			if (++count > 1) {
    880 				/*
    881 				 * Extremely unlikely, but it happens.
    882 				 * We went around the loop at least once
    883 				 * and didn't get far enough.
    884 				 * If we are still skipping `highly shared'
    885 				 * pages, skip fewer of them.  Otherwise,
    886 				 * give up till the next clock tick.
    887 				 */
    888 				if (po_share < MAX_PO_SHARE) {
    889 					po_share <<= 1;
    890 				} else {
    891 					/*
    892 					 * Really a "goto loop", but
    893 					 * if someone is TRACing or
    894 					 * TNF_PROBE_ing, at least
    895 					 * make records to show
    896 					 * where we are.
    897 					 */
    898 					break;
    899 				}
    900 			}
    901 		}
    902 	}
    903 
    904 	sample_end = gethrtime();
    905 
    906 	TRACE_5(TR_FAC_VM, TR_PAGEOUT_END,
    907 	    "pageout_end:freemem %ld lots %ld nscan %ld des %ld count %u",
    908 	    freemem, lotsfree, nscan, desscan, count);
    909 
    910 	/* Kernel probe */
    911 	TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */,
    912 	    tnf_ulong, pages_scanned, nscan, tnf_ulong, pages_free, freemem);
    913 
    914 	if (pageout_sample_cnt < pageout_sample_lim) {
    915 		pageout_sample_pages += pcount;
    916 		pageout_sample_etime += sample_end - sample_start;
    917 		++pageout_sample_cnt;
    918 	}
    919 	if (pageout_sample_cnt >= pageout_sample_lim &&
    920 	    pageout_new_spread == 0) {
    921 		pageout_rate = (hrrate_t)pageout_sample_pages *
    922 		    (hrrate_t)(NANOSEC) / pageout_sample_etime;
    923 		pageout_new_spread = pageout_rate / 10;
    924 		setupclock(1);
    925 	}
    926 
    927 	goto loop;
    928 }
    929 
    930 /*
    931  * Look at the page at hand.  If it is locked (e.g., for physical i/o),
    932  * system (u., page table) or free, then leave it alone.  Otherwise,
    933  * if we are running the front hand, turn off the page's reference bit.
    934  * If the proc is over maxrss, we take it.  If running the back hand,
    935  * check whether the page has been reclaimed.  If not, free the page,
    936  * pushing it to disk first if necessary.
    937  *
    938  * Return values:
    939  *	-1 if the page is not a candidate at all,
    940  *	 0 if not freed, or
    941  *	 1 if we freed it.
    942  */
    943 static int
    944 checkpage(struct page *pp, int whichhand)
    945 {
    946 	int ppattr;
    947 	int isfs = 0;
    948 	int isexec = 0;
    949 	int pagesync_flag;
    950 
    951 	/*
    952 	 * Skip pages:
    953 	 * 	- associated with the kernel vnode since
    954 	 *	    they are always "exclusively" locked.
    955 	 *	- that are free
    956 	 *	- that are shared more than po_share'd times
    957 	 *	- its already locked
    958 	 *
    959 	 * NOTE:  These optimizations assume that reads are atomic.
    960 	 */
    961 
    962 	if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) ||
    963 	    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
    964 	    hat_page_checkshare(pp, po_share)) {
    965 		return (-1);
    966 	}
    967 
    968 	if (!page_trylock(pp, SE_EXCL)) {
    969 		/*
    970 		 * Skip the page if we can't acquire the "exclusive" lock.
    971 		 */
    972 		return (-1);
    973 	} else if (PP_ISFREE(pp)) {
    974 		/*
    975 		 * It became free between the above check and our actually
    976 		 * locking the page.  Oh, well there will be other pages.
    977 		 */
    978 		page_unlock(pp);
    979 		return (-1);
    980 	}
    981 
    982 	/*
    983 	 * Reject pages that cannot be freed. The page_struct_lock
    984 	 * need not be acquired to examine these
    985 	 * fields since the page has an "exclusive" lock.
    986 	 */
    987 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
    988 		page_unlock(pp);
    989 		return (-1);
    990 	}
    991 
    992 	/*
    993 	 * Maintain statistics for what we are freeing
    994 	 */
    995 
    996 	if (pp->p_vnode != NULL) {
    997 		if (pp->p_vnode->v_flag & VVMEXEC)
    998 			isexec = 1;
    999 
   1000 		if (!IS_SWAPFSVP(pp->p_vnode))
   1001 			isfs = 1;
   1002 	}
   1003 
   1004 	/*
   1005 	 * Turn off REF and MOD bits with the front hand.
   1006 	 * The back hand examines the REF bit and always considers
   1007 	 * SHARED pages as referenced.
   1008 	 */
   1009 	if (whichhand == FRONT)
   1010 		pagesync_flag = HAT_SYNC_ZERORM;
   1011 	else
   1012 		pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF |
   1013 		    HAT_SYNC_STOPON_SHARED;
   1014 
   1015 	ppattr = hat_pagesync(pp, pagesync_flag);
   1016 
   1017 recheck:
   1018 	/*
   1019 	 * If page is referenced; make unreferenced but reclaimable.
   1020 	 * If this page is not referenced, then it must be reclaimable
   1021 	 * and we can add it to the free list.
   1022 	 */
   1023 	if (ppattr & P_REF) {
   1024 		TRACE_2(TR_FAC_VM, TR_PAGEOUT_ISREF,
   1025 		    "pageout_isref:pp %p whichhand %d", pp, whichhand);
   1026 		if (whichhand == FRONT) {
   1027 			/*
   1028 			 * Checking of rss or madvise flags needed here...
   1029 			 *
   1030 			 * If not "well-behaved", fall through into the code
   1031 			 * for not referenced.
   1032 			 */
   1033 			hat_clrref(pp);
   1034 		}
   1035 		/*
   1036 		 * Somebody referenced the page since the front
   1037 		 * hand went by, so it's not a candidate for
   1038 		 * freeing up.
   1039 		 */
   1040 		page_unlock(pp);
   1041 		return (0);
   1042 	}
   1043 
   1044 	VM_STAT_ADD(pageoutvmstats.checkpage[0]);
   1045 
   1046 	/*
   1047 	 * If large page, attempt to demote it. If successfully demoted,
   1048 	 * retry the checkpage.
   1049 	 */
   1050 	if (pp->p_szc != 0) {
   1051 		if (!page_try_demote_pages(pp)) {
   1052 			VM_STAT_ADD(pageoutvmstats.checkpage[1]);
   1053 			page_unlock(pp);
   1054 			return (-1);
   1055 		}
   1056 		ASSERT(pp->p_szc == 0);
   1057 		VM_STAT_ADD(pageoutvmstats.checkpage[2]);
   1058 		/*
   1059 		 * since page_try_demote_pages() could have unloaded some
   1060 		 * mappings it makes sense to reload ppattr.
   1061 		 */
   1062 		ppattr = hat_page_getattr(pp, P_MOD | P_REF);
   1063 	}
   1064 
   1065 	/*
   1066 	 * If the page is currently dirty, we have to arrange
   1067 	 * to have it cleaned before it can be freed.
   1068 	 *
   1069 	 * XXX - ASSERT(pp->p_vnode != NULL);
   1070 	 */
   1071 	if ((ppattr & P_MOD) && pp->p_vnode) {
   1072 		struct vnode *vp = pp->p_vnode;
   1073 		u_offset_t offset = pp->p_offset;
   1074 
   1075 		/*
   1076 		 * XXX - Test for process being swapped out or about to exit?
   1077 		 * [Can't get back to process(es) using the page.]
   1078 		 */
   1079 
   1080 		/*
   1081 		 * Hold the vnode before releasing the page lock to
   1082 		 * prevent it from being freed and re-used by some
   1083 		 * other thread.
   1084 		 */
   1085 		VN_HOLD(vp);
   1086 		page_unlock(pp);
   1087 
   1088 		/*
   1089 		 * Queue i/o request for the pageout thread.
   1090 		 */
   1091 		if (!queue_io_request(vp, offset)) {
   1092 			VN_RELE(vp);
   1093 			return (0);
   1094 		}
   1095 		return (1);
   1096 	}
   1097 
   1098 	/*
   1099 	 * Now we unload all the translations,
   1100 	 * and put the page back on to the free list.
   1101 	 * If the page was used (referenced or modified) after
   1102 	 * the pagesync but before it was unloaded we catch it
   1103 	 * and handle the page properly.
   1104 	 */
   1105 	TRACE_2(TR_FAC_VM, TR_PAGEOUT_FREE,
   1106 	    "pageout_free:pp %p whichhand %d", pp, whichhand);
   1107 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
   1108 	ppattr = hat_page_getattr(pp, P_MOD | P_REF);
   1109 	if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode))
   1110 		goto recheck;
   1111 
   1112 	/*LINTED: constant in conditional context*/
   1113 	VN_DISPOSE(pp, B_FREE, 0, kcred);
   1114 
   1115 	CPU_STATS_ADD_K(vm, dfree, 1);
   1116 
   1117 	if (isfs) {
   1118 		if (isexec) {
   1119 			CPU_STATS_ADD_K(vm, execfree, 1);
   1120 		} else {
   1121 			CPU_STATS_ADD_K(vm, fsfree, 1);
   1122 		}
   1123 	} else {
   1124 		CPU_STATS_ADD_K(vm, anonfree, 1);
   1125 	}
   1126 
   1127 	return (1);		/* freed a page! */
   1128 }
   1129 
   1130 /*
   1131  * Queue async i/o request from pageout_scanner and segment swapout
   1132  * routines on one common list.  This ensures that pageout devices (swap)
   1133  * are not saturated by pageout_scanner or swapout requests.
   1134  * The pageout thread empties this list by initiating i/o operations.
   1135  */
   1136 int
   1137 queue_io_request(vnode_t *vp, u_offset_t off)
   1138 {
   1139 	struct async_reqs *arg;
   1140 
   1141 	/*
   1142 	 * If we cannot allocate an async request struct,
   1143 	 * skip this page.
   1144 	 */
   1145 	mutex_enter(&push_lock);
   1146 	if ((arg = req_freelist) == NULL) {
   1147 		mutex_exit(&push_lock);
   1148 		return (0);
   1149 	}
   1150 	req_freelist = arg->a_next;		/* adjust freelist */
   1151 	push_list_size++;
   1152 
   1153 	arg->a_vp = vp;
   1154 	arg->a_off = off;
   1155 	arg->a_len = PAGESIZE;
   1156 	arg->a_flags = B_ASYNC | B_FREE;
   1157 	arg->a_cred = kcred;		/* always held */
   1158 
   1159 	/*
   1160 	 * Add to list of pending write requests.
   1161 	 */
   1162 	arg->a_next = push_list;
   1163 	push_list = arg;
   1164 
   1165 	if (req_freelist == NULL) {
   1166 		/*
   1167 		 * No free async requests left. The lock is held so we
   1168 		 * might as well signal the pusher thread now.
   1169 		 */
   1170 		cv_signal(&push_cv);
   1171 	}
   1172 	mutex_exit(&push_lock);
   1173 	return (1);
   1174 }
   1175 
   1176 /*
   1177  * Wakeup pageout to initiate i/o if push_list is not empty.
   1178  */
   1179 void
   1180 cv_signal_pageout()
   1181 {
   1182 	if (push_list != NULL) {
   1183 		mutex_enter(&push_lock);
   1184 		cv_signal(&push_cv);
   1185 		mutex_exit(&push_lock);
   1186 	}
   1187 }
   1188