Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #pragma ident	"@(#)vm_pvn.c	1.190	07/05/11 SMI"
     40 
     41 /*
     42  * VM - paged vnode.
     43  *
     44  * This file supplies vm support for the vnode operations that deal with pages.
     45  */
     46 #include <sys/types.h>
     47 #include <sys/t_lock.h>
     48 #include <sys/param.h>
     49 #include <sys/sysmacros.h>
     50 #include <sys/systm.h>
     51 #include <sys/time.h>
     52 #include <sys/buf.h>
     53 #include <sys/vnode.h>
     54 #include <sys/uio.h>
     55 #include <sys/vmmeter.h>
     56 #include <sys/vmsystm.h>
     57 #include <sys/mman.h>
     58 #include <sys/vfs.h>
     59 #include <sys/cred.h>
     60 #include <sys/user.h>
     61 #include <sys/kmem.h>
     62 #include <sys/cmn_err.h>
     63 #include <sys/debug.h>
     64 #include <sys/cpuvar.h>
     65 #include <sys/vtrace.h>
     66 #include <sys/tnf_probe.h>
     67 
     68 #include <vm/hat.h>
     69 #include <vm/as.h>
     70 #include <vm/seg.h>
     71 #include <vm/rm.h>
     72 #include <vm/pvn.h>
     73 #include <vm/page.h>
     74 #include <vm/seg_map.h>
     75 #include <vm/seg_kmem.h>
     76 #include <sys/fs/swapnode.h>
     77 
/*
 * Klustering switches.  When pvn_nofodklust is nonzero,
 * pvn_read_kluster() returns only the single page at the requested
 * offset.  When pvn_write_noklust is nonzero, pvn_write_kluster()
 * returns only the page it was handed.
 */
int pvn_nofodklust = 0;
int pvn_write_noklust = 0;

/*
 * pvn_vmodsort_supported is filled in by pvn_init() from
 * hat_supported(HAT_VMODSORT, NULL), but only when
 * pvn_vmodsort_disable is zero.
 */
uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
					/* support for vmodsort for testing */

/*
 * Cache of zeroed page_t structures created by pvn_init(); the marker
 * pages used by pvn_vplist_dirty() to walk a vnode's page list are
 * allocated from it.
 */
static struct kmem_cache *marker_cache = NULL;
     86 
     87 /*
     88  * Find the largest contiguous block which contains `addr' for file offset
     89  * `offset' in it while living within the file system block sizes (`vp_off'
     90  * and `vp_len') and the address space limits for which no pages currently
     91  * exist and which map to consecutive file offsets.
     92  */
     93 page_t *
     94 pvn_read_kluster(
     95 	struct vnode *vp,
     96 	u_offset_t off,
     97 	struct seg *seg,
     98 	caddr_t addr,
     99 	u_offset_t *offp,			/* return values */
    100 	size_t *lenp,				/* return values */
    101 	u_offset_t vp_off,
    102 	size_t vp_len,
    103 	int isra)
    104 {
    105 	ssize_t deltaf, deltab;
    106 	page_t *pp;
    107 	page_t *plist = NULL;
    108 	spgcnt_t pagesavail;
    109 	u_offset_t vp_end;
    110 
    111 	ASSERT(off >= vp_off && off < vp_off + vp_len);
    112 
    113 	/*
    114 	 * We only want to do klustering/read ahead if there
    115 	 * is more than minfree pages currently available.
    116 	 */
    117 	pagesavail = freemem - minfree;
    118 
    119 	if (pagesavail <= 0)
    120 		if (isra)
    121 			return ((page_t *)NULL);    /* ra case - give up */
    122 		else
    123 			pagesavail = 1;		    /* must return a page */
    124 
    125 	/* We calculate in pages instead of bytes due to 32-bit overflows */
    126 	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
    127 		/*
    128 		 * Don't have enough free memory for the
    129 		 * max request, try sizing down vp request.
    130 		 */
    131 		deltab = (ssize_t)(off - vp_off);
    132 		vp_len -= deltab;
    133 		vp_off += deltab;
    134 		if (pagesavail < btopr(vp_len)) {
    135 			/*
    136 			 * Still not enough memory, just settle for
    137 			 * pagesavail which is at least 1.
    138 			 */
    139 			vp_len = ptob(pagesavail);
    140 		}
    141 	}
    142 
    143 	vp_end = vp_off + vp_len;
    144 	ASSERT(off >= vp_off && off < vp_end);
    145 
    146 	if (isra && SEGOP_KLUSTER(seg, addr, 0))
    147 		return ((page_t *)NULL);	/* segment driver says no */
    148 
    149 	if ((plist = page_create_va(vp, off,
    150 	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
    151 		return ((page_t *)NULL);
    152 
    153 	if (vp_len <= PAGESIZE || pvn_nofodklust) {
    154 		*offp = off;
    155 		*lenp = MIN(vp_len, PAGESIZE);
    156 	} else {
    157 		/*
    158 		 * Scan back from front by incrementing "deltab" and
    159 		 * comparing "off" with "vp_off + deltab" to avoid
    160 		 * "signed" versus "unsigned" conversion problems.
    161 		 */
    162 		for (deltab = PAGESIZE; off >= vp_off + deltab;
    163 		    deltab += PAGESIZE) {
    164 			/*
    165 			 * Call back to the segment driver to verify that
    166 			 * the klustering/read ahead operation makes sense.
    167 			 */
    168 			if (SEGOP_KLUSTER(seg, addr, -deltab))
    169 				break;		/* page not eligible */
    170 			if ((pp = page_create_va(vp, off - deltab,
    171 			    PAGESIZE, PG_EXCL, seg, addr - deltab))
    172 			    == NULL)
    173 				break;		/* already have the page */
    174 			/*
    175 			 * Add page to front of page list.
    176 			 */
    177 			page_add(&plist, pp);
    178 		}
    179 		deltab -= PAGESIZE;
    180 
    181 		/* scan forward from front */
    182 		for (deltaf = PAGESIZE; off + deltaf < vp_end;
    183 		    deltaf += PAGESIZE) {
    184 			/*
    185 			 * Call back to the segment driver to verify that
    186 			 * the klustering/read ahead operation makes sense.
    187 			 */
    188 			if (SEGOP_KLUSTER(seg, addr, deltaf))
    189 				break;		/* page not file extension */
    190 			if ((pp = page_create_va(vp, off + deltaf,
    191 			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
    192 			    == NULL)
    193 				break;		/* already have page */
    194 
    195 			/*
    196 			 * Add page to end of page list.
    197 			 */
    198 			page_add(&plist, pp);
    199 			plist = plist->p_next;
    200 		}
    201 		*offp = off = off - deltab;
    202 		*lenp = deltab + deltaf;
    203 		ASSERT(off >= vp_off);
    204 
    205 		/*
    206 		 * If we ended up getting more than was actually
    207 		 * requested, retract the returned length to only
    208 		 * reflect what was requested.  This might happen
    209 		 * if we were allowed to kluster pages across a
    210 		 * span of (say) 5 frags, and frag size is less
    211 		 * than PAGESIZE.  We need a whole number of
    212 		 * pages to contain those frags, but the returned
    213 		 * size should only allow the returned range to
    214 		 * extend as far as the end of the frags.
    215 		 */
    216 		if ((vp_off + vp_len) < (off + *lenp)) {
    217 			ASSERT(vp_end > off);
    218 			*lenp = vp_end - off;
    219 		}
    220 	}
    221 	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
    222 		"pvn_read_kluster:seg %p addr %x isra %x",
    223 		seg, addr, isra);
    224 	return (plist);
    225 }
    226 
    227 /*
    228  * Handle pages for this vnode on either side of the page "pp"
    229  * which has been locked by the caller.  This routine will also
    230  * do klustering in the range [vp_off, vp_off + vp_len] up
    231  * until a page which is not found.  The offset and length
    232  * of pages included is returned in "*offp" and "*lenp".
    233  *
    234  * Returns a list of dirty locked pages all ready to be
    235  * written back.
    236  */
    237 page_t *
    238 pvn_write_kluster(
    239 	struct vnode *vp,
    240 	page_t *pp,
    241 	u_offset_t *offp,		/* return values */
    242 	size_t *lenp,			/* return values */
    243 	u_offset_t vp_off,
    244 	size_t vp_len,
    245 	int flags)
    246 {
    247 	u_offset_t off;
    248 	page_t *dirty;
    249 	size_t deltab, deltaf;
    250 	se_t se;
    251 	u_offset_t vp_end;
    252 
    253 	off = pp->p_offset;
    254 
    255 	/*
    256 	 * Kustering should not be done if we are invalidating
    257 	 * pages since we could destroy pages that belong to
    258 	 * some other process if this is a swap vnode.
    259 	 */
    260 	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
    261 		*offp = off;
    262 		*lenp = PAGESIZE;
    263 		return (pp);
    264 	}
    265 
    266 	if (flags & (B_FREE | B_INVAL))
    267 		se = SE_EXCL;
    268 	else
    269 		se = SE_SHARED;
    270 
    271 	dirty = pp;
    272 	/*
    273 	 * Scan backwards looking for pages to kluster by incrementing
    274 	 * "deltab" and comparing "off" with "vp_off + deltab" to
    275 	 * avoid "signed" versus "unsigned" conversion problems.
    276 	 */
    277 	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
    278 		pp = page_lookup_nowait(vp, off - deltab, se);
    279 		if (pp == NULL)
    280 			break;		/* page not found */
    281 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
    282 			break;
    283 		page_add(&dirty, pp);
    284 	}
    285 	deltab -= PAGESIZE;
    286 
    287 	vp_end = vp_off + vp_len;
    288 	/* now scan forwards looking for pages to kluster */
    289 	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
    290 		pp = page_lookup_nowait(vp, off + deltaf, se);
    291 		if (pp == NULL)
    292 			break;		/* page not found */
    293 		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
    294 			break;
    295 		page_add(&dirty, pp);
    296 		dirty = dirty->p_next;
    297 	}
    298 
    299 	*offp = off - deltab;
    300 	*lenp = deltab + deltaf;
    301 	return (dirty);
    302 }
    303 
    304 /*
    305  * Generic entry point used to release the "shared/exclusive" lock
    306  * and the "p_iolock" on pages after i/o is complete.
    307  */
    308 void
    309 pvn_io_done(page_t *plist)
    310 {
    311 	page_t *pp;
    312 
    313 	while (plist != NULL) {
    314 		pp = plist;
    315 		page_sub(&plist, pp);
    316 		page_io_unlock(pp);
    317 		page_unlock(pp);
    318 	}
    319 }
    320 
    321 /*
    322  * Entry point to be used by file system getpage subr's and
    323  * other such routines which either want to unlock pages (B_ASYNC
    324  * request) or destroy a list of pages if an error occurred.
    325  */
    326 void
    327 pvn_read_done(page_t *plist, int flags)
    328 {
    329 	page_t *pp;
    330 
    331 	while (plist != NULL) {
    332 		pp = plist;
    333 		page_sub(&plist, pp);
    334 		page_io_unlock(pp);
    335 		if (flags & B_ERROR) {
    336 			/*LINTED: constant in conditional context*/
    337 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
    338 		} else {
    339 			(void) page_release(pp, 0);
    340 		}
    341 	}
    342 }
    343 
/*
 * Automagic pageout.
 * When memory gets tight, start freeing pages popping out of the
 * write queue.
 *
 * pvn_write_done() adds B_FREE to the flags of a write that completed
 * without B_ERROR when write_free is nonzero and freemem is below
 * lotsfree + pages_before_pager.
 */
int	write_free = 1;
pgcnt_t	pages_before_pager = 200;	/* LMXXX */
    351 
    352 /*
    353  * Routine to be called when page-out's complete.
    354  * The caller, typically VOP_PUTPAGE, has to explicity call this routine
    355  * after waiting for i/o to complete (biowait) to free the list of
    356  * pages associated with the buffer.  These pages must be locked
    357  * before i/o is initiated.
    358  *
    359  * If a write error occurs, the pages are marked as modified
    360  * so the write will be re-tried later.
    361  */
    362 
    363 void
    364 pvn_write_done(page_t *plist, int flags)
    365 {
    366 	int dfree = 0;
    367 	int pgrec = 0;
    368 	int pgout = 0;
    369 	int pgpgout = 0;
    370 	int anonpgout = 0;
    371 	int anonfree = 0;
    372 	int fspgout = 0;
    373 	int fsfree = 0;
    374 	int execpgout = 0;
    375 	int execfree = 0;
    376 	page_t *pp;
    377 	struct cpu *cpup;
    378 	struct vnode *vp = NULL;	/* for probe */
    379 	uint_t ppattr;
    380 	kmutex_t *vphm = NULL;
    381 
    382 	ASSERT((flags & B_READ) == 0);
    383 
    384 	/*
    385 	 * If we are about to start paging anyway, start freeing pages.
    386 	 */
    387 	if (write_free && freemem < lotsfree + pages_before_pager &&
    388 	    (flags & B_ERROR) == 0) {
    389 		flags |= B_FREE;
    390 	}
    391 
    392 	/*
    393 	 * Handle each page involved in the i/o operation.
    394 	 */
    395 	while (plist != NULL) {
    396 		pp = plist;
    397 		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
    398 		page_sub(&plist, pp);
    399 
    400 		/* Kernel probe support */
    401 		if (vp == NULL)
    402 			vp = pp->p_vnode;
    403 
    404 		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
    405 			/*
    406 			 * Move page to the top of the v_page list.
    407 			 * Skip pages modified during IO.
    408 			 */
    409 			vphm = page_vnode_mutex(vp);
    410 			mutex_enter(vphm);
    411 			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
    412 				page_vpsub(&vp->v_pages, pp);
    413 				page_vpadd(&vp->v_pages, pp);
    414 			}
    415 			mutex_exit(vphm);
    416 		}
    417 
    418 		if (flags & B_ERROR) {
    419 			/*
    420 			 * Write operation failed.  We don't want
    421 			 * to destroy (or free) the page unless B_FORCE
    422 			 * is set. We set the mod bit again and release
    423 			 * all locks on the page so that it will get written
    424 			 * back again later when things are hopefully
    425 			 * better again.
    426 			 * If B_INVAL and B_FORCE is set we really have
    427 			 * to destroy the page.
    428 			 */
    429 			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
    430 				page_io_unlock(pp);
    431 				/*LINTED: constant in conditional context*/
    432 				VN_DISPOSE(pp, B_INVAL, 0, kcred);
    433 			} else {
    434 				hat_setmod_only(pp);
    435 				page_io_unlock(pp);
    436 				page_unlock(pp);
    437 			}
    438 		} else if (flags & B_INVAL) {
    439 			/*
    440 			 * XXX - Failed writes with B_INVAL set are
    441 			 * not handled appropriately.
    442 			 */
    443 			page_io_unlock(pp);
    444 			/*LINTED: constant in conditional context*/
    445 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
    446 		} else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
    447 			/*
    448 			 * Update statistics for pages being paged out
    449 			 */
    450 			if (pp->p_vnode) {
    451 				if (IS_SWAPFSVP(pp->p_vnode)) {
    452 					anonpgout++;
    453 				} else {
    454 					if (pp->p_vnode->v_flag & VVMEXEC) {
    455 						execpgout++;
    456 					} else {
    457 						fspgout++;
    458 					}
    459 				}
    460 			}
    461 			page_io_unlock(pp);
    462 			pgout = 1;
    463 			pgpgout++;
    464 			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
    465 				"page_ws_out:pp %p", pp);
    466 
    467 			/*
    468 			 * The page_struct_lock need not be acquired to
    469 			 * examine "p_lckcnt" and "p_cowcnt" since we'll
    470 			 * have an "exclusive" lock if the upgrade succeeds.
    471 			 */
    472 			if (page_tryupgrade(pp) &&
    473 			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
    474 				/*
    475 				 * Check if someone has reclaimed the
    476 				 * page.  If ref and mod are not set, no
    477 				 * one is using it so we can free it.
    478 				 * The rest of the system is careful
    479 				 * to use the NOSYNC flag to unload
    480 				 * translations set up for i/o w/o
    481 				 * affecting ref and mod bits.
    482 				 *
    483 				 * Obtain a copy of the real hardware
    484 				 * mod bit using hat_pagesync(pp, HAT_DONTZERO)
    485 				 * to avoid having to flush the cache.
    486 				 */
    487 				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
    488 					HAT_SYNC_STOPON_MOD);
    489 			ck_refmod:
    490 				if (!(ppattr & (P_REF | P_MOD))) {
    491 					if (hat_page_is_mapped(pp)) {
    492 						/*
    493 						 * Doesn't look like the page
    494 						 * was modified so now we
    495 						 * really have to unload the
    496 						 * translations.  Meanwhile
    497 						 * another CPU could've
    498 						 * modified it so we have to
    499 						 * check again.  We don't loop
    500 						 * forever here because now
    501 						 * the translations are gone
    502 						 * and no one can get a new one
    503 						 * since we have the "exclusive"
    504 						 * lock on the page.
    505 						 */
    506 						(void) hat_pageunload(pp,
    507 							HAT_FORCE_PGUNLOAD);
    508 						ppattr = hat_page_getattr(pp,
    509 							P_REF | P_MOD);
    510 						goto ck_refmod;
    511 					}
    512 					/*
    513 					 * Update statistics for pages being
    514 					 * freed
    515 					 */
    516 					if (pp->p_vnode) {
    517 						if (IS_SWAPFSVP(pp->p_vnode)) {
    518 							anonfree++;
    519 						} else {
    520 							if (pp->p_vnode->v_flag
    521 							    & VVMEXEC) {
    522 								execfree++;
    523 							} else {
    524 								fsfree++;
    525 							}
    526 						}
    527 					}
    528 					/*LINTED: constant in conditional ctx*/
    529 					VN_DISPOSE(pp, B_FREE,
    530 						(flags & B_DONTNEED), kcred);
    531 					dfree++;
    532 				} else {
    533 					page_unlock(pp);
    534 					pgrec++;
    535 					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
    536 					    "page_ws_free:pp %p", pp);
    537 				}
    538 			} else {
    539 				/*
    540 				 * Page is either `locked' in memory
    541 				 * or was reclaimed and now has a
    542 				 * "shared" lock, so release it.
    543 				 */
    544 				page_unlock(pp);
    545 			}
    546 		} else {
    547 			/*
    548 			 * Neither B_FREE nor B_INVAL nor B_ERROR.
    549 			 * Just release locks.
    550 			 */
    551 			page_io_unlock(pp);
    552 			page_unlock(pp);
    553 		}
    554 	}
    555 
    556 	CPU_STATS_ENTER_K();
    557 	cpup = CPU;		/* get cpup now that CPU cannot change */
    558 	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
    559 	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
    560 	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
    561 	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
    562 	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
    563 	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
    564 	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
    565 	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
    566 	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
    567 	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
    568 	CPU_STATS_EXIT_K();
    569 
    570 	/* Kernel probe */
    571 	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
    572 		tnf_opaque,	vnode,			vp,
    573 		tnf_ulong,	pages_pageout,		pgpgout,
    574 		tnf_ulong,	pages_freed,		dfree,
    575 		tnf_ulong,	pages_reclaimed,	pgrec);
    576 }
    577 
    578 /*
    579  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
    580  * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
    581  * operation and is only to be considered if it doesn't involve any
    582  * waiting here.  B_TRUNC indicates that the file is being truncated
    583  * and so no i/o needs to be done. B_FORCE indicates that the page
    584  * must be destroyed so don't try wrting it out.
    585  *
    586  * The caller must ensure that the page is locked.  Returns 1, if
    587  * the page should be written back (the "iolock" is held in this
    588  * case), or 0 if the page has been dealt with or has been
    589  * unlocked.
    590  */
    591 int
    592 pvn_getdirty(page_t *pp, int flags)
    593 {
    594 	ASSERT((flags & (B_INVAL | B_FREE)) ?
    595 	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
    596 	ASSERT(PP_ISFREE(pp) == 0);
    597 
    598 	/*
    599 	 * If trying to invalidate or free a logically `locked' page,
    600 	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
    601 	 * p_cowcnt as the page is exclusively locked.
    602 	 */
    603 	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
    604 	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
    605 		page_unlock(pp);
    606 		return (0);
    607 	}
    608 
    609 	/*
    610 	 * Now acquire the i/o lock so we can add it to the dirty
    611 	 * list (if necessary).  We avoid blocking on the i/o lock
    612 	 * in the following cases:
    613 	 *
    614 	 *	If B_DELWRI is set, which implies that this request is
    615 	 *	due to a klustering operartion.
    616 	 *
    617 	 *	If this is an async (B_ASYNC) operation and we are not doing
    618 	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
    619 	 *	that the the page is written out].
    620 	 */
    621 	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
    622 		if (!page_io_trylock(pp)) {
    623 			page_unlock(pp);
    624 			return (0);
    625 		}
    626 	} else {
    627 		page_io_lock(pp);
    628 	}
    629 
    630 	/*
    631 	 * If we want to free or invalidate the page then
    632 	 * we need to unload it so that anyone who wants
    633 	 * it will have to take a minor fault to get it.
    634 	 * Otherwise, we're just writing the page back so we
    635 	 * need to sync up the hardwre and software mod bit to
    636 	 * detect any future modifications.  We clear the
    637 	 * software mod bit when we put the page on the dirty
    638 	 * list.
    639 	 */
    640 	if (flags & (B_INVAL | B_FREE)) {
    641 		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
    642 	} else {
    643 		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
    644 	}
    645 
    646 	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
    647 		/*
    648 		 * Don't need to add it to the
    649 		 * list after all.
    650 		 */
    651 		page_io_unlock(pp);
    652 		if (flags & B_INVAL) {
    653 			/*LINTED: constant in conditional context*/
    654 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
    655 		} else if (flags & B_FREE) {
    656 			/*LINTED: constant in conditional context*/
    657 			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
    658 		} else {
    659 			/*
    660 			 * This is advisory path for the callers
    661 			 * of VOP_PUTPAGE() who prefer freeing the
    662 			 * page _only_ if no one else is accessing it.
    663 			 * E.g. segmap_release()
    664 			 *
    665 			 * The above hat_ismod() check is useless because:
    666 			 * (1) we may not be holding SE_EXCL lock;
    667 			 * (2) we've not unloaded _all_ translations
    668 			 *
    669 			 * Let page_release() do the heavy-lifting.
    670 			 */
    671 			(void) page_release(pp, 1);
    672 		}
    673 		return (0);
    674 	}
    675 
    676 	/*
    677 	 * Page is dirty, get it ready for the write back
    678 	 * and add page to the dirty list.
    679 	 */
    680 	hat_clrrefmod(pp);
    681 
    682 	/*
    683 	 * If we're going to free the page when we're done
    684 	 * then we can let others try to use it starting now.
    685 	 * We'll detect the fact that they used it when the
    686 	 * i/o is done and avoid freeing the page.
    687 	 */
    688 	if (flags & B_FREE)
    689 		page_downgrade(pp);
    690 
    691 
    692 	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
    693 
    694 	return (1);
    695 }
    696 
    697 
    698 /*ARGSUSED*/
    699 static int
    700 marker_constructor(void *buf, void *cdrarg, int kmflags)
    701 {
    702 	page_t *mark = buf;
    703 	bzero(mark, sizeof (page_t));
    704 	return (0);
    705 }
    706 
    707 void
    708 pvn_init()
    709 {
    710 	if (pvn_vmodsort_disable == 0)
    711 		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
    712 	marker_cache = kmem_cache_create("marker_cache",
    713 	    sizeof (page_t), 0, marker_constructor,
    714 	    NULL, NULL, NULL, NULL, 0);
    715 }
    716 
    717 
    718 /*
    719  * Process a vnode's page list for all pages whose offset is >= off.
    720  * Pages are to either be free'd, invalidated, or written back to disk.
    721  *
    722  * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
    723  * is specified, otherwise they are "shared" locked.
    724  *
    725  * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
    726  *
    727  * Special marker page_t's are inserted in the list in order
    728  * to keep track of where we are in the list when locks are dropped.
    729  *
    730  * Note the list is circular and insertions can happen only at the
    731  * head and tail of the list. The algorithm ensures visiting all pages
    732  * on the list in the following way:
    733  *
    734  *    Drop two marker pages at the end of the list.
    735  *
    736  *    Move one marker page backwards towards the start of the list until
    737  *    it is at the list head, processing the pages passed along the way.
    738  *
    739  *    Due to race conditions when the vphm mutex is dropped, additional pages
    740  *    can be added to either end of the list, so we'll continue to move
    741  *    the marker and process pages until it is up against the end marker.
    742  *
    743  * There is one special exit condition. If we are processing a VMODSORT
    744  * vnode and only writing back modified pages, we can stop as soon as
    745  * we run into an unmodified page.  This makes fsync(3) operations fast.
    746  */
    747 int
    748 pvn_vplist_dirty(
    749 	vnode_t		*vp,
    750 	u_offset_t	off,
    751 	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
    752 			size_t *, int, cred_t *),
    753 	int		flags,
    754 	cred_t		*cred)
    755 {
    756 	page_t		*pp;
    757 	page_t		*mark;		/* marker page that moves toward head */
    758 	page_t		*end;		/* marker page at end of list */
    759 	int		err = 0;
    760 	int		error;
    761 	kmutex_t	*vphm;
    762 	se_t		se;
    763 	page_t		**where_to_move;
    764 
    765 	ASSERT(vp->v_type != VCHR);
    766 
    767 	if (vp->v_pages == NULL)
    768 		return (0);
    769 
    770 
    771 	/*
    772 	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
    773 	 *
    774 	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
    775 	 * from getting blocked while flushing pages to a dead NFS server.
    776 	 */
    777 	mutex_enter(&vp->v_lock);
    778 	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
    779 		mutex_exit(&vp->v_lock);
    780 		return (EAGAIN);
    781 	}
    782 
    783 	while (vp->v_flag & VVMLOCK)
    784 		cv_wait(&vp->v_cv, &vp->v_lock);
    785 
    786 	if (vp->v_pages == NULL) {
    787 		mutex_exit(&vp->v_lock);
    788 		return (0);
    789 	}
    790 
    791 	vp->v_flag |= VVMLOCK;
    792 	mutex_exit(&vp->v_lock);
    793 
    794 
    795 	/*
    796 	 * Set up the marker pages used to walk the list
    797 	 */
    798 	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
    799 	end->p_vnode = vp;
    800 	end->p_offset = (u_offset_t)-2;
    801 	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
    802 	mark->p_vnode = vp;
    803 	mark->p_offset = (u_offset_t)-1;
    804 
    805 	/*
    806 	 * Grab the lock protecting the vnode's page list
    807 	 * note that this lock is dropped at times in the loop.
    808 	 */
    809 	vphm = page_vnode_mutex(vp);
    810 	mutex_enter(vphm);
    811 	if (vp->v_pages == NULL)
    812 		goto leave;
    813 
    814 	/*
    815 	 * insert the markers and loop through the list of pages
    816 	 */
    817 	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
    818 	page_vpadd(&mark->p_vpnext, end);
    819 	for (;;) {
    820 
    821 		/*
    822 		 * If only doing an async write back, then we can
    823 		 * stop as soon as we get to start of the list.
    824 		 */
    825 		if (flags == B_ASYNC && vp->v_pages == mark)
    826 			break;
    827 
    828 		/*
    829 		 * otherwise stop when we've gone through all the pages
    830 		 */
    831 		if (mark->p_vpprev == end)
    832 			break;
    833 
    834 		pp = mark->p_vpprev;
    835 		if (vp->v_pages == pp)
    836 			where_to_move = &vp->v_pages;
    837 		else
    838 			where_to_move = &pp->p_vpprev->p_vpnext;
    839 
    840 		ASSERT(pp->p_vnode == vp);
    841 
    842 		/*
    843 		 * Skip this page if the offset is out of the desired range.
    844 		 * Just move the marker and continue.
    845 		 */
    846 		if (pp->p_offset < off) {
    847 			page_vpsub(&vp->v_pages, mark);
    848 			page_vpadd(where_to_move, mark);
    849 			continue;
    850 		}
    851 
    852 		/*
    853 		 * If just flushing dirty pages to disk and this vnode
    854 		 * is using a sorted list of pages, we can stop processing
    855 		 * as soon as we find an unmodified page. Since all the
    856 		 * modified pages are visited first.
    857 		 */
    858 		if (IS_VMODSORT(vp) &&
    859 		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
    860 			if (!hat_ismod(pp) && !page_io_locked(pp)) {
    861 #ifdef  DEBUG
    862 				/*
    863 				 * For debug kernels examine what should be
    864 				 * all the remaining clean pages, asserting
    865 				 * that they are not modified.
    866 				 */
    867 				page_t	*chk = pp;
    868 				int	attr;
    869 
    870 				page_vpsub(&vp->v_pages, mark);
    871 				page_vpadd(where_to_move, mark);
    872 				do {
    873 					chk = chk->p_vpprev;
    874 					ASSERT(chk != end);
    875 					if (chk == mark)
    876 						continue;
    877 					attr = hat_page_getattr(chk, P_MOD |
    878 					    P_REF);
    879 					if ((attr & P_MOD) == 0)
    880 						continue;
    881 					panic("v_pages list not all clean: "
    882 					    "page_t*=%p vnode=%p off=%lx "
    883 					    "attr=0x%x last clean page_t*=%p\n",
    884 					    (void *)chk, (void *)chk->p_vnode,
    885 					    (long)chk->p_offset, attr,
    886 					    (void *)pp);
    887 				} while (chk != vp->v_pages);
    888 #endif
    889 				break;
    890 			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
    891 				/*
    892 				 * Couldn't get io lock, wait until IO is done.
    893 				 * Block only for sync IO since we don't want
    894 				 * to block async IO.
    895 				 */
    896 				mutex_exit(vphm);
    897 				page_io_wait(pp);
    898 				mutex_enter(vphm);
    899 				continue;
    900 			}
    901 		}
    902 
    903 		/*
    904 		 * If we are supposed to invalidate or free this
    905 		 * page, then we need an exclusive lock.
    906 		 */
    907 		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
    908 
    909 		/*
    910 		 * We must acquire the page lock for all synchronous
    911 		 * operations (invalidate, free and write).
    912 		 */
    913 		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
    914 			/*
    915 			 * If the page_lock() drops the mutex
    916 			 * we must retry the loop.
    917 			 */
    918 			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
    919 				continue;
    920 
    921 			/*
    922 			 * It's ok to move the marker page now.
    923 			 */
    924 			page_vpsub(&vp->v_pages, mark);
    925 			page_vpadd(where_to_move, mark);
    926 		} else {
    927 
    928 			/*
    929 			 * update the marker page for all remaining cases
    930 			 */
    931 			page_vpsub(&vp->v_pages, mark);
    932 			page_vpadd(where_to_move, mark);
    933 
    934 			/*
    935 			 * For write backs, If we can't lock the page, it's
    936 			 * invalid or in the process of being destroyed.  Skip
    937 			 * it, assuming someone else is writing it.
    938 			 */
    939 			if (!page_trylock(pp, se))
    940 				continue;
    941 		}
    942 
    943 		ASSERT(pp->p_vnode == vp);
    944 
    945 		/*
    946 		 * Successfully locked the page, now figure out what to
    947 		 * do with it. Free pages are easily dealt with, invalidate
    948 		 * if desired or just go on to the next page.
    949 		 */
    950 		if (PP_ISFREE(pp)) {
    951 			if ((flags & B_INVAL) == 0) {
    952 				page_unlock(pp);
    953 				continue;
    954 			}
    955 
    956 			/*
    957 			 * Invalidate (destroy) the page.
    958 			 */
    959 			mutex_exit(vphm);
    960 			page_destroy_free(pp);
    961 			mutex_enter(vphm);
    962 			continue;
    963 		}
    964 
    965 		/*
    966 		 * pvn_getdirty() figures out what to do with a dirty page.
    967 		 * If the page is dirty, the putapage() routine will write it
    968 		 * and will kluster any other adjacent dirty pages it can.
    969 		 *
    970 		 * pvn_getdirty() and `(*putapage)' unlock the page.
    971 		 */
    972 		mutex_exit(vphm);
    973 		if (pvn_getdirty(pp, flags)) {
    974 			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
    975 			if (!err)
    976 				err = error;
    977 		}
    978 		mutex_enter(vphm);
    979 	}
    980 	page_vpsub(&vp->v_pages, mark);
    981 	page_vpsub(&vp->v_pages, end);
    982 
    983 leave:
    984 	/*
    985 	 * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
    986 	 */
    987 	mutex_exit(vphm);
    988 	kmem_cache_free(marker_cache, mark);
    989 	kmem_cache_free(marker_cache, end);
    990 	mutex_enter(&vp->v_lock);
    991 	vp->v_flag &= ~VVMLOCK;
    992 	cv_broadcast(&vp->v_cv);
    993 	mutex_exit(&vp->v_lock);
    994 	return (err);
    995 }
    996 
    997 /*
    998  * Zero out zbytes worth of data. Caller should be aware that this
    999  * routine may enter back into the fs layer (xxx_getpage). Locks
   1000  * that the xxx_getpage routine may need should not be held while
   1001  * calling this.
   1002  */
   1003 void
   1004 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
   1005 {
   1006 	caddr_t addr;
   1007 
   1008 	ASSERT(vp->v_type != VCHR);
   1009 
   1010 	if (vp->v_pages == NULL)
   1011 		return;
   1012 
   1013 	/*
   1014 	 * zbytes may be zero but there still may be some portion of
   1015 	 * a page which needs clearing (since zbytes is a function
   1016 	 * of filesystem block size, not pagesize.)
   1017 	 */
   1018 	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
   1019 		return;
   1020 
   1021 	/*
   1022 	 * We get the last page and handle the partial
   1023 	 * zeroing via kernel mappings.  This will make the page
   1024 	 * dirty so that we know that when this page is written
   1025 	 * back, the zeroed information will go out with it.  If
   1026 	 * the page is not currently in memory, then the kzero
   1027 	 * operation will cause it to be brought it.  We use kzero
   1028 	 * instead of bzero so that if the page cannot be read in
   1029 	 * for any reason, the system will not panic.  We need
   1030 	 * to zero out a minimum of the fs given zbytes, but we
   1031 	 * might also have to do more to get the entire last page.
   1032 	 */
   1033 
   1034 	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
   1035 		panic("pvn_vptrunc zbytes");
   1036 	addr = segmap_getmapflt(segkmap, vp, vplen,
   1037 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
   1038 	(void) kzero(addr + (vplen & MAXBOFFSET),
   1039 	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
   1040 	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
   1041 }
   1042 
   1043 /*
   1044  * Handles common work of the VOP_GETPAGE routines when more than
   1045  * one page must be returned by calling a file system specific operation
   1046  * to do most of the work.  Must be called with the vp already locked
   1047  * by the VOP_GETPAGE routine.
   1048  */
   1049 int
   1050 pvn_getpages(
   1051 	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
   1052 		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
   1053 	struct vnode *vp,
   1054 	u_offset_t off,
   1055 	size_t len,
   1056 	uint_t *protp,
   1057 	page_t *pl[],
   1058 	size_t plsz,
   1059 	struct seg *seg,
   1060 	caddr_t addr,
   1061 	enum seg_rw rw,
   1062 	struct cred *cred)
   1063 {
   1064 	page_t **ppp;
   1065 	u_offset_t o, eoff;
   1066 	size_t sz, xlen;
   1067 	int err;
   1068 
   1069 	ASSERT(plsz >= len);		/* insure that we have enough space */
   1070 
   1071 	/*
   1072 	 * Loop one page at a time and let getapage function fill
   1073 	 * in the next page in array.  We only allow one page to be
   1074 	 * returned at a time (except for the last page) so that we
   1075 	 * don't have any problems with duplicates and other such
   1076 	 * painful problems.  This is a very simple minded algorithm,
   1077 	 * but it does the job correctly.  We hope that the cost of a
   1078 	 * getapage call for a resident page that we might have been
   1079 	 * able to get from an earlier call doesn't cost too much.
   1080 	 */
   1081 	ppp = pl;
   1082 	sz = PAGESIZE;
   1083 	eoff = off + len;
   1084 	xlen = len;
   1085 	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
   1086 	    xlen -= PAGESIZE) {
   1087 		if (o + PAGESIZE >= eoff) {
   1088 			/*
   1089 			 * Last time through - allow the all of
   1090 			 * what's left of the pl[] array to be used.
   1091 			 */
   1092 			sz = plsz - (o - off);
   1093 		}
   1094 		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
   1095 		    rw, cred);
   1096 		if (err) {
   1097 			/*
   1098 			 * Release any pages we already got.
   1099 			 */
   1100 			if (o > off && pl != NULL) {
   1101 				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
   1102 					(void) page_release(*ppp, 1);
   1103 			}
   1104 			break;
   1105 		}
   1106 		if (pl != NULL)
   1107 			ppp++;
   1108 	}
   1109 	return (err);
   1110 }
   1111 
   1112 /*
   1113  * Initialize the page list array.
   1114  */
   1115 /*ARGSUSED*/
   1116 void
   1117 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
   1118     u_offset_t off, size_t io_len, enum seg_rw rw)
   1119 {
   1120 	ssize_t sz;
   1121 	page_t *ppcur, **ppp;
   1122 
   1123 	/*
   1124 	 * Set up to load plsz worth
   1125 	 * starting at the needed page.
   1126 	 */
   1127 	while (pp != NULL && pp->p_offset != off) {
   1128 		/*
   1129 		 * Remove page from the i/o list,
   1130 		 * release the i/o and the page lock.
   1131 		 */
   1132 		ppcur = pp;
   1133 		page_sub(&pp, ppcur);
   1134 		page_io_unlock(ppcur);
   1135 		(void) page_release(ppcur, 1);
   1136 	}
   1137 
   1138 	if (pp == NULL) {
   1139 		pl[0] = NULL;
   1140 		return;
   1141 	}
   1142 
   1143 	sz = plsz;
   1144 
   1145 	/*
   1146 	 * Initialize the page list array.
   1147 	 */
   1148 	ppp = pl;
   1149 	do {
   1150 		ppcur = pp;
   1151 		*ppp++ = ppcur;
   1152 		page_sub(&pp, ppcur);
   1153 		page_io_unlock(ppcur);
   1154 		if (rw != S_CREATE)
   1155 			page_downgrade(ppcur);
   1156 		sz -= PAGESIZE;
   1157 	} while (sz > 0 && pp != NULL);
   1158 	*ppp = NULL;		/* terminate list */
   1159 
   1160 	/*
   1161 	 * Now free the remaining pages that weren't
   1162 	 * loaded in the page list.
   1163 	 */
   1164 	while (pp != NULL) {
   1165 		ppcur = pp;
   1166 		page_sub(&pp, ppcur);
   1167 		page_io_unlock(ppcur);
   1168 		(void) page_release(ppcur, 1);
   1169 	}
   1170 }
   1171