Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)vpm.c	1.4	07/10/25 SMI"
     27 
     28 /*
     29  * VM - generic vnode page mapping interfaces.
     30  *
     31  * Mechanism to provide temporary mappings to vnode pages.
     32  * The typical use would be to copy/access file data.
     33  */
     34 
     35 #include <sys/types.h>
     36 #include <sys/t_lock.h>
     37 #include <sys/param.h>
     38 #include <sys/sysmacros.h>
     39 #include <sys/buf.h>
     40 #include <sys/systm.h>
     41 #include <sys/vnode.h>
     42 #include <sys/mman.h>
     43 #include <sys/errno.h>
     44 #include <sys/cred.h>
     45 #include <sys/kmem.h>
     46 #include <sys/vtrace.h>
     47 #include <sys/cmn_err.h>
     48 #include <sys/debug.h>
     49 #include <sys/thread.h>
     50 #include <sys/dumphdr.h>
     51 #include <sys/bitmap.h>
     52 #include <sys/lgrp.h>
     53 
     54 #include <vm/seg_kmem.h>
     55 #include <vm/hat.h>
     56 #include <vm/as.h>
     57 #include <vm/seg.h>
     58 #include <vm/seg_kpm.h>
     59 #include <vm/seg_map.h>
     60 #include <vm/page.h>
     61 #include <vm/pvn.h>
     62 #include <vm/rm.h>
     63 #include <vm/vpm.h>
     64 
/*
 * Needs to be enabled by each platform; the default is 0 (disabled).
 * Nothing in this file reads vpm_enable -- presumably it is consulted by
 * the clients of these interfaces (TODO confirm against the callers).
 */
int vpm_enable = 0;
     69 
     70 #ifdef	SEGKPM_SUPPORT
     71 
     72 
/*
 * vpm cache tunables.  vpm_init() consumes these to size the cache and
 * to decide how many freelists to create.
 */

/* When 0, vpm_init() returns early and the map/unmap paths skip the cache. */
int	vpm_cache_enable = 1;
/* Percentage of physmem that vpm_init() uses to size the cache. */
long	vpm_cache_percent = 12;
/* Cache size computed by vpm_init(); never below VPMAP_MINCACHE. */
long	vpm_cache_size;
/* Number of freelists; 0 makes vpm_init() use max_ncpus. */
int	vpm_nfreelist = 0;
/* vpm_nfreelist - 1 (set in vpm_init()); masks freelist indices. */
int	vpmd_freemsk = 0;
     78 
/*
 * Per-cpu vpm state.  The union with vpm_pad[] makes every element at
 * least VPM_S_PAD bytes large.  The array vpmd_cpu[] is allocated in
 * vpm_init() with max_ncpus elements and is indexed by CPU->cpu_seqid.
 */
#define	VPM_S_PAD	64
union vpm_cpu {
	struct {
		/* rotor used by get_freelndx() to pick the next freelist */
		int	vcpu_free_ndx;
		/* get_vpmap() calls satisfied through the page's p_vpmref */
		ulong_t	vcpu_hits;
		/* get_vpmap() calls that had to use get_free_vpmap() */
		ulong_t vcpu_misses;
	} vcpu;
	char vpm_pad[VPM_S_PAD];
};
static union vpm_cpu	*vpmd_cpu;

/* Shorthand for the rotor member of a union vpm_cpu. */
#define	vfree_ndx	vcpu.vcpu_free_ndx
     91 
     92 int	vpm_cachemode = VPMCACHE_LRU;
     93 
     94 #define	PPMTX(pp) (&(pp)->p_ilock)
     95 
     96 static struct vpmap *vpmd_vpmap;	/* list of vpmap structs preallocated */
     97 static struct vpmfree *vpmd_free;
     98 #define	VPMAPMTX(vpm)	(&vpm->vpm_mtx)
     99 #define	VPMAP2VMF(vpm)	(&vpmd_free[(vpm - vpmd_vpmap) & vpmd_freemsk])
    100 #define	VPMAP2VMF_NDX(vpm)	(ushort_t)((vpm - vpmd_vpmap) & vpmd_freemsk)
    101 #define	VPMP(id)	(&vpmd_vpmap[id - 1])
    102 #define	VPMID(vpm)	(uint_t)((vpm - vpmd_vpmap) + 1)
    103 
    104 
#ifdef	DEBUG

/*
 * Event counters that are only maintained in DEBUG kernels.  Each field
 * is bumped through VPM_DEBUG(field) at the matching spot in the code.
 */
struct	vpm_debug {
	int vpmd_steals;
	int vpmd_contend;
	int vpmd_prevpagelocked;
	int vpmd_getpagefailed;
	int vpmd_zerostart;
	int vpmd_emptyfreelist;
	int vpmd_nofreevpms;
} vpm_debug;

#define	VPM_DEBUG(x)	((vpm_debug.x)++)

/*
 * Counters and masks for VPM_MTBF().  steals/steals_mtbf are used in
 * get_free_vpmap() and contend/contend_mtbf in get_vpmap().
 */
int	steals;
int	steals_mtbf = 7;
int	contend;
int	contend_mtbf = 127;

/*
 * Increments the counter v and evaluates to 0 whenever all bits of the
 * mask f are set in the new value (once every f + 1 calls for the masks
 * above), and to 1 otherwise.  The callers use the 0 result to exercise
 * their less common paths.
 */
#define	VPM_MTBF(v, f)	(((++(v)) & (f)) != (f))

#else	/* DEBUG */

/* Non-DEBUG kernels: VPM_MTBF() is always 1 and VPM_DEBUG() is empty. */
#define	VPM_MTBF(v, f)	(1)
#define	VPM_DEBUG(x)	/* nothing */

#endif
    132 
/*
 * The vpm cache.
 *
 * The main purpose of having a cache here is to speed up page_lookup()
 * operations and also to provide an LRU (default) behaviour for file pages.
 * The page_lookup() operation tends to be expensive if a page has to be
 * reclaimed from the system page cache ("cachelist"). Once the
 * page_lookup()->page_reclaim() path is made faster, there should be no
 * need for this cache. The system page cache (cachelist) should effectively
 * serve the purpose of caching file pages.
 *
 * This cache is very similar to segmap's smap cache. Each page in the
 * cache is tracked by the structure vpmap_t. But unlike segmap, there is no
 * hash table. The page_t has a reference to the vpmap_t when cached. For a
 * given vnode and offset, the page is found by means of a page_lookup()
 * operation. Any page which has a mapping (i.e. when cached) will not be in
 * the system 'cachelist'. Hence the page_lookup() will not have to do a
 * page_reclaim(). That is how the cache serves to speed up page_lookup()
 * operations.
 *
 * This cache can be disabled by setting vpm_cache_enable = 0 in /etc/system.
 */
    155 
    156 void
    157 vpm_init()
    158 {
    159 	long  npages;
    160 	struct vpmap *vpm;
    161 	struct vpmfree *vpmflp;
    162 	int i, ndx;
    163 	extern void prefetch_smap_w(void *);
    164 
    165 	if (!vpm_cache_enable) {
    166 		return;
    167 	}
    168 
    169 	/*
    170 	 * Set the size of the cache.
    171 	 */
    172 	vpm_cache_size = mmu_ptob((physmem * vpm_cache_percent)/100);
    173 	if (vpm_cache_size < VPMAP_MINCACHE) {
    174 		vpm_cache_size = VPMAP_MINCACHE;
    175 	}
    176 
    177 	/*
    178 	 * Number of freelists.
    179 	 */
    180 	if (vpm_nfreelist == 0) {
    181 		vpm_nfreelist = max_ncpus;
    182 	} else if (vpm_nfreelist < 0 || vpm_nfreelist > 2 * max_ncpus) {
    183 		cmn_err(CE_WARN, "vpmap create : number of freelist "
    184 		"vpm_nfreelist %d using %d", vpm_nfreelist, max_ncpus);
    185 		vpm_nfreelist = 2 * max_ncpus;
    186 	}
    187 
    188 	/*
    189 	 * Round it up to the next power of 2
    190 	 */
    191 	if (vpm_nfreelist & (vpm_nfreelist - 1)) {
    192 		vpm_nfreelist = 1 << (highbit(vpm_nfreelist));
    193 	}
    194 	vpmd_freemsk = vpm_nfreelist - 1;
    195 
    196 	/*
    197 	 * Use a per cpu rotor index to spread the allocations evenly
    198 	 * across the available vpm freelists.
    199 	 */
    200 	vpmd_cpu = kmem_zalloc(sizeof (union vpm_cpu) * max_ncpus, KM_SLEEP);
    201 	ndx = 0;
    202 	for (i = 0; i < max_ncpus; i++) {
    203 
    204 		vpmd_cpu[i].vfree_ndx = ndx;
    205 		ndx = (ndx + 1) & vpmd_freemsk;
    206 	}
    207 
    208 	/*
    209 	 * Allocate and initialize the freelist.
    210 	 */
    211 	vpmd_free = kmem_zalloc(vpm_nfreelist * sizeof (struct vpmfree),
    212 				KM_SLEEP);
    213 	for (i = 0; i < vpm_nfreelist; i++) {
    214 
    215 		vpmflp = &vpmd_free[i];
    216 		/*
    217 		 * Set up initial queue pointers. They will get flipped
    218 		 * back and forth.
    219 		 */
    220 		vpmflp->vpm_allocq = &vpmflp->vpm_freeq[VPMALLOCQ];
    221 		vpmflp->vpm_releq = &vpmflp->vpm_freeq[VPMRELEQ];
    222 	}
    223 
    224 	npages = mmu_btop(vpm_cache_size);
    225 
    226 
    227 	/*
    228 	 * Allocate and initialize the vpmap structs.
    229 	 */
    230 	vpmd_vpmap = kmem_zalloc(sizeof (struct vpmap) * npages, KM_SLEEP);
    231 	for (vpm = vpmd_vpmap; vpm <= &vpmd_vpmap[npages - 1]; vpm++) {
    232 		struct vpmfree *vpmflp;
    233 		union vpm_freeq *releq;
    234 		struct vpmap *vpmapf;
    235 
    236 		/*
    237 		 * Use prefetch as we have to walk thru a large number of
    238 		 * these data structures. We just use the smap's prefetch
    239 		 * routine as it does the same. This should work fine
    240 		 * for x64(this needs to be modified when enabled on sparc).
    241 		 */
    242 		prefetch_smap_w((void *)vpm);
    243 
    244 		vpm->vpm_free_ndx = VPMAP2VMF_NDX(vpm);
    245 
    246 		vpmflp = VPMAP2VMF(vpm);
    247 		releq = vpmflp->vpm_releq;
    248 
    249 		vpmapf = releq->vpmq_free;
    250 		if (vpmapf == NULL) {
    251 			releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
    252 		} else {
    253 			vpm->vpm_next = vpmapf;
    254 			vpm->vpm_prev = vpmapf->vpm_prev;
    255 			vpmapf->vpm_prev = vpm;
    256 			vpm->vpm_prev->vpm_next = vpm;
    257 			releq->vpmq_free = vpm->vpm_next;
    258 		}
    259 
    260 		/*
    261 		 * Indicate that the vpmap is on the releq at start
    262 		 */
    263 		vpm->vpm_ndxflg = VPMRELEQ;
    264 	}
    265 }
    266 
    267 
    268 /*
    269  * unhooks vpm from the freelist if it is still on the freelist.
    270  */
    271 #define	VPMAP_RMFREELIST(vpm) \
    272 	{ \
    273 		if (vpm->vpm_next != NULL) { \
    274 			union vpm_freeq *freeq; \
    275 			struct vpmfree *vpmflp; \
    276 			vpmflp = &vpmd_free[vpm->vpm_free_ndx]; \
    277 			freeq = &vpmflp->vpm_freeq[vpm->vpm_ndxflg]; \
    278 			mutex_enter(&freeq->vpmq_mtx); \
    279 			if (freeq->vpmq_free != vpm) { \
    280 				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
    281 				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
    282 			} else if (vpm == vpm->vpm_next) { \
    283 				freeq->vpmq_free = NULL; \
    284 			} else { \
    285 				freeq->vpmq_free = vpm->vpm_next; \
    286 				vpm->vpm_prev->vpm_next = vpm->vpm_next; \
    287 				vpm->vpm_next->vpm_prev = vpm->vpm_prev; \
    288 			} \
    289 			mutex_exit(&freeq->vpmq_mtx); \
    290 			vpm->vpm_next = vpm->vpm_prev = NULL; \
    291 		} \
    292 	}
    293 
    294 static int
    295 get_freelndx(int mode)
    296 {
    297 	int ndx;
    298 
    299 	ndx = vpmd_cpu[CPU->cpu_seqid].vfree_ndx & vpmd_freemsk;
    300 	switch (mode) {
    301 
    302 	case	VPMCACHE_LRU:
    303 	default:
    304 			vpmd_cpu[CPU->cpu_seqid].vfree_ndx++;
    305 			break;
    306 	}
    307 	return (ndx);
    308 }
    309 
    310 
    311 /*
    312  * Find one vpmap structure from the free lists and use it for the newpage.
    313  * The previous page it cached is dissociated and released. The page_t's
    314  * p_vpmref is cleared only when the vpm it is pointing to is locked(or
    315  * for AMD64 when the page is exclusively locked in page_unload. That is
    316  * because the p_vpmref is treated as mapping).
    317  *
    318  * The page's p_vpmref is set when the page is
    319  * locked(at least SHARED locked).
    320  */
    321 static struct vpmap *
    322 get_free_vpmap(page_t *newpage)
    323 {
    324 	struct vpmfree *vpmflp;
    325 	kmutex_t *vmtx;
    326 	struct vpmap *vpm, *first;
    327 	union vpm_freeq *allocq, *releq;
    328 	page_t *pp = NULL;
    329 	int end_ndx, page_locked = 0;
    330 	int free_ndx;
    331 
    332 	/*
    333 	 * get the freelist bin index.
    334 	 */
    335 	free_ndx = get_freelndx(vpm_cachemode);
    336 
    337 	end_ndx = free_ndx;
    338 	vpmflp = &vpmd_free[free_ndx];
    339 
    340 retry_queue:
    341 	allocq = vpmflp->vpm_allocq;
    342 	mutex_enter(&allocq->vpmq_mtx);
    343 
    344 	if ((vpm = allocq->vpmq_free) == NULL) {
    345 
    346 skip_queue:
    347 		/*
    348 		 * The alloc list is empty or this queue is being skipped;
    349 		 * first see if the allocq toggled.
    350 		 */
    351 		if (vpmflp->vpm_allocq != allocq) {
    352 			/* queue changed */
    353 			mutex_exit(&allocq->vpmq_mtx);
    354 			goto retry_queue;
    355 		}
    356 		releq = vpmflp->vpm_releq;
    357 		if (!mutex_tryenter(&releq->vpmq_mtx)) {
    358 			/* cannot get releq; a free vpmap may be there now */
    359 			mutex_exit(&allocq->vpmq_mtx);
    360 
    361 			/*
    362 			 * This loop could spin forever if this thread has
    363 			 * higher priority than the thread that is holding
    364 			 * releq->vpmq_mtx. In order to force the other thread
    365 			 * to run, we'll lock/unlock the mutex which is safe
    366 			 * since we just unlocked the allocq mutex.
    367 			 */
    368 			mutex_enter(&releq->vpmq_mtx);
    369 			mutex_exit(&releq->vpmq_mtx);
    370 			goto retry_queue;
    371 		}
    372 		if (releq->vpmq_free == NULL) {
    373 			VPM_DEBUG(vpmd_emptyfreelist);
    374 			/*
    375 			 * This freelist is empty.
    376 			 * This should not happen unless clients
    377 			 * are failing to release the vpmap after
    378 			 * accessing the data. Before resorting
    379 			 * to sleeping, try the next list of the same color.
    380 			 */
    381 			free_ndx = (free_ndx + 1) & vpmd_freemsk;
    382 			if (free_ndx != end_ndx) {
    383 				mutex_exit(&releq->vpmq_mtx);
    384 				mutex_exit(&allocq->vpmq_mtx);
    385 				vpmflp = &vpmd_free[free_ndx];
    386 				goto retry_queue;
    387 			}
    388 			/*
    389 			 * Tried all freelists.
    390 			 * wait on this list and hope something gets freed.
    391 			 */
    392 			vpmflp->vpm_want++;
    393 			mutex_exit(&vpmflp->vpm_freeq[1].vpmq_mtx);
    394 			cv_wait(&vpmflp->vpm_free_cv,
    395 				&vpmflp->vpm_freeq[0].vpmq_mtx);
    396 			vpmflp->vpm_want--;
    397 			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
    398 			vpmflp = &vpmd_free[free_ndx];
    399 			VPM_DEBUG(vpmd_nofreevpms);
    400 			goto retry_queue;
    401 		} else {
    402 			/*
    403 			 * Something on the rele queue; flip the alloc
    404 			 * and rele queues and retry.
    405 			 */
    406 			vpmflp->vpm_allocq = releq;
    407 			vpmflp->vpm_releq = allocq;
    408 			mutex_exit(&allocq->vpmq_mtx);
    409 			mutex_exit(&releq->vpmq_mtx);
    410 			if (page_locked) {
    411 				delay(hz >> 2);
    412 				page_locked = 0;
    413 			}
    414 			goto retry_queue;
    415 		}
    416 	} else {
    417 		int gotnewvpm;
    418 		kmutex_t *pmtx;
    419 		uint_t vpmref;
    420 
    421 		/*
    422 		 * Fastpath the case we get the vpmap mutex
    423 		 * on the first try.
    424 		 */
    425 		first = vpm;
    426 next_vpmap:
    427 		vmtx = VPMAPMTX(vpm);
    428 		if (!mutex_tryenter(vmtx)) {
    429 			/*
    430 			 * Another thread is trying to reclaim this slot.
    431 			 * Skip to the next queue or vpmap.
    432 			 */
    433 			if ((vpm = vpm->vpm_next) == first) {
    434 				goto skip_queue;
    435 			} else {
    436 				goto next_vpmap;
    437 			}
    438 		}
    439 
    440 		/*
    441 		 * Assign this vpm to the newpage.
    442 		 */
    443 		pmtx = PPMTX(newpage);
    444 		gotnewvpm = 0;
    445 		mutex_enter(pmtx);
    446 
    447 		/*
    448 		 * Check if some other thread already assigned a vpm to
    449 		 * this page.
    450 		 */
    451 		if ((vpmref = newpage->p_vpmref) == 0) {
    452 			newpage->p_vpmref = VPMID(vpm);
    453 			gotnewvpm = 1;
    454 		} else {
    455 			VPM_DEBUG(vpmd_contend);
    456 			mutex_exit(vmtx);
    457 		}
    458 		mutex_exit(pmtx);
    459 
    460 		if (gotnewvpm) {
    461 
    462 			/*
    463 			 * At this point, we've selected the vpm. Remove vpm
    464 			 * from its freelist. If vpm is the first one in
    465 			 * the freelist, update the head of the freelist.
    466 			 */
    467 			if (first == vpm) {
    468 				ASSERT(first == allocq->vpmq_free);
    469 				allocq->vpmq_free = vpm->vpm_next;
    470 			}
    471 
    472 			/*
    473 			 * If the head of the freelist still points to vpm,
    474 			 * then there are no more free vpmaps in that list.
    475 			 */
    476 			if (allocq->vpmq_free == vpm)
    477 				/*
    478 				 * Took the last one
    479 				 */
    480 				allocq->vpmq_free = NULL;
    481 			else {
    482 				vpm->vpm_prev->vpm_next = vpm->vpm_next;
    483 				vpm->vpm_next->vpm_prev = vpm->vpm_prev;
    484 			}
    485 			mutex_exit(&allocq->vpmq_mtx);
    486 			vpm->vpm_prev = vpm->vpm_next = NULL;
    487 
    488 			/*
    489 			 * Disassociate the previous page. On x64 systems
    490 			 * p_vpmref is used as a mapping reference to the page.
    491 			 */
    492 			if ((pp = vpm->vpm_pp) != NULL &&
    493 				vpm->vpm_vp == pp->p_vnode &&
    494 				vpm->vpm_off == pp->p_offset) {
    495 
    496 				pmtx = PPMTX(pp);
    497 				if (page_trylock(pp, SE_SHARED)) {
    498 					/*
    499 					 * Now verify that it is the correct
    500 					 * page. If not someone else stole it,
    501 					 * so just unlock it and leave.
    502 					 */
    503 					mutex_enter(pmtx);
    504 					if (PP_ISFREE(pp) ||
    505 						vpm->vpm_vp != pp->p_vnode ||
    506 						vpm->vpm_off != pp->p_offset ||
    507 						pp->p_vpmref != VPMID(vpm)) {
    508 						mutex_exit(pmtx);
    509 
    510 						page_unlock(pp);
    511 					} else {
    512 						/*
    513 						 * Release the page.
    514 						 */
    515 						pp->p_vpmref = 0;
    516 						mutex_exit(pmtx);
    517 						hat_kpm_mapout(pp, 0,
    518 							hat_kpm_page2va(pp, 1));
    519 						(void) page_release(pp, 1);
    520 					}
    521 				} else {
    522 					/*
    523 					 * If the page cannot be locked, just
    524 					 * clear the p_vpmref and go.
    525 					 */
    526 					mutex_enter(pmtx);
    527 					if (pp->p_vpmref == VPMID(vpm)) {
    528 						pp->p_vpmref = 0;
    529 					}
    530 					mutex_exit(pmtx);
    531 					VPM_DEBUG(vpmd_prevpagelocked);
    532 				}
    533 			}
    534 
    535 			/*
    536 			 * Setup vpm to point to the new page.
    537 			 */
    538 			vpm->vpm_pp = newpage;
    539 			vpm->vpm_vp = newpage->p_vnode;
    540 			vpm->vpm_off = newpage->p_offset;
    541 
    542 		} else {
    543 			int steal = !VPM_MTBF(steals, steals_mtbf);
    544 			/*
    545 			 * Page already has a vpm assigned just use that.
    546 			 * Grab the vpm mutex and verify that it is still
    547 			 * the correct one. The pp->p_vpmref should not change
    548 			 * once we have the vpm mutex and the page lock.
    549 			 */
    550 			mutex_exit(&allocq->vpmq_mtx);
    551 			vpm = VPMP(vpmref);
    552 			vmtx = VPMAPMTX(vpm);
    553 			mutex_enter(vmtx);
    554 			if ((steal && vpm->vpm_refcnt == 0) ||
    555 			    vpm->vpm_pp != newpage) {
    556 				/*
    557 				 * The vpm got stolen, retry.
    558 				 * clear the p_vpmref.
    559 				 */
    560 				pmtx = PPMTX(newpage);
    561 				mutex_enter(pmtx);
    562 				if (newpage->p_vpmref == vpmref) {
    563 					newpage->p_vpmref = 0;
    564 				}
    565 				mutex_exit(pmtx);
    566 
    567 				mutex_exit(vmtx);
    568 				VPM_DEBUG(vpmd_steals);
    569 				goto retry_queue;
    570 			} else if (vpm->vpm_refcnt == 0) {
    571 				/*
    572 				 * Remove it from the free list if it
    573 				 * exists there.
    574 				 */
    575 				VPMAP_RMFREELIST(vpm);
    576 			}
    577 		}
    578 		return (vpm);
    579 	}
    580 }
    581 
    582 static void
    583 free_vpmap(struct vpmap *vpm)
    584 {
    585 	struct vpmfree *vpmflp;
    586 	struct vpmap *vpmfreelist;
    587 	union vpm_freeq *releq;
    588 
    589 	ASSERT(MUTEX_HELD(VPMAPMTX(vpm)));
    590 
    591 	if (vpm->vpm_refcnt != 0) {
    592 		panic("free_vpmap");
    593 		/*NOTREACHED*/
    594 	}
    595 
    596 	vpmflp = &vpmd_free[vpm->vpm_free_ndx];
    597 	/*
    598 	 * Add to the tail of the release queue
    599 	 * Note that vpm_releq and vpm_allocq could toggle
    600 	 * before we get the lock. This does not affect
    601 	 * correctness as the 2 queues are only maintained
    602 	 * to reduce lock pressure.
    603 	 */
    604 	releq = vpmflp->vpm_releq;
    605 	if (releq == &vpmflp->vpm_freeq[0]) {
    606 		vpm->vpm_ndxflg = 0;
    607 	} else {
    608 		vpm->vpm_ndxflg = 1;
    609 	}
    610 	mutex_enter(&releq->vpmq_mtx);
    611 	vpmfreelist = releq->vpmq_free;
    612 	if (vpmfreelist == 0) {
    613 		int want;
    614 
    615 		releq->vpmq_free = vpm->vpm_next = vpm->vpm_prev = vpm;
    616 		/*
    617 		 * Both queue mutexes are held to set vpm_want;
    618 		 * snapshot the value before dropping releq mutex.
    619 		 * If vpm_want appears after the releq mutex is dropped,
    620 		 * then the vpmap just freed is already gone.
    621 		 */
    622 		want = vpmflp->vpm_want;
    623 		mutex_exit(&releq->vpmq_mtx);
    624 		/*
    625 		 * See if there was a waiter before dropping the releq mutex
    626 		 * then recheck after obtaining vpm_freeq[0] mutex as
    627 		 * the another thread may have already signaled.
    628 		 */
    629 		if (want) {
    630 			mutex_enter(&vpmflp->vpm_freeq[0].vpmq_mtx);
    631 			if (vpmflp->vpm_want)
    632 				cv_signal(&vpmflp->vpm_free_cv);
    633 			mutex_exit(&vpmflp->vpm_freeq[0].vpmq_mtx);
    634 		}
    635 	} else {
    636 		vpm->vpm_next = vpmfreelist;
    637 		vpm->vpm_prev = vpmfreelist->vpm_prev;
    638 		vpmfreelist->vpm_prev = vpm;
    639 		vpm->vpm_prev->vpm_next = vpm;
    640 		mutex_exit(&releq->vpmq_mtx);
    641 	}
    642 }
    643 
    644 /*
    645  * Get the vpmap for the page.
    646  * The refcnt of this vpm is incremented.
    647  */
    648 static struct vpmap *
    649 get_vpmap(page_t *pp)
    650 {
    651 	struct vpmap *vpm = NULL;
    652 	kmutex_t *vmtx;
    653 	kmutex_t *pmtx;
    654 	unsigned int refid;
    655 
    656 	ASSERT((pp != NULL) && PAGE_LOCKED(pp));
    657 
    658 	if (VPM_MTBF(contend, contend_mtbf) && (refid = pp->p_vpmref) != 0) {
    659 		vpm = VPMP(refid);
    660 		vmtx = VPMAPMTX(vpm);
    661 		mutex_enter(vmtx);
    662 		/*
    663 		 * Since we have the page lock and the vpm mutex, the
    664 		 * pp->p_vpmref cannot change.
    665 		 */
    666 		if (vpm->vpm_pp != pp) {
    667 			pmtx = PPMTX(pp);
    668 
    669 			/*
    670 			 * Clear the p_vpmref as it is incorrect.
    671 			 * This can happen if the page was stolen.
    672 			 * On x64 this should not happen as p_vpmref
    673 			 * is treated as a mapping on the page. So
    674 			 * if the page is stolen, the mapping would have
    675 			 * been cleared in page_unload().
    676 			 */
    677 			mutex_enter(pmtx);
    678 			if (pp->p_vpmref == refid)
    679 				pp->p_vpmref = 0;
    680 			mutex_exit(pmtx);
    681 
    682 			mutex_exit(vmtx);
    683 			vpm = NULL;
    684 		} else if (vpm->vpm_refcnt == 0) {
    685 			/*
    686 			 * Got the vpm, remove it from the free
    687 			 * list if it exists there.
    688 			 */
    689 			VPMAP_RMFREELIST(vpm);
    690 		}
    691 	}
    692 	if (vpm == NULL) {
    693 		/*
    694 		 * get_free_vpmap() returns with the vpmap mutex held.
    695 		 */
    696 		vpm = get_free_vpmap(pp);
    697 		vmtx = VPMAPMTX(vpm);
    698 		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_misses++;
    699 	} else {
    700 		vpmd_cpu[CPU->cpu_seqid].vcpu.vcpu_hits++;
    701 	}
    702 
    703 	vpm->vpm_refcnt++;
    704 	mutex_exit(vmtx);
    705 
    706 	return (vpm);
    707 }
    708 
    709 /* END --- vpm cache ---- */
    710 
    711 /*
    712  * The vnode page mapping(vpm) interface routines.
    713  */
    714 
    715 /*
    716  * Find or create the pages starting form baseoff for specified
    717  * length 'len'.
    718  */
    719 static int
    720 vpm_pagecreate(
    721 	struct vnode *vp,
    722 	u_offset_t baseoff,
    723 	size_t len,
    724 	vmap_t vml[],
    725 	int nseg,
    726 	int *newpage)
    727 {
    728 
    729 	page_t *pp = NULL;
    730 	caddr_t base;
    731 	u_offset_t off = baseoff;
    732 	int i;
    733 	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
    734 
    735 	for (i = 0; len > 0; len -= PAGESIZE, i++) {
    736 		struct vpmap *vpm;
    737 
    738 
    739 		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
    740 
    741 			base = segkpm_create_va(off);
    742 
    743 			/*
    744 			 * the seg pointer passed in is just advisor. Just
    745 			 * pass segkmap for now like segmap does with
    746 			 * segmap_kpm enabled.
    747 			 */
    748 			if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT,
    749 			    segkmap, base)) == NULL) {
    750 				panic("segmap_pagecreate_vpm: "
    751 				    "page_create failed");
    752 				/*NOTREACHED*/
    753 			}
    754 			if (newpage != NULL)
    755 				*newpage = 1;
    756 
    757 			page_io_unlock(pp);
    758 		}
    759 
    760 		/*
    761 		 * Get the vpm for this page_t.
    762 		 */
    763 		if (vpm_cache_enable) {
    764 			vpm = get_vpmap(pp);
    765 			vml[i].vs_data = (void *)&vpm->vpm_pp;
    766 		} else {
    767 			vml[i].vs_data = (void *)pp;
    768 			pp->p_vpmref = 0;
    769 		}
    770 
    771 		vml[i].vs_addr = hat_kpm_mapin(pp, 0);
    772 		vml[i].vs_len = PAGESIZE;
    773 
    774 		off += PAGESIZE;
    775 	}
    776 	vml[i].vs_data = NULL;
    777 	vml[i].vs_addr = (caddr_t)NULL;
    778 	return (0);
    779 }
    780 
    781 
    782 /*
    783  * Returns vpm mappings of pages in the range [off, off+len], where
    784  * len is rounded up to the PAGESIZE boundary. The list of pages and
    785  * the page addresses are returned in the SGL vml (vmap_t) array passed in.
    786  * The nseg is the number of vmap_t entries in the array.
    787  *
    788  * Currently max len allowed is MAXBSIZE therefore, it will either
    789  * fetch/create one or two pages depending on what is the PAGESIZE.
    790  *
    791  * The segmap's SM_LOCKPROTO  usage is not supported by these interfaces.
    792  * For such cases, use the seg_map interfaces.
    793  */
    794 int
    795 vpm_map_pages(
    796 	struct vnode *vp,
    797 	u_offset_t off,
    798 	size_t len,
    799 	int fetchpage,
    800 	vmap_t *vml,
    801 	int nseg,
    802 	int  *newpage,
    803 	enum seg_rw rw)
    804 {
    805 	extern struct vnode *common_specvp();
    806 	u_offset_t baseoff;
    807 	uint_t prot;
    808 	caddr_t base;
    809 	page_t *pp, *pplist[MAXVMAPS];
    810 	struct vpmap *vpm;
    811 	int i, error = 0;
    812 
    813 	ASSERT(nseg >= MINVMAPS && nseg < MAXVMAPS);
    814 	baseoff = off & (offset_t)PAGEMASK;
    815 	vml[0].vs_data = NULL;
    816 	vml[0].vs_addr = (caddr_t)NULL;
    817 	/*
    818 	 * For now, lets restrict it to MAXBSIZE. XXX - We can allow
    819 	 * len longer then MAXBSIZE, but there should be a limit
    820 	 * which should be determined by how many pages the VOP_GETPAGE()
    821 	 * can fetch.
    822 	 */
    823 	if (off + len > baseoff + MAXBSIZE) {
    824 		panic("vpm_map_pages bad len");
    825 		/*NOTREACHED*/
    826 	}
    827 
    828 	/*
    829 	 * If this is a block device we have to be sure to use the
    830 	 * "common" block device vnode for the mapping.
    831 	 */
    832 	if (vp->v_type == VBLK)
    833 		vp = common_specvp(vp);
    834 
    835 	/*
    836 	 * round up len to a multiple of PAGESIZE.
    837 	 */
    838 	len = ((off + len - baseoff + PAGESIZE - 1) & (uintptr_t)PAGEMASK);
    839 
    840 	if (!fetchpage)
    841 		return (vpm_pagecreate(vp, baseoff, len, vml, nseg, newpage));
    842 
    843 	for (i = 0; len > 0; len -= PAGESIZE, i++, pplist[i] = NULL) {
    844 
    845 		pp = page_lookup(vp, baseoff, SE_SHARED);
    846 
    847 		/*
    848 		 * If we did not find the page or if this page was not
    849 		 * in our cache, then let VOP_GETPAGE get all the pages.
    850 		 * We need to call VOP_GETPAGE so that filesytems can do some
    851 		 * (un)necessary tracking for sequential access.
    852 		 */
    853 
    854 		if (pp == NULL || (vpm_cache_enable && pp->p_vpmref == 0) ||
    855 			(rw == S_WRITE && hat_page_getattr(pp, P_MOD | P_REF)
    856 							!= (P_MOD | P_REF))) {
    857 			if (pp != NULL) {
    858 				page_unlock(pp);
    859 			}
    860 
    861 			/*
    862 			 * Pass a dummy address as it will be required
    863 			 * by page_create_va(). We pass segkmap as the seg
    864 			 * as some file systems(UFS) check it.
    865 			 */
    866 			base = segkpm_create_va(baseoff);
    867 
    868 			error = VOP_GETPAGE(vp, baseoff, len, &prot, &pplist[i],
    869 			len, segkmap, base, rw, CRED(), NULL);
    870 			if (error) {
    871 				VPM_DEBUG(vpmd_getpagefailed);
    872 				pplist[i] = NULL;
    873 			}
    874 			break;
    875 		} else {
    876 			pplist[i] = pp;
    877 			baseoff += PAGESIZE;
    878 		}
    879 	}
    880 
    881 	if (error) {
    882 		for (i = 0; pplist[i] != NULL; i++) {
    883 			page_unlock(pplist[i]);
    884 			pplist[i] = NULL;
    885 		}
    886 		vml[0].vs_addr = NULL;
    887 		vml[0].vs_data = NULL;
    888 		return (error);
    889 	}
    890 
    891 	/*
    892 	 * Get the vpm's for pages.
    893 	 */
    894 	for (i = 0; pplist[i] != NULL; i++) {
    895 		if (vpm_cache_enable) {
    896 			vpm = get_vpmap(pplist[i]);
    897 			vml[i].vs_data = (void *)&(vpm->vpm_pp);
    898 		} else {
    899 			vml[i].vs_data = (void *)pplist[i];
    900 			pplist[i]->p_vpmref = 0;
    901 		}
    902 
    903 		vml[i].vs_addr = hat_kpm_mapin(pplist[i], 0);
    904 		vml[i].vs_len = PAGESIZE;
    905 	}
    906 
    907 	vml[i].vs_data = NULL;
    908 	vml[i].vs_addr = (caddr_t)NULL;
    909 
    910 	return (0);
    911 }
    912 
    913 /*
    914  * Release the vpm mappings on the pages and unlock them.
    915  */
    916 void
    917 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
    918 {
    919 	int i;
    920 	struct vpmap *vpm;
    921 	kmutex_t *mtx;
    922 	page_t *pp;
    923 
    924 	for (i = 0; vml[i].vs_data != NULL; i++) {
    925 		ASSERT(IS_KPM_ADDR(vml[i].vs_addr));
    926 
    927 		if (vpm_cache_enable) {
    928 			pp = *(((page_t **)vml[i].vs_data));
    929 		} else {
    930 			pp = (page_t *)vml[i].vs_data;
    931 		}
    932 
    933 		/*
    934 		 * Mark page as being modified or referenced, bacause vpm pages
    935 		 * would not cause faults where it would be set normally.
    936 		 */
    937 		if (rw == S_WRITE) {
    938 			hat_setrefmod(pp);
    939 		} else {
    940 			ASSERT(rw == S_READ);
    941 			hat_setref(pp);
    942 		}
    943 
    944 		if (vpm_cache_enable) {
    945 			page_unlock(pp);
    946 			vpm = (struct vpmap *)((char *)vml[i].vs_data
    947 					- offsetof(struct vpmap, vpm_pp));
    948 			mtx = VPMAPMTX(vpm);
    949 			mutex_enter(mtx);
    950 
    951 			if (--vpm->vpm_refcnt == 0) {
    952 				free_vpmap(vpm);
    953 			}
    954 			mutex_exit(mtx);
    955 		} else {
    956 			hat_kpm_mapout(pp, 0, vml[i].vs_addr);
    957 			(void) page_release(pp, 1);
    958 		}
    959 		vml[i].vs_data = NULL;
    960 		vml[i].vs_addr = NULL;
    961 	}
    962 }
    963 
    964 /*
    965  * Given the vp, off and the uio structure, this routine will do the
    966  * the copy (uiomove). If the last page created is partially written,
    967  * the rest of the page is zeroed out. It also zeros the beginning of
    968  * the first page till the start offset if requested(zerostart).
    969  * If pages are to be fetched, it will call the filesystem's getpage
    970  * function (VOP_GETPAGE) to get them, otherwise they will be created if
    971  * not already present in the page cache.
    972  */
    973 int
    974 vpm_data_copy(struct vnode *vp,
    975 	u_offset_t off,
    976 	size_t len,
    977 	struct uio *uio,
    978 	int fetchpage,
    979 	int *newpage,
    980 	int zerostart,
    981 	enum seg_rw rw)
    982 {
    983 	int error;
    984 	struct vmap vml[MINVMAPS];
    985 	enum uio_rw uiorw;
    986 	int npages = 0;
    987 
    988 	uiorw = (rw == S_WRITE) ? UIO_WRITE : UIO_READ;
    989 	/*
    990 	 * 'off' will be the offset where the I/O starts.
    991 	 * We get the pages starting at the (off & PAGEMASK)
    992 	 * page boundary.
    993 	 */
    994 	error = vpm_map_pages(vp, off, (uint_t)len,
    995 		fetchpage, vml, MINVMAPS, &npages,  rw);
    996 
    997 	if (newpage != NULL)
    998 		*newpage = npages;
    999 	if (!error) {
   1000 		int i, pn, slen = len;
   1001 		int pon = off & PAGEOFFSET;
   1002 
   1003 		/*
   1004 		 * Clear from the beginning of the page to start offset
   1005 		 * if requested.
   1006 		 */
   1007 		if (!fetchpage && zerostart) {
   1008 			(void) kzero(vml[0].vs_addr,  (uint_t)pon);
   1009 			VPM_DEBUG(vpmd_zerostart);
   1010 		}
   1011 
   1012 		for (i = 0; !error && slen > 0 &&
   1013 				vml[i].vs_addr != NULL; i++) {
   1014 			pn = (int)MIN(slen, (PAGESIZE - pon));
   1015 			error = uiomove(vml[i].vs_addr + pon,
   1016 				    (long)pn, uiorw, uio);
   1017 			slen -= pn;
   1018 			pon = 0;
   1019 		}
   1020 
   1021 		/*
   1022 		 * When new pages are created, zero out part of the
   1023 		 * page we did not copy to.
   1024 		 */
   1025 		if (!fetchpage && npages &&
   1026 			uio->uio_loffset < roundup(off + len, PAGESIZE)) {
   1027 			int nzero;
   1028 
   1029 			pon = (uio->uio_loffset & PAGEOFFSET);
   1030 			nzero = PAGESIZE  - pon;
   1031 			i = (uio->uio_loffset - (off & PAGEMASK)) / PAGESIZE;
   1032 			(void) kzero(vml[i].vs_addr + pon, (uint_t)nzero);
   1033 		}
   1034 		vpm_unmap_pages(vml, rw);
   1035 	}
   1036 	return (error);
   1037 }
   1038 
   1039 /*
   1040  * called to flush pages for the given vnode covering
   1041  * [off, off+len] range.
   1042  */
   1043 int
   1044 vpm_sync_pages(struct vnode *vp,
   1045 		u_offset_t off,
   1046 		size_t len,
   1047 		uint_t flags)
   1048 {
   1049 	extern struct vnode *common_specvp();
   1050 	int bflags = 0;
   1051 	int error = 0;
   1052 	size_t psize = roundup(len, PAGESIZE);
   1053 
   1054 	/*
   1055 	 * If this is a block device we have to be sure to use the
   1056 	 * "common" block device vnode for the mapping.
   1057 	 */
   1058 	if (vp->v_type == VBLK)
   1059 		vp = common_specvp(vp);
   1060 
   1061 	if ((flags & ~SM_DONTNEED) != 0) {
   1062 		if (flags & SM_ASYNC)
   1063 			bflags |= B_ASYNC;
   1064 		if (flags & SM_INVAL)
   1065 			bflags |= B_INVAL;
   1066 		if (flags & SM_DESTROY)
   1067 			bflags |= (B_INVAL|B_TRUNC);
   1068 		if (flags & SM_FREE)
   1069 			bflags |= B_FREE;
   1070 		if (flags & SM_DONTNEED)
   1071 			bflags |= B_DONTNEED;
   1072 
   1073 		error = VOP_PUTPAGE(vp, off, psize, bflags, CRED(), NULL);
   1074 	}
   1075 
   1076 	return (error);
   1077 }
   1078 
   1079 
   1080 #else	/* SEGKPM_SUPPORT */
   1081 
   1082 /* vpm stubs */
   1083 void
   1084 vpm_init()
   1085 {
   1086 }
   1087 
   1088 /*ARGSUSED*/
   1089 int
   1090 vpm_pagecreate(
   1091 	struct vnode *vp,
   1092 	u_offset_t baseoff,
   1093 	size_t len,
   1094 	vmap_t vml[],
   1095 	int nseg,
   1096 	int *newpage)
   1097 {
   1098 	return (0);
   1099 }
   1100 
   1101 /*ARGSUSED*/
   1102 int
   1103 vpm_map_pages(
   1104 	struct vnode *vp,
   1105 	u_offset_t off,
   1106 	size_t len,
   1107 	int fetchpage,
   1108 	vmap_t vml[],
   1109 	int nseg,
   1110 	int *newpage,
   1111 	enum seg_rw rw)
   1112 {
   1113 	return (0);
   1114 }
   1115 
   1116 /*ARGSUSED*/
   1117 int
   1118 vpm_data_copy(struct vnode *vp,
   1119 	u_offset_t off,
   1120 	size_t len,
   1121 	struct uio *uio,
   1122 	int fetchpage,
   1123 	int *newpage,
   1124 	int zerostart,
   1125 	enum seg_rw rw)
   1126 {
   1127 	return (0);
   1128 }
   1129 
   1130 /*ARGSUSED*/
   1131 void
   1132 vpm_unmap_pages(vmap_t vml[], enum seg_rw rw)
   1133 {
   1134 }
   1135 /*ARGSUSED*/
   1136 int
   1137 vpm_sync_pages(struct vnode *vp,
   1138 		u_offset_t off,
   1139 		size_t len,
   1140 		uint_t flags)
   1141 {
   1142 	return (0);
   1143 }
   1144 #endif	/* SEGKPM_SUPPORT */
   1145