Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * Portions of this source code were derived from Berkeley 4.3 BSD
     31  * under license from the Regents of the University of California.
     32  */
     33 
     34 #pragma ident	"@(#)seg_map.c	1.143	07/10/25 SMI"
     35 
     36 /*
     37  * VM - generic vnode mapping segment.
     38  *
     39  * The segmap driver is used only by the kernel to get faster (than seg_vn)
     40  * mappings [lower routine overhead; more persistent cache] to random
 * vnode/offsets.  Note that the kernel may (and does) use seg_vn as well.
     42  */
     43 
     44 #include <sys/types.h>
     45 #include <sys/t_lock.h>
     46 #include <sys/param.h>
     47 #include <sys/sysmacros.h>
     48 #include <sys/buf.h>
     49 #include <sys/systm.h>
     50 #include <sys/vnode.h>
     51 #include <sys/mman.h>
     52 #include <sys/errno.h>
     53 #include <sys/cred.h>
     54 #include <sys/kmem.h>
     55 #include <sys/vtrace.h>
     56 #include <sys/cmn_err.h>
     57 #include <sys/debug.h>
     58 #include <sys/thread.h>
     59 #include <sys/dumphdr.h>
     60 #include <sys/bitmap.h>
     61 #include <sys/lgrp.h>
     62 
     63 #include <vm/seg_kmem.h>
     64 #include <vm/hat.h>
     65 #include <vm/as.h>
     66 #include <vm/seg.h>
     67 #include <vm/seg_kpm.h>
     68 #include <vm/seg_map.h>
     69 #include <vm/page.h>
     70 #include <vm/pvn.h>
     71 #include <vm/rm.h>
     72 
     73 /*
     74  * Private seg op routines.
     75  */
     76 static void	segmap_free(struct seg *seg);
     77 faultcode_t segmap_fault(struct hat *hat, struct seg *seg, caddr_t addr,
     78 			size_t len, enum fault_type type, enum seg_rw rw);
     79 static faultcode_t segmap_faulta(struct seg *seg, caddr_t addr);
     80 static int	segmap_checkprot(struct seg *seg, caddr_t addr, size_t len,
     81 			uint_t prot);
     82 static int	segmap_kluster(struct seg *seg, caddr_t addr, ssize_t);
     83 static int	segmap_getprot(struct seg *seg, caddr_t addr, size_t len,
     84 			uint_t *protv);
     85 static u_offset_t	segmap_getoffset(struct seg *seg, caddr_t addr);
     86 static int	segmap_gettype(struct seg *seg, caddr_t addr);
     87 static int	segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
     88 static void	segmap_dump(struct seg *seg);
     89 static int	segmap_pagelock(struct seg *seg, caddr_t addr, size_t len,
     90 			struct page ***ppp, enum lock_type type,
     91 			enum seg_rw rw);
     92 static void	segmap_badop(void);
     93 static int	segmap_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp);
     94 static lgrp_mem_policy_info_t	*segmap_getpolicy(struct seg *seg,
     95     caddr_t addr);
     96 static int	segmap_capable(struct seg *seg, segcapability_t capability);
     97 
     98 /* segkpm support */
     99 static caddr_t	segmap_pagecreate_kpm(struct seg *, vnode_t *, u_offset_t,
    100 			struct smap *, enum seg_rw);
    101 struct smap	*get_smap_kpm(caddr_t, page_t **);
    102 
    103 #define	SEGMAP_BADOP(t)	(t(*)())segmap_badop
    104 
/*
 * Segment operations vector installed on the segment by segmap_create()
 * (seg->s_ops = &segmap_ops).  The initializer is positional.  Slots
 * filled with SEGMAP_BADOP(type) point at segmap_badop() cast to a
 * function returning "type"; segmap_badop() panics, so those operations
 * must never be invoked on this segment.
 */
static struct seg_ops segmap_ops = {
	SEGMAP_BADOP(int),	/* dup */
	SEGMAP_BADOP(int),	/* unmap */
	segmap_free,
	segmap_fault,
	segmap_faulta,
	SEGMAP_BADOP(int),	/* setprot */
	segmap_checkprot,
	segmap_kluster,
	SEGMAP_BADOP(size_t),	/* swapout */
	SEGMAP_BADOP(int),	/* sync */
	SEGMAP_BADOP(size_t),	/* incore */
	SEGMAP_BADOP(int),	/* lockop */
	segmap_getprot,
	segmap_getoffset,
	segmap_gettype,
	segmap_getvp,
	SEGMAP_BADOP(int),	/* advise */
	segmap_dump,
	segmap_pagelock,	/* pagelock */
	SEGMAP_BADOP(int),	/* setpgsz */
	segmap_getmemid,	/* getmemid */
	segmap_getpolicy,	/* getpolicy */
	segmap_capable,		/* capable */
};
    130 
    131 /*
    132  * Private segmap routines.
    133  */
    134 static void	segmap_unlock(struct hat *hat, struct seg *seg, caddr_t addr,
    135 			size_t len, enum seg_rw rw, struct smap *smp);
    136 static void	segmap_smapadd(struct smap *smp);
    137 static struct smap *segmap_hashin(struct smap *smp, struct vnode *vp,
    138 			u_offset_t off, int hashid);
    139 static void	segmap_hashout(struct smap *smp);
    140 
    141 
    142 /*
    143  * Statistics for segmap operations.
    144  *
    145  * No explicit locking to protect these stats.
    146  */
/*
 * Named kstat entries, one { name, data type } pair per counter, all of
 * type KSTAT_DATA_ULONG.  The initializer is positional, so the order
 * here must match the member order of struct segmapcnt.
 *
 * The smp_getmap, smp_release, smp_get_reclaim, smp_fault,
 * smp_pagecreate and smp_get_reuse values are overwritten by
 * segmap_kstat_update() with sums of the per-cpu counters; the other
 * entries are updated in place (e.g. smp_faulta in segmap_faulta()).
 */
struct segmapcnt segmapcnt = {
	{ "fault",		KSTAT_DATA_ULONG },
	{ "faulta",		KSTAT_DATA_ULONG },
	{ "getmap",		KSTAT_DATA_ULONG },
	{ "get_use",		KSTAT_DATA_ULONG },
	{ "get_reclaim",	KSTAT_DATA_ULONG },
	{ "get_reuse",		KSTAT_DATA_ULONG },
	{ "get_unused",		KSTAT_DATA_ULONG },
	{ "get_nofree",		KSTAT_DATA_ULONG },
	{ "rel_async",		KSTAT_DATA_ULONG },
	{ "rel_write",		KSTAT_DATA_ULONG },
	{ "rel_free",		KSTAT_DATA_ULONG },
	{ "rel_abort",		KSTAT_DATA_ULONG },
	{ "rel_dontneed",	KSTAT_DATA_ULONG },
	{ "release",		KSTAT_DATA_ULONG },
	{ "pagecreate",		KSTAT_DATA_ULONG },
	{ "free_notfree",	KSTAT_DATA_ULONG },
	{ "free_dirty",		KSTAT_DATA_ULONG },
	{ "free",		KSTAT_DATA_ULONG },
	{ "stolen",		KSTAT_DATA_ULONG },
	{ "get_nomtx",		KSTAT_DATA_ULONG }
};
    169 
    170 kstat_named_t *segmapcnt_ptr = (kstat_named_t *)&segmapcnt;
    171 uint_t segmapcnt_ndata = sizeof (segmapcnt) / sizeof (kstat_named_t);
    172 
    173 /*
    174  * Return number of map pages in segment.
    175  */
    176 #define	MAP_PAGES(seg)		((seg)->s_size >> MAXBSHIFT)
    177 
    178 /*
    179  * Translate addr into smap number within segment.
    180  */
    181 #define	MAP_PAGE(seg, addr)  (((addr) - (seg)->s_base) >> MAXBSHIFT)
    182 
    183 /*
    184  * Translate addr in seg into struct smap pointer.
    185  */
    186 #define	GET_SMAP(seg, addr)	\
    187 	&(((struct segmap_data *)((seg)->s_data))->smd_sm[MAP_PAGE(seg, addr)])
    188 
    189 /*
    190  * Bit in map (16 bit bitmap).
    191  */
    192 #define	SMAP_BIT_MASK(bitindex)	(1 << ((bitindex) & 0xf))
    193 
/*
 * File-scope copies of the segmap configuration.  All of these are
 * assigned in segmap_create().
 */
static int smd_colormsk = 0;	/* smd_ncolor - 1 */
static int smd_ncolor = 0;	/* number of virtual colors (power of 2) */
static int smd_nfree = 0;	/* smd_ncolor * number of freelists */
static int smd_freemsk = 0;	/* smd_nfree - 1 */
#ifdef DEBUG
static int *colors_used;	/* smd_nfree ints, zeroed at creation */
#endif
static struct smap *smd_smap;	/* base of the smap array */
static struct smaphash *smd_hash;	/* hash chain headers */
#ifdef SEGMAP_HASHSTATS
static unsigned int *smd_hash_len;	/* one counter per hash chain */
#endif
static struct smfree *smd_free;	/* smd_nfree freelist headers */
static ulong_t smd_hashmsk = 0;	/* hash table size - 1 */
    208 
    209 #define	SEGMAP_MAXCOLOR		2
    210 #define	SEGMAP_CACHE_PAD	64
    211 
/*
 * Per-cpu segmap state.  segmap_create() allocates max_ncpus of these
 * (smd_cpu) and the fault path indexes the array by CPU->cpu_seqid.
 * Overlaying the fields with scpu_pad makes each element at least
 * SEGMAP_CACHE_PAD bytes.
 */
union segmap_cpu {
	struct {
		/* per-color index; entry j is initialised to j */
		uint32_t	scpu_free_ndx[SEGMAP_MAXCOLOR];
		/* initialised to the first smap so it is never NULL */
		struct smap	*scpu_last_smap;
		/* counters summed by segmap_kstat_update() */
		ulong_t		scpu_getmap;
		ulong_t		scpu_release;
		ulong_t		scpu_get_reclaim;
		ulong_t		scpu_fault;
		ulong_t		scpu_pagecreate;
		ulong_t		scpu_get_reuse;
	} scpu;
	char	scpu_pad[SEGMAP_CACHE_PAD];
};
static union segmap_cpu *smd_cpu;	/* array of max_ncpus entries */
    226 
    227 /*
    228  * There are three locks in seg_map:
    229  *	- per freelist mutexes
    230  *	- per hashchain mutexes
    231  *	- per smap mutexes
    232  *
    233  * The lock ordering is to get the smap mutex to lock down the slot
    234  * first then the hash lock (for hash in/out (vp, off) list) or the
    235  * freelist lock to put the slot back on the free list.
    236  *
    237  * The hash search is done by only holding the hashchain lock, when a wanted
    238  * slot is found, we drop the hashchain lock then lock the slot so there
    239  * is no overlapping of hashchain and smap locks. After the slot is
    240  * locked, we verify again if the slot is still what we are looking
    241  * for.
    242  *
    243  * Allocation of a free slot is done by holding the freelist lock,
    244  * then locking the smap slot at the head of the freelist. This is
    245  * in reversed lock order so mutex_tryenter() is used.
    246  *
    247  * The smap lock protects all fields in smap structure except for
    248  * the link fields for hash/free lists which are protected by
    249  * hashchain and freelist locks.
    250  */
    251 
    252 #define	SHASHMTX(hashid)	(&smd_hash[hashid].sh_mtx)
    253 
    254 #define	SMP2SMF(smp)		(&smd_free[(smp - smd_smap) & smd_freemsk])
    255 #define	SMP2SMF_NDX(smp)	(ushort_t)((smp - smd_smap) & smd_freemsk)
    256 
    257 #define	SMAPMTX(smp) (&smp->sm_mtx)
    258 
    259 #define	SMAP_HASHFUNC(vp, off, hashid) \
    260 	{ \
    261 	hashid = ((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 3) + \
    262 		((off) >> MAXBSHIFT)) & smd_hashmsk); \
    263 	}
    264 
    265 /*
    266  * The most frequently updated kstat counters are kept in the
    267  * per cpu array to avoid hot cache blocks. The update function
    268  * sums the cpu local counters to update the global counters.
    269  */
    270 
    271 /* ARGSUSED */
    272 int
    273 segmap_kstat_update(kstat_t *ksp, int rw)
    274 {
    275 	int i;
    276 	ulong_t	getmap, release, get_reclaim;
    277 	ulong_t	fault, pagecreate, get_reuse;
    278 
    279 	if (rw == KSTAT_WRITE)
    280 		return (EACCES);
    281 	getmap = release = get_reclaim = (ulong_t)0;
    282 	fault = pagecreate = get_reuse = (ulong_t)0;
    283 	for (i = 0; i < max_ncpus; i++) {
    284 		getmap += smd_cpu[i].scpu.scpu_getmap;
    285 		release  += smd_cpu[i].scpu.scpu_release;
    286 		get_reclaim += smd_cpu[i].scpu.scpu_get_reclaim;
    287 		fault  += smd_cpu[i].scpu.scpu_fault;
    288 		pagecreate  += smd_cpu[i].scpu.scpu_pagecreate;
    289 		get_reuse += smd_cpu[i].scpu.scpu_get_reuse;
    290 	}
    291 	segmapcnt.smp_getmap.value.ul = getmap;
    292 	segmapcnt.smp_release.value.ul = release;
    293 	segmapcnt.smp_get_reclaim.value.ul = get_reclaim;
    294 	segmapcnt.smp_fault.value.ul = fault;
    295 	segmapcnt.smp_pagecreate.value.ul = pagecreate;
    296 	segmapcnt.smp_get_reuse.value.ul = get_reuse;
    297 	return (0);
    298 }
    299 
    300 int
    301 segmap_create(struct seg *seg, void *argsp)
    302 {
    303 	struct segmap_data *smd;
    304 	struct smap *smp;
    305 	struct smfree *sm;
    306 	struct segmap_crargs *a = (struct segmap_crargs *)argsp;
    307 	struct smaphash *shashp;
    308 	union segmap_cpu *scpu;
    309 	long i, npages;
    310 	size_t hashsz;
    311 	uint_t nfreelist;
    312 	extern void prefetch_smap_w(void *);
    313 	extern int max_ncpus;
    314 
    315 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
    316 
    317 	if (((uintptr_t)seg->s_base | seg->s_size) & MAXBOFFSET) {
    318 		panic("segkmap not MAXBSIZE aligned");
    319 		/*NOTREACHED*/
    320 	}
    321 
    322 	smd = kmem_zalloc(sizeof (struct segmap_data), KM_SLEEP);
    323 
    324 	seg->s_data = (void *)smd;
    325 	seg->s_ops = &segmap_ops;
    326 	smd->smd_prot = a->prot;
    327 
    328 	/*
    329 	 * Scale the number of smap freelists to be
    330 	 * proportional to max_ncpus * number of virtual colors.
    331 	 * The caller can over-ride this scaling by providing
    332 	 * a non-zero a->nfreelist argument.
    333 	 */
    334 	nfreelist = a->nfreelist;
    335 	if (nfreelist == 0)
    336 		nfreelist = max_ncpus;
    337 	else if (nfreelist < 0 || nfreelist > 4 * max_ncpus) {
    338 		cmn_err(CE_WARN, "segmap_create: nfreelist out of range "
    339 		"%d, using %d", nfreelist, max_ncpus);
    340 		nfreelist = max_ncpus;
    341 	}
    342 	if (nfreelist & (nfreelist - 1)) {
    343 		/* round up nfreelist to the next power of two. */
    344 		nfreelist = 1 << (highbit(nfreelist));
    345 	}
    346 
    347 	/*
    348 	 * Get the number of virtual colors - must be a power of 2.
    349 	 */
    350 	if (a->shmsize)
    351 		smd_ncolor = a->shmsize >> MAXBSHIFT;
    352 	else
    353 		smd_ncolor = 1;
    354 	ASSERT((smd_ncolor & (smd_ncolor - 1)) == 0);
    355 	ASSERT(smd_ncolor <= SEGMAP_MAXCOLOR);
    356 	smd_colormsk = smd_ncolor - 1;
    357 	smd->smd_nfree = smd_nfree = smd_ncolor * nfreelist;
    358 	smd_freemsk = smd_nfree - 1;
    359 
    360 	/*
    361 	 * Allocate and initialize the freelist headers.
    362 	 * Note that sm_freeq[1] starts out as the release queue. This
    363 	 * is known when the smap structures are initialized below.
    364 	 */
    365 	smd_free = smd->smd_free =
    366 	    kmem_zalloc(smd_nfree * sizeof (struct smfree), KM_SLEEP);
    367 	for (i = 0; i < smd_nfree; i++) {
    368 		sm = &smd->smd_free[i];
    369 		mutex_init(&sm->sm_freeq[0].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
    370 		mutex_init(&sm->sm_freeq[1].smq_mtx, NULL, MUTEX_DEFAULT, NULL);
    371 		sm->sm_allocq = &sm->sm_freeq[0];
    372 		sm->sm_releq = &sm->sm_freeq[1];
    373 	}
    374 
    375 	/*
    376 	 * Allocate and initialize the smap hash chain headers.
    377 	 * Compute hash size rounding down to the next power of two.
    378 	 */
    379 	npages = MAP_PAGES(seg);
    380 	smd->smd_npages = npages;
    381 	hashsz = npages / SMAP_HASHAVELEN;
    382 	hashsz = 1 << (highbit(hashsz)-1);
    383 	smd_hashmsk = hashsz - 1;
    384 	smd_hash = smd->smd_hash =
    385 	    kmem_alloc(hashsz * sizeof (struct smaphash), KM_SLEEP);
    386 #ifdef SEGMAP_HASHSTATS
    387 	smd_hash_len =
    388 	    kmem_zalloc(hashsz * sizeof (unsigned int), KM_SLEEP);
    389 #endif
    390 	for (i = 0, shashp = smd_hash; i < hashsz; i++, shashp++) {
    391 		shashp->sh_hash_list = NULL;
    392 		mutex_init(&shashp->sh_mtx, NULL, MUTEX_DEFAULT, NULL);
    393 	}
    394 
    395 	/*
    396 	 * Allocate and initialize the smap structures.
    397 	 * Link all slots onto the appropriate freelist.
    398 	 * The smap array is large enough to affect boot time
    399 	 * on large systems, so use memory prefetching and only
    400 	 * go through the array 1 time. Inline a optimized version
    401 	 * of segmap_smapadd to add structures to freelists with
    402 	 * knowledge that no locks are needed here.
    403 	 */
    404 	smd_smap = smd->smd_sm =
    405 		kmem_alloc(sizeof (struct smap) * npages, KM_SLEEP);
    406 
    407 	for (smp = &smd->smd_sm[MAP_PAGES(seg) - 1];
    408 	    smp >= smd->smd_sm; smp--) {
    409 		struct smap *smpfreelist;
    410 		struct sm_freeq *releq;
    411 
    412 		prefetch_smap_w((char *)smp);
    413 
    414 		smp->sm_vp = NULL;
    415 		smp->sm_hash = NULL;
    416 		smp->sm_off = 0;
    417 		smp->sm_bitmap = 0;
    418 		smp->sm_refcnt = 0;
    419 		mutex_init(&smp->sm_mtx, NULL, MUTEX_DEFAULT, NULL);
    420 		smp->sm_free_ndx = SMP2SMF_NDX(smp);
    421 
    422 		sm = SMP2SMF(smp);
    423 		releq = sm->sm_releq;
    424 
    425 		smpfreelist = releq->smq_free;
    426 		if (smpfreelist == 0) {
    427 			releq->smq_free = smp->sm_next = smp->sm_prev = smp;
    428 		} else {
    429 			smp->sm_next = smpfreelist;
    430 			smp->sm_prev = smpfreelist->sm_prev;
    431 			smpfreelist->sm_prev = smp;
    432 			smp->sm_prev->sm_next = smp;
    433 			releq->smq_free = smp->sm_next;
    434 		}
    435 
    436 		/*
    437 		 * sm_flag = 0 (no SM_QNDX_ZERO) implies smap on sm_freeq[1]
    438 		 */
    439 		smp->sm_flags = 0;
    440 
    441 #ifdef	SEGKPM_SUPPORT
    442 		/*
    443 		 * Due to the fragile prefetch loop no
    444 		 * separate function is used here.
    445 		 */
    446 		smp->sm_kpme_next = NULL;
    447 		smp->sm_kpme_prev = NULL;
    448 		smp->sm_kpme_page = NULL;
    449 #endif
    450 	}
    451 
    452 	/*
    453 	 * Allocate the per color indices that distribute allocation
    454 	 * requests over the free lists. Each cpu will have a private
    455 	 * rotor index to spread the allocations even across the available
    456 	 * smap freelists. Init the scpu_last_smap field to the first
    457 	 * smap element so there is no need to check for NULL.
    458 	 */
    459 	smd_cpu =
    460 		kmem_zalloc(sizeof (union segmap_cpu) * max_ncpus, KM_SLEEP);
    461 	for (i = 0, scpu = smd_cpu; i < max_ncpus; i++, scpu++) {
    462 		int j;
    463 		for (j = 0; j < smd_ncolor; j++)
    464 			scpu->scpu.scpu_free_ndx[j] = j;
    465 		scpu->scpu.scpu_last_smap = smd_smap;
    466 	}
    467 
    468 	if (vpm_enable) {
    469 		vpm_init();
    470 	}
    471 
    472 #ifdef DEBUG
    473 	/*
    474 	 * Keep track of which colors are used more often.
    475 	 */
    476 	colors_used = kmem_zalloc(smd_nfree * sizeof (int), KM_SLEEP);
    477 #endif /* DEBUG */
    478 
    479 	return (0);
    480 }
    481 
    482 static void
    483 segmap_free(seg)
    484 	struct seg *seg;
    485 {
    486 	ASSERT(seg->s_as && RW_WRITE_HELD(&seg->s_as->a_lock));
    487 }
    488 
    489 /*
    490  * Do a F_SOFTUNLOCK call over the range requested.
    491  * The range must have already been F_SOFTLOCK'ed.
    492  */
    493 static void
    494 segmap_unlock(
    495 	struct hat *hat,
    496 	struct seg *seg,
    497 	caddr_t addr,
    498 	size_t len,
    499 	enum seg_rw rw,
    500 	struct smap *smp)
    501 {
    502 	page_t *pp;
    503 	caddr_t adr;
    504 	u_offset_t off;
    505 	struct vnode *vp;
    506 	kmutex_t *smtx;
    507 
    508 	ASSERT(smp->sm_refcnt > 0);
    509 
    510 #ifdef lint
    511 	seg = seg;
    512 #endif
    513 
    514 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
    515 
    516 		/*
    517 		 * We're called only from segmap_fault and this was a
    518 		 * NOP in case of a kpm based smap, so dangerous things
    519 		 * must have happened in the meantime. Pages are prefaulted
    520 		 * and locked in segmap_getmapflt and they will not be
    521 		 * unlocked until segmap_release.
    522 		 */
    523 		panic("segmap_unlock: called with kpm addr %p", (void *)addr);
    524 		/*NOTREACHED*/
    525 	}
    526 
    527 	vp = smp->sm_vp;
    528 	off = smp->sm_off + (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
    529 
    530 	hat_unlock(hat, addr, P2ROUNDUP(len, PAGESIZE));
    531 	for (adr = addr; adr < addr + len; adr += PAGESIZE, off += PAGESIZE) {
    532 		ushort_t bitmask;
    533 
    534 		/*
    535 		 * Use page_find() instead of page_lookup() to
    536 		 * find the page since we know that it has
    537 		 * "shared" lock.
    538 		 */
    539 		pp = page_find(vp, off);
    540 		if (pp == NULL) {
    541 			panic("segmap_unlock: page not found");
    542 			/*NOTREACHED*/
    543 		}
    544 
    545 		if (rw == S_WRITE) {
    546 			hat_setrefmod(pp);
    547 		} else if (rw != S_OTHER) {
    548 			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
    549 				"segmap_fault:pp %p vp %p offset %llx",
    550 				pp, vp, off);
    551 			hat_setref(pp);
    552 		}
    553 
    554 		/*
    555 		 * Clear bitmap, if the bit corresponding to "off" is set,
    556 		 * since the page and translation are being unlocked.
    557 		 */
    558 		bitmask = SMAP_BIT_MASK((off - smp->sm_off) >> PAGESHIFT);
    559 
    560 		/*
    561 		 * Large Files: Following assertion is to verify
    562 		 * the correctness of the cast to (int) above.
    563 		 */
    564 		ASSERT((u_offset_t)(off - smp->sm_off) <= INT_MAX);
    565 		smtx = SMAPMTX(smp);
    566 		mutex_enter(smtx);
    567 		if (smp->sm_bitmap & bitmask) {
    568 			smp->sm_bitmap &= ~bitmask;
    569 		}
    570 		mutex_exit(smtx);
    571 
    572 		page_unlock(pp);
    573 	}
    574 }
    575 
    576 #define	MAXPPB	(MAXBSIZE/4096)	/* assumes minimum page size of 4k */
    577 
/*
 * This routine is called via a machine specific fault handling
 * routine.  It is also called by software routines wishing to
 * lock or unlock a range of addresses.
 *
 * Note that this routine expects a page-aligned "addr".
 *
 * Returns 0 on success, FC_MAKE_ERR(EIO) when the smap slot has no
 * vnode, or FC_MAKE_ERR(err) when VOP_GETPAGE fails.  Panics if
 * [addr, addr + len) crosses a MAXBSIZE chunk boundary.
 *
 * For kpm addresses only F_SOFTUNLOCK does any work (setting the
 * ref/mod bits on the page); every other fault type returns 0.
 * For non-kpm addresses F_SOFTUNLOCK is handed to segmap_unlock();
 * otherwise the pages are obtained with VOP_GETPAGE and loaded into
 * the hat, remaining locked only for F_SOFTLOCK on pages inside the
 * requested range.
 */
faultcode_t
segmap_fault(
	struct hat *hat,
	struct seg *seg,
	caddr_t addr,
	size_t len,
	enum fault_type type,
	enum seg_rw rw)
{
	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
	struct smap *smp;
	page_t *pp, **ppp;
	struct vnode *vp;
	u_offset_t off;
	page_t *pl[MAXPPB + 1];
	uint_t prot;
	u_offset_t addroff;
	caddr_t adr;
	int err;
	u_offset_t sm_off;
	int hat_flag;

	if (segmap_kpm && IS_KPM_ADDR(addr)) {
		int newpage;
		kmutex_t *smtx;

		/*
		 * Pages are successfully prefaulted and locked in
		 * segmap_getmapflt and can't be unlocked until
		 * segmap_release. No hat mappings have to be locked
		 * and they also can't be unlocked as long as the
		 * caller owns an active kpm addr.
		 */
#ifndef DEBUG
		if (type != F_SOFTUNLOCK)
			return (0);
#endif

		/*
		 * NOTE(review): the smap mutex is released below without
		 * a visible mutex_enter(), so get_smap_kpm() presumably
		 * returns with that mutex held - confirm against its
		 * definition.
		 */
		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
			panic("segmap_fault: smap not found "
			    "for addr %p", (void *)addr);
			/*NOTREACHED*/
		}

		smtx = SMAPMTX(smp);
#ifdef	DEBUG
		/* DEBUG kernels warn if the slot is still flagged new. */
		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
		if (newpage) {
			cmn_err(CE_WARN, "segmap_fault: newpage? smp %p",
				(void *)smp);
		}

		if (type != F_SOFTUNLOCK) {
			mutex_exit(smtx);
			return (0);
		}
#endif
		mutex_exit(smtx);
		vp = smp->sm_vp;
		sm_off = smp->sm_off;

		if (vp == NULL)
			return (FC_MAKE_ERR(EIO));

		ASSERT(smp->sm_refcnt > 0);

		/* The request must stay within one MAXBSIZE chunk. */
		addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
		if (addroff + len > MAXBSIZE)
			panic("segmap_fault: endaddr %p exceeds MAXBSIZE chunk",
			    (void *)(addr + len));

		off = sm_off + addroff;

		pp = page_find(vp, off);

		if (pp == NULL)
			panic("segmap_fault: softunlock page not found");

		/*
		 * Set ref bit also here in case of S_OTHER to avoid the
		 * overhead of supporting other cases than F_SOFTUNLOCK
		 * with segkpm. We can do this because the underlying
		 * pages are locked anyway.
		 */
		if (rw == S_WRITE) {
			hat_setrefmod(pp);
		} else {
			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
				"segmap_fault:pp %p vp %p offset %llx",
				pp, vp, off);
			hat_setref(pp);
		}

		return (0);
	}

	/* Non-kpm path: count the fault on this cpu's private counter. */
	smd_cpu[CPU->cpu_seqid].scpu.scpu_fault++;
	smp = GET_SMAP(seg, addr);
	vp = smp->sm_vp;
	sm_off = smp->sm_off;

	if (vp == NULL)
		return (FC_MAKE_ERR(EIO));

	ASSERT(smp->sm_refcnt > 0);

	/* The request must stay within one MAXBSIZE chunk. */
	addroff = (u_offset_t)((uintptr_t)addr & MAXBOFFSET);
	if (addroff + len > MAXBSIZE) {
		panic("segmap_fault: endaddr %p "
		    "exceeds MAXBSIZE chunk", (void *)(addr + len));
		/*NOTREACHED*/
	}
	off = sm_off + addroff;

	/*
	 * First handle the easy stuff
	 */
	if (type == F_SOFTUNLOCK) {
		segmap_unlock(hat, seg, addr, len, rw, smp);
		return (0);
	}

	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
		"segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
	err = VOP_GETPAGE(vp, (offset_t)off, len, &prot, pl, MAXBSIZE,
	    seg, addr, rw, CRED(), NULL);

	if (err)
		return (FC_MAKE_ERR(err));

	/* Never grant more than the segment's own protections. */
	prot &= smd->smd_prot;

	/*
	 * Handle all pages returned in the pl[] array.
	 * This loop is coded on the assumption that if
	 * there was no error from the VOP_GETPAGE routine,
	 * that the page list returned will contain all the
	 * needed pages for the vp from [off..off + len].
	 * The walk stops at the first NULL entry in pl[].
	 */
	ppp = pl;
	while ((pp = *ppp++) != NULL) {
		u_offset_t poff;
		ASSERT(pp->p_vnode == vp);
		hat_flag = HAT_LOAD;

		/*
		 * Verify that the pages returned are within the range
		 * of this segmap region.  Note that it is theoretically
		 * possible for pages outside this range to be returned,
		 * but it is not very likely.  If we cannot use the
		 * page here, just release it and go on to the next one.
		 */
		if (pp->p_offset < sm_off ||
		    pp->p_offset >= sm_off + MAXBSIZE) {
			(void) page_release(pp, 1);
			continue;
		}

		ASSERT(hat == kas.a_hat);
		poff = pp->p_offset;
		adr = addr + (poff - off);
		/*
		 * Only pages inside the requested range get the ref bit
		 * and, for F_SOFTLOCK, a locked translation.
		 */
		if (adr >= addr && adr < addr + len) {
			hat_setref(pp);
			TRACE_3(TR_FAC_VM, TR_SEGMAP_FAULT,
			    "segmap_fault:pp %p vp %p offset %llx",
			    pp, vp, poff);
			if (type == F_SOFTLOCK)
				hat_flag = HAT_LOAD_LOCK;
		}

		/*
		 * Deal with VMODSORT pages here. If we know this is a write
		 * do the setmod now and allow write protection.
		 * As long as it's modified or not S_OTHER, remove write
		 * protection. With S_OTHER it's up to the FS to deal with this.
		 *
		 * Note that "prot" is shared by all iterations, so once
		 * PROT_WRITE is cleared it stays cleared for later pages.
		 */
		if (IS_VMODSORT(vp)) {
			if (rw == S_WRITE)
				hat_setmod(pp);
			else if (rw != S_OTHER && !hat_ismod(pp))
				prot &= ~PROT_WRITE;
		}

		hat_memload(hat, adr, pp, prot, hat_flag);
		/* Pages loaded with HAT_LOAD_LOCK keep their page lock. */
		if (hat_flag != HAT_LOAD_LOCK)
			page_unlock(pp);
	}
	return (0);
}
    774 
    775 /*
    776  * This routine is used to start I/O on pages asynchronously.
    777  */
    778 static faultcode_t
    779 segmap_faulta(struct seg *seg, caddr_t addr)
    780 {
    781 	struct smap *smp;
    782 	struct vnode *vp;
    783 	u_offset_t off;
    784 	int err;
    785 
    786 	if (segmap_kpm && IS_KPM_ADDR(addr)) {
    787 		int	newpage;
    788 		kmutex_t *smtx;
    789 
    790 		/*
    791 		 * Pages are successfully prefaulted and locked in
    792 		 * segmap_getmapflt and can't be unlocked until
    793 		 * segmap_release. No hat mappings have to be locked
    794 		 * and they also can't be unlocked as long as the
    795 		 * caller owns an active kpm addr.
    796 		 */
    797 #ifdef	DEBUG
    798 		if ((smp = get_smap_kpm(addr, NULL)) == NULL) {
    799 			panic("segmap_faulta: smap not found "
    800 			    "for addr %p", (void *)addr);
    801 			/*NOTREACHED*/
    802 		}
    803 
    804 		smtx = SMAPMTX(smp);
    805 		newpage = smp->sm_flags & SM_KPM_NEWPAGE;
    806 		mutex_exit(smtx);
    807 		if (newpage)
    808 			cmn_err(CE_WARN, "segmap_faulta: newpage? smp %p",
    809 			    (void *)smp);
    810 #endif
    811 		return (0);
    812 	}
    813 
    814 	segmapcnt.smp_faulta.value.ul++;
    815 	smp = GET_SMAP(seg, addr);
    816 
    817 	ASSERT(smp->sm_refcnt > 0);
    818 
    819 	vp = smp->sm_vp;
    820 	off = smp->sm_off;
    821 
    822 	if (vp == NULL) {
    823 		cmn_err(CE_WARN, "segmap_faulta - no vp");
    824 		return (FC_MAKE_ERR(EIO));
    825 	}
    826 
    827 	TRACE_3(TR_FAC_VM, TR_SEGMAP_GETPAGE,
    828 		"segmap_getpage:seg %p addr %p vp %p", seg, addr, vp);
    829 
    830 	err = VOP_GETPAGE(vp, (offset_t)(off + ((offset_t)((uintptr_t)addr
    831 	    & MAXBOFFSET))), PAGESIZE, (uint_t *)NULL, (page_t **)NULL, 0,
    832 	    seg, addr, S_READ, CRED(), NULL);
    833 
    834 	if (err)
    835 		return (FC_MAKE_ERR(err));
    836 	return (0);
    837 }
    838 
    839 /*ARGSUSED*/
    840 static int
    841 segmap_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
    842 {
    843 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
    844 
    845 	ASSERT(seg->s_as && RW_LOCK_HELD(&seg->s_as->a_lock));
    846 
    847 	/*
    848 	 * Need not acquire the segment lock since
    849 	 * "smd_prot" is a read-only field.
    850 	 */
    851 	return (((smd->smd_prot & prot) != prot) ? EACCES : 0);
    852 }
    853 
    854 static int
    855 segmap_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
    856 {
    857 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
    858 	size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1;
    859 
    860 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
    861 
    862 	if (pgno != 0) {
    863 		do
    864 			protv[--pgno] = smd->smd_prot;
    865 		while (pgno != 0);
    866 	}
    867 	return (0);
    868 }
    869 
    870 static u_offset_t
    871 segmap_getoffset(struct seg *seg, caddr_t addr)
    872 {
    873 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
    874 
    875 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
    876 
    877 	return ((u_offset_t)smd->smd_sm->sm_off + (addr - seg->s_base));
    878 }
    879 
    880 /*ARGSUSED*/
    881 static int
    882 segmap_gettype(struct seg *seg, caddr_t addr)
    883 {
    884 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
    885 
    886 	return (MAP_SHARED);
    887 }
    888 
    889 /*ARGSUSED*/
    890 static int
    891 segmap_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
    892 {
    893 	struct segmap_data *smd = (struct segmap_data *)seg->s_data;
    894 
    895 	ASSERT(seg->s_as && RW_READ_HELD(&seg->s_as->a_lock));
    896 
    897 	/* XXX - This doesn't make any sense */
    898 	*vpp = smd->smd_sm->sm_vp;
    899 	return (0);
    900 }
    901 
    902 /*
    903  * Check to see if it makes sense to do kluster/read ahead to
    904  * addr + delta relative to the mapping at addr.  We assume here
    905  * that delta is a signed PAGESIZE'd multiple (which can be negative).
    906  *
    907  * For segmap we always "approve" of this action from our standpoint.
    908  */
    909 /*ARGSUSED*/
    910 static int
    911 segmap_kluster(struct seg *seg, caddr_t addr, ssize_t delta)
    912 {
    913 	return (0);
    914 }
    915 
    916 static void
    917 segmap_badop()
    918 {
    919 	panic("segmap_badop");
    920 	/*NOTREACHED*/
    921 }
    922 
    923 /*
    924  * Special private segmap operations
    925  */
    926 
    927 /*
    928  * Add smap to the appropriate free list.
    929  */
    930 static void
    931 segmap_smapadd(struct smap *smp)
    932 {
    933 	struct smfree *sm;
    934 	struct smap *smpfreelist;
    935 	struct sm_freeq *releq;
    936 
    937 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
    938 
    939 	if (smp->sm_refcnt != 0) {
    940 		panic("segmap_smapadd");
    941 		/*NOTREACHED*/
    942 	}
    943 
    944 	sm = &smd_free[smp->sm_free_ndx];
    945 	/*
    946 	 * Add to the tail of the release queue
    947 	 * Note that sm_releq and sm_allocq could toggle
    948 	 * before we get the lock. This does not affect
    949 	 * correctness as the 2 queues are only maintained
    950 	 * to reduce lock pressure.
    951 	 */
    952 	releq = sm->sm_releq;
    953 	if (releq == &sm->sm_freeq[0])
    954 		smp->sm_flags |= SM_QNDX_ZERO;
    955 	else
    956 		smp->sm_flags &= ~SM_QNDX_ZERO;
    957 	mutex_enter(&releq->smq_mtx);
    958 	smpfreelist = releq->smq_free;
    959 	if (smpfreelist == 0) {
    960 		int want;
    961 
    962 		releq->smq_free = smp->sm_next = smp->sm_prev = smp;
    963 		/*
    964 		 * Both queue mutexes held to set sm_want;
    965 		 * snapshot the value before dropping releq mutex.
    966 		 * If sm_want appears after the releq mutex is dropped,
    967 		 * then the smap just freed is already gone.
    968 		 */
    969 		want = sm->sm_want;
    970 		mutex_exit(&releq->smq_mtx);
    971 		/*
    972 		 * See if there was a waiter before dropping the releq mutex
    973 		 * then recheck after obtaining sm_freeq[0] mutex as
    974 		 * the another thread may have already signaled.
    975 		 */
    976 		if (want) {
    977 			mutex_enter(&sm->sm_freeq[0].smq_mtx);
    978 			if (sm->sm_want)
    979 				cv_signal(&sm->sm_free_cv);
    980 			mutex_exit(&sm->sm_freeq[0].smq_mtx);
    981 		}
    982 	} else {
    983 		smp->sm_next = smpfreelist;
    984 		smp->sm_prev = smpfreelist->sm_prev;
    985 		smpfreelist->sm_prev = smp;
    986 		smp->sm_prev->sm_next = smp;
    987 		mutex_exit(&releq->smq_mtx);
    988 	}
    989 }
    990 
    991 
    992 static struct smap *
    993 segmap_hashin(struct smap *smp, struct vnode *vp, u_offset_t off, int hashid)
    994 {
    995 	struct smap **hpp;
    996 	struct smap *tmp;
    997 	kmutex_t *hmtx;
    998 
    999 	ASSERT(MUTEX_HELD(SMAPMTX(smp)));
   1000 	ASSERT(smp->sm_vp == NULL);
   1001 	ASSERT(smp->sm_hash == NULL);
   1002 	ASSERT(smp->sm_prev == NULL);
   1003 	ASSERT(smp->sm_next == NULL);
   1004 	ASSERT(hashid >= 0 && hashid <= smd_hashmsk);
   1005 
   1006 	hmtx = SHASHMTX(hashid);
   1007 
   1008 	mutex_enter(hmtx);
   1009 	/*
   1010 	 * First we need to verify that no one has created a smp
   1011 	 * with (vp,off) as its tag before us.
   1012 	 */
   1013 	for (tmp = smd_hash[hashid].sh_hash_list;
   1014 	    tmp != NULL; tmp = tmp->sm_hash)
   1015 		if (tmp->sm_vp == vp && tmp->sm_off == off)
   1016 			break;
   1017 
   1018 	if (tmp == NULL) {
   1019 		/*
   1020 		 * No one created one yet.
   1021 		 *
   1022 		 * Funniness here - we don't increment the ref count on the
   1023 		 * vnode * even though we have another pointer to it here.
   1024 		 * The reason for this is that we don't want the fact that
   1025 		 * a seg_map entry somewhere refers to a vnode to prevent the
   1026 		 * vnode * itself from going away.  This is because this
   1027 		 * reference to the vnode is a "soft one".  In the case where
   1028 		 * a mapping is being used by a rdwr [or directory routine?]
   1029 		 * there already has to be a non-zero ref count on the vnode.
   1030 		 * In the case where the vp has been freed and the smap
   1031 		 * structure is on the free list, there are no pages in memory
   1032 		 * that can refer to the vnode.  Thus even if we reuse the same
   1033 		 * vnode/smap structure for a vnode which has the same
   1034 		 * address but represents a different object, we are ok.
   1035 		 */
   1036 		smp->sm_vp = vp;
   1037 		smp->sm_off = off;
   1038 
   1039 		hpp = &smd_hash[hashid].sh_hash_list;
   1040 		smp->sm_hash = *hpp;
   1041 		*hpp = smp;
   1042 #ifdef SEGMAP_HASHSTATS
   1043 		smd_hash_len[hashid]++;
   1044 #endif
   1045 	}
   1046 	mutex_exit(hmtx);
   1047