Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #pragma ident	"@(#)seg_spt.c	1.106	07/10/10 SMI"
     27 
     28 #include <sys/param.h>
     29 #include <sys/user.h>
     30 #include <sys/mman.h>
     31 #include <sys/kmem.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/cmn_err.h>
     34 #include <sys/systm.h>
     35 #include <sys/tuneable.h>
     36 #include <vm/hat.h>
     37 #include <vm/seg.h>
     38 #include <vm/as.h>
     39 #include <vm/anon.h>
     40 #include <vm/page.h>
     41 #include <sys/buf.h>
     42 #include <sys/swap.h>
     43 #include <sys/atomic.h>
     44 #include <vm/seg_spt.h>
     45 #include <sys/debug.h>
     46 #include <sys/vtrace.h>
     47 #include <sys/shm.h>
     48 #include <sys/shm_impl.h>
     49 #include <sys/lgrp.h>
     50 #include <sys/vmsystm.h>
     51 #include <sys/policy.h>
     52 #include <sys/project.h>
     53 #include <sys/tnf_probe.h>
     54 #include <sys/zone.h>
     55 
     56 #define	SEGSPTADDR	(caddr_t)0x0
     57 
     58 /*
     59  * # pages used for spt
     60  */
     61 size_t	spt_used;
     62 
/*
 * segspt_minfree is the memory left for system after ISM
 * locked its pages; it is set up to 5% of availrmem in
 * sptcreate when ISM is created.  ISM should not use more
 * than ~90% of availrmem; if it does, then the performance
 * of the system may decrease. Machines with large memories may
 * be able to use up more memory for ISM so we set the default
 * segspt_minfree to 5% (which gives ISM max 95% of availrmem).
 * If somebody wants even more memory for ISM (risking hanging
 * the system) they can patch the segspt_minfree to a smaller number.
 */
     74 pgcnt_t segspt_minfree = 0;
     75 
     76 static int segspt_create(struct seg *seg, caddr_t argsp);
     77 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
     78 static void segspt_free(struct seg *seg);
     79 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
     80 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
     81 
     82 static void
     83 segspt_badop()
     84 {
     85 	panic("segspt_badop called");
     86 	/*NOTREACHED*/
     87 }
     88 
     89 #define	SEGSPT_BADOP(t)	(t(*)())segspt_badop
     90 
/*
 * Operations vector installed on the spt segment by segspt_create()
 * (seg->s_ops = &segspt_ops).  Only unmap, free and getpolicy have real
 * implementations; every other slot is routed to segspt_badop(), which
 * panics if it is ever invoked.
 */
struct seg_ops segspt_ops = {
	SEGSPT_BADOP(int),		/* dup */
	segspt_unmap,			/* unmap */
	segspt_free,			/* free */
	SEGSPT_BADOP(int),		/* fault */
	SEGSPT_BADOP(faultcode_t),	/* faulta */
	SEGSPT_BADOP(int),		/* setprot */
	SEGSPT_BADOP(int),		/* checkprot */
	SEGSPT_BADOP(int),		/* kluster */
	SEGSPT_BADOP(size_t),		/* swapout */
	SEGSPT_BADOP(int),		/* sync */
	SEGSPT_BADOP(size_t),		/* incore */
	SEGSPT_BADOP(int),		/* lockop */
	SEGSPT_BADOP(int),		/* getprot */
	SEGSPT_BADOP(u_offset_t), 	/* getoffset */
	SEGSPT_BADOP(int),		/* gettype */
	SEGSPT_BADOP(int),		/* getvp */
	SEGSPT_BADOP(int),		/* advise */
	SEGSPT_BADOP(void),		/* dump */
	SEGSPT_BADOP(int),		/* pagelock */
	SEGSPT_BADOP(int),		/* setpgsz */
	SEGSPT_BADOP(int),		/* getmemid */
	segspt_getpolicy,		/* getpolicy */
	SEGSPT_BADOP(int),		/* capable */
};
    116 
    117 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
    118 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
    119 static void segspt_shmfree(struct seg *seg);
    120 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
    121 		caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
    122 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
    123 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
    124 			register size_t len, register uint_t prot);
    125 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
    126 			uint_t prot);
    127 static int	segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
    128 static size_t	segspt_shmswapout(struct seg *seg);
    129 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
    130 			register char *vec);
    131 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
    132 			int attr, uint_t flags);
    133 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
    134 			int attr, int op, ulong_t *lockmap, size_t pos);
    135 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
    136 			uint_t *protv);
    137 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
    138 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
    139 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
    140 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
    141 			uint_t behav);
    142 static void segspt_shmdump(struct seg *seg);
    143 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
    144 			struct page ***, enum lock_type, enum seg_rw);
    145 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
    146 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
    147 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
    148 static int segspt_shmcapable(struct seg *, segcapability_t);
    149 
/*
 * Operations vector made up of the segspt_shm*() routines.  The entries are
 * positional, so the order here must match the layout of struct seg_ops;
 * each slot is labelled with the operation it fills.
 */
struct seg_ops segspt_shmops = {
	segspt_shmdup,		/* dup */
	segspt_shmunmap,	/* unmap */
	segspt_shmfree,		/* free */
	segspt_shmfault,	/* fault */
	segspt_shmfaulta,	/* faulta */
	segspt_shmsetprot,	/* setprot */
	segspt_shmcheckprot,	/* checkprot */
	segspt_shmkluster,	/* kluster */
	segspt_shmswapout,	/* swapout */
	segspt_shmsync,		/* sync */
	segspt_shmincore,	/* incore */
	segspt_shmlockop,	/* lockop */
	segspt_shmgetprot,	/* getprot */
	segspt_shmgetoffset,	/* getoffset */
	segspt_shmgettype,	/* gettype */
	segspt_shmgetvp,	/* getvp */
	segspt_shmadvise,	/* advise */
	segspt_shmdump,		/* dump */
	segspt_shmpagelock,	/* pagelock */
	segspt_shmsetpgsz,	/* setpgsz */
	segspt_shmgetmemid,	/* getmemid */
	segspt_shmgetpolicy,	/* getpolicy */
	segspt_shmcapable,	/* capable */
};
    175 
    176 static void segspt_purge(struct seg *seg);
    177 static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **,
    178 		enum seg_rw);
    179 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
    180 		page_t **ppa);
    181 
    182 
    183 
    184 /*ARGSUSED*/
    185 int
    186 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
    187 	uint_t prot, uint_t flags, uint_t share_szc)
    188 {
    189 	int 	err;
    190 	struct  as	*newas;
    191 	struct	segspt_crargs sptcargs;
    192 
    193 #ifdef DEBUG
    194 	TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
    195 			tnf_ulong, size, size );
    196 #endif
    197 	if (segspt_minfree == 0)	/* leave min 5% of availrmem for */
    198 		segspt_minfree = availrmem/20;	/* for the system */
    199 
    200 	if (!hat_supported(HAT_SHARED_PT, (void *)0))
    201 		return (EINVAL);
    202 
    203 	/*
    204 	 * get a new as for this shared memory segment
    205 	 */
    206 	newas = as_alloc();
    207 	newas->a_proc = NULL;
    208 	sptcargs.amp = amp;
    209 	sptcargs.prot = prot;
    210 	sptcargs.flags = flags;
    211 	sptcargs.szc = share_szc;
    212 	/*
    213 	 * create a shared page table (spt) segment
    214 	 */
    215 
    216 	if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
    217 		as_free(newas);
    218 		return (err);
    219 	}
    220 	*sptseg = sptcargs.seg_spt;
    221 	return (0);
    222 }
    223 
    224 void
    225 sptdestroy(struct as *as, struct anon_map *amp)
    226 {
    227 
    228 #ifdef DEBUG
    229 	TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
    230 #endif
    231 	(void) as_unmap(as, SEGSPTADDR, amp->size);
    232 	as_free(as);
    233 }
    234 
    235 /*
    236  * called from seg_free().
    237  * free (i.e., unlock, unmap, return to free list)
    238  *  all the pages in the given seg.
    239  */
    240 void
    241 segspt_free(struct seg	*seg)
    242 {
    243 	struct spt_data *sptd = (struct spt_data *)seg->s_data;
    244 
    245 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
    246 
    247 	if (sptd != NULL) {
    248 		if (sptd->spt_realsize)
    249 			segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
    250 
    251 	if (sptd->spt_ppa_lckcnt)
    252 		kmem_free(sptd->spt_ppa_lckcnt,
    253 		    sizeof (*sptd->spt_ppa_lckcnt)
    254 		    * btopr(sptd->spt_amp->size));
    255 		kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
    256 		cv_destroy(&sptd->spt_cv);
    257 		mutex_destroy(&sptd->spt_lock);
    258 		kmem_free(sptd, sizeof (*sptd));
    259 	}
    260 }
    261 
    262 /*ARGSUSED*/
    263 static int
    264 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
    265 	uint_t flags)
    266 {
    267 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
    268 
    269 	return (0);
    270 }
    271 
    272 /*ARGSUSED*/
    273 static size_t
    274 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
    275 {
    276 	caddr_t	eo_seg;
    277 	pgcnt_t	npages;
    278 	struct shm_data *shmd = (struct shm_data *)seg->s_data;
    279 	struct seg	*sptseg;
    280 	struct spt_data *sptd;
    281 
    282 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
    283 #ifdef lint
    284 	seg = seg;
    285 #endif
    286 	sptseg = shmd->shm_sptseg;
    287 	sptd = sptseg->s_data;
    288 
    289 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
    290 		eo_seg = addr + len;
    291 		while (addr < eo_seg) {
    292 			/* page exists, and it's locked. */
    293 			*vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
    294 			    SEG_PAGE_ANON;
    295 			addr += PAGESIZE;
    296 		}
    297 		return (len);
    298 	} else {
    299 		struct  anon_map *amp = shmd->shm_amp;
    300 		struct  anon	*ap;
    301 		page_t		*pp;
    302 		pgcnt_t 	anon_index;
    303 		struct vnode 	*vp;
    304 		u_offset_t 	off;
    305 		ulong_t		i;
    306 		int		ret;
    307 		anon_sync_obj_t	cookie;
    308 
    309 		addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
    310 		anon_index = seg_page(seg, addr);
    311 		npages = btopr(len);
    312 		if (anon_index + npages > btopr(shmd->shm_amp->size)) {
    313 			return (EINVAL);
    314 		}
    315 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
    316 		for (i = 0; i < npages; i++, anon_index++) {
    317 			ret = 0;
    318 			anon_array_enter(amp, anon_index, &cookie);
    319 			ap = anon_get_ptr(amp->ahp, anon_index);
    320 			if (ap != NULL) {
    321 				swap_xlate(ap, &vp, &off);
    322 				anon_array_exit(&cookie);
    323 				pp = page_lookup_nowait(vp, off, SE_SHARED);
    324 				if (pp != NULL) {
    325 					ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
    326 					page_unlock(pp);
    327 				}
    328 			} else {
    329 				anon_array_exit(&cookie);
    330 			}
    331 			if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
    332 				ret |= SEG_PAGE_LOCKED;
    333 			}
    334 			*vec++ = (char)ret;
    335 		}
    336 		ANON_LOCK_EXIT(&amp->a_rwlock);
    337 		return (len);
    338 	}
    339 }
    340 
    341 static int
    342 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
    343 {
    344 	size_t share_size;
    345 
    346 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
    347 
    348 	/*
    349 	 * seg.s_size may have been rounded up to the largest page size
    350 	 * in shmat().
    351 	 * XXX This should be cleanedup. sptdestroy should take a length
    352 	 * argument which should be the same as sptcreate. Then
    353 	 * this rounding would not be needed (or is done in shm.c)
    354 	 * Only the check for full segment will be needed.
    355 	 *
    356 	 * XXX -- shouldn't raddr == 0 always? These tests don't seem
    357 	 * to be useful at all.
    358 	 */
    359 	share_size = page_get_pagesize(seg->s_szc);
    360 	ssize = P2ROUNDUP(ssize, share_size);
    361 
    362 	if (raddr == seg->s_base && ssize == seg->s_size) {
    363 		seg_free(seg);
    364 		return (0);
    365 	} else
    366 		return (EINVAL);
    367 }
    368 
    369 int
    370 segspt_create(struct seg *seg, caddr_t argsp)
    371 {
    372 	int		err;
    373 	caddr_t		addr = seg->s_base;
    374 	struct spt_data *sptd;
    375 	struct 	segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
    376 	struct anon_map *amp = sptcargs->amp;
    377 	struct kshmid	*sp = amp->a_sp;
    378 	struct	cred	*cred = CRED();
    379 	ulong_t		i, j, anon_index = 0;
    380 	pgcnt_t		npages = btopr(amp->size);
    381 	struct vnode	*vp;
    382 	page_t		**ppa;
    383 	uint_t		hat_flags;
    384 	size_t		pgsz;
    385 	pgcnt_t		pgcnt;
    386 	caddr_t		a;
    387 	pgcnt_t		pidx;
    388 	size_t		sz;
    389 	proc_t		*procp = curproc;
    390 	rctl_qty_t	lockedbytes = 0;
    391 	kproject_t	*proj;
    392 
    393 	/*
    394 	 * We are holding the a_lock on the underlying dummy as,
    395 	 * so we can make calls to the HAT layer.
    396 	 */
    397 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
    398 	ASSERT(sp != NULL);
    399 
    400 #ifdef DEBUG
    401 	TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
    402 	    tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
    403 #endif
    404 	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
    405 		if (err = anon_swap_adjust(npages))
    406 			return (err);
    407 	}
    408 	err = ENOMEM;
    409 
    410 	if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
    411 		goto out1;
    412 
    413 	if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
    414 		if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
    415 		    KM_NOSLEEP)) == NULL)
    416 			goto out2;
    417 	}
    418 
    419 	mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
    420 
    421 	if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
    422 		goto out3;
    423 
    424 	seg->s_ops = &segspt_ops;
    425 	sptd->spt_vp = vp;
    426 	sptd->spt_amp = amp;
    427 	sptd->spt_prot = sptcargs->prot;
    428 	sptd->spt_flags = sptcargs->flags;
    429 	seg->s_data = (caddr_t)sptd;
    430 	sptd->spt_ppa = NULL;
    431 	sptd->spt_ppa_lckcnt = NULL;
    432 	seg->s_szc = sptcargs->szc;
    433 	cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
    434 	sptd->spt_gen = 0;
    435 
    436 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
    437 	if (seg->s_szc > amp->a_szc) {
    438 		amp->a_szc = seg->s_szc;
    439 	}
    440 	ANON_LOCK_EXIT(&amp->a_rwlock);
    441 
    442 	/*
    443 	 * Set policy to affect initial allocation of pages in
    444 	 * anon_map_createpages()
    445 	 */
    446 	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
    447 	    NULL, 0, ptob(npages));
    448 
    449 	if (sptcargs->flags & SHM_PAGEABLE) {
    450 		size_t  share_sz;
    451 		pgcnt_t new_npgs, more_pgs;
    452 		struct anon_hdr *nahp;
    453 		zone_t *zone;
    454 
    455 		share_sz = page_get_pagesize(seg->s_szc);
    456 		if (!IS_P2ALIGNED(amp->size, share_sz)) {
    457 			/*
    458 			 * We are rounding up the size of the anon array
    459 			 * on 4 M boundary because we always create 4 M
    460 			 * of page(s) when locking, faulting pages and we
    461 			 * don't have to check for all corner cases e.g.
    462 			 * if there is enough space to allocate 4 M
    463 			 * page.
    464 			 */
    465 			new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
    466 			more_pgs = new_npgs - npages;
    467 
    468 			/*
    469 			 * The zone will never be NULL, as a fully created
    470 			 * shm always has an owning zone.
    471 			 */
    472 			zone = sp->shm_perm.ipc_zone;
    473 			ASSERT(zone != NULL);
    474 			if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
    475 				err = ENOMEM;
    476 				goto out4;
    477 			}
    478 
    479 			nahp = anon_create(new_npgs, ANON_SLEEP);
    480 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
    481 			(void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
    482 			    ANON_SLEEP);
    483 			anon_release(amp->ahp, npages);
    484 			amp->ahp = nahp;
    485 			ASSERT(amp->swresv == ptob(npages));
    486 			amp->swresv = amp->size = ptob(new_npgs);
    487 			ANON_LOCK_EXIT(&amp->a_rwlock);
    488 			npages = new_npgs;
    489 		}
    490 
    491 		sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
    492 		    sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
    493 		sptd->spt_pcachecnt = 0;
    494 		sptd->spt_realsize = ptob(npages);
    495 		sptcargs->seg_spt = seg;
    496 		return (0);
    497 	}
    498 
    499 	/*
    500 	 * get array of pages for each anon slot in amp
    501 	 */
    502 	if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
    503 	    seg, addr, S_CREATE, cred)) != 0)
    504 		goto out4;
    505 
    506 	mutex_enter(&sp->shm_mlock);
    507 
    508 	/* May be partially locked, so, count bytes to charge for locking */
    509 	for (i = 0; i < npages; i++)
    510 		if (ppa[i]->p_lckcnt == 0)
    511 			lockedbytes += PAGESIZE;
    512 
    513 	proj = sp->shm_perm.ipc_proj;
    514 
    515 	if (lockedbytes > 0) {
    516 		mutex_enter(&procp->p_lock);
    517 		if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
    518 			mutex_exit(&procp->p_lock);
    519 			mutex_exit(&sp->shm_mlock);
    520 			for (i = 0; i < npages; i++)
    521 				page_unlock(ppa[i]);
    522 			err = ENOMEM;
    523 			goto out4;
    524 		}
    525 		mutex_exit(&procp->p_lock);
    526 	}
    527 
    528 	/*
    529 	 * addr is initial address corresponding to the first page on ppa list
    530 	 */
    531 	for (i = 0; i < npages; i++) {
    532 		/* attempt to lock all pages */
    533 		if (page_pp_lock(ppa[i], 0, 1) == 0) {
    534 			/*
    535 			 * if unable to lock any page, unlock all
    536 			 * of them and return error
    537 			 */
    538 			for (j = 0; j < i; j++)
    539 				page_pp_unlock(ppa[j], 0, 1);
    540 			for (i = 0; i < npages; i++)
    541 				page_unlock(ppa[i]);
    542 			rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
    543 			mutex_exit(&sp->shm_mlock);
    544 			err = ENOMEM;
    545 			goto out4;
    546 		}
    547 	}
    548 	mutex_exit(&sp->shm_mlock);
    549 
    550 	/*
    551 	 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
    552 	 * for the entire life of the segment. For example platforms
    553 	 * that do not support Dynamic Reconfiguration.
    554 	 */
    555 	hat_flags = HAT_LOAD_SHARE;
    556 	if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
    557 		hat_flags |= HAT_LOAD_LOCK;
    558 
    559 	/*
    560 	 * Load translations one lare page at a time
    561 	 * to make sure we don't create mappings bigger than
    562 	 * segment's size code in case underlying pages
    563 	 * are shared with segvn's segment that uses bigger
    564 	 * size code than we do.
    565 	 */
    566 	pgsz = page_get_pagesize(seg->s_szc);
    567 	pgcnt = page_get_pagecnt(seg->s_szc);
    568 	for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
    569 		sz = MIN(pgsz, ptob(npages - pidx));
    570 		hat_memload_array(seg->s_as->a_hat, a, sz,
    571 		    &ppa[pidx], sptd->spt_prot, hat_flags);
    572 	}
    573 
    574 	/*
    575 	 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
    576 	 * we will leave the pages locked SE_SHARED for the life
    577 	 * of the ISM segment. This will prevent any calls to
    578 	 * hat_pageunload() on this ISM segment for those platforms.
    579 	 */
    580 	if (!(hat_flags & HAT_LOAD_LOCK)) {
    581 		/*
    582 		 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
    583 		 * we no longer need to hold the SE_SHARED lock on the pages,
    584 		 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
    585 		 * SE_SHARED lock on the pages as necessary.
    586 		 */
    587 		for (i = 0; i < npages; i++)
    588 			page_unlock(ppa[i]);
    589 	}
    590 	sptd->spt_pcachecnt = 0;
    591 	kmem_free(ppa, ((sizeof (page_t *)) * npages));
    592 	sptd->spt_realsize = ptob(npages);
    593 	atomic_add_long(&spt_used, npages);
    594 	sptcargs->seg_spt = seg;
    595 	return (0);
    596 
    597 out4:
    598 	seg->s_data = NULL;
    599 	kmem_free(vp, sizeof (*vp));
    600 	cv_destroy(&sptd->spt_cv);
    601 out3:
    602 	mutex_destroy(&sptd->spt_lock);
    603 	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
    604 		kmem_free(ppa, (sizeof (*ppa) * npages));
    605 out2:
    606 	kmem_free(sptd, sizeof (*sptd));
    607 out1:
    608 	if ((sptcargs->flags & SHM_PAGEABLE) == 0)
    609 		anon_swap_restore(npages);
    610 	return (err);
    611 }
    612 
    613 /*ARGSUSED*/
    614 void
    615 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
    616 {
    617 	struct page 	*pp;
    618 	struct spt_data *sptd = (struct spt_data *)seg->s_data;
    619 	pgcnt_t		npages;
    620 	ulong_t		anon_idx;
    621 	struct anon_map *amp;
    622 	struct anon 	*ap;
    623 	struct vnode 	*vp;
    624 	u_offset_t 	off;
    625 	uint_t		hat_flags;
    626 	int		root = 0;
    627 	pgcnt_t		pgs, curnpgs = 0;
    628 	page_t		*rootpp;
    629 	rctl_qty_t	unlocked_bytes = 0;
    630 	kproject_t	*proj;
    631 	kshmid_t	*sp;
    632 
    633 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
    634 
    635 	len = P2ROUNDUP(len, PAGESIZE);
    636 
    637 	npages = btop(len);
    638 
    639 	hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
    640 	if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
    641 	    (sptd->spt_flags & SHM_PAGEABLE)) {
    642 		hat_flags = HAT_UNLOAD_UNMAP;
    643 	}
    644 
    645 	hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
    646 
    647 	amp = sptd->spt_amp;
    648 	if (sptd->spt_flags & SHM_PAGEABLE)
    649 		npages = btop(amp->size);
    650 
    651 	ASSERT(amp != NULL);
    652 
    653 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
    654 		sp = amp->a_sp;
    655 		proj = sp->shm_perm.ipc_proj;
    656 		mutex_enter(&sp->shm_mlock);
    657 	}
    658 	for (anon_idx = 0; anon_idx < npages; anon_idx++) {
    659 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
    660 			if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
    661 				panic("segspt_free_pages: null app");
    662 				/*NOTREACHED*/
    663 			}
    664 		} else {
    665 			if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
    666 			    == NULL)
    667 				continue;
    668 		}
    669 		ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
    670 		swap_xlate(ap, &vp, &off);
    671 
    672 		/*
    673 		 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
    674 		 * the pages won't be having SE_SHARED lock at this
    675 		 * point.
    676 		 *
    677 		 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
    678 		 * the pages are still held SE_SHARED locked from the
    679 		 * original segspt_create()
    680 		 *
    681 		 * Our goal is to get SE_EXCL lock on each page, remove
    682 		 * permanent lock on it and invalidate the page.
    683 		 */
    684 		if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
    685 			if (hat_flags == HAT_UNLOAD_UNMAP)
    686 				pp = page_lookup(vp, off, SE_EXCL);
    687 			else {
    688 				if ((pp = page_find(vp, off)) == NULL) {
    689 					panic("segspt_free_pages: "
    690 					    "page not locked");
    691 					/*NOTREACHED*/
    692 				}
    693 				if (!page_tryupgrade(pp)) {
    694 					page_unlock(pp);
    695 					pp = page_lookup(vp, off, SE_EXCL);
    696 				}
    697 			}
    698 			if (pp == NULL) {
    699 				panic("segspt_free_pages: "
    700 				    "page not in the system");
    701 				/*NOTREACHED*/
    702 			}
    703 			ASSERT(pp->p_lckcnt > 0);
    704 			page_pp_unlock(pp, 0, 1);
    705 			if (pp->p_lckcnt == 0)
    706 				unlocked_bytes += PAGESIZE;
    707 		} else {
    708 			if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
    709 				continue;
    710 		}
    711 		/*
    712 		 * It's logical to invalidate the pages here as in most cases
    713 		 * these were created by segspt.
    714 		 */
    715 		if (pp->p_szc != 0) {
    716 			if (root == 0) {
    717 				ASSERT(curnpgs == 0);
    718 				root = 1;
    719 				rootpp = pp;
    720 				pgs = curnpgs = page_get_pagecnt(pp->p_szc);
    721 				ASSERT(pgs > 1);
    722 				ASSERT(IS_P2ALIGNED(pgs, pgs));
    723 				ASSERT(!(page_pptonum(pp) & (pgs - 1)));
    724 				curnpgs--;
    725 			} else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
    726 				ASSERT(curnpgs == 1);
    727 				ASSERT(page_pptonum(pp) ==
    728 				    page_pptonum(rootpp) + (pgs - 1));
    729 				page_destroy_pages(rootpp);
    730 				root = 0;
    731 				curnpgs = 0;
    732 			} else {
    733 				ASSERT(curnpgs > 1);
    734 				ASSERT(page_pptonum(pp) ==
    735 				    page_pptonum(rootpp) + (pgs - curnpgs));
    736 				curnpgs--;
    737 			}
    738 		} else {
    739 			if (root != 0 || curnpgs != 0) {
    740 				panic("segspt_free_pages: bad large page");
    741 				/*NOTREACHED*/
    742 			}
    743 			/*LINTED: constant in conditional context */
    744 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
    745 		}
    746 	}
    747 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
    748 		if (unlocked_bytes > 0)
    749 			rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
    750 		mutex_exit(&sp->shm_mlock);
    751 	}
    752 	if (root != 0 || curnpgs != 0) {
    753 		panic("segspt_free_pages: bad large page");
    754 		/*NOTREACHED*/
    755 	}
    756 
    757 	/*
    758 	 * mark that pages have been released
    759 	 */
    760 	sptd->spt_realsize = 0;
    761 
    762 	if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
    763 		atomic_add_long(&spt_used, -npages);
    764 		anon_swap_restore(npages);
    765 	}
    766 }
    767 
    768 /*
    769  * Get memory allocation policy info for specified address in given segment
    770  */
    771 static lgrp_mem_policy_info_t *
    772 segspt_getpolicy(struct seg *seg, caddr_t addr)
    773 {
    774 	struct anon_map		*amp;
    775 	ulong_t			anon_index;
    776 	lgrp_mem_policy_info_t	*policy_info;
    777 	struct spt_data		*spt_data;
    778 
    779 	ASSERT(seg != NULL);
    780 
    781 	/*
    782 	 * Get anon_map from segspt
    783 	 *
    784 	 * Assume that no lock needs to be held on anon_map, since
    785 	 * it should be protected by its reference count which must be
    786 	 * nonzero for an existing segment
    787 	 * Need to grab readers lock on policy tree though
    788 	 */
    789 	spt_data = (struct spt_data *)seg->s_data;
    790 	if (spt_data == NULL)
    791 		return (NULL);
    792 	amp = spt_data->spt_amp;
    793 	ASSERT(amp->refcnt != 0);
    794 
    795 	/*
    796 	 * Get policy info
    797 	 *
    798 	 * Assume starting anon index of 0
    799 	 */
    800 	anon_index = seg_page(seg, addr);
    801 	policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
    802 
    803 	return (policy_info);
    804 }
    805 
    806 /*
    807  * DISM only.
    808  * Return locked pages over a given range.
    809  *
    810  * We will cache all DISM locked pages and save the pplist for the
    811  * entire segment in the ppa field of the underlying DISM segment structure.
    812  * Later, during a call to segspt_reclaim() we will use this ppa array
    813  * to page_unlock() all of the pages and then we will free this ppa list.
    814  */
    815 /*ARGSUSED*/
    816 static int
    817 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
    818     struct page ***ppp, enum lock_type type, enum seg_rw rw)
    819 {
    820 	struct  shm_data *shmd = (struct shm_data *)seg->s_data;
    821 	struct  seg	*sptseg = shmd->shm_sptseg;
    822 	struct  spt_data *sptd = sptseg->s_data;
    823 	pgcnt_t pg_idx, npages, tot_npages, npgs;
    824 	struct  page **pplist, **pl, **ppa, *pp;
    825 	struct  anon_map *amp;
    826 	spgcnt_t	an_idx;
    827 	int 	ret = ENOTSUP;
    828 	uint_t	pl_built = 0;
    829 	struct  anon *ap;
    830 	struct  vnode *vp;
    831 	u_offset_t off;
    832 	pgcnt_t claim_availrmem = 0;
    833 	uint_t	szc;
    834 
    835 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
    836 
    837 	/*
    838 	 * We want to lock/unlock the entire ISM segment. Therefore,
    839 	 * we will be using the underlying sptseg and it's base address
    840 	 * and length for the caching arguments.
    841 	 */
    842 	ASSERT(sptseg);
    843 	ASSERT(sptd);
    844 
    845 	pg_idx = seg_page(seg, addr);
    846 	npages = btopr(len);
    847 
    848 	/*
    849 	 * check if the request is larger than number of pages covered
    850 	 * by amp
    851 	 */
    852 	if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
    853 		*ppp = NULL;
    854 		return (ENOTSUP);
    855 	}
    856 
    857 	if (type == L_PAGEUNLOCK) {
    858 		ASSERT(sptd->spt_ppa != NULL);
    859 
    860 		seg_pinactive(seg, seg->s_base, sptd->spt_amp->size,
    861 		    sptd->spt_ppa, sptd->spt_prot, segspt_reclaim);
    862 
    863 		/*
    864 		 * If someone is blocked while unmapping, we purge
    865 		 * segment page cache and thus reclaim pplist synchronously
    866 		 * without waiting for seg_pasync_thread. This speeds up
    867 		 * unmapping in cases where munmap(2) is called, while
    868 		 * raw async i/o is still in progress or where a thread
    869 		 * exits on data fault in a multithreaded application.
    870 		 */
    871 		if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
    872 			segspt_purge(seg);
    873 		}
    874 		return (0);
    875 	} else if (type == L_PAGERECLAIM) {
    876 		ASSERT(sptd->spt_ppa != NULL);
    877 		(void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size,
    878 		    sptd->spt_ppa, sptd->spt_prot);
    879 		return (0);
    880 	}
    881 
    882 	if (sptd->spt_flags & DISM_PPA_CHANGED) {
    883 		segspt_purge(seg);
    884 		/*
    885 		 * for DISM ppa needs to be rebuild since
    886 		 * number of locked pages could be changed
    887 		 */
    888 		*ppp = NULL;
    889 		return (ENOTSUP);
    890 	}
    891 
    892 	/*
    893 	 * First try to find pages in segment page cache, without
    894 	 * holding the segment lock.
    895 	 */
    896 	pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
    897 	    sptd->spt_prot);
    898 	if (pplist != NULL) {
    899 		ASSERT(sptd->spt_ppa != NULL);
    900 		ASSERT(sptd->spt_ppa == pplist);
    901 		ppa = sptd->spt_ppa;
    902 		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
    903 			if (ppa[an_idx] == NULL) {
    904 				seg_pinactive(seg, seg->s_base,
    905 				    sptd->spt_amp->size, ppa,
    906 				    sptd->spt_prot, segspt_reclaim);
    907 				*ppp = NULL;
    908 				return (ENOTSUP);
    909 			}
    910 			if ((szc = ppa[an_idx]->p_szc) != 0) {
    911 				npgs = page_get_pagecnt(szc);
    912 				an_idx = P2ROUNDUP(an_idx + 1, npgs);
    913 			} else {
    914 				an_idx++;
    915 			}
    916 		}
    917 		/*
    918 		 * Since we cache the entire DISM segment, we want to
    919 		 * set ppp to point to the first slot that corresponds
    920 		 * to the requested addr, i.e. pg_idx.
    921 		 */
    922 		*ppp = &(sptd->spt_ppa[pg_idx]);
    923 		return (0);
    924 	}
    925 
    926 	/* The L_PAGELOCK case... */
    927 	mutex_enter(&sptd->spt_lock);
    928 	/*
    929 	 * try to find pages in segment page cache with mutex
    930 	 */
    931 	pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size,
    932 	    sptd->spt_prot);
    933 	if (pplist != NULL) {
    934 		ASSERT(sptd->spt_ppa != NULL);
    935 		ASSERT(sptd->spt_ppa == pplist);
    936 		ppa = sptd->spt_ppa;
    937 		for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
    938 			if (ppa[an_idx] == NULL) {
    939 				mutex_exit(&sptd->spt_lock);
    940 				seg_pinactive(seg, seg->s_base,
    941 				    sptd->spt_amp->size, ppa,
    942 				    sptd->spt_prot, segspt_reclaim);
    943 				*ppp = NULL;
    944 				return (ENOTSUP);
    945 			}
    946 			if ((szc = ppa[an_idx]->p_szc) != 0) {
    947 				npgs = page_get_pagecnt(szc);
    948 				an_idx = P2ROUNDUP(an_idx + 1, npgs);
    949 			} else {
    950 				an_idx++;
    951 			}
    952 		}
    953 		/*
    954 		 * Since we cache the entire DISM segment, we want to
    955 		 * set ppp to point to the first slot that corresponds