Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #pragma ident	"@(#)seg_vn.c	1.282	07/11/12 SMI"
     40 
     41 /*
     42  * VM - shared or copy-on-write from a vnode/anonymous memory.
     43  */
     44 
     45 #include <sys/types.h>
     46 #include <sys/param.h>
     47 #include <sys/t_lock.h>
     48 #include <sys/errno.h>
     49 #include <sys/systm.h>
     50 #include <sys/mman.h>
     51 #include <sys/debug.h>
     52 #include <sys/cred.h>
     53 #include <sys/vmsystm.h>
     54 #include <sys/tuneable.h>
     55 #include <sys/bitmap.h>
     56 #include <sys/swap.h>
     57 #include <sys/kmem.h>
     58 #include <sys/sysmacros.h>
     59 #include <sys/vtrace.h>
     60 #include <sys/cmn_err.h>
     61 #include <sys/callb.h>
     62 #include <sys/vm.h>
     63 #include <sys/dumphdr.h>
     64 #include <sys/lgrp.h>
     65 
     66 #include <vm/hat.h>
     67 #include <vm/as.h>
     68 #include <vm/seg.h>
     69 #include <vm/seg_vn.h>
     70 #include <vm/pvn.h>
     71 #include <vm/anon.h>
     72 #include <vm/page.h>
     73 #include <vm/vpage.h>
     74 #include <sys/proc.h>
     75 #include <sys/task.h>
     76 #include <sys/project.h>
     77 #include <sys/zone.h>
     78 #include <sys/shm_impl.h>
     79 /*
     80  * Private seg op routines.
     81  */
     82 static int	segvn_dup(struct seg *seg, struct seg *newseg);
     83 static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
     84 static void	segvn_free(struct seg *seg);
     85 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
     86 		    caddr_t addr, size_t len, enum fault_type type,
     87 		    enum seg_rw rw);
     88 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
     89 static int	segvn_setprot(struct seg *seg, caddr_t addr,
     90 		    size_t len, uint_t prot);
     91 static int	segvn_checkprot(struct seg *seg, caddr_t addr,
     92 		    size_t len, uint_t prot);
     93 static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
     94 static size_t	segvn_swapout(struct seg *seg);
     95 static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
     96 		    int attr, uint_t flags);
     97 static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
     98 		    char *vec);
     99 static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
    100 		    int attr, int op, ulong_t *lockmap, size_t pos);
    101 static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
    102 		    uint_t *protv);
    103 static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
    104 static int	segvn_gettype(struct seg *seg, caddr_t addr);
    105 static int	segvn_getvp(struct seg *seg, caddr_t addr,
    106 		    struct vnode **vpp);
    107 static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
    108 		    uint_t behav);
    109 static void	segvn_dump(struct seg *seg);
    110 static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
    111 		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
    112 static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
    113 		    uint_t szc);
    114 static int	segvn_getmemid(struct seg *seg, caddr_t addr,
    115 		    memid_t *memidp);
    116 static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
    117 static int	segvn_capable(struct seg *seg, segcapability_t capable);
    118 
        /*
         * Segment operations vector for segvn segments.  segvn_create() stores
         * its address in seg->s_ops and also compares neighbouring segments'
         * s_ops against it to decide whether they are segvn segments.
         *
         * The initializer is positional (no designators), so the order of the
         * entries must match the member order of struct seg_ops, which is
         * declared outside this file.
         */
    119 struct	seg_ops segvn_ops = {
    120 	segvn_dup,
    121 	segvn_unmap,
    122 	segvn_free,
    123 	segvn_fault,
    124 	segvn_faulta,
    125 	segvn_setprot,
    126 	segvn_checkprot,
    127 	segvn_kluster,
    128 	segvn_swapout,
    129 	segvn_sync,
    130 	segvn_incore,
    131 	segvn_lockop,
    132 	segvn_getprot,
    133 	segvn_getoffset,
    134 	segvn_gettype,
    135 	segvn_getvp,
    136 	segvn_advise,
    137 	segvn_dump,
    138 	segvn_pagelock,
    139 	segvn_setpagesize,
    140 	segvn_getmemid,
    141 	segvn_getpolicy,
    142 	segvn_capable,
    143 };
    144 
    145 /*
    146  * Common zfod structures, provided as a shorthand for others to use.
    147  */
        /*
         * User zfod arguments.
         * NOTE(review): SEGVN_ZFOD_ARGS is defined outside this file; its two
         * arguments are presumably the initial and the maximum protections --
         * confirm against the macro definition.
         */
    148 static segvn_crargs_t zfod_segvn_crargs =
    149 	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
        /* Kernel variant: PROT_USER is cleared from both arguments. */
    150 static segvn_crargs_t kzfod_segvn_crargs =
    151 	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
    152 	PROT_ALL & ~PROT_USER);
        /*
         * Non-executable stack variant: PROT_EXEC is cleared from the first
         * argument only; the second argument stays PROT_ALL.
         */
    153 static segvn_crargs_t stack_noexec_crargs =
    154 	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);
    155 
        /*
         * Exported caddr_t handles to the structures above.  Note that
         * stack_exec_argsp points at the same structure as zfod_argsp.
         */
    156 caddr_t	zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
    157 caddr_t	kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
    158 caddr_t	stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
    159 caddr_t	stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */
    160 
    161 #define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */
    162 
    163 size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */
    164 
    165 static int	segvn_concat(struct seg *, struct seg *, int);
    166 static int	segvn_extend_prev(struct seg *, struct seg *,
    167 		    struct segvn_crargs *, size_t);
    168 static int	segvn_extend_next(struct seg *, struct seg *,
    169 		    struct segvn_crargs *, size_t);
    170 static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
    171 static void	segvn_pagelist_rele(page_t **);
    172 static void	segvn_setvnode_mpss(vnode_t *);
    173 static void	segvn_relocate_pages(page_t **, page_t *);
    174 static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
    175 static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    176     uint_t, page_t **, page_t **, uint_t *, int *);
    177 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    178     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
    179 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    180     caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
    181 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    182     u_offset_t, struct vpage *, page_t **, uint_t,
    183     enum fault_type, enum seg_rw, int, int);
    184 static void	segvn_vpage(struct seg *);
    185 
    186 static void segvn_purge(struct seg *seg);
    187 static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
    188     enum seg_rw);
    189 
    190 static int sameprot(struct seg *, caddr_t, size_t);
    191 
    192 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
    193 static int segvn_clrszc(struct seg *);
    194 static struct seg *segvn_split_seg(struct seg *, caddr_t);
    195 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    196     ulong_t, uint_t);
    197 
    198 static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
    199     size_t, void *, u_offset_t);
    200 
    201 static int segvn_slock_anonpages(page_t *, int);
    202 static void segvn_sunlock_anonpages(page_t *, int);
    203 
    204 static struct kmem_cache *segvn_cache;
    205 static struct kmem_cache **segvn_szc_cache;
    206 
    207 #ifdef VM_STATS
    208 static struct segvnvmstats_str {
    209 	ulong_t	fill_vp_pages[31];
    210 	ulong_t fltvnpages[49];
    211 	ulong_t	fullszcpages[10];
    212 	ulong_t	relocatepages[3];
    213 	ulong_t	fltanpages[17];
    214 	ulong_t pagelock[3];
    215 	ulong_t	demoterange[3];
    216 } segvnvmstats;
    217 #endif /* VM_STATS */
    218 
    219 #define	SDR_RANGE	1		/* demote entire range */
    220 #define	SDR_END		2		/* demote non aligned ends only */
    221 
    222 #define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	    \
    223 		if ((len) != 0) { 		      	      		      \
    224 			lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);  \
    225 			ASSERT(lpgaddr >= (seg)->s_base);	      	      \
    226 			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) +    \
    227 			    (len)), pgsz);				      \
    228 			ASSERT(lpgeaddr > lpgaddr);		      	      \
    229 			ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size);    \
    230 		} else {					      	      \
    231 			lpgeaddr = lpgaddr = (addr);	      		      \
    232 		}							      \
    233 	}
    234 
    235 /*ARGSUSED*/
    236 static int
    237 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
    238 {
    239 	struct segvn_data *svd = buf;
    240 
    241 	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
    242 	mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
    243 	svd->svn_trnext = svd->svn_trprev = NULL;
    244 	return (0);
    245 }
    246 
    247 /*ARGSUSED1*/
    248 static void
    249 segvn_cache_destructor(void *buf, void *cdrarg)
    250 {
    251 	struct segvn_data *svd = buf;
    252 
    253 	rw_destroy(&svd->lock);
    254 	mutex_destroy(&svd->segp_slock);
    255 }
    256 
    257 /*ARGSUSED*/
    258 static int
    259 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags)
    260 {
    261 	bzero(buf, sizeof (svntr_t));
    262 	return (0);
    263 }
    264 
    265 /*
    266  * Patching this variable to non-zero allows the system to run with
    267  * stacks marked as "not executable".  It's a bit of a kludge, but is
    268  * provided as a tweakable for platforms that export those ABIs
    269  * (e.g. sparc V8) that have executable stacks enabled by default.
    270  * There are also some restrictions for platforms that don't actually
    271  * implement 'noexec' protections.
    272  *
    273  * Once enabled, the system is (therefore) unable to provide a fully
    274  * ABI-compliant execution environment, though practically speaking,
    275  * most everything works.  The exceptions are generally some interpreters
    276  * and debuggers that create executable code on the stack and jump
    277  * into it (without explicitly mprotecting the address range to include
    278  * PROT_EXEC).
    279  *
    280  * One important class of applications that are disabled are those
    281  * that have been transformed into malicious agents using one of the
    282  * numerous "buffer overflow" attacks.  See 4007890.
    283  */
    284 int noexec_user_stack = 0;
    285 int noexec_user_stack_log = 1;
    286 
    287 int segvn_lpg_disable = 0;
    288 uint_t segvn_maxpgszc = 0;
    289 
    290 ulong_t segvn_vmpss_clrszc_cnt;
    291 ulong_t segvn_vmpss_clrszc_err;
    292 ulong_t segvn_fltvnpages_clrszc_cnt;
    293 ulong_t segvn_fltvnpages_clrszc_err;
    294 ulong_t segvn_setpgsz_align_err;
    295 ulong_t segvn_setpgsz_anon_align_err;
    296 ulong_t segvn_setpgsz_getattr_err;
    297 ulong_t segvn_setpgsz_eof_err;
    298 ulong_t segvn_faultvnmpss_align_err1;
    299 ulong_t segvn_faultvnmpss_align_err2;
    300 ulong_t segvn_faultvnmpss_align_err3;
    301 ulong_t segvn_faultvnmpss_align_err4;
    302 ulong_t segvn_faultvnmpss_align_err5;
    303 ulong_t	segvn_vmpss_pageio_deadlk_err;
    304 
    305 int segvn_use_regions = 1;
    306 
    307 /*
    308  * Segvn supports text replication optimization for NUMA platforms. Text
    309  * replicas are represented by anon maps (amp). There's one amp per text file
    310  * region per lgroup. A process chooses the amp for each of its text mappings
    311  * based on the lgroup assignment of its main thread (t_tid = 1). All
    312  * processes that want a replica on a particular lgroup for the same text file
    313  * mapping share the same amp. amp's are looked up in svntr_hashtab hash table
    314  * with vp,off,size,szc used as a key. Text replication segments are read only
    315  * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by
    316  * forcing COW faults from vnode to amp and mapping amp pages instead of vnode
    317  * pages. Replication amp is assigned to a segment when it gets its first
    318  * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread
    319  * rechecks periodically if the process still maps an amp local to the main
    320  * thread. If not async thread forces process to remap to an amp in the new
    321  * home lgroup of the main thread. Current text replication implementation
    322  * only provides the benefit to workloads that do most of their work in the
    323  * main thread of a process or all the threads of a process run in the same
    324  * lgroup. To extend text replication benefit to different types of
    325  * multithreaded workloads further work would be needed in the hat layer to
    326  * allow the same virtual address in the same hat to simultaneously map
    327  * different physical addresses (i.e. page table replication would be needed
    328  * for x86).
    329  *
    330  * amp pages are used instead of vnode pages as long as segment has a very
    331  * simple life cycle.  It's created via segvn_create(), handles S_EXEC
    332  * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
    333  * happens such as protection is changed, real COW fault happens, pagesize is
    334  * changed, MC_LOCK is requested or segment is partially unmapped we turn off
    335  * text replication by converting the segment back to vnode only segment
    336  * (unmap segment's address range and set svd->amp to NULL).
    337  *
    338  * The original file can be changed after amp is inserted into
    339  * svntr_hashtab. Processes that are launched after the file is already
    340  * changed can't use the replicas created prior to the file change. To
    341  * implement this functionality hash entries are timestamped. Replicas can
    342  * only be used if current file modification time is the same as the timestamp
    343  * saved when hash entry was created. However just timestamps alone are not
    344  * sufficient to detect file modification via mmap(MAP_SHARED) mappings. We
    345  * deal with file changes via MAP_SHARED mappings differently. When writable
    346  * MAP_SHARED mappings are created to vnodes marked as executable we mark all
    347  * existing replicas for this vnode as not usable for future text
    348  * mappings. And we don't create new replicas for files that currently have
    349  * potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is
    350  * true).
    351  */
    352 
        /*
         * segvn_init() caps text replication memory at
         * ptob(physmem) / segvn_textrepl_max_bytes_factor and stores the
         * result in segvn_textrepl_max_bytes.
         */
    353 #define	SEGVN_TEXTREPL_MAXBYTES_FACTOR	(20)
    354 size_t	segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR;
    355 
        /*
         * Text replication state.  All of it is set up by segvn_init(), and
         * only when text replication is enabled there: svntr_hashtab is an
         * array of svntr_hashtab_sz buckets and otherwise stays NULL
         * (segvn_create() checks for that), svntr_cache allocates svntr_t
         * entries, segvn_textrepl_stats holds NCPU svntr_stats_t slots and
         * segvn_trasync_sem is initialized with a count of 0 right before
         * segvn_trasync_thread is created.
         */
    356 static ulong_t			svntr_hashtab_sz = 512;
    357 static svntr_bucket_t		*svntr_hashtab = NULL;
    358 static struct kmem_cache	*svntr_cache;
    359 static svntr_stats_t		*segvn_textrepl_stats;
    360 static ksema_t 			segvn_trasync_sem;
    361 
        /*
         * Tunables.  With the defaults below segvn_init() does not set up text
         * replication: it requires segvn_disable_textrepl to be zero and
         * textrepl_size_thresh to differ from (size_t)-1.  segvn_create()
         * considers a MAP_TEXT segment for replication when its size exceeds
         * textrepl_size_thresh or _MAP_TEXTREPL is set in its flags.
         * NOTE(review): the users of the remaining variables are not visible
         * in this part of the file -- confirm their meaning there.
         */
    362 int				segvn_disable_textrepl = 1;
    363 size_t				textrepl_size_thresh = (size_t)-1;
    364 size_t				segvn_textrepl_bytes = 0;
    365 size_t				segvn_textrepl_max_bytes = 0;
    366 clock_t				segvn_update_textrepl_interval = 0;
    367 int				segvn_update_tr_time = 10;
    368 int				segvn_disable_textrepl_update = 0;
    369 
    370 static void segvn_textrepl(struct seg *);
    371 static void segvn_textunrepl(struct seg *, int);
    372 static void segvn_inval_trcache(vnode_t *);
    373 static void segvn_trasync_thread(void);
    374 static void segvn_trupdate_wakeup(void *);
    375 static void segvn_trupdate(void);
    376 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *,
    377     ulong_t);
    378 
    379 /*
    380  * Initialize segvn data structures
    381  */
    382 void
    383 segvn_init(void)
    384 {
    385 	uint_t maxszc;
    386 	uint_t szc;
    387 	size_t pgsz;
    388 
    389 	segvn_cache = kmem_cache_create("segvn_cache",
    390 	    sizeof (struct segvn_data), 0,
    391 	    segvn_cache_constructor, segvn_cache_destructor, NULL,
    392 	    NULL, NULL, 0);
    393 
    394 	if (segvn_lpg_disable == 0) {
    395 		szc = maxszc = page_num_pagesizes() - 1;
    396 		if (szc == 0) {
    397 			segvn_lpg_disable = 1;
    398 		}
    399 		if (page_get_pagesize(0) != PAGESIZE) {
    400 			panic("segvn_init: bad szc 0");
    401 			/*NOTREACHED*/
    402 		}
    403 		while (szc != 0) {
    404 			pgsz = page_get_pagesize(szc);
    405 			if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) {
    406 				panic("segvn_init: bad szc %d", szc);
    407 				/*NOTREACHED*/
    408 			}
    409 			szc--;
    410 		}
    411 		if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc)
    412 			segvn_maxpgszc = maxszc;
    413 	}
    414 
    415 	if (segvn_maxpgszc) {
    416 		segvn_szc_cache = (struct kmem_cache **)kmem_alloc(
    417 		    (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *),
    418 		    KM_SLEEP);
    419 	}
    420 
    421 	for (szc = 1; szc <= segvn_maxpgszc; szc++) {
    422 		char	str[32];
    423 
    424 		(void) sprintf(str, "segvn_szc_cache%d", szc);
    425 		segvn_szc_cache[szc] = kmem_cache_create(str,
    426 		    page_get_pagecnt(szc) * sizeof (page_t *), 0,
    427 		    NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG);
    428 	}
    429 
    430 
    431 	if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL))
    432 		segvn_use_regions = 0;
    433 
    434 	/*
    435 	 * For now shared regions and text replication segvn support
    436 	 * are mutually exclusive. This is acceptable because
    437 	 * currently significant benefit from text replication was
    438 	 * only observed on AMD64 NUMA platforms (due to relatively
    439 	 * small L2$ size) and currently we don't support shared
    440 	 * regions on x86.
    441 	 */
    442 	if (segvn_use_regions && !segvn_disable_textrepl) {
    443 		segvn_disable_textrepl = 1;
    444 	}
    445 
    446 #if defined(_LP64)
    447 	if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 &&
    448 	    !segvn_disable_textrepl) {
    449 		ulong_t i;
    450 		size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t);
    451 
    452 		svntr_cache = kmem_cache_create("svntr_cache",
    453 		    sizeof (svntr_t), 0, svntr_cache_constructor, NULL,
    454 		    NULL, NULL, NULL, 0);
    455 		svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP);
    456 		for (i = 0; i < svntr_hashtab_sz; i++) {
    457 			mutex_init(&svntr_hashtab[i].tr_lock,  NULL,
    458 			    MUTEX_DEFAULT, NULL);
    459 		}
    460 		segvn_textrepl_max_bytes = ptob(physmem) /
    461 		    segvn_textrepl_max_bytes_factor;
    462 		segvn_textrepl_stats = kmem_zalloc(NCPU *
    463 		    sizeof (svntr_stats_t), KM_SLEEP);
    464 		sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL);
    465 		(void) thread_create(NULL, 0, segvn_trasync_thread,
    466 		    NULL, 0, &p0, TS_RUN, minclsyspri);
    467 	}
    468 #endif
    469 }
    470 
    471 #define	SEGVN_PAGEIO	((void *)0x1)
    472 #define	SEGVN_NOPAGEIO	((void *)0x2)
    473 
    474 static void
    475 segvn_setvnode_mpss(vnode_t *vp)
    476 {
    477 	int err;
    478 
    479 	ASSERT(vp->v_mpssdata == NULL ||
    480 	    vp->v_mpssdata == SEGVN_PAGEIO ||
    481 	    vp->v_mpssdata == SEGVN_NOPAGEIO);
    482 
    483 	if (vp->v_mpssdata == NULL) {
    484 		if (vn_vmpss_usepageio(vp)) {
    485 			err = VOP_PAGEIO(vp, (page_t *)NULL,
    486 			    (u_offset_t)0, 0, 0, CRED(), NULL);
    487 		} else {
    488 			err = ENOSYS;
    489 		}
    490 		/*
    491 		 * set v_mpssdata just once per vnode life
    492 		 * so that it never changes.
    493 		 */
    494 		mutex_enter(&vp->v_lock);
    495 		if (vp->v_mpssdata == NULL) {
    496 			if (err == EINVAL) {
    497 				vp->v_mpssdata = SEGVN_PAGEIO;
    498 			} else {
    499 				vp->v_mpssdata = SEGVN_NOPAGEIO;
    500 			}
    501 		}
    502 		mutex_exit(&vp->v_lock);
    503 	}
    504 }
    505 
    506 int
    507 segvn_create(struct seg *seg, void *argsp)
    508 {
    509 	struct segvn_crargs *a = (struct segvn_crargs *)argsp;
    510 	struct segvn_data *svd;
    511 	size_t swresv = 0;
    512 	struct cred *cred;
    513 	struct anon_map *amp;
    514 	int error = 0;
    515 	size_t pgsz;
    516 	lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT;
    517 	int use_rgn = 0;
    518 	int trok = 0;
    519 
    520 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
    521 
    522 	if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) {
    523 		panic("segvn_create type");
    524 		/*NOTREACHED*/
    525 	}
    526 
    527 	/*
    528 	 * Check arguments.  If a shared anon structure is given then
    529 	 * it is illegal to also specify a vp.
    530 	 */
    531 	if (a->amp != NULL && a->vp != NULL) {
    532 		panic("segvn_create anon_map");
    533 		/*NOTREACHED*/
    534 	}
    535 
    536 	if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) &&
    537 	    a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) &&
    538 	    segvn_use_regions) {
    539 		use_rgn = 1;
    540 	}
    541 
    542 	/* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */
    543 	if (a->type == MAP_SHARED)
    544 		a->flags &= ~MAP_NORESERVE;
    545 
    546 	if (a->szc != 0) {
    547 		if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) ||
    548 		    (a->amp != NULL && a->type == MAP_PRIVATE) ||
    549 		    (a->flags & MAP_NORESERVE) || seg->s_as == &kas) {
    550 			a->szc = 0;
    551 		} else {
    552 			if (a->szc > segvn_maxpgszc)
    553 				a->szc = segvn_maxpgszc;
    554 			pgsz = page_get_pagesize(a->szc);
    555 			if (!IS_P2ALIGNED(seg->s_base, pgsz) ||
    556 			    !IS_P2ALIGNED(seg->s_size, pgsz)) {
    557 				a->szc = 0;
    558 			} else if (a->vp != NULL) {
    559 				extern struct vnode kvp;
    560 				if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) {
    561 					/*
    562 					 * paranoid check.
    563 					 * hat_page_demote() is not supported
    564 					 * on swapfs pages.
    565 					 */
    566 					a->szc = 0;
    567 				} else if (map_addr_vacalign_check(seg->s_base,
    568 				    a->offset & PAGEMASK)) {
    569 					a->szc = 0;
    570 				}
    571 			} else if (a->amp != NULL) {
    572 				pgcnt_t anum = btopr(a->offset);
    573 				pgcnt_t pgcnt = page_get_pagecnt(a->szc);
    574 				if (!IS_P2ALIGNED(anum, pgcnt)) {
    575 					a->szc = 0;
    576 				}
    577 			}
    578 		}
    579 	}
    580 
    581 	/*
    582 	 * If segment may need private pages, reserve them now.
    583 	 */
    584 	if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) ||
    585 	    (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) {
    586 		if (anon_resv(seg->s_size) == 0)
    587 			return (EAGAIN);
    588 		swresv = seg->s_size;
    589 		TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u",
    590 		    seg, swresv, 1);
    591 	}
    592 
    593 	/*
    594 	 * Reserve any mapping structures that may be required.
    595 	 *
    596 	 * Don't do it for segments that may use regions. It's currently a
    597 	 * noop in the hat implementations anyway.
    598 	 */
    599 	if (!use_rgn) {
    600 		hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP);
    601 	}
    602 
    603 	if (a->cred) {
    604 		cred = a->cred;
    605 		crhold(cred);
    606 	} else {
    607 		crhold(cred = CRED());
    608 	}
    609 
    610 	/* Inform the vnode of the new mapping */
    611 	if (a->vp != NULL) {
    612 		error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK,
    613 		    seg->s_as, seg->s_base, seg->s_size, a->prot,
    614 		    a->maxprot, a->type, cred, NULL);
    615 		if (error) {
    616 			if (swresv != 0) {
    617 				anon_unresv(swresv);
    618 				TRACE_3(TR_FAC_VM, TR_ANON_PROC,
    619 				    "anon proc:%p %lu %u", seg, swresv, 0);
    620 			}
    621 			crfree(cred);
    622 			if (!use_rgn) {
    623 				hat_unload(seg->s_as->a_hat, seg->s_base,
    624 				    seg->s_size, HAT_UNLOAD_UNMAP);
    625 			}
    626 			return (error);
    627 		}
    628 		/*
    629 		 * svntr_hashtab will be NULL if we support shared regions.
    630 		 */
    631 		trok = ((a->flags & MAP_TEXT) &&
    632 		    (seg->s_size > textrepl_size_thresh ||
    633 		    (a->flags & _MAP_TEXTREPL)) &&
    634 		    lgrp_optimizations() && svntr_hashtab != NULL &&
    635 		    a->type == MAP_PRIVATE && swresv == 0 &&
    636 		    !(a->flags & MAP_NORESERVE) &&
    637 		    seg->s_as != &kas && a->vp->v_type == VREG);
    638 
    639 		ASSERT(!trok || !use_rgn);
    640 	}
    641 
    642 	/*
    643 	 * If more than one segment in the address space, and they're adjacent
    644 	 * virtually, try to concatenate them.  Don't concatenate if an
    645 	 * explicit anon_map structure was supplied (e.g., SystemV shared
    646 	 * memory) or if we'll use text replication for this segment.
    647 	 */
    648 	if (a->amp == NULL && !use_rgn && !trok) {
    649 		struct seg *pseg, *nseg;
    650 		struct segvn_data *psvd, *nsvd;
    651 		lgrp_mem_policy_t ppolicy, npolicy;
    652 		uint_t	lgrp_mem_policy_flags = 0;
    653 		extern lgrp_mem_policy_t lgrp_mem_default_policy;
    654 
    655 		/*
    656 		 * Memory policy flags (lgrp_mem_policy_flags) is valid when
    657 		 * extending stack/heap segments.
    658 		 */
    659 		if ((a->vp == NULL) && (a->type == MAP_PRIVATE) &&
    660 		    !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) {
    661 			lgrp_mem_policy_flags = a->lgrp_mem_policy_flags;
    662 		} else {
    663 			/*
    664 			 * Get policy when not extending it from another segment
    665 			 */
    666 			mpolicy = lgrp_mem_policy_default(seg->s_size, a->type);
    667 		}
    668 
    669 		/*
    670 		 * First, try to concatenate the previous and new segments
    671 		 */
    672 		pseg = AS_SEGPREV(seg->s_as, seg);
    673 		if (pseg != NULL &&
    674 		    pseg->s_base + pseg->s_size == seg->s_base &&
    675 		    pseg->s_ops == &segvn_ops) {
    676 			/*
    677 			 * Get memory allocation policy from previous segment.
    678 			 * When extension is specified (e.g. for heap) apply
    679 			 * this policy to the new segment regardless of the
    680 			 * outcome of segment concatenation.  Extension occurs
    681 			 * for non-default policy otherwise default policy is
    682 			 * used and is based on extended segment size.
    683 			 */
    684 			psvd = (struct segvn_data *)pseg->s_data;
    685 			ppolicy = psvd->policy_info.mem_policy;
    686 			if (lgrp_mem_policy_flags ==
    687 			    LGRP_MP_FLAG_EXTEND_UP) {
    688 				if (ppolicy != lgrp_mem_default_policy) {
    689 					mpolicy = ppolicy;
    690 				} else {
    691 					mpolicy = lgrp_mem_policy_default(
    692 					    pseg->s_size + seg->s_size,
    693 					    a->type);
    694 				}
    695 			}
    696 
    697 			if (mpolicy == ppolicy &&
    698 			    (pseg->s_size + seg->s_size <=
    699 			    segvn_comb_thrshld || psvd->amp == NULL) &&
    700 			    segvn_extend_prev(pseg, seg, a, swresv) == 0) {
    701 				/*
    702 				 * success! now try to concatenate
    703 				 * with following seg
    704 				 */
    705 				crfree(cred);
    706 				nseg = AS_SEGNEXT(pseg->s_as, pseg);
    707 				if (nseg != NULL &&
    708 				    nseg != pseg &&
    709 				    nseg->s_ops == &segvn_ops &&
    710 				    pseg->s_base + pseg->s_size ==
    711 				    nseg->s_base)
    712 					(void) segvn_concat(pseg, nseg, 0);
    713 				ASSERT(pseg->s_szc == 0 ||
    714 				    (a->szc == pseg->s_szc &&
    715 				    IS_P2ALIGNED(pseg->s_base, pgsz) &&
    716 				    IS_P2ALIGNED(pseg->s_size, pgsz)));
    717 				return (0);
    718 			}
    719 		}
    720 
    721 		/*
    722 		 * Failed, so try to concatenate with following seg
    723 		 */
    724 		nseg = AS_SEGNEXT(seg->s_as, seg);
    725 		if (nseg != NULL &&
    726 		    seg->s_base + seg->s_size == nseg->s_base &&
    727 		    nseg->s_ops == &segvn_ops) {
    728 			/*
    729 			 * Get memory allocation policy from next segment.
    730 			 * When extension is specified (e.g. for stack) apply
    731 			 * this policy to the new segment regardless of the
    732 			 * outcome of segment concatenation.  Extension occurs
    733 			 * for non-default policy otherwise default policy is
    734 			 * used and is based on extended segment size.
    735 			 */
    736 			nsvd = (struct segvn_data *)nseg->s_data;
    737 			npolicy = nsvd->policy_info.mem_policy;
    738 			if (lgrp_mem_policy_flags ==
    739 			    LGRP_MP_FLAG_EXTEND_DOWN) {
    740 				if (npolicy != lgrp_mem_default_policy) {
    741 					mpolicy = npolicy;
    742 				} else {
    743 					mpolicy = lgrp_mem_policy_default(
    744 					    nseg->s_size + seg->s_size,
    745 					    a->type);
    746 				}
    747 			}
    748 
    749 			if (mpolicy == npolicy &&
    750 			    segvn_extend_next(seg, nseg, a, swresv) == 0) {
    751 				crfree(cred);
    752 				ASSERT(nseg->s_szc == 0 ||
    753 				    (a->szc == nseg->s_szc &&
    754 				    IS_P2ALIGNED(nseg->s_base, pgsz) &&
    755 				    IS_P2ALIGNED(nseg->s_size, pgsz)));
    756 				return (0);
    757 			}
    758 		}
    759 	}
    760 
    761 	if (a->vp != NULL) {
    762 		VN_HOLD(a->vp);
    763 		if (a->type == MAP_SHARED)
    764 			lgrp_shm_policy_init(NULL, a->vp);
    765 	}
    766 	svd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
    767 
    768 	seg->s_ops = &segvn_ops;
    769 	seg->s_data = (void *)svd;
    770 	seg->s_szc = a->szc;
    771 
    772 	svd->seg = seg;
    773 	svd->vp = a->vp;
    774 	/*
    775 	 * Anonymous mappings have no backing file so the offset is meaningless.
    776 	 */
    777 	svd->offset = a->vp ? (a->offset & PAGEMASK) : 0;
    778 	svd->prot = a->prot;
    779 	svd->maxprot = a->maxprot;
    780 	svd->pageprot = 0;
    781 	svd->type = a->type;
    782 	svd->vpage = NULL;
    783 	svd->cred = cred;
    784 	svd->advice = MADV_NORMAL;
    785 	svd->pageadvice = 0;
    786 	svd->flags = (ushort_t)a->flags;
    787 	svd->softlockcnt = 0;
    788 	svd->rcookie = HAT_INVALID_REGION_COOKIE;
    789 
    790 	if (a->szc != 0 && a->vp != NULL) {
    791 		segvn_setvnode_mpss(a->vp);
    792 	}
    793 	if (svd->type == MAP_SHARED && svd->vp != NULL &&
    794 	    (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) {
    795 		ASSERT(vn_is_mapped(svd->vp, V_WRITE));
    796 		segvn_inval_trcache(svd->vp);
    797 	}
    798 
    799 	amp = a->amp;
    800 	if ((svd->amp = amp) == NULL) {
    801 		svd->anon_index = 0;
    802 		if (svd->type == MAP_SHARED) {
    803 			svd->swresv = 0;
    804 			/*
    805 			 * Shared mappings to a vp need no other setup.
    806 			 * If we have a shared mapping to an anon_map object
    807 			 * which hasn't been allocated yet,  allocate the
    808 			 * which hasn't been allocated yet, allocate the
    809 			 * by remembering the swap reservation there.
    810 			 */
    811 			if (a->vp == NULL) {
    812 				svd->amp = anonmap_alloc(seg->s_size, swresv,
    813 				    ANON_SLEEP);
    814 				svd->amp->a_szc = seg->s_szc;
    815 			}
    816 		} else {
    817 			/*
    818 			 * Private mapping (with or without a vp).
    819 			 * Allocate anon_map when needed.
    820 			 */
    821 			svd->swresv = swresv;
    822 		}
    823 	} else {
    824 		pgcnt_t anon_num;
    825 
    826 		/*
    827 		 * Mapping to an existing anon_map structure without a vp.
    828 		 * For now we will ensure that the segment size isn't larger
    829 		 * than the size - offset gives us.  Later on we may wish to
    830 		 * have the anon array dynamically allocated itself so that
    831 		 * we don't always have to allocate all the anon pointer slots.
    832 		 * This of course involves adding extra code to check that we
    833 		 * aren't trying to use an anon pointer slot beyond the end
    834 		 * of the currently allocated anon array.
    835 		 */
    836 		if ((amp->size - a->offset) < seg->s_size) {
    837 			panic("segvn_create anon_map size");
    838 			/*NOTREACHED*/
    839 		}
    840 
    841 		anon_num = btopr(a->offset);
    842 
    843 		if (a->type == MAP_SHARED) {
    844 			/*
    845 			 * SHARED mapping to a given anon_map.
    846 			 */
    847 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
    848 			amp->refcnt++;
    849 			if (a->szc > amp->a_szc) {
    850 				amp->a_szc = a->szc;
    851 			}
    852 			ANON_LOCK_EXIT(&amp->a_rwlock);
    853 			svd->anon_index = anon_num;
    854 			svd->swresv = 0;
    855 		} else {
    856 			/*
    857 			 * PRIVATE mapping to a given anon_map.
    858 			 * Make sure that all the needed anon
    859 			 * structures are created (so that we will
    860 			 * share the underlying pages if nothing
    861 			 * is written by this mapping) and then
    862 			 * duplicate the anon array as is done
    863 			 * when a privately mapped segment is dup'ed.
    864 			 */
    865 			struct anon *ap;
    866 			caddr_t addr;
    867 			caddr_t eaddr;
    868 			ulong_t	anon_idx;
    869 			int hat_flag = HAT_LOAD;
    870 
    871 			if (svd->flags & MAP_TEXT) {
    872 				hat_flag |= HAT_LOAD_TEXT;
    873 			}
    874 
    875 			svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
    876 			svd->amp->a_szc = seg->s_szc;
    877 			svd->anon_index = 0;
    878 			svd->swresv = swresv;
    879 
    880 			/*
    881 			 * Prevent 2 threads from allocating anon
    882 			 * slots simultaneously.
    883 			 */
    884 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
    885 			eaddr = seg->s_base + seg->s_size;
    886 
    887 			for (anon_idx = anon_num, addr = seg->s_base;
    888 			    addr < eaddr; addr += PAGESIZE, anon_idx++) {
    889 				page_t *pp;
    890 
    891 				if ((ap = anon_get_ptr(amp->ahp,
    892 				    anon_idx)) != NULL)
    893 					continue;
    894 
    895 				/*
    896 				 * Allocate the anon struct now.
    897 				 * Might as well load up translation
    898 				 * to the page while we're at it...
    899 				 */
    900 				pp = anon_zero(seg, addr, &ap, cred);
    901 				if (ap == NULL || pp == NULL) {
    902 					panic("segvn_create anon_zero");
    903 					/*NOTREACHED*/
    904 				}
    905 
    906 				/*
    907 				 * Re-acquire the anon_map lock and
    908 				 * initialize the anon array entry.
    909 				 */
    910 				ASSERT(anon_get_ptr(amp->ahp,
    911 				    anon_idx) == NULL);
    912 				(void) anon_set_ptr(amp->ahp, anon_idx, ap,
    913 				    ANON_SLEEP);
    914 
    915 				ASSERT(seg->s_szc == 0);
    916 				ASSERT(!IS_VMODSORT(pp->p_vnode));
    917 
    918 				ASSERT(use_rgn == 0);
    919 				hat_memload(seg->s_as->a_hat, addr, pp,
    920 				    svd->prot & ~PROT_WRITE, hat_flag);
    921 
    922 				page_unlock(pp);
    923 			}
    924 			ASSERT(seg->s_szc == 0);
    925 			anon_dup(amp->ahp, anon_num, svd->amp->ahp,
    926 			    0, seg->s_size);
    927 			ANON_LOCK_EXIT(&amp->a_rwlock);
    928 		}
    929 	}
    930 
    931 	/*
    932 	 * Set default memory allocation policy for segment
    933 	 *
    934 	 * Always set policy for private memory at least for initialization
    935 	 * even if this is a shared memory segment
    936 	 */
    937 	(void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size);
    938 
    939 	if (svd->type == MAP_SHARED)
    940 		(void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index,
    941 		    svd->vp, svd->offset, seg->s_size);
    942 
    943 	if (use_rgn) {
    944 		ASSERT(!trok);
    945 		ASSERT(svd->amp == NULL);
    946 		svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
    947 		    seg->s_size, (void *)svd->vp, svd->offset, svd->prot,
    948 		    (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback,
    949 		    HAT_REGION_TEXT);
    950 	}
    951 
    952 	ASSERT(!trok || !(svd->prot & PROT_WRITE));
    953 	svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;