/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident "@(#)seg_vn.c 1.282 07/11/12 SMI"

/*
 * VM - shared or copy-on-write from a vnode/anonymous memory.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/zone.h>
#include <sys/shm_impl.h>
/*
 * Private seg op routines.
 *
 * Each of these is installed, in this same order, in the segvn_ops
 * vector defined immediately below.
 */
static int segvn_dup(struct seg *seg, struct seg *newseg);
static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
    caddr_t addr, size_t len, enum fault_type type,
    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int segvn_setprot(struct seg *seg, caddr_t addr,
    size_t len, uint_t prot);
static int segvn_checkprot(struct seg *seg, caddr_t addr,
    size_t len, uint_t prot);
static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t segvn_swapout(struct seg *seg);
static int segvn_sync(struct seg *seg, caddr_t addr, size_t len,
    int attr, uint_t flags);
static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len,
    char *vec);
static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
    int attr, int op, ulong_t *lockmap, size_t pos);
static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
    uint_t *protv);
static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr);
static int segvn_gettype(struct seg *seg, caddr_t addr);
static int segvn_getvp(struct seg *seg, caddr_t addr,
    struct vnode **vpp);
static int segvn_advise(struct seg *seg, caddr_t addr, size_t len,
    uint_t behav);
static void segvn_dump(struct seg *seg);
static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
    uint_t szc);
static int segvn_getmemid(struct seg *seg, caddr_t addr,
    memid_t *memidp);
static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t);
static int segvn_capable(struct seg *seg, segcapability_t capable);

/*
 * Segment driver operations vector for segvn segments.  segvn_create()
 * stores its address in seg->s_ops.
 *
 * This is a positional initializer, so the entries must stay in the
 * member order of struct seg_ops.
 */
struct seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
	segvn_capable,
};

/*
 * Common zfod structures, provided as a shorthand for others to use.
 * Each is a prebuilt segvn_crargs_t produced by SEGVN_ZFOD_ARGS(prot,
 * maxprot); the three variants differ only in the protections passed.
 */
static segvn_crargs_t zfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
	PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

/*
 * Exported pointers to the structures above.  Note that stack_exec_argsp
 * and zfod_argsp refer to the same zfod_segvn_crargs object.
 */
caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */

/* Number of bytes occupied by n struct vpage entries. */
#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

/*
 * Size limit consulted by segvn_create() when deciding whether to
 * concatenate a new segment onto a previous segment that has an anon map.
 */
size_t segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

/*
 * Private helper routines.
 */
static int segvn_concat(struct seg *, struct seg *, int);
static int segvn_extend_prev(struct seg *, struct seg *,
    struct segvn_crargs *, size_t);
static int segvn_extend_next(struct seg *, struct seg *,
    struct segvn_crargs *, size_t);
static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void segvn_pagelist_rele(page_t **);
static void segvn_setvnode_mpss(vnode_t *);
static void segvn_relocate_pages(page_t **, page_t *);
static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    u_offset_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int, int);
static void segvn_vpage(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **,
    enum seg_rw);

static int sameprot(struct seg *, caddr_t, size_t);

static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
    size_t, void *, u_offset_t);

static int segvn_slock_anonpages(page_t *, int);
static void segvn_sunlock_anonpages(page_t *, int);

/*
 * kmem caches, both set up by segvn_init():
 *   segvn_cache	- struct segvn_data objects
 *   segvn_szc_cache	- array indexed by page size code; entry szc hands
 *			  out arrays of page_get_pagecnt(szc) page_t pointers
 */
static struct kmem_cache *segvn_cache;
static struct kmem_cache **segvn_szc_cache;

#ifdef VM_STATS
/*
 * Counters that exist only in VM_STATS builds.
 * NOTE(review): each array presumably belongs to the similarly named
 * routine and is indexed by a code-path number -- confirm at the
 * update sites.
 */
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t pagelock[3];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

/* Flag values declared for use with segvn_demote_range(). */
#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote non aligned ends only */

/*
 * Compute the pgsz-aligned region [lpgaddr, lpgeaddr) that covers
 * [addr, addr + len): lpgaddr is addr aligned down to pgsz and lpgeaddr
 * is addr + len rounded up to pgsz.  The result is asserted to be
 * non-empty and to end no later than the end of seg.  When len is 0
 * both outputs are simply set to addr.
 *
 * lpgaddr and lpgeaddr must be lvalues.  Arguments are evaluated more
 * than once, so do not pass expressions with side effects.  The expansion
 * is a bare brace block, not a do { } while (0) statement.
 */
#define	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) {	\
		if ((len) != 0) {					\
			lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \
			ASSERT(lpgaddr >= (seg)->s_base);		\
			lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \
			    (len)), pgsz);				\
			ASSERT(lpgeaddr > lpgaddr);			\
			ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \
		} else {						\
			lpgeaddr = lpgaddr = (addr);			\
		}							\
	}

/*
 * kmem cache constructor for segvn_cache (registered by segvn_init()).
 *
 * Initializes the reader/writer lock and the mutex embedded in the
 * struct segvn_data at buf and clears its svn_trnext/svn_trprev links.
 * cdrarg and kmflags are not used.  Always returns 0.
 */
/*ARGSUSED*/
static int
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct segvn_data *svd = buf;

	rw_init(&svd->lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL);
	svd->svn_trnext = svd->svn_trprev = NULL;
	return (0);
}

/*ARGSUSED1*/ 248 static void 249 segvn_cache_destructor(void *buf, void *cdrarg) 250 { 251 struct segvn_data *svd = buf; 252 253 rw_destroy(&svd->lock); 254 mutex_destroy(&svd->segp_slock); 255 } 256 257 /*ARGSUSED*/ 258 static int 259 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags) 260 { 261 bzero(buf, sizeof (svntr_t)); 262 return (0); 263 } 264 265 /* 266 * Patching this variable to non-zero allows the system to run with 267 * stacks marked as "not executable". It's a bit of a kludge, but is 268 * provided as a tweakable for platforms that export those ABIs 269 * (e.g. sparc V8) that have executable stacks enabled by default. 270 * There are also some restrictions for platforms that don't actually 271 * implement 'noexec' protections. 272 * 273 * Once enabled, the system is (therefore) unable to provide a fully 274 * ABI-compliant execution environment, though practically speaking, 275 * most everything works. The exceptions are generally some interpreters 276 * and debuggers that create executable code on the stack and jump 277 * into it (without explicitly mprotecting the address range to include 278 * PROT_EXEC). 279 * 280 * One important class of applications that are disabled are those 281 * that have been transformed into malicious agents using one of the 282 * numerous "buffer overflow" attacks. See 4007890. 
283 */ 284 int noexec_user_stack = 0; 285 int noexec_user_stack_log = 1; 286 287 int segvn_lpg_disable = 0; 288 uint_t segvn_maxpgszc = 0; 289 290 ulong_t segvn_vmpss_clrszc_cnt; 291 ulong_t segvn_vmpss_clrszc_err; 292 ulong_t segvn_fltvnpages_clrszc_cnt; 293 ulong_t segvn_fltvnpages_clrszc_err; 294 ulong_t segvn_setpgsz_align_err; 295 ulong_t segvn_setpgsz_anon_align_err; 296 ulong_t segvn_setpgsz_getattr_err; 297 ulong_t segvn_setpgsz_eof_err; 298 ulong_t segvn_faultvnmpss_align_err1; 299 ulong_t segvn_faultvnmpss_align_err2; 300 ulong_t segvn_faultvnmpss_align_err3; 301 ulong_t segvn_faultvnmpss_align_err4; 302 ulong_t segvn_faultvnmpss_align_err5; 303 ulong_t segvn_vmpss_pageio_deadlk_err; 304 305 int segvn_use_regions = 1; 306 307 /* 308 * Segvn supports text replication optimization for NUMA platforms. Text 309 * replica's are represented by anon maps (amp). There's one amp per text file 310 * region per lgroup. A process chooses the amp for each of its text mappings 311 * based on the lgroup assignment of its main thread (t_tid = 1). All 312 * processes that want a replica on a particular lgroup for the same text file 313 * mapping share the same amp. amp's are looked up in svntr_hashtab hash table 314 * with vp,off,size,szc used as a key. Text replication segments are read only 315 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by 316 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode 317 * pages. Replication amp is assigned to a segment when it gets its first 318 * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread 319 * rechecks periodically if the process still maps an amp local to the main 320 * thread. If not async thread forces process to remap to an amp in the new 321 * home lgroup of the main thread. 
Current text replication implementation 322 * only provides the benefit to workloads that do most of their work in the 323 * main thread of a process or all the threads of a process run in the same 324 * lgroup. To extend text replication benefit to different types of 325 * multithreaded workloads further work would be needed in the hat layer to 326 * allow the same virtual address in the same hat to simultaneously map 327 * different physical addresses (i.e. page table replication would be needed 328 * for x86). 329 * 330 * amp pages are used instead of vnode pages as long as segment has a very 331 * simple life cycle. It's created via segvn_create(), handles S_EXEC 332 * (S_READ) pagefaults and is fully unmapped. If anything more complicated 333 * happens such as protection is changed, real COW fault happens, pagesize is 334 * changed, MC_LOCK is requested or segment is partially unmapped we turn off 335 * text replication by converting the segment back to vnode only segment 336 * (unmap segment's address range and set svd->amp to NULL). 337 * 338 * The original file can be changed after amp is inserted into 339 * svntr_hashtab. Processes that are launched after the file is already 340 * changed can't use the replica's created prior to the file change. To 341 * implement this functionality hash entries are timestamped. Replica's can 342 * only be used if current file modification time is the same as the timestamp 343 * saved when hash entry was created. However just timestamps alone are not 344 * sufficient to detect file modification via mmap(MAP_SHARED) mappings. We 345 * deal with file changes via MAP_SHARED mappings differently. When writable 346 * MAP_SHARED mappings are created to vnodes marked as executable we mark all 347 * existing replica's for this vnode as not usable for future text 348 * mappings. And we don't create new replica's for files that currently have 349 * potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is 350 * true). 
351 */ 352 353 #define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20) 354 size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR; 355 356 static ulong_t svntr_hashtab_sz = 512; 357 static svntr_bucket_t *svntr_hashtab = NULL; 358 static struct kmem_cache *svntr_cache; 359 static svntr_stats_t *segvn_textrepl_stats; 360 static ksema_t segvn_trasync_sem; 361 362 int segvn_disable_textrepl = 1; 363 size_t textrepl_size_thresh = (size_t)-1; 364 size_t segvn_textrepl_bytes = 0; 365 size_t segvn_textrepl_max_bytes = 0; 366 clock_t segvn_update_textrepl_interval = 0; 367 int segvn_update_tr_time = 10; 368 int segvn_disable_textrepl_update = 0; 369 370 static void segvn_textrepl(struct seg *); 371 static void segvn_textunrepl(struct seg *, int); 372 static void segvn_inval_trcache(vnode_t *); 373 static void segvn_trasync_thread(void); 374 static void segvn_trupdate_wakeup(void *); 375 static void segvn_trupdate(void); 376 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *, 377 ulong_t); 378 379 /* 380 * Initialize segvn data structures 381 */ 382 void 383 segvn_init(void) 384 { 385 uint_t maxszc; 386 uint_t szc; 387 size_t pgsz; 388 389 segvn_cache = kmem_cache_create("segvn_cache", 390 sizeof (struct segvn_data), 0, 391 segvn_cache_constructor, segvn_cache_destructor, NULL, 392 NULL, NULL, 0); 393 394 if (segvn_lpg_disable == 0) { 395 szc = maxszc = page_num_pagesizes() - 1; 396 if (szc == 0) { 397 segvn_lpg_disable = 1; 398 } 399 if (page_get_pagesize(0) != PAGESIZE) { 400 panic("segvn_init: bad szc 0"); 401 /*NOTREACHED*/ 402 } 403 while (szc != 0) { 404 pgsz = page_get_pagesize(szc); 405 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 406 panic("segvn_init: bad szc %d", szc); 407 /*NOTREACHED*/ 408 } 409 szc--; 410 } 411 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 412 segvn_maxpgszc = maxszc; 413 } 414 415 if (segvn_maxpgszc) { 416 segvn_szc_cache = (struct kmem_cache **)kmem_alloc( 417 (segvn_maxpgszc + 1) * sizeof (struct 
kmem_cache *), 418 KM_SLEEP); 419 } 420 421 for (szc = 1; szc <= segvn_maxpgszc; szc++) { 422 char str[32]; 423 424 (void) sprintf(str, "segvn_szc_cache%d", szc); 425 segvn_szc_cache[szc] = kmem_cache_create(str, 426 page_get_pagecnt(szc) * sizeof (page_t *), 0, 427 NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 428 } 429 430 431 if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL)) 432 segvn_use_regions = 0; 433 434 /* 435 * For now shared regions and text replication segvn support 436 * are mutually exclusive. This is acceptable because 437 * currently significant benefit from text replication was 438 * only observed on AMD64 NUMA platforms (due to relatively 439 * small L2$ size) and currently we don't support shared 440 * regions on x86. 441 */ 442 if (segvn_use_regions && !segvn_disable_textrepl) { 443 segvn_disable_textrepl = 1; 444 } 445 446 #if defined(_LP64) 447 if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 && 448 !segvn_disable_textrepl) { 449 ulong_t i; 450 size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t); 451 452 svntr_cache = kmem_cache_create("svntr_cache", 453 sizeof (svntr_t), 0, svntr_cache_constructor, NULL, 454 NULL, NULL, NULL, 0); 455 svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP); 456 for (i = 0; i < svntr_hashtab_sz; i++) { 457 mutex_init(&svntr_hashtab[i].tr_lock, NULL, 458 MUTEX_DEFAULT, NULL); 459 } 460 segvn_textrepl_max_bytes = ptob(physmem) / 461 segvn_textrepl_max_bytes_factor; 462 segvn_textrepl_stats = kmem_zalloc(NCPU * 463 sizeof (svntr_stats_t), KM_SLEEP); 464 sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL); 465 (void) thread_create(NULL, 0, segvn_trasync_thread, 466 NULL, 0, &p0, TS_RUN, minclsyspri); 467 } 468 #endif 469 } 470 471 #define SEGVN_PAGEIO ((void *)0x1) 472 #define SEGVN_NOPAGEIO ((void *)0x2) 473 474 static void 475 segvn_setvnode_mpss(vnode_t *vp) 476 { 477 int err; 478 479 ASSERT(vp->v_mpssdata == NULL || 480 vp->v_mpssdata == SEGVN_PAGEIO || 481 vp->v_mpssdata == 
SEGVN_NOPAGEIO); 482 483 if (vp->v_mpssdata == NULL) { 484 if (vn_vmpss_usepageio(vp)) { 485 err = VOP_PAGEIO(vp, (page_t *)NULL, 486 (u_offset_t)0, 0, 0, CRED(), NULL); 487 } else { 488 err = ENOSYS; 489 } 490 /* 491 * set v_mpssdata just once per vnode life 492 * so that it never changes. 493 */ 494 mutex_enter(&vp->v_lock); 495 if (vp->v_mpssdata == NULL) { 496 if (err == EINVAL) { 497 vp->v_mpssdata = SEGVN_PAGEIO; 498 } else { 499 vp->v_mpssdata = SEGVN_NOPAGEIO; 500 } 501 } 502 mutex_exit(&vp->v_lock); 503 } 504 } 505 506 int 507 segvn_create(struct seg *seg, void *argsp) 508 { 509 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 510 struct segvn_data *svd; 511 size_t swresv = 0; 512 struct cred *cred; 513 struct anon_map *amp; 514 int error = 0; 515 size_t pgsz; 516 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 517 int use_rgn = 0; 518 int trok = 0; 519 520 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 521 522 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 523 panic("segvn_create type"); 524 /*NOTREACHED*/ 525 } 526 527 /* 528 * Check arguments. If a shared anon structure is given then 529 * it is illegal to also specify a vp. 530 */ 531 if (a->amp != NULL && a->vp != NULL) { 532 panic("segvn_create anon_map"); 533 /*NOTREACHED*/ 534 } 535 536 if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) && 537 a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) && 538 segvn_use_regions) { 539 use_rgn = 1; 540 } 541 542 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. 
*/ 543 if (a->type == MAP_SHARED) 544 a->flags &= ~MAP_NORESERVE; 545 546 if (a->szc != 0) { 547 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 548 (a->amp != NULL && a->type == MAP_PRIVATE) || 549 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 550 a->szc = 0; 551 } else { 552 if (a->szc > segvn_maxpgszc) 553 a->szc = segvn_maxpgszc; 554 pgsz = page_get_pagesize(a->szc); 555 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 556 !IS_P2ALIGNED(seg->s_size, pgsz)) { 557 a->szc = 0; 558 } else if (a->vp != NULL) { 559 extern struct vnode kvp; 560 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 561 /* 562 * paranoid check. 563 * hat_page_demote() is not supported 564 * on swapfs pages. 565 */ 566 a->szc = 0; 567 } else if (map_addr_vacalign_check(seg->s_base, 568 a->offset & PAGEMASK)) { 569 a->szc = 0; 570 } 571 } else if (a->amp != NULL) { 572 pgcnt_t anum = btopr(a->offset); 573 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 574 if (!IS_P2ALIGNED(anum, pgcnt)) { 575 a->szc = 0; 576 } 577 } 578 } 579 } 580 581 /* 582 * If segment may need private pages, reserve them now. 583 */ 584 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 585 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 586 if (anon_resv(seg->s_size) == 0) 587 return (EAGAIN); 588 swresv = seg->s_size; 589 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 590 seg, swresv, 1); 591 } 592 593 /* 594 * Reserve any mapping structures that may be required. 595 * 596 * Don't do it for segments that may use regions. It's currently a 597 * noop in the hat implementations anyway. 
598 */ 599 if (!use_rgn) { 600 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 601 } 602 603 if (a->cred) { 604 cred = a->cred; 605 crhold(cred); 606 } else { 607 crhold(cred = CRED()); 608 } 609 610 /* Inform the vnode of the new mapping */ 611 if (a->vp != NULL) { 612 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 613 seg->s_as, seg->s_base, seg->s_size, a->prot, 614 a->maxprot, a->type, cred, NULL); 615 if (error) { 616 if (swresv != 0) { 617 anon_unresv(swresv); 618 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 619 "anon proc:%p %lu %u", seg, swresv, 0); 620 } 621 crfree(cred); 622 if (!use_rgn) { 623 hat_unload(seg->s_as->a_hat, seg->s_base, 624 seg->s_size, HAT_UNLOAD_UNMAP); 625 } 626 return (error); 627 } 628 /* 629 * svntr_hashtab will be NULL if we support shared regions. 630 */ 631 trok = ((a->flags & MAP_TEXT) && 632 (seg->s_size > textrepl_size_thresh || 633 (a->flags & _MAP_TEXTREPL)) && 634 lgrp_optimizations() && svntr_hashtab != NULL && 635 a->type == MAP_PRIVATE && swresv == 0 && 636 !(a->flags & MAP_NORESERVE) && 637 seg->s_as != &kas && a->vp->v_type == VREG); 638 639 ASSERT(!trok || !use_rgn); 640 } 641 642 /* 643 * If more than one segment in the address space, and they're adjacent 644 * virtually, try to concatenate them. Don't concatenate if an 645 * explicit anon_map structure was supplied (e.g., SystemV shared 646 * memory) or if we'll use text replication for this segment. 647 */ 648 if (a->amp == NULL && !use_rgn && !trok) { 649 struct seg *pseg, *nseg; 650 struct segvn_data *psvd, *nsvd; 651 lgrp_mem_policy_t ppolicy, npolicy; 652 uint_t lgrp_mem_policy_flags = 0; 653 extern lgrp_mem_policy_t lgrp_mem_default_policy; 654 655 /* 656 * Memory policy flags (lgrp_mem_policy_flags) is valid when 657 * extending stack/heap segments. 
658 */ 659 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 660 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 661 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 662 } else { 663 /* 664 * Get policy when not extending it from another segment 665 */ 666 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 667 } 668 669 /* 670 * First, try to concatenate the previous and new segments 671 */ 672 pseg = AS_SEGPREV(seg->s_as, seg); 673 if (pseg != NULL && 674 pseg->s_base + pseg->s_size == seg->s_base && 675 pseg->s_ops == &segvn_ops) { 676 /* 677 * Get memory allocation policy from previous segment. 678 * When extension is specified (e.g. for heap) apply 679 * this policy to the new segment regardless of the 680 * outcome of segment concatenation. Extension occurs 681 * for non-default policy otherwise default policy is 682 * used and is based on extended segment size. 683 */ 684 psvd = (struct segvn_data *)pseg->s_data; 685 ppolicy = psvd->policy_info.mem_policy; 686 if (lgrp_mem_policy_flags == 687 LGRP_MP_FLAG_EXTEND_UP) { 688 if (ppolicy != lgrp_mem_default_policy) { 689 mpolicy = ppolicy; 690 } else { 691 mpolicy = lgrp_mem_policy_default( 692 pseg->s_size + seg->s_size, 693 a->type); 694 } 695 } 696 697 if (mpolicy == ppolicy && 698 (pseg->s_size + seg->s_size <= 699 segvn_comb_thrshld || psvd->amp == NULL) && 700 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 701 /* 702 * success! 
now try to concatenate 703 * with following seg 704 */ 705 crfree(cred); 706 nseg = AS_SEGNEXT(pseg->s_as, pseg); 707 if (nseg != NULL && 708 nseg != pseg && 709 nseg->s_ops == &segvn_ops && 710 pseg->s_base + pseg->s_size == 711 nseg->s_base) 712 (void) segvn_concat(pseg, nseg, 0); 713 ASSERT(pseg->s_szc == 0 || 714 (a->szc == pseg->s_szc && 715 IS_P2ALIGNED(pseg->s_base, pgsz) && 716 IS_P2ALIGNED(pseg->s_size, pgsz))); 717 return (0); 718 } 719 } 720 721 /* 722 * Failed, so try to concatenate with following seg 723 */ 724 nseg = AS_SEGNEXT(seg->s_as, seg); 725 if (nseg != NULL && 726 seg->s_base + seg->s_size == nseg->s_base && 727 nseg->s_ops == &segvn_ops) { 728 /* 729 * Get memory allocation policy from next segment. 730 * When extension is specified (e.g. for stack) apply 731 * this policy to the new segment regardless of the 732 * outcome of segment concatenation. Extension occurs 733 * for non-default policy otherwise default policy is 734 * used and is based on extended segment size. 
735 */ 736 nsvd = (struct segvn_data *)nseg->s_data; 737 npolicy = nsvd->policy_info.mem_policy; 738 if (lgrp_mem_policy_flags == 739 LGRP_MP_FLAG_EXTEND_DOWN) { 740 if (npolicy != lgrp_mem_default_policy) { 741 mpolicy = npolicy; 742 } else { 743 mpolicy = lgrp_mem_policy_default( 744 nseg->s_size + seg->s_size, 745 a->type); 746 } 747 } 748 749 if (mpolicy == npolicy && 750 segvn_extend_next(seg, nseg, a, swresv) == 0) { 751 crfree(cred); 752 ASSERT(nseg->s_szc == 0 || 753 (a->szc == nseg->s_szc && 754 IS_P2ALIGNED(nseg->s_base, pgsz) && 755 IS_P2ALIGNED(nseg->s_size, pgsz))); 756 return (0); 757 } 758 } 759 } 760 761 if (a->vp != NULL) { 762 VN_HOLD(a->vp); 763 if (a->type == MAP_SHARED) 764 lgrp_shm_policy_init(NULL, a->vp); 765 } 766 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 767 768 seg->s_ops = &segvn_ops; 769 seg->s_data = (void *)svd; 770 seg->s_szc = a->szc; 771 772 svd->seg = seg; 773 svd->vp = a->vp; 774 /* 775 * Anonymous mappings have no backing file so the offset is meaningless. 776 */ 777 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 778 svd->prot = a->prot; 779 svd->maxprot = a->maxprot; 780 svd->pageprot = 0; 781 svd->type = a->type; 782 svd->vpage = NULL; 783 svd->cred = cred; 784 svd->advice = MADV_NORMAL; 785 svd->pageadvice = 0; 786 svd->flags = (ushort_t)a->flags; 787 svd->softlockcnt = 0; 788 svd->rcookie = HAT_INVALID_REGION_COOKIE; 789 790 if (a->szc != 0 && a->vp != NULL) { 791 segvn_setvnode_mpss(a->vp); 792 } 793 if (svd->type == MAP_SHARED && svd->vp != NULL && 794 (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) { 795 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 796 segvn_inval_trcache(svd->vp); 797 } 798 799 amp = a->amp; 800 if ((svd->amp = amp) == NULL) { 801 svd->anon_index = 0; 802 if (svd->type == MAP_SHARED) { 803 svd->swresv = 0; 804 /* 805 * Shared mappings to a vp need no other setup. 
806 * If we have a shared mapping to an anon_map object 807 * which hasn't been allocated yet, allocate the 808 * struct now so that it will be properly shared 809 * by remembering the swap reservation there. 810 */ 811 if (a->vp == NULL) { 812 svd->amp = anonmap_alloc(seg->s_size, swresv, 813 ANON_SLEEP); 814 svd->amp->a_szc = seg->s_szc; 815 } 816 } else { 817 /* 818 * Private mapping (with or without a vp). 819 * Allocate anon_map when needed. 820 */ 821 svd->swresv = swresv; 822 } 823 } else { 824 pgcnt_t anon_num; 825 826 /* 827 * Mapping to an existing anon_map structure without a vp. 828 * For now we will insure that the segment size isn't larger 829 * than the size - offset gives us. Later on we may wish to 830 * have the anon array dynamically allocated itself so that 831 * we don't always have to allocate all the anon pointer slots. 832 * This of course involves adding extra code to check that we 833 * aren't trying to use an anon pointer slot beyond the end 834 * of the currently allocated anon array. 835 */ 836 if ((amp->size - a->offset) < seg->s_size) { 837 panic("segvn_create anon_map size"); 838 /*NOTREACHED*/ 839 } 840 841 anon_num = btopr(a->offset); 842 843 if (a->type == MAP_SHARED) { 844 /* 845 * SHARED mapping to a given anon_map. 846 */ 847 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 848 amp->refcnt++; 849 if (a->szc > amp->a_szc) { 850 amp->a_szc = a->szc; 851 } 852 ANON_LOCK_EXIT(&->a_rwlock); 853 svd->anon_index = anon_num; 854 svd->swresv = 0; 855 } else { 856 /* 857 * PRIVATE mapping to a given anon_map. 858 * Make sure that all the needed anon 859 * structures are created (so that we will 860 * share the underlying pages if nothing 861 * is written by this mapping) and then 862 * duplicate the anon array as is done 863 * when a privately mapped segment is dup'ed. 
864 */ 865 struct anon *ap; 866 caddr_t addr; 867 caddr_t eaddr; 868 ulong_t anon_idx; 869 int hat_flag = HAT_LOAD; 870 871 if (svd->flags & MAP_TEXT) { 872 hat_flag |= HAT_LOAD_TEXT; 873 } 874 875 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 876 svd->amp->a_szc = seg->s_szc; 877 svd->anon_index = 0; 878 svd->swresv = swresv; 879 880 /* 881 * Prevent 2 threads from allocating anon 882 * slots simultaneously. 883 */ 884 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 885 eaddr = seg->s_base + seg->s_size; 886 887 for (anon_idx = anon_num, addr = seg->s_base; 888 addr < eaddr; addr += PAGESIZE, anon_idx++) { 889 page_t *pp; 890 891 if ((ap = anon_get_ptr(amp->ahp, 892 anon_idx)) != NULL) 893 continue; 894 895 /* 896 * Allocate the anon struct now. 897 * Might as well load up translation 898 * to the page while we're at it... 899 */ 900 pp = anon_zero(seg, addr, &ap, cred); 901 if (ap == NULL || pp == NULL) { 902 panic("segvn_create anon_zero"); 903 /*NOTREACHED*/ 904 } 905 906 /* 907 * Re-acquire the anon_map lock and 908 * initialize the anon array entry. 
909 */ 910 ASSERT(anon_get_ptr(amp->ahp, 911 anon_idx) == NULL); 912 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 913 ANON_SLEEP); 914 915 ASSERT(seg->s_szc == 0); 916 ASSERT(!IS_VMODSORT(pp->p_vnode)); 917 918 ASSERT(use_rgn == 0); 919 hat_memload(seg->s_as->a_hat, addr, pp, 920 svd->prot & ~PROT_WRITE, hat_flag); 921 922 page_unlock(pp); 923 } 924 ASSERT(seg->s_szc == 0); 925 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 926 0, seg->s_size); 927 ANON_LOCK_EXIT(&->a_rwlock); 928 } 929 } 930 931 /* 932 * Set default memory allocation policy for segment 933 * 934 * Always set policy for private memory at least for initialization 935 * even if this is a shared memory segment 936 */ 937 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 938 939 if (svd->type == MAP_SHARED) 940 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 941 svd->vp, svd->offset, seg->s_size); 942 943 if (use_rgn) { 944 ASSERT(!trok); 945 ASSERT(svd->amp == NULL); 946 svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base, 947 seg->s_size, (void *)svd->vp, svd->offset, svd->prot, 948 (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback, 949 HAT_REGION_TEXT); 950 } 951 952 ASSERT(!trok || !(svd->prot & PROT_WRITE)); 953 svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF;