1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "@(#)seg_spt.c 1.106 07/10/10 SMI" 27 28 #include <sys/param.h> 29 #include <sys/user.h> 30 #include <sys/mman.h> 31 #include <sys/kmem.h> 32 #include <sys/sysmacros.h> 33 #include <sys/cmn_err.h> 34 #include <sys/systm.h> 35 #include <sys/tuneable.h> 36 #include <vm/hat.h> 37 #include <vm/seg.h> 38 #include <vm/as.h> 39 #include <vm/anon.h> 40 #include <vm/page.h> 41 #include <sys/buf.h> 42 #include <sys/swap.h> 43 #include <sys/atomic.h> 44 #include <vm/seg_spt.h> 45 #include <sys/debug.h> 46 #include <sys/vtrace.h> 47 #include <sys/shm.h> 48 #include <sys/shm_impl.h> 49 #include <sys/lgrp.h> 50 #include <sys/vmsystm.h> 51 #include <sys/policy.h> 52 #include <sys/project.h> 53 #include <sys/tnf_probe.h> 54 #include <sys/zone.h> 55 56 #define SEGSPTADDR (caddr_t)0x0 57 58 /* 59 * # pages used for spt 60 */ 61 size_t spt_used; 62 63 /* 64 * segspt_minfree is the memory left for system after ISM 65 * locked its pages; it is set up to 5% of availrmem in 66 * sptcreate when ISM is created. ISM should not use more 67 * than ~90% of availrmem; if it does, then the performance 68 * of the system may decrease. Machines with large memories may 69 * be able to use up more memory for ISM so we set the default 70 * segspt_minfree to 5% (which gives ISM max 95% of availrmem. 71 * If somebody wants even more memory for ISM (risking hanging 72 * the system) they can patch the segspt_minfree to smaller number. 73 */ 74 pgcnt_t segspt_minfree = 0; 75 76 static int segspt_create(struct seg *seg, caddr_t argsp); 77 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize); 78 static void segspt_free(struct seg *seg); 79 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len); 80 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr); 81 82 static void 83 segspt_badop() 84 { 85 panic("segspt_badop called"); 86 /*NOTREACHED*/ 87 } 88 89 #define SEGSPT_BADOP(t) (t(*)())segspt_badop 90 91 struct seg_ops segspt_ops = { 92 SEGSPT_BADOP(int), /* dup */ 93 segspt_unmap, 94 segspt_free, 95 SEGSPT_BADOP(int), /* fault */ 96 SEGSPT_BADOP(faultcode_t), /* faulta */ 97 SEGSPT_BADOP(int), /* setprot */ 98 SEGSPT_BADOP(int), /* checkprot */ 99 SEGSPT_BADOP(int), /* kluster */ 100 SEGSPT_BADOP(size_t), /* swapout */ 101 SEGSPT_BADOP(int), /* sync */ 102 SEGSPT_BADOP(size_t), /* incore */ 103 SEGSPT_BADOP(int), /* lockop */ 104 SEGSPT_BADOP(int), /* getprot */ 105 SEGSPT_BADOP(u_offset_t), /* getoffset */ 106 SEGSPT_BADOP(int), /* gettype */ 107 SEGSPT_BADOP(int), /* getvp */ 108 SEGSPT_BADOP(int), /* advise */ 109 SEGSPT_BADOP(void), /* dump */ 110 SEGSPT_BADOP(int), /* pagelock */ 111 SEGSPT_BADOP(int), /* setpgsz */ 112 SEGSPT_BADOP(int), /* getmemid */ 113 segspt_getpolicy, /* getpolicy */ 114 SEGSPT_BADOP(int), /* capable */ 115 }; 116 117 static int segspt_shmdup(struct seg *seg, struct seg *newseg); 118 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize); 119 static void segspt_shmfree(struct seg *seg); 120 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg, 121 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw); 122 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr); 123 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr, 124 register size_t len, register uint_t prot); 125 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, 126 uint_t prot); 127 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta); 128 static size_t segspt_shmswapout(struct seg *seg); 129 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, 130 register char *vec); 131 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len, 132 int attr, uint_t flags); 133 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len, 134 int attr, int op, ulong_t *lockmap, size_t pos); 135 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, 136 uint_t *protv); 137 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr); 138 static int segspt_shmgettype(struct seg *seg, caddr_t addr); 139 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp); 140 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, 141 uint_t behav); 142 static void segspt_shmdump(struct seg *seg); 143 static int segspt_shmpagelock(struct seg *, caddr_t, size_t, 144 struct page ***, enum lock_type, enum seg_rw); 145 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t); 146 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *); 147 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t); 148 static int segspt_shmcapable(struct seg *, segcapability_t); 149 150 struct seg_ops segspt_shmops = { 151 segspt_shmdup, 152 segspt_shmunmap, 153 segspt_shmfree, 154 segspt_shmfault, 155 segspt_shmfaulta, 156 segspt_shmsetprot, 157 segspt_shmcheckprot, 158 segspt_shmkluster, 159 segspt_shmswapout, 160 segspt_shmsync, 161 segspt_shmincore, 162 segspt_shmlockop, 163 segspt_shmgetprot, 164 segspt_shmgetoffset, 165 segspt_shmgettype, 166 segspt_shmgetvp, 167 segspt_shmadvise, /* advise */ 168 segspt_shmdump, 169 segspt_shmpagelock, 170 segspt_shmsetpgsz, 171 segspt_shmgetmemid, 172 segspt_shmgetpolicy, 173 segspt_shmcapable, 174 }; 175 176 static void segspt_purge(struct seg *seg); 177 static int segspt_reclaim(struct seg *, caddr_t, size_t, struct page **, 178 enum seg_rw); 179 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len, 180 page_t **ppa); 181 182 183 184 /*ARGSUSED*/ 185 int 186 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp, 187 uint_t prot, uint_t flags, uint_t share_szc) 188 { 189 int err; 190 struct as *newas; 191 struct segspt_crargs sptcargs; 192 193 #ifdef DEBUG 194 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */, 195 tnf_ulong, size, size ); 196 #endif 197 if (segspt_minfree == 0) /* leave min 5% of availrmem for */ 198 segspt_minfree = availrmem/20; /* for the system */ 199 200 if (!hat_supported(HAT_SHARED_PT, (void *)0)) 201 return (EINVAL); 202 203 /* 204 * get a new as for this shared memory segment 205 */ 206 newas = as_alloc(); 207 newas->a_proc = NULL; 208 sptcargs.amp = amp; 209 sptcargs.prot = prot; 210 sptcargs.flags = flags; 211 sptcargs.szc = share_szc; 212 /* 213 * create a shared page table (spt) segment 214 */ 215 216 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) { 217 as_free(newas); 218 return (err); 219 } 220 *sptseg = sptcargs.seg_spt; 221 return (0); 222 } 223 224 void 225 sptdestroy(struct as *as, struct anon_map *amp) 226 { 227 228 #ifdef DEBUG 229 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */); 230 #endif 231 (void) as_unmap(as, SEGSPTADDR, amp->size); 232 as_free(as); 233 } 234 235 /* 236 * called from seg_free(). 237 * free (i.e., unlock, unmap, return to free list) 238 * all the pages in the given seg. 239 */ 240 void 241 segspt_free(struct seg *seg) 242 { 243 struct spt_data *sptd = (struct spt_data *)seg->s_data; 244 245 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 246 247 if (sptd != NULL) { 248 if (sptd->spt_realsize) 249 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize); 250 251 if (sptd->spt_ppa_lckcnt) 252 kmem_free(sptd->spt_ppa_lckcnt, 253 sizeof (*sptd->spt_ppa_lckcnt) 254 * btopr(sptd->spt_amp->size)); 255 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp)); 256 cv_destroy(&sptd->spt_cv); 257 mutex_destroy(&sptd->spt_lock); 258 kmem_free(sptd, sizeof (*sptd)); 259 } 260 } 261 262 /*ARGSUSED*/ 263 static int 264 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr, 265 uint_t flags) 266 { 267 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 268 269 return (0); 270 } 271 272 /*ARGSUSED*/ 273 static size_t 274 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec) 275 { 276 caddr_t eo_seg; 277 pgcnt_t npages; 278 struct shm_data *shmd = (struct shm_data *)seg->s_data; 279 struct seg *sptseg; 280 struct spt_data *sptd; 281 282 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 283 #ifdef lint 284 seg = seg; 285 #endif 286 sptseg = shmd->shm_sptseg; 287 sptd = sptseg->s_data; 288 289 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 290 eo_seg = addr + len; 291 while (addr < eo_seg) { 292 /* page exists, and it's locked. */ 293 *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED | 294 SEG_PAGE_ANON; 295 addr += PAGESIZE; 296 } 297 return (len); 298 } else { 299 struct anon_map *amp = shmd->shm_amp; 300 struct anon *ap; 301 page_t *pp; 302 pgcnt_t anon_index; 303 struct vnode *vp; 304 u_offset_t off; 305 ulong_t i; 306 int ret; 307 anon_sync_obj_t cookie; 308 309 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK); 310 anon_index = seg_page(seg, addr); 311 npages = btopr(len); 312 if (anon_index + npages > btopr(shmd->shm_amp->size)) { 313 return (EINVAL); 314 } 315 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 316 for (i = 0; i < npages; i++, anon_index++) { 317 ret = 0; 318 anon_array_enter(amp, anon_index, &cookie); 319 ap = anon_get_ptr(amp->ahp, anon_index); 320 if (ap != NULL) { 321 swap_xlate(ap, &vp, &off); 322 anon_array_exit(&cookie); 323 pp = page_lookup_nowait(vp, off, SE_SHARED); 324 if (pp != NULL) { 325 ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON; 326 page_unlock(pp); 327 } 328 } else { 329 anon_array_exit(&cookie); 330 } 331 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) { 332 ret |= SEG_PAGE_LOCKED; 333 } 334 *vec++ = (char)ret; 335 } 336 ANON_LOCK_EXIT(&->a_rwlock); 337 return (len); 338 } 339 } 340 341 static int 342 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize) 343 { 344 size_t share_size; 345 346 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 347 348 /* 349 * seg.s_size may have been rounded up to the largest page size 350 * in shmat(). 351 * XXX This should be cleanedup. sptdestroy should take a length 352 * argument which should be the same as sptcreate. Then 353 * this rounding would not be needed (or is done in shm.c) 354 * Only the check for full segment will be needed. 355 * 356 * XXX -- shouldn't raddr == 0 always? These tests don't seem 357 * to be useful at all. 358 */ 359 share_size = page_get_pagesize(seg->s_szc); 360 ssize = P2ROUNDUP(ssize, share_size); 361 362 if (raddr == seg->s_base && ssize == seg->s_size) { 363 seg_free(seg); 364 return (0); 365 } else 366 return (EINVAL); 367 } 368 369 int 370 segspt_create(struct seg *seg, caddr_t argsp) 371 { 372 int err; 373 caddr_t addr = seg->s_base; 374 struct spt_data *sptd; 375 struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp; 376 struct anon_map *amp = sptcargs->amp; 377 struct kshmid *sp = amp->a_sp; 378 struct cred *cred = CRED(); 379 ulong_t i, j, anon_index = 0; 380 pgcnt_t npages = btopr(amp->size); 381 struct vnode *vp; 382 page_t **ppa; 383 uint_t hat_flags; 384 size_t pgsz; 385 pgcnt_t pgcnt; 386 caddr_t a; 387 pgcnt_t pidx; 388 size_t sz; 389 proc_t *procp = curproc; 390 rctl_qty_t lockedbytes = 0; 391 kproject_t *proj; 392 393 /* 394 * We are holding the a_lock on the underlying dummy as, 395 * so we can make calls to the HAT layer. 396 */ 397 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 398 ASSERT(sp != NULL); 399 400 #ifdef DEBUG 401 TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */, 402 tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size); 403 #endif 404 if ((sptcargs->flags & SHM_PAGEABLE) == 0) { 405 if (err = anon_swap_adjust(npages)) 406 return (err); 407 } 408 err = ENOMEM; 409 410 if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL) 411 goto out1; 412 413 if ((sptcargs->flags & SHM_PAGEABLE) == 0) { 414 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages), 415 KM_NOSLEEP)) == NULL) 416 goto out2; 417 } 418 419 mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL); 420 421 if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL) 422 goto out3; 423 424 seg->s_ops = &segspt_ops; 425 sptd->spt_vp = vp; 426 sptd->spt_amp = amp; 427 sptd->spt_prot = sptcargs->prot; 428 sptd->spt_flags = sptcargs->flags; 429 seg->s_data = (caddr_t)sptd; 430 sptd->spt_ppa = NULL; 431 sptd->spt_ppa_lckcnt = NULL; 432 seg->s_szc = sptcargs->szc; 433 cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL); 434 sptd->spt_gen = 0; 435 436 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 437 if (seg->s_szc > amp->a_szc) { 438 amp->a_szc = seg->s_szc; 439 } 440 ANON_LOCK_EXIT(&->a_rwlock); 441 442 /* 443 * Set policy to affect initial allocation of pages in 444 * anon_map_createpages() 445 */ 446 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index, 447 NULL, 0, ptob(npages)); 448 449 if (sptcargs->flags & SHM_PAGEABLE) { 450 size_t share_sz; 451 pgcnt_t new_npgs, more_pgs; 452 struct anon_hdr *nahp; 453 zone_t *zone; 454 455 share_sz = page_get_pagesize(seg->s_szc); 456 if (!IS_P2ALIGNED(amp->size, share_sz)) { 457 /* 458 * We are rounding up the size of the anon array 459 * on 4 M boundary because we always create 4 M 460 * of page(s) when locking, faulting pages and we 461 * don't have to check for all corner cases e.g. 462 * if there is enough space to allocate 4 M 463 * page. 464 */ 465 new_npgs = btop(P2ROUNDUP(amp->size, share_sz)); 466 more_pgs = new_npgs - npages; 467 468 /* 469 * The zone will never be NULL, as a fully created 470 * shm always has an owning zone. 471 */ 472 zone = sp->shm_perm.ipc_zone; 473 ASSERT(zone != NULL); 474 if (anon_resv_zone(ptob(more_pgs), zone) == 0) { 475 err = ENOMEM; 476 goto out4; 477 } 478 479 nahp = anon_create(new_npgs, ANON_SLEEP); 480 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 481 (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages, 482 ANON_SLEEP); 483 anon_release(amp->ahp, npages); 484 amp->ahp = nahp; 485 ASSERT(amp->swresv == ptob(npages)); 486 amp->swresv = amp->size = ptob(new_npgs); 487 ANON_LOCK_EXIT(&->a_rwlock); 488 npages = new_npgs; 489 } 490 491 sptd->spt_ppa_lckcnt = kmem_zalloc(npages * 492 sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP); 493 sptd->spt_pcachecnt = 0; 494 sptd->spt_realsize = ptob(npages); 495 sptcargs->seg_spt = seg; 496 return (0); 497 } 498 499 /* 500 * get array of pages for each anon slot in amp 501 */ 502 if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa, 503 seg, addr, S_CREATE, cred)) != 0) 504 goto out4; 505 506 mutex_enter(&sp->shm_mlock); 507 508 /* May be partially locked, so, count bytes to charge for locking */ 509 for (i = 0; i < npages; i++) 510 if (ppa[i]->p_lckcnt == 0) 511 lockedbytes += PAGESIZE; 512 513 proj = sp->shm_perm.ipc_proj; 514 515 if (lockedbytes > 0) { 516 mutex_enter(&procp->p_lock); 517 if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) { 518 mutex_exit(&procp->p_lock); 519 mutex_exit(&sp->shm_mlock); 520 for (i = 0; i < npages; i++) 521 page_unlock(ppa[i]); 522 err = ENOMEM; 523 goto out4; 524 } 525 mutex_exit(&procp->p_lock); 526 } 527 528 /* 529 * addr is initial address corresponding to the first page on ppa list 530 */ 531 for (i = 0; i < npages; i++) { 532 /* attempt to lock all pages */ 533 if (page_pp_lock(ppa[i], 0, 1) == 0) { 534 /* 535 * if unable to lock any page, unlock all 536 * of them and return error 537 */ 538 for (j = 0; j < i; j++) 539 page_pp_unlock(ppa[j], 0, 1); 540 for (i = 0; i < npages; i++) 541 page_unlock(ppa[i]); 542 rctl_decr_locked_mem(NULL, proj, lockedbytes, 0); 543 mutex_exit(&sp->shm_mlock); 544 err = ENOMEM; 545 goto out4; 546 } 547 } 548 mutex_exit(&sp->shm_mlock); 549 550 /* 551 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK 552 * for the entire life of the segment. For example platforms 553 * that do not support Dynamic Reconfiguration. 554 */ 555 hat_flags = HAT_LOAD_SHARE; 556 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL)) 557 hat_flags |= HAT_LOAD_LOCK; 558 559 /* 560 * Load translations one lare page at a time 561 * to make sure we don't create mappings bigger than 562 * segment's size code in case underlying pages 563 * are shared with segvn's segment that uses bigger 564 * size code than we do. 565 */ 566 pgsz = page_get_pagesize(seg->s_szc); 567 pgcnt = page_get_pagecnt(seg->s_szc); 568 for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) { 569 sz = MIN(pgsz, ptob(npages - pidx)); 570 hat_memload_array(seg->s_as->a_hat, a, sz, 571 &ppa[pidx], sptd->spt_prot, hat_flags); 572 } 573 574 /* 575 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, 576 * we will leave the pages locked SE_SHARED for the life 577 * of the ISM segment. This will prevent any calls to 578 * hat_pageunload() on this ISM segment for those platforms. 579 */ 580 if (!(hat_flags & HAT_LOAD_LOCK)) { 581 /* 582 * On platforms that support HAT_DYNAMIC_ISM_UNMAP, 583 * we no longer need to hold the SE_SHARED lock on the pages, 584 * since L_PAGELOCK and F_SOFTLOCK calls will grab the 585 * SE_SHARED lock on the pages as necessary. 586 */ 587 for (i = 0; i < npages; i++) 588 page_unlock(ppa[i]); 589 } 590 sptd->spt_pcachecnt = 0; 591 kmem_free(ppa, ((sizeof (page_t *)) * npages)); 592 sptd->spt_realsize = ptob(npages); 593 atomic_add_long(&spt_used, npages); 594 sptcargs->seg_spt = seg; 595 return (0); 596 597 out4: 598 seg->s_data = NULL; 599 kmem_free(vp, sizeof (*vp)); 600 cv_destroy(&sptd->spt_cv); 601 out3: 602 mutex_destroy(&sptd->spt_lock); 603 if ((sptcargs->flags & SHM_PAGEABLE) == 0) 604 kmem_free(ppa, (sizeof (*ppa) * npages)); 605 out2: 606 kmem_free(sptd, sizeof (*sptd)); 607 out1: 608 if ((sptcargs->flags & SHM_PAGEABLE) == 0) 609 anon_swap_restore(npages); 610 return (err); 611 } 612 613 /*ARGSUSED*/ 614 void 615 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len) 616 { 617 struct page *pp; 618 struct spt_data *sptd = (struct spt_data *)seg->s_data; 619 pgcnt_t npages; 620 ulong_t anon_idx; 621 struct anon_map *amp; 622 struct anon *ap; 623 struct vnode *vp; 624 u_offset_t off; 625 uint_t hat_flags; 626 int root = 0; 627 pgcnt_t pgs, curnpgs = 0; 628 page_t *rootpp; 629 rctl_qty_t unlocked_bytes = 0; 630 kproject_t *proj; 631 kshmid_t *sp; 632 633 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 634 635 len = P2ROUNDUP(len, PAGESIZE); 636 637 npages = btop(len); 638 639 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP; 640 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) || 641 (sptd->spt_flags & SHM_PAGEABLE)) { 642 hat_flags = HAT_UNLOAD_UNMAP; 643 } 644 645 hat_unload(seg->s_as->a_hat, addr, len, hat_flags); 646 647 amp = sptd->spt_amp; 648 if (sptd->spt_flags & SHM_PAGEABLE) 649 npages = btop(amp->size); 650 651 ASSERT(amp != NULL); 652 653 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 654 sp = amp->a_sp; 655 proj = sp->shm_perm.ipc_proj; 656 mutex_enter(&sp->shm_mlock); 657 } 658 for (anon_idx = 0; anon_idx < npages; anon_idx++) { 659 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 660 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) { 661 panic("segspt_free_pages: null app"); 662 /*NOTREACHED*/ 663 } 664 } else { 665 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx)) 666 == NULL) 667 continue; 668 } 669 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0); 670 swap_xlate(ap, &vp, &off); 671 672 /* 673 * If this platform supports HAT_DYNAMIC_ISM_UNMAP, 674 * the pages won't be having SE_SHARED lock at this 675 * point. 676 * 677 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP, 678 * the pages are still held SE_SHARED locked from the 679 * original segspt_create() 680 * 681 * Our goal is to get SE_EXCL lock on each page, remove 682 * permanent lock on it and invalidate the page. 683 */ 684 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 685 if (hat_flags == HAT_UNLOAD_UNMAP) 686 pp = page_lookup(vp, off, SE_EXCL); 687 else { 688 if ((pp = page_find(vp, off)) == NULL) { 689 panic("segspt_free_pages: " 690 "page not locked"); 691 /*NOTREACHED*/ 692 } 693 if (!page_tryupgrade(pp)) { 694 page_unlock(pp); 695 pp = page_lookup(vp, off, SE_EXCL); 696 } 697 } 698 if (pp == NULL) { 699 panic("segspt_free_pages: " 700 "page not in the system"); 701 /*NOTREACHED*/ 702 } 703 ASSERT(pp->p_lckcnt > 0); 704 page_pp_unlock(pp, 0, 1); 705 if (pp->p_lckcnt == 0) 706 unlocked_bytes += PAGESIZE; 707 } else { 708 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL) 709 continue; 710 } 711 /* 712 * It's logical to invalidate the pages here as in most cases 713 * these were created by segspt. 714 */ 715 if (pp->p_szc != 0) { 716 if (root == 0) { 717 ASSERT(curnpgs == 0); 718 root = 1; 719 rootpp = pp; 720 pgs = curnpgs = page_get_pagecnt(pp->p_szc); 721 ASSERT(pgs > 1); 722 ASSERT(IS_P2ALIGNED(pgs, pgs)); 723 ASSERT(!(page_pptonum(pp) & (pgs - 1))); 724 curnpgs--; 725 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) { 726 ASSERT(curnpgs == 1); 727 ASSERT(page_pptonum(pp) == 728 page_pptonum(rootpp) + (pgs - 1)); 729 page_destroy_pages(rootpp); 730 root = 0; 731 curnpgs = 0; 732 } else { 733 ASSERT(curnpgs > 1); 734 ASSERT(page_pptonum(pp) == 735 page_pptonum(rootpp) + (pgs - curnpgs)); 736 curnpgs--; 737 } 738 } else { 739 if (root != 0 || curnpgs != 0) { 740 panic("segspt_free_pages: bad large page"); 741 /*NOTREACHED*/ 742 } 743 /*LINTED: constant in conditional context */ 744 VN_DISPOSE(pp, B_INVAL, 0, kcred); 745 } 746 } 747 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 748 if (unlocked_bytes > 0) 749 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0); 750 mutex_exit(&sp->shm_mlock); 751 } 752 if (root != 0 || curnpgs != 0) { 753 panic("segspt_free_pages: bad large page"); 754 /*NOTREACHED*/ 755 } 756 757 /* 758 * mark that pages have been released 759 */ 760 sptd->spt_realsize = 0; 761 762 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) { 763 atomic_add_long(&spt_used, -npages); 764 anon_swap_restore(npages); 765 } 766 } 767 768 /* 769 * Get memory allocation policy info for specified address in given segment 770 */ 771 static lgrp_mem_policy_info_t * 772 segspt_getpolicy(struct seg *seg, caddr_t addr) 773 { 774 struct anon_map *amp; 775 ulong_t anon_index; 776 lgrp_mem_policy_info_t *policy_info; 777 struct spt_data *spt_data; 778 779 ASSERT(seg != NULL); 780 781 /* 782 * Get anon_map from segspt 783 * 784 * Assume that no lock needs to be held on anon_map, since 785 * it should be protected by its reference count which must be 786 * nonzero for an existing segment 787 * Need to grab readers lock on policy tree though 788 */ 789 spt_data = (struct spt_data *)seg->s_data; 790 if (spt_data == NULL) 791 return (NULL); 792 amp = spt_data->spt_amp; 793 ASSERT(amp->refcnt != 0); 794 795 /* 796 * Get policy info 797 * 798 * Assume starting anon index of 0 799 */ 800 anon_index = seg_page(seg, addr); 801 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0); 802 803 return (policy_info); 804 } 805 806 /* 807 * DISM only. 808 * Return locked pages over a given range. 809 * 810 * We will cache all DISM locked pages and save the pplist for the 811 * entire segment in the ppa field of the underlying DISM segment structure. 812 * Later, during a call to segspt_reclaim() we will use this ppa array 813 * to page_unlock() all of the pages and then we will free this ppa list. 814 */ 815 /*ARGSUSED*/ 816 static int 817 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len, 818 struct page ***ppp, enum lock_type type, enum seg_rw rw) 819 { 820 struct shm_data *shmd = (struct shm_data *)seg->s_data; 821 struct seg *sptseg = shmd->shm_sptseg; 822 struct spt_data *sptd = sptseg->s_data; 823 pgcnt_t pg_idx, npages, tot_npages, npgs; 824 struct page **pplist, **pl, **ppa, *pp; 825 struct anon_map *amp; 826 spgcnt_t an_idx; 827 int ret = ENOTSUP; 828 uint_t pl_built = 0; 829 struct anon *ap; 830 struct vnode *vp; 831 u_offset_t off; 832 pgcnt_t claim_availrmem = 0; 833 uint_t szc; 834 835 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 836 837 /* 838 * We want to lock/unlock the entire ISM segment. Therefore, 839 * we will be using the underlying sptseg and it's base address 840 * and length for the caching arguments. 841 */ 842 ASSERT(sptseg); 843 ASSERT(sptd); 844 845 pg_idx = seg_page(seg, addr); 846 npages = btopr(len); 847 848 /* 849 * check if the request is larger than number of pages covered 850 * by amp 851 */ 852 if (pg_idx + npages > btopr(sptd->spt_amp->size)) { 853 *ppp = NULL; 854 return (ENOTSUP); 855 } 856 857 if (type == L_PAGEUNLOCK) { 858 ASSERT(sptd->spt_ppa != NULL); 859 860 seg_pinactive(seg, seg->s_base, sptd->spt_amp->size, 861 sptd->spt_ppa, sptd->spt_prot, segspt_reclaim); 862 863 /* 864 * If someone is blocked while unmapping, we purge 865 * segment page cache and thus reclaim pplist synchronously 866 * without waiting for seg_pasync_thread. This speeds up 867 * unmapping in cases where munmap(2) is called, while 868 * raw async i/o is still in progress or where a thread 869 * exits on data fault in a multithreaded application. 870 */ 871 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) { 872 segspt_purge(seg); 873 } 874 return (0); 875 } else if (type == L_PAGERECLAIM) { 876 ASSERT(sptd->spt_ppa != NULL); 877 (void) segspt_reclaim(seg, seg->s_base, sptd->spt_amp->size, 878 sptd->spt_ppa, sptd->spt_prot); 879 return (0); 880 } 881 882 if (sptd->spt_flags & DISM_PPA_CHANGED) { 883 segspt_purge(seg); 884 /* 885 * for DISM ppa needs to be rebuild since 886 * number of locked pages could be changed 887 */ 888 *ppp = NULL; 889 return (ENOTSUP); 890 } 891 892 /* 893 * First try to find pages in segment page cache, without 894 * holding the segment lock. 895 */ 896 pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, 897 sptd->spt_prot); 898 if (pplist != NULL) { 899 ASSERT(sptd->spt_ppa != NULL); 900 ASSERT(sptd->spt_ppa == pplist); 901 ppa = sptd->spt_ppa; 902 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { 903 if (ppa[an_idx] == NULL) { 904 seg_pinactive(seg, seg->s_base, 905 sptd->spt_amp->size, ppa, 906 sptd->spt_prot, segspt_reclaim); 907 *ppp = NULL; 908 return (ENOTSUP); 909 } 910 if ((szc = ppa[an_idx]->p_szc) != 0) { 911 npgs = page_get_pagecnt(szc); 912 an_idx = P2ROUNDUP(an_idx + 1, npgs); 913 } else { 914 an_idx++; 915 } 916 } 917 /* 918 * Since we cache the entire DISM segment, we want to 919 * set ppp to point to the first slot that corresponds 920 * to the requested addr, i.e. pg_idx. 921 */ 922 *ppp = &(sptd->spt_ppa[pg_idx]); 923 return (0); 924 } 925 926 /* The L_PAGELOCK case... */ 927 mutex_enter(&sptd->spt_lock); 928 /* 929 * try to find pages in segment page cache with mutex 930 */ 931 pplist = seg_plookup(seg, seg->s_base, sptd->spt_amp->size, 932 sptd->spt_prot); 933 if (pplist != NULL) { 934 ASSERT(sptd->spt_ppa != NULL); 935 ASSERT(sptd->spt_ppa == pplist); 936 ppa = sptd->spt_ppa; 937 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) { 938 if (ppa[an_idx] == NULL) { 939 mutex_exit(&sptd->spt_lock); 940 seg_pinactive(seg, seg->s_base, 941 sptd->spt_amp->size, ppa, 942 sptd->spt_prot, segspt_reclaim); 943 *ppp = NULL; 944 return (ENOTSUP); 945 } 946 if ((szc = ppa[an_idx]->p_szc) != 0) { 947 npgs = page_get_pagecnt(szc); 948 an_idx = P2ROUNDUP(an_idx + 1, npgs); 949 } else { 950 an_idx++; 951 } 952 } 953 /* 954 * Since we cache the entire DISM segment, we want to 955 * set ppp to point to the first slot that corresponds