1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "@(#)vm_anon.c 1.196 07/12/10 SMI" 40 41 /* 42 * VM - anonymous pages. 43 * 44 * This layer sits immediately above the vm_swap layer. It manages 45 * physical pages that have no permanent identity in the file system 46 * name space, using the services of the vm_swap layer to allocate 47 * backing storage for these pages. Since these pages have no external 48 * identity, they are discarded when the last reference is removed. 
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time, where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 82 * maintain arrays of pointers to anon structs rather than maintaining 83 * anon structs themselves. The (struct anon **) arguments mentioned 84 * above are pointers to entries in these arrays. It is these arrays 85 * that capture the mapping between offsets within a given segment and 86 * the corresponding anonymous backing storage address. 87 */ 88 89 #ifdef DEBUG 90 #define ANON_DEBUG 91 #endif 92 93 #include <sys/types.h> 94 #include <sys/t_lock.h> 95 #include <sys/param.h> 96 #include <sys/systm.h> 97 #include <sys/mman.h> 98 #include <sys/cred.h> 99 #include <sys/thread.h> 100 #include <sys/vnode.h> 101 #include <sys/cpuvar.h> 102 #include <sys/swap.h> 103 #include <sys/cmn_err.h> 104 #include <sys/vtrace.h> 105 #include <sys/kmem.h> 106 #include <sys/sysmacros.h> 107 #include <sys/bitmap.h> 108 #include <sys/vmsystm.h> 109 #include <sys/debug.h> 110 #include <sys/fs/swapnode.h> 111 #include <sys/tnf_probe.h> 112 #include <sys/lgrp.h> 113 #include <sys/policy.h> 114 #include <sys/condvar_impl.h> 115 #include <sys/mutex_impl.h> 116 #include <sys/rctl.h> 117 118 #include <vm/as.h> 119 #include <vm/hat.h> 120 #include <vm/anon.h> 121 #include <vm/page.h> 122 #include <vm/vpage.h> 123 #include <vm/seg.h> 124 #include <vm/rm.h> 125 126 #include <fs/fs_subr.h> 127 128 struct vnode *anon_vp; 129 130 int anon_debug; 131 132 kmutex_t anoninfo_lock; 133 struct k_anoninfo k_anoninfo; 134 ani_free_t ani_free_pool[ANI_MAX_POOL]; 135 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 136 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 137 138 /* 139 * Global hash table for (vp, off) -> anon slot 140 */ 141 extern int swap_maxcontig; 142 size_t anon_hash_size; 143 struct anon **anon_hash; 144 145 static struct kmem_cache *anon_cache; 146 static struct kmem_cache *anonmap_cache; 147 148 #ifdef VM_STATS 149 static struct anonvmstats_str { 150 ulong_t getpages[30]; 151 ulong_t privatepages[10]; 152 ulong_t demotepages[9]; 153 ulong_t decrefpages[9]; 154 ulong_t 
dupfillholes[4]; 155 ulong_t freepages[1]; 156 } anonvmstats; 157 #endif /* VM_STATS */ 158 159 160 /*ARGSUSED*/ 161 static int 162 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 163 { 164 struct anon_map *amp = buf; 165 166 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 167 return (0); 168 } 169 170 /*ARGSUSED1*/ 171 static void 172 anonmap_cache_destructor(void *buf, void *cdrarg) 173 { 174 struct anon_map *amp = buf; 175 176 rw_destroy(&->a_rwlock); 177 } 178 179 kmutex_t anonhash_lock[AH_LOCK_SIZE]; 180 kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; 181 182 void 183 anon_init(void) 184 { 185 int i; 186 187 anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); 188 189 for (i = 0; i < AH_LOCK_SIZE; i++) { 190 mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); 191 mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 192 } 193 194 for (i = 0; i < ANON_LOCKSIZE; i++) { 195 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 196 MUTEX_DEFAULT, NULL); 197 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 198 } 199 200 anon_hash = (struct anon **) 201 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 202 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 203 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); 204 anonmap_cache = kmem_cache_create("anonmap_cache", 205 sizeof (struct anon_map), 0, 206 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 207 NULL, NULL, 0); 208 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 209 210 anon_vp = vn_alloc(KM_SLEEP); 211 vn_setops(anon_vp, swap_vnodeops); 212 anon_vp->v_type = VREG; 213 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 214 } 215 216 /* 217 * Global anon slot hash table manipulation. 
218 */ 219 220 static void 221 anon_addhash(struct anon *ap) 222 { 223 int index; 224 225 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 226 index = ANON_HASH(ap->an_vp, ap->an_off); 227 ap->an_hash = anon_hash[index]; 228 anon_hash[index] = ap; 229 } 230 231 static void 232 anon_rmhash(struct anon *ap) 233 { 234 struct anon **app; 235 236 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 237 238 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 239 *app; app = &((*app)->an_hash)) { 240 if (*app == ap) { 241 *app = ap->an_hash; 242 break; 243 } 244 } 245 } 246 247 /* 248 * The anon array interfaces. Functions allocating, 249 * freeing array of pointers, and returning/setting 250 * entries in the array of pointers for a given offset. 251 * 252 * Create the list of pointers 253 */ 254 struct anon_hdr * 255 anon_create(pgcnt_t npages, int flags) 256 { 257 struct anon_hdr *ahp; 258 ulong_t nchunks; 259 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 260 261 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 262 return (NULL); 263 } 264 265 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 266 /* 267 * Single level case. 268 */ 269 ahp->size = npages; 270 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 271 272 if (flags & ANON_ALLOC_FORCE) 273 ahp->flags |= ANON_ALLOC_FORCE; 274 275 ahp->array_chunk = kmem_zalloc( 276 ahp->size * sizeof (struct anon *), kmemflags); 277 278 if (ahp->array_chunk == NULL) { 279 kmem_free(ahp, sizeof (struct anon_hdr)); 280 return (NULL); 281 } 282 } else { 283 /* 284 * 2 Level case. 285 * anon hdr size needs to be rounded off to be a multiple 286 * of ANON_CHUNK_SIZE. This is important as various anon 287 * related functions depend on this. 288 * NOTE - 289 * anon_grow() makes anon hdr size a multiple of 290 * ANON_CHUNK_SIZE. 291 * amp size is <= anon hdr size. 292 * anon_index + seg_pgs <= anon hdr size. 
293 */ 294 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 295 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 296 297 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 298 kmemflags); 299 300 if (ahp->array_chunk == NULL) { 301 kmem_free(ahp, sizeof (struct anon_hdr)); 302 return (NULL); 303 } 304 } 305 return (ahp); 306 } 307 308 /* 309 * Free the array of pointers 310 */ 311 void 312 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 313 { 314 ulong_t i; 315 void **ppp; 316 ulong_t nchunks; 317 318 ASSERT(npages <= ahp->size); 319 320 /* 321 * Single level case. 322 */ 323 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 324 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 325 } else { 326 /* 327 * 2 level case. 328 */ 329 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 330 for (i = 0; i < nchunks; i++) { 331 ppp = &ahp->array_chunk[i]; 332 if (*ppp != NULL) 333 kmem_free(*ppp, PAGESIZE); 334 } 335 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 336 } 337 mutex_destroy(&ahp->serial_lock); 338 kmem_free(ahp, sizeof (struct anon_hdr)); 339 } 340 341 /* 342 * Return the pointer from the list for a 343 * specified anon index. 344 */ 345 struct anon * 346 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 347 { 348 struct anon **app; 349 350 ASSERT(an_idx < ahp->size); 351 352 /* 353 * Single level case. 354 */ 355 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 356 return ((struct anon *) 357 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 358 } else { 359 360 /* 361 * 2 level case. 362 */ 363 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 364 if (app) { 365 return ((struct anon *) 366 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 367 ANON_PTRMASK)); 368 } else { 369 return (NULL); 370 } 371 } 372 } 373 374 /* 375 * Return the anon pointer for the first valid entry in the anon list, 376 * starting from the given index. 
377 */ 378 struct anon * 379 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 380 { 381 struct anon *ap; 382 struct anon **app; 383 ulong_t chunkoff; 384 ulong_t i; 385 ulong_t j; 386 pgcnt_t size; 387 388 i = *index; 389 size = ahp->size; 390 391 ASSERT(i < size); 392 393 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 394 /* 395 * 1 level case 396 */ 397 while (i < size) { 398 ap = (struct anon *) 399 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 400 if (ap) { 401 *index = i; 402 return (ap); 403 } 404 i++; 405 } 406 } else { 407 /* 408 * 2 level case 409 */ 410 chunkoff = i & ANON_CHUNK_OFF; 411 while (i < size) { 412 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 413 if (app) 414 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 415 ap = (struct anon *) 416 ((uintptr_t)app[j] & ANON_PTRMASK); 417 if (ap) { 418 *index = i + (j - chunkoff); 419 return (ap); 420 } 421 } 422 chunkoff = 0; 423 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 424 } 425 } 426 *index = size; 427 return (NULL); 428 } 429 430 /* 431 * Set list entry with a given pointer for a specified offset 432 */ 433 int 434 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 435 { 436 void **ppp; 437 struct anon **app; 438 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 439 uintptr_t *ap_addr; 440 441 ASSERT(an_idx < ahp->size); 442 443 /* 444 * Single level case. 445 */ 446 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 447 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 448 } else { 449 450 /* 451 * 2 level case. 
452 */ 453 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 454 455 ASSERT(ppp != NULL); 456 if (*ppp == NULL) { 457 mutex_enter(&ahp->serial_lock); 458 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 459 if (*ppp == NULL) { 460 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 461 if (*ppp == NULL) { 462 mutex_exit(&ahp->serial_lock); 463 return (ENOMEM); 464 } 465 } 466 mutex_exit(&ahp->serial_lock); 467 } 468 app = *ppp; 469 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 470 } 471 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 472 return (0); 473 } 474 475 /* 476 * Copy anon array into a given new anon array 477 */ 478 int 479 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 480 struct anon_hdr *dahp, ulong_t d_idx, 481 pgcnt_t npages, int flags) 482 { 483 void **sapp, **dapp; 484 void *ap; 485 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 486 487 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 488 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 489 490 /* 491 * Both arrays are 1 level. 492 */ 493 if (((sahp->size <= ANON_CHUNK_SIZE) && 494 (dahp->size <= ANON_CHUNK_SIZE)) || 495 ((sahp->flags & ANON_ALLOC_FORCE) && 496 (dahp->flags & ANON_ALLOC_FORCE))) { 497 498 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 499 npages * sizeof (struct anon *)); 500 return (0); 501 } 502 503 /* 504 * Both arrays are 2 levels. 
505 */ 506 if (sahp->size > ANON_CHUNK_SIZE && 507 dahp->size > ANON_CHUNK_SIZE && 508 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 509 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 510 511 ulong_t sapidx, dapidx; 512 ulong_t *sap, *dap; 513 ulong_t chknp; 514 515 while (npages != 0) { 516 517 sapidx = s_idx & ANON_CHUNK_OFF; 518 dapidx = d_idx & ANON_CHUNK_OFF; 519 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 520 if (chknp > npages) 521 chknp = npages; 522 523 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 524 if ((sap = *sapp) != NULL) { 525 dapp = &dahp->array_chunk[d_idx 526 >> ANON_CHUNK_SHIFT]; 527 if ((dap = *dapp) == NULL) { 528 *dapp = kmem_zalloc(PAGESIZE, 529 kmemflags); 530 if ((dap = *dapp) == NULL) 531 return (ENOMEM); 532 } 533 bcopy((sap + sapidx), (dap + dapidx), 534 chknp << ANON_PTRSHIFT); 535 } 536 s_idx += chknp; 537 d_idx += chknp; 538 npages -= chknp; 539 } 540 return (0); 541 } 542 543 /* 544 * At least one of the arrays is 2 level. 545 */ 546 while (npages--) { 547 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 548 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 549 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 550 return (ENOMEM); 551 } 552 s_idx++; 553 d_idx++; 554 } 555 return (0); 556 } 557 558 559 /* 560 * ANON_INITBUF is a convenience macro for anon_grow() below. It 561 * takes a buffer dst, which is at least as large as buffer src. It 562 * does a bcopy from src into dst, and then bzeros the extra bytes 563 * of dst. If tail is set, the data in src is tail aligned within 564 * dst instead of head aligned. 
565 */ 566 567 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 568 if (tail) { \ 569 bzero((dst), (dstsize) - (srclen)); \ 570 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 571 } else { \ 572 bcopy((src), (dst), (srclen)); \ 573 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 574 } 575 576 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 577 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 578 579 /* 580 * anon_grow() is used to efficiently extend an existing anon array. 581 * startidx_p points to the index into the anon array of the first page 582 * that is in use. oldseg_pgs is the number of pages in use, starting at 583 * *startidx_p. newpages is the number of additional pages desired. 584 * 585 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 586 * 587 * The growth is done by creating a new top level of the anon array, 588 * and (if the array is 2-level) reusing the existing second level arrays. 589 * 590 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 591 * 592 * Returns the new number of pages in the anon array. 593 */ 594 pgcnt_t 595 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 596 pgcnt_t newseg_pgs, int flags) 597 { 598 ulong_t startidx = startidx_p ? *startidx_p : 0; 599 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 600 pgcnt_t oelems, nelems, totpages; 601 void **level1; 602 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 603 int growdown = (flags & ANON_GROWDOWN); 604 size_t newarrsz, oldarrsz; 605 void *level2; 606 607 ASSERT(!(startidx_p == NULL && growdown)); 608 ASSERT(startidx + oldseg_pgs <= ahp->size); 609 610 /* 611 * Determine the total number of pages needed in the new 612 * anon array. If growing down, totpages is all pages from 613 * startidx through the end of the array, plus <newseg_pgs> 614 * pages. If growing up, keep all pages from page 0 through 615 * the last page currently in use, plus <newseg_pgs> pages. 
616 */ 617 if (growdown) 618 totpages = oldamp_pgs - startidx + newseg_pgs; 619 else 620 totpages = startidx + oldseg_pgs + newseg_pgs; 621 622 /* If the array is already large enough, just return. */ 623 624 if (oldamp_pgs >= totpages) { 625 if (growdown) 626 *startidx_p = oldamp_pgs - totpages; 627 return (oldamp_pgs); 628 } 629 630 /* 631 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 632 * by the corresponding arrays. 633 * oelems/nelems are the number of pointers in the top level arrays 634 * which may be either level 1 or level 2. 635 * Will the new anon array be one level or two levels? 636 */ 637 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 638 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 639 oelems = oldamp_pgs; 640 nelems = newamp_pgs; 641 } else { 642 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 643 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 644 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 645 } 646 647 newarrsz = nelems * sizeof (void *); 648 level1 = kmem_alloc(newarrsz, kmemflags); 649 if (level1 == NULL) 650 return (0); 651 652 /* Are we converting from a one level to a two level anon array? */ 653 654 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 655 !(ahp->flags & ANON_ALLOC_FORCE)) { 656 657 /* 658 * Yes, we're converting to a two level. Reuse old level 1 659 * as new level 2 if it is exactly PAGESIZE. Otherwise 660 * alloc a new level 2 and copy the old level 1 data into it. 
661 */ 662 if (oldamp_pgs == ANON_CHUNK_SIZE) { 663 level2 = (void *)ahp->array_chunk; 664 } else { 665 level2 = kmem_alloc(PAGESIZE, kmemflags); 666 if (level2 == NULL) { 667 kmem_free(level1, newarrsz); 668 return (0); 669 } 670 oldarrsz = oldamp_pgs * sizeof (void *); 671 672 ANON_INITBUF(ahp->array_chunk, oldarrsz, 673 level2, PAGESIZE, growdown); 674 kmem_free(ahp->array_chunk, oldarrsz); 675 } 676 bzero(level1, newarrsz); 677 if (growdown) 678 level1[nelems - 1] = level2; 679 else 680 level1[0] = level2; 681 } else { 682 oldarrsz = oelems * sizeof (void *); 683 684 ANON_INITBUF(ahp->array_chunk, oldarrsz, 685 level1, newarrsz, growdown); 686 kmem_free(ahp->array_chunk, oldarrsz); 687 } 688 689 ahp->array_chunk = level1; 690 ahp->size = newamp_pgs; 691 if (growdown) 692 *startidx_p = newamp_pgs - totpages; 693 694 return (newamp_pgs); 695 } 696 697 698 /* 699 * Called from clock handler to sync ani_free value. 700 */ 701 702 void 703 set_anoninfo(void) 704 { 705 int ix; 706 pgcnt_t total = 0; 707 708 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 709 total += ani_free_pool[ix].ani_count; 710 } 711 k_anoninfo.ani_free = total; 712 } 713 714 /* 715 * Reserve anon space. 716 * 717 * It's no longer simply a matter of incrementing ani_resv to 718 * reserve swap space, we need to check memory-based as well 719 * as disk-backed (physical) swap. The following algorithm 720 * is used: 721 * Check the space on physical swap 722 * i.e. amount needed < ani_max - ani_phys_resv 723 * If we are swapping on swapfs check 724 * amount needed < (availrmem - swapfs_minfree) 725 * Since the algorithm to check for the quantity of swap space is 726 * almost the same as that for reserving it, we'll just use anon_resvmem 727 * with a flag to decrement availrmem. 728 * 729 * Return non-zero on success. 
730 */ 731 int 732 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 733 { 734 pgcnt_t npages = btopr(size); 735 pgcnt_t mswap_pages = 0; 736 pgcnt_t pswap_pages = 0; 737 proc_t *p = curproc; 738 739 if (zone != NULL && takemem) { 740 /* test zone.max-swap resource control */ 741 mutex_enter(&p->p_lock); 742 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 743 mutex_exit(&p->p_lock); 744 return (0); 745 } 746 mutex_exit(&p->p_lock); 747 } 748 mutex_enter(&anoninfo_lock); 749 750 /* 751 * pswap_pages is the number of pages we can take from 752 * physical (i.e. disk-backed) swap. 753 */ 754 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 755 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 756 757 ANON_PRINT(A_RESV, 758 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 759 npages, takemem, pswap_pages, (void *)caller())); 760 761 if (npages <= pswap_pages) { 762 /* 763 * we have enough space on a physical swap 764 */ 765 if (takemem) 766 k_anoninfo.ani_phys_resv += npages; 767 mutex_exit(&anoninfo_lock); 768 return (1); 769 } else if (pswap_pages != 0) { 770 /* 771 * we have some space on a physical swap 772 */ 773 if (takemem) { 774 /* 775 * use up remainder of phys swap 776 */ 777 k_anoninfo.ani_phys_resv += pswap_pages; 778 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 779 } 780 } 781 /* 782 * since (npages > pswap_pages) we need mem swap 783 * mswap_pages is the number of pages needed from availrmem 784 */ 785 ASSERT(npages > pswap_pages); 786 mswap_pages = npages - pswap_pages; 787 788 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 789 mswap_pages)); 790 791 /* 792 * priv processes can reserve memory as swap as long as availrmem 793 * remains greater than swapfs_minfree; in the case of non-priv 794 * processes, memory can be reserved as swap only if availrmem 795 * doesn't fall below (swapfs_minfree + swapfs_reserve). 
Thus, 796 * swapfs_reserve amount of memswap is not available to non-priv 797 * processes. This protects daemons such as automounter dying 798 * as a result of application processes eating away almost entire 799 * membased swap. This safeguard becomes useless if apps are run 800 * with root access. 801 * 802 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 803 * 804 */ 805 if (tryhard) { 806 mutex_exit(&anoninfo_lock); 807 (void) page_reclaim_mem(mswap_pages, 808 swapfs_minfree + swapfs_reserve, 0); 809 mutex_enter(&anoninfo_lock); 810 } 811 812 mutex_enter(&freemem_lock); 813 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 814 (availrmem > (swapfs_minfree + mswap_pages) && 815 secpolicy_resource(CRED()) == 0)) { 816 817 if (takemem) { 818 /* 819 * Take the memory from the rest of the system. 820 */ 821 availrmem -= mswap_pages; 822 mutex_exit(&freemem_lock); 823 k_anoninfo.ani_mem_resv += mswap_pages; 824 ANI_ADD(mswap_pages); 825 ANON_PRINT((A_RESV | A_MRESV), 826 ("anon_resvmem: took %ld pages of availrmem\n", 827 mswap_pages)); 828 } else { 829 mutex_exit(&freemem_lock); 830 } 831 832 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 833 mutex_exit(&anoninfo_lock); 834 return (1); 835 836 } else { 837 /* 838 * Fail if not enough memory 839 */ 840 841 if (takemem) { 842 k_anoninfo.ani_phys_resv -= pswap_pages; 843 } 844 845 mutex_exit(&freemem_lock); 846 mutex_exit(&anoninfo_lock); 847 ANON_PRINT(A_RESV, 848 ("anon_resvmem: not enough space from swapfs\n")); 849 if (zone != NULL && takemem) 850 rctl_decr_swap(zone, ptob(npages)); 851 return (0); 852 } 853 } 854 855 /* 856 * Give back an anon reservation. 
857 */ 858 void 859 anon_unresvmem(size_t size, zone_t *zone) 860 { 861 pgcnt_t npages = btopr(size); 862 spgcnt_t mem_free_pages = 0; 863 pgcnt_t phys_free_slots; 864 #ifdef ANON_DEBUG 865 pgcnt_t mem_resv; 866 #endif 867 if (zone != NULL) 868 rctl_decr_swap(zone, ptob(npages)); 869 870 mutex_enter(&anoninfo_lock); 871 872 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 873 /* 874 * If some of this reservation belonged to swapfs 875 * give it back to availrmem. 876 * ani_mem_resv is the amount of availrmem swapfs has reserved. 877 * but some of that memory could be locked by segspt so we can only 878 * return non locked ani_mem_resv back to availrmem 879 */ 880 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 881 ANON_PRINT((A_RESV | A_MRESV), 882 ("anon_unresv: growing availrmem by %ld pages\n", 883 MIN(k_anoninfo.ani_mem_resv, npages))); 884 885 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 886 k_anoninfo.ani_locked_swap), npages); 887 mutex_enter(&freemem_lock); 888 availrmem += mem_free_pages; 889 mutex_exit(&freemem_lock); 890 k_anoninfo.ani_mem_resv -= mem_free_pages; 891 892 ANI_ADD(-mem_free_pages); 893 } 894 /* 895 * The remainder of the pages is returned to phys swap 896 */ 897 ASSERT(npages >= mem_free_pages); 898 phys_free_slots = npages - mem_free_pages; 899 900 if (phys_free_slots) { 901 k_anoninfo.ani_phys_resv -= phys_free_slots; 902 } 903 904 #ifdef ANON_DEBUG 905 mem_resv = k_anoninfo.ani_mem_resv; 906 #endif 907 908 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 909 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 910 911 mutex_exit(&anoninfo_lock); 912 913 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 914 npages, mem_resv, (void *)caller())); 915 } 916 917 /* 918 * Allocate an anon slot and return it with the lock held. 
919 */ 920 struct anon * 921 anon_alloc(struct vnode *vp, anoff_t off) 922 { 923 struct anon *ap; 924 kmutex_t *ahm; 925 926 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 927 if (vp == NULL) { 928 swap_alloc(ap); 929 } else { 930 ap->an_vp = vp; 931 ap->an_off = off; 932 } 933 ap->an_refcnt = 1; 934 ap->an_pvp = NULL; 935 ap->an_poff = 0; 936 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 937 mutex_enter(ahm); 938 anon_addhash(ap); 939 mutex_exit(ahm); 940 ANI_ADD(-1); 941 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 942 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 943 return (ap); 944 } 945 946 /* 947 * Decrement the reference count of an anon page. 948 * If reference count goes to zero, free it and 949 * its associated page (if any). 950 */ 951 void 952 anon_decref(struct anon *ap) 953 { 954 page_t *pp; 955 struct vnode *vp; 956 anoff_t off; 957 kmutex_t *ahm; 958 959 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 960 mutex_enter(ahm); 961 ASSERT(ap->an_refcnt != 0); 962 if (ap->an_refcnt == 0) 963 panic("anon_decref: slot count 0"); 964 if (--ap->an_refcnt == 0) { 965 swap_xlate(ap, &vp, &off); 966 anon_rmhash(ap); 967 if (ap->an_pvp != NULL) 968 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 969 mutex_exit(ahm); 970 971 /* 972 * If there is a page for this anon slot we will need to 973 * call VN_DISPOSE to get rid of the vp association and 974 * put the page back on the free list as really free. 975 * Acquire the "exclusive" lock to ensure that any 976 * pending i/o always completes before the swap slot 977 * is freed. 
978 */ 979 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 980 if (pp != NULL) { 981 /*LINTED: constant in conditional context */ 982 VN_DISPOSE(pp, B_INVAL, 0, kcred); 983 } 984 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 985 (void *)ap, (void *)ap->an_vp)); 986 987 kmem_cache_free(anon_cache, ap); 988 989 ANI_ADD(1); 990 } else { 991 mutex_exit(ahm); 992 } 993 } 994 995 996 /* 997 * check an_refcnt of the root anon slot (anon_index argument is aligned at 998 * seg->s_szc level) to determine whether COW processing is required. 999 * anonpages_hash_lock[] held on the root ap ensures that if root's 1000 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1001 * later since this process can't fork while its AS lock is held). 1002 * 1003 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 1004 */ 1005 int 1006 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1007 { 1008 struct anon *ap; 1009 kmutex_t *ahmpages = NULL; 1010 1011 ap = anon_get_ptr(ahp, anon_index); 1012 if (ap == NULL) 1013 return (0); 1014 1015 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1016 mutex_enter(ahmpages); 1017 ASSERT(ap->an_refcnt >= 1); 1018 if (ap->an_refcnt == 1) { 1019 mutex_exit(ahmpages); 1020 return (0); 1021 } 1022 mutex_exit(ahmpages); 1023 return (1); 1024 } 1025 /* 1026 * Check 'nslots' anon slots for refcnt > 1. 1027 * 1028 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1029 * returns 0. 
1030 */ 1031 static int 1032 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 1033 { 1034 struct anon *ap; 1035 1036 while (nslots-- > 0) { 1037 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 1038 ap->an_refcnt > 1) 1039 return (1); 1040 anon_index++; 1041 } 1042 1043 return (0); 1044 } 1045 1046 static void 1047 anon_decref_pages( 1048 struct anon_hdr *ahp, 1049 ulong_t an_idx, 1050 uint_t szc) 1051 { 1052 struct anon *ap = anon_get_ptr(ahp, an_idx); 1053 kmutex_t *ahmpages = NULL; 1054 page_t *pp; 1055 pgcnt_t pgcnt = page_get_pagecnt(szc); 1056 pgcnt_t i; 1057 struct vnode *vp; 1058 anoff_t off; 1059 kmutex_t *ahm; 1060 #ifdef DEBUG 1061 int refcnt = 1; 1062 #endif 1063 1064 ASSERT(szc != 0); 1065 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1066 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1067 ASSERT(an_idx < ahp->size); 1068 1069 if (ahp->size - an_idx < pgcnt) { 1070 /* 1071 * In case of shared mappings total anon map size may not be 1072 * the largest page size aligned. 1073 */ 1074 pgcnt = ahp->size - an_idx; 1075 } 1076 1077 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1078 1079 if (ap != NULL) { 1080 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1081 mutex_enter(ahmpages); 1082 ASSERT((refcnt = ap->an_refcnt) != 0); 1083 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1084 if (ap->an_refcnt == 1) { 1085 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1086 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1087 mutex_exit(ahmpages); 1088 ahmpages = NULL; 1089 } 1090 } 1091 1092 i = 0; 1093 while (i < pgcnt) { 1094 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1095 ASSERT(refcnt == 1 && ahmpages ==