Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 /*
     30  * University Copyright- Copyright (c) 1982, 1986, 1988
     31  * The Regents of the University of California
     32  * All Rights Reserved
     33  *
     34  * University Acknowledgment- Portions of this document are derived from
     35  * software developed by the University of California, Berkeley, and its
     36  * contributors.
     37  */
     38 
     39 #pragma ident	"@(#)vm_anon.c	1.196	07/12/10 SMI"
     40 
     41 /*
     42  * VM - anonymous pages.
     43  *
     44  * This layer sits immediately above the vm_swap layer.  It manages
     45  * physical pages that have no permanent identity in the file system
     46  * name space, using the services of the vm_swap layer to allocate
     47  * backing storage for these pages.  Since these pages have no external
     48  * identity, they are discarded when the last reference is removed.
     49  *
     50  * An important function of this layer is to manage low-level sharing
     51  * of pages that are logically distinct but that happen to be
     52  * physically identical (e.g., the corresponding pages of the processes
     53  * resulting from a fork before one process or the other changes their
     54  * contents).  This pseudo-sharing is present only as an optimization
     55  * and is not to be confused with true sharing in which multiple
     56  * address spaces deliberately contain references to the same object;
     57  * such sharing is managed at a higher level.
     58  *
     59  * The key data structure here is the anon struct, which contains a
     60  * reference count for its associated physical page and a hint about
     61  * the identity of that page.  Anon structs typically live in arrays,
     62  * with an instance's position in its array determining where the
     63  * corresponding backing storage is allocated; however, the swap_xlate()
     64  * routine abstracts away this representation information so that the
     65  * rest of the anon layer need not know it.  (See the swap layer for
     66  * more details on anon struct layout.)
     67  *
     68  * In future versions of the system, the association between an
     69  * anon struct and its position on backing store will change so that
     70  * we don't require backing store for all anonymous pages in the system.
     71  * This is an important consideration for large memory systems.
     72  * We can also use this technique to delay binding physical locations
     73  * to anonymous pages until pageout/swapout time where we can make
     74  * smarter allocation decisions to improve anonymous klustering.
     75  *
     76  * Many of the routines defined here take a (struct anon **) argument,
     77  * which allows the code at this level to manage anon pages directly,
     78  * so that callers can regard anon structs as opaque objects and not be
     79  * concerned with assigning or inspecting their contents.
     80  *
     81  * Clients of this layer refer to anon pages indirectly.  That is, they
     82  * maintain arrays of pointers to anon structs rather than maintaining
     83  * anon structs themselves.  The (struct anon **) arguments mentioned
     84  * above are pointers to entries in these arrays.  It is these arrays
     85  * that capture the mapping between offsets within a given segment and
     86  * the corresponding anonymous backing storage address.
     87  */
     88 
     89 #ifdef DEBUG
     90 #define	ANON_DEBUG
     91 #endif
     92 
     93 #include <sys/types.h>
     94 #include <sys/t_lock.h>
     95 #include <sys/param.h>
     96 #include <sys/systm.h>
     97 #include <sys/mman.h>
     98 #include <sys/cred.h>
     99 #include <sys/thread.h>
    100 #include <sys/vnode.h>
    101 #include <sys/cpuvar.h>
    102 #include <sys/swap.h>
    103 #include <sys/cmn_err.h>
    104 #include <sys/vtrace.h>
    105 #include <sys/kmem.h>
    106 #include <sys/sysmacros.h>
    107 #include <sys/bitmap.h>
    108 #include <sys/vmsystm.h>
    109 #include <sys/debug.h>
    110 #include <sys/fs/swapnode.h>
    111 #include <sys/tnf_probe.h>
    112 #include <sys/lgrp.h>
    113 #include <sys/policy.h>
    114 #include <sys/condvar_impl.h>
    115 #include <sys/mutex_impl.h>
    116 #include <sys/rctl.h>
    117 
    118 #include <vm/as.h>
    119 #include <vm/hat.h>
    120 #include <vm/anon.h>
    121 #include <vm/page.h>
    122 #include <vm/vpage.h>
    123 #include <vm/seg.h>
    124 #include <vm/rm.h>
    125 
    126 #include <fs/fs_subr.h>
    127 
    128 struct vnode *anon_vp;
    129 
    130 int anon_debug;
    131 
    132 kmutex_t	anoninfo_lock;
    133 struct		k_anoninfo k_anoninfo;
    134 ani_free_t	ani_free_pool[ANI_MAX_POOL];
    135 pad_mutex_t	anon_array_lock[ANON_LOCKSIZE];
    136 kcondvar_t	anon_array_cv[ANON_LOCKSIZE];
    137 
    138 /*
    139  * Global hash table for (vp, off) -> anon slot
    140  */
    141 extern	int swap_maxcontig;
    142 size_t	anon_hash_size;
    143 struct anon **anon_hash;
    144 
    145 static struct kmem_cache *anon_cache;
    146 static struct kmem_cache *anonmap_cache;
    147 
    148 #ifdef VM_STATS
    149 static struct anonvmstats_str {
    150 	ulong_t getpages[30];
    151 	ulong_t privatepages[10];
    152 	ulong_t demotepages[9];
    153 	ulong_t decrefpages[9];
    154 	ulong_t	dupfillholes[4];
    155 	ulong_t freepages[1];
    156 } anonvmstats;
    157 #endif /* VM_STATS */
    158 
    159 
    160 /*ARGSUSED*/
    161 static int
    162 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
    163 {
    164 	struct anon_map *amp = buf;
    165 
    166 	rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
    167 	return (0);
    168 }
    169 
    170 /*ARGSUSED1*/
    171 static void
    172 anonmap_cache_destructor(void *buf, void *cdrarg)
    173 {
    174 	struct anon_map *amp = buf;
    175 
    176 	rw_destroy(&amp->a_rwlock);
    177 }
    178 
    179 kmutex_t	anonhash_lock[AH_LOCK_SIZE];
    180 kmutex_t	anonpages_hash_lock[AH_LOCK_SIZE];
    181 
    182 void
    183 anon_init(void)
    184 {
    185 	int i;
    186 
    187 	anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN);
    188 
    189 	for (i = 0; i < AH_LOCK_SIZE; i++) {
    190 		mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL);
    191 		mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
    192 	}
    193 
    194 	for (i = 0; i < ANON_LOCKSIZE; i++) {
    195 		mutex_init(&anon_array_lock[i].pad_mutex, NULL,
    196 		    MUTEX_DEFAULT, NULL);
    197 		cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
    198 	}
    199 
    200 	anon_hash = (struct anon **)
    201 	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
    202 	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
    203 	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
    204 	anonmap_cache = kmem_cache_create("anonmap_cache",
    205 	    sizeof (struct anon_map), 0,
    206 	    anonmap_cache_constructor, anonmap_cache_destructor, NULL,
    207 	    NULL, NULL, 0);
    208 	swap_maxcontig = (1024 * 1024) >> PAGESHIFT;	/* 1MB of pages */
    209 
    210 	anon_vp = vn_alloc(KM_SLEEP);
    211 	vn_setops(anon_vp, swap_vnodeops);
    212 	anon_vp->v_type = VREG;
    213 	anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
    214 }
    215 
    216 /*
    217  * Global anon slot hash table manipulation.
    218  */
    219 
    220 static void
    221 anon_addhash(struct anon *ap)
    222 {
    223 	int index;
    224 
    225 	ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]));
    226 	index = ANON_HASH(ap->an_vp, ap->an_off);
    227 	ap->an_hash = anon_hash[index];
    228 	anon_hash[index] = ap;
    229 }
    230 
    231 static void
    232 anon_rmhash(struct anon *ap)
    233 {
    234 	struct anon **app;
    235 
    236 	ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]));
    237 
    238 	for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
    239 	    *app; app = &((*app)->an_hash)) {
    240 		if (*app == ap) {
    241 			*app = ap->an_hash;
    242 			break;
    243 		}
    244 	}
    245 }
    246 
    247 /*
    248  * The anon array interfaces. Functions allocating,
    249  * freeing array of pointers, and returning/setting
    250  * entries in the array of pointers for a given offset.
    251  *
    252  * Create the list of pointers
    253  */
    254 struct anon_hdr *
    255 anon_create(pgcnt_t npages, int flags)
    256 {
    257 	struct anon_hdr *ahp;
    258 	ulong_t nchunks;
    259 	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
    260 
    261 	if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
    262 		return (NULL);
    263 	}
    264 
    265 	mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
    266 	/*
    267 	 * Single level case.
    268 	 */
    269 	ahp->size = npages;
    270 	if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {
    271 
    272 		if (flags & ANON_ALLOC_FORCE)
    273 			ahp->flags |= ANON_ALLOC_FORCE;
    274 
    275 		ahp->array_chunk = kmem_zalloc(
    276 		    ahp->size * sizeof (struct anon *), kmemflags);
    277 
    278 		if (ahp->array_chunk == NULL) {
    279 			kmem_free(ahp, sizeof (struct anon_hdr));
    280 			return (NULL);
    281 		}
    282 	} else {
    283 		/*
    284 		 * 2 Level case.
    285 		 * anon hdr size needs to be rounded off  to be a multiple
    286 		 * of ANON_CHUNK_SIZE. This is important as various anon
    287 		 * related functions depend on this.
    288 		 * NOTE -
    289 		 * anon_grow()  makes anon hdr size a multiple of
    290 		 * ANON_CHUNK_SIZE.
    291 		 * amp size is <= anon hdr size.
    292 		 * anon_index + seg_pgs <= anon hdr size.
    293 		 */
    294 		ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE);
    295 		nchunks = ahp->size >> ANON_CHUNK_SHIFT;
    296 
    297 		ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
    298 		    kmemflags);
    299 
    300 		if (ahp->array_chunk == NULL) {
    301 			kmem_free(ahp, sizeof (struct anon_hdr));
    302 			return (NULL);
    303 		}
    304 	}
    305 	return (ahp);
    306 }
    307 
    308 /*
    309  * Free the array of pointers
    310  */
    311 void
    312 anon_release(struct anon_hdr *ahp, pgcnt_t npages)
    313 {
    314 	ulong_t i;
    315 	void **ppp;
    316 	ulong_t nchunks;
    317 
    318 	ASSERT(npages <= ahp->size);
    319 
    320 	/*
    321 	 * Single level case.
    322 	 */
    323 	if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
    324 		kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
    325 	} else {
    326 		/*
    327 		 * 2 level case.
    328 		 */
    329 		nchunks = ahp->size >> ANON_CHUNK_SHIFT;
    330 		for (i = 0; i < nchunks; i++) {
    331 			ppp = &ahp->array_chunk[i];
    332 			if (*ppp != NULL)
    333 				kmem_free(*ppp, PAGESIZE);
    334 		}
    335 		kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
    336 	}
    337 	mutex_destroy(&ahp->serial_lock);
    338 	kmem_free(ahp, sizeof (struct anon_hdr));
    339 }
    340 
    341 /*
    342  * Return the pointer from the list for a
    343  * specified anon index.
    344  */
    345 struct anon *
    346 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
    347 {
    348 	struct anon **app;
    349 
    350 	ASSERT(an_idx < ahp->size);
    351 
    352 	/*
    353 	 * Single level case.
    354 	 */
    355 	if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
    356 		return ((struct anon *)
    357 		    ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
    358 	} else {
    359 
    360 		/*
    361 		 * 2 level case.
    362 		 */
    363 		app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
    364 		if (app) {
    365 			return ((struct anon *)
    366 			    ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] &
    367 			    ANON_PTRMASK));
    368 		} else {
    369 			return (NULL);
    370 		}
    371 	}
    372 }
    373 
    374 /*
    375  * Return the anon pointer for the first valid entry in the anon list,
    376  * starting from the given index.
    377  */
    378 struct anon *
    379 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index)
    380 {
    381 	struct anon *ap;
    382 	struct anon **app;
    383 	ulong_t chunkoff;
    384 	ulong_t i;
    385 	ulong_t j;
    386 	pgcnt_t size;
    387 
    388 	i = *index;
    389 	size = ahp->size;
    390 
    391 	ASSERT(i < size);
    392 
    393 	if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
    394 		/*
    395 		 * 1 level case
    396 		 */
    397 		while (i < size) {
    398 			ap = (struct anon *)
    399 			    ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK);
    400 			if (ap) {
    401 				*index = i;
    402 				return (ap);
    403 			}
    404 			i++;
    405 		}
    406 	} else {
    407 		/*
    408 		 * 2 level case
    409 		 */
    410 		chunkoff = i & ANON_CHUNK_OFF;
    411 		while (i < size) {
    412 			app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT];
    413 			if (app)
    414 				for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) {
    415 					ap = (struct anon *)
    416 					    ((uintptr_t)app[j] & ANON_PTRMASK);
    417 					if (ap) {
    418 						*index = i + (j - chunkoff);
    419 						return (ap);
    420 					}
    421 				}
    422 			chunkoff = 0;
    423 			i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF;
    424 		}
    425 	}
    426 	*index = size;
    427 	return (NULL);
    428 }
    429 
    430 /*
    431  * Set list entry with a given pointer for a specified offset
    432  */
    433 int
    434 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags)
    435 {
    436 	void		**ppp;
    437 	struct anon	**app;
    438 	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
    439 	uintptr_t	*ap_addr;
    440 
    441 	ASSERT(an_idx < ahp->size);
    442 
    443 	/*
    444 	 * Single level case.
    445 	 */
    446 	if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
    447 		ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx];
    448 	} else {
    449 
    450 		/*
    451 		 * 2 level case.
    452 		 */
    453 		ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
    454 
    455 		ASSERT(ppp != NULL);
    456 		if (*ppp == NULL) {
    457 			mutex_enter(&ahp->serial_lock);
    458 			ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
    459 			if (*ppp == NULL) {
    460 				*ppp = kmem_zalloc(PAGESIZE, kmemflags);
    461 				if (*ppp == NULL) {
    462 					mutex_exit(&ahp->serial_lock);
    463 					return (ENOMEM);
    464 				}
    465 			}
    466 			mutex_exit(&ahp->serial_lock);
    467 		}
    468 		app = *ppp;
    469 		ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF];
    470 	}
    471 	*ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap;
    472 	return (0);
    473 }
    474 
    475 /*
    476  * Copy anon array into a given new anon array
    477  */
    478 int
    479 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx,
    480 	struct anon_hdr *dahp, ulong_t d_idx,
    481 	pgcnt_t npages, int flags)
    482 {
    483 	void **sapp, **dapp;
    484 	void *ap;
    485 	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
    486 
    487 	ASSERT((s_idx < sahp->size) && (d_idx < dahp->size));
    488 	ASSERT((npages <= sahp->size) && (npages <= dahp->size));
    489 
    490 	/*
    491 	 * Both arrays are 1 level.
    492 	 */
    493 	if (((sahp->size <= ANON_CHUNK_SIZE) &&
    494 	    (dahp->size <= ANON_CHUNK_SIZE)) ||
    495 	    ((sahp->flags & ANON_ALLOC_FORCE) &&
    496 	    (dahp->flags & ANON_ALLOC_FORCE))) {
    497 
    498 		bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx],
    499 		    npages * sizeof (struct anon *));
    500 		return (0);
    501 	}
    502 
    503 	/*
    504 	 * Both arrays are 2 levels.
    505 	 */
    506 	if (sahp->size > ANON_CHUNK_SIZE &&
    507 	    dahp->size > ANON_CHUNK_SIZE &&
    508 	    ((sahp->flags & ANON_ALLOC_FORCE) == 0) &&
    509 	    ((dahp->flags & ANON_ALLOC_FORCE) == 0)) {
    510 
    511 		ulong_t sapidx, dapidx;
    512 		ulong_t *sap, *dap;
    513 		ulong_t chknp;
    514 
    515 		while (npages != 0) {
    516 
    517 			sapidx = s_idx & ANON_CHUNK_OFF;
    518 			dapidx = d_idx & ANON_CHUNK_OFF;
    519 			chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx);
    520 			if (chknp > npages)
    521 				chknp = npages;
    522 
    523 			sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT];
    524 			if ((sap = *sapp) != NULL) {
    525 				dapp = &dahp->array_chunk[d_idx
    526 				    >> ANON_CHUNK_SHIFT];
    527 				if ((dap = *dapp) == NULL) {
    528 					*dapp = kmem_zalloc(PAGESIZE,
    529 					    kmemflags);
    530 					if ((dap = *dapp) == NULL)
    531 						return (ENOMEM);
    532 				}
    533 				bcopy((sap + sapidx), (dap + dapidx),
    534 				    chknp << ANON_PTRSHIFT);
    535 			}
    536 			s_idx += chknp;
    537 			d_idx += chknp;
    538 			npages -= chknp;
    539 		}
    540 		return (0);
    541 	}
    542 
    543 	/*
    544 	 * At least one of the arrays is 2 level.
    545 	 */
    546 	while (npages--) {
    547 		if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) {
    548 			ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx)));
    549 			if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM)
    550 					return (ENOMEM);
    551 		}
    552 		s_idx++;
    553 		d_idx++;
    554 	}
    555 	return (0);
    556 }
    557 
    558 
    559 /*
    560  * ANON_INITBUF is a convenience macro for anon_grow() below. It
    561  * takes a buffer dst, which is at least as large as buffer src. It
    562  * does a bcopy from src into dst, and then bzeros the extra bytes
    563  * of dst. If tail is set, the data in src is tail aligned within
    564  * dst instead of head aligned.
    565  */
    566 
    567 #define	ANON_INITBUF(src, srclen, dst, dstsize, tail)			      \
    568 	if (tail) {							      \
    569 		bzero((dst), (dstsize) - (srclen));			      \
    570 		bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \
    571 	} else {							      \
    572 		bcopy((src), (dst), (srclen));				      \
    573 		bzero((char *)(dst) + (srclen), (dstsize) - (srclen));	      \
    574 	}
    575 
    576 #define	ANON_1_LEVEL_INC	(ANON_CHUNK_SIZE / 8)
    577 #define	ANON_2_LEVEL_INC	(ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
    578 
    579 /*
    580  * anon_grow() is used to efficiently extend an existing anon array.
    581  * startidx_p points to the index into the anon array of the first page
    582  * that is in use. oldseg_pgs is the number of pages in use, starting at
    583  * *startidx_p. newpages is the number of additional pages desired.
    584  *
    585  * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed.
    586  *
    587  * The growth is done by creating a new top level of the anon array,
    588  * and (if the array is 2-level) reusing the existing second level arrays.
    589  *
    590  * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
    591  *
    592  * Returns the new number of pages in the anon array.
    593  */
    594 pgcnt_t
    595 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs,
    596     pgcnt_t newseg_pgs, int flags)
    597 {
    598 	ulong_t startidx = startidx_p ? *startidx_p : 0;
    599 	pgcnt_t oldamp_pgs = ahp->size, newamp_pgs;
    600 	pgcnt_t oelems, nelems, totpages;
    601 	void **level1;
    602 	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
    603 	int growdown = (flags & ANON_GROWDOWN);
    604 	size_t newarrsz, oldarrsz;
    605 	void *level2;
    606 
    607 	ASSERT(!(startidx_p == NULL && growdown));
    608 	ASSERT(startidx + oldseg_pgs <= ahp->size);
    609 
    610 	/*
    611 	 * Determine the total number of pages needed in the new
    612 	 * anon array. If growing down, totpages is all pages from
    613 	 * startidx through the end of the array, plus <newseg_pgs>
    614 	 * pages. If growing up, keep all pages from page 0 through
    615 	 * the last page currently in use, plus <newseg_pgs> pages.
    616 	 */
    617 	if (growdown)
    618 		totpages = oldamp_pgs - startidx + newseg_pgs;
    619 	else
    620 		totpages = startidx + oldseg_pgs + newseg_pgs;
    621 
    622 	/* If the array is already large enough, just return. */
    623 
    624 	if (oldamp_pgs >= totpages) {
    625 		if (growdown)
    626 			*startidx_p = oldamp_pgs - totpages;
    627 		return (oldamp_pgs);
    628 	}
    629 
    630 	/*
    631 	 * oldamp_pgs/newamp_pgs are the total numbers of pages represented
    632 	 * by the corresponding arrays.
    633 	 * oelems/nelems are the number of pointers in the top level arrays
    634 	 * which may be either level 1 or level 2.
    635 	 * Will the new anon array be one level or two levels?
    636 	 */
    637 	if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
    638 		newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC);
    639 		oelems = oldamp_pgs;
    640 		nelems = newamp_pgs;
    641 	} else {
    642 		newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC);
    643 		oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
    644 		nelems = newamp_pgs >> ANON_CHUNK_SHIFT;
    645 	}
    646 
    647 	newarrsz = nelems * sizeof (void *);
    648 	level1 = kmem_alloc(newarrsz, kmemflags);
    649 	if (level1 == NULL)
    650 		return (0);
    651 
    652 	/* Are we converting from a one level to a two level anon array? */
    653 
    654 	if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE &&
    655 	    !(ahp->flags & ANON_ALLOC_FORCE)) {
    656 
    657 		/*
    658 		 * Yes, we're converting to a two level. Reuse old level 1
    659 		 * as new level 2 if it is exactly PAGESIZE. Otherwise
    660 		 * alloc a new level 2 and copy the old level 1 data into it.
    661 		 */
    662 		if (oldamp_pgs == ANON_CHUNK_SIZE) {
    663 			level2 = (void *)ahp->array_chunk;
    664 		} else {
    665 			level2 = kmem_alloc(PAGESIZE, kmemflags);
    666 			if (level2 == NULL) {
    667 				kmem_free(level1, newarrsz);
    668 				return (0);
    669 			}
    670 			oldarrsz = oldamp_pgs * sizeof (void *);
    671 
    672 			ANON_INITBUF(ahp->array_chunk, oldarrsz,
    673 			    level2, PAGESIZE, growdown);
    674 			kmem_free(ahp->array_chunk, oldarrsz);
    675 		}
    676 		bzero(level1, newarrsz);
    677 		if (growdown)
    678 			level1[nelems - 1] = level2;
    679 		else
    680 			level1[0] = level2;
    681 	} else {
    682 		oldarrsz = oelems * sizeof (void *);
    683 
    684 		ANON_INITBUF(ahp->array_chunk, oldarrsz,
    685 		    level1, newarrsz, growdown);
    686 		kmem_free(ahp->array_chunk, oldarrsz);
    687 	}
    688 
    689 	ahp->array_chunk = level1;
    690 	ahp->size = newamp_pgs;
    691 	if (growdown)
    692 		*startidx_p = newamp_pgs - totpages;
    693 
    694 	return (newamp_pgs);
    695 }
    696 
    697 
    698 /*
    699  * Called from clock handler to sync ani_free value.
    700  */
    701 
    702 void
    703 set_anoninfo(void)
    704 {
    705 	int	ix;
    706 	pgcnt_t	total = 0;
    707 
    708 	for (ix = 0; ix < ANI_MAX_POOL; ix++) {
    709 		total += ani_free_pool[ix].ani_count;
    710 	}
    711 	k_anoninfo.ani_free = total;
    712 }
    713 
    714 /*
    715  * Reserve anon space.
    716  *
    717  * It's no longer simply a matter of incrementing ani_resv to
    718  * reserve swap space, we need to check memory-based as well
    719  * as disk-backed (physical) swap.  The following algorithm
    720  * is used:
    721  * 	Check the space on physical swap
    722  * 		i.e. amount needed < ani_max - ani_phys_resv
    723  * 	If we are swapping on swapfs check
    724  *		amount needed < (availrmem - swapfs_minfree)
    725  * Since the algorithm to check for the quantity of swap space is
    726  * almost the same as that for reserving it, we'll just use anon_resvmem
    727  * with a flag to decrement availrmem.
    728  *
    729  * Return non-zero on success.
    730  */
    731 int
    732 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
    733 {
    734 	pgcnt_t npages = btopr(size);
    735 	pgcnt_t mswap_pages = 0;
    736 	pgcnt_t pswap_pages = 0;
    737 	proc_t *p = curproc;
    738 
    739 	if (zone != NULL && takemem) {
    740 		/* test zone.max-swap resource control */
    741 		mutex_enter(&p->p_lock);
    742 		if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
    743 			mutex_exit(&p->p_lock);
    744 			return (0);
    745 		}
    746 		mutex_exit(&p->p_lock);
    747 	}
    748 	mutex_enter(&anoninfo_lock);
    749 
    750 	/*
    751 	 * pswap_pages is the number of pages we can take from
    752 	 * physical (i.e. disk-backed) swap.
    753 	 */
    754 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
    755 	pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;
    756 
    757 	ANON_PRINT(A_RESV,
    758 	    ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
    759 	    npages, takemem, pswap_pages, (void *)caller()));
    760 
    761 	if (npages <= pswap_pages) {
    762 		/*
    763 		 * we have enough space on a physical swap
    764 		 */
    765 		if (takemem)
    766 			k_anoninfo.ani_phys_resv += npages;
    767 		mutex_exit(&anoninfo_lock);
    768 		return (1);
    769 	} else if (pswap_pages != 0) {
    770 		/*
    771 		 * we have some space on a physical swap
    772 		 */
    773 		if (takemem) {
    774 			/*
    775 			 * use up remainder of phys swap
    776 			 */
    777 			k_anoninfo.ani_phys_resv += pswap_pages;
    778 			ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
    779 		}
    780 	}
    781 	/*
    782 	 * since (npages > pswap_pages) we need mem swap
    783 	 * mswap_pages is the number of pages needed from availrmem
    784 	 */
    785 	ASSERT(npages > pswap_pages);
    786 	mswap_pages = npages - pswap_pages;
    787 
    788 	ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
    789 	    mswap_pages));
    790 
    791 	/*
    792 	 * priv processes can reserve memory as swap as long as availrmem
    793 	 * remains greater than swapfs_minfree; in the case of non-priv
    794 	 * processes, memory can be reserved as swap only if availrmem
    795 	 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
    796 	 * swapfs_reserve amount of memswap is not available to non-priv
    797 	 * processes. This protects daemons such as automounter dying
    798 	 * as a result of application processes eating away almost entire
    799 	 * membased swap. This safeguard becomes useless if apps are run
    800 	 * with root access.
    801 	 *
    802 	 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem.
    803 	 *
    804 	 */
    805 	if (tryhard) {
    806 		mutex_exit(&anoninfo_lock);
    807 		(void) page_reclaim_mem(mswap_pages,
    808 		    swapfs_minfree + swapfs_reserve, 0);
    809 		mutex_enter(&anoninfo_lock);
    810 	}
    811 
    812 	mutex_enter(&freemem_lock);
    813 	if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
    814 	    (availrmem > (swapfs_minfree + mswap_pages) &&
    815 	    secpolicy_resource(CRED()) == 0)) {
    816 
    817 		if (takemem) {
    818 			/*
    819 			 * Take the memory from the rest of the system.
    820 			 */
    821 			availrmem -= mswap_pages;
    822 			mutex_exit(&freemem_lock);
    823 			k_anoninfo.ani_mem_resv += mswap_pages;
    824 			ANI_ADD(mswap_pages);
    825 			ANON_PRINT((A_RESV | A_MRESV),
    826 			    ("anon_resvmem: took %ld pages of availrmem\n",
    827 			    mswap_pages));
    828 		} else {
    829 			mutex_exit(&freemem_lock);
    830 		}
    831 
    832 		ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
    833 		mutex_exit(&anoninfo_lock);
    834 		return (1);
    835 
    836 	} else {
    837 		/*
    838 		 * Fail if not enough memory
    839 		 */
    840 
    841 		if (takemem) {
    842 			k_anoninfo.ani_phys_resv -= pswap_pages;
    843 		}
    844 
    845 		mutex_exit(&freemem_lock);
    846 		mutex_exit(&anoninfo_lock);
    847 		ANON_PRINT(A_RESV,
    848 		    ("anon_resvmem: not enough space from swapfs\n"));
    849 		if (zone != NULL && takemem)
    850 			rctl_decr_swap(zone, ptob(npages));
    851 		return (0);
    852 	}
    853 }
    854 
    855 /*
    856  * Give back an anon reservation.
    857  */
    858 void
    859 anon_unresvmem(size_t size, zone_t *zone)
    860 {
    861 	pgcnt_t npages = btopr(size);
    862 	spgcnt_t mem_free_pages = 0;
    863 	pgcnt_t phys_free_slots;
    864 #ifdef	ANON_DEBUG
    865 	pgcnt_t mem_resv;
    866 #endif
    867 	if (zone != NULL)
    868 		rctl_decr_swap(zone, ptob(npages));
    869 
    870 	mutex_enter(&anoninfo_lock);
    871 
    872 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
    873 	/*
    874 	 * If some of this reservation belonged to swapfs
    875 	 * give it back to availrmem.
    876 	 * ani_mem_resv is the amount of availrmem swapfs has reserved.
    877 	 * but some of that memory could be locked by segspt so we can only
    878 	 * return non locked ani_mem_resv back to availrmem
    879 	 */
    880 	if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
    881 		ANON_PRINT((A_RESV | A_MRESV),
    882 		    ("anon_unresv: growing availrmem by %ld pages\n",
    883 		    MIN(k_anoninfo.ani_mem_resv, npages)));
    884 
    885 		mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
    886 		    k_anoninfo.ani_locked_swap), npages);
    887 		mutex_enter(&freemem_lock);
    888 		availrmem += mem_free_pages;
    889 		mutex_exit(&freemem_lock);
    890 		k_anoninfo.ani_mem_resv -= mem_free_pages;
    891 
    892 		ANI_ADD(-mem_free_pages);
    893 	}
    894 	/*
    895 	 * The remainder of the pages is returned to phys swap
    896 	 */
    897 	ASSERT(npages >= mem_free_pages);
    898 	phys_free_slots = npages - mem_free_pages;
    899 
    900 	if (phys_free_slots) {
    901 		k_anoninfo.ani_phys_resv -= phys_free_slots;
    902 	}
    903 
    904 #ifdef	ANON_DEBUG
    905 	mem_resv = k_anoninfo.ani_mem_resv;
    906 #endif
    907 
    908 	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
    909 	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
    910 
    911 	mutex_exit(&anoninfo_lock);
    912 
    913 	ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
    914 	    npages, mem_resv, (void *)caller()));
    915 }
    916 
    917 /*
    918  * Allocate an anon slot and return it with the lock held.
    919  */
    920 struct anon *
    921 anon_alloc(struct vnode *vp, anoff_t off)
    922 {
    923 	struct anon	*ap;
    924 	kmutex_t	*ahm;
    925 
    926 	ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
    927 	if (vp == NULL) {
    928 		swap_alloc(ap);
    929 	} else {
    930 		ap->an_vp = vp;
    931 		ap->an_off = off;
    932 	}
    933 	ap->an_refcnt = 1;
    934 	ap->an_pvp = NULL;
    935 	ap->an_poff = 0;
    936 	ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
    937 	mutex_enter(ahm);
    938 	anon_addhash(ap);
    939 	mutex_exit(ahm);
    940 	ANI_ADD(-1);
    941 	ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
    942 	    (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
    943 	return (ap);
    944 }
    945 
    946 /*
    947  * Decrement the reference count of an anon page.
    948  * If reference count goes to zero, free it and
    949  * its associated page (if any).
    950  */
    951 void
    952 anon_decref(struct anon *ap)
    953 {
    954 	page_t *pp;
    955 	struct vnode *vp;
    956 	anoff_t off;
    957 	kmutex_t *ahm;
    958 
    959 	ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
    960 	mutex_enter(ahm);
    961 	ASSERT(ap->an_refcnt != 0);
    962 	if (ap->an_refcnt == 0)
    963 		panic("anon_decref: slot count 0");
    964 	if (--ap->an_refcnt == 0) {
    965 		swap_xlate(ap, &vp, &off);
    966 		anon_rmhash(ap);
    967 		if (ap->an_pvp != NULL)
    968 			swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
    969 		mutex_exit(ahm);
    970 
    971 		/*
    972 		 * If there is a page for this anon slot we will need to
    973 		 * call VN_DISPOSE to get rid of the vp association and
    974 		 * put the page back on the free list as really free.
    975 		 * Acquire the "exclusive" lock to ensure that any
    976 		 * pending i/o always completes before the swap slot
    977 		 * is freed.
    978 		 */
    979 		pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
    980 		if (pp != NULL) {
    981 			/*LINTED: constant in conditional context */
    982 			VN_DISPOSE(pp, B_INVAL, 0, kcred);
    983 		}
    984 		ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
    985 		    (void *)ap, (void *)ap->an_vp));
    986 
    987 		kmem_cache_free(anon_cache, ap);
    988 
    989 		ANI_ADD(1);
    990 	} else {
    991 		mutex_exit(ahm);
    992 	}
    993 }
    994 
    995 
    996 /*
    997  * check an_refcnt of the root anon slot (anon_index argument is aligned at
    998  * seg->s_szc level) to determine whether COW processing is required.
    999  * anonpages_hash_lock[] held on the root ap ensures that if root's
   1000  * refcnt is 1 all other refcnt's are 1 as well (and they can't increase
   1001  * later since this process can't fork while its AS lock is held).
   1002  *
   1003  * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0.
   1004  */
   1005 int
   1006 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
   1007 {
   1008 	struct anon	*ap;
   1009 	kmutex_t	*ahmpages = NULL;
   1010 
   1011 	ap = anon_get_ptr(ahp, anon_index);
   1012 	if (ap == NULL)
   1013 		return (0);
   1014 
   1015 	ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
   1016 	mutex_enter(ahmpages);
   1017 	ASSERT(ap->an_refcnt >= 1);
   1018 	if (ap->an_refcnt == 1) {
   1019 		mutex_exit(ahmpages);
   1020 		return (0);
   1021 	}
   1022 	mutex_exit(ahmpages);
   1023 	return (1);
   1024 }
   1025 /*
   1026  * Check 'nslots' anon slots for refcnt > 1.
   1027  *
   1028  * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
   1029  * returns 0.
   1030  */
   1031 static int
   1032 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
   1033 {
   1034 	struct anon *ap;
   1035 
   1036 	while (nslots-- > 0) {
   1037 		if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
   1038 		    ap->an_refcnt > 1)
   1039 			return (1);
   1040 		anon_index++;
   1041 	}
   1042 
   1043 	return (0);
   1044 }
   1045 
   1046 static void
   1047 anon_decref_pages(
   1048 	struct anon_hdr *ahp,
   1049 	ulong_t an_idx,
   1050 	uint_t szc)
   1051 {
   1052 	struct anon *ap = anon_get_ptr(ahp, an_idx);
   1053 	kmutex_t *ahmpages = NULL;
   1054 	page_t *pp;
   1055 	pgcnt_t pgcnt = page_get_pagecnt(szc);
   1056 	pgcnt_t i;
   1057 	struct vnode *vp;
   1058 	anoff_t   off;
   1059 	kmutex_t *ahm;
   1060 #ifdef DEBUG
   1061 	int refcnt = 1;
   1062 #endif
   1063 
   1064 	ASSERT(szc != 0);
   1065 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
   1066 	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
   1067 	ASSERT(an_idx < ahp->size);
   1068 
   1069 	if (ahp->size - an_idx < pgcnt) {
   1070 		/*
   1071 		 * In case of shared mappings total anon map size may not be
   1072 		 * the largest page size aligned.
   1073 		 */
   1074 		pgcnt = ahp->size - an_idx;
   1075 	}
   1076 
   1077 	VM_STAT_ADD(anonvmstats.decrefpages[0]);
   1078 
   1079 	if (ap != NULL) {
   1080 		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
   1081 		mutex_enter(ahmpages);
   1082 		ASSERT((refcnt = ap->an_refcnt) != 0);
   1083 		VM_STAT_ADD(anonvmstats.decrefpages[1]);
   1084 		if (ap->an_refcnt == 1) {
   1085 			VM_STAT_ADD(anonvmstats.decrefpages[2]);
   1086 			ASSERT(!anon_share(ahp, an_idx, pgcnt));
   1087 			mutex_exit(ahmpages);
   1088 			ahmpages = NULL;
   1089 		}
   1090 	}
   1091 
   1092 	i = 0;
   1093 	while (i < pgcnt) {
   1094 		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
   1095 			ASSERT(refcnt == 1 && ahmpages ==