Home | History | Annotate | Download | only in vm
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)vm_usage.c	1.3	07/06/07 SMI"
     28 
     29 /*
     30  * vm_usage
     31  *
     32  * This file implements the getvmusage() private system call.
     33  * getvmusage() counts the amount of resident memory pages and swap
     34  * reserved by the specified process collective. A "process collective" is
     35  * the set of processes owned by a particular zone, project, task, or user.
     36  *
     37  * rss and swap are counted so that for a given process collective, a page is
     38  * only counted once.  For example, this means that if multiple processes in
     39  * the same project map the same page, then the project will only be charged
     40  * once for that page.  On the other hand, if two processes in different
     41  * projects map the same page, then both projects will be charged
     42  * for the page.
     43  *
     44  * The vm_getusage() calculation is implemented so that the first thread
     45  * performs the rss/swap counting. Other callers will wait for that thread to
     46  * finish, copying the results.  This enables multiple rcapds and prstats to
     47  * consume data from the same calculation.  The results are also cached so that
     48  * a caller interested in recent results can just copy them instead of starting
     49  * a new calculation. The caller passes the maximum age (in seconds) of the
     50  * data.  If the cached data is young enough, the cache is copied, otherwise,
     51  * a new calculation is executed and the cache is replaced with the new
     52  * data.
     53  *
     54  * The rss calculation for each process collective is as follows:
     55  *
     56  *   - Inspect flags, determine if counting rss for zones, projects, tasks,
     57  *     and/or users.
     58  *   - For each proc:
     59  *	- Figure out proc's collectives (zone, project, task, and/or user).
     60  *	- For each seg in proc's address space:
     61  *		- If seg is private:
     62  *			- Lookup anons in the amp.
     63  *			- For incore pages not previously visited for each of
     64  *			  the proc's collectives, add incore pagesize to each
     65  *			  collective.
     66  *			  Anons with a refcnt of 1 can be assumed to be not
     67  *			  previously visited.
     68  *			- For address ranges without anons in the amp:
     69  *				- Lookup pages in underlying vnode.
     70  *				- For incore pages not previously visited for
     71  *				  each of the proc's collectives, add incore
     72  *				  pagesize to each collective.
     73  *		- If seg is shared:
     74  *			- Lookup pages in the shared amp or vnode.
     75  *			- For incore pages not previously visited for each of
     76  *			  the proc's collectives, add incore pagesize to each
     77  *			  collective.
     78  *
     79  * Swap is reserved by private segments, and shared anonymous segments.
     80  * The only shared anon segments which do not reserve swap are ISM segments
     81  * and schedctl segments, both of which can be identified by having
     82  * amp->swresv == 0.
     83  *
     84  * The swap calculation for each collective is as follows:
     85  *
     86  *   - Inspect flags, determine if counting rss for zones, projects, tasks,
     87  *     and/or users.
     88  *   - For each proc:
     89  *	- Figure out proc's collectives (zone, project, task, and/or user).
     90  *	- For each seg in proc's address space:
     91  *		- If seg is private:
     92  *			- Add svd->swresv pages to swap count for each of the
     93  *			  proc's collectives.
     94  *		- If seg is anon, shared, and amp->swresv != 0
     95  *			- For address ranges in amp not previously visited for
     96  *			  each of the proc's collectives, add size of address
     97  *			  range to the swap count for each collective.
     98  *
     99  * These two calculations are done simultaneously, with most of the work
    100  * being done in vmu_calculate_seg().  The results of the calculation are
    101  * copied into "vmu_data.vmu_cache->vmc_results".
    102  *
    103  * To perform the calculation, various things are tracked and cached:
    104  *
    105  *    - incore/not-incore page ranges for all vnodes.
    106  *	(vmu_data.vmu_all_vnodes_hash)
    107  *	This eliminates looking up the same page more than once.
    108  *
    109  *    - incore/not-incore page ranges for all shared amps.
    110  *	(vmu_data.vmu_all_amps_hash)
    111  *	This eliminates looking up the same page more than once.
    112  *
    113  *    - visited page ranges for each collective.
    114  *	   - per vnode (entity->vme_vnode_hash)
    115  *	   - per shared amp (entity->vme_amp_hash)
    116  *	For accurate counting of map-shared and cow-shared pages.
    117  *
    118  *    - visited private anons (refcnt > 1) for each collective.
    119  *	(entity->vme_anon_hash)
    120  *	For accurate counting of cow-shared pages.
    121  *
    122  * The common accounting structure is the vmu_entity_t, which represents
    123  * collectives:
    124  *
    125  *    - A zone.
    126  *    - A project, task, or user within a zone.
    127  *    - The entire system (vmu_data.vmu_system).
    128  *    - Each collapsed (col) project and user.  This means a given projid or
    129  *	uid, regardless of which zone the process is in.  For instance,
    130  *      project 0 in the global zone and project 0 in a non global zone are
    131  *	the same collapsed project.
    132  *
    133  *  Each entity structure tracks which pages have been already visited for
    134  *  that entity (via previously inspected processes) so that these pages are
    135  *  not double counted.
    136  */
    137 
    138 #include <sys/errno.h>
    139 #include <sys/types.h>
    140 #include <sys/zone.h>
    141 #include <sys/proc.h>
    142 #include <sys/project.h>
    143 #include <sys/task.h>
    144 #include <sys/thread.h>
    145 #include <sys/time.h>
    146 #include <sys/mman.h>
    147 #include <sys/modhash.h>
    148 #include <sys/modhash_impl.h>
    149 #include <sys/shm.h>
    150 #include <sys/swap.h>
    151 #include <sys/synch.h>
    152 #include <sys/systm.h>
    153 #include <sys/var.h>
    154 #include <sys/vm_usage.h>
    155 #include <sys/zone.h>
    156 #include <vm/anon.h>
    157 #include <vm/as.h>
    158 #include <vm/seg_vn.h>
    159 #include <vm/seg_spt.h>
    160 
/*
 * Size argument handed to the mod_hash_create_*() calls made in
 * vm_usage_init(), vmu_alloc_entity() and vmu_alloc_zone().
 */
#define	VMUSAGE_HASH_SIZE		512

/*
 * Kinds of tracked vm structure.
 * NOTE(review): presumably the values stored in vmu_object_t.vmo_type;
 * confirm against the callers of vmu_find_insert_object().
 */
#define	VMUSAGE_TYPE_VNODE		1
#define	VMUSAGE_TYPE_AMP		2
#define	VMUSAGE_TYPE_ANON		3

/*
 * Values for vmu_bound_t.vmb_type.
 */
#define	VMUSAGE_BOUND_UNKNOWN		0
#define	VMUSAGE_BOUND_INCORE		1
#define	VMUSAGE_BOUND_NOT_INCORE	2
    170 
/*
 * bounds for vnodes and shared amps
 * Each bound is either entirely incore, entirely not in core, or
 * entirely unknown.  bounds are stored in order by offset.
 *
 * A bound covers the inclusive page range [vmb_start, vmb_end]; its size
 * in pages is computed throughout this file as vmb_end - vmb_start + 1.
 *
 * vmb_next links the bound into its object's list (vmo_bounds).  Once a
 * bound has been handed to vmu_free_bound(), the same field links it into
 * vmu_data.vmu_free_bounds instead.
 */
typedef struct vmu_bound {
	struct  vmu_bound *vmb_next;
	pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
	pgcnt_t	vmb_end;    /* page offset in vnode/amp on which bound ends */
	char	vmb_type;   /* One of VMUSAGE_BOUND_* */
} vmu_bound_t;
    182 
/*
 * hash of visited objects (vnodes or shared amps)
 * key is address of vnode or amp.  Bounds lists known incore/non-incore
 * bounds for vnode/amp.
 *
 * Objects are recycled rather than released: vmu_free_object() pushes
 * them onto vmu_data.vmu_free_objects through vmo_next, and
 * vmu_alloc_object() pops them from there before falling back to the
 * kmem cache.
 */
typedef struct vmu_object {
	struct vmu_object	*vmo_next;	/* free list */
	caddr_t		vmo_key;	/* hash key this object was created for */
	short		vmo_type;	/* type given to vmu_alloc_object() */
	vmu_bound_t	*vmo_bounds;	/* head of bound list, NULL if none */
} vmu_object_t;
    194 
/*
 * Entity by which to count results.
 *
 * The entity structure keeps the current rss/swap counts for each entity
 * (zone, project, etc), and hashes of vm structures that have already
 * been visited for the entity.
 *
 * vme_next:	links the list of all entities currently being counted by
 *		vmu_calculate().  After vmu_free_entity() the same field
 *		links the entity into vmu_data.vmu_free_entities.
 *
 * vme_next_calc: links the list of entities related to the current process
 *		 being counted by vmu_calculate_proc().
 *
 * vmu_calculate_proc() walks all processes.  For each process, it makes a
 * list of the entities related to that process using vme_next_calc.  This
 * list changes each time vmu_calculate_proc() is called.
 *
 * The three hashes are created by vmu_alloc_entity() the first time the
 * structure is used.  vmu_free_entity() only clears them, so a recycled
 * entity keeps its (empty) hashes.
 */
typedef struct vmu_entity {
	struct vmu_entity *vme_next;
	struct vmu_entity *vme_next_calc;
	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
	mod_hash_t	*vme_anon_hash;	 /* cow anons visited for entity */
	vmusage_t	vme_result;	 /* identifies entity and results */
} vmu_entity_t;
    221 
/*
 * Hash of entities visited within a zone, and an entity for the zone
 * itself.
 *
 * vmu_alloc_zone() fills in vmz_zone and creates each hash only when the
 * matching VMUSAGE_* bits are set in vmu_data.vmu_calc_flags, so any of
 * these pointers may be NULL.  vmu_free_zone() clears the hashes without
 * destroying them, which lets a recycled zone reuse them.
 */
typedef struct vmu_zone {
	struct vmu_zone	*vmz_next;	/* free list */
	id_t		vmz_id;		/* id passed to vmu_alloc_zone() */
	vmu_entity_t	*vmz_zone;	/* entity for the zone itself */
	mod_hash_t	*vmz_projects_hash;	/* project entities by id */
	mod_hash_t	*vmz_tasks_hash;	/* task entities by id */
	mod_hash_t	*vmz_rusers_hash;	/* ruser entities by id */
	mod_hash_t	*vmz_eusers_hash;	/* euser entities by id */
} vmu_zone_t;
    235 
/*
 * Cache of results from last calculation.
 * vmu_data.vmu_cache points at the current cache, or is NULL (its value
 * after vm_usage_init()) when there is none.
 */
typedef struct vmu_cache {
	vmusage_t	*vmc_results;	/* Results from last call to */
					/* vm_getusage(). */
	uint64_t	vmc_nresults;	/* Count of cached results */
	uint64_t	vmc_refcnt;	/* refcnt for free */
	uint_t		vmc_flags;	/* Flags for vm_getusage() */
	hrtime_t	vmc_timestamp;	/* when cache was created */
} vmu_cache_t;
    247 
/*
 * top level rss info for the system
 *
 * There is a single instance of this structure (the file-static vmu_data),
 * set up by vm_usage_init().  The four vmu_free_* members are singly linked
 * lists of recycled structures, filled by the vmu_free_*() routines and
 * drained by the matching vmu_alloc_*() routines.
 */
typedef struct vmu_data {
	kmutex_t	vmu_lock;		/* Protects vmu_data */
	kcondvar_t	vmu_cv;			/* Used to signal threads */
						/* Waiting for */
						/* Rss_calc_thread to finish */
	vmu_entity_t	*vmu_system;		/* Entity for tracking */
						/* rss/swap for all processes */
						/* in all zones */
	mod_hash_t	*vmu_zones_hash;	/* Zones visited */
	mod_hash_t	*vmu_projects_col_hash; /* These *_col_hash hashes */
	mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
	mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
						/* to implement VMUSAGE_COL_* */
						/* flags, which aggregate by */
						/* project or user regardless */
						/* of zoneid. */
	mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
						/* to track incore/not-incore */
	mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
						/* amps to track incore/not- */
						/* incore */
	vmu_entity_t	*vmu_entities;		/* Linked list of entities */
	size_t		vmu_nentities;		/* Count of entities in list */
	vmu_cache_t	*vmu_cache;		/* Cached results */
	kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
						/* vmu_calculate() */
	uint_t		vmu_calc_flags;		/* Flags being used by */
						/* currently running calc */
						/* thread */
	uint_t		vmu_pending_flags;	/* Flags of vm_getusage() */
						/* threads waiting for */
						/* calc thread to finish */
	uint_t		vmu_pending_waiters;	/* Number of threads waiting */
						/* for calc thread */
	vmu_bound_t	*vmu_free_bounds;	/* Recycled bounds */
	vmu_object_t	*vmu_free_objects;	/* Recycled objects */
	vmu_entity_t	*vmu_free_entities;	/* Recycled entities */
	vmu_zone_t	*vmu_free_zones;	/* Recycled zones */
} vmu_data_t;
    290 
/*
 * Kernel objects defined outside this file.
 */
extern struct as kas;
extern proc_t *practive;
extern zone_t *global_zone;
extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

/*
 * vmu_data holds all state for this file.  The two kmem caches are created
 * in vm_usage_init() and back vmu_alloc_bound() and vmu_alloc_object() when
 * the corresponding free list is empty.
 */
static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;
    300 
    301 /*
    302  * Save a bound on the free list
    303  */
    304 static void
    305 vmu_free_bound(vmu_bound_t *bound)
    306 {
    307 	bound->vmb_next = vmu_data.vmu_free_bounds;
    308 	vmu_data.vmu_free_bounds = bound;
    309 }
    310 
    311 /*
    312  * Free an object, and all visited bound info.
    313  */
    314 static void
    315 vmu_free_object(mod_hash_val_t val)
    316 {
    317 	vmu_object_t *obj = (vmu_object_t *)val;
    318 	vmu_bound_t *bound = obj->vmo_bounds;
    319 	vmu_bound_t *tmp;
    320 
    321 	while (bound != NULL) {
    322 		tmp = bound;
    323 		bound = bound->vmb_next;
    324 		vmu_free_bound(tmp);
    325 	}
    326 	obj->vmo_next = vmu_data.vmu_free_objects;
    327 	vmu_data.vmu_free_objects = obj;
    328 }
    329 
    330 /*
    331  * Free an entity, and hashes of visited objects for that entity.
    332  */
    333 static void
    334 vmu_free_entity(mod_hash_val_t val)
    335 {
    336 	vmu_entity_t *entity = (vmu_entity_t *)val;
    337 
    338 	if (entity->vme_vnode_hash != NULL)
    339 		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
    340 	if (entity->vme_amp_hash != NULL)
    341 		i_mod_hash_clear_nosync(entity->vme_amp_hash);
    342 	if (entity->vme_anon_hash != NULL)
    343 		i_mod_hash_clear_nosync(entity->vme_anon_hash);
    344 
    345 	entity->vme_next = vmu_data.vmu_free_entities;
    346 	vmu_data.vmu_free_entities = entity;
    347 }
    348 
    349 /*
    350  * Free zone entity, and all hashes of entities inside that zone,
    351  * which are projects, tasks, and users.
    352  */
    353 static void
    354 vmu_free_zone(mod_hash_val_t val)
    355 {
    356 	vmu_zone_t *zone = (vmu_zone_t *)val;
    357 
    358 	if (zone->vmz_zone != NULL) {
    359 		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
    360 		zone->vmz_zone = NULL;
    361 	}
    362 	if (zone->vmz_projects_hash != NULL)
    363 		i_mod_hash_clear_nosync(zone->vmz_projects_hash);
    364 	if (zone->vmz_tasks_hash != NULL)
    365 		i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
    366 	if (zone->vmz_rusers_hash != NULL)
    367 		i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
    368 	if (zone->vmz_eusers_hash != NULL)
    369 		i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
    370 	zone->vmz_next = vmu_data.vmu_free_zones;
    371 	vmu_data.vmu_free_zones = zone;
    372 }
    373 
    374 /*
    375  * Initialize synchronization primitives and hashes for system-wide tracking
    376  * of visited vnodes and shared amps.  Initialize results cache.
    377  */
    378 void
    379 vm_usage_init()
    380 {
    381 	mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
    382 	cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
    383 
    384 	vmu_data.vmu_system = NULL;
    385 	vmu_data.vmu_zones_hash = NULL;
    386 	vmu_data.vmu_projects_col_hash = NULL;
    387 	vmu_data.vmu_rusers_col_hash = NULL;
    388 	vmu_data.vmu_eusers_col_hash = NULL;
    389 
    390 	vmu_data.vmu_free_bounds = NULL;
    391 	vmu_data.vmu_free_objects = NULL;
    392 	vmu_data.vmu_free_entities = NULL;
    393 	vmu_data.vmu_free_zones = NULL;
    394 
    395 	vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
    396 	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
    397 	    sizeof (vnode_t));
    398 	vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
    399 	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
    400 	    sizeof (struct anon_map));
    401 	vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
    402 	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
    403 	    vmu_free_entity);
    404 	vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
    405 	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
    406 	    vmu_free_entity);
    407 	vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
    408 	    "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
    409 	    vmu_free_entity);
    410 	vmu_data.vmu_zones_hash = mod_hash_create_idhash(
    411 	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
    412 
    413 	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
    414 	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
    415 	vmu_object_cache = kmem_cache_create("vmu_object_cache",
    416 	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
    417 
    418 	vmu_data.vmu_entities = NULL;
    419 	vmu_data.vmu_nentities = 0;
    420 
    421 	vmu_data.vmu_cache = NULL;
    422 	vmu_data.vmu_calc_thread = NULL;
    423 	vmu_data.vmu_calc_flags = 0;
    424 	vmu_data.vmu_pending_flags = 0;
    425 	vmu_data.vmu_pending_waiters = 0;
    426 }
    427 
    428 /*
    429  * Allocate hashes for tracking vm objects visited for an entity.
    430  * Update list of entities.
    431  */
    432 static vmu_entity_t *
    433 vmu_alloc_entity(id_t id, int type, id_t zoneid)
    434 {
    435 	vmu_entity_t *entity;
    436 
    437 	if (vmu_data.vmu_free_entities != NULL) {
    438 		entity = vmu_data.vmu_free_entities;
    439 		vmu_data.vmu_free_entities =
    440 		    vmu_data.vmu_free_entities->vme_next;
    441 		bzero(&entity->vme_result, sizeof (vmusage_t));
    442 	} else {
    443 		entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
    444 	}
    445 	entity->vme_result.vmu_id = id;
    446 	entity->vme_result.vmu_zoneid = zoneid;
    447 	entity->vme_result.vmu_type = type;
    448 
    449 	if (entity->vme_vnode_hash == NULL)
    450 		entity->vme_vnode_hash = mod_hash_create_ptrhash(
    451 		    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
    452 		    sizeof (vnode_t));
    453 
    454 	if (entity->vme_amp_hash == NULL)
    455 		entity->vme_amp_hash = mod_hash_create_ptrhash(
    456 		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
    457 		    sizeof (struct anon_map));
    458 
    459 	if (entity->vme_anon_hash == NULL)
    460 		entity->vme_anon_hash = mod_hash_create_ptrhash(
    461 		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
    462 		    mod_hash_null_valdtor, sizeof (struct anon));
    463 
    464 	entity->vme_next = vmu_data.vmu_entities;
    465 	vmu_data.vmu_entities = entity;
    466 	vmu_data.vmu_nentities++;
    467 
    468 	return (entity);
    469 }
    470 
    471 /*
    472  * Allocate a zone entity, and hashes for tracking visited vm objects
    473  * for projects, tasks, and users within that zone.
    474  */
    475 static vmu_zone_t *
    476 vmu_alloc_zone(id_t id)
    477 {
    478 	vmu_zone_t *zone;
    479 
    480 	if (vmu_data.vmu_free_zones != NULL) {
    481 		zone = vmu_data.vmu_free_zones;
    482 		vmu_data.vmu_free_zones =
    483 		    vmu_data.vmu_free_zones->vmz_next;
    484 		zone->vmz_next = NULL;
    485 		zone->vmz_zone = NULL;
    486 	} else {
    487 		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
    488 	}
    489 
    490 	zone->vmz_id = id;
    491 
    492 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
    493 		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
    494 
    495 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
    496 	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
    497 		zone->vmz_projects_hash = mod_hash_create_idhash(
    498 		    "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
    499 
    500 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
    501 	    != 0 && zone->vmz_tasks_hash == NULL)
    502 		zone->vmz_tasks_hash = mod_hash_create_idhash(
    503 		    "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
    504 
    505 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
    506 	    != 0 && zone->vmz_rusers_hash == NULL)
    507 		zone->vmz_rusers_hash = mod_hash_create_idhash(
    508 		    "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
    509 
    510 	if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
    511 	    != 0 && zone->vmz_eusers_hash == NULL)
    512 		zone->vmz_eusers_hash = mod_hash_create_idhash(
    513 		    "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
    514 
    515 	return (zone);
    516 }
    517 
    518 /*
    519  * Allocate a structure for tracking visited bounds for a vm object.
    520  */
    521 static vmu_object_t *
    522 vmu_alloc_object(caddr_t key, int type)
    523 {
    524 	vmu_object_t *object;
    525 
    526 	if (vmu_data.vmu_free_objects != NULL) {
    527 		object = vmu_data.vmu_free_objects;
    528 		vmu_data.vmu_free_objects =
    529 		    vmu_data.vmu_free_objects->vmo_next;
    530 	} else {
    531 		object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
    532 	}
    533 
    534 	object->vmo_key = key;
    535 	object->vmo_type = type;
    536 	object->vmo_bounds = NULL;
    537 
    538 	return (object);
    539 }
    540 
    541 /*
    542  * Allocate and return a bound structure.
    543  */
    544 static vmu_bound_t *
    545 vmu_alloc_bound()
    546 {
    547 	vmu_bound_t *bound;
    548 
    549 	if (vmu_data.vmu_free_bounds != NULL) {
    550 		bound = vmu_data.vmu_free_bounds;
    551 		vmu_data.vmu_free_bounds =
    552 		    vmu_data.vmu_free_bounds->vmb_next;
    553 		bzero(bound, sizeof (vmu_bound_t));
    554 	} else {
    555 		bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
    556 		bzero(bound, sizeof (vmu_bound_t));
    557 	}
    558 	return (bound);
    559 }
    560 
    561 /*
    562  * vmu_find_insert_* functions implement hash lookup or allocate and
    563  * insert operations.
    564  */
    565 static vmu_object_t *
    566 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
    567 {
    568 	int ret;
    569 	vmu_object_t *object;
    570 
    571 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
    572 	    (mod_hash_val_t *)&object);
    573 	if (ret != 0) {
    574 		object = vmu_alloc_object(key, type);
    575 		ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
    576 		    (mod_hash_val_t)object, (mod_hash_hndl_t)0);
    577 		ASSERT(ret == 0);
    578 	}
    579 	return (object);
    580 }
    581 
    582 static int
    583 vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
    584 {
    585 	int ret;
    586 	caddr_t val;
    587 
    588 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
    589 	    (mod_hash_val_t *)&val);
    590 
    591 	if (ret == 0)
    592 		return (0);
    593 
    594 	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
    595 	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);
    596 
    597 	ASSERT(ret == 0);
    598 
    599 	return (1);
    600 }
    601 
    602 static vmu_entity_t *
    603 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
    604 {
    605 	int ret;
    606 	vmu_entity_t *entity;
    607 
    608 	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
    609 	    (mod_hash_val_t *)&entity);
    610 	if (ret != 0) {
    611 		entity = vmu_alloc_entity(id, type, zoneid);
    612 		ret = i_mod_hash_insert_nosync(hash,
    613 		    (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
    614 		    (mod_hash_hndl_t)0);
    615 		ASSERT(ret == 0);
    616 	}
    617 	return (entity);
    618 }
    619 
    620 
    621 
    622 
    623 /*
    624  * Returns list of object bounds between start and end.  New bounds inserted
    625  * by this call are given type.
    626  *
    627  * Returns the number of pages covered if new bounds are created.  Returns 0
    628  * if region between start/end consists of all existing bounds.
    629  */
    630 static pgcnt_t
    631 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
    632     end, char type, vmu_bound_t **first, vmu_bound_t **last)
    633 {
    634 	vmu_bound_t *next;
    635 	vmu_bound_t *prev = NULL;
    636 	vmu_bound_t *tmp = NULL;
    637 	pgcnt_t ret = 0;
    638 
    639 	*first = *last = NULL;
    640 
    641 	for (next = ro->vmo_bounds; next != NULL; next = next->vmb_next) {
    642 		/*
    643 		 * Find bounds overlapping or overlapped by range [start,end].
    644 		 */
    645 		if (start > next->vmb_end) {
    646 			/* bound is before new bound */
    647 			prev = next;
    648 			continue;
    649 		}
    650 		if (next->vmb_start > end) {
    651 			/* bound is after new bound */
    652 			break;
    653 		}
    654 		if (*first == NULL)
    655 			*first = next;
    656 		*last = next;
    657 	}
    658 
    659 	if (*first == NULL) {
    660 		ASSERT(*last == NULL);
    661 		/*
    662 		 * No bounds overlapping range [start,end], so create new
    663 		 * bound
    664 		 */
    665 		tmp = vmu_alloc_bound();
    666 		tmp->vmb_start = start;
    667 		tmp->vmb_end = end;
    668 		tmp->vmb_type = type;
    669 		if (prev == NULL) {
    670 			tmp->vmb_next = ro->vmo_bounds;
    671 			ro->vmo_bounds = tmp;
    672 		} else {
    673 			tmp->vmb_next = prev->vmb_next;
    674 			prev->vmb_next = tmp;
    675 		}
    676 		*first = tmp;
    677 		*last = tmp;
    678 		ASSERT(tmp->vmb_end >= tmp->vmb_start);
    679 		ret = tmp->vmb_end - tmp->vmb_start + 1;
    680 		return (ret);
    681 	}
    682 
    683 	/* Check to see if start is before first known bound */
    684 	ASSERT(first != NULL && last != NULL);
    685 	next = (*first);
    686 	if (start < (*first)->vmb_start) {
    687 		/* Create new bound before first bound */
    688 		tmp = vmu_alloc_bound();
    689 		tmp->vmb_start = start;
    690 		tmp->vmb_end = (*first)->vmb_start - 1;
    691 		tmp->vmb_type = type;
    692 		tmp->vmb_next = *first;
    693 		if (*first == ro->vmo_bounds)
    694 			ro->vmo_bounds = tmp;
    695 		if (prev != NULL)
    696 			prev->vmb_next = tmp;
    697 		ASSERT(tmp->vmb_end >= tmp->vmb_start);
    698 		ret += tmp->vmb_end - tmp->vmb_start + 1;
    699 		*first = tmp;
    700 	}
    701 	/*
    702 	 * Between start and end, search for gaps between and after existing
    703 	 * bounds.  Create new bounds to fill gaps if they exist.
    704 	 */
    705 	while (end > next->vmb_end) {
    706 		/*
    707 		 * Check for gap between bound and next bound. if no gap,
    708 		 * continue.
    709 		 */
    710 		if ((next != *last) &&
    711 		    ((next->vmb_end + 1) == next->vmb_next->vmb_start)) {
    712 			next = next->vmb_next;
    713 			continue;
    714 		}
    715 		/*
    716 		 * Insert new bound in gap after bound, and before next
    717 		 * bound if next bound exists.
    718 		 */
    719 		tmp = vmu_alloc_bound();
    720 		tmp->vmb_type = type;
    721 		tmp->vmb_next = next->vmb_next;
    722 		tmp->vmb_start = next->vmb_end + 1;
    723 
    724 		if (next != *last) {
    725 			tmp->vmb_end = next->vmb_next->vmb_start - 1;
    726 			ASSERT(tmp->vmb_end >= tmp->vmb_start);
    727 			ret += tmp->vmb_end - tmp->vmb_start + 1;
    728 			next->vmb_next = tmp;
    729 			next = tmp->vmb_next;
    730 		} else {
    731 			tmp->vmb_end = end;
    732 			ASSERT(tmp->vmb_end >= tmp->vmb_start);
    733 			ret += tmp->vmb_end - tmp->vmb_start + 1;
    734 			next->vmb_next = tmp;
    735 			*last = tmp;
    736 			break;
    737 		}
    738 	}
    739 	return (ret);
    740 }
    741 
    742 /*
    743  * vmu_update_bounds()
    744  *
    745  * first, last:	list of continuous bounds, of which zero or more are of
    746  * 		type VMUSAGE_BOUND_UNKNOWN.
    747  *
    748  * new_first, new_last:	list of continuous bounds, of which none are of
    749  *			type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
    750  *			update the types of bounds in (first,last) with
    751  *			type VMUSAGE_BOUND_UNKNOWN.
    752  *
    753  * For the list of bounds (first,last), this function updates any bounds
    754  * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
    755  * the list (new_first, new_last).
    756  *
    757  * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
    758  * (new_first, new_last), it will be split into multiple bounds.
    759  *
    760  * Return value:
    761  * 	The number of pages in the list of bounds (first,last) that were of
    762  *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
    763  *	VMUSAGE_BOUND_INCORE.
    764  *
    765  */
    766 static pgcnt_t
    767 vmu_update_bounds(vmu_bound_t **first, vmu_bound_t **last,
    768     vmu_bound_t *new_first, vmu_bound_t *new_last)
    769 {
    770 	vmu_bound_t *next, *new_next, *tmp;
    771 	pgcnt_t rss = 0;
    772 
    773 	next = *first;
    774 	new_next = new_first;
    775 
    776 	/*
    777 	 * Verify first and last bound are covered by new bounds if they
    778 	 * have unknown type.
    779 	 */
    780 	ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
    781 	    (*first)->vmb_start >= new_next->vmb_start);
    782 	ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
    783 	    (*last)->vmb_end <= new_last->vmb_end);
    784 	for (;;) {
    785 		/* If bound already has type, proceed to next bound */
    786 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
    787 			if (next == *last)
    788 				break;
    789 			next = next->vmb_next;
    790 			continue;
    791 		}
    792 		while (new_next->vmb_end < next->vmb_start)
    793 			new_next = new_next->vmb_next;
    794 		ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
    795 		next->vmb_type = new_next->vmb_type;
    796 		if (new_next->vmb_end < next->vmb_end) {
    797 			/* need to split bound */
    798 			tmp = vmu_alloc_bound();
    799 			tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
    800 			tmp->vmb_start = new_next->vmb_end + 1;
    801 			tmp->vmb_end = next->vmb_end;
    802 			tmp->vmb_next = next->vmb_next;
    803 			next->vmb_end = new_next->vmb_end;
    804 			next->vmb_next = tmp;
    805 			if (*last == next)
    806 				*last = tmp;
    807 			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
    808 				rss += next->vmb_end - next->vmb_start + 1;
    809 			next = tmp;
    810 		} else {
    811 			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
    812 				rss += next->vmb_end - next->vmb_start + 1;
    813 			if (next == *last)
    814 				break;
    815 			next = next->vmb_next;
    816 		}
    817 	}
    818 	return (rss);
    819 }
    820 
    821 /*
    822  * merges adjacent bounds with same type between first and last bound.
    823  * After merge, last pointer is no longer valid, as last bound may be
    824  * merged away.
    825  */
    826 static void
    827 vmu_merge_bounds(vmu_bound_t **first, vmu_bound_t **last)
    828 {
    829 	vmu_bound_t *next;
    830 	vmu_bound_t *tmp;
    831 
    832 	ASSERT(*first != NULL);
    833 	ASSERT(*last != NULL);
    834 
    835 	next = *first;
    836 	while (next != *last) {
    837 
    838 		/* If bounds are adjacent and have same type, merge them */
    839 		if (((next->vmb_end + 1) == next->vmb_next->vmb_start) &&
    840 		    (next->vmb_type == next->vmb_next->vmb_type)) {
    841 			tmp = next->vmb_next;
    842 			next->vmb_end = tmp->vmb_end;
    843 			next->vmb_next = tmp->vmb_next;
    844 			vmu_free_bound(tmp);
    845 			if (tmp == *last)
    846 				*last = next;
    847 		} else {
    848 			next = next->vmb_next;
    849 		}
    850 	}
    851 }
    852 
    853 /*
    854  * Given an amp and a list of bounds, updates each bound's type with
    855  * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
    856  *
    857  * If a bound is partially incore, it will be split into two bounds.
    858  * first and last may be modified, as bounds may be split into multiple
    859  * bounds if the are partially incore/not-incore.
    860  *
    861  * Set incore to non-zero if bounds are already known to be incore
    862  *
    863  */
    864 static void
    865 vmu_amp_update_incore_bounds(struct anon_map *amp, vmu_bound_t **first,
    866     vmu_bound_t **last, boolean_t incore)
    867 {
    868 	vmu_bound_t *next;
    869 	vmu_bound_t *tmp;
    870 	pgcnt_t index;
    871 	short bound_type;
    872 	short page_type;
    873 	vnode_t *vn;
    874 	anoff_t off;
    875 	struct anon *ap;
    876 
    877 	next = *first;
    878 	/* Shared anon slots don't change once set */
    879 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
    880 	for (;;) {
    881 		if (incore == B_TRUE)
    882 			next->vmb_type = VMUSAGE_BOUND_INCORE;
    883 
    884 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
    885 			if (next == *last)
    886 				break;
    887 			next = next->vmb_next;
    888 			continue;
    889 		}
    890 		bound_type = next->vmb_type;
    891 		index = next->vmb_start;
    892 		while (index <= next->vmb_end) {
    893 
    894 			/*
    895 			 * These are used to determine how much to increment
    896 			 * index when a large page is found.
    897 			 */
    898 			page_t *page;
    899 			pgcnt_t pgcnt = 1;
    900 			uint_t pgshft;
    901 			pgcnt_t pgmsk;
    902 
    903 			ap = anon_get_ptr(amp->ahp, index);
    904 			if (ap != NULL)
    905 				swap_xlate(ap, &vn, &off);
    906 
    907 			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
    908 			    (page = page_exists(vn, off)) != NULL) {
    909 				page_type = VMUSAGE_BOUND_INCORE;
    910 				if (page->p_szc > 0) {
    911 					pgcnt = page_get_pagecnt(page->p_szc);
    912 					pgshft = page_get_shift(page->p_szc);
    913 					pgmsk = (0x1 << (pgshft - PAGESHIFT))
    914 					    - 1;
    915 				}
    916 			} else {
    917 				page_type = VMUSAGE_BOUND_NOT_INCORE;
    918 			}
    919 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
    920 				next->vmb_type = page_type;
    921 			} else if (next->vmb_type != page_type) {
    922 				/*
    923 				 * if current bound type does not match page
    924 				 * type, need to split off new bound.
    925 				 */
    926 				tmp = vmu_alloc_bound();
    927 				tmp->vmb_type = page_type;
    928 				tmp->vmb_start = index;
    929 				tmp->vmb_end = next->vmb_end;
    930 				tmp->vmb_next = next->vmb_next;
    931 				next->vmb_end = index - 1;
    932 				next->vmb_next = tmp;
    933 				if (*last == next)
    934 					*last = tmp;
    935 				next = tmp;
    936 			}
    937 			if (pgcnt > 1) {
    938 				/*
    939 				 * If inside large page, jump to next large
    940 				 * page
    941 				 */
    942 				index = (index & ~pgmsk) + pgcnt;
    943 			} else {
    944 				index++;
    945 			}
    946 		}
    947 		if (next == *last) {
    948 			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
    949 			break;
    950 		} else
    951 			next = next->vmb_next;
    952 	}
    953 	ANON_LOCK_EXIT(&amp->a_rwlock);
    954 }
    955 
    956 /*
    957  * Same as vmu_amp_update_incore_bounds(), except for tracking
    958  * incore-/not-incore for vnodes.
    959  */
    960 static void
    961 vmu_vnode_update_incore_bounds(vnode_t *vnode, vmu_bound_t **first,
    962     vmu_bound_t **last)
    963 {
    964 	vmu_bound_t *next;
    965 	vmu_bound_t *tmp;
    966 	pgcnt_t index;
    967 	short bound_type;
    968 	short page_type;
    969 
    970 	next = *first;
    971 	for (;;) {
    972 		if (vnode->v_pages == NULL)
    973 			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
    974 
    975 		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
    976 			if (next == *last)
    977 				break;
    978 			next = next->vmb_next;
    979 			continue;
    980 		}
    981 
    982 		bound_type = next->vmb_type;
    983 		index = next->vmb_start;
    984 		while (index <= next->vmb_end) {
    985 
    986 			/*
    987 			 * These are used to determine how much to increment
    988 			 * index when a large page is found.
    989 			 */
    990 			page_t *page;
    991 			pgcnt_t pgcnt = 1;
    992 			uint_t pgshft;
    993 			pgcnt_t pgmsk;
    994 
    995 			if (vnode->v_pages != NULL &&
    996 			    (page = page_exists(vnode, ptob(index))) != NULL) {
    997 				page_type = VMUSAGE_BOUND_INCORE;
    998 				if (page->p_szc > 0) {
    999 					pgcnt = page_get_pagecnt(page->p_szc);
   1000 					pgshft = page_get_shift(page->p_szc);
   1001 					pgmsk = (0x1 << (pgshft - PAGESHIFT))
   1002 					    - 1;
   1003 				}
   1004 			} else {
   1005 				page_type = VMUSAGE_BOUND_NOT_INCORE;
   1006 			}
   1007 			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
   1008 				next->vmb_type = page_type;
   1009 			} else if (next->vmb_type != page_type) {
   1010 				/*
   1011 				 * if current bound type does not match page
   1012 				 * type, need to split off new bound.
   1013 				 */
   1014 				tmp = vmu_alloc_bound();
   1015 				tmp->vmb_type = page_type;
   1016 				tmp->vmb_start = index;
   1017 				tmp->vmb_end = next->vmb_end;
   1018 				tmp->vmb_next = next->vmb_next;
   1019 				next->vmb_end = index - 1;
   1020 				next->vmb_next = tmp;
   1021 				if (*last == next)
   1022 					*last = tmp;
   1023 				next = tmp;
   1024 			}
   1025 			if (pgcnt > 1) {
   1026 				/*
   1027 				 * If inside large page, jump to next large
   1028 				 * page
   1029 				 */
   1030 				index = (index & ~pgmsk) + pgcnt;
   1031 			} else {
   1032 				index++;
   1033 			}
   1034 		}
   1035 		if (next == *last) {
   1036 			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
   1037 			break;
   1038 		} else
   1039 			next = next->vmb_next;
   1040 	}
   1041 }
   1042 
   1043 /*
   1044  * Calculate the rss and swap consumed by a segment.  vmu_entities is the
   1045  * list of entities to visit.  For shared segments, the vnode or amp
   1046  * is looked up in each entity to see if has been already counted.  Private
   1047  * anon pages are checked per entity to ensure that cow pages are not
   1048  * double counted.
   1049  *
   1050  * For private mapped files, first the amp is checked for private pages.