Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 #include <sys/disp.h>
     30 #include <sys/param.h>
     31 #include <sys/systm.h>
     32 #include <sys/sysmacros.h>
     33 #include <sys/atomic.h>
     34 #include <sys/cpucaps_impl.h>
     35 #include <sys/dtrace.h>
     36 #include <sys/sdt.h>
     37 #include <sys/debug.h>
     38 #include <sys/rctl.h>
     39 #include <sys/errno.h>
     40 
     41 /*
     42  * CPU Caps implementation
     43  * =======================
     44  *
     45  * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
     46  * usage for all projects running inside the zone. If the zone CPU cap is set
     47  * below the project CPU cap, the latter will have no effect.
     48  *
     49  * When CPU usage of projects and/or zones reaches specified caps, threads in
     50  * them do not get scheduled and instead are placed on wait queues associated
     51  * with a cap. Such threads will start running again only when CPU usage drops
     52  * below the cap level. Each zone and each project has its own wait queue.
     53  *
     54  * When CPU cap is set, the kernel continuously keeps track of CPU time used
     55  * by capped zones and/or projects over a short time interval and calculates
     56  * their current CPU usage as a percentage. When the accumulated usage reaches
     57  * the CPU cap, LWPs running in the user-land (when they are not holding any
     58  * critical kernel locks) are placed on special wait queues until their
     59  * project's or zone's CPU usage drops below the cap.
     60  *
     61  * The system maintains a list of all capped projects and all capped zones. On
     62  * every clock tick every active thread belonging to a capped project adds its
     63  * CPU usage to its project. Usage from all projects belonging to a capped zone
     64  * is aggregated to get the zone usage.
     65  *
     66  * When the current CPU usage is above the cap, a project or zone is considered
     67  * over-capped. Every user thread caught running in an over-capped project or
     68  * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and
     69  * is requested to surrender its CPU. This causes scheduling class specific
     70  * CL_PREEMPT() callback to be invoked. The callback function places threads
     71  * marked as TS_PROJWAITQ on a wait queue and calls swtch().
     72  *
     73  * Threads are only placed on wait queues after trapping from user-land
     74  * (they could be holding some user locks, but no kernel locks) and while
     75  * returning from the trap back to the user-land when no kernel locks are held.
     76  * Putting threads on wait queues in random places while running in the
     77  * kernel might lead to all kinds of locking problems.
     78  *
     79  * Accounting
     80  * ==========
     81  *
     82  * Accounting of CPU usage is based on per-thread micro-state accounting data.
     83  * On every clock tick clock() adds new on-CPU time for every thread found on
     84  * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
     85  * New times means time since it was last accounted for. On-CPU times greater
     86  * than 1 tick are truncated to 1 tick.
     87  *
     88  * Project CPU usage is aggregated from all threads within the project.
     89  * Zone CPU usage is the sum of usages for all projects within the zone. Zone
     90  * CPU usage is calculated on every clock tick by walking list of projects and
     91  * adding their usage together.
     92  *
     93  * Decay
     94  * =====
     95  *
     96  * CPU usage is decayed by the caps_update() routine which is called once per
     97  * every clock tick. It walks lists of project caps and decays their usages by
     98  * one per cent. If CPU usage drops below cap levels, threads on the wait queue
     99  * are made runnable again, one thread per clock tick.
    100  *
    101  * Interfaces
    102  * ==========
    103  *
    104  * The CPU Caps facility provides the following interfaces to the rest of the
    105  * system:
    106  *
    107  *   cpucaps_project_add(kproject_t *)
    108  *
    109  * Notifies the framework of a new project. It should be put on the
    110  * capped_projects list if its zone has a cap.
    111  *
    112  *   cpucaps_project_remove(kproject_t *)
    113  *
    114  * Remove the association between the specified project and its cap.
    115  * Called right before the project is destroyed.
    116  *
    117  * cpucaps_project_set(kproject_t *, rctl_qty_t)
    118  *
    119  * Set project cap of the specified project to the specified value. Setting the
    120  * value to NOCAP is equivalent to removing the cap.
    121  *
    122  *   cpucaps_zone_set(zone_t *, rctl_qty_t)
    123  *
    124  * Set zone cap of the specified zone to the specified value. Setting the value
    125  * to NOCAP is equivalent to removing the cap.
    126  *
    127  *   cpucaps_zone_remove(zone_t *)
    128  *
    129  * Remove the association between the zone and its cap.
    130  *
    131  *   cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
    132  *
    133  * Charges specified thread's project the amount of on-CPU time that it used.
    134  * If the third argument is CPUCAPS_CHARGE_ONLY returns False.
    135  * Otherwise returns True if the thread should be penalized because its
    136  * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ
    137  * bits in t_schedflag in this case.
    138  *
    139  *   CPUCAPS_ENFORCE(kthread_id_t *)
    140  *
    141  * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
    142  * state on project or zone wait queues, as requested by TS_PROJWAITQ or
    143  * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
    144  * wait queue or False otherwise.
    145  *
    146  *   cpucaps_sc_init(caps_sc_t *)
    147  *
    148  * Initializes the scheduling-class specific CPU Caps data for a thread.
    149  *
    150  * LOCKS
    151  * =====
    152  *
    153  * All the individual caps structures and their lists are protected by a global
    154  * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
    155  * caps, so it is usually uncontended. We avoid all blocking memory allocations
    156  * while holding caps_lock to prevent clock() from blocking.
    157  *
    158  * Thread state is protected by the thread lock. It protects the association
    159  * between a thread and its project and, as a consequence, its zone. The
    160  * association can not break while thread lock is held, so the project or zone
    161  * cap are not going to disappear while thread lock is held.
    162  *
    163  * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is
    164  * grabbed by scheduling classes already holding thread lock at high PIL and by
    165  * clock thread performing usage decay. We should do as little work as possible
    166  * while holding the lock since it may be very hot. All threads in the project
    167  * contend for the same cache line doing cap usage updates.
    168  */
    169 
/*
 * caps_lock protects list of capped projects and zones, changes in the cap
 * state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This can be per-zone cap flag, but we don't keep any
 * cap state for now.
 *
 * cpucaps_busy is raised by cpucaps_zone_set() for the period during which it
 * drops caps_lock to walk the projects of the zone; a concurrent
 * cpucaps_zone_set() that finds the flag set fails with EBUSY.
 *
 * cpucaps_enabled is turned on by cap_enable() when the framework was off and
 * turned off by cap_disable() once both lists below are empty.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */
    183 
/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick which is kept in the cap_tick_cost variable.
 *
 * It is set once by cpucaps_init() to TICK_TO_NSEC(1). Cap values supplied by
 * the resource control framework are multiplied by it before being stored in
 * cap_value, and cap_usage is divided by it when usage is reported back.
 */
static hrtime_t cap_tick_cost;
    189 
/*
 * How much of the usage value is decayed every clock tick
 * Decay one per cent of value per tick
 *
 * The value is used as a divisor: on each tick the project usage is reduced
 * by usage / CAP_DECAY_FACTOR (rounded to the closest integer).
 */
#define	CAP_DECAY_FACTOR 100
    195 
    196 /*
    197  * Scale the value and round it to the closest integer value
    198  */
    199 #define	ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))
    200 
/*
 * Per-tick update routine; installed as cpucaps_clock_callout by cap_enable().
 */
static void caps_update();
    202 
/*
 * CAP kstats.
 *
 * A single statically allocated set of named kstat entries. Every cap kstat
 * created by cap_project_enable() and cap_zone_enable() is virtual and points
 * its ks_data at this one structure, with cap_kstat_lock installed as the
 * kstat lock.
 * NOTE(review): the values are presumably filled in per cap by
 * cap_kstat_update(), which is not visible here - confirm.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};
    223 
    224 
/*
 * Lock installed as ks_lock on every cap kstat, and the ks_update callback
 * installed on every cap kstat.
 */
static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);
    227 
    228 /*
    229  * Initialize CPU caps infrastructure.
    230  *   - Initialize lists of capped zones and capped projects
    231  *   - Set cpucaps_clock_callout to NULL
    232  */
    233 void
    234 cpucaps_init()
    235 {
    236 	/*
    237 	 * Initialize global variables
    238 	 */
    239 	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);
    240 
    241 	list_create(&capped_zones, sizeof (cpucap_t),
    242 	    offsetof(cpucap_t, cap_link));
    243 	list_create(&capped_projects, sizeof (cpucap_t),
    244 	    offsetof(cpucap_t, cap_link));
    245 
    246 	cpucaps_enabled = B_FALSE;
    247 	cpucaps_busy = B_FALSE;
    248 	cpucaps_clock_callout = NULL;
    249 }
    250 
    251 /*
    252  * Initialize scheduling-class specific CPU Caps data.
    253  */
    254 void
    255 cpucaps_sc_init(caps_sc_t *csc)
    256 {
    257 	csc->csc_cputime = 0;
    258 }
    259 
    260 /*
    261  * Allocate and initialize cpucap structure
    262  */
    263 static cpucap_t *
    264 cap_alloc(void)
    265 {
    266 	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);
    267 
    268 	DISP_LOCK_INIT(&cap->cap_usagelock);
    269 	waitq_init(&cap->cap_waitq);
    270 
    271 	return (cap);
    272 }
    273 
    274 /*
    275  * Free cpucap structure
    276  */
    277 static void
    278 cap_free(cpucap_t *cap)
    279 {
    280 	if (cap == NULL)
    281 		return;
    282 
    283 	/*
    284 	 * This cap should not be active
    285 	 */
    286 	ASSERT(!list_link_active(&cap->cap_link));
    287 	ASSERT(cap->cap_value == 0);
    288 	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));
    289 
    290 	waitq_fini(&cap->cap_waitq);
    291 	DISP_LOCK_DESTROY(&cap->cap_usagelock);
    292 
    293 	kmem_free(cap, sizeof (cpucap_t));
    294 }
    295 
    296 /*
    297  * Activate cap - insert into active list and unblock its
    298  * wait queue. Should be called with caps_lock held.
    299  * The cap_value field is set to the value supplied.
    300  */
    301 static void
    302 cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
    303 {
    304 	ASSERT(MUTEX_HELD(&caps_lock));
    305 
    306 	/*
    307 	 * Cap can not be already enabled
    308 	 */
    309 	ASSERT(!CAP_ENABLED(cap));
    310 	ASSERT(!list_link_active(&cap->cap_link));
    311 
    312 	list_insert_tail(l, cap);
    313 	cap->cap_below = cap->cap_above = 0;
    314 	cap->cap_maxusage = 0;
    315 	cap->cap_usage = 0;
    316 	cap->cap_value = value;
    317 	waitq_unblock(&cap->cap_waitq);
    318 	if (CPUCAPS_OFF()) {
    319 		cpucaps_enabled = B_TRUE;
    320 		cpucaps_clock_callout = caps_update;
    321 	}
    322 }
    323 
    324 /*
    325  * Deactivate cap
    326  *   - Block its wait queue. This prevents any new threads from being
    327  *	enqueued there and moves all enqueued threads to the run queue.
    328  *   - Remove cap from list l.
    329  *   - Disable CPU caps globally if there are no capped projects or zones
    330  *
    331  * Should be called with caps_lock held.
    332  */
    333 static void
    334 cap_disable(list_t *l, cpucap_t *cap)
    335 {
    336 	ASSERT(MUTEX_HELD(&caps_lock));
    337 	/*
    338 	 * Cap should be currently active
    339 	 */
    340 	ASSERT(CPUCAPS_ON());
    341 	ASSERT(list_link_active(&cap->cap_link));
    342 	ASSERT(CAP_ENABLED(cap));
    343 
    344 	waitq_block(&cap->cap_waitq);
    345 	list_remove(l, cap);
    346 	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
    347 		cpucaps_enabled = B_FALSE;
    348 		cpucaps_clock_callout = NULL;
    349 	}
    350 	cap->cap_value = 0;
    351 	cap->cap_project = NULL;
    352 	cap->cap_zone = NULL;
    353 	if (cap->cap_kstat != NULL) {
    354 		kstat_delete(cap->cap_kstat);
    355 		cap->cap_kstat = NULL;
    356 	}
    357 
    358 }
    359 
    360 /*
    361  * Enable cap for a project kpj
    362  * It is safe to enable already enabled project cap.
    363  * Should be called with caps_lock held.
    364  */
    365 static void
    366 cap_project_enable(kproject_t *kpj, hrtime_t value)
    367 {
    368 	cpucap_t *cap = kpj->kpj_cpucap;
    369 
    370 	ASSERT(MUTEX_HELD(&caps_lock));
    371 	ASSERT(cap != NULL);
    372 
    373 	if (CAP_DISABLED(cap)) {
    374 		ASSERT(cap->cap_kstat == NULL);
    375 		cap_enable(&capped_projects, cap, value);
    376 		cap->cap_project = kpj;
    377 		cap->cap_zone = kpj->kpj_zone;
    378 
    379 		/*
    380 		 * Create cap kstats
    381 		 */
    382 		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
    383 		    KSTAT_TYPE_NAMED,
    384 		    sizeof (cap_kstat) / sizeof (kstat_named_t),
    385 		    KSTAT_FLAG_VIRTUAL)) != NULL) {
    386 		    cap->cap_kstat->ks_data_size +=
    387 			strlen(cap->cap_zone->zone_name) + 1;
    388 		    cap->cap_kstat->ks_lock = &cap_kstat_lock;
    389 		    cap->cap_kstat->ks_data = &cap_kstat;
    390 		    cap->cap_kstat->ks_update = cap_kstat_update;
    391 		    cap->cap_kstat->ks_private = cap;
    392 		    kstat_install(cap->cap_kstat);
    393 		}
    394 	}
    395 }
    396 
    397 /*
    398  * Disable project cap.
    399  * It is safe to disable already disabled project cap.
    400  * Should be called with caps_lock held.
    401  */
    402 static void
    403 cap_project_disable(kproject_t *kpj)
    404 {
    405 	cpucap_t *cap = kpj->kpj_cpucap;
    406 
    407 	ASSERT(MUTEX_HELD(&caps_lock));
    408 	ASSERT(cap != NULL);
    409 	ASSERT(cap->cap_project == kpj);
    410 
    411 	if (CAP_ENABLED(cap))
    412 		cap_disable(&capped_projects, cap);
    413 }
    414 
    415 /*
    416  * Enable cap for a zone
    417  * It is safe to enable already enabled zone cap.
    418  * Should be called with caps_lock held.
    419  */
    420 static void
    421 cap_zone_enable(zone_t *zone, hrtime_t value)
    422 {
    423 	cpucap_t *cap = zone->zone_cpucap;
    424 
    425 	ASSERT(MUTEX_HELD(&caps_lock));
    426 	ASSERT(cap != NULL);
    427 
    428 	if (CAP_DISABLED(cap)) {
    429 		ASSERT(cap->cap_kstat == NULL);
    430 		cap_enable(&capped_zones, cap, value);
    431 		cap->cap_zone = zone;
    432 
    433 		/*
    434 		 * Create cap kstats
    435 		 */
    436 		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
    437 		    KSTAT_TYPE_NAMED,
    438 		    sizeof (cap_kstat) / sizeof (kstat_named_t),
    439 		    KSTAT_FLAG_VIRTUAL)) != NULL) {
    440 		    cap->cap_kstat->ks_data_size +=
    441 			strlen(cap->cap_zone->zone_name) + 1;
    442 		    cap->cap_kstat->ks_lock = &cap_kstat_lock;
    443 		    cap->cap_kstat->ks_data = &cap_kstat;
    444 		    cap->cap_kstat->ks_update = cap_kstat_update;
    445 		    cap->cap_kstat->ks_private = cap;
    446 		    kstat_install(cap->cap_kstat);
    447 		}
    448 	}
    449 }
    450 
    451 /*
    452  * Disable zone cap.
    453  * It is safe to disable already disabled zone cap.
    454  * Should be called with caps_lock held.
    455  */
    456 static void
    457 cap_zone_disable(zone_t *zone)
    458 {
    459 	cpucap_t *cap = zone->zone_cpucap;
    460 
    461 	ASSERT(MUTEX_HELD(&caps_lock));
    462 	ASSERT(cap != NULL);
    463 	ASSERT(cap->cap_zone == zone);
    464 
    465 	if (CAP_ENABLED(cap))
    466 		cap_disable(&capped_zones, cap);
    467 }
    468 
    469 /*
    470  * Apply specified callback to all caps contained in the list `l'.
    471  */
    472 static void
    473 cap_walk(list_t *l, void (*cb)(cpucap_t *))
    474 {
    475 	cpucap_t *cap;
    476 
    477 	ASSERT(MUTEX_HELD(&caps_lock));
    478 
    479 	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
    480 		(*cb)(cap);
    481 	}
    482 }
    483 
    484 /*
    485  * If cap limit is not reached, make one thread from wait queue runnable.
    486  * The waitq_isempty check is performed without the waitq lock. If a new thread
    487  * is placed on the waitq right after the check, it will be picked up during the
    488  * next invocation of cap_poke_waitq().
    489  */
    490 static void
    491 cap_poke_waitq(cpucap_t *cap)
    492 {
    493 	ASSERT(MUTEX_HELD(&caps_lock));
    494 
    495 	if (cap->cap_usage >= cap->cap_value) {
    496 		cap->cap_above++;
    497 	} else {
    498 		waitq_t *wq = &cap->cap_waitq;
    499 
    500 		cap->cap_below++;
    501 
    502 		if (!waitq_isempty(wq))
    503 			waitq_runone(wq);
    504 	}
    505 }
    506 
    507 /*
    508  * The callback function called for every cap on capped_projects list.
    509  * Decay cap usage by CAP_DECAY_FACTOR
    510  * Add this cap project usage to its zone usage.
    511  * Kick off a thread from the cap waitq if cap is not reached.
    512  */
    513 static void
    514 cap_project_usage_walker(cpucap_t *cap)
    515 {
    516 	zone_t		*zone = cap->cap_zone;
    517 	hrtime_t	cap_usage = cap->cap_usage;
    518 
    519 	ASSERT(MUTEX_HELD(&caps_lock));
    520 	ASSERT(cap->cap_project->kpj_cpucap == cap);
    521 	ASSERT(zone == cap->cap_project->kpj_zone);
    522 	ASSERT(CAP_ENABLED(cap));
    523 
    524 	/*
    525 	 * Set or clear the CAP_REACHED flag based on the current usage.
    526 	 * Only projects having their own caps are ever marked as CAP_REACHED.
    527 	 */
    528 	cap_poke_waitq(cap);
    529 
    530 	/*
    531 	 * Add project's CPU usage to our zone's CPU usage.
    532 	 */
    533 	if (ZONE_IS_CAPPED(zone)) {
    534 		cpucap_t *zcap = zone->zone_cpucap;
    535 
    536 		ASSERT(zcap->cap_zone == zone);
    537 
    538 		/*
    539 		 * If we haven't reset this zone's usage during this clock tick
    540 		 * yet, then do it now. The cap_lbolt field is used to check
    541 		 * whether this is the first zone's project we see during this
    542 		 * tick or a subsequent one.
    543 		 */
    544 		if (zcap->cap_lbolt != lbolt64) {
    545 			if (zcap->cap_usage > zcap->cap_maxusage)
    546 				zcap->cap_maxusage = zcap->cap_usage;
    547 			zcap->cap_usage = 0;
    548 			zcap->cap_lbolt = lbolt64;
    549 		}
    550 		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
    551 		    hrtime_t, cap_usage);
    552 		zcap->cap_usage += cap_usage;
    553 		/* Check for overflows */
    554 		if (zcap->cap_usage < 0)
    555 			zcap->cap_usage = MAX_USAGE - 1;
    556 	}
    557 
    558 	/*
    559 	 * Decay project usage.
    560 	 */
    561 	disp_lock_enter(&cap->cap_usagelock);
    562 	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
    563 	disp_lock_exit(&cap->cap_usagelock);
    564 }
    565 
    566 /*
    567  * On every clock tick walk the list of project caps and update the CPU usage.
    568  * Also walk the list of zone caps checking whether any threads should
    569  * transition from wait queue to run queue.
    570  *
    571  * This function gets called by the clock thread directly when there are any
    572  * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
    573  * caps_lock for long periods of time, so there should be almost no contention
    574  * for it.
    575  */
    576 static void
    577 caps_update()
    578 {
    579 	mutex_enter(&caps_lock);
    580 	cap_walk(&capped_projects, cap_project_usage_walker);
    581 	cap_walk(&capped_zones, cap_poke_waitq);
    582 	mutex_exit(&caps_lock);
    583 }
    584 
    585 /*
    586  * The function is called for each project in a zone when the zone cap is
    587  * modified. It enables project caps if zone cap is enabled and disables if the
    588  * zone cap is disabled and project doesn't have its own cap.
    589  *
    590  * For each project that does not have cpucap structure allocated it allocates a
    591  * new structure and assigns to kpj->cpu_cap. The allocation is performed
    592  * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock
    593  * held.
    594  */
    595 static int
    596 cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
    597 {
    598 	cpucap_t *project_cap = NULL;
    599 	cpucap_t *zone_cap = (cpucap_t *)arg;
    600 
    601 	ASSERT(zone_cap != NULL);
    602 
    603 	if (kpj->kpj_cpucap == NULL) {
    604 		/*
    605 		 * This is the first time any cap was established for this
    606 		 * project. Allocate a new cpucap structure for it.
    607 		 */
    608 		project_cap = cap_alloc();
    609 	}
    610 
    611 	mutex_enter(&caps_lock);
    612 
    613 	/*
    614 	 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
    615 	 * and assign the newly allocated cpucap structure to it.
    616 	 */
    617 	if (kpj->kpj_cpucap == NULL) {
    618 		kpj->kpj_cpucap = project_cap;
    619 	} else if (project_cap != NULL) {
    620 		cap_free(project_cap);
    621 	}
    622 
    623 	project_cap = kpj->kpj_cpucap;
    624 
    625 	if (CAP_DISABLED(zone_cap)) {
    626 		/*
    627 		 * Remove all projects in this zone without caps
    628 		 * from the capped_projects list.
    629 		 */
    630 		if (project_cap->cap_value == MAX_USAGE) {
    631 			cap_project_disable(kpj);
    632 		}
    633 	} else if (CAP_DISABLED(project_cap)) {
    634 		/*
    635 		 * Add the project to capped_projects list.
    636 		 */
    637 		ASSERT(project_cap->cap_value == 0);
    638 		cap_project_enable(kpj, MAX_USAGE);
    639 	}
    640 	mutex_exit(&caps_lock);
    641 
    642 	return (0);
    643 }
    644 
    645 /*
    646  * Set zone cap to cap_val
    647  * If cap_val is equal to NOCAP, disable zone cap.
    648  *
    649  * If this is the first time a cap is set on a zone, allocate cpucap structure
    650  * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
    651  */
    652 int
    653 cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
    654 {
    655 	cpucap_t *cap = NULL;
    656 	hrtime_t value;
    657 
    658 	if (cap_val == 0)
    659 		return (EINVAL);
    660 
    661 	ASSERT(cap_val <= MAXCAP);
    662 	if (cap_val > MAXCAP)
    663 		cap_val = MAXCAP;
    664 
    665 	/*
    666 	 * Nothing to do if trying to disable a cap on a zone when caps are off
    667 	 * or a zone which does not have a cap yet.
    668 	 */
    669 	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
    670 		return (0);
    671 
    672 	if (zone->zone_cpucap == NULL)
    673 		cap = cap_alloc();
    674 
    675 	mutex_enter(&caps_lock);
    676 
    677 	if (cpucaps_busy) {
    678 		mutex_exit(&caps_lock);
    679 		return (EBUSY);
    680 	}
    681 
    682 	/*
    683 	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
    684 	 * held. If it is still NULL, assign a newly allocated cpucap to it.
    685 	 */
    686 	if (zone->zone_cpucap == NULL) {
    687 		zone->zone_cpucap = cap;
    688 	} else if (cap != NULL) {
    689 		cap_free(cap);
    690 	}
    691 
    692 	cap = zone->zone_cpucap;
    693 	value = cap_val * cap_tick_cost;
    694 	if (value < 0)
    695 		value = MAX_USAGE;
    696 
    697 	/* Nothing to do if the value is staying the same */
    698 	if (value == cap->cap_value) {
    699 		mutex_exit(&caps_lock);
    700 		return (0);
    701 	}
    702 
    703 	/*
    704 	 * Clear cap statistics since the cap value itself changes.
    705 	 */
    706 	cap->cap_above = cap->cap_below = 0;
    707 
    708 
    709 	if (cap_val == NOCAP) {
    710 		if (CAP_ENABLED(cap)) {
    711 			/*
    712 			 * Remove cap for the zone
    713 			 */
    714 			cap_zone_disable(zone);
    715 			cpucaps_busy = B_TRUE;
    716 			mutex_exit(&caps_lock);
    717 			/*
    718 			 * Disable caps for all project belonging to this zone
    719 			 * unless they have their own cap.
    720 			 */
    721 			(void) project_walk_all(zone->zone_id,
    722 			    cap_project_zone_modify_walker, cap);
    723 
    724 			mutex_enter(&caps_lock);
    725 			cpucaps_busy = B_FALSE;
    726 		}
    727 	} else if (CAP_DISABLED(cap)) {
    728 		/*
    729 		 * Set a cap on a zone which previously was not capped.
    730 		 */
    731 		cap_zone_enable(zone, value);
    732 		cpucaps_busy = B_TRUE;
    733 		mutex_exit(&caps_lock);
    734 
    735 		/*
    736 		 * Enable cap for all projects belonging to this zone.
    737 		 */
    738 		(void) project_walk_all(zone->zone_id,
    739 		    cap_project_zone_modify_walker, cap);
    740 
    741 		mutex_enter(&caps_lock);
    742 		cpucaps_busy = B_FALSE;
    743 	} else {
    744 		/*
    745 		 * No state transitions, just change the value
    746 		 */
    747 		cap->cap_value = value;
    748 	}
    749 
    750 	ASSERT(MUTEX_HELD(&caps_lock));
    751 	ASSERT(!cpucaps_busy);
    752 	mutex_exit(&caps_lock);
    753 
    754 	return (0);
    755 }
    756 
    757 /*
    758  * The project is going away so disable its cap.
    759  */
    760 void
    761 cpucaps_project_remove(kproject_t *kpj)
    762 {
    763 	mutex_enter(&caps_lock);
    764 	if (PROJECT_IS_CAPPED(kpj))
    765 		cap_project_disable(kpj);
    766 	if (kpj->kpj_cpucap != NULL) {
    767 		cap_free(kpj->kpj_cpucap);
    768 		kpj->kpj_cpucap = NULL;
    769 	}
    770 	mutex_exit(&caps_lock);
    771 }
    772 
    773 /*
    774  * The zone is going away, so disable its cap.
    775  */
    776 void
    777 cpucaps_zone_remove(zone_t *zone)
    778 {
    779 	mutex_enter(&caps_lock);
    780 	while (ZONE_IS_CAPPED(zone)) {
    781 		mutex_exit(&caps_lock);
    782 		(void) cpucaps_zone_set(zone, NOCAP);
    783 		mutex_enter(&caps_lock);
    784 	}
    785 	if (zone->zone_cpucap != NULL) {
    786 		cap_free(zone->zone_cpucap);
    787 		zone->zone_cpucap = NULL;
    788 	}
    789 	mutex_exit(&caps_lock);
    790 }
    791 
    792 /*
    793  * New project was created. It should be put on the capped_projects list if
    794  * its zone has a cap.
    795  */
    796 void
    797 cpucaps_project_add(kproject_t *kpj)
    798 {
    799 	cpucap_t *cap = NULL;
    800 
    801 	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
    802 		return;
    803 
    804 	/*
    805 	 * This project was never capped before, so allocate its cap structure.
    806 	 */
    807 	if (kpj->kpj_cpucap == NULL)
    808 		cap = cap_alloc();
    809 
    810 	mutex_enter(&caps_lock);
    811 	/*
    812 	 * Double-check with caps_lock held
    813 	 */
    814 	if (kpj->kpj_cpucap == NULL) {
    815 		kpj->kpj_cpucap = cap;
    816 	} else if (cap != NULL) {
    817 		cap_free(cap);
    818 	}
    819 
    820 	if (ZONE_IS_CAPPED(kpj->kpj_zone))
    821 		cap_project_enable(kpj, MAX_USAGE);
    822 
    823 	mutex_exit(&caps_lock);
    824 }
    825 
    826 /*
    827  * Set project cap to cap_val
    828  * If cap_val is equal to NOCAP, disable project cap.
    829  *
    830  * If this is the first time a cap is set on a project, allocate cpucap
    831  * structure without holding caps_lock to avoid KM_SLEEP allocation with
    832  * caps_lock held.
    833  */
    834 int
    835 cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
    836 {
    837 	cpucap_t *cap = NULL;
    838 	hrtime_t value;
    839 
    840 	if (cap_val == 0)
    841 		return (EINVAL);
    842 
    843 	ASSERT(cap_val <= MAXCAP);
    844 	if (cap_val > MAXCAP)
    845 		cap_val = MAXCAP;
    846 
    847 	/*
    848 	 * Nothing to do if trying to disable project cap and caps are not
    849 	 * enabled or if trying to disable cap on a project that does not have
    850 	 * cap enabled.
    851 	 */
    852 	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
    853 		return (0);
    854 
    855 	if (kpj->kpj_cpucap == NULL) {
    856 		/*
    857 		 * This project was never capped before, so allocate its cap
    858 		 * structure.
    859 		 */
    860 		cap = cap_alloc();
    861 	}
    862 
    863 	mutex_enter(&caps_lock);
    864 
    865 	/*
    866 	 * Double-check with caps_lock held.
    867 	 */
    868 	if (kpj->kpj_cpucap == NULL) {
    869 		kpj->kpj_cpucap = cap;
    870 	} else if (cap != NULL) {
    871 		cap_free(cap);
    872 	}
    873 
    874 	/*
    875 	 * Get the actual pointer to the project cap.
    876 	 */
    877 	cap = kpj->kpj_cpucap;
    878 	value = cap_val * cap_tick_cost;
    879 	if (value < 0)
    880 		value = MAX_USAGE;
    881 
    882 	/*
    883 	 * Nothing to do if the value is not changing
    884 	 */
    885 	if (value == cap->cap_value) {
    886 		mutex_exit(&caps_lock);
    887 		return (0);
    888 	}
    889 
    890 	/*
    891 	 * Clear cap statistics since the cap value itself changes.
    892 	 */
    893 	cap->cap_above = cap->cap_below = 0;
    894 	cap->cap_maxusage = 0;
    895 
    896 	if (cap_val != NOCAP) {
    897 		/*
    898 		 * Enable this cap if it is not already enabled.
    899 		 */
    900 		if (CAP_DISABLED(cap))
    901 			cap_project_enable(kpj, value);
    902 		else
    903 			cap->cap_value = value;
    904 	} else if (CAP_ENABLED(cap)) {
    905 		/*
    906 		 * User requested to drop a cap on the project. If it is part of
    907 		 * capped zone, keep the cap and set the value to MAX_USAGE,
    908 		 * otherwise disable the cap.
    909 		 */
    910 		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
    911 			cap->cap_value = MAX_USAGE;
    912 		} else {
    913 			cap_project_disable(kpj);
    914 		}
    915 	}
    916 	mutex_exit(&caps_lock);
    917 
    918 	return (0);
    919 }
    920 
    921 /*
    922  * Get cap usage.
    923  */
    924 static rctl_qty_t
    925 cap_get(cpucap_t *cap)
    926 {
    927 	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
    928 }
    929 
    930 /*
    931  * Get current project usage.
    932  */
    933 rctl_qty_t
    934 cpucaps_project_get(kproject_t *kpj)
    935 {
    936 	return (cap_get(kpj->kpj_cpucap));
    937 }
    938 
    939 /*
    940  * Get current zone usage.
    941  */
    942 rctl_qty_t
    943 cpucaps_zone_get(zone_t *zone)
    944 {
    945 	return (cap_get(zone->zone_cpucap));
    946 }
    947 
    948 /*
    949  * Charge project of thread t the time thread t spent on CPU since previously
    950  * adjusted.
    951  *
    952  * Record the current on-CPU time in the csc structure.
    953  *
    954  * Do not adjust for more than one tick worth of time.
    955  *
    956  * It is possible that the project cap is being disabled while this routine is
    957  * executed. This should not cause any issues since the association between the
    958  * thread and its project is protected by thread lock.
    959  */
    960 static void
    961 caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
    962 {
    963 	kproject_t	*kpj = ttoproj(t);
    964 	hrtime_t	new_usage;
    965 	hrtime_t	usage_delta;
    966 
    967 	ASSERT(THREAD_LOCK_HELD(t));
    968 	ASSERT(kpj->kpj_cpucap != NULL);
    969 
    970 	/* Get on-CPU time since birth of a thread */
    971 	new_usage = mstate_thread_onproc_time(t);
    972 
    973 	/* Time spent on CPU since last checked */
    974 	usage_delta = new_usage - csc->csc_cputime;
    975 
    976 	/* Save the accumulated on-CPU time */
    977 	csc->csc_cputime = new_usage;
    978 
    979 	/* Charge at most one tick worth of on-CPU time */
    980 	if (usage_delta > cap_tick_cost)
    981 		usage_delta = cap_tick_cost;
    982 
    983 	/* Add usage_delta to the project usage value. */
    984 	if (usage_delta > 0) {
    985 		cpucap_t *cap = kpj->kpj_cpucap;
    986 
    987 		DTRACE_PROBE2(cpucaps__project__charge,
    988 		    kthread_id_t, t, hrtime_t, usage_delta);
    989 
    990 		disp_lock_enter_high(&cap->cap_usagelock);
    991 		cap->cap_usage += usage_delta;
    992 
    993 		/* Check for overflows */
    994 		if (cap->cap_usage < 0)
    995 			cap->cap_usage = MAX_USAGE - 1;
    996 
    997 		disp_lock_exit_high(&cap->cap_usagelock);
    998 
    999 		/*
   1000 		 * cap_maxusage is only kept for observability. Move it outside
   1001 		 * the lock to reduce the time spent while holding the lock.
   1002 		 */
   1003 		if (cap->cap_usage > cap->cap_maxusage)
   1004 			cap->cap_maxusage = cap->cap_usage;
   1005 	}
   1006 }
   1007 
   1008 /*
   1009  * Charge thread's project and return True if project or zone should be
   1010  * penalized because its project or zone is exceeding its cap. Also sets
   1011  * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
   1012  *
   1013  * It is possible that the project cap is being disabled while this routine is
   1014  * executed. This should not cause any issues since the association between the
   1015  * thread and its project is protected by thread lock. It will still set
   1016  * TS_PROJECTWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place
   1017  * anything on the blocked wait queue.
   1018  *
   1019  */
   1020 boolean_t
   1021 cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
   1022 {
   1023 	kproject_t	*kpj = ttoproj(t);
   1024 	klwp_t		*lwp = t->t_lwp;
   1025 	zone_t		*zone;
   1026 	cpucap_t	*project_cap;
   1027 	boolean_t	rc = B_FALSE;
   1028 
   1029 	ASSERT(THREAD_LOCK_HELD(t));
   1030 
   1031 	/* Nothing to do for projects that are not capped. */
   1032 	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
   1033 		return (B_FALSE);
   1034 
   1035 	caps_charge_adjust(t, csc);
   1036 
   1037 	/*
   1038 	 * The caller only requested to charge the project usage, no enforcement
   1039 	 * part.
   1040 	 */
   1041 	if (charge_type == CPUCAPS_CHARGE_ONLY)
   1042 		return (B_FALSE);
   1043 
   1044 	project_cap = kpj->kpj_cpucap;
   1045 
   1046 	if (project_cap->cap_usage >= project_cap->cap_value) {
   1047 		t->t_schedflag |= TS_PROJWAITQ;
   1048 		rc = B_TRUE;
   1049 	} else if (t->t_schedflag & TS_PROJWAITQ) {
   1050 		t->t_schedflag &= ~TS_PROJWAITQ;
   1051 	}
   1052 
   1053 	zone = ttozone(t);
   1054 	if (!ZONE_IS_CAPPED(zone)) {
   1055 		if (t->t_schedflag & TS_ZONEWAITQ)
   1056 			t->t_schedflag &= ~TS_ZONEWAITQ;
   1057 	} else {
   1058 		cpucap_t *zone_cap = zone->zone_cpucap;
   1059 
   1060 		if (zone_cap->cap_usage >= zone_cap->cap_value) {
   1061 			t->t_schedflag |= TS_ZONEWAITQ;
   1062 			rc = B_TRUE;
   1063 		} else if (t->t_schedflag & TS_ZONEWAITQ) {
   1064 			t->t_schedflag &= ~TS_ZONEWAITQ;
   1065 		}
   1066 	}
   1067 
   1068 
   1069 	return (rc);
   1070 }
   1071 
   1072 /*
   1073  * Enforce CPU caps. If got preempted in the user-land, we know that thread does
   1074  * not hold any kernel locks, so enqueue ourselves on the waitq, if needed.
   1075  *
   1076  * CPU Caps are only enforced for user threads.
   1077  *
   1078  * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and
   1079  * threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
   1080  *
   1081  * It is possible that by the time we enter cpucaps_enforce() the cap is already
   1082  * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We
   1083  * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer
   1084  * apply.
   1085  */
   1086 boolean_t
   1087 cpucaps_enforce(kthread_t *t)
   1088 {
   1089 	klwp_t *lwp = t->t_lwp;
   1090 
   1091 	ASSERT(THREAD_LOCK_HELD(t));
   1092 
   1093 	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
   1094 		if (t->t_schedflag & TS_PROJWAITQ) {
   1095 			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
   1096 			t->t_schedflag &= ~TS_ANYWAITQ;
   1097 			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
   1098 				t)) {
   1099 				return (B_TRUE);
   1100 			}
   1101 		}
   1102 		if (t->t_schedflag & TS_ZONEWAITQ) {
   1103 			ASSERT(ttozone(t)->zone_cpucap != NULL);
   1104 			t->t_schedflag &= ~TS_ZONEWAITQ;
   1105 			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
   1106 				t)) {
   1107 				return (B_TRUE);
   1108 			}
   1109 		}
   1110 	}
   1111 
   1112 	/*
   1113 	 * The thread is not enqueued on the wait queue.
   1114 	 */
   1115 	return (B_FALSE);
   1116 }
   1117 
   1118 /*
   1119  * Convert internal cap statistics into values exported by cap kstat.
   1120  */
   1121 static int
   1122 cap_kstat_update(kstat_t *ksp, int rw)
   1123 {
   1124 	struct cap_kstat *capsp = &cap_kstat;
   1125 	cpucap_t *cap = ksp->ks_private;
   1126 	clock_t	tick_sec = SEC_TO_TICK(1);
   1127 	char *zonename = cap->cap_zone->zone_name;
   1128 
   1129 	if (rw == KSTAT_WRITE)
   1130 		return (EACCES);
   1131 
   1132 	capsp->cap_value.value.ui64 =
   1133 	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
   1134 	capsp->cap_usage.value.ui64 =
   1135 	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
   1136 	capsp->cap_maxusage.value.ui64 =
   1137 	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
   1138 	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
   1139 	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
   1140 	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
   1141 	kstat_named_setstr(&capsp->cap_zonename, zonename);
   1142 
   1143 	return (0);
   1144 }
   1145