Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/disp.h>
     28 #include <sys/param.h>
     29 #include <sys/systm.h>
     30 #include <sys/sysmacros.h>
     31 #include <sys/atomic.h>
     32 #include <sys/cpucaps_impl.h>
     33 #include <sys/dtrace.h>
     34 #include <sys/sdt.h>
     35 #include <sys/debug.h>
     36 #include <sys/rctl.h>
     37 #include <sys/errno.h>
     38 
     39 /*
     40  * CPU Caps implementation
     41  * =======================
     42  *
     43  * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
     44  * usage for all projects running inside the zone. If the zone CPU cap is set
     45  * below the project CPU cap, the latter will have no effect.
     46  *
     47  * When CPU usage of projects and/or zones reaches specified caps, threads in
     48  * them do not get scheduled and instead are placed on wait queues associated
     49  * with a cap. Such threads will start running again only when CPU usage drops
     50  * below the cap level. Each zone and each project has its own wait queue.
     51  *
 * When CPU cap is set, the kernel continuously keeps track of CPU time used by
     53  * capped zones and/or projects over a short time interval and calculates their
     54  * current CPU usage as a percentage. When the accumulated usage reaches the CPU
     55  * cap, LWPs running in the user-land (when they are not holding any critical
     56  * kernel locks) are placed on special wait queues until their project's or
     57  * zone's CPU usage drops below the cap.
     58  *
     59  * The system maintains a list of all capped projects and all capped zones. On
     60  * every clock tick every active thread belonging to a capped project adds its
     61  * CPU usage to its project. Usage from all projects belonging to a capped zone
     62  * is aggregated to get the zone usage.
     63  *
     64  * When the current CPU usage is above the cap, a project or zone is considered
     65  * over-capped. Every user thread caught running in an over-capped project or
     66  * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and
     67  * is requested to surrender its CPU. This causes scheduling class specific
     68  * CL_PREEMPT() callback to be invoked. The callback function places threads
 * marked as TS_PROJWAITQ on a wait queue and calls swtch().
     70  *
     71  * Threads are only placed on wait queues after trapping from user-land
     72  * (they could be holding some user locks, but no kernel locks) and while
     73  * returning from the trap back to the user-land when no kernel locks are held.
     74  * Putting threads on wait queues in random places while running in the
     75  * kernel might lead to all kinds of locking problems.
     76  *
     77  * Accounting
     78  * ==========
     79  *
     80  * Accounting of CPU usage is based on per-thread micro-state accounting data.
     81  * On every clock tick clock() adds new on-CPU time for every thread found on
     82  * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
 * New time means the time since it was last accounted for. On-CPU times
 * greater than 1 tick are truncated to 1 tick.
     85  *
     86  * Project CPU usage is aggregated from all threads within the project.
     87  * Zone CPU usage is the sum of usages for all projects within the zone. Zone
     88  * CPU usage is calculated on every clock tick by walking list of projects and
     89  * adding their usage together.
     90  *
     91  * Decay
     92  * =====
     93  *
     94  * CPU usage is decayed by the caps_update() routine which is called once per
     95  * every clock tick. It walks lists of project caps and decays their usages by
     96  * one per cent. If CPU usage drops below cap levels, threads on the wait queue
     97  * are made runnable again, one thread per clock tick.
     98  *
     99  * Interfaces
    100  * ==========
    101  *
    102  * The CPU Caps facility provides the following interfaces to the rest of the
    103  * system:
    104  *
    105  *   cpucaps_project_add(kproject_t *)
    106  *
    107  * Notifies the framework of a new project. It should be put on the
    108  * capped_projects list if its zone has a cap.
    109  *
    110  *   cpucaps_project_remove(kproject_t *)
    111  *
    112  * Remove the association between the specified project and its cap.
    113  * Called right before the project is destroyed.
    114  *
 *   cpucaps_project_set(kproject_t *, rctl_qty_t)
    116  *
    117  * Set project cap of the specified project to the specified value. Setting the
    118  * value to NOCAP is equivalent to removing the cap.
    119  *
    120  *   cpucaps_zone_set(zone_t *, rctl_qty_t)
    121  *
    122  * Set zone cap of the specified zone to the specified value. Setting the value
    123  * to NOCAP is equivalent to removing the cap.
    124  *
    125  *   cpucaps_zone_remove(zone_t *)
    126  *
    127  * Remove the association between the zone and its cap.
    128  *
    129  *   cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
    130  *
    131  * Charges specified thread's project the amount of on-CPU time that it used.
 * If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 * Otherwise returns True if the thread should be penalized because its
 * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ
 * bits in t_schedflag in this case.
    136  *
    137  *   CPUCAPS_ENFORCE(kthread_id_t *)
    138  *
    139  * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
    140  * state on project or zone wait queues, as requested by TS_PROJWAITQ or
    141  * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
    142  * wait queue or False otherwise.
    143  *
    144  *   cpucaps_sc_init(caps_sc_t *)
    145  *
    146  * Initializes the scheduling-class specific CPU Caps data for a thread.
    147  *
    148  * LOCKS
    149  * =====
    150  *
 * All the individual caps structures and their lists are protected by a global
    152  * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
    153  * caps, so it is usually uncontended. We avoid all blocking memory allocations
    154  * while holding caps_lock to prevent clock() from blocking.
    155  *
    156  * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
    158  * association can not break while thread lock is held, so the project or zone
    159  * cap are not going to disappear while thread lock is held.
    160  *
    161  * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is
    162  * grabbed by scheduling classes already holding thread lock at high PIL and by
    163  * clock thread performing usage decay. We should do as little work as possible
    164  * while holding the lock since it may be very hot. All threads in the project
    165  * contend for the same cache line doing cap usage updates.
    166  */
    167 
/*
 * Global CPU caps state.
 *
 * caps_lock protects the lists of capped projects and capped zones, changes
 * in the cap state and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel; cpucaps_zone_set() fails with EBUSY while the flag is
 * set. This could be a per-zone cap flag, but we don't keep any cap state for
 * now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */
    181 
/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick which is kept in the cap_tick_cost variable.
 *
 * It is set once by cpucaps_init() to TICK_TO_NSEC(1) and is used to convert
 * between rctl cap values and the internal cap_value/cap_usage units.
 */
static hrtime_t cap_tick_cost;
    187 
    188 /*
    189  * How much of the usage value is decayed every clock tick
    190  * Decay one per cent of value per tick
    191  */
    192 #define	CAP_DECAY_FACTOR 100
    193 
    194 /*
    195  * Scale the value and round it to the closest integer value
    196  */
    197 #define	ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))
    198 
/*
 * Forward declaration: cap_enable() installs caps_update() as
 * cpucaps_clock_callout before the function itself is defined.
 */
static void caps_update();
    200 
/*
 * CAP kstats.
 *
 * cap_kstat is the single template of named statistics exported for a cap.
 * Every kstat created by cap_project_enable() and cap_zone_enable() points its
 * ks_data at this one structure, uses cap_kstat_lock as its ks_lock and
 * cap_kstat_update() as its ks_update routine; the cap the kstat belongs to is
 * stored in ks_private.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};


/* ks_lock shared by all cap kstats, which all share cap_kstat as ks_data. */
static kmutex_t cap_kstat_lock;
/* ks_update routine installed on every cap kstat. */
static int cap_kstat_update(kstat_t *, int);
    225 
    226 /*
    227  * Initialize CPU caps infrastructure.
    228  *   - Initialize lists of capped zones and capped projects
    229  *   - Set cpucaps_clock_callout to NULL
    230  */
    231 void
    232 cpucaps_init()
    233 {
    234 	/*
    235 	 * Initialize global variables
    236 	 */
    237 	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);
    238 
    239 	list_create(&capped_zones, sizeof (cpucap_t),
    240 	    offsetof(cpucap_t, cap_link));
    241 	list_create(&capped_projects, sizeof (cpucap_t),
    242 	    offsetof(cpucap_t, cap_link));
    243 
    244 	cpucaps_enabled = B_FALSE;
    245 	cpucaps_busy = B_FALSE;
    246 	cpucaps_clock_callout = NULL;
    247 }
    248 
    249 /*
    250  * Initialize scheduling-class specific CPU Caps data.
    251  */
    252 void
    253 cpucaps_sc_init(caps_sc_t *csc)
    254 {
    255 	csc->csc_cputime = 0;
    256 }
    257 
    258 /*
    259  * Allocate and initialize cpucap structure
    260  */
    261 static cpucap_t *
    262 cap_alloc(void)
    263 {
    264 	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);
    265 
    266 	DISP_LOCK_INIT(&cap->cap_usagelock);
    267 	waitq_init(&cap->cap_waitq);
    268 
    269 	return (cap);
    270 }
    271 
    272 /*
    273  * Free cpucap structure
    274  */
    275 static void
    276 cap_free(cpucap_t *cap)
    277 {
    278 	if (cap == NULL)
    279 		return;
    280 
    281 	/*
    282 	 * This cap should not be active
    283 	 */
    284 	ASSERT(!list_link_active(&cap->cap_link));
    285 	ASSERT(cap->cap_value == 0);
    286 	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));
    287 
    288 	waitq_fini(&cap->cap_waitq);
    289 	DISP_LOCK_DESTROY(&cap->cap_usagelock);
    290 
    291 	kmem_free(cap, sizeof (cpucap_t));
    292 }
    293 
    294 /*
    295  * Activate cap - insert into active list and unblock its
    296  * wait queue. Should be called with caps_lock held.
    297  * The cap_value field is set to the value supplied.
    298  */
    299 static void
    300 cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
    301 {
    302 	ASSERT(MUTEX_HELD(&caps_lock));
    303 
    304 	/*
    305 	 * Cap can not be already enabled
    306 	 */
    307 	ASSERT(!CAP_ENABLED(cap));
    308 	ASSERT(!list_link_active(&cap->cap_link));
    309 
    310 	list_insert_tail(l, cap);
    311 	cap->cap_below = cap->cap_above = 0;
    312 	cap->cap_maxusage = 0;
    313 	cap->cap_usage = 0;
    314 	cap->cap_value = value;
    315 	waitq_unblock(&cap->cap_waitq);
    316 	if (CPUCAPS_OFF()) {
    317 		cpucaps_enabled = B_TRUE;
    318 		cpucaps_clock_callout = caps_update;
    319 	}
    320 }
    321 
    322 /*
    323  * Deactivate cap
    324  *   - Block its wait queue. This prevents any new threads from being
    325  *	enqueued there and moves all enqueued threads to the run queue.
    326  *   - Remove cap from list l.
    327  *   - Disable CPU caps globally if there are no capped projects or zones
    328  *
    329  * Should be called with caps_lock held.
    330  */
    331 static void
    332 cap_disable(list_t *l, cpucap_t *cap)
    333 {
    334 	ASSERT(MUTEX_HELD(&caps_lock));
    335 	/*
    336 	 * Cap should be currently active
    337 	 */
    338 	ASSERT(CPUCAPS_ON());
    339 	ASSERT(list_link_active(&cap->cap_link));
    340 	ASSERT(CAP_ENABLED(cap));
    341 
    342 	waitq_block(&cap->cap_waitq);
    343 	list_remove(l, cap);
    344 	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
    345 		cpucaps_enabled = B_FALSE;
    346 		cpucaps_clock_callout = NULL;
    347 	}
    348 	cap->cap_value = 0;
    349 	cap->cap_project = NULL;
    350 	cap->cap_zone = NULL;
    351 	if (cap->cap_kstat != NULL) {
    352 		kstat_delete(cap->cap_kstat);
    353 		cap->cap_kstat = NULL;
    354 	}
    355 
    356 }
    357 
    358 /*
    359  * Enable cap for a project kpj
    360  * It is safe to enable already enabled project cap.
    361  * Should be called with caps_lock held.
    362  */
    363 static void
    364 cap_project_enable(kproject_t *kpj, hrtime_t value)
    365 {
    366 	cpucap_t *cap = kpj->kpj_cpucap;
    367 
    368 	ASSERT(MUTEX_HELD(&caps_lock));
    369 	ASSERT(cap != NULL);
    370 
    371 	if (CAP_DISABLED(cap)) {
    372 		ASSERT(cap->cap_kstat == NULL);
    373 		cap_enable(&capped_projects, cap, value);
    374 		cap->cap_project = kpj;
    375 		cap->cap_zone = kpj->kpj_zone;
    376 
    377 		/*
    378 		 * Create cap kstats
    379 		 */
    380 		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
    381 		    KSTAT_TYPE_NAMED,
    382 		    sizeof (cap_kstat) / sizeof (kstat_named_t),
    383 		    KSTAT_FLAG_VIRTUAL)) != NULL) {
    384 			cap->cap_kstat->ks_data_size +=
    385 			    strlen(cap->cap_zone->zone_name) + 1;
    386 			cap->cap_kstat->ks_lock = &cap_kstat_lock;
    387 			cap->cap_kstat->ks_data = &cap_kstat;
    388 			cap->cap_kstat->ks_update = cap_kstat_update;
    389 			cap->cap_kstat->ks_private = cap;
    390 			kstat_install(cap->cap_kstat);
    391 		}
    392 	}
    393 }
    394 
    395 /*
    396  * Disable project cap.
    397  * It is safe to disable already disabled project cap.
    398  * Should be called with caps_lock held.
    399  */
    400 static void
    401 cap_project_disable(kproject_t *kpj)
    402 {
    403 	cpucap_t *cap = kpj->kpj_cpucap;
    404 
    405 	ASSERT(MUTEX_HELD(&caps_lock));
    406 	ASSERT(cap != NULL);
    407 	ASSERT(cap->cap_project == kpj);
    408 
    409 	if (CAP_ENABLED(cap))
    410 		cap_disable(&capped_projects, cap);
    411 }
    412 
    413 /*
    414  * Enable cap for a zone
    415  * It is safe to enable already enabled zone cap.
    416  * Should be called with caps_lock held.
    417  */
    418 static void
    419 cap_zone_enable(zone_t *zone, hrtime_t value)
    420 {
    421 	cpucap_t *cap = zone->zone_cpucap;
    422 
    423 	ASSERT(MUTEX_HELD(&caps_lock));
    424 	ASSERT(cap != NULL);
    425 
    426 	if (CAP_DISABLED(cap)) {
    427 		ASSERT(cap->cap_kstat == NULL);
    428 		cap_enable(&capped_zones, cap, value);
    429 		cap->cap_zone = zone;
    430 
    431 		/*
    432 		 * Create cap kstats
    433 		 */
    434 		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
    435 		    KSTAT_TYPE_NAMED,
    436 		    sizeof (cap_kstat) / sizeof (kstat_named_t),
    437 		    KSTAT_FLAG_VIRTUAL)) != NULL) {
    438 			cap->cap_kstat->ks_data_size +=
    439 			    strlen(cap->cap_zone->zone_name) + 1;
    440 			cap->cap_kstat->ks_lock = &cap_kstat_lock;
    441 			cap->cap_kstat->ks_data = &cap_kstat;
    442 			cap->cap_kstat->ks_update = cap_kstat_update;
    443 			cap->cap_kstat->ks_private = cap;
    444 			kstat_install(cap->cap_kstat);
    445 		}
    446 	}
    447 }
    448 
    449 /*
    450  * Disable zone cap.
    451  * It is safe to disable already disabled zone cap.
    452  * Should be called with caps_lock held.
    453  */
    454 static void
    455 cap_zone_disable(zone_t *zone)
    456 {
    457 	cpucap_t *cap = zone->zone_cpucap;
    458 
    459 	ASSERT(MUTEX_HELD(&caps_lock));
    460 	ASSERT(cap != NULL);
    461 	ASSERT(cap->cap_zone == zone);
    462 
    463 	if (CAP_ENABLED(cap))
    464 		cap_disable(&capped_zones, cap);
    465 }
    466 
    467 /*
    468  * Apply specified callback to all caps contained in the list `l'.
    469  */
    470 static void
    471 cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
    472 {
    473 	static uint64_t cpucap_walk_gen;
    474 	cpucap_t *cap;
    475 
    476 	ASSERT(MUTEX_HELD(&caps_lock));
    477 
    478 	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
    479 		(*cb)(cap, cpucap_walk_gen);
    480 	}
    481 
    482 	atomic_inc_64(&cpucap_walk_gen);
    483 }
    484 
    485 /*
    486  * If cap limit is not reached, make one thread from wait queue runnable.
    487  * The waitq_isempty check is performed without the waitq lock. If a new thread
    488  * is placed on the waitq right after the check, it will be picked up during the
    489  * next invocation of cap_poke_waitq().
    490  */
    491 /* ARGSUSED */
    492 static void
    493 cap_poke_waitq(cpucap_t *cap, int64_t gen)
    494 {
    495 	ASSERT(MUTEX_HELD(&caps_lock));
    496 
    497 	if (cap->cap_usage >= cap->cap_value) {
    498 		cap->cap_above++;
    499 	} else {
    500 		waitq_t *wq = &cap->cap_waitq;
    501 
    502 		cap->cap_below++;
    503 
    504 		if (!waitq_isempty(wq))
    505 			waitq_runone(wq);
    506 	}
    507 }
    508 
    509 /*
    510  * The callback function called for every cap on capped_projects list.
    511  * Decay cap usage by CAP_DECAY_FACTOR
    512  * Add this cap project usage to its zone usage.
    513  * Kick off a thread from the cap waitq if cap is not reached.
    514  */
    515 static void
    516 cap_project_usage_walker(cpucap_t *cap, int64_t gen)
    517 {
    518 	zone_t		*zone = cap->cap_zone;
    519 	hrtime_t	cap_usage = cap->cap_usage;
    520 
    521 	ASSERT(MUTEX_HELD(&caps_lock));
    522 	ASSERT(cap->cap_project->kpj_cpucap == cap);
    523 	ASSERT(zone == cap->cap_project->kpj_zone);
    524 	ASSERT(CAP_ENABLED(cap));
    525 
    526 	/*
    527 	 * Set or clear the CAP_REACHED flag based on the current usage.
    528 	 * Only projects having their own caps are ever marked as CAP_REACHED.
    529 	 */
    530 	cap_poke_waitq(cap, 0);
    531 
    532 	/*
    533 	 * Add project's CPU usage to our zone's CPU usage.
    534 	 */
    535 	if (ZONE_IS_CAPPED(zone)) {
    536 		cpucap_t *zcap = zone->zone_cpucap;
    537 
    538 		ASSERT(zcap->cap_zone == zone);
    539 
    540 		/*
    541 		 * If we haven't reset this zone's usage during this clock tick
    542 		 * yet, then do it now. The cap_gen field is used to check
    543 		 * whether this is the first zone's project we see during this
    544 		 * tick or a subsequent one.
    545 		 */
    546 		if (zcap->cap_gen != gen) {
    547 			if (zcap->cap_usage > zcap->cap_maxusage)
    548 				zcap->cap_maxusage = zcap->cap_usage;
    549 			zcap->cap_usage = 0;
    550 			zcap->cap_gen = gen;
    551 		}
    552 		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
    553 		    hrtime_t, cap_usage);
    554 		zcap->cap_usage += cap_usage;
    555 		/* Check for overflows */
    556 		if (zcap->cap_usage < 0)
    557 			zcap->cap_usage = MAX_USAGE - 1;
    558 	}
    559 
    560 	/*
    561 	 * Decay project usage.
    562 	 */
    563 	disp_lock_enter(&cap->cap_usagelock);
    564 	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
    565 	disp_lock_exit(&cap->cap_usagelock);
    566 }
    567 
    568 /*
    569  * On every clock tick walk the list of project caps and update the CPU usage.
    570  * Also walk the list of zone caps checking whether any threads should
    571  * transition from wait queue to run queue.
    572  *
    573  * This function gets called by the clock thread directly when there are any
    574  * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
    575  * caps_lock for long periods of time, so there should be almost no contention
    576  * for it.
    577  */
    578 static void
    579 caps_update()
    580 {
    581 	mutex_enter(&caps_lock);
    582 	cap_walk(&capped_projects, cap_project_usage_walker);
    583 	cap_walk(&capped_zones, cap_poke_waitq);
    584 	mutex_exit(&caps_lock);
    585 }
    586 
    587 /*
    588  * The function is called for each project in a zone when the zone cap is
    589  * modified. It enables project caps if zone cap is enabled and disables if the
    590  * zone cap is disabled and project doesn't have its own cap.
    591  *
    592  * For each project that does not have cpucap structure allocated it allocates a
    593  * new structure and assigns to kpj->cpu_cap. The allocation is performed
    594  * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock
    595  * held.
    596  */
    597 static int
    598 cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
    599 {
    600 	cpucap_t *project_cap = NULL;
    601 	cpucap_t *zone_cap = (cpucap_t *)arg;
    602 
    603 	ASSERT(zone_cap != NULL);
    604 
    605 	if (kpj->kpj_cpucap == NULL) {
    606 		/*
    607 		 * This is the first time any cap was established for this
    608 		 * project. Allocate a new cpucap structure for it.
    609 		 */
    610 		project_cap = cap_alloc();
    611 	}
    612 
    613 	mutex_enter(&caps_lock);
    614 
    615 	/*
    616 	 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
    617 	 * and assign the newly allocated cpucap structure to it.
    618 	 */
    619 	if (kpj->kpj_cpucap == NULL) {
    620 		kpj->kpj_cpucap = project_cap;
    621 	} else if (project_cap != NULL) {
    622 		cap_free(project_cap);
    623 	}
    624 
    625 	project_cap = kpj->kpj_cpucap;
    626 
    627 	if (CAP_DISABLED(zone_cap)) {
    628 		/*
    629 		 * Remove all projects in this zone without caps
    630 		 * from the capped_projects list.
    631 		 */
    632 		if (project_cap->cap_value == MAX_USAGE) {
    633 			cap_project_disable(kpj);
    634 		}
    635 	} else if (CAP_DISABLED(project_cap)) {
    636 		/*
    637 		 * Add the project to capped_projects list.
    638 		 */
    639 		ASSERT(project_cap->cap_value == 0);
    640 		cap_project_enable(kpj, MAX_USAGE);
    641 	}
    642 	mutex_exit(&caps_lock);
    643 
    644 	return (0);
    645 }
    646 
    647 /*
    648  * Set zone cap to cap_val
    649  * If cap_val is equal to NOCAP, disable zone cap.
    650  *
    651  * If this is the first time a cap is set on a zone, allocate cpucap structure
    652  * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
    653  */
    654 int
    655 cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
    656 {
    657 	cpucap_t *cap = NULL;
    658 	hrtime_t value;
    659 
    660 	if (cap_val == 0)
    661 		return (EINVAL);
    662 
    663 	ASSERT(cap_val <= MAXCAP);
    664 	if (cap_val > MAXCAP)
    665 		cap_val = MAXCAP;
    666 
    667 	/*
    668 	 * Nothing to do if trying to disable a cap on a zone when caps are off
    669 	 * or a zone which does not have a cap yet.
    670 	 */
    671 	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
    672 		return (0);
    673 
    674 	if (zone->zone_cpucap == NULL)
    675 		cap = cap_alloc();
    676 
    677 	mutex_enter(&caps_lock);
    678 
    679 	if (cpucaps_busy) {
    680 		mutex_exit(&caps_lock);
    681 		return (EBUSY);
    682 	}
    683 
    684 	/*
    685 	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
    686 	 * held. If it is still NULL, assign a newly allocated cpucap to it.
    687 	 */
    688 	if (zone->zone_cpucap == NULL) {
    689 		zone->zone_cpucap = cap;
    690 	} else if (cap != NULL) {
    691 		cap_free(cap);
    692 	}
    693 
    694 	cap = zone->zone_cpucap;
    695 	value = cap_val * cap_tick_cost;
    696 	if (value < 0)
    697 		value = MAX_USAGE;
    698 
    699 	/* Nothing to do if the value is staying the same */
    700 	if (value == cap->cap_value) {
    701 		mutex_exit(&caps_lock);
    702 		return (0);
    703 	}
    704 
    705 	/*
    706 	 * Clear cap statistics since the cap value itself changes.
    707 	 */
    708 	cap->cap_above = cap->cap_below = 0;
    709 
    710 
    711 	if (cap_val == NOCAP) {
    712 		if (CAP_ENABLED(cap)) {
    713 			/*
    714 			 * Remove cap for the zone
    715 			 */
    716 			cap_zone_disable(zone);
    717 			cpucaps_busy = B_TRUE;
    718 			mutex_exit(&caps_lock);
    719 			/*
    720 			 * Disable caps for all project belonging to this zone
    721 			 * unless they have their own cap.
    722 			 */
    723 			(void) project_walk_all(zone->zone_id,
    724 			    cap_project_zone_modify_walker, cap);
    725 
    726 			mutex_enter(&caps_lock);
    727 			cpucaps_busy = B_FALSE;
    728 		}
    729 	} else if (CAP_DISABLED(cap)) {
    730 		/*
    731 		 * Set a cap on a zone which previously was not capped.
    732 		 */
    733 		cap_zone_enable(zone, value);
    734 		cpucaps_busy = B_TRUE;
    735 		mutex_exit(&caps_lock);
    736 
    737 		/*
    738 		 * Enable cap for all projects belonging to this zone.
    739 		 */
    740 		(void) project_walk_all(zone->zone_id,
    741 		    cap_project_zone_modify_walker, cap);
    742 
    743 		mutex_enter(&caps_lock);
    744 		cpucaps_busy = B_FALSE;
    745 	} else {
    746 		/*
    747 		 * No state transitions, just change the value
    748 		 */
    749 		cap->cap_value = value;
    750 	}
    751 
    752 	ASSERT(MUTEX_HELD(&caps_lock));
    753 	ASSERT(!cpucaps_busy);
    754 	mutex_exit(&caps_lock);
    755 
    756 	return (0);
    757 }
    758 
    759 /*
    760  * The project is going away so disable its cap.
    761  */
    762 void
    763 cpucaps_project_remove(kproject_t *kpj)
    764 {
    765 	mutex_enter(&caps_lock);
    766 	if (PROJECT_IS_CAPPED(kpj))
    767 		cap_project_disable(kpj);
    768 	if (kpj->kpj_cpucap != NULL) {
    769 		cap_free(kpj->kpj_cpucap);
    770 		kpj->kpj_cpucap = NULL;
    771 	}
    772 	mutex_exit(&caps_lock);
    773 }
    774 
    775 /*
    776  * The zone is going away, so disable its cap.
    777  */
    778 void
    779 cpucaps_zone_remove(zone_t *zone)
    780 {
    781 	mutex_enter(&caps_lock);
    782 	while (ZONE_IS_CAPPED(zone)) {
    783 		mutex_exit(&caps_lock);
    784 		(void) cpucaps_zone_set(zone, NOCAP);
    785 		mutex_enter(&caps_lock);
    786 	}
    787 	if (zone->zone_cpucap != NULL) {
    788 		cap_free(zone->zone_cpucap);
    789 		zone->zone_cpucap = NULL;
    790 	}
    791 	mutex_exit(&caps_lock);
    792 }
    793 
    794 /*
    795  * New project was created. It should be put on the capped_projects list if
    796  * its zone has a cap.
    797  */
    798 void
    799 cpucaps_project_add(kproject_t *kpj)
    800 {
    801 	cpucap_t *cap = NULL;
    802 
    803 	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
    804 		return;
    805 
    806 	/*
    807 	 * This project was never capped before, so allocate its cap structure.
    808 	 */
    809 	if (kpj->kpj_cpucap == NULL)
    810 		cap = cap_alloc();
    811 
    812 	mutex_enter(&caps_lock);
    813 	/*
    814 	 * Double-check with caps_lock held
    815 	 */
    816 	if (kpj->kpj_cpucap == NULL) {
    817 		kpj->kpj_cpucap = cap;
    818 	} else if (cap != NULL) {
    819 		cap_free(cap);
    820 	}
    821 
    822 	if (ZONE_IS_CAPPED(kpj->kpj_zone))
    823 		cap_project_enable(kpj, MAX_USAGE);
    824 
    825 	mutex_exit(&caps_lock);
    826 }
    827 
    828 /*
    829  * Set project cap to cap_val
    830  * If cap_val is equal to NOCAP, disable project cap.
    831  *
    832  * If this is the first time a cap is set on a project, allocate cpucap
    833  * structure without holding caps_lock to avoid KM_SLEEP allocation with
    834  * caps_lock held.
    835  */
    836 int
    837 cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
    838 {
    839 	cpucap_t *cap = NULL;
    840 	hrtime_t value;
    841 
    842 	if (cap_val == 0)
    843 		return (EINVAL);
    844 
    845 	ASSERT(cap_val <= MAXCAP);
    846 	if (cap_val > MAXCAP)
    847 		cap_val = MAXCAP;
    848 
    849 	/*
    850 	 * Nothing to do if trying to disable project cap and caps are not
    851 	 * enabled or if trying to disable cap on a project that does not have
    852 	 * cap enabled.
    853 	 */
    854 	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
    855 		return (0);
    856 
    857 	if (kpj->kpj_cpucap == NULL) {
    858 		/*
    859 		 * This project was never capped before, so allocate its cap
    860 		 * structure.
    861 		 */
    862 		cap = cap_alloc();
    863 	}
    864 
    865 	mutex_enter(&caps_lock);
    866 
    867 	/*
    868 	 * Double-check with caps_lock held.
    869 	 */
    870 	if (kpj->kpj_cpucap == NULL) {
    871 		kpj->kpj_cpucap = cap;
    872 	} else if (cap != NULL) {
    873 		cap_free(cap);
    874 	}
    875 
    876 	/*
    877 	 * Get the actual pointer to the project cap.
    878 	 */
    879 	cap = kpj->kpj_cpucap;
    880 	value = cap_val * cap_tick_cost;
    881 	if (value < 0)
    882 		value = MAX_USAGE;
    883 
    884 	/*
    885 	 * Nothing to do if the value is not changing
    886 	 */
    887 	if (value == cap->cap_value) {
    888 		mutex_exit(&caps_lock);
    889 		return (0);
    890 	}
    891 
    892 	/*
    893 	 * Clear cap statistics since the cap value itself changes.
    894 	 */
    895 	cap->cap_above = cap->cap_below = 0;
    896 	cap->cap_maxusage = 0;
    897 
    898 	if (cap_val != NOCAP) {
    899 		/*
    900 		 * Enable this cap if it is not already enabled.
    901 		 */
    902 		if (CAP_DISABLED(cap))
    903 			cap_project_enable(kpj, value);
    904 		else
    905 			cap->cap_value = value;
    906 	} else if (CAP_ENABLED(cap)) {
    907 		/*
    908 		 * User requested to drop a cap on the project. If it is part of
    909 		 * capped zone, keep the cap and set the value to MAX_USAGE,
    910 		 * otherwise disable the cap.
    911 		 */
    912 		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
    913 			cap->cap_value = MAX_USAGE;
    914 		} else {
    915 			cap_project_disable(kpj);
    916 		}
    917 	}
    918 	mutex_exit(&caps_lock);
    919 
    920 	return (0);
    921 }
    922 
    923 /*
    924  * Get cap usage.
    925  */
    926 static rctl_qty_t
    927 cap_get(cpucap_t *cap)
    928 {
    929 	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
    930 }
    931 
    932 /*
    933  * Get current project usage.
    934  */
    935 rctl_qty_t
    936 cpucaps_project_get(kproject_t *kpj)
    937 {
    938 	return (cap_get(kpj->kpj_cpucap));
    939 }
    940 
    941 /*
    942  * Get current zone usage.
    943  */
    944 rctl_qty_t
    945 cpucaps_zone_get(zone_t *zone)
    946 {
    947 	return (cap_get(zone->zone_cpucap));
    948 }
    949 
    950 /*
    951  * Charge project of thread t the time thread t spent on CPU since previously
    952  * adjusted.
    953  *
    954  * Record the current on-CPU time in the csc structure.
    955  *
    956  * Do not adjust for more than one tick worth of time.
    957  *
    958  * It is possible that the project cap is being disabled while this routine is
    959  * executed. This should not cause any issues since the association between the
    960  * thread and its project is protected by thread lock.
    961  */
    962 static void
    963 caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
    964 {
    965 	kproject_t	*kpj = ttoproj(t);
    966 	hrtime_t	new_usage;
    967 	hrtime_t	usage_delta;
    968 
    969 	ASSERT(THREAD_LOCK_HELD(t));
    970 	ASSERT(kpj->kpj_cpucap != NULL);
    971 
    972 	/* Get on-CPU time since birth of a thread */
    973 	new_usage = mstate_thread_onproc_time(t);
    974 
    975 	/* Time spent on CPU since last checked */
    976 	usage_delta = new_usage - csc->csc_cputime;
    977 
    978 	/* Save the accumulated on-CPU time */
    979 	csc->csc_cputime = new_usage;
    980 
    981 	/* Charge at most one tick worth of on-CPU time */
    982 	if (usage_delta > cap_tick_cost)
    983 		usage_delta = cap_tick_cost;
    984 
    985 	/* Add usage_delta to the project usage value. */
    986 	if (usage_delta > 0) {
    987 		cpucap_t *cap = kpj->kpj_cpucap;
    988 
    989 		DTRACE_PROBE2(cpucaps__project__charge,
    990 		    kthread_id_t, t, hrtime_t, usage_delta);
    991 
    992 		disp_lock_enter_high(&cap->cap_usagelock);
    993 		cap->cap_usage += usage_delta;
    994 
    995 		/* Check for overflows */
    996 		if (cap->cap_usage < 0)
    997 			cap->cap_usage = MAX_USAGE - 1;
    998 
    999 		disp_lock_exit_high(&cap->cap_usagelock);
   1000 
   1001 		/*
   1002 		 * cap_maxusage is only kept for observability. Move it outside
   1003 		 * the lock to reduce the time spent while holding the lock.
   1004 		 */
   1005 		if (cap->cap_usage > cap->cap_maxusage)
   1006 			cap->cap_maxusage = cap->cap_usage;
   1007 	}
   1008 }
   1009 
   1010 /*
   1011  * Charge thread's project and return True if project or zone should be
   1012  * penalized because its project or zone is exceeding its cap. Also sets
   1013  * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
   1014  *
   1015  * It is possible that the project cap is being disabled while this routine is
   1016  * executed. This should not cause any issues since the association between the
   1017  * thread and its project is protected by thread lock. It will still set
   1018  * TS_PROJECTWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place
   1019  * anything on the blocked wait queue.
   1020  *
   1021  */
   1022 boolean_t
   1023 cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
   1024 {
   1025 	kproject_t	*kpj = ttoproj(t);
   1026 	klwp_t		*lwp = t->t_lwp;
   1027 	zone_t		*zone;
   1028 	cpucap_t	*project_cap;
   1029 	boolean_t	rc = B_FALSE;
   1030 
   1031 	ASSERT(THREAD_LOCK_HELD(t));
   1032 
   1033 	/* Nothing to do for projects that are not capped. */
   1034 	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
   1035 		return (B_FALSE);
   1036 
   1037 	caps_charge_adjust(t, csc);
   1038 
   1039 	/*
   1040 	 * The caller only requested to charge the project usage, no enforcement
   1041 	 * part.
   1042 	 */
   1043 	if (charge_type == CPUCAPS_CHARGE_ONLY)
   1044 		return (B_FALSE);
   1045 
   1046 	project_cap = kpj->kpj_cpucap;
   1047 
   1048 	if (project_cap->cap_usage >= project_cap->cap_value) {
   1049 		t->t_schedflag |= TS_PROJWAITQ;
   1050 		rc = B_TRUE;
   1051 	} else if (t->t_schedflag & TS_PROJWAITQ) {
   1052 		t->t_schedflag &= ~TS_PROJWAITQ;
   1053 	}
   1054 
   1055 	zone = ttozone(t);
   1056 	if (!ZONE_IS_CAPPED(zone)) {
   1057 		if (t->t_schedflag & TS_ZONEWAITQ)
   1058 			t->t_schedflag &= ~TS_ZONEWAITQ;
   1059 	} else {
   1060 		cpucap_t *zone_cap = zone->zone_cpucap;
   1061 
   1062 		if (zone_cap->cap_usage >= zone_cap->cap_value) {
   1063 			t->t_schedflag |= TS_ZONEWAITQ;
   1064 			rc = B_TRUE;
   1065 		} else if (t->t_schedflag & TS_ZONEWAITQ) {
   1066 			t->t_schedflag &= ~TS_ZONEWAITQ;
   1067 		}
   1068 	}
   1069 
   1070 
   1071 	return (rc);
   1072 }
   1073 
   1074 /*
   1075  * Enforce CPU caps. If got preempted in the user-land, we know that thread does
   1076  * not hold any kernel locks, so enqueue ourselves on the waitq, if needed.
   1077  *
   1078  * CPU Caps are only enforced for user threads.
   1079  *
   1080  * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and
   1081  * threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
   1082  *
   1083  * It is possible that by the time we enter cpucaps_enforce() the cap is already
   1084  * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We
   1085  * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer
   1086  * apply.
   1087  */
   1088 boolean_t
   1089 cpucaps_enforce(kthread_t *t)
   1090 {
   1091 	klwp_t *lwp = t->t_lwp;
   1092 
   1093 	ASSERT(THREAD_LOCK_HELD(t));
   1094 
   1095 	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
   1096 		if (t->t_schedflag & TS_PROJWAITQ) {
   1097 			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
   1098 			t->t_schedflag &= ~TS_ANYWAITQ;
   1099 			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
   1100 			    t)) {
   1101 				return (B_TRUE);
   1102 			}
   1103 		}
   1104 		if (t->t_schedflag & TS_ZONEWAITQ) {
   1105 			ASSERT(ttozone(t)->zone_cpucap != NULL);
   1106 			t->t_schedflag &= ~TS_ZONEWAITQ;
   1107 			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
   1108 			    t)) {
   1109 				return (B_TRUE);
   1110 			}
   1111 		}
   1112 	}
   1113 
   1114 	/*
   1115 	 * The thread is not enqueued on the wait queue.
   1116 	 */
   1117 	return (B_FALSE);
   1118 }
   1119 
   1120 /*
   1121  * Convert internal cap statistics into values exported by cap kstat.
   1122  */
   1123 static int
   1124 cap_kstat_update(kstat_t *ksp, int rw)
   1125 {
   1126 	struct cap_kstat *capsp = &cap_kstat;
   1127 	cpucap_t *cap = ksp->ks_private;
   1128 	clock_t	tick_sec = SEC_TO_TICK(1);
   1129 	char *zonename = cap->cap_zone->zone_name;
   1130 
   1131 	if (rw == KSTAT_WRITE)
   1132 		return (EACCES);
   1133 
   1134 	capsp->cap_value.value.ui64 =
   1135 	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
   1136 	capsp->cap_usage.value.ui64 =
   1137 	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
   1138 	capsp->cap_maxusage.value.ui64 =
   1139 	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
   1140 	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
   1141 	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
   1142 	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
   1143 	kstat_named_setstr(&capsp->cap_zonename, zonename);
   1144 
   1145 	return (0);
   1146 }
   1147