Home | History | Annotate | Download | only in disp
      1   3792   akolb /*
      2   3792   akolb  * CDDL HEADER START
      3   3792   akolb  *
      4   3792   akolb  * The contents of this file are subject to the terms of the
      5   3792   akolb  * Common Development and Distribution License (the "License").
      6   3792   akolb  * You may not use this file except in compliance with the License.
      7   3792   akolb  *
      8   3792   akolb  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9   3792   akolb  * or http://www.opensolaris.org/os/licensing.
     10   3792   akolb  * See the License for the specific language governing permissions
     11   3792   akolb  * and limitations under the License.
     12   3792   akolb  *
     13   3792   akolb  * When distributing Covered Code, include this CDDL HEADER in each
     14   3792   akolb  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15   3792   akolb  * If applicable, add the following below this CDDL HEADER, with the
     16   3792   akolb  * fields enclosed by brackets "[]" replaced with your own identifying
     17   3792   akolb  * information: Portions Copyright [yyyy] [name of copyright owner]
     18   3792   akolb  *
     19   3792   akolb  * CDDL HEADER END
     20   3792   akolb  */
     21   3792   akolb 
     22   3792   akolb /*
     23  11066  rafael  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24   3792   akolb  * Use is subject to license terms.
     25   3792   akolb  */
     26   3792   akolb 
     27   3792   akolb #include <sys/disp.h>
     28   3792   akolb #include <sys/param.h>
     29   3792   akolb #include <sys/systm.h>
     30   3792   akolb #include <sys/sysmacros.h>
     31   3792   akolb #include <sys/atomic.h>
     32   3792   akolb #include <sys/cpucaps_impl.h>
     33   3792   akolb #include <sys/dtrace.h>
     34   3792   akolb #include <sys/sdt.h>
     35   3792   akolb #include <sys/debug.h>
     36   3792   akolb #include <sys/rctl.h>
     37   3792   akolb #include <sys/errno.h>
     38   3792   akolb 
     39   3792   akolb /*
     40   3792   akolb  * CPU Caps implementation
     41   3792   akolb  * =======================
     42   3792   akolb  *
     43   3792   akolb  * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
     44   3792   akolb  * usage for all projects running inside the zone. If the zone CPU cap is set
     45   3792   akolb  * below the project CPU cap, the latter will have no effect.
     46   3792   akolb  *
     47   3792   akolb  * When CPU usage of projects and/or zones reaches specified caps, threads in
     48   3792   akolb  * them do not get scheduled and instead are placed on wait queues associated
     49   3792   akolb  * with a cap. Such threads will start running again only when CPU usage drops
     50   3792   akolb  * below the cap level. Each zone and each project has its own wait queue.
     51   3792   akolb  *
     52   3792   akolb  * When CPU cap is set, the kernel continuously keeps track of CPU time used by
     53   3792   akolb  * capped zones and/or projects over a short time interval and calculates their
     54   3792   akolb  * current CPU usage as a percentage. When the accumulated usage reaches the CPU
     55   3792   akolb  * cap, LWPs running in the user-land (when they are not holding any critical
     56   3792   akolb  * kernel locks) are placed on special wait queues until their project's or
     57   3792   akolb  * zone's CPU usage drops below the cap.
     58   3792   akolb  *
     59   3792   akolb  * The system maintains a list of all capped projects and all capped zones. On
     60   3792   akolb  * every clock tick every active thread belonging to a capped project adds its
     61   3792   akolb  * CPU usage to its project. Usage from all projects belonging to a capped zone
     62   3792   akolb  * is aggregated to get the zone usage.
     63   3792   akolb  *
     64   3792   akolb  * When the current CPU usage is above the cap, a project or zone is considered
     65   3792   akolb  * over-capped. Every user thread caught running in an over-capped project or
     66   3792   akolb  * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and
     67   3792   akolb  * is requested to surrender its CPU. This causes scheduling class specific
     68   3792   akolb  * CL_PREEMPT() callback to be invoked. The callback function places threads
     69   3792   akolb  * marked as TS_PROJWAITQ on a wait queue and calls switch().
     70   3792   akolb  *
     71   3792   akolb  * Threads are only placed on wait queues after trapping from user-land
     72   3792   akolb  * (they could be holding some user locks, but no kernel locks) and while
     73   3792   akolb  * returning from the trap back to the user-land when no kernel locks are held.
     74   3792   akolb  * Putting threads on wait queues in random places while running in the
     75   3792   akolb  * kernel might lead to all kinds of locking problems.
     76   3792   akolb  *
     77   3792   akolb  * Accounting
     78   3792   akolb  * ==========
     79   3792   akolb  *
     80   3792   akolb  * Accounting of CPU usage is based on per-thread micro-state accounting data.
     81   3792   akolb  * On every clock tick clock() adds new on-CPU time for every thread found on
     82   3792   akolb  * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
     83   3792   akolb  * New times means time since it was last accounted for. On-CPU times greater
     84   3792   akolb  * than 1 tick are truncated to 1 tick.
     85   3792   akolb  *
     86   3792   akolb  * Project CPU usage is aggregated from all threads within the project.
     87   3792   akolb  * Zone CPU usage is the sum of usages for all projects within the zone. Zone
     88   3792   akolb  * CPU usage is calculated on every clock tick by walking list of projects and
     89   3792   akolb  * adding their usage together.
     90   3792   akolb  *
     91   3792   akolb  * Decay
     92   3792   akolb  * =====
     93   3792   akolb  *
     94   3792   akolb  * CPU usage is decayed by the caps_update() routine which is called once per
     95   3792   akolb  * every clock tick. It walks lists of project caps and decays their usages by
     96   3792   akolb  * one per cent. If CPU usage drops below cap levels, threads on the wait queue
     97   3792   akolb  * are made runnable again, one thread per clock tick.
     98   3792   akolb  *
     99   3792   akolb  * Interfaces
    100   3792   akolb  * ==========
    101   3792   akolb  *
    102   3792   akolb  * The CPU Caps facility provides the following interfaces to the rest of the
    103   3792   akolb  * system:
    104   3792   akolb  *
    105   3792   akolb  *   cpucaps_project_add(kproject_t *)
    106   3792   akolb  *
    107   3792   akolb  * Notifies the framework of a new project. It should be put on the
    108   3792   akolb  * capped_projects list if its zone has a cap.
    109   3792   akolb  *
    110   3792   akolb  *   cpucaps_project_remove(kproject_t *)
    111   3792   akolb  *
    112   3792   akolb  * Remove the association between the specified project and its cap.
    113   3792   akolb  * Called right before the project is destroyed.
    114   3792   akolb  *
    115   3792   akolb  * cpucaps_project_set(kproject_t *, rctl_qty_t)
    116   3792   akolb  *
    117   3792   akolb  * Set project cap of the specified project to the specified value. Setting the
    118   3792   akolb  * value to NOCAP is equivalent to removing the cap.
    119   3792   akolb  *
    120   3792   akolb  *   cpucaps_zone_set(zone_t *, rctl_qty_t)
    121   3792   akolb  *
    122   3792   akolb  * Set zone cap of the specified zone to the specified value. Setting the value
    123   3792   akolb  * to NOCAP is equivalent to removing the cap.
    124   3792   akolb  *
    125   3792   akolb  *   cpucaps_zone_remove(zone_t *)
    126   3792   akolb  *
    127   3792   akolb  * Remove the association between the zone and its cap.
    128   3792   akolb  *
    129   3792   akolb  *   cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
    130   3792   akolb  *
    131   3792   akolb  * Charges specified thread's project the amount of on-CPU time that it used.
    132   3792   akolb  * If the third argument is CPUCAPS_CHARGE_ONLY returns False.
    133   3792   akolb  * Otherwise returns True if the thread should be penalized because its
    134   3792   akolb  * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ
    135   3792   akolb  * bits in t_schedflag in this case.
    136   3792   akolb  *
    137   3792   akolb  *   CPUCAPS_ENFORCE(kthread_id_t *)
    138   3792   akolb  *
    139   3792   akolb  * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
    140   3792   akolb  * state on project or zone wait queues, as requested by TS_PROJWAITQ or
    141   3792   akolb  * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
    142   3792   akolb  * wait queue or False otherwise.
    143   3792   akolb  *
    144   3792   akolb  *   cpucaps_sc_init(caps_sc_t *)
    145   3792   akolb  *
    146   3792   akolb  * Initializes the scheduling-class specific CPU Caps data for a thread.
    147   3792   akolb  *
    148   3792   akolb  * LOCKS
    149   3792   akolb  * =====
    150   3792   akolb  *
    151   3792   akolb  * All the individual caps structures and their lists are protected by a global
    152   3792   akolb  * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
    153   3792   akolb  * caps, so it is usually uncontended. We avoid all blocking memory allocations
    154   3792   akolb  * while holding caps_lock to prevent clock() from blocking.
    155   3792   akolb  *
    156   3792   akolb  * Thread state is protected by the thread lock. It protects the association
    157   3792   akolb  * between a thread and its project and, as a consequence, its zone. The
    158   3792   akolb  * association can not break while thread lock is held, so the project or zone
    159   3792   akolb  * cap are not going to disappear while thread lock is held.
    160   3792   akolb  *
    161   3792   akolb  * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is
    162   3792   akolb  * grabbed by scheduling classes already holding thread lock at high PIL and by
    163   3792   akolb  * clock thread performing usage decay. We should do as little work as possible
    164   3792   akolb  * while holding the lock since it may be very hot. All threads in the project
    165   3792   akolb  * contend for the same cache line doing cap usage updates.
    166   3792   akolb  */
    167   3792   akolb 
    168   3792   akolb /*
    169   3792   akolb  * caps_lock protects list of capped projects and zones, changes in the cap
    170   3792   akolb  * state and changes of the global cpucaps_enabled flag.
    171   3792   akolb  *
    172   3792   akolb  * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
    173   3792   akolb  * modified in parallel. This can be per-zone cap flag, but we don't keep any
    174   3792   akolb  * cap state for now.
    175   3792   akolb  */
    176   3792   akolb static kmutex_t caps_lock;		/* lock to protect: */
    177   3792   akolb static list_t capped_zones;		/* - list of zones with caps */
    178   3792   akolb static list_t capped_projects;		/* - list of projects with caps */
    179   3792   akolb boolean_t cpucaps_enabled;		/* - are there any caps defined? */
    180   3792   akolb boolean_t cpucaps_busy;			/* - is framework busy? */
    181   3792   akolb 
    182   3792   akolb /*
    183   3792   akolb  * The accounting is based on the number of nanoseconds threads spend running
    184   3792   akolb  * during a tick which is kept in the cap_tick_cost variable.
    185   3792   akolb  */
    186   3792   akolb static hrtime_t cap_tick_cost;
    187   3792   akolb 
    188   3792   akolb /*
    189   3792   akolb  * How much of the usage value is decayed every clock tick
    190   3792   akolb  * Decay one per cent of value per tick
    191   3792   akolb  */
    192   3792   akolb #define	CAP_DECAY_FACTOR 100
    193   3792   akolb 
    194   3792   akolb /*
    195   3792   akolb  * Scale the value and round it to the closest integer value
    196   3792   akolb  */
    197   3792   akolb #define	ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))
    198   3792   akolb 
    199   3792   akolb static void caps_update();
    200   3792   akolb 
    201   3792   akolb /*
    202   3792   akolb  * CAP kstats.
    203   3792   akolb  */
    204   3792   akolb struct cap_kstat {
    205   3792   akolb 	kstat_named_t	cap_value;
    206   3792   akolb 	kstat_named_t	cap_usage;
    207   3792   akolb 	kstat_named_t	cap_nwait;
    208   3792   akolb 	kstat_named_t	cap_below;
    209   3792   akolb 	kstat_named_t	cap_above;
    210   3792   akolb 	kstat_named_t	cap_maxusage;
    211   3792   akolb 	kstat_named_t	cap_zonename;
    212   3792   akolb } cap_kstat = {
    213   3792   akolb 	{ "value",	KSTAT_DATA_UINT64 },
    214   3792   akolb 	{ "usage",	KSTAT_DATA_UINT64 },
    215   3792   akolb 	{ "nwait",	KSTAT_DATA_UINT64 },
    216   3792   akolb 	{ "below_sec",	KSTAT_DATA_UINT64 },
    217   3792   akolb 	{ "above_sec",	KSTAT_DATA_UINT64 },
    218   3792   akolb 	{ "maxusage",	KSTAT_DATA_UINT64 },
    219   3792   akolb 	{ "zonename",	KSTAT_DATA_STRING },
    220   3792   akolb };
    221   3792   akolb 
    222   3792   akolb 
    223   3792   akolb static kmutex_t cap_kstat_lock;
    224   3792   akolb static int cap_kstat_update(kstat_t *, int);
    225   3792   akolb 
    226   3792   akolb /*
    227   3792   akolb  * Initialize CPU caps infrastructure.
    228   3792   akolb  *   - Initialize lists of capped zones and capped projects
    229   3792   akolb  *   - Set cpucaps_clock_callout to NULL
    230   3792   akolb  */
    231   3792   akolb void
    232   3792   akolb cpucaps_init()
    233   3792   akolb {
    234   3792   akolb 	/*
    235   3792   akolb 	 * Initialize global variables
    236   3792   akolb 	 */
    237   3792   akolb 	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);
    238   3792   akolb 
    239   3792   akolb 	list_create(&capped_zones, sizeof (cpucap_t),
    240   3792   akolb 	    offsetof(cpucap_t, cap_link));
    241   3792   akolb 	list_create(&capped_projects, sizeof (cpucap_t),
    242   3792   akolb 	    offsetof(cpucap_t, cap_link));
    243   3792   akolb 
    244   3792   akolb 	cpucaps_enabled = B_FALSE;
    245   3792   akolb 	cpucaps_busy = B_FALSE;
    246   3792   akolb 	cpucaps_clock_callout = NULL;
    247   3792   akolb }
    248   3792   akolb 
    249   3792   akolb /*
    250   3792   akolb  * Initialize scheduling-class specific CPU Caps data.
    251   3792   akolb  */
    252   3792   akolb void
    253   3792   akolb cpucaps_sc_init(caps_sc_t *csc)
    254   3792   akolb {
    255   3792   akolb 	csc->csc_cputime = 0;
    256   3792   akolb }
    257   3792   akolb 
    258   3792   akolb /*
    259   3792   akolb  * Allocate and initialize cpucap structure
    260   3792   akolb  */
    261   3792   akolb static cpucap_t *
    262   3792   akolb cap_alloc(void)
    263   3792   akolb {
    264   3792   akolb 	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);
    265   3792   akolb 
    266   3792   akolb 	DISP_LOCK_INIT(&cap->cap_usagelock);
    267   3792   akolb 	waitq_init(&cap->cap_waitq);
    268   3792   akolb 
    269   3792   akolb 	return (cap);
    270   3792   akolb }
    271   3792   akolb 
    272   3792   akolb /*
    273   3792   akolb  * Free cpucap structure
    274   3792   akolb  */
    275   3792   akolb static void
    276   3792   akolb cap_free(cpucap_t *cap)
    277   3792   akolb {
    278   3792   akolb 	if (cap == NULL)
    279   3792   akolb 		return;
    280   3792   akolb 
    281   3792   akolb 	/*
    282   3792   akolb 	 * This cap should not be active
    283   3792   akolb 	 */
    284   3792   akolb 	ASSERT(!list_link_active(&cap->cap_link));
    285   3792   akolb 	ASSERT(cap->cap_value == 0);
    286   3792   akolb 	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));
    287   3792   akolb 
    288   3792   akolb 	waitq_fini(&cap->cap_waitq);
    289   3792   akolb 	DISP_LOCK_DESTROY(&cap->cap_usagelock);
    290   3792   akolb 
    291   3792   akolb 	kmem_free(cap, sizeof (cpucap_t));
    292   3792   akolb }
    293   3792   akolb 
    294   3792   akolb /*
    295   3792   akolb  * Activate cap - insert into active list and unblock its
    296   3792   akolb  * wait queue. Should be called with caps_lock held.
    297   3792   akolb  * The cap_value field is set to the value supplied.
    298   3792   akolb  */
    299   3792   akolb static void
    300   3792   akolb cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
    301   3792   akolb {
    302   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    303   3792   akolb 
    304   3792   akolb 	/*
    305   3792   akolb 	 * Cap can not be already enabled
    306   3792   akolb 	 */
    307   3792   akolb 	ASSERT(!CAP_ENABLED(cap));
    308   3792   akolb 	ASSERT(!list_link_active(&cap->cap_link));
    309   3792   akolb 
    310   3792   akolb 	list_insert_tail(l, cap);
    311   3792   akolb 	cap->cap_below = cap->cap_above = 0;
    312   3792   akolb 	cap->cap_maxusage = 0;
    313   3792   akolb 	cap->cap_usage = 0;
    314   3792   akolb 	cap->cap_value = value;
    315   3792   akolb 	waitq_unblock(&cap->cap_waitq);
    316   3792   akolb 	if (CPUCAPS_OFF()) {
    317   3792   akolb 		cpucaps_enabled = B_TRUE;
    318   3792   akolb 		cpucaps_clock_callout = caps_update;
    319   3792   akolb 	}
    320   3792   akolb }
    321   3792   akolb 
    322   3792   akolb /*
    323   3792   akolb  * Deactivate cap
    324   3792   akolb  *   - Block its wait queue. This prevents any new threads from being
    325   3792   akolb  *	enqueued there and moves all enqueued threads to the run queue.
    326   3792   akolb  *   - Remove cap from list l.
    327   3792   akolb  *   - Disable CPU caps globally if there are no capped projects or zones
    328   3792   akolb  *
    329   3792   akolb  * Should be called with caps_lock held.
    330   3792   akolb  */
    331   3792   akolb static void
    332   3792   akolb cap_disable(list_t *l, cpucap_t *cap)
    333   3792   akolb {
    334   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    335   3792   akolb 	/*
    336   3792   akolb 	 * Cap should be currently active
    337   3792   akolb 	 */
    338   3792   akolb 	ASSERT(CPUCAPS_ON());
    339   3792   akolb 	ASSERT(list_link_active(&cap->cap_link));
    340   3792   akolb 	ASSERT(CAP_ENABLED(cap));
    341   3792   akolb 
    342   3792   akolb 	waitq_block(&cap->cap_waitq);
    343   3792   akolb 	list_remove(l, cap);
    344   3792   akolb 	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
    345   3792   akolb 		cpucaps_enabled = B_FALSE;
    346   3792   akolb 		cpucaps_clock_callout = NULL;
    347   3792   akolb 	}
    348   3792   akolb 	cap->cap_value = 0;
    349   3792   akolb 	cap->cap_project = NULL;
    350   3792   akolb 	cap->cap_zone = NULL;
    351   3792   akolb 	if (cap->cap_kstat != NULL) {
    352   3792   akolb 		kstat_delete(cap->cap_kstat);
    353   3792   akolb 		cap->cap_kstat = NULL;
    354   3792   akolb 	}
    355   3792   akolb 
    356   3792   akolb }
    357   3792   akolb 
    358   3792   akolb /*
    359   3792   akolb  * Enable cap for a project kpj
    360   3792   akolb  * It is safe to enable already enabled project cap.
    361   3792   akolb  * Should be called with caps_lock held.
    362   3792   akolb  */
    363   3792   akolb static void
    364   3792   akolb cap_project_enable(kproject_t *kpj, hrtime_t value)
    365   3792   akolb {
    366   3792   akolb 	cpucap_t *cap = kpj->kpj_cpucap;
    367   3792   akolb 
    368   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    369   3792   akolb 	ASSERT(cap != NULL);
    370   3792   akolb 
    371   3792   akolb 	if (CAP_DISABLED(cap)) {
    372   3792   akolb 		ASSERT(cap->cap_kstat == NULL);
    373   3792   akolb 		cap_enable(&capped_projects, cap, value);
    374   3792   akolb 		cap->cap_project = kpj;
    375   3792   akolb 		cap->cap_zone = kpj->kpj_zone;
    376   3792   akolb 
    377   3792   akolb 		/*
    378   3792   akolb 		 * Create cap kstats
    379   3792   akolb 		 */
    380   3792   akolb 		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
    381   3792   akolb 		    KSTAT_TYPE_NAMED,
    382   3792   akolb 		    sizeof (cap_kstat) / sizeof (kstat_named_t),
    383   3792   akolb 		    KSTAT_FLAG_VIRTUAL)) != NULL) {
    384  11066  rafael 			cap->cap_kstat->ks_data_size +=
    385  11066  rafael 			    strlen(cap->cap_zone->zone_name) + 1;
    386  11066  rafael 			cap->cap_kstat->ks_lock = &cap_kstat_lock;
    387  11066  rafael 			cap->cap_kstat->ks_data = &cap_kstat;
    388  11066  rafael 			cap->cap_kstat->ks_update = cap_kstat_update;
    389  11066  rafael 			cap->cap_kstat->ks_private = cap;
    390  11066  rafael 			kstat_install(cap->cap_kstat);
    391   3792   akolb 		}
    392   3792   akolb 	}
    393   3792   akolb }
    394   3792   akolb 
    395   3792   akolb /*
    396   3792   akolb  * Disable project cap.
    397   3792   akolb  * It is safe to disable already disabled project cap.
    398   3792   akolb  * Should be called with caps_lock held.
    399   3792   akolb  */
    400   3792   akolb static void
    401   3792   akolb cap_project_disable(kproject_t *kpj)
    402   3792   akolb {
    403   3792   akolb 	cpucap_t *cap = kpj->kpj_cpucap;
    404   3792   akolb 
    405   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    406   3792   akolb 	ASSERT(cap != NULL);
    407   3792   akolb 	ASSERT(cap->cap_project == kpj);
    408   3792   akolb 
    409   3792   akolb 	if (CAP_ENABLED(cap))
    410   3792   akolb 		cap_disable(&capped_projects, cap);
    411   3792   akolb }
    412   3792   akolb 
    413   3792   akolb /*
    414   3792   akolb  * Enable cap for a zone
    415   3792   akolb  * It is safe to enable already enabled zone cap.
    416   3792   akolb  * Should be called with caps_lock held.
    417   3792   akolb  */
    418   3792   akolb static void
    419   3792   akolb cap_zone_enable(zone_t *zone, hrtime_t value)
    420   3792   akolb {
    421   3792   akolb 	cpucap_t *cap = zone->zone_cpucap;
    422   3792   akolb 
    423   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    424   3792   akolb 	ASSERT(cap != NULL);
    425   3792   akolb 
    426   3792   akolb 	if (CAP_DISABLED(cap)) {
    427   3792   akolb 		ASSERT(cap->cap_kstat == NULL);
    428   3792   akolb 		cap_enable(&capped_zones, cap, value);
    429   3792   akolb 		cap->cap_zone = zone;
    430   3792   akolb 
    431   3792   akolb 		/*
    432   3792   akolb 		 * Create cap kstats
    433   3792   akolb 		 */
    434   3792   akolb 		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
    435   3792   akolb 		    KSTAT_TYPE_NAMED,
    436   3792   akolb 		    sizeof (cap_kstat) / sizeof (kstat_named_t),
    437   3792   akolb 		    KSTAT_FLAG_VIRTUAL)) != NULL) {
    438  11066  rafael 			cap->cap_kstat->ks_data_size +=
    439  11066  rafael 			    strlen(cap->cap_zone->zone_name) + 1;
    440  11066  rafael 			cap->cap_kstat->ks_lock = &cap_kstat_lock;
    441  11066  rafael 			cap->cap_kstat->ks_data = &cap_kstat;
    442  11066  rafael 			cap->cap_kstat->ks_update = cap_kstat_update;
    443  11066  rafael 			cap->cap_kstat->ks_private = cap;
    444  11066  rafael 			kstat_install(cap->cap_kstat);
    445   3792   akolb 		}
    446   3792   akolb 	}
    447   3792   akolb }
    448   3792   akolb 
    449   3792   akolb /*
    450   3792   akolb  * Disable zone cap.
    451   3792   akolb  * It is safe to disable already disabled zone cap.
    452   3792   akolb  * Should be called with caps_lock held.
    453   3792   akolb  */
    454   3792   akolb static void
    455   3792   akolb cap_zone_disable(zone_t *zone)
    456   3792   akolb {
    457   3792   akolb 	cpucap_t *cap = zone->zone_cpucap;
    458   3792   akolb 
    459   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    460   3792   akolb 	ASSERT(cap != NULL);
    461   3792   akolb 	ASSERT(cap->cap_zone == zone);
    462   3792   akolb 
    463   3792   akolb 	if (CAP_ENABLED(cap))
    464   3792   akolb 		cap_disable(&capped_zones, cap);
    465   3792   akolb }
    466   3792   akolb 
    467   3792   akolb /*
    468   3792   akolb  * Apply specified callback to all caps contained in the list `l'.
    469   3792   akolb  */
    470   3792   akolb static void
    471  11066  rafael cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
    472   3792   akolb {
    473  11066  rafael 	static uint64_t cpucap_walk_gen;
    474   3792   akolb 	cpucap_t *cap;
    475   3792   akolb 
    476   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    477   3792   akolb 
    478   3792   akolb 	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
    479  11066  rafael 		(*cb)(cap, cpucap_walk_gen);
    480   3792   akolb 	}
    481  11066  rafael 
    482  11066  rafael 	atomic_inc_64(&cpucap_walk_gen);
    483   3792   akolb }
    484   3792   akolb 
    485   3792   akolb /*
    486   3792   akolb  * If cap limit is not reached, make one thread from wait queue runnable.
    487   3792   akolb  * The waitq_isempty check is performed without the waitq lock. If a new thread
    488   3792   akolb  * is placed on the waitq right after the check, it will be picked up during the
    489   3792   akolb  * next invocation of cap_poke_waitq().
    490   3792   akolb  */
    491  11066  rafael /* ARGSUSED */
    492   3792   akolb static void
    493  11066  rafael cap_poke_waitq(cpucap_t *cap, int64_t gen)
    494   3792   akolb {
    495   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    496   3792   akolb 
    497   3792   akolb 	if (cap->cap_usage >= cap->cap_value) {
    498   3792   akolb 		cap->cap_above++;
    499   3792   akolb 	} else {
    500   3792   akolb 		waitq_t *wq = &cap->cap_waitq;
    501   3792   akolb 
    502   3792   akolb 		cap->cap_below++;
    503   3792   akolb 
    504   3792   akolb 		if (!waitq_isempty(wq))
    505   3792   akolb 			waitq_runone(wq);
    506   3792   akolb 	}
    507   3792   akolb }
    508   3792   akolb 
    509   3792   akolb /*
    510   3792   akolb  * The callback function called for every cap on capped_projects list.
    511   3792   akolb  * Decay cap usage by CAP_DECAY_FACTOR
    512   3792   akolb  * Add this cap project usage to its zone usage.
    513   3792   akolb  * Kick off a thread from the cap waitq if cap is not reached.
    514   3792   akolb  */
    515   3792   akolb static void
    516  11066  rafael cap_project_usage_walker(cpucap_t *cap, int64_t gen)
    517   3792   akolb {
    518   3792   akolb 	zone_t		*zone = cap->cap_zone;
    519   3792   akolb 	hrtime_t	cap_usage = cap->cap_usage;
    520   3792   akolb 
    521   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    522   3792   akolb 	ASSERT(cap->cap_project->kpj_cpucap == cap);
    523   3792   akolb 	ASSERT(zone == cap->cap_project->kpj_zone);
    524   3792   akolb 	ASSERT(CAP_ENABLED(cap));
    525   3792   akolb 
    526   3792   akolb 	/*
    527   3792   akolb 	 * Set or clear the CAP_REACHED flag based on the current usage.
    528   3792   akolb 	 * Only projects having their own caps are ever marked as CAP_REACHED.
    529   3792   akolb 	 */
    530  11066  rafael 	cap_poke_waitq(cap, 0);
    531   3792   akolb 
    532   3792   akolb 	/*
    533   3792   akolb 	 * Add project's CPU usage to our zone's CPU usage.
    534   3792   akolb 	 */
    535   3792   akolb 	if (ZONE_IS_CAPPED(zone)) {
    536   3792   akolb 		cpucap_t *zcap = zone->zone_cpucap;
    537   3792   akolb 
    538   3792   akolb 		ASSERT(zcap->cap_zone == zone);
    539   3792   akolb 
    540   3792   akolb 		/*
    541   3792   akolb 		 * If we haven't reset this zone's usage during this clock tick
    542  11066  rafael 		 * yet, then do it now. The cap_gen field is used to check
    543   3792   akolb 		 * whether this is the first zone's project we see during this
    544   3792   akolb 		 * tick or a subsequent one.
    545   3792   akolb 		 */
    546  11066  rafael 		if (zcap->cap_gen != gen) {
    547   3792   akolb 			if (zcap->cap_usage > zcap->cap_maxusage)
    548   3792   akolb 				zcap->cap_maxusage = zcap->cap_usage;
    549   3792   akolb 			zcap->cap_usage = 0;
    550  11066  rafael 			zcap->cap_gen = gen;
    551   3792   akolb 		}
    552   3792   akolb 		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
    553   3792   akolb 		    hrtime_t, cap_usage);
    554   3792   akolb 		zcap->cap_usage += cap_usage;
    555   3792   akolb 		/* Check for overflows */
    556   3792   akolb 		if (zcap->cap_usage < 0)
    557   3792   akolb 			zcap->cap_usage = MAX_USAGE - 1;
    558   3792   akolb 	}
    559   3792   akolb 
    560   3792   akolb 	/*
    561   3792   akolb 	 * Decay project usage.
    562   3792   akolb 	 */
    563   3792   akolb 	disp_lock_enter(&cap->cap_usagelock);
    564   3792   akolb 	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
    565   3792   akolb 	disp_lock_exit(&cap->cap_usagelock);
    566   3792   akolb }
    567   3792   akolb 
    568   3792   akolb /*
    569   3792   akolb  * On every clock tick walk the list of project caps and update the CPU usage.
    570   3792   akolb  * Also walk the list of zone caps checking whether any threads should
    571   3792   akolb  * transition from wait queue to run queue.
    572   3792   akolb  *
    573   3792   akolb  * This function gets called by the clock thread directly when there are any
    574   3792   akolb  * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
    575   3792   akolb  * caps_lock for long periods of time, so there should be almost no contention
    576   3792   akolb  * for it.
    577   3792   akolb  */
    578   3792   akolb static void
    579   3792   akolb caps_update()
    580   3792   akolb {
    581   3792   akolb 	mutex_enter(&caps_lock);
    582   3792   akolb 	cap_walk(&capped_projects, cap_project_usage_walker);
    583   3792   akolb 	cap_walk(&capped_zones, cap_poke_waitq);
    584   3792   akolb 	mutex_exit(&caps_lock);
    585   3792   akolb }
    586   3792   akolb 
    587   3792   akolb /*
    588   3792   akolb  * The function is called for each project in a zone when the zone cap is
    589   3792   akolb  * modified. It enables project caps if zone cap is enabled and disables if the
    590   3792   akolb  * zone cap is disabled and project doesn't have its own cap.
    591   3792   akolb  *
    592   3792   akolb  * For each project that does not have cpucap structure allocated it allocates a
    593   3792   akolb  * new structure and assigns to kpj->cpu_cap. The allocation is performed
    594   3792   akolb  * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock
    595   3792   akolb  * held.
    596   3792   akolb  */
    597   3792   akolb static int
    598   3792   akolb cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
    599   3792   akolb {
    600   3792   akolb 	cpucap_t *project_cap = NULL;
    601   3792   akolb 	cpucap_t *zone_cap = (cpucap_t *)arg;
    602   3792   akolb 
    603   3792   akolb 	ASSERT(zone_cap != NULL);
    604   3792   akolb 
    605   3792   akolb 	if (kpj->kpj_cpucap == NULL) {
    606   3792   akolb 		/*
    607   3792   akolb 		 * This is the first time any cap was established for this
    608   3792   akolb 		 * project. Allocate a new cpucap structure for it.
    609   3792   akolb 		 */
    610   3792   akolb 		project_cap = cap_alloc();
    611   3792   akolb 	}
    612   3792   akolb 
    613   3792   akolb 	mutex_enter(&caps_lock);
    614   3792   akolb 
    615   3792   akolb 	/*
    616   3792   akolb 	 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
    617   3792   akolb 	 * and assign the newly allocated cpucap structure to it.
    618   3792   akolb 	 */
    619   3792   akolb 	if (kpj->kpj_cpucap == NULL) {
    620   3792   akolb 		kpj->kpj_cpucap = project_cap;
    621   3792   akolb 	} else if (project_cap != NULL) {
    622   3792   akolb 		cap_free(project_cap);
    623   3792   akolb 	}
    624   3792   akolb 
    625   3792   akolb 	project_cap = kpj->kpj_cpucap;
    626   3792   akolb 
    627   3792   akolb 	if (CAP_DISABLED(zone_cap)) {
    628   3792   akolb 		/*
    629   3792   akolb 		 * Remove all projects in this zone without caps
    630   3792   akolb 		 * from the capped_projects list.
    631   3792   akolb 		 */
    632   3792   akolb 		if (project_cap->cap_value == MAX_USAGE) {
    633   3792   akolb 			cap_project_disable(kpj);
    634   3792   akolb 		}
    635   3792   akolb 	} else if (CAP_DISABLED(project_cap)) {
    636   3792   akolb 		/*
    637   3792   akolb 		 * Add the project to capped_projects list.
    638   3792   akolb 		 */
    639   3792   akolb 		ASSERT(project_cap->cap_value == 0);
    640   3792   akolb 		cap_project_enable(kpj, MAX_USAGE);
    641   3792   akolb 	}
    642   3792   akolb 	mutex_exit(&caps_lock);
    643   3792   akolb 
    644   3792   akolb 	return (0);
    645   3792   akolb }
    646   3792   akolb 
    647   3792   akolb /*
    648   3792   akolb  * Set zone cap to cap_val
    649   3792   akolb  * If cap_val is equal to NOCAP, disable zone cap.
    650   3792   akolb  *
    651   3792   akolb  * If this is the first time a cap is set on a zone, allocate cpucap structure
    652   3792   akolb  * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
    653   3792   akolb  */
    654   3792   akolb int
    655   3792   akolb cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
    656   3792   akolb {
    657   3792   akolb 	cpucap_t *cap = NULL;
    658   3792   akolb 	hrtime_t value;
    659   3792   akolb 
    660   3792   akolb 	if (cap_val == 0)
    661   3792   akolb 		return (EINVAL);
    662   3792   akolb 
    663   3792   akolb 	ASSERT(cap_val <= MAXCAP);
    664   3792   akolb 	if (cap_val > MAXCAP)
    665   3792   akolb 		cap_val = MAXCAP;
    666   3792   akolb 
    667   3792   akolb 	/*
    668   3792   akolb 	 * Nothing to do if trying to disable a cap on a zone when caps are off
    669   3792   akolb 	 * or a zone which does not have a cap yet.
    670   3792   akolb 	 */
    671   3792   akolb 	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
    672   3792   akolb 		return (0);
    673   3792   akolb 
    674   3792   akolb 	if (zone->zone_cpucap == NULL)
    675   3792   akolb 		cap = cap_alloc();
    676   3792   akolb 
    677   3792   akolb 	mutex_enter(&caps_lock);
    678   3792   akolb 
    679   3792   akolb 	if (cpucaps_busy) {
    680   3792   akolb 		mutex_exit(&caps_lock);
    681   3792   akolb 		return (EBUSY);
    682   3792   akolb 	}
    683   3792   akolb 
    684   3792   akolb 	/*
    685   3792   akolb 	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
    686   3792   akolb 	 * held. If it is still NULL, assign a newly allocated cpucap to it.
    687   3792   akolb 	 */
    688   3792   akolb 	if (zone->zone_cpucap == NULL) {
    689   3792   akolb 		zone->zone_cpucap = cap;
    690   3792   akolb 	} else if (cap != NULL) {
    691   3792   akolb 		cap_free(cap);
    692   3792   akolb 	}
    693   3792   akolb 
    694   3792   akolb 	cap = zone->zone_cpucap;
    695   3792   akolb 	value = cap_val * cap_tick_cost;
    696   3792   akolb 	if (value < 0)
    697   3792   akolb 		value = MAX_USAGE;
    698   3792   akolb 
    699   3792   akolb 	/* Nothing to do if the value is staying the same */
    700   3792   akolb 	if (value == cap->cap_value) {
    701   3792   akolb 		mutex_exit(&caps_lock);
    702   3792   akolb 		return (0);
    703   3792   akolb 	}
    704   3792   akolb 
    705   3792   akolb 	/*
    706   3792   akolb 	 * Clear cap statistics since the cap value itself changes.
    707   3792   akolb 	 */
    708   3792   akolb 	cap->cap_above = cap->cap_below = 0;
    709   3792   akolb 
    710   3792   akolb 
    711   3792   akolb 	if (cap_val == NOCAP) {
    712   3792   akolb 		if (CAP_ENABLED(cap)) {
    713   3792   akolb 			/*
    714   3792   akolb 			 * Remove cap for the zone
    715   3792   akolb 			 */
    716   3792   akolb 			cap_zone_disable(zone);
    717   3792   akolb 			cpucaps_busy = B_TRUE;
    718   3792   akolb 			mutex_exit(&caps_lock);
    719   3792   akolb 			/*
    720   3792   akolb 			 * Disable caps for all project belonging to this zone
    721   3792   akolb 			 * unless they have their own cap.
    722   3792   akolb 			 */
    723   3792   akolb 			(void) project_walk_all(zone->zone_id,
    724   3792   akolb 			    cap_project_zone_modify_walker, cap);
    725   3792   akolb 
    726   3792   akolb 			mutex_enter(&caps_lock);
    727   3792   akolb 			cpucaps_busy = B_FALSE;
    728   3792   akolb 		}
    729   3792   akolb 	} else if (CAP_DISABLED(cap)) {
    730   3792   akolb 		/*
    731   3792   akolb 		 * Set a cap on a zone which previously was not capped.
    732   3792   akolb 		 */
    733   3792   akolb 		cap_zone_enable(zone, value);
    734   3792   akolb 		cpucaps_busy = B_TRUE;
    735   3792   akolb 		mutex_exit(&caps_lock);
    736   3792   akolb 
    737   3792   akolb 		/*
    738   3792   akolb 		 * Enable cap for all projects belonging to this zone.
    739   3792   akolb 		 */
    740   3792   akolb 		(void) project_walk_all(zone->zone_id,
    741   3792   akolb 		    cap_project_zone_modify_walker, cap);
    742   3792   akolb 
    743   3792   akolb 		mutex_enter(&caps_lock);
    744   3792   akolb 		cpucaps_busy = B_FALSE;
    745   3792   akolb 	} else {
    746   3792   akolb 		/*
    747   3792   akolb 		 * No state transitions, just change the value
    748   3792   akolb 		 */
    749   3792   akolb 		cap->cap_value = value;
    750   3792   akolb 	}
    751   3792   akolb 
    752   3792   akolb 	ASSERT(MUTEX_HELD(&caps_lock));
    753   3792   akolb 	ASSERT(!cpucaps_busy);
    754   3792   akolb 	mutex_exit(&caps_lock);
    755   3792   akolb 
    756   3792   akolb 	return (0);
    757   3792   akolb }
    758   3792   akolb 
    759   3792   akolb /*
    760   3792   akolb  * The project is going away so disable its cap.
    761   3792   akolb  */
    762   3792   akolb void
    763   3792   akolb cpucaps_project_remove(kproject_t *kpj)
    764   3792   akolb {
    765   3792   akolb 	mutex_enter(&caps_lock);
    766   3792   akolb 	if (PROJECT_IS_CAPPED(kpj))
    767   3792   akolb 		cap_project_disable(kpj);
    768   3792   akolb 	if (kpj->kpj_cpucap != NULL) {
    769   3792   akolb 		cap_free(kpj->kpj_cpucap);
    770   3792   akolb 		kpj->kpj_cpucap = NULL;
    771   3792   akolb 	}
    772   3792   akolb 	mutex_exit(&caps_lock);
    773   3792   akolb }
    774   3792   akolb 
    775   3792   akolb /*
    776   3792   akolb  * The zone is going away, so disable its cap.
    777   3792   akolb  */
    778   3792   akolb void
    779   3792   akolb cpucaps_zone_remove(zone_t *zone)
    780   3792   akolb {
    781   3792   akolb 	mutex_enter(&caps_lock);
    782   3792   akolb 	while (ZONE_IS_CAPPED(zone)) {
    783   3792   akolb 		mutex_exit(&caps_lock);
    784   3792   akolb 		(void) cpucaps_zone_set(zone, NOCAP);
    785   3792   akolb 		mutex_enter(&caps_lock);
    786   3792   akolb 	}
    787   3792   akolb 	if (zone->zone_cpucap != NULL) {
    788   3792   akolb 		cap_free(zone->zone_cpucap);
    789   3792   akolb 		zone->zone_cpucap = NULL;
    790   3792   akolb 	}
    791   3792   akolb 	mutex_exit(&caps_lock);
    792   3792   akolb }
    793   3792   akolb 
    794   3792   akolb /*
    795   3792   akolb  * New project was created. It should be put on the capped_projects list if
    796   3792   akolb  * its zone has a cap.
    797   3792   akolb  */
    798   3792   akolb void
    799   3792   akolb cpucaps_project_add(kproject_t *kpj)
    800   3792   akolb {
    801   3792   akolb 	cpucap_t *cap = NULL;
    802   3792   akolb 
    803   3792   akolb 	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
    804   3792   akolb 		return;
    805   3792   akolb 
    806   3792   akolb 	/*
    807   3792   akolb 	 * This project was never capped before, so allocate its cap structure.
    808   3792   akolb 	 */
    809   3792   akolb 	if (kpj->kpj_cpucap == NULL)
    810   3792   akolb 		cap = cap_alloc();
    811   3792   akolb 
    812   3792   akolb 	mutex_enter(&caps_lock);
    813   3792   akolb 	/*
    814   3792   akolb 	 * Double-check with caps_lock held
    815   3792   akolb 	 */
    816   3792   akolb 	if (kpj->kpj_cpucap == NULL) {
    817   3792   akolb 		kpj->kpj_cpucap = cap;
    818   3792   akolb 	} else if (cap != NULL) {
    819   3792   akolb 		cap_free(cap);
    820   3792   akolb 	}
    821   3792   akolb 
    822   3792   akolb 	if (ZONE_IS_CAPPED(kpj->kpj_zone))
    823   3792   akolb 		cap_project_enable(kpj, MAX_USAGE);
    824   3792   akolb 
    825   3792   akolb 	mutex_exit(&caps_lock);
    826   3792   akolb }
    827   3792   akolb 
    828   3792   akolb /*
    829   3792   akolb  * Set project cap to cap_val
    830   3792   akolb  * If cap_val is equal to NOCAP, disable project cap.
    831   3792   akolb  *
    832   3792   akolb  * If this is the first time a cap is set on a project, allocate cpucap
    833   3792   akolb  * structure without holding caps_lock to avoid KM_SLEEP allocation with
    834   3792   akolb  * caps_lock held.
    835   3792   akolb  */
    836   3792   akolb int
    837   3792   akolb cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
    838   3792   akolb {
    839   3792   akolb 	cpucap_t *cap = NULL;
    840   3792   akolb 	hrtime_t value;
    841   3792   akolb 
    842   3792   akolb 	if (cap_val == 0)
    843   3792   akolb 		return (EINVAL);
    844   3792   akolb 
    845   3792   akolb 	ASSERT(cap_val <= MAXCAP);
    846   3792   akolb 	if (cap_val > MAXCAP)
    847   3792   akolb 		cap_val = MAXCAP;
    848   3792   akolb 
    849   3792   akolb 	/*
    850   3792   akolb 	 * Nothing to do if trying to disable project cap and caps are not
    851   3792   akolb 	 * enabled or if trying to disable cap on a project that does not have
    852   3792   akolb 	 * cap enabled.
    853   3792   akolb 	 */
    854   3792   akolb 	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
    855   3792   akolb 		return (0);
    856   3792   akolb 
    857   3792   akolb 	if (kpj->kpj_cpucap == NULL) {
    858   3792   akolb 		/*
    859   3792   akolb 		 * This project was never capped before, so allocate its cap
    860   3792   akolb 		 * structure.
    861   3792   akolb 		 */
    862   3792   akolb 		cap = cap_alloc();
    863   3792   akolb 	}
    864   3792   akolb 
    865   3792   akolb 	mutex_enter(&caps_lock);
    866   3792   akolb 
    867   3792   akolb 	/*
    868   3792   akolb 	 * Double-check with caps_lock held.
    869   3792   akolb 	 */
    870   3792   akolb 	if (kpj->kpj_cpucap == NULL) {
    871   3792   akolb 		kpj->kpj_cpucap = cap;
    872   3792   akolb 	} else if (cap != NULL) {
    873   3792   akolb 		cap_free(cap);
    874   3792   akolb 	}
    875   3792   akolb 
    876   3792   akolb 	/*
    877   3792   akolb 	 * Get the actual pointer to the project cap.
    878   3792   akolb 	 */
    879   3792   akolb 	cap = kpj->kpj_cpucap;
    880   3792   akolb 	value = cap_val * cap_tick_cost;
    881   3792   akolb 	if (value < 0)
    882   3792   akolb 		value = MAX_USAGE;
    883   3792   akolb 
    884   3792   akolb 	/*
    885   3792   akolb 	 * Nothing to do if the value is not changing
    886   3792   akolb 	 */
    887   3792   akolb 	if (value == cap->cap_value) {
    888   3792   akolb 		mutex_exit(&caps_lock);
    889   3792   akolb 		return (0);
    890   3792   akolb 	}
    891   3792   akolb 
    892   3792   akolb 	/*
    893   3792   akolb 	 * Clear cap statistics since the cap value itself changes.
    894   3792   akolb 	 */
    895   3792   akolb 	cap->cap_above = cap->cap_below = 0;
    896   3792   akolb 	cap->cap_maxusage = 0;
    897   3792   akolb 
    898   3792   akolb 	if (cap_val != NOCAP) {
    899   3792   akolb 		/*
    900   3792   akolb 		 * Enable this cap if it is not already enabled.
    901   3792   akolb 		 */
    902   3792   akolb 		if (CAP_DISABLED(cap))
    903   3792   akolb 			cap_project_enable(kpj, value);
    904   3792   akolb 		else
    905   3792   akolb 			cap->cap_value = value;
    906   3792   akolb 	} else if (CAP_ENABLED(cap)) {
    907   3792   akolb 		/*
    908   3792   akolb 		 * User requested to drop a cap on the project. If it is part of
    909   3792   akolb 		 * capped zone, keep the cap and set the value to MAX_USAGE,
    910   3792   akolb 		 * otherwise disable the cap.
    911   3792   akolb 		 */
    912   3792   akolb 		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
    913   3792   akolb 			cap->cap_value = MAX_USAGE;
    914   3792   akolb 		} else {
    915   3792   akolb 			cap_project_disable(kpj);
    916   3792   akolb 		}
    917   3792   akolb 	}
    918   3792   akolb 	mutex_exit(&caps_lock);
    919   3792   akolb 
    920   3792   akolb 	return (0);
    921   3792   akolb }
    922   3792   akolb 
    923   3792   akolb /*
    924   3792   akolb  * Get cap usage.
    925   3792   akolb  */
    926   3792   akolb static rctl_qty_t
    927   3792   akolb cap_get(cpucap_t *cap)
    928   3792   akolb {
    929   3792   akolb 	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
    930   3792   akolb }
    931   3792   akolb 
    932   3792   akolb /*
    933   3792   akolb  * Get current project usage.
    934   3792   akolb  */
    935   3792   akolb rctl_qty_t
    936   3792   akolb cpucaps_project_get(kproject_t *kpj)
    937   3792   akolb {
    938   3792   akolb 	return (cap_get(kpj->kpj_cpucap));
    939   3792   akolb }
    940   3792   akolb 
    941   3792   akolb /*
    942   3792   akolb  * Get current zone usage.
    943   3792   akolb  */
    944   3792   akolb rctl_qty_t
    945   3792   akolb cpucaps_zone_get(zone_t *zone)
    946   3792   akolb {
    947   3792   akolb 	return (cap_get(zone->zone_cpucap));
    948   3792   akolb }
    949   3792   akolb 
    950   3792   akolb /*
    951   3792   akolb  * Charge project of thread t the time thread t spent on CPU since previously
    952   3792   akolb  * adjusted.
    953   3792   akolb  *
    954   3792   akolb  * Record the current on-CPU time in the csc structure.
    955   3792   akolb  *
    956   3792   akolb  * Do not adjust for more than one tick worth of time.
    957   3792   akolb  *
    958   4939   akolb  * It is possible that the project cap is being disabled while this routine is
    959   4939   akolb  * executed. This should not cause any issues since the association between the
    960   4939   akolb  * thread and its project is protected by thread lock.
    961   3792   akolb  */
    962   3792   akolb static void
    963   3792   akolb caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
    964   3792   akolb {
    965   3792   akolb 	kproject_t	*kpj = ttoproj(t);
    966   3792   akolb 	hrtime_t	new_usage;
    967   3792   akolb 	hrtime_t	usage_delta;
    968   3792   akolb 
    969   3792   akolb 	ASSERT(THREAD_LOCK_HELD(t));
    970   4939   akolb 	ASSERT(kpj->kpj_cpucap != NULL);
    971   3792   akolb 
    972   3792   akolb 	/* Get on-CPU time since birth of a thread */
    973   3792   akolb 	new_usage = mstate_thread_onproc_time(t);
    974   3792   akolb 
    975   3792   akolb 	/* Time spent on CPU since last checked */
    976   3792   akolb 	usage_delta = new_usage - csc->csc_cputime;
    977   3792   akolb 
    978   3792   akolb 	/* Save the accumulated on-CPU time */
    979   3792   akolb 	csc->csc_cputime = new_usage;
    980   3792   akolb 
    981   3792   akolb 	/* Charge at most one tick worth of on-CPU time */
    982   3792   akolb 	if (usage_delta > cap_tick_cost)
    983   3792   akolb 		usage_delta = cap_tick_cost;
    984   3792   akolb 
    985   3792   akolb 	/* Add usage_delta to the project usage value. */
    986   3792   akolb 	if (usage_delta > 0) {
    987   3792   akolb 		cpucap_t *cap = kpj->kpj_cpucap;
    988   3792   akolb 
    989   3792   akolb 		DTRACE_PROBE2(cpucaps__project__charge,
    990   3792   akolb 		    kthread_id_t, t, hrtime_t, usage_delta);
    991   3792   akolb 
    992   3792   akolb 		disp_lock_enter_high(&cap->cap_usagelock);
    993   3792   akolb 		cap->cap_usage += usage_delta;
    994   3792   akolb 
    995   3792   akolb 		/* Check for overflows */
    996   3792   akolb 		if (cap->cap_usage < 0)
    997   3792   akolb 			cap->cap_usage = MAX_USAGE - 1;
    998   3792   akolb 
    999   3792   akolb 		disp_lock_exit_high(&cap->cap_usagelock);
   1000   3792   akolb 
   1001   3792   akolb 		/*
   1002   3792   akolb 		 * cap_maxusage is only kept for observability. Move it outside
   1003   3792   akolb 		 * the lock to reduce the time spent while holding the lock.
   1004   3792   akolb 		 */
   1005   3792   akolb 		if (cap->cap_usage > cap->cap_maxusage)
   1006   3792   akolb 			cap->cap_maxusage = cap->cap_usage;
   1007   3792   akolb 	}
   1008   3792   akolb }
   1009   3792   akolb 
   1010   3792   akolb /*
   1011   3792   akolb  * Charge thread's project and return True if project or zone should be
   1012   3792   akolb  * penalized because its project or zone is exceeding its cap. Also sets
   1013   3792   akolb  * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
   1014   4939   akolb  *
   1015   4939   akolb  * It is possible that the project cap is being disabled while this routine is
   1016   4939   akolb  * executed. This should not cause any issues since the association between the
   1017   4939   akolb  * thread and its project is protected by thread lock. It will still set
   1018   4939   akolb  * TS_PROJECTWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place
   1019   4939   akolb  * anything on the blocked wait queue.
   1020   4939   akolb  *
   1021   3792   akolb  */
   1022   3792   akolb boolean_t
   1023   3792   akolb cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
   1024   3792   akolb {
   1025   3792   akolb 	kproject_t	*kpj = ttoproj(t);
   1026   3792   akolb 	klwp_t		*lwp = t->t_lwp;
   1027   3792   akolb 	zone_t		*zone;
   1028   3792   akolb 	cpucap_t	*project_cap;
   1029   3792   akolb 	boolean_t	rc = B_FALSE;
   1030   3792   akolb 
   1031   3792   akolb 	ASSERT(THREAD_LOCK_HELD(t));
   1032   3792   akolb 
   1033   3792   akolb 	/* Nothing to do for projects that are not capped. */
   1034   3792   akolb 	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
   1035   3792   akolb 		return (B_FALSE);
   1036   3792   akolb 
   1037   3792   akolb 	caps_charge_adjust(t, csc);
   1038   3792   akolb 
   1039   3792   akolb 	/*
   1040   3792   akolb 	 * The caller only requested to charge the project usage, no enforcement
   1041   3792   akolb 	 * part.
   1042   3792   akolb 	 */
   1043   3792   akolb 	if (charge_type == CPUCAPS_CHARGE_ONLY)
   1044   3792   akolb 		return (B_FALSE);
   1045   3792   akolb 
   1046   3792   akolb 	project_cap = kpj->kpj_cpucap;
   1047   3792   akolb 
   1048   3792   akolb 	if (project_cap->cap_usage >= project_cap->cap_value) {
   1049   3792   akolb 		t->t_schedflag |= TS_PROJWAITQ;
   1050   3792   akolb 		rc = B_TRUE;
   1051   3792   akolb 	} else if (t->t_schedflag & TS_PROJWAITQ) {
   1052   3792   akolb 		t->t_schedflag &= ~TS_PROJWAITQ;
   1053   3792   akolb 	}
   1054   3792   akolb 
   1055   3792   akolb 	zone = ttozone(t);
   1056   3792   akolb 	if (!ZONE_IS_CAPPED(zone)) {
   1057   3792   akolb 		if (t->t_schedflag & TS_ZONEWAITQ)
   1058   3792   akolb 			t->t_schedflag &= ~TS_ZONEWAITQ;
   1059   3792   akolb 	} else {
   1060   3792   akolb 		cpucap_t *zone_cap = zone->zone_cpucap;
   1061   3792   akolb 
   1062   3792   akolb 		if (zone_cap->cap_usage >= zone_cap->cap_value) {
   1063   3792   akolb 			t->t_schedflag |= TS_ZONEWAITQ;
   1064   3792   akolb 			rc = B_TRUE;
   1065   3792   akolb 		} else if (t->t_schedflag & TS_ZONEWAITQ) {
   1066   3792   akolb 			t->t_schedflag &= ~TS_ZONEWAITQ;
   1067   3792   akolb 		}
   1068   3792   akolb 	}
   1069   3792   akolb 
   1070   3792   akolb 
   1071   3792   akolb 	return (rc);
   1072   3792   akolb }
   1073   3792   akolb 
   1074   3792   akolb /*
   1075   3792   akolb  * Enforce CPU caps. If got preempted in the user-land, we know that thread does
   1076   3792   akolb  * not hold any kernel locks, so enqueue ourselves on the waitq, if needed.
   1077   3792   akolb  *
   1078   3792   akolb  * CPU Caps are only enforced for user threads.
   1079   3792   akolb  *
   1080   3792   akolb  * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and
   1081   3792   akolb  * threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
   1082   3792   akolb  *
   1083   3792   akolb  * It is possible that by the time we enter cpucaps_enforce() the cap is already
   1084   3792   akolb  * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We
   1085   3792   akolb  * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer
   1086   3792   akolb  * apply.
   1087   3792   akolb  */
   1088   3792   akolb boolean_t
   1089   3792   akolb cpucaps_enforce(kthread_t *t)
   1090   3792   akolb {
   1091   3792   akolb 	klwp_t *lwp = t->t_lwp;
   1092   3792   akolb 
   1093   3792   akolb 	ASSERT(THREAD_LOCK_HELD(t));
   1094   3792   akolb 
   1095   3792   akolb 	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
   1096   3792   akolb 		if (t->t_schedflag & TS_PROJWAITQ) {
   1097   3792   akolb 			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
   1098   3792   akolb 			t->t_schedflag &= ~TS_ANYWAITQ;
   1099   3792   akolb 			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
   1100  11066  rafael 			    t)) {
   1101   3792   akolb 				return (B_TRUE);
   1102   3792   akolb 			}
   1103   3792   akolb 		}
   1104   3792   akolb 		if (t->t_schedflag & TS_ZONEWAITQ) {
   1105   3792   akolb 			ASSERT(ttozone(t)->zone_cpucap != NULL);
   1106   3792   akolb 			t->t_schedflag &= ~TS_ZONEWAITQ;
   1107   3792   akolb 			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
   1108  11066  rafael 			    t)) {
   1109   3792   akolb 				return (B_TRUE);
   1110   3792   akolb 			}
   1111   3792   akolb 		}
   1112   3792   akolb 	}
   1113   3792   akolb 
   1114   3792   akolb 	/*
   1115   3792   akolb 	 * The thread is not enqueued on the wait queue.
   1116   3792   akolb 	 */
   1117   3792   akolb 	return (B_FALSE);
   1118   3792   akolb }
   1119   3792   akolb 
   1120   3792   akolb /*
   1121   3792   akolb  * Convert internal cap statistics into values exported by cap kstat.
   1122   3792   akolb  */
   1123   3792   akolb static int
   1124   3792   akolb cap_kstat_update(kstat_t *ksp, int rw)
   1125   3792   akolb {
   1126   3792   akolb 	struct cap_kstat *capsp = &cap_kstat;
   1127   3792   akolb 	cpucap_t *cap = ksp->ks_private;
   1128   3792   akolb 	clock_t	tick_sec = SEC_TO_TICK(1);
   1129   3792   akolb 	char *zonename = cap->cap_zone->zone_name;
   1130   3792   akolb 
   1131   3792   akolb 	if (rw == KSTAT_WRITE)
   1132   3792   akolb 		return (EACCES);
   1133   3792   akolb 
   1134   3792   akolb 	capsp->cap_value.value.ui64 =
   1135   3792   akolb 	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
   1136   3792   akolb 	capsp->cap_usage.value.ui64 =
   1137   3792   akolb 	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
   1138   3792   akolb 	capsp->cap_maxusage.value.ui64 =
   1139   3792   akolb 	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
   1140   3792   akolb 	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
   1141   3792   akolb 	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
   1142   3792   akolb 	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
   1143   3792   akolb 	kstat_named_setstr(&capsp->cap_zonename, zonename);
   1144   3792   akolb 
   1145   3792   akolb 	return (0);
   1146   3792   akolb }
   1147