Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/cpu_pm.h>
     27 #include <sys/cmn_err.h>
     28 #include <sys/time.h>
     29 #include <sys/sdt.h>
     30 
     31 /*
     32  * Solaris Event Based CPU Power Manager
     33  *
     34  * This file implements platform independent event based CPU power management.
     35  * When CPUs are configured into the system, the CMT scheduling subsystem will
     36  * query the platform to determine if the CPU belongs to any power management
     37  * domains. That is, sets of CPUs that share power management states.
     38  *
     39  * Active Power Management domains represent a group of CPUs across which the
     40  * Operating System can request speed changes (which may in turn result
     41  * in voltage changes). This allows the operating system to trade off
     42  * performance for power savings.
     43  *
     44  * Idle Power Management domains can enter power savings states when they are
     45  * unutilized. These states allow the Operating System to trade off power
     46  * for performance (in the form of latency to transition from the idle state
     47  * to an active one).
     48  *
     49  * For each active and idle power domain the CMT subsystem instantiates, a
     50  * cpupm_domain_t structure is created. As the dispatcher schedules threads
     51  * to run on the system's CPUs, it will also track the utilization of the
     52  * enumerated power domains. Significant changes in utilization will result
     53  * in the dispatcher sending the power manager events that relate to the
     54  * utilization of the power domain. The power manager receives the events,
     55  * and in the context of the policy objectives in force, may decide to request
     56  * the domain's power/performance state be changed.
     57  *
     58  * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
     59  * manager will request the CPUs in the domain run at their fastest (and most
     60  * power consuming) state. When the domain becomes idle (utilization at zero),
     61  * the power manager will request that the CPUs run at a speed that saves the
     62  * most power.
     63  *
     64  * The advantage of this scheme, is that the CPU power manager working with the
     65  * dispatcher can be extremely responsive to changes in utilization. Optimizing
     66  * for performance in the presence of utilization, and power savings in the
     67  * presence of idleness. Such close collaboration with the dispatcher has other
     68  * benefits that will play out in the form of more sophisticated power /
     69  * performance policy in the near future.
     70  *
     71  * Avoiding state thrashing in the presence of transient periods of utilization
     72  * and idleness while still being responsive to non-transient periods is key.
     73  * The power manager implements a "governor" that is used to throttle
     74  * state transitions when a significant amount of transient idle or transient
     75  * work is detected.
     76  *
     77  * Kernel background activity (e.g. taskq threads) is by far the most common
     78  * form of transient utilization. Ungoverned in the face of this utilization,
     79  * hundreds of state transitions per second would result on an idle system.
     80  *
     81  * Transient idleness is common when a thread briefly yields the CPU to
     82  * wait for an event elsewhere in the system. Where the idle period is short
     83  * enough, the overhead associated with making the state transition doesn't
     84  * justify the power savings.
     85  *
     86  * The following is the state machine for the governor implemented by
     87  * cpupm_utilization_event():
     88  *
     89  *         ----->---tw---->-----
     90  *        /                     \
     91  *      (I)-<-ti-<-     -<-ntw-<(W)
     92  *       |         \   /         |
     93  *       \          \ /          /
     94  *        >-nti/rm->(D)--->-tw->-
     95  * Key:
     96  *
     97  * States
     98  * - (D): Default (ungoverned)
     99  * - (W): Transient work governed
    100  * - (I): Transient idle governed
    101  * State Transitions
    102  * - tw: transient work
    103  * - ti: transient idleness
    104  * - ntw: non-transient work
    105  * - nti: non-transient idleness
    106  * - rm: thread remain event
    107  */
    108 
/*
 * Head of the list of known power domains, linked through cpd_next.
 * New domains are pushed onto the head by cpupm_domain_create() and the
 * list is searched by cpupm_domain_find().
 */
static cpupm_domain_t *cpupm_domains = NULL;

/*
 * The CPU power management policy currently in force. Until a policy is
 * set through cpupm_set_policy(), CPU power management is disabled.
 */
cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;

/*
 * Periods of utilization lasting less than this time interval are characterized
 * as transient. State changes associated with transient work are considered
 * to be mispredicted. That is, it's not worth raising and lowering power states
 * where the utilization lasts for less than this interval.
 *
 * Set by cpupm_governor_initialize() from cpupm_tw_gov_interval.
 */
hrtime_t cpupm_tw_predict_interval;

/*
 * Periods of idleness lasting less than this time interval are characterized
 * as transient. State changes associated with transient idle are considered
 * to be mispredicted. That is, it's not worth lowering and raising power
 * states where the idleness lasts for less than this interval.
 *
 * Set by cpupm_governor_initialize() from cpupm_ti_gov_interval.
 */
hrtime_t cpupm_ti_predict_interval;

/*
 * Number of mispredictions after which future transitions will be governed.
 */
int cpupm_mispredict_thresh = 4;

/*
 * Likewise, the number of mispredicted governed transitions after which the
 * governor will be removed.
 */
int cpupm_mispredict_gov_thresh = 4;

/*
 * The transient work and transient idle prediction intervals are specified
 * here. Tuning them higher will result in the transient work, and transient
 * idle governors being used more aggressively, which limits the frequency of
 * state transitions at the expense of performance and power savings,
 * respectively. The intervals are specified in nanoseconds.
 */
/*
 * 400 usec
 */
#define	CPUPM_DEFAULT_TI_INTERVAL	400000
/*
 * 400 usec
 */
#define	CPUPM_DEFAULT_TW_INTERVAL	400000

/*
 * Governor intervals in nanoseconds. cpupm_governor_initialize() converts
 * these to unscaled hrtime and stores the results in the corresponding
 * cpupm_*_predict_interval variables.
 */
hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL;
hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL;


/*
 * Forward declarations of file-local helpers defined later in this file.
 */
static void	cpupm_governor_initialize(void);
static void	cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
    165 
    166 cpupm_policy_t
    167 cpupm_get_policy(void)
    168 {
    169 	return (cpupm_policy);
    170 }
    171 
    172 int
    173 cpupm_set_policy(cpupm_policy_t new_policy)
    174 {
    175 	static int	gov_init = 0;
    176 	int		result = 0;
    177 
    178 	mutex_enter(&cpu_lock);
    179 	if (new_policy == cpupm_policy) {
    180 		mutex_exit(&cpu_lock);
    181 		return (result);
    182 	}
    183 
    184 	/*
    185 	 * Pausing CPUs causes a high priority thread to be scheduled
    186 	 * on all other CPUs (besides the current one). This locks out
    187 	 * other CPUs from making CPUPM state transitions.
    188 	 */
    189 	switch (new_policy) {
    190 	case CPUPM_POLICY_DISABLED:
    191 		pause_cpus(NULL);
    192 		cpupm_policy = CPUPM_POLICY_DISABLED;
    193 		start_cpus();
    194 
    195 		result = cmt_pad_disable(PGHW_POW_ACTIVE);
    196 
    197 		/*
    198 		 * Once PAD has been enabled, it should always be possible
    199 		 * to disable it.
    200 		 */
    201 		ASSERT(result == 0);
    202 
    203 		/*
    204 		 * Bring all the active power domains to the maximum
    205 		 * performance state.
    206 		 */
    207 		cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
    208 		    CPUPM_STATE_MAX_PERF);
    209 
    210 		break;
    211 	case CPUPM_POLICY_ELASTIC:
    212 
    213 		result = cmt_pad_enable(PGHW_POW_ACTIVE);
    214 		if (result < 0) {
    215 			/*
    216 			 * Failed to enable PAD across the active power
    217 			 * domains, which may well be because none were
    218 			 * enumerated.
    219 			 */
    220 			break;
    221 		}
    222 
    223 		/*
    224 		 * Initialize the governor parameters the first time through.
    225 		 */
    226 		if (gov_init == 0) {
    227 			cpupm_governor_initialize();
    228 			gov_init = 1;
    229 		}
    230 
    231 		pause_cpus(NULL);
    232 		cpupm_policy = CPUPM_POLICY_ELASTIC;
    233 		start_cpus();
    234 
    235 		break;
    236 	default:
    237 		cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
    238 		    new_policy);
    239 		ASSERT(0);
    240 		break;
    241 	}
    242 	mutex_exit(&cpu_lock);
    243 
    244 	return (result);
    245 }
    246 
    247 /*
    248  * Look for an existing power domain
    249  */
    250 static cpupm_domain_t *
    251 cpupm_domain_find(id_t id, cpupm_dtype_t type)
    252 {
    253 	ASSERT(MUTEX_HELD(&cpu_lock));
    254 
    255 	cpupm_domain_t *dom;
    256 
    257 	dom = cpupm_domains;
    258 	while (dom != NULL) {
    259 		if (id == dom->cpd_id && type == dom->cpd_type)
    260 			return (dom);
    261 		dom = dom->cpd_next;
    262 	}
    263 	return (NULL);
    264 }
    265 
    266 /*
    267  * Create a new domain
    268  */
    269 static cpupm_domain_t *
    270 cpupm_domain_create(id_t id, cpupm_dtype_t type)
    271 {
    272 	cpupm_domain_t *dom;
    273 
    274 	ASSERT(MUTEX_HELD(&cpu_lock));
    275 
    276 	dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
    277 	dom->cpd_id = id;
    278 	dom->cpd_type = type;
    279 
    280 	/* Link into the known domain list */
    281 	dom->cpd_next = cpupm_domains;
    282 	cpupm_domains = dom;
    283 
    284 	return (dom);
    285 }
    286 
    287 static void
    288 cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
    289 {
    290 	/*
    291 	 * In the envent we're enumerating because the domain's state
    292 	 * configuration has changed, toss any existing states.
    293 	 */
    294 	if (dom->cpd_nstates > 0) {
    295 		kmem_free(dom->cpd_states,
    296 		    sizeof (cpupm_state_t) * dom->cpd_nstates);
    297 		dom->cpd_nstates = 0;
    298 	}
    299 
    300 	/*
    301 	 * Query to determine the number of states, allocate storage
    302 	 * large enough to hold the state information, and pass it back
    303 	 * to the platform driver to complete the enumeration.
    304 	 */
    305 	dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
    306 
    307 	if (dom->cpd_nstates == 0)
    308 		return;
    309 
    310 	dom->cpd_states =
    311 	    kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
    312 	(void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
    313 }
    314 
    315 /*
    316  * Initialize the specified type of power domain on behalf of the CPU
    317  */
    318 cpupm_domain_t *
    319 cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
    320 {
    321 	cpupm_domain_t	*dom;
    322 	id_t		did;
    323 
    324 	ASSERT(MUTEX_HELD(&cpu_lock));
    325 
    326 	/*
    327 	 * Instantiate the domain if it doesn't already exist
    328 	 * and enumerate its power states.
    329 	 */
    330 	did = cpupm_domain_id(cp, type);
    331 	dom = cpupm_domain_find(did, type);
    332 	if (dom == NULL) {
    333 		dom = cpupm_domain_create(did, type);
    334 		cpupm_domain_state_enum(cp, dom);
    335 	}
    336 
    337 	/*
    338 	 * Named state initialization
    339 	 */
    340 	if (type == CPUPM_DTYPE_ACTIVE) {
    341 		/*
    342 		 * For active power domains, the highest performance
    343 		 * state is defined as first state returned from
    344 		 * the domain enumeration.
    345 		 */
    346 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
    347 		    &dom->cpd_states[0];
    348 		dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
    349 		    &dom->cpd_states[dom->cpd_nstates - 1];
    350 
    351 		/*
    352 		 * Begin by assuming CPU is running at the max perf state.
    353 		 */
    354 		dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
    355 	}
    356 
    357 	return (dom);
    358 }
    359 
    360 /*
    361  * Return the id associated with the given type of domain
    362  * to which cp belongs
    363  */
    364 id_t
    365 cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
    366 {
    367 	return (cpupm_plat_domain_id(cp, type));
    368 }
    369 
    370 /*
    371  * Initiate a state change for the specified domain on behalf of cp
    372  */
    373 int
    374 cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
    375 {
    376 	if (cpupm_plat_change_state(cp, state) < 0)
    377 		return (-1);
    378 
    379 	DTRACE_PROBE2(cpupm__change__state,
    380 	    cpupm_domain_t *, dom,
    381 	    cpupm_state_t *, state);
    382 
    383 	dom->cpd_state = state;
    384 	return (0);
    385 }
    386 
/*
 * Interface into the CPU power manager to indicate a significant change
 * in utilization of the specified active power domain.
 *
 * Arguments:
 *	cp	CPU on whose behalf a resulting state change is requested
 *	now	timestamp of the event; compared against the domain's
 *		recorded cpd_last_raise / cpd_last_lower timestamps and the
 *		prediction intervals set up by cpupm_governor_initialize()
 *	dom	the power domain whose utilization changed
 *	event	the kind of utilization change being reported
 *
 * Nothing is done while the policy is CPUPM_POLICY_DISABLED. Otherwise the
 * event updates the domain's governor statistics and may result in a request
 * to change the domain's power state. A failure to change the state is
 * ignored. Events other than the three handled below change nothing.
 */
void
cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
			    cpupm_util_event_t event)
{
	cpupm_state_t	*new_state = NULL;
	hrtime_t	last;

	if (cpupm_policy == CPUPM_POLICY_DISABLED) {
		return;
	}

	/*
	 * What follows is a simple elastic power state management policy.
	 *
	 * If the utilization has become non-zero, and the domain was
	 * previously at its lowest power state, then transition it
	 * to the highest state in the spirit of "race to idle".
	 *
	 * If the utilization has dropped to zero, then transition the
	 * domain to its lowest power state.
	 *
	 * Statistics are maintained to implement a governor to reduce state
	 * transitions resulting from either transient work, or periods of
	 * transient idleness on the domain.
	 */
	switch (event) {
	case CPUPM_DOM_REMAIN_BUSY:

		/*
		 * We've received an event that the domain is running a thread
		 * that's made it to the end of its time slice. If we are at
		 * low power, then raise it. If the transient work governor
		 * is engaged, then remove it.
		 */
		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
				dom->cpd_governor = CPUPM_GOV_DISENGAGED;
				dom->cpd_tw = 0;
			}
		}
		break;

	case CPUPM_DOM_BUSY_FROM_IDLE:
		/*
		 * "last" is when the domain last went idle, so (now - last)
		 * below is the length of the idle period that just ended.
		 */
		last = dom->cpd_last_lower;
		dom->cpd_last_raise = now;

		DTRACE_PROBE3(cpupm__raise__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * There's non-zero utilization, and the domain is
			 * running in the lower power state. Before we
			 * consider raising power, check if the preceding
			 * idle period was transient in duration.
			 *
			 * If the domain is already transient work governed,
			 * then we don't bother maintaining transient idle
			 * statistics, as the presence of enough transient work
			 * can also make the domain frequently transiently idle.
			 * In this case, we still want to remain transient work
			 * governed.
			 */
			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
				if ((now - last) < cpupm_ti_predict_interval) {
					/*
					 * We're raising the domain power and
					 * we *just* lowered it. Consider
					 * this a mispredicted power state
					 * transition due to a transient
					 * idle period.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_thresh) {
						/*
						 * There's enough transient
						 * idle transitions to
						 * justify governing future
						 * lowering requests.
						 */
						dom->cpd_governor =
						    CPUPM_GOV_TRANS_IDLE;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted the last
					 * lowering.
					 */
					dom->cpd_ti = 0;
				}
			}
			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
				/*
				 * Raise requests are governed due to
				 * transient work.
				 */
				DTRACE_PROBE1(cpupm__raise__governed,
				    cpupm_domain_t *, dom);

				return;
			}
			/*
			 * Prepare to transition to the higher power state
			 */
			new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * Utilization is non-zero, and we're already running
			 * in the higher power state. Take this opportunity to
			 * perform some book keeping if the last lowering
			 * request was governed.
			 */
			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {

				if ((now - last) >= cpupm_ti_predict_interval) {
					/*
					 * The domain is transient idle
					 * governed, and we mispredicted
					 * governing the last lowering request.
					 */
					if (++dom->cpd_ti >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There's enough non-transient
						 * idle periods to justify
						 * removing the governor.
						 */
						dom->cpd_governor =
						    CPUPM_GOV_DISENGAGED;
						dom->cpd_ti = 0;
						DTRACE_PROBE1(
						    cpupm__ti__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * Correctly predicted governing the
					 * last lowering request.
					 */
					dom->cpd_ti = 0;
				}
			}
		}
		break;

	case CPUPM_DOM_IDLE_FROM_BUSY:
		/*
		 * "last" is when the domain last became busy, so
		 * (now - last) below is the length of the busy period
		 * that just ended.
		 */
		last = dom->cpd_last_raise;
		dom->cpd_last_lower = now;

		DTRACE_PROBE3(cpupm__lower__req,
		    cpupm_domain_t *, dom,
		    hrtime_t, last,
		    hrtime_t, now);

		if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {

			/*
			 * The domain is idle, and is running in the highest
			 * performance state. Before we consider lowering power,
			 * perform some book keeping for the transient work
			 * governor.
			 */
			if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
				if ((now - last) < cpupm_tw_predict_interval) {
					/*
					 * We're lowering the domain power and
					 * we *just* raised it. Consider the
					 * last raise mispredicted due to
					 * transient work.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_thresh) {
						/*
						 * There's enough transient work
						 * transitions to justify
						 * governing future raise
						 * requests.
						 */
						dom->cpd_governor =
						    CPUPM_GOV_TRANS_WORK;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__governed,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted during the
					 * last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
			if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
				/*
				 * Lowering requests are governed due to
				 * transient idleness.
				 */
				DTRACE_PROBE1(cpupm__lowering__governed,
				    cpupm_domain_t *, dom);

				return;
			}

			/*
			 * Prepare to transition to a lower power state.
			 */
			new_state =
			    dom->cpd_named_states[CPUPM_STATE_LOW_POWER];

		} else if (dom->cpd_state ==
		    dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {

			/*
			 * The domain is idle, and we're already running in
			 * the lower power state. Take this opportunity to
			 * perform some book keeping if the last raising
			 * request was governed.
			 */
			if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
				if ((now - last) >= cpupm_tw_predict_interval) {
					/*
					 * The domain is transient work
					 * governed, and we mispredicted
					 * governing the last raising request.
					 */
					if (++dom->cpd_tw >=
					    cpupm_mispredict_gov_thresh) {
						/*
						 * There's enough non-transient
						 * work to justify removing
						 * the governor.
						 */
						dom->cpd_governor =
						    CPUPM_GOV_DISENGAGED;
						dom->cpd_tw = 0;
						DTRACE_PROBE1(
						    cpupm__tw__ungoverned,
						    cpupm_domain_t *, dom);
					}
				} else {
					/*
					 * We correctly predicted governing
					 * the last raise.
					 */
					dom->cpd_tw = 0;
				}
			}
		}
		break;
	}
	/*
	 * Change the power state, if one of the cases above selected a new
	 * one. Not much currently done if this doesn't succeed.
	 */
	if (new_state)
		(void) cpupm_change_state(cp, dom, new_state);
}
    664 
    665 
    666 /*
    667  * Interface called by platforms to dynamically change the
    668  * MAX performance cpupm state
    669  */
    670 void
    671 cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
    672 {
    673 	cpupm_domain_t	*dom;
    674 	id_t		did;
    675 	cpupm_dtype_t	type = CPUPM_DTYPE_ACTIVE;
    676 	boolean_t	change_state = B_FALSE;
    677 	cpupm_state_t	*new_state = NULL;
    678 
    679 	did = cpupm_domain_id(cp, type);
    680 	mutex_enter(&cpu_lock);
    681 	dom = cpupm_domain_find(did, type);
    682 	mutex_exit(&cpu_lock);
    683 
    684 	/*
    685 	 * Can use a lock to avoid changing the power state of the cpu when
    686 	 * CPUPM_STATE_MAX_PERF is getting changed.
    687 	 * Since the occurance of events to change MAX_PERF is not frequent,
    688 	 * it may not be a good idea to overburden with locks. In the worst
    689 	 * case, for one cycle the power may not get changed to the required
    690 	 * level
    691 	 */
    692 	if (dom != NULL) {
    693 		if (dom->cpd_state ==
    694 		    dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
    695 			change_state = B_TRUE;
    696 		}
    697 
    698 		/*
    699 		 * If an out of range level is passed, use the lowest supported
    700 		 * speed.
    701 		 */
    702 		if (max_perf_level >= dom->cpd_nstates &&
    703 		    dom->cpd_nstates > 1) {
    704 			max_perf_level = dom->cpd_nstates - 1;
    705 		}
    706 
    707 		dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
    708 		    &dom->cpd_states[max_perf_level];
    709 
    710 		/*
    711 		 * If the current state is MAX_PERF, change the current state
    712 		 * to the new MAX_PERF
    713 		 */
    714 		if (change_state) {
    715 			new_state =
    716 			    dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
    717 			if (new_state) {
    718 				(void) cpupm_change_state(cp, dom, new_state);
    719 			}
    720 		}
    721 	}
    722 }
    723 
    724 /*
    725  * Initialize the parameters for the transience governor state machine
    726  */
    727 static void
    728 cpupm_governor_initialize(void)
    729 {
    730 	/*
    731 	 * The default prediction intervals are specified in nanoseconds.
    732 	 * Convert these to the equivalent in unscaled hrtime, which is the
    733 	 * format of the timestamps passed to cpupm_utilization_event()
    734 	 */
    735 	cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval);
    736 	cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval);
    737 }
    738 
    739 /*
    740  * Initiate a state change in all CPUPM domain instances of the specified type
    741  */
    742 static void
    743 cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
    744 {
    745 	cpu_t		*cp;
    746 	pg_cmt_t	*pwr_pg;
    747 	cpupm_domain_t	*dom;
    748 	group_t		*hwset;
    749 	group_iter_t	giter;
    750 	pg_cpu_itr_t	cpu_iter;
    751 	pghw_type_t	hw;
    752 
    753 	ASSERT(MUTEX_HELD(&cpu_lock));
    754 
    755 	switch (type) {
    756 	case CPUPM_DTYPE_ACTIVE:
    757 		hw = PGHW_POW_ACTIVE;
    758 		break;
    759 	default:
    760 		/*
    761 		 * Power domain types other than "active" unsupported.
    762 		 */
    763 		ASSERT(type == CPUPM_DTYPE_ACTIVE);
    764 		return;
    765 	}
    766 
    767 	if ((hwset = pghw_set_lookup(hw)) == NULL)
    768 		return;
    769 
    770 	/*
    771 	 * Iterate over the power domains
    772 	 */
    773 	group_iter_init(&giter);
    774 	while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
    775 
    776 		dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
    777 
    778 		/*
    779 		 * Iterate over the CPUs in each domain
    780 		 */
    781 		PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
    782 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
    783 			(void) cpupm_change_state(cp, dom,
    784 			    dom->cpd_named_states[state]);
    785 		}
    786 	}
    787 }
    788