Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright (c) 2009, Intel Corporation.
     23  * All rights reserved.
     24  */
     25 
     26 /*
     27  * Introduction
     28  * This file implements a CPU event notification mechanism to signal clients
     29  * which are interested in CPU related events.
     30  * Currently it only supports CPU idle state change events which will be
     31  * triggered just before CPU entering hardware idle state and just after CPU
     32  * wakes up from hardware idle state.
 * Please refer to PSARC/2009/115 for detailed information.
     34  *
     35  * Lock Strategy
     36  * 1) cpu_idle_prop_busy/free are protected by cpu_idle_prop_lock.
     37  * 2) No protection for cpu_idle_cb_state because it's per-CPU data.
     38  * 3) cpu_idle_cb_busy is protected by cpu_idle_cb_lock.
     39  * 4) cpu_idle_cb_array is protected by pause_cpus/start_cpus logic.
     40  * 5) cpu_idle_cb_max/curr are protected by both cpu_idle_cb_lock and
     41  *    pause_cpus/start_cpus logic.
     42  * We have optimized the algorithm for hot path on read side access.
     43  * In the current algorithm, it's lock free on read side access.
     44  * On write side, we use pause_cpus() to keep other CPUs in the pause thread,
     45  * which will guarantee that no other threads will access
     46  * cpu_idle_cb_max/curr/array data structure.
     47  */
     48 
     49 #include <sys/types.h>
     50 #include <sys/cmn_err.h>
     51 #include <sys/cpuvar.h>
     52 #include <sys/cpu.h>
     53 #include <sys/kmem.h>
     54 #include <sys/machcpuvar.h>
     55 #include <sys/sdt.h>
     56 #include <sys/sysmacros.h>
     57 #include <sys/synch.h>
     58 #include <sys/systm.h>
     59 #include <sys/sunddi.h>
     60 #if defined(__sparc)
     61 #include <sys/machsystm.h>
     62 #elif defined(__x86)
     63 #include <sys/archsystm.h>
     64 #endif
     65 #include <sys/cpu_event.h>
     66 
/*
 * Define the normal (non-idle) state value for a CPU on different platforms.
 * This is the value stored into the idle-state property by
 * cpu_idle_exit_state() and reported by cpu_idle_dtrace_exit().
 */
#if defined(__x86)
/* On x86 the normal state is C0. */
#define	CPU_IDLE_STATE_NORMAL		IDLE_STATE_C0
#elif defined(__sparc)
/*
 * At the time of this implementation IDLE_STATE_NORMAL is defined
 * in mach_startup.c, and not in a header file.  So if we find it is
 * undefined, then we set it to the value as defined in mach_startup.c.
 * Should it eventually be defined in a header, we will pick it up.
 */
#ifndef	IDLE_STATE_NORMAL
#define	IDLE_STATE_NORMAL	0
#endif
#define	CPU_IDLE_STATE_NORMAL	IDLE_STATE_NORMAL
#endif
     82 
/*
 * To improve cache efficiency and avoid cache false sharing, CPU idle
 * properties are grouped into cache lines as below:
 * |     CPU0      |     CPU1      |.........|     CPUn      |
 * | cache line 0  | cache line 1  |.........| cache line n  |
 * | v0 | ... | vm | v0 | ... | vm |.........| v0 | ... | vm |
 * To access the value of property m for CPU n, use the following index:
 *    index = seq_id_of_CPUn * CPU_IDLE_VALUE_GROUP_SIZE + m.
 * CPU_IDLE_VALUE_GROUP_SIZE is the number of property values that fit into
 * CPU_CACHE_COHERENCE_SIZE bytes.
 */
#define	CPU_IDLE_VALUE_GROUP_SIZE	\
	(CPU_CACHE_COHERENCE_SIZE / sizeof (cpu_idle_prop_value_t))

/*
 * Get callback context handle for a CPU. The context is simply the CPU's
 * sequential id (cpu_seqid) carried in a handle-typed value.
 */
#define	CPU_IDLE_GET_CTX(cp)		\
	((cpu_idle_callback_context_t)(intptr_t)((cp)->cpu_seqid))

/* Get CPU sequential id from ctx (inverse of CPU_IDLE_GET_CTX). */
#define	CPU_IDLE_CTX2CPUID(ctx)		((processorid_t)(intptr_t)(ctx))

/*
 * Compute index from callback context handle: the offset of the first value
 * slot belonging to the CPU identified by ctx.
 */
#define	CPU_IDLE_CTX2IDX(ctx)		\
	(((int)(intptr_t)(ctx)) * CPU_IDLE_VALUE_GROUP_SIZE)

/*
 * Get a pointer to value slot idx of the property referred to by hdl.
 * Each property's value pointer is already offset by the property's position
 * within the group (see cpu_idle_prop_allocate_impl()), so idx is normally
 * the result of CPU_IDLE_CTX2IDX().
 */
#define	CPU_IDLE_HDL2VALP(hdl, idx)	\
	(&((cpu_idle_prop_impl_t *)(hdl))->value[(idx)])

/*
 * When cpu_idle_cb_array is NULL or full, grow it by
 * CPU_IDLE_ARRAY_CAPACITY_INC entries each time. Linear growth is preferred
 * here over exponential growth.
 */
#define	CPU_IDLE_ARRAY_CAPACITY_INC	0x10
    114 
/*
 * Internal representation of a CPU idle property. A property handle handed
 * out to clients is a pointer to this structure. Each structure lives on
 * either the cpu_idle_prop_busy list (in use) or the cpu_idle_prop_free list.
 */
typedef struct cpu_idle_prop_impl {
	/* Base of this property's value slots; indexed by CPU_IDLE_CTX2IDX. */
	cpu_idle_prop_value_t		*value;
	/* Next entry on the busy or free list. */
	struct cpu_idle_prop_impl	*next;
	/* Private copy of the property name (strdup'ed at creation). */
	char				*name;
	/* Optional callback to refresh the value; may be NULL. */
	cpu_idle_prop_update_t		update;
	/* Argument supplied at creation time for the update callback. */
	void				*private;
	/* Data type of the property value. */
	cpu_idle_prop_type_t		type;
	/* 1 for the creator plus one per handle from create_handle. */
	uint32_t			refcnt;
} cpu_idle_prop_impl_t;
    124 
/*
 * Description of a built-in property. cpu_event_init() passes type, name,
 * update and arg to cpu_idle_prop_create_property() and stores the resulting
 * handle back into the item.
 */
typedef struct cpu_idle_prop_item {
	cpu_idle_prop_type_t		type;	/* value data type */
	char				*name;	/* property name */
	cpu_idle_prop_update_t		update;	/* update callback or NULL */
	void				*arg;	/* argument for update */
	cpu_idle_prop_handle_t		handle;	/* filled in at init time */
} cpu_idle_prop_item_t;
    132 
/*
 * Structure to maintain registered callbacks in a list (cpu_idle_cb_busy).
 * A callback handle handed out to clients is a pointer to this structure.
 */
typedef struct cpu_idle_cb_impl {
	/* Next registered callback on cpu_idle_cb_busy. */
	struct cpu_idle_cb_impl		*next;
	/* Caller-supplied callback descriptor (not copied). */
	cpu_idle_callback_t		*callback;
	/* Caller-supplied argument passed to the callback functions. */
	void				*argument;
	/* Priority given at registration; orders cpu_idle_cb_array. */
	int				priority;
} cpu_idle_cb_impl_t;
    140 
/*
 * Structure to maintain registered callbacks in priority order and also
 * optimized for cache efficiency for reading access.
 * Entries of cpu_idle_cb_array have this type; the function pointers and
 * argument are copied out of the registration record so the hot path does
 * not have to follow the impl pointer.
 */
typedef struct cpu_idle_cb_item {
	cpu_idle_enter_cbfn_t		enter;	/* idle_enter, may be NULL */
	cpu_idle_exit_cbfn_t		exit;	/* idle_exit, may be NULL */
	void				*arg;	/* argument for enter/exit */
	cpu_idle_cb_impl_t		*impl;	/* owning registration record */
} cpu_idle_cb_item_t;
    151 
/*
 * Per-CPU state, padded to a multiple of CPU_CACHE_COHERENCE_SIZE to avoid
 * false sharing between CPUs. The array of these is indexed by cpu_seqid.
 */
typedef union cpu_idle_cb_state {
	struct {
		/*
		 * Number of callbacks entered so far on this CPU by
		 * cpu_idle_enter(); reset to 0 by cpu_idle_exit().
		 * Non-zero means an idle enter/exit sequence is in progress.
		 */
		int			index;
		/*
		 * Set by the first cpu_idle_enter_state() call; callbacks
		 * are skipped on that first call.
		 */
		boolean_t		ready;
		/* Cached pointers to this CPU's built-in property values. */
		cpu_idle_prop_value_t	*idle_state;
		cpu_idle_prop_value_t	*enter_ts;
		cpu_idle_prop_value_t	*exit_ts;
		cpu_idle_prop_value_t	*last_idle;
		cpu_idle_prop_value_t	*last_busy;
		cpu_idle_prop_value_t	*total_idle;
		cpu_idle_prop_value_t	*total_busy;
		cpu_idle_prop_value_t	*intr_cnt;
	} v;
	/* Padding only; never accessed. */
#ifdef _LP64
	char				align[2 * CPU_CACHE_COHERENCE_SIZE];
#else
	char				align[CPU_CACHE_COHERENCE_SIZE];
#endif
} cpu_idle_cb_state_t;
    172 
/* Property lists: in-use (busy) and available (free) property structures. */
static kmutex_t				cpu_idle_prop_lock;
static cpu_idle_prop_impl_t		*cpu_idle_prop_busy = NULL;
static cpu_idle_prop_impl_t		*cpu_idle_prop_free = NULL;

/*
 * Registered callbacks: cpu_idle_cb_busy is the list of registration records,
 * cpu_idle_cb_array holds the same callbacks in priority order for the hot
 * path, cpu_idle_cb_curr is the number of valid entries in the array and
 * cpu_idle_cb_max is the array's capacity.
 */
static kmutex_t				cpu_idle_cb_lock;
static cpu_idle_cb_impl_t		*cpu_idle_cb_busy = NULL;
static cpu_idle_cb_item_t		*cpu_idle_cb_array = NULL;
static int				cpu_idle_cb_curr = 0;
static int				cpu_idle_cb_max = 0;

/* Per-CPU state array, allocated in cpu_event_init(); indexed by cpu_seqid. */
static cpu_idle_cb_state_t		*cpu_idle_cb_state;
    184 
static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum,
    cpu_idle_prop_value_t *valp);

/*
 * Built-in properties created by cpu_event_init().
 * Initializer order is { type, name, update, arg, handle }.
 * The position of each entry must match the corresponding
 * CPU_IDLE_PROP_IDX_* constant, which is used to index this array.
 */
static cpu_idle_prop_item_t cpu_idle_prop_array[] = {
	{
	    CPU_IDLE_PROP_TYPE_INTPTR, CPU_IDLE_PROP_IDLE_STATE,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_ENTER_TIMESTAMP,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_EXIT_TIMESTAMP,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_IDLE_TIME,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_LAST_BUSY_TIME,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_IDLE_TIME,
	    NULL, NULL, NULL
	},
	{
	    CPU_IDLE_PROP_TYPE_HRTIME, CPU_IDLE_PROP_TOTAL_BUSY_TIME,
	    NULL, NULL, NULL
	},
	{
	    /* The only built-in property with an update callback. */
	    CPU_IDLE_PROP_TYPE_UINT64, CPU_IDLE_PROP_INTERRUPT_COUNT,
	    cpu_idle_prop_update_intr_cnt, NULL, NULL
	},
};
    222 
/*
 * Indices into cpu_idle_prop_array for the built-in properties.
 * These must be kept in sync with the order of the array's initializers.
 */
#define	CPU_IDLE_PROP_IDX_IDLE_STATE	0
#define	CPU_IDLE_PROP_IDX_ENTER_TS	1
#define	CPU_IDLE_PROP_IDX_EXIT_TS	2
#define	CPU_IDLE_PROP_IDX_LAST_IDLE	3
#define	CPU_IDLE_PROP_IDX_LAST_BUSY	4
#define	CPU_IDLE_PROP_IDX_TOTAL_IDLE	5
#define	CPU_IDLE_PROP_IDX_TOTAL_BUSY	6
#define	CPU_IDLE_PROP_IDX_INTR_CNT	7
    231 
    232 /*ARGSUSED*/
    233 static void
    234 cpu_idle_dtrace_enter(void *arg, cpu_idle_callback_context_t ctx,
    235     cpu_idle_check_wakeup_t check_func, void *check_arg)
    236 {
    237 	int state;
    238 
    239 	state = cpu_idle_prop_get_intptr(
    240 	    cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle, ctx);
    241 	DTRACE_PROBE1(idle__state__transition, uint_t, state);
    242 }
    243 
/*
 * Built-in idle_exit callback: fire the idle-state-transition DTrace probe
 * reporting that the CPU is back in the normal state. All arguments are
 * unused.
 */
/*ARGSUSED*/
static void
cpu_idle_dtrace_exit(void *arg, cpu_idle_callback_context_t ctx, int flag)
{
	DTRACE_PROBE1(idle__state__transition, uint_t, CPU_IDLE_STATE_NORMAL);
}
    250 
/*
 * Handle and descriptor of the built-in DTrace callback, registered by
 * cpu_event_init() with priority CPU_IDLE_CB_PRIO_DTRACE.
 */
static cpu_idle_callback_handle_t cpu_idle_cb_handle_dtrace;
static cpu_idle_callback_t cpu_idle_callback_dtrace = {
	CPU_IDLE_CALLBACK_VERS,
	cpu_idle_dtrace_enter,
	cpu_idle_dtrace_exit,
};
    257 
#if defined(__x86) && !defined(__xpv)
extern void tlb_going_idle(void);
extern void tlb_service(void);

/*
 * Handle and descriptor of the built-in TLB callback (x86 only, not under
 * xpv), registered by cpu_event_init() with priority CPU_IDLE_CB_PRIO_TLB.
 * NOTE(review): both functions are declared as taking no arguments and are
 * cast to the enter/exit callback types, so the arguments passed by
 * cpu_idle_enter()/cpu_idle_exit() are presumably ignored -- confirm this
 * is safe for the target calling convention.
 */
static cpu_idle_callback_handle_t cpu_idle_cb_handle_tlb;
static cpu_idle_callback_t cpu_idle_callback_tlb = {
	CPU_IDLE_CALLBACK_VERS,
	(cpu_idle_enter_cbfn_t)tlb_going_idle,
	(cpu_idle_exit_cbfn_t)tlb_service,
};
#endif
    269 
    270 void
    271 cpu_event_init(void)
    272 {
    273 	int i, idx;
    274 	size_t sz;
    275 	intptr_t buf;
    276 	cpu_idle_cb_state_t *sp;
    277 	cpu_idle_prop_item_t *ip;
    278 
    279 	mutex_init(&cpu_idle_cb_lock, NULL, MUTEX_DRIVER, NULL);
    280 	mutex_init(&cpu_idle_prop_lock, NULL, MUTEX_DRIVER, NULL);
    281 
    282 	/* Create internal properties. */
    283 	for (i = 0, ip = cpu_idle_prop_array;
    284 	    i < sizeof (cpu_idle_prop_array) / sizeof (cpu_idle_prop_array[0]);
    285 	    i++, ip++) {
    286 		(void) cpu_idle_prop_create_property(ip->name, ip->type,
    287 		    ip->update, ip->arg, &ip->handle);
    288 		ASSERT(ip->handle != NULL);
    289 	}
    290 
    291 	/* Allocate buffer and align to CPU_CACHE_COHERENCE_SIZE. */
    292 	sz = sizeof (cpu_idle_cb_state_t) * max_ncpus;
    293 	sz += CPU_CACHE_COHERENCE_SIZE;
    294 	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
    295 	cpu_idle_cb_state = (cpu_idle_cb_state_t *)P2ROUNDUP(buf,
    296 	    CPU_CACHE_COHERENCE_SIZE);
    297 
    298 	/* Cache frequently used property value pointers. */
    299 	for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) {
    300 		idx = CPU_IDLE_CTX2IDX(i);
    301 #define	___INIT_P(f, i)	\
    302 	sp->v.f = CPU_IDLE_HDL2VALP(cpu_idle_prop_array[(i)].handle, idx)
    303 		___INIT_P(idle_state, CPU_IDLE_PROP_IDX_IDLE_STATE);
    304 		___INIT_P(enter_ts, CPU_IDLE_PROP_IDX_ENTER_TS);
    305 		___INIT_P(exit_ts, CPU_IDLE_PROP_IDX_EXIT_TS);
    306 		___INIT_P(last_idle, CPU_IDLE_PROP_IDX_LAST_IDLE);
    307 		___INIT_P(last_busy, CPU_IDLE_PROP_IDX_LAST_BUSY);
    308 		___INIT_P(total_idle, CPU_IDLE_PROP_IDX_TOTAL_IDLE);
    309 		___INIT_P(total_busy, CPU_IDLE_PROP_IDX_TOTAL_BUSY);
    310 		___INIT_P(last_idle, CPU_IDLE_PROP_IDX_INTR_CNT);
    311 #undef	___INIT_P
    312 	}
    313 
    314 	/* Register built-in callbacks. */
    315 	if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_DTRACE,
    316 	    &cpu_idle_callback_dtrace, NULL, &cpu_idle_cb_handle_dtrace) != 0) {
    317 		cmn_err(CE_PANIC,
    318 		    "cpu_idle: failed to register callback for dtrace.");
    319 	}
    320 #if defined(__x86) && !defined(__xpv)
    321 	if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_TLB,
    322 	    &cpu_idle_callback_tlb, NULL, &cpu_idle_cb_handle_tlb) != 0) {
    323 		cmn_err(CE_PANIC,
    324 		    "cpu_idle: failed to register callback for tlb_flush.");
    325 	}
    326 #endif
    327 }
    328 
    329 void
    330 cpu_event_init_cpu(cpu_t *cp)
    331 {
    332 	ASSERT(cp->cpu_seqid < max_ncpus);
    333 	cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE;
    334 }
    335 
    336 void
    337 cpu_event_fini_cpu(cpu_t *cp)
    338 {
    339 	ASSERT(cp->cpu_seqid < max_ncpus);
    340 	cpu_idle_cb_state[cp->cpu_seqid].v.ready = B_FALSE;
    341 }
    342 
    343 static void
    344 cpu_idle_insert_callback(cpu_idle_cb_impl_t *cip)
    345 {
    346 	int unlock = 0, unpause = 0;
    347 	int i, cnt_new = 0, cnt_old = 0;
    348 	char *buf_new = NULL, *buf_old = NULL;
    349 
    350 	ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));
    351 
    352 	/*
    353 	 * Expand array if it's full.
    354 	 * Memory must be allocated out of pause/start_cpus() scope because
    355 	 * kmem_zalloc() can't be called with KM_SLEEP flag within that scope.
    356 	 */
    357 	if (cpu_idle_cb_curr == cpu_idle_cb_max) {
    358 		cnt_new = cpu_idle_cb_max + CPU_IDLE_ARRAY_CAPACITY_INC;
    359 		buf_new = (char *)kmem_zalloc(cnt_new *
    360 		    sizeof (cpu_idle_cb_item_t), KM_SLEEP);
    361 	}
    362 
    363 	/* Try to acquire cpu_lock if not held yet. */
    364 	if (!MUTEX_HELD(&cpu_lock)) {
    365 		mutex_enter(&cpu_lock);
    366 		unlock = 1;
    367 	}
    368 	/*
    369 	 * Pause all other CPUs (and let them run pause thread).
    370 	 * It's guaranteed that no other threads will access cpu_idle_cb_array
    371 	 * after pause_cpus().
    372 	 */
    373 	if (!cpus_paused()) {
    374 		pause_cpus(NULL);
    375 		unpause = 1;
    376 	}
    377 
    378 	/* Copy content to new buffer if needed. */
    379 	if (buf_new != NULL) {
    380 		buf_old = (char *)cpu_idle_cb_array;
    381 		cnt_old = cpu_idle_cb_max;
    382 		if (buf_old != NULL) {
    383 			ASSERT(cnt_old != 0);
    384 			bcopy(cpu_idle_cb_array, buf_new,
    385 			    sizeof (cpu_idle_cb_item_t) * cnt_old);
    386 		}
    387 		cpu_idle_cb_array = (cpu_idle_cb_item_t *)buf_new;
    388 		cpu_idle_cb_max = cnt_new;
    389 	}
    390 
    391 	/* Insert into array according to priority. */
    392 	ASSERT(cpu_idle_cb_curr < cpu_idle_cb_max);
    393 	for (i = cpu_idle_cb_curr; i > 0; i--) {
    394 		if (cpu_idle_cb_array[i - 1].impl->priority >= cip->priority) {
    395 			break;
    396 		}
    397 		cpu_idle_cb_array[i] = cpu_idle_cb_array[i - 1];
    398 	}
    399 	cpu_idle_cb_array[i].arg = cip->argument;
    400 	cpu_idle_cb_array[i].enter = cip->callback->idle_enter;
    401 	cpu_idle_cb_array[i].exit = cip->callback->idle_exit;
    402 	cpu_idle_cb_array[i].impl = cip;
    403 	cpu_idle_cb_curr++;
    404 
    405 	/* Resume other CPUs from paused state if needed. */
    406 	if (unpause) {
    407 		start_cpus();
    408 	}
    409 	if (unlock) {
    410 		mutex_exit(&cpu_lock);
    411 	}
    412 
    413 	/* Free old resource if needed. */
    414 	if (buf_old != NULL) {
    415 		ASSERT(cnt_old != 0);
    416 		kmem_free(buf_old, cnt_old * sizeof (cpu_idle_cb_item_t));
    417 	}
    418 }
    419 
    420 static void
    421 cpu_idle_remove_callback(cpu_idle_cb_impl_t *cip)
    422 {
    423 	int i, found = 0;
    424 	int unlock = 0, unpause = 0;
    425 	cpu_idle_cb_state_t *sp;
    426 
    427 	ASSERT(MUTEX_HELD(&cpu_idle_cb_lock));
    428 
    429 	/* Try to acquire cpu_lock if not held yet. */
    430 	if (!MUTEX_HELD(&cpu_lock)) {
    431 		mutex_enter(&cpu_lock);
    432 		unlock = 1;
    433 	}
    434 	/*
    435 	 * Pause all other CPUs.
    436 	 * It's guaranteed that no other threads will access cpu_idle_cb_array
    437 	 * after pause_cpus().
    438 	 */
    439 	if (!cpus_paused()) {
    440 		pause_cpus(NULL);
    441 		unpause = 1;
    442 	}
    443 
    444 	/* Remove cip from array. */
    445 	for (i = 0; i < cpu_idle_cb_curr; i++) {
    446 		if (found == 0) {
    447 			if (cpu_idle_cb_array[i].impl == cip) {
    448 				found = 1;
    449 			}
    450 		} else {
    451 			cpu_idle_cb_array[i - 1] = cpu_idle_cb_array[i];
    452 		}
    453 	}
    454 	ASSERT(found != 0);
    455 	cpu_idle_cb_curr--;
    456 
    457 	/*
    458 	 * Reset property ready flag for all CPUs if no registered callback
    459 	 * left because cpu_idle_enter/exit will stop updating property if
    460 	 * there's no callback registered.
    461 	 */
    462 	if (cpu_idle_cb_curr == 0) {
    463 		for (sp = cpu_idle_cb_state, i = 0; i < max_ncpus; i++, sp++) {
    464 			sp->v.ready = B_FALSE;
    465 		}
    466 	}
    467 
    468 	/* Resume other CPUs from paused state if needed. */
    469 	if (unpause) {
    470 		start_cpus();
    471 	}
    472 	if (unlock) {
    473 		mutex_exit(&cpu_lock);
    474 	}
    475 }
    476 
    477 int
    478 cpu_idle_register_callback(uint_t prio, cpu_idle_callback_t *cbp,
    479     void *arg, cpu_idle_callback_handle_t *hdlp)
    480 {
    481 	cpu_idle_cb_state_t *sp;
    482 	cpu_idle_cb_impl_t *cip = NULL;
    483 
    484 	/* First validate parameters. */
    485 	ASSERT(!CPU_ON_INTR(CPU));
    486 	ASSERT(CPU->cpu_seqid < max_ncpus);
    487 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
    488 	if (sp->v.index != 0) {
    489 		cmn_err(CE_NOTE,
    490 		    "!cpu_event: register_callback called from callback.");
    491 		return (EBUSY);
    492 	} else if (cbp == NULL || hdlp == NULL) {
    493 		cmn_err(CE_NOTE,
    494 		    "!cpu_event: NULL parameters in register_callback.");
    495 		return (EINVAL);
    496 	} else if (prio < CPU_IDLE_CB_PRIO_LOW_BASE ||
    497 	    prio >= CPU_IDLE_CB_PRIO_RESV_BASE) {
    498 		cmn_err(CE_NOTE,
    499 		    "!cpu_event: priority 0x%x out of range.", prio);
    500 		return (EINVAL);
    501 	} else if (cbp->version != CPU_IDLE_CALLBACK_VERS) {
    502 		cmn_err(CE_NOTE,
    503 		    "!cpu_event: callback version %d is not supported.",
    504 		    cbp->version);
    505 		return (EINVAL);
    506 	}
    507 
    508 	mutex_enter(&cpu_idle_cb_lock);
    509 	/* Check whether callback with priority exists if not dynamic. */
    510 	if (prio != CPU_IDLE_CB_PRIO_DYNAMIC) {
    511 		for (cip = cpu_idle_cb_busy; cip != NULL;
    512 		    cip = cip->next) {
    513 			if (cip->priority == prio) {
    514 				mutex_exit(&cpu_idle_cb_lock);
    515 				cmn_err(CE_NOTE, "!cpu_event: callback with "
    516 				    "priority 0x%x already exists.", prio);
    517 				return (EEXIST);
    518 			}
    519 		}
    520 	}
    521 
    522 	cip = kmem_zalloc(sizeof (*cip), KM_SLEEP);
    523 	cip->callback = cbp;
    524 	cip->argument = arg;
    525 	cip->priority = prio;
    526 	cip->next = cpu_idle_cb_busy;
    527 	cpu_idle_cb_busy = cip;
    528 	cpu_idle_insert_callback(cip);
    529 	mutex_exit(&cpu_idle_cb_lock);
    530 
    531 	*hdlp = (cpu_idle_callback_handle_t)cip;
    532 
    533 	return (0);
    534 }
    535 
    536 int
    537 cpu_idle_unregister_callback(cpu_idle_callback_handle_t hdl)
    538 {
    539 	int rc = ENODEV;
    540 	cpu_idle_cb_state_t *sp;
    541 	cpu_idle_cb_impl_t *ip, **ipp;
    542 
    543 	ASSERT(!CPU_ON_INTR(CPU));
    544 	ASSERT(CPU->cpu_seqid < max_ncpus);
    545 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
    546 	if (sp->v.index != 0) {
    547 		cmn_err(CE_NOTE,
    548 		    "!cpu_event: unregister_callback called from callback.");
    549 		return (EBUSY);
    550 	} else if (hdl == NULL) {
    551 		cmn_err(CE_NOTE,
    552 		    "!cpu_event: hdl is NULL in unregister_callback.");
    553 		return (EINVAL);
    554 	}
    555 
    556 	ip = (cpu_idle_cb_impl_t *)hdl;
    557 	mutex_enter(&cpu_idle_cb_lock);
    558 	for (ipp = &cpu_idle_cb_busy; *ipp != NULL; ipp = &(*ipp)->next) {
    559 		if (*ipp == ip) {
    560 			*ipp = ip->next;
    561 			cpu_idle_remove_callback(ip);
    562 			rc = 0;
    563 			break;
    564 		}
    565 	}
    566 	mutex_exit(&cpu_idle_cb_lock);
    567 
    568 	if (rc == 0) {
    569 		kmem_free(ip, sizeof (*ip));
    570 	} else {
    571 		cmn_err(CE_NOTE,
    572 		    "!cpu_event: callback handle %p not found.", (void *)hdl);
    573 	}
    574 
    575 	return (rc);
    576 }
    577 
    578 static int
    579 cpu_idle_enter_state(cpu_idle_cb_state_t *sp, intptr_t state)
    580 {
    581 	sp->v.idle_state->cipv_intptr = state;
    582 	sp->v.enter_ts->cipv_hrtime = gethrtime_unscaled();
    583 	sp->v.last_busy->cipv_hrtime = sp->v.enter_ts->cipv_hrtime -
    584 	    sp->v.exit_ts->cipv_hrtime;
    585 	sp->v.total_busy->cipv_hrtime += sp->v.last_busy->cipv_hrtime;
    586 	if (sp->v.ready == B_FALSE) {
    587 		sp->v.ready = B_TRUE;
    588 		return (0);
    589 	}
    590 
    591 	return (1);
    592 }
    593 
    594 static void
    595 cpu_idle_exit_state(cpu_idle_cb_state_t *sp)
    596 {
    597 	sp->v.idle_state->cipv_intptr = CPU_IDLE_STATE_NORMAL;
    598 	sp->v.exit_ts->cipv_hrtime = gethrtime_unscaled();
    599 	sp->v.last_idle->cipv_hrtime = sp->v.exit_ts->cipv_hrtime -
    600 	    sp->v.enter_ts->cipv_hrtime;
    601 	sp->v.total_idle->cipv_hrtime += sp->v.last_idle->cipv_hrtime;
    602 }
    603 
    604 /*ARGSUSED*/
    605 int
    606 cpu_idle_enter(int state, int flag,
    607     cpu_idle_check_wakeup_t check_func, void *check_arg)
    608 {
    609 	int i;
    610 	cpu_idle_cb_item_t *cip;
    611 	cpu_idle_cb_state_t *sp;
    612 	cpu_idle_callback_context_t ctx;
    613 #if defined(__x86)
    614 	ulong_t iflags;
    615 #endif
    616 
    617 	ctx = CPU_IDLE_GET_CTX(CPU);
    618 	ASSERT(CPU->cpu_seqid < max_ncpus);
    619 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
    620 	ASSERT(sp->v.index == 0);
    621 
    622 	/*
    623 	 * On x86, cpu_idle_enter can be called from idle thread with either
    624 	 * interrupts enabled or disabled, so we need to make sure interrupts
    625 	 * are disabled here.
    626 	 * On SPARC, cpu_idle_enter will be called from idle thread with
    627 	 * interrupt disabled, so no special handling necessary.
    628 	 */
    629 #if defined(__x86)
    630 	iflags = intr_clear();
    631 #endif
    632 
    633 	/* Skip calling callback if state is not ready for current CPU. */
    634 	if (cpu_idle_enter_state(sp, state) == 0) {
    635 #if defined(__x86)
    636 		intr_restore(iflags);
    637 #endif
    638 		return (0);
    639 	}
    640 
    641 	for (i = 0, cip = cpu_idle_cb_array; i < cpu_idle_cb_curr; i++, cip++) {
    642 		/*
    643 		 * Increase index so corresponding idle_exit callback
    644 		 * will be invoked should interrupt happen during
    645 		 * idle_enter callback.
    646 		 */
    647 		sp->v.index++;
    648 
    649 		/* Call idle_enter callback function if it's not NULL. */
    650 		if (cip->enter != NULL) {
    651 			cip->enter(cip->arg, ctx, check_func, check_arg);
    652 
    653 			/*
    654 			 * cpu_idle_enter runs with interrupts
    655 			 * disabled, so the idle_enter callbacks will
    656 			 * also be called with interrupts disabled.
    657 			 * It is permissible for the callbacks to
    658 			 * enable the interrupts, if they can also
    659 			 * handle the condition if the interrupt
    660 			 * occurs.
    661 			 *
    662 			 * However, if an interrupt occurs and we
    663 			 * return here without dealing with it, we
    664 			 * return to the cpu_idle_enter() caller
    665 			 * with an EBUSY, and the caller will not
    666 			 * enter the idle state.
    667 			 *
    668 			 * We detect the interrupt, by checking the
    669 			 * index value of the state pointer.  If it
    670 			 * is not the index we incremented above,
    671 			 * then it was cleared while processing
    672 			 * the interrupt.
    673 			 *
    674 			 * Also note, that at this point of the code
    675 			 * the normal index value will be one greater
    676 			 * than the variable 'i' in the loop, as it
    677 			 * hasn't yet been incremented.
    678 			 */
    679 			if (sp->v.index != i + 1) {
    680 #if defined(__x86)
    681 				intr_restore(iflags);
    682 #endif
    683 				return (EBUSY);
    684 			}
    685 		}
    686 	}
    687 #if defined(__x86)
    688 	intr_restore(iflags);
    689 #endif
    690 
    691 	return (0);
    692 }
    693 
    694 void
    695 cpu_idle_exit(int flag)
    696 {
    697 	int i;
    698 	cpu_idle_cb_item_t *cip;
    699 	cpu_idle_cb_state_t *sp;
    700 	cpu_idle_callback_context_t ctx;
    701 #if defined(__x86)
    702 	ulong_t iflags;
    703 #endif
    704 
    705 	ASSERT(CPU->cpu_seqid < max_ncpus);
    706 	sp = &cpu_idle_cb_state[CPU->cpu_seqid];
    707 
    708 #if defined(__sparc)
    709 	/*
    710 	 * On SPARC, cpu_idle_exit will only be called from idle thread
    711 	 * with interrupt disabled.
    712 	 */
    713 
    714 	if (sp->v.index != 0) {
    715 		ctx = CPU_IDLE_GET_CTX(CPU);
    716 		cpu_idle_exit_state(sp);
    717 		for (i = sp->v.index - 1; i >= 0; i--) {
    718 			cip = &cpu_idle_cb_array[i];
    719 			if (cip->exit != NULL) {
    720 				cip->exit(cip->arg, ctx, flag);
    721 			}
    722 		}
    723 		sp->v.index = 0;
    724 	}
    725 #elif defined(__x86)
    726 	/*
    727 	 * On x86, cpu_idle_exit will be called from idle thread or interrupt
    728 	 * handler. When called from interrupt handler, interrupts will be
    729 	 * disabled. When called from idle thread, interrupts may be disabled
    730 	 * or enabled.
    731 	 */
    732 
    733 	/* Called from interrupt, interrupts are already disabled. */
    734 	if (flag & CPU_IDLE_CB_FLAG_INTR) {
    735 		/*
    736 		 * return if cpu_idle_exit already called or
    737 		 * there is no registered callback.
    738 		 */
    739 		if (sp->v.index == 0) {
    740 			return;
    741 		}
    742 		ctx = CPU_IDLE_GET_CTX(CPU);
    743 		cpu_idle_exit_state(sp);
    744 		for (i = sp->v.index - 1; i >= 0; i--) {
    745 			cip = &cpu_idle_cb_array[i];
    746 			if (cip->exit != NULL) {
    747 				cip->exit(cip->arg, ctx, flag);
    748 			}
    749 		}
    750 		sp->v.index = 0;
    751 
    752 	/* Called from idle thread, need to disable interrupt. */
    753 	} else {
    754 		iflags = intr_clear();
    755 		if (sp->v.index != 0) {
    756 			ctx = CPU_IDLE_GET_CTX(CPU);
    757 			cpu_idle_exit_state(sp);
    758 			for (i = sp->v.index - 1; i >= 0; i--) {
    759 				cip = &cpu_idle_cb_array[i];
    760 				if (cip->exit != NULL) {
    761 					cip->exit(cip->arg, ctx, flag);
    762 				}
    763 			}
    764 			sp->v.index = 0;
    765 		}
    766 		intr_restore(iflags);
    767 	}
    768 #endif
    769 }
    770 
/*
 * Return the callback context handle of the CPU the caller is currently
 * running on (derived from that CPU's cpu_seqid).
 */
cpu_idle_callback_context_t
cpu_idle_get_context(void)
{
	return (CPU_IDLE_GET_CTX(CPU));
}
    776 
    777 /*
    778  * Allocate property structure in group of CPU_IDLE_VALUE_GROUP_SIZE to improve
    779  * cache efficiency. To simplify implementation, allocated memory for property
    780  * structure won't be freed.
    781  */
    782 static void
    783 cpu_idle_prop_allocate_impl(void)
    784 {
    785 	int i;
    786 	size_t sz;
    787 	intptr_t buf;
    788 	cpu_idle_prop_impl_t *prop;
    789 	cpu_idle_prop_value_t *valp;
    790 
    791 	ASSERT(!CPU_ON_INTR(CPU));
    792 	prop = kmem_zalloc(sizeof (*prop) * CPU_IDLE_VALUE_GROUP_SIZE,
    793 	    KM_SLEEP);
    794 	sz = sizeof (*valp) * CPU_IDLE_VALUE_GROUP_SIZE * max_ncpus;
    795 	sz += CPU_CACHE_COHERENCE_SIZE;
    796 	buf = (intptr_t)kmem_zalloc(sz, KM_SLEEP);
    797 	valp = (cpu_idle_prop_value_t *)P2ROUNDUP(buf,
    798 	    CPU_CACHE_COHERENCE_SIZE);
    799 
    800 	for (i = 0; i < CPU_IDLE_VALUE_GROUP_SIZE; i++, prop++, valp++) {
    801 		prop->value = valp;
    802 		prop->next = cpu_idle_prop_free;
    803 		cpu_idle_prop_free = prop;
    804 	}
    805 }
    806 
    807 int
    808 cpu_idle_prop_create_property(const char *name, cpu_idle_prop_type_t type,
    809     cpu_idle_prop_update_t update, void *arg, cpu_idle_prop_handle_t *hdlp)
    810 {
    811 	int rc = EEXIST;
    812 	cpu_idle_prop_impl_t *prop;
    813 
    814 	ASSERT(!CPU_ON_INTR(CPU));
    815 	if (name == NULL || hdlp == NULL) {
    816 		cmn_err(CE_WARN,
    817 		    "!cpu_event: NULL parameters in create_property.");
    818 		return (EINVAL);
    819 	}
    820 
    821 	mutex_enter(&cpu_idle_prop_lock);
    822 	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
    823 		if (strcmp(prop->name, name) == 0) {
    824 			cmn_err(CE_NOTE,
    825 			    "!cpu_event: property %s already exists.", name);
    826 			break;
    827 		}
    828 	}
    829 	if (prop == NULL) {
    830 		if (cpu_idle_prop_free == NULL) {
    831 			cpu_idle_prop_allocate_impl();
    832 		}
    833 		ASSERT(cpu_idle_prop_free != NULL);
    834 		prop = cpu_idle_prop_free;
    835 		cpu_idle_prop_free = prop->next;
    836 		prop->next = cpu_idle_prop_busy;
    837 		cpu_idle_prop_busy = prop;
    838 
    839 		ASSERT(prop->value != NULL);
    840 		prop->name = strdup(name);
    841 		prop->type = type;
    842 		prop->update = update;
    843 		prop->private = arg;
    844 		prop->refcnt = 1;
    845 		*hdlp = prop;
    846 		rc = 0;
    847 	}
    848 	mutex_exit(&cpu_idle_prop_lock);
    849 
    850 	return (rc);
    851 }
    852 
    853 int
    854 cpu_idle_prop_destroy_property(cpu_idle_prop_handle_t hdl)
    855 {
    856 	int rc = ENODEV;
    857 	cpu_idle_prop_impl_t *prop, **propp;
    858 	cpu_idle_prop_value_t *valp;
    859 
    860 	ASSERT(!CPU_ON_INTR(CPU));
    861 	if (hdl == NULL) {
    862 		cmn_err(CE_WARN,
    863 		    "!cpu_event: hdl is NULL in destroy_property.");
    864 		return (EINVAL);
    865 	}
    866 
    867 	prop = (cpu_idle_prop_impl_t *)hdl;
    868 	mutex_enter(&cpu_idle_prop_lock);
    869 	for (propp = &cpu_idle_prop_busy; *propp != NULL;
    870 	    propp = &(*propp)->next) {
    871 		if (*propp == prop) {
    872 			ASSERT(prop->refcnt > 0);
    873 			if (atomic_cas_32(&prop->refcnt, 1, 0) == 1) {
    874 				*propp = prop->next;
    875 				strfree(prop->name);
    876 				valp = prop->value;
    877 				bzero(prop, sizeof (*prop));
    878 				prop->value = valp;
    879 				prop->next = cpu_idle_prop_free;
    880 				cpu_idle_prop_free = prop;
    881 				rc = 0;
    882 			} else {
    883 				rc = EBUSY;
    884 			}
    885 			break;
    886 		}
    887 	}
    888 	mutex_exit(&cpu_idle_prop_lock);
    889 
    890 	return (rc);
    891 }
    892 
    893 int
    894 cpu_idle_prop_create_handle(const char *name, cpu_idle_prop_handle_t *hdlp)
    895 {
    896 	int rc = ENODEV;
    897 	cpu_idle_prop_impl_t *prop;
    898 
    899 	ASSERT(!CPU_ON_INTR(CPU));
    900 	if (name == NULL || hdlp == NULL) {
    901 		cmn_err(CE_WARN,
    902 		    "!cpu_event: NULL parameters in create_handle.");
    903 		return (EINVAL);
    904 	}
    905 
    906 	mutex_enter(&cpu_idle_prop_lock);
    907 	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
    908 		if (strcmp(prop->name, name) == 0) {
    909 			/* Hold one refcount on object. */
    910 			ASSERT(prop->refcnt > 0);
    911 			atomic_inc_32(&prop->refcnt);
    912 			*hdlp = (cpu_idle_prop_handle_t)prop;
    913 			rc = 0;
    914 			break;
    915 		}
    916 	}
    917 	mutex_exit(&cpu_idle_prop_lock);
    918 
    919 	return (rc);
    920 }
    921 
    922 int
    923 cpu_idle_prop_destroy_handle(cpu_idle_prop_handle_t hdl)
    924 {
    925 	int rc = ENODEV;
    926 	cpu_idle_prop_impl_t *prop;
    927 
    928 	ASSERT(!CPU_ON_INTR(CPU));
    929 	if (hdl == NULL) {
    930 		cmn_err(CE_WARN,
    931 		    "!cpu_event: hdl is NULL in destroy_handle.");
    932 		return (EINVAL);
    933 	}
    934 
    935 	mutex_enter(&cpu_idle_prop_lock);
    936 	for (prop = cpu_idle_prop_busy; prop != NULL; prop = prop->next) {
    937 		if (prop == hdl) {
    938 			/* Release refcnt held in create_handle. */
    939 			ASSERT(prop->refcnt > 1);
    940 			atomic_dec_32(&prop->refcnt);
    941 			rc = 0;
    942 			break;
    943 		}
    944 	}
    945 	mutex_exit(&cpu_idle_prop_lock);
    946 
    947 	return (rc);
    948 }
    949 
    950 cpu_idle_prop_type_t
    951 cpu_idle_prop_get_type(cpu_idle_prop_handle_t hdl)
    952 {
    953 	ASSERT(hdl != NULL);
    954 	return (((cpu_idle_prop_impl_t *)hdl)->type);
    955 }
    956 
    957 const char *
    958 cpu_idle_prop_get_name(cpu_idle_prop_handle_t hdl)
    959 {
    960 	ASSERT(hdl != NULL);
    961 	return (((cpu_idle_prop_impl_t *)hdl)->name);
    962 }
    963 
    964 int
    965 cpu_idle_prop_get_value(cpu_idle_prop_handle_t hdl,
    966     cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t *valp)
    967 {
    968 	int idx, rc = 0;
    969 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
    970 
    971 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
    972 	if (hdl == NULL || valp == NULL) {
    973 		cmn_err(CE_NOTE, "!cpu_event: NULL parameters in prop_get.");
    974 		return (EINVAL);
    975 	}
    976 	idx = CPU_IDLE_CTX2IDX(ctx);
    977 	if (prop->update != NULL) {
    978 		cpu_idle_cb_state_t *sp;
    979 
    980 		ASSERT(CPU->cpu_seqid < max_ncpus);
    981 		sp = &cpu_idle_cb_state[CPU->cpu_seqid];
    982 		/* CPU's idle enter timestamp as sequence number. */
    983 		rc = prop->update(prop->private,
    984 		    (uint64_t)sp->v.enter_ts->cipv_hrtime, &prop->value[idx]);
    985 	}
    986 	if (rc == 0) {
    987 		*valp = prop->value[idx];
    988 	}
    989 
    990 	return (rc);
    991 }
    992 
    993 uint32_t
    994 cpu_idle_prop_get_uint32(cpu_idle_prop_handle_t hdl,
    995     cpu_idle_callback_context_t ctx)
    996 {
    997 	int idx;
    998 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
    999 
   1000 	ASSERT(hdl != NULL);
   1001 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
   1002 	idx = CPU_IDLE_CTX2IDX(ctx);
   1003 	return (prop->value[idx].cipv_uint32);
   1004 }
   1005 
   1006 uint64_t
   1007 cpu_idle_prop_get_uint64(cpu_idle_prop_handle_t hdl,
   1008     cpu_idle_callback_context_t ctx)
   1009 {
   1010 	int idx;
   1011 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
   1012 
   1013 	ASSERT(hdl != NULL);
   1014 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
   1015 	idx = CPU_IDLE_CTX2IDX(ctx);
   1016 	return (prop->value[idx].cipv_uint64);
   1017 }
   1018 
   1019 intptr_t
   1020 cpu_idle_prop_get_intptr(cpu_idle_prop_handle_t hdl,
   1021     cpu_idle_callback_context_t ctx)
   1022 {
   1023 	int idx;
   1024 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
   1025 
   1026 	ASSERT(hdl != NULL);
   1027 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
   1028 	idx = CPU_IDLE_CTX2IDX(ctx);
   1029 	return (prop->value[idx].cipv_intptr);
   1030 }
   1031 
   1032 hrtime_t
   1033 cpu_idle_prop_get_hrtime(cpu_idle_prop_handle_t hdl,
   1034     cpu_idle_callback_context_t ctx)
   1035 {
   1036 	int idx;
   1037 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
   1038 
   1039 	ASSERT(hdl != NULL);
   1040 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
   1041 	idx = CPU_IDLE_CTX2IDX(ctx);
   1042 	return (prop->value[idx].cipv_hrtime);
   1043 }
   1044 
   1045 void
   1046 cpu_idle_prop_set_value(cpu_idle_prop_handle_t hdl,
   1047     cpu_idle_callback_context_t ctx, cpu_idle_prop_value_t val)
   1048 {
   1049 	int idx;
   1050 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
   1051 
   1052 	ASSERT(hdl != NULL);
   1053 	ASSERT(CPU_IDLE_CTX2CPUID(ctx) < max_ncpus);
   1054 	idx = CPU_IDLE_CTX2IDX(ctx);
   1055 	prop->value[idx] = val;
   1056 }
   1057 
   1058 void
   1059 cpu_idle_prop_set_all(cpu_idle_prop_handle_t hdl, cpu_idle_prop_value_t val)
   1060 {
   1061 	int i, idx;
   1062 	cpu_idle_prop_impl_t *prop = (cpu_idle_prop_impl_t *)hdl;
   1063 
   1064 	ASSERT(hdl != NULL);
   1065 	for (i = 0; i < max_ncpus; i++) {
   1066 		idx = CPU_IDLE_CTX2IDX(i);
   1067 		prop->value[idx] = val;
   1068 	}
   1069 }
   1070 
   1071 /*ARGSUSED*/
   1072 static int cpu_idle_prop_update_intr_cnt(void *arg, uint64_t seqnum,
   1073     cpu_idle_prop_value_t *valp)
   1074 {
   1075 	int i;
   1076 	uint64_t val;
   1077 
   1078 	for (val = 0, i = 0; i < PIL_MAX; i++) {
   1079 		val += CPU->cpu_stats.sys.intr[i];
   1080 	}
   1081 	valp->cipv_uint64 = val;
   1082 
   1083 	return (0);
   1084 }
   1085 
   1086 uint_t
   1087 cpu_idle_get_cpu_state(cpu_t *cp)
   1088 {
   1089 	ASSERT(cp != NULL && cp->cpu_seqid < max_ncpus);
   1090 	return ((uint_t)cpu_idle_prop_get_uint32(
   1091 	    cpu_idle_prop_array[CPU_IDLE_PROP_IDX_IDLE_STATE].handle,
   1092 	    CPU_IDLE_GET_CTX(cp)));
   1093 }
   1094