Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)kcpc.c	1.18	07/10/14 SMI"
     28 
     29 #include <sys/param.h>
     30 #include <sys/thread.h>
     31 #include <sys/cpuvar.h>
     32 #include <sys/inttypes.h>
     33 #include <sys/cmn_err.h>
     34 #include <sys/time.h>
     35 #include <sys/mutex.h>
     36 #include <sys/systm.h>
     37 #include <sys/kcpc.h>
     38 #include <sys/cpc_impl.h>
     39 #include <sys/cpc_pcbe.h>
     40 #include <sys/atomic.h>
     41 #include <sys/sunddi.h>
     42 #include <sys/modctl.h>
     43 #include <sys/sdt.h>
     44 #if defined(__x86)
     45 #include <asm/clock.h>
     46 #endif
     47 
/*
 * Every context allocated by kcpc_ctx_alloc() is linked onto one of these
 * hash chains (bucket chosen by CPC_HASH_CTX()) and unlinked again by
 * kcpc_ctx_free(). Each chain is guarded by the mutex at the same index.
 */
kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
kcpc_ctx_t	*kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */


krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
int		kcpc_cpuctx;		/* number of cpu-specific contexts */

int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */

/*
 * These are set when a PCBE module is loaded (see kcpc_register_pcbe()).
 * Until then pcbe_ops is NULL and cpc_ncounters is zero.
 */
uint_t		cpc_ncounters = 0;
pcbe_ops_t	*pcbe_ops = NULL;

/*
 * Statistics on (mis)behavior. Both are bumped from kcpc_overflow_intr().
 */
static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */

/*
 * Is misbehaviour (overflow in a thread with no context) fatal?
 * Defaults to fatal on DEBUG kernels only.
 */
#ifdef DEBUG
static int kcpc_nullctx_panic = 1;
#else
static int kcpc_nullctx_panic = 0;
#endif

/*
 * Forward declarations for routines private to this file.
 */
static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
static void kcpc_restore(kcpc_ctx_t *ctx);
static void kcpc_save(kcpc_ctx_t *ctx);
static void kcpc_free(kcpc_ctx_t *ctx, int isexec);
static int kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode);
static void kcpc_free_configs(kcpc_set_t *set);
static kcpc_ctx_t *kcpc_ctx_alloc(void);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static void kcpc_ctx_free(kcpc_ctx_t *ctx);
static int kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
     90 
     91 void
     92 kcpc_register_pcbe(pcbe_ops_t *ops)
     93 {
     94 	pcbe_ops = ops;
     95 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
     96 }
     97 
     98 int
     99 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
    100 {
    101 	cpu_t		*cp;
    102 	kcpc_ctx_t	*ctx;
    103 	int		error;
    104 
    105 	ctx = kcpc_ctx_alloc();
    106 
    107 	if (kcpc_assign_reqs(set, ctx) != 0) {
    108 		kcpc_ctx_free(ctx);
    109 		*subcode = CPC_RESOURCE_UNAVAIL;
    110 		return (EINVAL);
    111 	}
    112 
    113 	ctx->kc_cpuid = cpuid;
    114 	ctx->kc_thread = curthread;
    115 
    116 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
    117 
    118 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
    119 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
    120 		kcpc_ctx_free(ctx);
    121 		return (error);
    122 	}
    123 
    124 	set->ks_ctx = ctx;
    125 	ctx->kc_set = set;
    126 
    127 	/*
    128 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
    129 	 * we are manipulating the cpu_t and programming the hardware, else the
    130 	 * the cpu_t could go away while we're looking at it.
    131 	 */
    132 	mutex_enter(&cpu_lock);
    133 	cp = cpu_get(cpuid);
    134 
    135 	if (cp == NULL)
    136 		/*
    137 		 * The CPU could have been DRd out while we were getting set up.
    138 		 */
    139 		goto unbound;
    140 
    141 	mutex_enter(&cp->cpu_cpc_ctxlock);
    142 
    143 	if (cp->cpu_cpc_ctx != NULL) {
    144 		/*
    145 		 * If this CPU already has a bound set, return an error.
    146 		 */
    147 		mutex_exit(&cp->cpu_cpc_ctxlock);
    148 		goto unbound;
    149 	}
    150 
    151 	if (curthread->t_bind_cpu != cpuid) {
    152 		mutex_exit(&cp->cpu_cpc_ctxlock);
    153 		goto unbound;
    154 	}
    155 	cp->cpu_cpc_ctx = ctx;
    156 
    157 	/*
    158 	 * Kernel preemption must be disabled while fiddling with the hardware
    159 	 * registers to prevent partial updates.
    160 	 */
    161 	kpreempt_disable();
    162 	ctx->kc_rawtick = KCPC_GET_TICK();
    163 	pcbe_ops->pcbe_program(ctx);
    164 	kpreempt_enable();
    165 
    166 	mutex_exit(&cp->cpu_cpc_ctxlock);
    167 	mutex_exit(&cpu_lock);
    168 
    169 	return (0);
    170 
    171 unbound:
    172 	mutex_exit(&cpu_lock);
    173 	set->ks_ctx = NULL;
    174 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
    175 	kcpc_ctx_free(ctx);
    176 	return (EAGAIN);
    177 }
    178 
    179 int
    180 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
    181 {
    182 	kcpc_ctx_t	*ctx;
    183 	int		error;
    184 
    185 	/*
    186 	 * Only one set is allowed per context, so ensure there is no
    187 	 * existing context.
    188 	 */
    189 
    190 	if (t->t_cpc_ctx != NULL)
    191 		return (EEXIST);
    192 
    193 	ctx = kcpc_ctx_alloc();
    194 
    195 	/*
    196 	 * The context must begin life frozen until it has been properly
    197 	 * programmed onto the hardware. This prevents the context ops from
    198 	 * worrying about it until we're ready.
    199 	 */
    200 	ctx->kc_flags |= KCPC_CTX_FREEZE;
    201 	ctx->kc_hrtime = gethrtime();
    202 
    203 	if (kcpc_assign_reqs(set, ctx) != 0) {
    204 		kcpc_ctx_free(ctx);
    205 		*subcode = CPC_RESOURCE_UNAVAIL;
    206 		return (EINVAL);
    207 	}
    208 
    209 	ctx->kc_cpuid = -1;
    210 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
    211 		ctx->kc_flags |= KCPC_CTX_LWPINHERIT;
    212 	ctx->kc_thread = t;
    213 	t->t_cpc_ctx = ctx;
    214 	/*
    215 	 * Permit threads to look at their own hardware counters from userland.
    216 	 */
    217 	ctx->kc_flags |= KCPC_CTX_NONPRIV;
    218 
    219 	/*
    220 	 * Create the data store for this set.
    221 	 */
    222 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
    223 
    224 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
    225 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
    226 		kcpc_ctx_free(ctx);
    227 		t->t_cpc_ctx = NULL;
    228 		return (error);
    229 	}
    230 
    231 	set->ks_ctx = ctx;
    232 	ctx->kc_set = set;
    233 
    234 	/*
    235 	 * Add a device context to the subject thread.
    236 	 */
    237 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
    238 	    kcpc_lwp_create, NULL, kcpc_free);
    239 
    240 	/*
    241 	 * Ask the backend to program the hardware.
    242 	 */
    243 	if (t == curthread) {
    244 		kpreempt_disable();
    245 		ctx->kc_rawtick = KCPC_GET_TICK();
    246 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    247 		pcbe_ops->pcbe_program(ctx);
    248 		kpreempt_enable();
    249 	} else
    250 		/*
    251 		 * Since we are the agent LWP, we know the victim LWP is stopped
    252 		 * until we're done here; no need to worry about preemption or
    253 		 * migration here. We still use an atomic op to clear the flag
    254 		 * to ensure the flags are always self-consistent; they can
    255 		 * still be accessed from, for instance, another CPU doing a
    256 		 * kcpc_invalidate_all().
    257 		 */
    258 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    259 
    260 
    261 	return (0);
    262 }
    263 
/*
 * Walk through each request in the set and ask the PCBE to configure a
 * corresponding counter.
 *
 * Each request must already carry its counter assignment in kr_picnum, and
 * set->ks_data must already be allocated: on success every request is linked
 * to its pic in 'ctx', kr_data is pointed at slot kr_index of ks_data, and
 * that slot is seeded with the request's preset.
 *
 * Returns 0 on success. On failure *subcode is filled in and one of the
 * following is returned:
 *	ENOTSUP	a request asked for overflow notification but the PCBE lacks
 *		CPC_CAP_OVERFLOW_INTERRUPT (*subcode is -1).
 *	EACCES	the PCBE reported CPC_ATTR_REQUIRES_PRIVILEGE or
 *		CPC_HV_NO_ACCESS.
 *	EINVAL	any other PCBE configuration failure.
 * When the PCBE rejects a request, kcpc_free_configs() is called on the whole
 * set before returning. The ENOTSUP path returns without freeing configs
 * created for earlier requests.
 */
static int
kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
{
	int		i;
	int		ret;
	kcpc_request_t	*rp;

	for (i = 0; i < set->ks_nreqs; i++) {
		int n;
		rp = &set->ks_req[i];

		/* Counter number chosen for this request. */
		n = rp->kr_picnum;

		ASSERT(n >= 0 && n < cpc_ncounters);

		/* Two requests must never share a counter. */
		ASSERT(ctx->kc_pics[n].kp_req == NULL);

		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
			    == 0) {
				*subcode = -1;
				return (ENOTSUP);
			}
			/*
			 * If any of the counters have requested overflow
			 * notification, we flag the context as being one that
			 * cares about overflow.
			 */
			ctx->kc_flags |= KCPC_CTX_SIGOVF;
		}

		/*
		 * Clear the config pointer first so that a failed configure
		 * leaves NULL for kcpc_free_configs() to skip.
		 */
		rp->kr_config = NULL;
		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
		    &(rp->kr_config), (void *)ctx)) != 0) {
			kcpc_free_configs(set);
			*subcode = ret;
			switch (ret) {
			case CPC_ATTR_REQUIRES_PRIVILEGE:
			case CPC_HV_NO_ACCESS:
				return (EACCES);
			default:
				return (EINVAL);
			}
		}

		/* Cross-link request and pic, then seed the data slot. */
		ctx->kc_pics[n].kp_req = rp;
		rp->kr_picp = &ctx->kc_pics[n];
		rp->kr_data = set->ks_data + rp->kr_index;
		*rp->kr_data = rp->kr_preset;
	}

	return (0);
}
    322 
    323 static void
    324 kcpc_free_configs(kcpc_set_t *set)
    325 {
    326 	int i;
    327 
    328 	for (i = 0; i < set->ks_nreqs; i++)
    329 		if (set->ks_req[i].kr_config != NULL)
    330 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
    331 }
    332 
/*
 * Sample the counters of 'set' and copy the results out to the caller.
 *
 * buf points to a user address and the data should be copied out to that
 * address in the current process. 'hrtime' and 'tick' are likewise written
 * with copyout(): they receive the context's kc_hrtime and kc_vtick.
 *
 * The hardware is only read when the context is not frozen and belongs to
 * the calling thread; otherwise the values already stored in the set and
 * context are copied out as they stand.
 *
 * Returns:
 *	0	on success.
 *	EINVAL	if the set has no context.
 *	EAGAIN	if the context is (or becomes) invalid, or if it is CPU-bound
 *		and the calling thread is not bound to that CPU.
 *	EFAULT	if any copyout() fails.
 */
int
kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
{
	kcpc_ctx_t	*ctx = set->ks_ctx;
	uint64_t	curtick = KCPC_GET_TICK();

	if (ctx == NULL)
		return (EINVAL);
	else if (ctx->kc_flags & KCPC_CTX_INVALID)
		return (EAGAIN);

	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
		/*
		 * Kernel preemption must be disabled while reading the
		 * hardware regs, and if this is a CPU-bound context, while
		 * checking the CPU binding of the current thread.
		 */
		kpreempt_disable();

		/* A kc_cpuid of -1 denotes a thread-bound context. */
		if (ctx->kc_cpuid != -1) {
			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
				kpreempt_enable();
				return (EAGAIN);
			}
		}

		if (ctx->kc_thread == curthread) {
			/*
			 * Refresh the timestamp, pull the counters, and fold
			 * the ticks elapsed since the last update into the
			 * virtualized tick count.
			 */
			ctx->kc_hrtime = gethrtime();
			pcbe_ops->pcbe_sample(ctx);
			ctx->kc_vtick += curtick - ctx->kc_rawtick;
			ctx->kc_rawtick = curtick;
		}

		kpreempt_enable();

		/*
		 * The config may have been invalidated by
		 * the pcbe_sample op.
		 */
		if (ctx->kc_flags & KCPC_CTX_INVALID)
			return (EAGAIN);
	}

	if (copyout(set->ks_data, buf,
	    set->ks_nreqs * sizeof (uint64_t)) == -1)
		return (EFAULT);
	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
		return (EFAULT);
	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
		return (EFAULT);

	return (0);
}
    390 
    391 /*
    392  * Stop the counters on the CPU this context is bound to.
    393  */
    394 static void
    395 kcpc_stop_hw(kcpc_ctx_t *ctx)
    396 {
    397 	cpu_t *cp;
    398 
    399 	ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED))
    400 	    == KCPC_CTX_INVALID);
    401 
    402 	kpreempt_disable();
    403 
    404 	cp = cpu_get(ctx->kc_cpuid);
    405 	ASSERT(cp != NULL);
    406 
    407 	if (cp == CPU) {
    408 		pcbe_ops->pcbe_allstop();
    409 		atomic_or_uint(&ctx->kc_flags,
    410 		    KCPC_CTX_INVALID_STOPPED);
    411 	} else
    412 		kcpc_remote_stop(cp);
    413 	kpreempt_enable();
    414 }
    415 
    416 int
    417 kcpc_unbind(kcpc_set_t *set)
    418 {
    419 	kcpc_ctx_t	*ctx = set->ks_ctx;
    420 	kthread_t	*t;
    421 
    422 	if (ctx == NULL)
    423 		return (EINVAL);
    424 
    425 	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
    426 
    427 	if (ctx->kc_cpuid == -1) {
    428 		t = ctx->kc_thread;
    429 		/*
    430 		 * The context is thread-bound and therefore has a device
    431 		 * context.  It will be freed via removectx() calling
    432 		 * freectx() calling kcpc_free().
    433 		 */
    434 		if (t == curthread &&
    435 		    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
    436 			kpreempt_disable();
    437 			pcbe_ops->pcbe_allstop();
    438 			atomic_or_uint(&ctx->kc_flags,
    439 			    KCPC_CTX_INVALID_STOPPED);
    440 			kpreempt_enable();
    441 		}
    442 #ifdef DEBUG
    443 		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
    444 		    kcpc_lwp_create, NULL, kcpc_free) == 0)
    445 			panic("kcpc_unbind: context %p not preset on thread %p",
    446 			    ctx, t);
    447 #else
    448 		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
    449 		    kcpc_lwp_create, NULL, kcpc_free);
    450 #endif /* DEBUG */
    451 		t->t_cpc_set = NULL;
    452 		t->t_cpc_ctx = NULL;
    453 	} else {
    454 		/*
    455 		 * If we are unbinding a CPU-bound set from a remote CPU, the
    456 		 * native CPU's idle thread could be in the midst of programming
    457 		 * this context onto the CPU. We grab the context's lock here to
    458 		 * ensure that the idle thread is done with it. When we release
    459 		 * the lock, the CPU no longer has a context and the idle thread
    460 		 * will move on.
    461 		 *
    462 		 * cpu_lock must be held to prevent the CPU from being DR'd out
    463 		 * while we disassociate the context from the cpu_t.
    464 		 */
    465 		cpu_t *cp;
    466 		mutex_enter(&cpu_lock);
    467 		cp = cpu_get(ctx->kc_cpuid);
    468 		if (cp != NULL) {
    469 			/*
    470 			 * The CPU may have been DR'd out of the system.
    471 			 */
    472 			mutex_enter(&cp->cpu_cpc_ctxlock);
    473 			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
    474 				kcpc_stop_hw(ctx);
    475 			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
    476 			cp->cpu_cpc_ctx = NULL;
    477 			mutex_exit(&cp->cpu_cpc_ctxlock);
    478 		}
    479 		mutex_exit(&cpu_lock);
    480 		if (ctx->kc_thread == curthread) {
    481 			kcpc_free(ctx, 0);
    482 			curthread->t_cpc_set = NULL;
    483 		}
    484 	}
    485 
    486 	return (0);
    487 }
    488 
    489 int
    490 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
    491 {
    492 	int i;
    493 
    494 	ASSERT(set != NULL);
    495 	ASSERT(set->ks_ctx != NULL);
    496 	ASSERT(set->ks_ctx->kc_thread == curthread);
    497 	ASSERT(set->ks_ctx->kc_cpuid == -1);
    498 
    499 	if (index < 0 || index >= set->ks_nreqs)
    500 		return (EINVAL);
    501 
    502 	for (i = 0; i < set->ks_nreqs; i++)
    503 		if (set->ks_req[i].kr_index == index)
    504 			break;
    505 	ASSERT(i != set->ks_nreqs);
    506 
    507 	set->ks_req[i].kr_preset = preset;
    508 	return (0);
    509 }
    510 
    511 int
    512 kcpc_restart(kcpc_set_t *set)
    513 {
    514 	kcpc_ctx_t	*ctx = set->ks_ctx;
    515 	int		i;
    516 
    517 	ASSERT(ctx != NULL);
    518 	ASSERT(ctx->kc_thread == curthread);
    519 	ASSERT(ctx->kc_cpuid == -1);
    520 
    521 	kpreempt_disable();
    522 
    523 	/*
    524 	 * If the user is doing this on a running set, make sure the counters
    525 	 * are stopped first.
    526 	 */
    527 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
    528 		pcbe_ops->pcbe_allstop();
    529 
    530 	for (i = 0; i < set->ks_nreqs; i++) {
    531 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
    532 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
    533 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
    534 	}
    535 
    536 	/*
    537 	 * Ask the backend to program the hardware.
    538 	 */
    539 	ctx->kc_rawtick = KCPC_GET_TICK();
    540 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    541 	pcbe_ops->pcbe_program(ctx);
    542 	kpreempt_enable();
    543 
    544 	return (0);
    545 }
    546 
    547 /*
    548  * Caller must hold kcpc_cpuctx_lock.
    549  */
    550 int
    551 kcpc_enable(kthread_t *t, int cmd, int enable)
    552 {
    553 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
    554 	kcpc_set_t	*set = t->t_cpc_set;
    555 	kcpc_set_t	*newset;
    556 	int		i;
    557 	int		flag;
    558 	int		err;
    559 
    560 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
    561 
    562 	if (ctx == NULL) {
    563 		/*
    564 		 * This thread has a set but no context; it must be a
    565 		 * CPU-bound set.
    566 		 */
    567 		ASSERT(t->t_cpc_set != NULL);
    568 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
    569 		return (EINVAL);
    570 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
    571 		return (EAGAIN);
    572 
    573 	if (cmd == CPC_ENABLE) {
    574 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
    575 			return (EINVAL);
    576 		kpreempt_disable();
    577 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    578 		kcpc_restore(ctx);
    579 		kpreempt_enable();
    580 	} else if (cmd == CPC_DISABLE) {
    581 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
    582 			return (EINVAL);
    583 		kpreempt_disable();
    584 		kcpc_save(ctx);
    585 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
    586 		kpreempt_enable();
    587 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
    588 		/*
    589 		 * Strategy for usr/sys: stop counters and update set's presets
    590 		 * with current counter values, unbind, update requests with
    591 		 * new config, then re-bind.
    592 		 */
    593 		flag = (cmd == CPC_USR_EVENTS) ?
    594 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
    595 
    596 		kpreempt_disable();
    597 		atomic_or_uint(&ctx->kc_flags,
    598 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
    599 		pcbe_ops->pcbe_allstop();
    600 		kpreempt_enable();
    601 		for (i = 0; i < set->ks_nreqs; i++) {
    602 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
    603 			if (enable)
    604 				set->ks_req[i].kr_flags |= flag;
    605 			else
    606 				set->ks_req[i].kr_flags &= ~flag;
    607 		}
    608 		newset = kcpc_dup_set(set);
    609 		if (kcpc_unbind(set) != 0)
    610 			return (EINVAL);
    611 		t->t_cpc_set = newset;
    612 		if (kcpc_bind_thread(newset, t, &err) != 0) {
    613 			t->t_cpc_set = NULL;
    614 			kcpc_free_set(newset);
    615 			return (EINVAL);
    616 		}
    617 	} else
    618 		return (EINVAL);
    619 
    620 	return (0);
    621 }
    622 
    623 /*
    624  * Provide PCBEs with a way of obtaining the configs of every counter which will
    625  * be programmed together.
    626  *
    627  * If current is NULL, provide the first config.
    628  *
    629  * If data != NULL, caller wants to know where the data store associated with
    630  * the config we return is located.
    631  */
    632 void *
    633 kcpc_next_config(void *token, void *current, uint64_t **data)
    634 {
    635 	int		i;
    636 	kcpc_pic_t	*pic;
    637 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
    638 
    639 	if (current == NULL) {
    640 		/*
    641 		 * Client would like the first config, which may not be in
    642 		 * counter 0; we need to search through the counters for the
    643 		 * first config.
    644 		 */
    645 		for (i = 0; i < cpc_ncounters; i++)
    646 			if (ctx->kc_pics[i].kp_req != NULL)
    647 				break;
    648 		/*
    649 		 * There are no counters configured for the given context.
    650 		 */
    651 		if (i == cpc_ncounters)
    652 			return (NULL);
    653 	} else {
    654 		/*
    655 		 * There surely is a faster way to do this.
    656 		 */
    657 		for (i = 0; i < cpc_ncounters; i++) {
    658 			pic = &ctx->kc_pics[i];
    659 
    660 			if (pic->kp_req != NULL &&
    661 			    current == pic->kp_req->kr_config)
    662 				break;
    663 		}
    664 
    665 		/*
    666 		 * We found the current config at picnum i. Now search for the
    667 		 * next configured PIC.
    668 		 */
    669 		for (i++; i < cpc_ncounters; i++) {
    670 			pic = &ctx->kc_pics[i];
    671 			if (pic->kp_req != NULL)
    672 				break;
    673 		}
    674 
    675 		if (i == cpc_ncounters)
    676 			return (NULL);
    677 	}
    678 
    679 	if (data != NULL) {
    680 		*data = ctx->kc_pics[i].kp_req->kr_data;
    681 	}
    682 
    683 	return (ctx->kc_pics[i].kp_req->kr_config);
    684 }
    685 
    686 
    687 static kcpc_ctx_t *
    688 kcpc_ctx_alloc(void)
    689 {
    690 	kcpc_ctx_t	*ctx;
    691 	long		hash;
    692 
    693 	ctx = (kcpc_ctx_t *)kmem_alloc(sizeof (kcpc_ctx_t), KM_SLEEP);
    694 
    695 	hash = CPC_HASH_CTX(ctx);
    696 	mutex_enter(&kcpc_ctx_llock[hash]);
    697 	ctx->kc_next = kcpc_ctx_list[hash];
    698 	kcpc_ctx_list[hash] = ctx;
    699 	mutex_exit(&kcpc_ctx_llock[hash]);
    700 
    701 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
    702 	    cpc_ncounters, KM_SLEEP);
    703 
    704 	ctx->kc_flags = 0;
    705 	ctx->kc_vtick = 0;
    706 	ctx->kc_rawtick = 0;
    707 	ctx->kc_cpuid = -1;
    708 
    709 	return (ctx);
    710 }
    711 
    712 /*
    713  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
    714  * in the flags.
    715  */
    716 static void
    717 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
    718 {
    719 	kcpc_set_t	*ks = ctx->kc_set, *cks;
    720 	int		i, j;
    721 	int		code;
    722 
    723 	ASSERT(ks != NULL);
    724 
    725 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
    726 		return;
    727 
    728 	cks = kmem_alloc(sizeof (*cks), KM_SLEEP);
    729 	cctx->kc_set = cks;
    730 	cks->ks_flags = ks->ks_flags;
    731 	cks->ks_nreqs = ks->ks_nreqs;
    732 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
    733 	    sizeof (kcpc_request_t), KM_SLEEP);
    734 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
    735 	    KM_SLEEP);
    736 	cks->ks_ctx = cctx;
    737 
    738 	for (i = 0; i < cks->ks_nreqs; i++) {
    739 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
    740 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
    741 		(void) strncpy(cks->ks_req[i].kr_event,
    742 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
    743 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
    744 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
    745 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
    746 		if (ks->ks_req[i].kr_nattrs > 0) {
    747 			cks->ks_req[i].kr_attr =
    748 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
    749 			    sizeof (kcpc_attr_t), KM_SLEEP);
    750 		}
    751 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
    752 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
    753 			    ks->ks_req[i].kr_attr[j].ka_name,
    754 			    CPC_MAX_ATTR_LEN);
    755 			cks->ks_req[i].kr_attr[j].ka_val =
    756 			    ks->ks_req[i].kr_attr[j].ka_val;
    757 		}
    758 	}
    759 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
    760 		kcpc_invalidate_config(cctx);
    761 }
    762 
    763 
    764 static void
    765 kcpc_ctx_free(kcpc_ctx_t *ctx)
    766 {
    767 	kcpc_ctx_t	**loc;
    768 	long		hash = CPC_HASH_CTX(ctx);
    769 
    770 	mutex_enter(&kcpc_ctx_llock[hash]);
    771 	loc = &kcpc_ctx_list[hash];
    772 	ASSERT(*loc != NULL);
    773 	while (*loc != ctx)
    774 		loc = &(*loc)->kc_next;
    775 	*loc = ctx->kc_next;
    776 	mutex_exit(&kcpc_ctx_llock[hash]);
    777 
    778 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
    779 	kmem_free(ctx, sizeof (*ctx));
    780 }
    781 
    782 /*
    783  * Generic interrupt handler used on hardware that generates
    784  * overflow interrupts.
    785  *
    786  * Note: executed at high-level interrupt context!
    787  */
    788 /*ARGSUSED*/
    789 kcpc_ctx_t *
    790 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
    791 {
    792 	kcpc_ctx_t	*ctx;
    793 	kthread_t	*t = curthread;
    794 	int		i;
    795 
    796 	/*
    797 	 * On both x86 and UltraSPARC, we may deliver the high-level
    798 	 * interrupt in kernel mode, just after we've started to run an
    799 	 * interrupt thread.  (That's because the hardware helpfully
    800 	 * delivers the overflow interrupt some random number of cycles
    801 	 * after the instruction that caused the overflow by which time
    802 	 * we're in some part of the kernel, not necessarily running on
    803 	 * the right thread).
    804 	 *
    805 	 * Check for this case here -- find the pinned thread
    806 	 * that was running when the interrupt went off.
    807 	 */
    808 	if (t->t_flag & T_INTR_THREAD) {
    809 		klwp_t *lwp;
    810 
    811 		atomic_add_32(&kcpc_intrctx_count, 1);
    812 
    813 		/*
    814 		 * Note that t_lwp is always set to point at the underlying
    815 		 * thread, thus this will work in the presence of nested
    816 		 * interrupts.
    817 		 */
    818 		ctx = NULL;
    819 		if ((lwp = t->t_lwp) != NULL) {
    820 			t = lwptot(lwp);
    821 			ctx = t->t_cpc_ctx;
    822 		}
    823 	} else
    824 		ctx = t->t_cpc_ctx;
    825 
    826 	if (ctx == NULL) {
    827 		/*
    828 		 * This can easily happen if we're using the counters in
    829 		 * "shared" mode, for example, and an overflow interrupt
    830 		 * occurs while we are running cpustat.  In that case, the
    831 		 * bound thread that has the context that belongs to this
    832 		 * CPU is almost certainly sleeping (if it was running on
    833 		 * the CPU we'd have found it above), and the actual
    834 		 * interrupted thread has no knowledge of performance counters!
    835 		 */
    836 		ctx = curthread->t_cpu->cpu_cpc_ctx;
    837 		if (ctx != NULL) {
    838 			/*
    839 			 * Return the bound context for this CPU to
    840 			 * the interrupt handler so that it can synchronously
    841 			 * sample the hardware counters and restart them.
    842 			 */
    843 			return (ctx);
    844 		}
    845 
    846 		/*
    847 		 * As long as the overflow interrupt really is delivered early
    848 		 * enough after trapping into the kernel to avoid switching
    849 		 * threads, we must always be able to find the cpc context,
    850 		 * or something went terribly wrong i.e. we ended up
    851 		 * running a passivated interrupt thread, a kernel
    852 		 * thread or we interrupted idle, all of which are Very Bad.
    853 		 */
    854 		if (kcpc_nullctx_panic)
    855 			panic("null cpc context, thread %p", (void *)t);
    856 		atomic_add_32(&kcpc_nullctx_count, 1);
    857 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
    858 		/*
    859 		 * Schedule an ast to sample the counters, which will
    860 		 * propagate any overflow into the virtualized performance
    861 		 * counter(s), and may deliver a signal.
    862 		 */
    863 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
    864 		/*
    865 		 * If a counter has overflowed which was counting on behalf of
    866 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
    867 		 * process a signal.
    868 		 */
    869 		for (i = 0; i < cpc_ncounters; i++) {
    870 			if (ctx->kc_pics[i].kp_req != NULL &&
    871 			    bitmap & (1 << i) &&
    872 			    ctx->kc_pics[i].kp_req->kr_flags &
    873 			    CPC_OVF_NOTIFY_EMT) {
    874 				/*
    875 				 * A signal has been requested for this PIC, so
    876 				 * so freeze the context. The interrupt handler
    877 				 * has already stopped the counter hardware.
    878 				 */
    879 				atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
    880 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
    881 				    KCPC_PIC_OVERFLOWED);
    882 			}
    883 		}
    884 		aston(t);
    885 	}
    886 	return (NULL);
    887 }
    888 
    889 /*
    890  * The current thread context had an overflow interrupt; we're
    891  * executing here in high-level interrupt context.
    892  */
    893 /*ARGSUSED*/
    894 uint_t
    895 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
    896 {
    897 	kcpc_ctx_t	*ctx;
    898 	uint64_t	bitmap;
    899 
    900 	if (pcbe_ops == NULL ||
    901 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
    902 		return (DDI_INTR_UNCLAIMED);
    903 
    904 	/*
    905 	 * Prevent any further interrupts.
    906 	 */
    907 	pcbe_ops->pcbe_allstop();
    908 
    909 	/*
    910 	 * Invoke the "generic" handler.
    911 	 *
    912 	 * If the interrupt has occurred in the context of an lwp owning
    913 	 * the counters, then the handler posts an AST to the lwp to
    914 	 * trigger the actual sampling, and optionally deliver a signal or
    915 	 * restart the counters, on the way out of the kernel using
    916 	 * kcpc_hw_overflow_ast() (see below).
    917 	 *
    918 	 * On the other hand, if the handler returns the context to us
    919 	 * directly, then it means that there are no other threads in
    920 	 * the middle of updating it, no AST has been posted, and so we
    921 	 * should sample the counters here, and restart them with no
    922 	 * further fuss.
    923 	 */
    924 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
    925 		uint64_t curtick = KCPC_GET_TICK();
    926 
    927 		ctx->kc_hrtime = gethrtime_waitfree();
    928 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
    929 		ctx->kc_rawtick = curtick;
    930 		pcbe_ops->pcbe_sample(ctx);
    931 		pcbe_ops->pcbe_program(ctx);
    932 	}
    933 
    934 	return (DDI_INTR_CLAIMED);
    935 }
    936 
/*
 * Called from trap() when processing the ast posted by the high-level
 * interrupt handler.
 *
 * Samples the current thread's context, then either leaves it frozen and
 * returns 1 (at least one pic was flagged KCPC_PIC_OVERFLOWED, so a signal
 * should be sent), or unfreezes and reprograms it and returns 0.
 */
int
kcpc_overflow_ast()
{
	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
	int		i;
	int		found = 0;
	uint64_t	curtick = KCPC_GET_TICK();

	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */

	/*
	 * An overflow happened: sample the context to ensure that
	 * the overflow is propagated into the upper bits of the
	 * virtualized 64-bit counter(s).
	 */
	kpreempt_disable();
	ctx->kc_hrtime = gethrtime_waitfree();
	pcbe_ops->pcbe_sample(ctx);
	kpreempt_enable();

	/*
	 * Account for the ticks elapsed since kc_rawtick was last set.
	 * NOTE(review): unlike kcpc_sample(), kc_rawtick is not advanced
	 * here -- confirm that is intentional.
	 */
	ctx->kc_vtick += curtick - ctx->kc_rawtick;

	/*
	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
	 * if that pic generated an overflow and if the request it was counting
	 * on behalf of had CPC_OVF_NOTIFY_EMT specified. We go through all
	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
	 * found any overflowed pics, keep the context frozen and return true
	 * (thus causing a signal to be sent).
	 */
	for (i = 0; i < cpc_ncounters; i++) {
		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
			    ~KCPC_PIC_OVERFLOWED);
			found = 1;
		}
	}
	if (found)
		return (1);

	/*
	 * Otherwise, re-enable the counters and continue life as before.
	 */
	kpreempt_disable();
	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
	pcbe_ops->pcbe_program(ctx);
	kpreempt_enable();
	return (0);
}
    990 
    991 /*
    992  * Called when switching away from current thread.
    993  */
    994 static void
    995 kcpc_save(kcpc_ctx_t *ctx)
    996 {
    997 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
    998 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)
    999 			return;
   1000 		/*
   1001 		 * This context has been invalidated but the counters have not
   1002 		 * been stopped. Stop them here and mark the context stopped.
   1003 		 */
   1004 		pcbe_ops->pcbe_allstop();
   1005 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
   1006 		return;
   1007 	}
   1008 
   1009 	pcbe_ops->pcbe_allstop();
   1010 	if (ctx->kc_flags & KCPC_CTX_FREEZE)
   1011 		return;
   1012 
   1013 	/*
   1014 	 * Need to sample for all reqs into each req's current mpic.
   1015 	 */
   1016 	ctx->kc_hrtime = gethrtime();
   1017 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
   1018 	pcbe_ops->pcbe_sample(ctx);
   1019 }
   1020 
   1021 static void
   1022 kcpc_restore(kcpc_ctx_t *ctx)
   1023 {
   1024 	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
   1025 	    KCPC_CTX_INVALID)
   1026 		/*
   1027 		 * The context is invalidated but has not been marked stopped.
   1028 		 * We mark it as such here because we will not start the
   1029 		 * counters during this context switch.
   1030 		 */
   1031 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
   1032 
   1033 
   1034 	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE))
   1035 		return;
   1036 
   1037 	/*
   1038 	 * While programming the hardware, the counters should be stopped. We
   1039 	 * don't do an explicit pcbe_allstop() here because they should have
   1040 	 * been stopped already by the last consumer.
   1041 	 */
   1042 	ctx->kc_rawtick = KCPC_GET_TICK();
   1043 	pcbe_ops->pcbe_program(ctx);
   1044 }
   1045 
/*
 * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the
 * following context operators to the idle thread on each CPU. They stop the
 * counters when the idle thread is switched on, and they start them again when
 * it is switched off.
 */
   1052 
   1053 /*ARGSUSED*/
   1054 void
   1055 kcpc_idle_save(struct cpu *cp)
   1056 {
   1057 	/*
   1058 	 * The idle thread shouldn't be run anywhere else.
   1059 	 */
   1060 	ASSERT(CPU == cp);
   1061 
   1062 	/*
   1063 	 * We must hold the CPU's context lock to ensure the context isn't freed
   1064 	 * while we're looking at it.
   1065 	 */
   1066 	mutex_enter(&cp->cpu_cpc_ctxlock);
   1067 
   1068 	if ((cp->cpu_cpc_ctx == NULL) ||
   1069 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
   1070 		mutex_exit(&cp->cpu_cpc_ctxlock);
   1071 		return;
   1072 	}
   1073 
   1074 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
   1075 	mutex_exit(&cp->cpu_cpc_ctxlock);
   1076 }
   1077 
   1078 void
   1079 kcpc_idle_restore(struct cpu *cp)
   1080 {
   1081 	/*
   1082 	 * The idle thread shouldn't be run anywhere else.
   1083 	 */
   1084 	ASSERT(CPU == cp);
   1085 
   1086 	/*
   1087 	 * We must hold the CPU's context lock to ensure the context isn't freed
   1088 	 * while we're looking at it.
   1089 	 */
   1090 	mutex_enter(&cp->cpu_cpc_ctxlock);
   1091 
   1092 	if ((cp->cpu_cpc_ctx == NULL) ||
   1093 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
   1094 		mutex_exit(&cp->cpu_cpc_ctxlock);
   1095 		return;
   1096 	}
   1097 
   1098 	pcbe_ops->pcbe_allstop();
   1099 	mutex_exit(&cp->cpu_cpc_ctxlock);
   1100 }
   1101 
   1102 /*ARGSUSED*/
   1103 static void
   1104 kcpc_lwp_create(kthread_t *t, kthread_t *ct)
   1105 {
   1106 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
   1107 	int		i;
   1108 
   1109 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
   1110 		return;
   1111 
   1112 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
   1113 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
   1114 		rw_exit(&kcpc_cpuctx_lock);
   1115 		return;
   1116 	}
   1117 	cctx = kcpc_ctx_alloc();
   1118 	kcpc_ctx_clone(ctx, cctx);
   1119 	rw_exit(&kcpc_cpuctx_lock);
   1120 
   1121 	/*
   1122 	 * Copy the parent context's kc_flags field, but don't overwrite
   1123 	 * the child's in case it was modified during kcpc_ctx_clone.
   1124 	 */
   1125 	cctx->kc_flags |= ctx->kc_flags;
   1126 	cctx->kc_thread = ct;
   1127 	cctx->kc_cpuid = -1;
   1128 	ct->t_c