Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/param.h>
     28 #include <sys/thread.h>
     29 #include <sys/cpuvar.h>
     30 #include <sys/inttypes.h>
     31 #include <sys/cmn_err.h>
     32 #include <sys/time.h>
     33 #include <sys/ksynch.h>
     34 #include <sys/systm.h>
     35 #include <sys/kcpc.h>
     36 #include <sys/cpc_impl.h>
     37 #include <sys/cpc_pcbe.h>
     38 #include <sys/atomic.h>
     39 #include <sys/sunddi.h>
     40 #include <sys/modctl.h>
     41 #include <sys/sdt.h>
     42 #if defined(__x86)
     43 #include <asm/clock.h>
     44 #endif
     45 
/*
 * Every allocated context is linked onto one of CPC_HASH_BUCKETS lists; the
 * bucket is chosen with CPC_HASH_CTX() and each bucket has its own lock.
 */
kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
kcpc_ctx_t	*kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */


krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
int		kcpc_cpuctx;		/* number of cpu-specific contexts */

int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */

/*
 * These are set when a PCBE module is loaded (see kcpc_register_pcbe()).
 * Until then there is no backend and no counters.
 */
uint_t		cpc_ncounters = 0;
pcbe_ops_t	*pcbe_ops = NULL;

/*
 * Statistics on (mis)behavior; both are bumped from the overflow handler.
 */
static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */

/*
 * By setting 'kcpc_nullctx_panic' to 1, any overflow interrupts in a thread
 * with no valid context will result in a panic.  With the default of 0 the
 * handler only warns and increments 'kcpc_nullctx_count'.
 */
static int kcpc_nullctx_panic = 0;

/*
 * Forward declarations.  kcpc_save, kcpc_restore, kcpc_lwp_create and
 * kcpc_free are the callbacks handed to installctx()/removectx() for
 * thread-bound contexts.
 */
static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
static void kcpc_restore(kcpc_ctx_t *ctx);
static void kcpc_save(kcpc_ctx_t *ctx);
static void kcpc_free(kcpc_ctx_t *ctx, int isexec);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
     80 
     81 void
     82 kcpc_register_pcbe(pcbe_ops_t *ops)
     83 {
     84 	pcbe_ops = ops;
     85 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
     86 }
     87 
     88 void
     89 kcpc_register_dcpc(void (*func)(uint64_t))
     90 {
     91 	dtrace_cpc_fire = func;
     92 }
     93 
     94 void
     95 kcpc_unregister_dcpc(void)
     96 {
     97 	dtrace_cpc_fire = NULL;
     98 }
     99 
    100 int
    101 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
    102 {
    103 	cpu_t		*cp;
    104 	kcpc_ctx_t	*ctx;
    105 	int		error;
    106 
    107 	ctx = kcpc_ctx_alloc();
    108 
    109 	if (kcpc_assign_reqs(set, ctx) != 0) {
    110 		kcpc_ctx_free(ctx);
    111 		*subcode = CPC_RESOURCE_UNAVAIL;
    112 		return (EINVAL);
    113 	}
    114 
    115 	ctx->kc_cpuid = cpuid;
    116 	ctx->kc_thread = curthread;
    117 
    118 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
    119 
    120 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
    121 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
    122 		kcpc_ctx_free(ctx);
    123 		return (error);
    124 	}
    125 
    126 	set->ks_ctx = ctx;
    127 	ctx->kc_set = set;
    128 
    129 	/*
    130 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
    131 	 * we are manipulating the cpu_t and programming the hardware, else the
    132 	 * the cpu_t could go away while we're looking at it.
    133 	 */
    134 	mutex_enter(&cpu_lock);
    135 	cp = cpu_get(cpuid);
    136 
    137 	if (cp == NULL)
    138 		/*
    139 		 * The CPU could have been DRd out while we were getting set up.
    140 		 */
    141 		goto unbound;
    142 
    143 	mutex_enter(&cp->cpu_cpc_ctxlock);
    144 
    145 	if (cp->cpu_cpc_ctx != NULL) {
    146 		/*
    147 		 * If this CPU already has a bound set, return an error.
    148 		 */
    149 		mutex_exit(&cp->cpu_cpc_ctxlock);
    150 		goto unbound;
    151 	}
    152 
    153 	if (curthread->t_bind_cpu != cpuid) {
    154 		mutex_exit(&cp->cpu_cpc_ctxlock);
    155 		goto unbound;
    156 	}
    157 	cp->cpu_cpc_ctx = ctx;
    158 
    159 	/*
    160 	 * Kernel preemption must be disabled while fiddling with the hardware
    161 	 * registers to prevent partial updates.
    162 	 */
    163 	kpreempt_disable();
    164 	ctx->kc_rawtick = KCPC_GET_TICK();
    165 	pcbe_ops->pcbe_program(ctx);
    166 	kpreempt_enable();
    167 
    168 	mutex_exit(&cp->cpu_cpc_ctxlock);
    169 	mutex_exit(&cpu_lock);
    170 
    171 	mutex_enter(&set->ks_lock);
    172 	set->ks_state |= KCPC_SET_BOUND;
    173 	cv_signal(&set->ks_condv);
    174 	mutex_exit(&set->ks_lock);
    175 
    176 	return (0);
    177 
    178 unbound:
    179 	mutex_exit(&cpu_lock);
    180 	set->ks_ctx = NULL;
    181 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
    182 	kcpc_ctx_free(ctx);
    183 	return (EAGAIN);
    184 }
    185 
    186 int
    187 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
    188 {
    189 	kcpc_ctx_t	*ctx;
    190 	int		error;
    191 
    192 	/*
    193 	 * Only one set is allowed per context, so ensure there is no
    194 	 * existing context.
    195 	 */
    196 
    197 	if (t->t_cpc_ctx != NULL)
    198 		return (EEXIST);
    199 
    200 	ctx = kcpc_ctx_alloc();
    201 
    202 	/*
    203 	 * The context must begin life frozen until it has been properly
    204 	 * programmed onto the hardware. This prevents the context ops from
    205 	 * worrying about it until we're ready.
    206 	 */
    207 	ctx->kc_flags |= KCPC_CTX_FREEZE;
    208 	ctx->kc_hrtime = gethrtime();
    209 
    210 	if (kcpc_assign_reqs(set, ctx) != 0) {
    211 		kcpc_ctx_free(ctx);
    212 		*subcode = CPC_RESOURCE_UNAVAIL;
    213 		return (EINVAL);
    214 	}
    215 
    216 	ctx->kc_cpuid = -1;
    217 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
    218 		ctx->kc_flags |= KCPC_CTX_LWPINHERIT;
    219 	ctx->kc_thread = t;
    220 	t->t_cpc_ctx = ctx;
    221 	/*
    222 	 * Permit threads to look at their own hardware counters from userland.
    223 	 */
    224 	ctx->kc_flags |= KCPC_CTX_NONPRIV;
    225 
    226 	/*
    227 	 * Create the data store for this set.
    228 	 */
    229 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
    230 
    231 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
    232 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
    233 		kcpc_ctx_free(ctx);
    234 		t->t_cpc_ctx = NULL;
    235 		return (error);
    236 	}
    237 
    238 	set->ks_ctx = ctx;
    239 	ctx->kc_set = set;
    240 
    241 	/*
    242 	 * Add a device context to the subject thread.
    243 	 */
    244 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
    245 	    kcpc_lwp_create, NULL, kcpc_free);
    246 
    247 	/*
    248 	 * Ask the backend to program the hardware.
    249 	 */
    250 	if (t == curthread) {
    251 		kpreempt_disable();
    252 		ctx->kc_rawtick = KCPC_GET_TICK();
    253 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    254 		pcbe_ops->pcbe_program(ctx);
    255 		kpreempt_enable();
    256 	} else
    257 		/*
    258 		 * Since we are the agent LWP, we know the victim LWP is stopped
    259 		 * until we're done here; no need to worry about preemption or
    260 		 * migration here. We still use an atomic op to clear the flag
    261 		 * to ensure the flags are always self-consistent; they can
    262 		 * still be accessed from, for instance, another CPU doing a
    263 		 * kcpc_invalidate_all().
    264 		 */
    265 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    266 
    267 	mutex_enter(&set->ks_lock);
    268 	set->ks_state |= KCPC_SET_BOUND;
    269 	cv_signal(&set->ks_condv);
    270 	mutex_exit(&set->ks_lock);
    271 
    272 	return (0);
    273 }
    274 
    275 /*
    276  * Walk through each request in the set and ask the PCBE to configure a
    277  * corresponding counter.
    278  */
    279 int
    280 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
    281 {
    282 	int		i;
    283 	int		ret;
    284 	kcpc_request_t	*rp;
    285 
    286 	for (i = 0; i < set->ks_nreqs; i++) {
    287 		int n;
    288 		rp = &set->ks_req[i];
    289 
    290 		n = rp->kr_picnum;
    291 
    292 		ASSERT(n >= 0 && n < cpc_ncounters);
    293 
    294 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
    295 
    296 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
    297 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
    298 			    == 0) {
    299 				*subcode = -1;
    300 				return (ENOTSUP);
    301 			}
    302 			/*
    303 			 * If any of the counters have requested overflow
    304 			 * notification, we flag the context as being one that
    305 			 * cares about overflow.
    306 			 */
    307 			ctx->kc_flags |= KCPC_CTX_SIGOVF;
    308 		}
    309 
    310 		rp->kr_config = NULL;
    311 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
    312 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
    313 		    &(rp->kr_config), (void *)ctx)) != 0) {
    314 			kcpc_free_configs(set);
    315 			*subcode = ret;
    316 			switch (ret) {
    317 			case CPC_ATTR_REQUIRES_PRIVILEGE:
    318 			case CPC_HV_NO_ACCESS:
    319 				return (EACCES);
    320 			default:
    321 				return (EINVAL);
    322 			}
    323 		}
    324 
    325 		ctx->kc_pics[n].kp_req = rp;
    326 		rp->kr_picp = &ctx->kc_pics[n];
    327 		rp->kr_data = set->ks_data + rp->kr_index;
    328 		*rp->kr_data = rp->kr_preset;
    329 	}
    330 
    331 	return (0);
    332 }
    333 
    334 void
    335 kcpc_free_configs(kcpc_set_t *set)
    336 {
    337 	int i;
    338 
    339 	for (i = 0; i < set->ks_nreqs; i++)
    340 		if (set->ks_req[i].kr_config != NULL)
    341 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
    342 }
    343 
    344 /*
    345  * buf points to a user address and the data should be copied out to that
    346  * address in the current process.
    347  */
    348 int
    349 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
    350 {
    351 	kcpc_ctx_t	*ctx = set->ks_ctx;
    352 	uint64_t	curtick = KCPC_GET_TICK();
    353 
    354 	mutex_enter(&set->ks_lock);
    355 	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
    356 		mutex_exit(&set->ks_lock);
    357 		return (EINVAL);
    358 	}
    359 	mutex_exit(&set->ks_lock);
    360 
    361 	if (ctx->kc_flags & KCPC_CTX_INVALID)
    362 		return (EAGAIN);
    363 
    364 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
    365 		/*
    366 		 * Kernel preemption must be disabled while reading the
    367 		 * hardware regs, and if this is a CPU-bound context, while
    368 		 * checking the CPU binding of the current thread.
    369 		 */
    370 		kpreempt_disable();
    371 
    372 		if (ctx->kc_cpuid != -1) {
    373 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
    374 				kpreempt_enable();
    375 				return (EAGAIN);
    376 			}
    377 		}
    378 
    379 		if (ctx->kc_thread == curthread) {
    380 			ctx->kc_hrtime = gethrtime();
    381 			pcbe_ops->pcbe_sample(ctx);
    382 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
    383 			ctx->kc_rawtick = curtick;
    384 		}
    385 
    386 		kpreempt_enable();
    387 
    388 		/*
    389 		 * The config may have been invalidated by
    390 		 * the pcbe_sample op.
    391 		 */
    392 		if (ctx->kc_flags & KCPC_CTX_INVALID)
    393 			return (EAGAIN);
    394 	}
    395 
    396 	if (copyout(set->ks_data, buf,
    397 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
    398 		return (EFAULT);
    399 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
    400 		return (EFAULT);
    401 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
    402 		return (EFAULT);
    403 
    404 	return (0);
    405 }
    406 
    407 /*
    408  * Stop the counters on the CPU this context is bound to.
    409  */
    410 static void
    411 kcpc_stop_hw(kcpc_ctx_t *ctx)
    412 {
    413 	cpu_t *cp;
    414 
    415 	ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED))
    416 	    == KCPC_CTX_INVALID);
    417 
    418 	kpreempt_disable();
    419 
    420 	cp = cpu_get(ctx->kc_cpuid);
    421 	ASSERT(cp != NULL);
    422 
    423 	if (cp == CPU) {
    424 		pcbe_ops->pcbe_allstop();
    425 		atomic_or_uint(&ctx->kc_flags,
    426 		    KCPC_CTX_INVALID_STOPPED);
    427 	} else
    428 		kcpc_remote_stop(cp);
    429 	kpreempt_enable();
    430 }
    431 
    432 int
    433 kcpc_unbind(kcpc_set_t *set)
    434 {
    435 	kcpc_ctx_t	*ctx;
    436 	kthread_t	*t;
    437 
    438 	/*
    439 	 * We could be racing with the process's agent thread as it
    440 	 * binds the set; we must wait for the set to finish binding
    441 	 * before attempting to tear it down.
    442 	 */
    443 	mutex_enter(&set->ks_lock);
    444 	while ((set->ks_state & KCPC_SET_BOUND) == 0)
    445 		cv_wait(&set->ks_condv, &set->ks_lock);
    446 	mutex_exit(&set->ks_lock);
    447 
    448 	ctx = set->ks_ctx;
    449 
    450 	/*
    451 	 * Use kc_lock to synchronize with kcpc_restore().
    452 	 */
    453 	mutex_enter(&ctx->kc_lock);
    454 	ctx->kc_flags |= KCPC_CTX_INVALID;
    455 	mutex_exit(&ctx->kc_lock);
    456 
    457 	if (ctx->kc_cpuid == -1) {
    458 		t = ctx->kc_thread;
    459 		/*
    460 		 * The context is thread-bound and therefore has a device
    461 		 * context.  It will be freed via removectx() calling
    462 		 * freectx() calling kcpc_free().
    463 		 */
    464 		if (t == curthread &&
    465 		    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
    466 			kpreempt_disable();
    467 			pcbe_ops->pcbe_allstop();
    468 			atomic_or_uint(&ctx->kc_flags,
    469 			    KCPC_CTX_INVALID_STOPPED);
    470 			kpreempt_enable();
    471 		}
    472 #ifdef DEBUG
    473 		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
    474 		    kcpc_lwp_create, NULL, kcpc_free) == 0)
    475 			panic("kcpc_unbind: context %p not preset on thread %p",
    476 			    (void *)ctx, (void *)t);
    477 #else
    478 		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
    479 		    kcpc_lwp_create, NULL, kcpc_free);
    480 #endif /* DEBUG */
    481 		t->t_cpc_set = NULL;
    482 		t->t_cpc_ctx = NULL;
    483 	} else {
    484 		/*
    485 		 * If we are unbinding a CPU-bound set from a remote CPU, the
    486 		 * native CPU's idle thread could be in the midst of programming
    487 		 * this context onto the CPU. We grab the context's lock here to
    488 		 * ensure that the idle thread is done with it. When we release
    489 		 * the lock, the CPU no longer has a context and the idle thread
    490 		 * will move on.
    491 		 *
    492 		 * cpu_lock must be held to prevent the CPU from being DR'd out
    493 		 * while we disassociate the context from the cpu_t.
    494 		 */
    495 		cpu_t *cp;
    496 		mutex_enter(&cpu_lock);
    497 		cp = cpu_get(ctx->kc_cpuid);
    498 		if (cp != NULL) {
    499 			/*
    500 			 * The CPU may have been DR'd out of the system.
    501 			 */
    502 			mutex_enter(&cp->cpu_cpc_ctxlock);
    503 			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
    504 				kcpc_stop_hw(ctx);
    505 			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
    506 			cp->cpu_cpc_ctx = NULL;
    507 			mutex_exit(&cp->cpu_cpc_ctxlock);
    508 		}
    509 		mutex_exit(&cpu_lock);
    510 		if (ctx->kc_thread == curthread) {
    511 			kcpc_free(ctx, 0);
    512 			curthread->t_cpc_set = NULL;
    513 		}
    514 	}
    515 
    516 	return (0);
    517 }
    518 
    519 int
    520 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
    521 {
    522 	int i;
    523 
    524 	ASSERT(set != NULL);
    525 	ASSERT(set->ks_state & KCPC_SET_BOUND);
    526 	ASSERT(set->ks_ctx->kc_thread == curthread);
    527 	ASSERT(set->ks_ctx->kc_cpuid == -1);
    528 
    529 	if (index < 0 || index >= set->ks_nreqs)
    530 		return (EINVAL);
    531 
    532 	for (i = 0; i < set->ks_nreqs; i++)
    533 		if (set->ks_req[i].kr_index == index)
    534 			break;
    535 	ASSERT(i != set->ks_nreqs);
    536 
    537 	set->ks_req[i].kr_preset = preset;
    538 	return (0);
    539 }
    540 
    541 int
    542 kcpc_restart(kcpc_set_t *set)
    543 {
    544 	kcpc_ctx_t	*ctx = set->ks_ctx;
    545 	int		i;
    546 
    547 	ASSERT(set->ks_state & KCPC_SET_BOUND);
    548 	ASSERT(ctx->kc_thread == curthread);
    549 	ASSERT(ctx->kc_cpuid == -1);
    550 
    551 	kpreempt_disable();
    552 
    553 	/*
    554 	 * If the user is doing this on a running set, make sure the counters
    555 	 * are stopped first.
    556 	 */
    557 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
    558 		pcbe_ops->pcbe_allstop();
    559 
    560 	for (i = 0; i < set->ks_nreqs; i++) {
    561 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
    562 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
    563 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
    564 	}
    565 
    566 	/*
    567 	 * Ask the backend to program the hardware.
    568 	 */
    569 	ctx->kc_rawtick = KCPC_GET_TICK();
    570 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    571 	pcbe_ops->pcbe_program(ctx);
    572 	kpreempt_enable();
    573 
    574 	return (0);
    575 }
    576 
    577 /*
    578  * Caller must hold kcpc_cpuctx_lock.
    579  */
    580 int
    581 kcpc_enable(kthread_t *t, int cmd, int enable)
    582 {
    583 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
    584 	kcpc_set_t	*set = t->t_cpc_set;
    585 	kcpc_set_t	*newset;
    586 	int		i;
    587 	int		flag;
    588 	int		err;
    589 
    590 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
    591 
    592 	if (ctx == NULL) {
    593 		/*
    594 		 * This thread has a set but no context; it must be a
    595 		 * CPU-bound set.
    596 		 */
    597 		ASSERT(t->t_cpc_set != NULL);
    598 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
    599 		return (EINVAL);
    600 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
    601 		return (EAGAIN);
    602 
    603 	if (cmd == CPC_ENABLE) {
    604 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
    605 			return (EINVAL);
    606 		kpreempt_disable();
    607 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    608 		kcpc_restore(ctx);
    609 		kpreempt_enable();
    610 	} else if (cmd == CPC_DISABLE) {
    611 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
    612 			return (EINVAL);
    613 		kpreempt_disable();
    614 		kcpc_save(ctx);
    615 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
    616 		kpreempt_enable();
    617 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
    618 		/*
    619 		 * Strategy for usr/sys: stop counters and update set's presets
    620 		 * with current counter values, unbind, update requests with
    621 		 * new config, then re-bind.
    622 		 */
    623 		flag = (cmd == CPC_USR_EVENTS) ?
    624 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
    625 
    626 		kpreempt_disable();
    627 		atomic_or_uint(&ctx->kc_flags,
    628 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
    629 		pcbe_ops->pcbe_allstop();
    630 		kpreempt_enable();
    631 		for (i = 0; i < set->ks_nreqs; i++) {
    632 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
    633 			if (enable)
    634 				set->ks_req[i].kr_flags |= flag;
    635 			else
    636 				set->ks_req[i].kr_flags &= ~flag;
    637 		}
    638 		newset = kcpc_dup_set(set);
    639 		if (kcpc_unbind(set) != 0)
    640 			return (EINVAL);
    641 		t->t_cpc_set = newset;
    642 		if (kcpc_bind_thread(newset, t, &err) != 0) {
    643 			t->t_cpc_set = NULL;
    644 			kcpc_free_set(newset);
    645 			return (EINVAL);
    646 		}
    647 	} else
    648 		return (EINVAL);
    649 
    650 	return (0);
    651 }
    652 
    653 /*
    654  * Provide PCBEs with a way of obtaining the configs of every counter which will
    655  * be programmed together.
    656  *
    657  * If current is NULL, provide the first config.
    658  *
    659  * If data != NULL, caller wants to know where the data store associated with
    660  * the config we return is located.
    661  */
    662 void *
    663 kcpc_next_config(void *token, void *current, uint64_t **data)
    664 {
    665 	int		i;
    666 	kcpc_pic_t	*pic;
    667 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
    668 
    669 	if (current == NULL) {
    670 		/*
    671 		 * Client would like the first config, which may not be in
    672 		 * counter 0; we need to search through the counters for the
    673 		 * first config.
    674 		 */
    675 		for (i = 0; i < cpc_ncounters; i++)
    676 			if (ctx->kc_pics[i].kp_req != NULL)
    677 				break;
    678 		/*
    679 		 * There are no counters configured for the given context.
    680 		 */
    681 		if (i == cpc_ncounters)
    682 			return (NULL);
    683 	} else {
    684 		/*
    685 		 * There surely is a faster way to do this.
    686 		 */
    687 		for (i = 0; i < cpc_ncounters; i++) {
    688 			pic = &ctx->kc_pics[i];
    689 
    690 			if (pic->kp_req != NULL &&
    691 			    current == pic->kp_req->kr_config)
    692 				break;
    693 		}
    694 
    695 		/*
    696 		 * We found the current config at picnum i. Now search for the
    697 		 * next configured PIC.
    698 		 */
    699 		for (i++; i < cpc_ncounters; i++) {
    700 			pic = &ctx->kc_pics[i];
    701 			if (pic->kp_req != NULL)
    702 				break;
    703 		}
    704 
    705 		if (i == cpc_ncounters)
    706 			return (NULL);
    707 	}
    708 
    709 	if (data != NULL) {
    710 		*data = ctx->kc_pics[i].kp_req->kr_data;
    711 	}
    712 
    713 	return (ctx->kc_pics[i].kp_req->kr_config);
    714 }
    715 
    716 
    717 kcpc_ctx_t *
    718 kcpc_ctx_alloc(void)
    719 {
    720 	kcpc_ctx_t	*ctx;
    721 	long		hash;
    722 
    723 	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), KM_SLEEP);
    724 
    725 	hash = CPC_HASH_CTX(ctx);
    726 	mutex_enter(&kcpc_ctx_llock[hash]);
    727 	ctx->kc_next = kcpc_ctx_list[hash];
    728 	kcpc_ctx_list[hash] = ctx;
    729 	mutex_exit(&kcpc_ctx_llock[hash]);
    730 
    731 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
    732 	    cpc_ncounters, KM_SLEEP);
    733 
    734 	ctx->kc_cpuid = -1;
    735 
    736 	return (ctx);
    737 }
    738 
    739 /*
    740  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
    741  * in the flags.
    742  */
    743 static void
    744 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
    745 {
    746 	kcpc_set_t	*ks = ctx->kc_set, *cks;
    747 	int		i, j;
    748 	int		code;
    749 
    750 	ASSERT(ks != NULL);
    751 
    752 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
    753 		return;
    754 
    755 	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
    756 	cks->ks_state &= ~KCPC_SET_BOUND;
    757 	cctx->kc_set = cks;
    758 	cks->ks_flags = ks->ks_flags;
    759 	cks->ks_nreqs = ks->ks_nreqs;
    760 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
    761 	    sizeof (kcpc_request_t), KM_SLEEP);
    762 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
    763 	    KM_SLEEP);
    764 	cks->ks_ctx = cctx;
    765 
    766 	for (i = 0; i < cks->ks_nreqs; i++) {
    767 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
    768 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
    769 		(void) strncpy(cks->ks_req[i].kr_event,
    770 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
    771 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
    772 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
    773 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
    774 		if (ks->ks_req[i].kr_nattrs > 0) {
    775 			cks->ks_req[i].kr_attr =
    776 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
    777 			    sizeof (kcpc_attr_t), KM_SLEEP);
    778 		}
    779 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
    780 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
    781 			    ks->ks_req[i].kr_attr[j].ka_name,
    782 			    CPC_MAX_ATTR_LEN);
    783 			cks->ks_req[i].kr_attr[j].ka_val =
    784 			    ks->ks_req[i].kr_attr[j].ka_val;
    785 		}
    786 	}
    787 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
    788 		kcpc_invalidate_config(cctx);
    789 
    790 	mutex_enter(&cks->ks_lock);
    791 	cks->ks_state |= KCPC_SET_BOUND;
    792 	cv_signal(&cks->ks_condv);
    793 	mutex_exit(&cks->ks_lock);
    794 }
    795 
    796 
    797 void
    798 kcpc_ctx_free(kcpc_ctx_t *ctx)
    799 {
    800 	kcpc_ctx_t	**loc;
    801 	long		hash = CPC_HASH_CTX(ctx);
    802 
    803 	mutex_enter(&kcpc_ctx_llock[hash]);
    804 	loc = &kcpc_ctx_list[hash];
    805 	ASSERT(*loc != NULL);
    806 	while (*loc != ctx)
    807 		loc = &(*loc)->kc_next;
    808 	*loc = ctx->kc_next;
    809 	mutex_exit(&kcpc_ctx_llock[hash]);
    810 
    811 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
    812 	cv_destroy(&ctx->kc_condv);
    813 	mutex_destroy(&ctx->kc_lock);
    814 	kmem_free(ctx, sizeof (*ctx));
    815 }
    816 
    817 /*
    818  * Generic interrupt handler used on hardware that generates
    819  * overflow interrupts.
    820  *
    821  * Note: executed at high-level interrupt context!
    822  */
    823 /*ARGSUSED*/
    824 kcpc_ctx_t *
    825 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
    826 {
    827 	kcpc_ctx_t	*ctx;
    828 	kthread_t	*t = curthread;
    829 	int		i;
    830 
    831 	/*
    832 	 * On both x86 and UltraSPARC, we may deliver the high-level
    833 	 * interrupt in kernel mode, just after we've started to run an
    834 	 * interrupt thread.  (That's because the hardware helpfully
    835 	 * delivers the overflow interrupt some random number of cycles
    836 	 * after the instruction that caused the overflow by which time
    837 	 * we're in some part of the kernel, not necessarily running on
    838 	 * the right thread).
    839 	 *
    840 	 * Check for this case here -- find the pinned thread
    841 	 * that was running when the interrupt went off.
    842 	 */
    843 	if (t->t_flag & T_INTR_THREAD) {
    844 		klwp_t *lwp;
    845 
    846 		atomic_add_32(&kcpc_intrctx_count, 1);
    847 
    848 		/*
    849 		 * Note that t_lwp is always set to point at the underlying
    850 		 * thread, thus this will work in the presence of nested
    851 		 * interrupts.
    852 		 */
    853 		ctx = NULL;
    854 		if ((lwp = t->t_lwp) != NULL) {
    855 			t = lwptot(lwp);
    856 			ctx = t->t_cpc_ctx;
    857 		}
    858 	} else
    859 		ctx = t->t_cpc_ctx;
    860 
    861 	if (ctx == NULL) {
    862 		/*
    863 		 * This can easily happen if we're using the counters in
    864 		 * "shared" mode, for example, and an overflow interrupt
    865 		 * occurs while we are running cpustat.  In that case, the
    866 		 * bound thread that has the context that belongs to this
    867 		 * CPU is almost certainly sleeping (if it was running on
    868 		 * the CPU we'd have found it above), and the actual
    869 		 * interrupted thread has no knowledge of performance counters!
    870 		 */
    871 		ctx = curthread->t_cpu->cpu_cpc_ctx;
    872 		if (ctx != NULL) {
    873 			/*
    874 			 * Return the bound context for this CPU to
    875 			 * the interrupt handler so that it can synchronously
    876 			 * sample the hardware counters and restart them.
    877 			 */
    878 			return (ctx);
    879 		}
    880 
    881 		/*
    882 		 * As long as the overflow interrupt really is delivered early
    883 		 * enough after trapping into the kernel to avoid switching
    884 		 * threads, we must always be able to find the cpc context,
    885 		 * or something went terribly wrong i.e. we ended up
    886 		 * running a passivated interrupt thread, a kernel
    887 		 * thread or we interrupted idle, all of which are Very Bad.
    888 		 *
    889 		 * We also could end up here owing to an incredibly unlikely
    890 		 * race condition that exists on x86 based architectures when
    891 		 * the cpc provider is in use; overflow interrupts are directed
    892 		 * to the cpc provider if the 'dtrace_cpc_in_use' variable is
    893 		 * set when we enter the handler. This variable is unset after
    894 		 * overflow interrupts have been disabled on all CPUs and all
    895 		 * contexts have been torn down. To stop interrupts, the cpc
    896 		 * provider issues a xcall to the remote CPU before it tears
    897 		 * down that CPUs context. As high priority xcalls, on an x86
    898 		 * architecture, execute at a higher PIL than this handler, it
    899 		 * is possible (though extremely unlikely) that the xcall could
    900 		 * interrupt the overflow handler before the handler has
    901 		 * checked the 'dtrace_cpc_in_use' variable, stop the counters,
    902 		 * return to the cpc provider which could then rip down
    903 		 * contexts and unset 'dtrace_cpc_in_use' *before* the CPUs
    904 		 * overflow handler has had a chance to check the variable. In
    905 		 * that case, the handler would direct the overflow into this
    906 		 * code and no valid context will be found. The default behavior
    907 		 * when no valid context is found is now to shout a warning to
    908 		 * the console and bump the 'kcpc_nullctx_count' variable.
    909 		 */
    910 		if (kcpc_nullctx_panic)
    911 			panic("null cpc context, thread %p", (void *)t);
    912 
    913 		cmn_err(CE_WARN,
    914 		    "null cpc context found in overflow handler!\n");
    915 		atomic_add_32(&kcpc_nullctx_count, 1);
    916 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
    917 		/*
    918 		 * Schedule an ast to sample the counters, which will
    919 		 * propagate any overflow into the virtualized performance
    920 		 * counter(s), and may deliver a signal.
    921 		 */
    922 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
    923 		/*
    924 		 * If a counter has overflowed which was counting on behalf of
    925 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
    926 		 * process a signal.
    927 		 */
    928 		for (i = 0; i < cpc_ncounters; i++) {
    929 			if (ctx->kc_pics[i].kp_req != NULL &&
    930 			    bitmap & (1 << i) &&
    931 			    ctx->kc_pics[i].kp_req->kr_flags &
    932 			    CPC_OVF_NOTIFY_EMT) {
    933 				/*
    934 				 * A signal has been requested for this PIC, so
    935 				 * so freeze the context. The interrupt handler
    936 				 * has already stopped the counter hardware.
    937 				 */
    938 				atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
    939 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
    940 				    KCPC_PIC_OVERFLOWED);
    941 			}
    942 		}
    943 		aston(t);
    944 	}
    945 	return (NULL);
    946 }
    947 
    948 /*
    949  * The current thread context had an overflow interrupt; we're
    950  * executing here in high-level interrupt context.
    951  */
    952 /*ARGSUSED*/
    953 uint_t
    954 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
    955 {
    956 	kcpc_ctx_t *ctx;
    957 	uint64_t bitmap;
    958 	uint8_t *state;
    959 
    960 	if (pcbe_ops == NULL ||
    961 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
    962 		return (DDI_INTR_UNCLAIMED);
    963 
    964 	/*
    965 	 * Prevent any further interrupts.
    966 	 */
    967 	pcbe_ops->pcbe_allstop();
    968 
    969 	if (dtrace_cpc_in_use) {
    970 		state = &cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state;
    971 
    972 		/*
    973 		 * Set the per-CPU state bit to indicate that we are currently
    974 		 * processing an interrupt if it is currently free. Drop the
    975 		 * interrupt if the state isn't free (i.e. a configuration
    976 		 * event is taking place).
    977 		 */
    978 		if (atomic_cas_8(state, DCPC_INTR_FREE,
    979 		    DCPC_INTR_PROCESSING) == DCPC_INTR_FREE) {
    980 			int i;
    981 			kcpc_request_t req;
    982 
    983 			ASSERT(dtrace_cpc_fire != NULL);
    984 
    985 			(*dtrace_cpc_fire)(bitmap);
    986 
    987 			ctx = curthread->t_cpu->cpu_cpc_ctx;
    988 
    989 			/* Reset any counters that have overflowed */
    990 			for (i = 0; i < ctx->kc_set->ks_nreqs; i++) {
    991 				req = ctx->kc_set->ks_req[i];
    992 
    993 				if (bitmap & (1 << req.kr_picnum)) {
    994 					pcbe_ops->pcbe_configure(req.kr_picnum,
    995 					    req.kr_event, req.kr_preset,
    996 					    req.kr_flags, req.kr_nattrs,
    997 					    req.kr_attr, &(req.kr_config),
    998 					    (void *)ctx);
    999 				}
   1000 			}
   1001 			pcbe_ops->pcbe_program(ctx);
   1002 
   1003 			/*
   1004 			 * We've finished processing the interrupt so set
   1005 			 * the state back to free.
   1006 			 */
   1007 			cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state =
   1008 			    DCPC_INTR_FREE;
   1009 			membar_producer();
   1010 		}
   1011 		return (DDI_INTR_CLAIMED);
   1012 	}
   1013 
   1014 	/*
   1015 	 * DTrace isn't involved so pass on accordingly.
   1016 	 *
   1017 	 * If the interrupt has occurred in the context of an lwp owning
   1018 	 * the counters, then the handler posts an AST to the lwp to
   1019 	 * trigger the actual sampling, and optionally deliver a signal or
   1020 	 * restart the counters, on the way out of the kernel using
   1021 	 * kcpc_hw_overflow_ast() (see below).
   1022 	 *
   1023 	 * On the other hand, if the handler returns the context to us
   1024 	 * directly, then it means that there are no other threads in
   1025 	 * the middle of updating it, no AST has been posted, and so we
   1026 	 * should sample the counters here, and restart them with no
   1027 	 * further fuss.
   1028 	 */
   1029 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
   1030 		uint64_t curtick = KCPC_GET_TICK();
   1031 
   1032 		ctx->kc_hrtime = gethrtime_waitfree();
   1033 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
   1034 		ctx->kc_rawtick = curtick;
   1035 		pcbe_ops->pcbe_sample(ctx);
   1036 		pcbe_ops->pcbe_program(ctx);
   1037 	}
   1038 
   1039 	return (DDI_INTR_CLAIMED);
   1040 }
   1041 
   1042 /*
   1043  * Called from trap() when processing the ast posted by the high-level
   1044  * interrupt handler.
   1045  */
   1046 int
   1047 kcpc_overflow_ast()
   1048 {
   1049 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
   1050 	int		i;
   1051 	int		found = 0;
   1052 	uint64_t	curtick = KCPC_GET_TICK();
   1053 
   1054 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
   1055 
   1056 	/*
   1057 	 * An overflow happened: sample the context to ensure that
   1058 	 * the overflow is propagated into the upper bits of the
   1059 	 * virtualized 64-bit counter(s).
   1060 	 */
   1061 	kpreempt_disable();
   1062 	ctx->kc_hrtime = gethrtime_waitfree();
   1063 	pcbe_ops->pcbe_sample(ctx);
   1064 	kpreempt_enable();
   1065 
   1066 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
   1067 
   1068 	/*
   1069 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
   1070 	 * if that pic generated an overflow and if the request it was counting
   1071 	 * on behalf of had CPC_OVERFLOW_REQUEST specified. We go through all
   1072 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
   1073 	 * found any overflowed pics, keep the context frozen and return true
   1074 	 * (thus causing a signal to be sent).
   1075 	 */
   1076 	for (i = 0; i < cpc_ncounters; i++) {
   1077 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
   1078 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
   1079 			    ~KCPC_PIC_OVERFLOWED);
   1080 			found = 1;
   1081 		}
   1082 	}
   1083 	if (found)
   1084 		return (1);
   1085 
   1086 	/*
   1087 	 * Otherwise, re-enable the counters and continue life as before.
   1088 	 */
   1089 	kpreempt_disable();
   1090 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
   1091 	pcbe_ops->pcbe_program(ctx);
   1092 	kpreempt_enable();
   1093 	return (0);
   1094 }
   1095 
   1096 /*
   1097  * Called when switching away from current thread.
   1098  */
   1099 static void
   1100 kcpc_save(kcpc_ctx_t *ctx)
   1101 {
   1102 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
   1103 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)
   1104 			return;
   1105 		/*
   1106 		 * This context has been invalidated but the counters have not
   1107 		 * been stopped. Stop them here and mark the context stopped.
   1108 		 */
   1109 		pcbe_ops->pcbe_allstop();
   1110 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
   1111 		return;
   1112 	}
   1113 
   1114 	pcbe_ops->pcbe_allstop();
   1115 	if (ctx->kc_flags & KCPC_CTX_FREEZE)
   1116 		return;
   1117 
   1118 	/*
   1119 	 * Need to sample for all reqs into each req's current mpic.
   1120 	 */
   1121 	ctx->kc_hrtime = gethrtime();
   1122 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
   1123 	pcbe_ops->pcbe_sample(ctx);
   1124 }
   1125 
   1126 static void
   1127 kcpc_restore(kcpc_ctx_t *ctx)
   1128 {
   1129 	mutex_enter(&ctx->kc_lock);
   1130 	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
   1131 	    KCPC_CTX_INVALID)
   1132 		/*
   1133 		 * The context is invalidated but has not been marked stopped.
   1134 		 * We mark it as such here because we will not start the
   1135 		 * counters during this context switch.
   1136 		 */
   1137 		ctx->kc_flags |= KCPC_CTX_INVALID_STOPPED;
   1138 
   1139 
   1140 	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
   1141 		mutex_exit(&ctx->kc_lock);
   1142 		return;
   1143 	}
   1144 
   1145 	/*
   1146 	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
   1147 	 * ctx & set related memory objects being freed without us knowing.
   1148 	 * This can happen if an agent thread is executing a kcpc_unbind(),
   1149 	 * with this thread as the target, whilst we're concurrently doing a
   1150 	 * restorectx() during, for example, a proc_exit().  Effectively, by
   1151 	 * doing this, we're asking kcpc_free() to cv_wait() until
   1152 	 * kcpc_restore() has completed.
   1153 	 */
   1154 	ctx->kc_flags |= KCPC_CTX_RESTORE;
   1155 	mutex_exit(&ctx->kc_lock);
   1156 
   1157 	/*
   1158 	 * While programming the hardware, the counters should be stopped. We
   1159 	 * don't do an explicit pcbe_allstop() here because they should have
   1160 	 * been stopped already by the last consumer.
   1161 	 */
   1162 	ctx->kc_rawtick = KCPC_GET_TICK();
   1163 	pcbe_ops->pcbe_program(ctx);
   1164 
   1165 	/*
   1166 	 * Wake the agent thread if it's waiting in kcpc_free().
   1167 	 */
   1168 	mutex_enter(&ctx->kc_lock);
   1169 	ctx->kc_flags &= ~KCPC_CTX_RESTORE;
   1170 	cv_signal(&ctx->kc_condv);
   1171 	mutex_exit(&ctx->kc_lock);
   1172 }
   1173 
   1174 /*
   1175  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the the
   1176  * following context operators to the idle thread on each CPU. They stop the
   1177  * counters when the idle thread is switched on, and they start them again when
   1178  * it is switched off.
   1179  */
   1180 
   1181 /*ARGSUSED*/
   1182 void
   1183 kcpc_idle_save(struct cpu *cp)
   1184 {
   1185 	/*
   1186 	 * The idle thread shouldn't be run anywhere else.
   1187 	 */
   1188 	ASSERT(CPU == cp);
   1189 
   1190 	/*
   1191 	 * We must hold the CPU's context lock to ensure the context isn't freed
   1192 	 * while we're looking at it.
   1193 	 */
   1194 	mutex_enter(&cp->cpu_cpc_ctxlock);
   1195 
   1196 	if ((cp->cpu_cpc_ctx == NULL) ||
   1197 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
   1198 		mutex_exit(&cp->cpu_cpc_ctxlock);
   1199 		return;
   1200 	}
   1201 
   1202 	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
   1203 	mutex_exit(&cp->cpu_cpc_ctxlock);
   1204 }
   1205 
   1206 void
   1207 kcpc_idle_restore(struct cpu *cp)
   1208 {
   1209 	/*
   1210 	 * The idle thread shouldn't be run anywhere else.
   1211 	 */
   1212 	ASSERT(CPU == cp);
   1213 
   1214 	/*
   1215 	 * We must hold the CPU's context lock to ensure the context isn't freed
   1216 	 * while we're looking at it.
   1217 	 */
   1218 	mutex_enter(&cp->cpu_cpc_ctxlock);
   1219 
   1220 	if ((cp->cpu_cpc_ctx == NULL) ||
   1221 	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
   1222 		mutex_exit(&cp->cpu_cpc_ctxlock);
   1223 		return;
   1224 	}
   1225 
   1226 	pcbe_ops->pcbe_allstop();
   1227 	mutex_exit(&cp->cpu_cpc_ctxlock);
   1228 }
   1229 
   1230 /*ARGSUSED*/
   1231 static void
   1232 kcpc_lwp_create(kthread_t *t, kthread_t *ct)
   1233 {
   1234 	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
   1235 	int		i;
   1236 
   1237 	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
   1238 		return;
   1239 
   1240 	rw_enter(&kcpc_cpuctx_lock, RW_READER);
   1241 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
   1242 		rw_exit(&kcpc_cpuctx_lock);
   1243 		return;
   1244 	}
   1245 	cctx = kcpc_ctx_alloc();
   1246 	kcpc_ctx_clone(ctx, cctx);
   1247 	rw_exit(&kcpc_cpuctx_lock);
   1248 
   1249 	/*
   1250 	 * Copy the parent context's kc_flags field, but don't overwrite
   1251 	 * the child's in case it was modified during kcpc_ctx_clone.
   1252 	 */
   1253 	cctx->kc_flags |= ctx->kc_flags;
   1254 	cctx->kc_thread = ct;
   1255 	cctx->kc_cpuid = -1;
   1256 	ct->t_cpc_set = cctx->kc_set;
   1257 	ct->t_cpc_ctx = cctx;
   1258 
   1259 	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
   1260 		kcpc_set_t *ks = cctx->kc_set;
   1261 		/*
   1262 		 * Our contract with the user requires us to immediately send an
   1263 		 * overflow signal to all children if we have the LWPINHERIT
   1264 		 * and SIGOVF flags set. In addition, all counters should be
   1265 		 * set to UINT64_MAX, and their pic's overflow flag turned on
   1266 		 * so that our trap() processing knows to send a signal.
   1267 		 */
   1268 		atomic_or_uint(&cctx->kc_flags, KCPC_CTX_FREEZE);
   1269 		for (i = 0; i < ks->ks_nreqs; i++) {
   1270 			kcpc_request_t *kr = &ks->ks_req[i];
   1271 
   1272 			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
   1273 				*(kr->kr_data) = UINT64_MAX;
   1274 				kr->kr_picp->kp_flags |= KCPC_PIC_OVERFLOWED;
   1275 			}
   1276 		}
   1277 		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
   1278 		aston(ct);
   1279 	}
   1280 
   1281 	installctx(ct, cctx, kcpc_save, kcpc_restore,
   1282 	    NULL, kcpc_lwp_create, NULL, kcpc_free);
   1283 }
   1284 
   1285 /*
   1286  * Counter Stoppage Theory
   1287  *
   1288  * The counters may need to be stopped properly at the following occasions:
   1289  *
   1290  * 1) An LWP exits.
   1291  * 2) A thread exits.
   1292  * 3) An LWP performs an exec().
   1293  * 4) A bound set is unbound.
   1294  *
   1295  * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
   1296  * to be freed as well.
   1297  *
   1298  * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
   1299  * when the thread is freed, kcpc_free(), called by freectx(), frees the
   1300  * context.
   1301  *
   1302  * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
   1303  *
   1304  * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
   1305  * been called from exec. It stops the counters _and_ frees the context.
   1306  *
   1307  * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
   1308  *
   1309  * CPU-bound counters are always stopped via kcpc_unbind().
   1310  */
   1311 
   1312 /*
   1313  * We're being called to delete the context; we ensure that all associated data
   1314  * structures are freed, and that the hardware is passivated if this is an exec.
   1315  */
   1316 
   1317 /*ARGSUSED*/
   1318 static void
   1319 kcpc_free(kcpc_ctx_t *ctx, int isexec)
   1320 {
   1321 	int		i;
   1322 	kcpc_set_t	*set = ctx->kc_set;
   1323 
   1324 	ASSERT(set != NULL);
   1325 
   1326 	/*
   1327 	 * Wait for kcpc_restore() to finish before we tear things down.
   1328 	 */
   1329 	mutex_enter(&ctx->kc_lock);
   1330 	while (ctx->kc_flags & KCPC_CTX_RESTORE)
   1331 		cv_wait(&ctx->kc_condv, &ctx->kc_lock);
   1332 	ctx->kc_flags |= KCPC_CTX_INVALID;
   1333 	mutex_exit(&ctx->kc_lock);
   1334 
   1335 	if (isexec) {
   1336 		/*
   1337 		 * This thread is execing, and after the exec it should not have
   1338 		 * any performance counter context. Stop the counters properly
   1339 		 * here so the system isn't surprised by an overflow interrupt
   1340 		 * later.
   1341 		 */
   1342 		if (ctx->kc_cpuid != -1) {
   1343 			cpu_t *cp;
   1344 			/*
   1345 			 * CPU-bound context; stop the appropriate CPU's ctrs.
   1346 			 * Hold cpu_lock while examining the CPU to ensure it
   1347 			 * doesn't go away.
   1348 			 */
   1349 			mutex_enter(&cpu_lock);
   1350 			cp = cpu_get(ctx->kc_cpuid);
   1351 			/*
   1352 			 * The CPU could have been DR'd out, so only stop the
   1353 			 * CPU and clear its context pointer if the CPU still
   1354 			 * exists.
   1355 			 */
   1356 			if (cp != NULL) {
   1357 				mutex_enter(&cp->cpu_cpc_ctxlock);
   1358 				kcpc_stop_hw(ctx);
   1359 				cp->cpu_cpc_ctx = NULL;
   1360 				mutex_exit(&cp->cpu_cpc_ctxlock);
   1361 			}
   1362 			mutex_exit(&cpu_lock);
   1363 			ASSERT(curthread->t_cpc_ctx == NULL);
   1364 		} else {
   1365 			/*
   1366 			 * Thread-bound context; stop _this_ CPU's counters.
   1367 			 */
   1368 			kpreempt_disable();
   1369 			pcbe_ops->pcbe_allstop();
   1370 			atomic_or_uint(&ctx->kc_flags,
   1371 			    KCPC_CTX_INVALID_STOPPED);
   1372 			kpreempt_enable();
   1373 			curthread->t_cpc_ctx = NULL;
   1374 		}
   1375 
   1376 		/*
   1377 		 * Since we are being called from an exec and we know that
   1378 		 * exec is not permitted via the agent thread, we should clean
   1379 		 * up this thread's CPC state completely, and not leave dangling
   1380 		 * CPC pointers behind.
   1381 		 */
   1382 		ASSERT(ctx->kc_thread == curthread);
   1383 		curthread->t_cpc_set = NULL;
   1384 	}
   1385 
   1386 	/*
   1387 	 * Walk through each request in this context's set and free the PCBE's
   1388 	 * configuration if it exists.
   1389 	 */
   1390 	for (i = 0; i < set->ks_nreqs; i++) {
   1391 		if (set->ks_req[i].kr_config != NULL)
   1392 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
   1393 	}
   1394 
   1395 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
   1396 	kcpc_ctx_free(ctx);
   1397 	kcpc_free_set(set);
   1398 }
   1399 
   1400 /*
   1401  * Free the memory associated with a request set.
   1402  */
   1403 void
   1404 kcpc_free_set(kcpc_set_t *set)
   1405 {
   1406 	int		i;
   1407 	kcpc_request_t	*req;
   1408 
   1409 	ASSERT(set->ks_req != NULL);
   1410 
   1411 	for (i = 0; i < set->ks_nreqs; i++) {
   1412 		req = &set->ks_req[i];
   1413 
   1414 		if (req->kr_nattrs != 0) {
   1415 			kmem_free(req->kr_attr,
   1416 			    req->kr_nattrs * sizeof (kcpc_attr_t));
   1417 		}
   1418 	}
   1419 
   1420 	kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
   1421 	cv_destroy(&set->ks_condv);
   1422 	mutex_destroy(&set->ks_lock);
   1423 	kmem_free(set, sizeof (kcpc_set_t));
   1424 }
   1425 
   1426 /*
   1427  * Grab every existing context and mark it as invalid.
   1428  */
   1429 void
   1430 kcpc_invalidate_all(void)
   1431 {
   1432 	kcpc_ctx_t *ctx;
   1433 	long hash;
   1434 
   1435 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
   1436 		mutex_enter(&kcpc_ctx_llock[hash]);
   1437 		for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
   1438 			atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
   1439 		mutex_exit(&kcpc_ctx_llock[hash]);
   1440 	}
   1441 }
   1442 
   1443 /*
   1444  * Interface for PCBEs to signal that an existing configuration has suddenly
   1445  * become invalid.
   1446  */
   1447 void
   1448 kcpc_invalidate_config(void *token)
   1449 {
   1450 	kcpc_ctx_t *ctx = token;
   1451 
   1452 	ASSERT(ctx != NULL);
   1453 
   1454 	atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
   1455 }
   1456 
   1457 /*
   1458  * Called from lwp_exit() and thread_exit()
   1459  */
   1460 void
   1461 kcpc_passivate(void)
   1462 {
   1463 	kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
   1464 	kcpc_set_t *set = curthread->t_cpc_set;
   1465 
   1466 	if (set == NULL)
   1467 		return;
   1468 
   1469 	/*
   1470 	 * We're cleaning up after this thread; ensure there are no dangling
   1471 	 * CPC pointers left behind. The context and set will be freed by
   1472 	 * freectx() in the case of an LWP-bound set, and by kcpc_unbind() in
   1473 	 * the case of a CPU-bound set.
   1474 	 */
   1475 	curthread->t_cpc_ctx = NULL;
   1476 
   1477 	if (ctx == NULL) {
   1478 		/*
   1479 		 * This thread has a set but no context; it must be a CPU-bound
   1480 		 * set. The hardware will be stopped via kcpc_unbind() when the
   1481 		 * process exits and closes its file descriptors with
   1482 		 * kcpc_close(). Our only job here is to clean up this thread's
   1483 		 * state; the set will be freed with the unbind().
   1484 		 */
   1485 		(void) kcpc_unbind(set);
   1486 		/*
   1487 		 * Unbinding a set belonging to the current thread should clear
   1488 		 * its set pointer.
   1489 		 */
   1490 		ASSERT(curthread->t_cpc_set == NULL);
   1491 		return;
   1492 	}
   1493 
   1494 	curthread->t_cpc_set = NULL;
   1495 
   1496 	/*
   1497 	 * This thread/LWP is exiting but context switches will continue to
   1498 	 * happen for a bit as the exit proceeds.  Kernel preemption must be
   1499 	 * disabled here to prevent a race between checking or setting the
   1500 	 * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
   1501 	 * a context switch.
   1502 	 */
   1503 
   1504 	kpreempt_disable();
   1505 	if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
   1506 		pcbe_ops->pcbe_allstop();
   1507 		atomic_or_uint(&ctx->kc_flags,
   1508 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
   1509 	}
   1510 	kpreempt_enable();
   1511 }
   1512 
   1513 /*
   1514  * Assign the requests in the given set to the PICs in the context.
   1515  * Returns 0 if successful, -1 on failure.
   1516  */
   1517 /*ARGSUSED*/
   1518 int
   1519 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
   1520 {
   1521 	int i;
   1522 	int *picnum_save;
   1523 
   1524 	ASSERT(set->ks_nreqs <= cpc_ncounters);
   1525 
   1526 	/*
   1527 	 * Provide kcpc_tryassign() with scratch space to avoid doing an
   1528 	 * alloc/free with every invocation.
   1529 	 */
   1530 	picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
   1531 	/*
   1532 	 * kcpc_tryassign() blindly walks through each request in the set,
   1533 	 * seeing if a counter can count its event. If yes, it assigns that
   1534 	 * counter. However, that counter may have been the only capable counter
   1535 	 * for _another_ request's event. The solution is to try every possible
   1536 	 * request first. Note that this does not cover all solutions, as
   1537 	 * that would require all unique orderings of requests, an n^n operation
   1538 	 * which would be unacceptable for architectures with many counters.
   1539 	 */
   1540 	for (i = 0; i < set->ks_nreqs; i++)
   1541 		if (kcpc_tryassign(set, i, picnum_save) == 0)
   1542 			break;
   1543 
   1544 	kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
   1545 	if (i == set->ks_nreqs)
   1546 		return (-1);
   1547 	return (0);
   1548 }
   1549 
   1550 static int
   1551 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
   1552 {
   1553 	int		i;
   1554 	int		j;
   1555 	uint64_t	bitmap = 0, resmap = 0;
   1556 	uint64_t	ctrmap;
   1557 
   1558 	/*
   1559 	 * We are attempting to assign the reqs to pics, but we may fail. If we
   1560 	 * fail, we need to restore the state of the requests to what it was
   1561 	 * when we found it, as some reqs may have been explicitly assigned to
   1562 	 * a specific PIC beforehand. We do this by snapshotting the assignments
   1563 	 * now and restoring from it later if we fail.
   1564 	 *
   1565 	 * Also we note here which counters have already been claimed by
   1566 	 * requests with explicit counter assignments.
   1567 	 */
   1568 	for (i = 0; i < set->ks_nreqs; i++) {
   1569 		scratch[i] = set->ks_req[i].kr_picnum;
   1570 		if (set->ks_req[i].kr_picnum != -1)
   1571 			resmap |= (1 << set->ks_req[i].kr_picnum);
   1572 	}
   1573 
   1574 	/*
   1575 	 * Walk through requests assigning them to the first PIC that is
   1576 	 * capable.
   1577 	 */
   1578 	i = starting_req;
   1579 	do {
   1580 		if (set->ks_req[i].kr_picnum != -1) {
   1581 			ASSERT((bitmap & (1 << set->ks_req[i].kr_picnum)) == 0);
   1582 			bitmap |= (1 << set->ks_req[i].kr_picnum);
   1583 			if (++i == set->ks_nreqs)
   1584 				i = 0;
   1585 			continue;
   1586 		}
   1587 
   1588 		ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
   1589 		for (j = 0; j < cpc_ncounters; j++) {
   1590 			if (ctrmap & (1 << j) && (bitmap & (1 << j)) == 0 &&
   1591 			    (resmap & (1 << j)) == 0) {
   1592 				/*
   1593 				 * We can assign this counter because:
   1594 				 *
   1595 				 * 1. It can count the event (ctrmap)
   1596 				 * 2. It hasn't been assigned yet (bitmap)
   1597 				 * 3. It wasn't reserved by a request (resmap)
   1598 				 */
   1599 				bitmap |= (1 << j);
   1600 				break;
   1601 			}
   1602 		}
   1603 		if (j == cpc_ncounters) {
   1604 			for (i = 0; i < set->ks_nreqs; i++)
   1605 				set->ks_req[i].kr_picnum = scratch[i];
   1606 			return (-1);
   1607 		}
   1608 		set->ks_req[i].kr_picnum = j;
   1609 
   1610 		if (++i == set->ks_nreqs)
   1611 			i = 0;
   1612 	} while (i != starting_req);
   1613 
   1614 	return (0);
   1615 }
   1616 
   1617 kcpc_set_t *
   1618 kcpc_dup_set(kcpc_set_t *set)
   1619 {
   1620 	kcpc_set_t	*new;
   1621 	int		i;
   1622 	int		j;
   1623 
   1624 	new = kmem_zalloc(sizeof (*new), KM_SLEEP);
   1625 	new->ks_state &= ~KCPC_SET_BOUND;
   1626 	new->ks_flags = set->ks_flags;
   1627 	new->ks_nreqs = set->ks_nreqs;
   1628 	new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
   1629 	    KM_SLEEP);
   1630 	new->ks_data = NULL;
   1631 	new->ks_ctx = NULL;
   1632 
   1633 	for (i = 0; i < new->ks_nreqs; i++) {
   1634 		new->ks_req[i].kr_config = NULL;
   1635 		new->ks_req[i].kr_index = set->ks_req[i].kr_index;
   1636 		new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
   1637 		new->ks_req[i].kr_picp = NULL;
   1638 		new->ks_req[i].kr_data = NULL;
   1639 		(void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
   1640 		    CPC_MAX_EVENT_LEN);
   1641 		new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
   1642 		new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
   1643 		new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
   1644 		new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
   1645 		    sizeof (kcpc_attr_t), KM_SLEEP);
   1646 		for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
   1647 			new->ks_req[i].kr_attr[j].ka_val =
   1648 			    set->ks_req[i].kr_attr[j].ka_val;
   1649 			(void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
   1650 			    set->ks_req[i].kr_attr[j].ka_name,
   1651 			    CPC_MAX_ATTR_LEN);
   1652 		}
   1653 	}
   1654 
   1655 	return (new);
   1656 }
   1657 
   1658 int
   1659 kcpc_allow_nonpriv(void *token)
   1660 {
   1661 	return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
   1662 }
   1663 
   1664 void
   1665 kcpc_invalidate(kthread_t *t)
   1666 {
   1667 	kcpc_ctx_t *ctx = t->t_cpc_ctx;
   1668 
   1669 	if (ctx != NULL)
   1670 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID);
   1671 }
   1672 
   1673 /*
   1674  * Given a PCBE ID, attempt to load a matching PCBE module. The strings given
   1675  * are used to construct PCBE names, starting with the most specific,
   1676  * "pcbe.first.second.third.fourth" and ending with the least specific,
   1677  * "pcbe.first".
   1678  *
   1679  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
   1680  */
   1681 int
   1682 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
   1683 {
   1684 	uint_t s[3];
   1685 
   1686 	s[0] = first;
   1687 	s[1] = second;
   1688 	s[2] = third;
   1689 
   1690 	return (modload_qualified("pcbe",
   1691 	    "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0);
   1692 }
   1693 
   1694 char *
   1695 kcpc_list_attrs(void)
   1696 {
   1697 	ASSERT(pcbe_ops != NULL);
   1698 
   1699 	return (pcbe_ops->pcbe_list_attrs());
   1700 }
   1701 
   1702 char *
   1703 kcpc_list_events(uint_t pic)
   1704 {
   1705 	ASSERT(pcbe_ops != NULL);
   1706 
   1707 	return (pcbe_ops->pcbe_list_events(pic));
   1708 }
   1709 
   1710 uint_t
   1711 kcpc_pcbe_capabilities(void)
   1712 {
   1713 	ASSERT(pcbe_ops != NULL);
   1714 
   1715 	return (pcbe_ops->pcbe_caps);
   1716 }
   1717 
   1718 int
   1719 kcpc_pcbe_loaded(void)
   1720 {
   1721 	return (pcbe_ops == NULL ? -1 : 0);
   1722 }
   1723