Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include <sys/param.h>
     28 #include <sys/thread.h>
     29 #include <sys/cpuvar.h>
     30 #include <sys/inttypes.h>
     31 #include <sys/cmn_err.h>
     32 #include <sys/time.h>
     33 #include <sys/ksynch.h>
     34 #include <sys/systm.h>
     35 #include <sys/kcpc.h>
     36 #include <sys/cpc_impl.h>
     37 #include <sys/cpc_pcbe.h>
     38 #include <sys/atomic.h>
     39 #include <sys/sunddi.h>
     40 #include <sys/modctl.h>
     41 #include <sys/sdt.h>
     42 #if defined(__x86)
     43 #include <asm/clock.h>
     44 #endif
     45 
     46 kmutex_t	kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
     47 kcpc_ctx_t	*kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */
     48 
     49 
     50 krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
     51 int		kcpc_cpuctx;		/* number of cpu-specific contexts */
     52 
     53 int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */
     54 
     55 /*
     56  * These are set when a PCBE module is loaded.
     57  */
     58 uint_t		cpc_ncounters = 0;
     59 pcbe_ops_t	*pcbe_ops = NULL;
     60 
     61 /*
     62  * Statistics on (mis)behavior
     63  */
     64 static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
     65 static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */
     66 
     67 /*
     68  * Is misbehaviour (overflow in a thread with no context) fatal?
     69  */
     70 #ifdef DEBUG
     71 static int kcpc_nullctx_panic = 1;
     72 #else
     73 static int kcpc_nullctx_panic = 0;
     74 #endif
     75 
     76 static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
     77 static void kcpc_restore(kcpc_ctx_t *ctx);
     78 static void kcpc_save(kcpc_ctx_t *ctx);
     79 static void kcpc_free(kcpc_ctx_t *ctx, int isexec);
     80 static int kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode);
     81 static void kcpc_free_configs(kcpc_set_t *set);
     82 static kcpc_ctx_t *kcpc_ctx_alloc(void);
     83 static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
     84 static void kcpc_ctx_free(kcpc_ctx_t *ctx);
     85 static int kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx);
     86 static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
     87 static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
     88 
     89 void
     90 kcpc_register_pcbe(pcbe_ops_t *ops)
     91 {
     92 	pcbe_ops = ops;
     93 	cpc_ncounters = pcbe_ops->pcbe_ncounters();
     94 }
     95 
     96 int
     97 kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
     98 {
     99 	cpu_t		*cp;
    100 	kcpc_ctx_t	*ctx;
    101 	int		error;
    102 
    103 	ctx = kcpc_ctx_alloc();
    104 
    105 	if (kcpc_assign_reqs(set, ctx) != 0) {
    106 		kcpc_ctx_free(ctx);
    107 		*subcode = CPC_RESOURCE_UNAVAIL;
    108 		return (EINVAL);
    109 	}
    110 
    111 	ctx->kc_cpuid = cpuid;
    112 	ctx->kc_thread = curthread;
    113 
    114 	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
    115 
    116 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
    117 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
    118 		kcpc_ctx_free(ctx);
    119 		return (error);
    120 	}
    121 
    122 	set->ks_ctx = ctx;
    123 	ctx->kc_set = set;
    124 
    125 	/*
    126 	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
    127 	 * we are manipulating the cpu_t and programming the hardware, else the
    128 	 * the cpu_t could go away while we're looking at it.
    129 	 */
    130 	mutex_enter(&cpu_lock);
    131 	cp = cpu_get(cpuid);
    132 
    133 	if (cp == NULL)
    134 		/*
    135 		 * The CPU could have been DRd out while we were getting set up.
    136 		 */
    137 		goto unbound;
    138 
    139 	mutex_enter(&cp->cpu_cpc_ctxlock);
    140 
    141 	if (cp->cpu_cpc_ctx != NULL) {
    142 		/*
    143 		 * If this CPU already has a bound set, return an error.
    144 		 */
    145 		mutex_exit(&cp->cpu_cpc_ctxlock);
    146 		goto unbound;
    147 	}
    148 
    149 	if (curthread->t_bind_cpu != cpuid) {
    150 		mutex_exit(&cp->cpu_cpc_ctxlock);
    151 		goto unbound;
    152 	}
    153 	cp->cpu_cpc_ctx = ctx;
    154 
    155 	/*
    156 	 * Kernel preemption must be disabled while fiddling with the hardware
    157 	 * registers to prevent partial updates.
    158 	 */
    159 	kpreempt_disable();
    160 	ctx->kc_rawtick = KCPC_GET_TICK();
    161 	pcbe_ops->pcbe_program(ctx);
    162 	kpreempt_enable();
    163 
    164 	mutex_exit(&cp->cpu_cpc_ctxlock);
    165 	mutex_exit(&cpu_lock);
    166 
    167 	mutex_enter(&set->ks_lock);
    168 	set->ks_state |= KCPC_SET_BOUND;
    169 	cv_signal(&set->ks_condv);
    170 	mutex_exit(&set->ks_lock);
    171 
    172 	return (0);
    173 
    174 unbound:
    175 	mutex_exit(&cpu_lock);
    176 	set->ks_ctx = NULL;
    177 	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
    178 	kcpc_ctx_free(ctx);
    179 	return (EAGAIN);
    180 }
    181 
    182 int
    183 kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
    184 {
    185 	kcpc_ctx_t	*ctx;
    186 	int		error;
    187 
    188 	/*
    189 	 * Only one set is allowed per context, so ensure there is no
    190 	 * existing context.
    191 	 */
    192 
    193 	if (t->t_cpc_ctx != NULL)
    194 		return (EEXIST);
    195 
    196 	ctx = kcpc_ctx_alloc();
    197 
    198 	/*
    199 	 * The context must begin life frozen until it has been properly
    200 	 * programmed onto the hardware. This prevents the context ops from
    201 	 * worrying about it until we're ready.
    202 	 */
    203 	ctx->kc_flags |= KCPC_CTX_FREEZE;
    204 	ctx->kc_hrtime = gethrtime();
    205 
    206 	if (kcpc_assign_reqs(set, ctx) != 0) {
    207 		kcpc_ctx_free(ctx);
    208 		*subcode = CPC_RESOURCE_UNAVAIL;
    209 		return (EINVAL);
    210 	}
    211 
    212 	ctx->kc_cpuid = -1;
    213 	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
    214 		ctx->kc_flags |= KCPC_CTX_LWPINHERIT;
    215 	ctx->kc_thread = t;
    216 	t->t_cpc_ctx = ctx;
    217 	/*
    218 	 * Permit threads to look at their own hardware counters from userland.
    219 	 */
    220 	ctx->kc_flags |= KCPC_CTX_NONPRIV;
    221 
    222 	/*
    223 	 * Create the data store for this set.
    224 	 */
    225 	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);
    226 
    227 	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
    228 		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
    229 		kcpc_ctx_free(ctx);
    230 		t->t_cpc_ctx = NULL;
    231 		return (error);
    232 	}
    233 
    234 	set->ks_ctx = ctx;
    235 	ctx->kc_set = set;
    236 
    237 	/*
    238 	 * Add a device context to the subject thread.
    239 	 */
    240 	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
    241 	    kcpc_lwp_create, NULL, kcpc_free);
    242 
    243 	/*
    244 	 * Ask the backend to program the hardware.
    245 	 */
    246 	if (t == curthread) {
    247 		kpreempt_disable();
    248 		ctx->kc_rawtick = KCPC_GET_TICK();
    249 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    250 		pcbe_ops->pcbe_program(ctx);
    251 		kpreempt_enable();
    252 	} else
    253 		/*
    254 		 * Since we are the agent LWP, we know the victim LWP is stopped
    255 		 * until we're done here; no need to worry about preemption or
    256 		 * migration here. We still use an atomic op to clear the flag
    257 		 * to ensure the flags are always self-consistent; they can
    258 		 * still be accessed from, for instance, another CPU doing a
    259 		 * kcpc_invalidate_all().
    260 		 */
    261 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    262 
    263 	mutex_enter(&set->ks_lock);
    264 	set->ks_state |= KCPC_SET_BOUND;
    265 	cv_signal(&set->ks_condv);
    266 	mutex_exit(&set->ks_lock);
    267 
    268 	return (0);
    269 }
    270 
    271 /*
    272  * Walk through each request in the set and ask the PCBE to configure a
    273  * corresponding counter.
    274  */
    275 static int
    276 kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
    277 {
    278 	int		i;
    279 	int		ret;
    280 	kcpc_request_t	*rp;
    281 
    282 	for (i = 0; i < set->ks_nreqs; i++) {
    283 		int n;
    284 		rp = &set->ks_req[i];
    285 
    286 		n = rp->kr_picnum;
    287 
    288 		ASSERT(n >= 0 && n < cpc_ncounters);
    289 
    290 		ASSERT(ctx->kc_pics[n].kp_req == NULL);
    291 
    292 		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
    293 			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
    294 			    == 0) {
    295 				*subcode = -1;
    296 				return (ENOTSUP);
    297 			}
    298 			/*
    299 			 * If any of the counters have requested overflow
    300 			 * notification, we flag the context as being one that
    301 			 * cares about overflow.
    302 			 */
    303 			ctx->kc_flags |= KCPC_CTX_SIGOVF;
    304 		}
    305 
    306 		rp->kr_config = NULL;
    307 		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
    308 		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
    309 		    &(rp->kr_config), (void *)ctx)) != 0) {
    310 			kcpc_free_configs(set);
    311 			*subcode = ret;
    312 			switch (ret) {
    313 			case CPC_ATTR_REQUIRES_PRIVILEGE:
    314 			case CPC_HV_NO_ACCESS:
    315 				return (EACCES);
    316 			default:
    317 				return (EINVAL);
    318 			}
    319 		}
    320 
    321 		ctx->kc_pics[n].kp_req = rp;
    322 		rp->kr_picp = &ctx->kc_pics[n];
    323 		rp->kr_data = set->ks_data + rp->kr_index;
    324 		*rp->kr_data = rp->kr_preset;
    325 	}
    326 
    327 	return (0);
    328 }
    329 
    330 static void
    331 kcpc_free_configs(kcpc_set_t *set)
    332 {
    333 	int i;
    334 
    335 	for (i = 0; i < set->ks_nreqs; i++)
    336 		if (set->ks_req[i].kr_config != NULL)
    337 			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
    338 }
    339 
    340 /*
    341  * buf points to a user address and the data should be copied out to that
    342  * address in the current process.
    343  */
    344 int
    345 kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
    346 {
    347 	kcpc_ctx_t	*ctx = set->ks_ctx;
    348 	uint64_t	curtick = KCPC_GET_TICK();
    349 
    350 	mutex_enter(&set->ks_lock);
    351 	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
    352 		mutex_exit(&set->ks_lock);
    353 		return (EINVAL);
    354 	}
    355 	mutex_exit(&set->ks_lock);
    356 
    357 	if (ctx->kc_flags & KCPC_CTX_INVALID)
    358 		return (EAGAIN);
    359 
    360 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
    361 		/*
    362 		 * Kernel preemption must be disabled while reading the
    363 		 * hardware regs, and if this is a CPU-bound context, while
    364 		 * checking the CPU binding of the current thread.
    365 		 */
    366 		kpreempt_disable();
    367 
    368 		if (ctx->kc_cpuid != -1) {
    369 			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
    370 				kpreempt_enable();
    371 				return (EAGAIN);
    372 			}
    373 		}
    374 
    375 		if (ctx->kc_thread == curthread) {
    376 			ctx->kc_hrtime = gethrtime();
    377 			pcbe_ops->pcbe_sample(ctx);
    378 			ctx->kc_vtick += curtick - ctx->kc_rawtick;
    379 			ctx->kc_rawtick = curtick;
    380 		}
    381 
    382 		kpreempt_enable();
    383 
    384 		/*
    385 		 * The config may have been invalidated by
    386 		 * the pcbe_sample op.
    387 		 */
    388 		if (ctx->kc_flags & KCPC_CTX_INVALID)
    389 			return (EAGAIN);
    390 	}
    391 
    392 	if (copyout(set->ks_data, buf,
    393 	    set->ks_nreqs * sizeof (uint64_t)) == -1)
    394 		return (EFAULT);
    395 	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
    396 		return (EFAULT);
    397 	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
    398 		return (EFAULT);
    399 
    400 	return (0);
    401 }
    402 
    403 /*
    404  * Stop the counters on the CPU this context is bound to.
    405  */
    406 static void
    407 kcpc_stop_hw(kcpc_ctx_t *ctx)
    408 {
    409 	cpu_t *cp;
    410 
    411 	ASSERT((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED))
    412 	    == KCPC_CTX_INVALID);
    413 
    414 	kpreempt_disable();
    415 
    416 	cp = cpu_get(ctx->kc_cpuid);
    417 	ASSERT(cp != NULL);
    418 
    419 	if (cp == CPU) {
    420 		pcbe_ops->pcbe_allstop();
    421 		atomic_or_uint(&ctx->kc_flags,
    422 		    KCPC_CTX_INVALID_STOPPED);
    423 	} else
    424 		kcpc_remote_stop(cp);
    425 	kpreempt_enable();
    426 }
    427 
    428 int
    429 kcpc_unbind(kcpc_set_t *set)
    430 {
    431 	kcpc_ctx_t	*ctx;
    432 	kthread_t	*t;
    433 
    434 	/*
    435 	 * We could be racing with the process's agent thread as it
    436 	 * binds the set; we must wait for the set to finish binding
    437 	 * before attempting to tear it down.
    438 	 */
    439 	mutex_enter(&set->ks_lock);
    440 	while ((set->ks_state & KCPC_SET_BOUND) == 0)
    441 		cv_wait(&set->ks_condv, &set->ks_lock);
    442 	mutex_exit(&set->ks_lock);
    443 
    444 	ctx = set->ks_ctx;
    445 
    446 	/*
    447 	 * Use kc_lock to synchronize with kcpc_restore().
    448 	 */
    449 	mutex_enter(&ctx->kc_lock);
    450 	ctx->kc_flags |= KCPC_CTX_INVALID;
    451 	mutex_exit(&ctx->kc_lock);
    452 
    453 	if (ctx->kc_cpuid == -1) {
    454 		t = ctx->kc_thread;
    455 		/*
    456 		 * The context is thread-bound and therefore has a device
    457 		 * context.  It will be freed via removectx() calling
    458 		 * freectx() calling kcpc_free().
    459 		 */
    460 		if (t == curthread &&
    461 		    (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
    462 			kpreempt_disable();
    463 			pcbe_ops->pcbe_allstop();
    464 			atomic_or_uint(&ctx->kc_flags,
    465 			    KCPC_CTX_INVALID_STOPPED);
    466 			kpreempt_enable();
    467 		}
    468 #ifdef DEBUG
    469 		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
    470 		    kcpc_lwp_create, NULL, kcpc_free) == 0)
    471 			panic("kcpc_unbind: context %p not preset on thread %p",
    472 			    (void *)ctx, (void *)t);
    473 #else
    474 		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
    475 		    kcpc_lwp_create, NULL, kcpc_free);
    476 #endif /* DEBUG */
    477 		t->t_cpc_set = NULL;
    478 		t->t_cpc_ctx = NULL;
    479 	} else {
    480 		/*
    481 		 * If we are unbinding a CPU-bound set from a remote CPU, the
    482 		 * native CPU's idle thread could be in the midst of programming
    483 		 * this context onto the CPU. We grab the context's lock here to
    484 		 * ensure that the idle thread is done with it. When we release
    485 		 * the lock, the CPU no longer has a context and the idle thread
    486 		 * will move on.
    487 		 *
    488 		 * cpu_lock must be held to prevent the CPU from being DR'd out
    489 		 * while we disassociate the context from the cpu_t.
    490 		 */
    491 		cpu_t *cp;
    492 		mutex_enter(&cpu_lock);
    493 		cp = cpu_get(ctx->kc_cpuid);
    494 		if (cp != NULL) {
    495 			/*
    496 			 * The CPU may have been DR'd out of the system.
    497 			 */
    498 			mutex_enter(&cp->cpu_cpc_ctxlock);
    499 			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
    500 				kcpc_stop_hw(ctx);
    501 			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
    502 			cp->cpu_cpc_ctx = NULL;
    503 			mutex_exit(&cp->cpu_cpc_ctxlock);
    504 		}
    505 		mutex_exit(&cpu_lock);
    506 		if (ctx->kc_thread == curthread) {
    507 			kcpc_free(ctx, 0);
    508 			curthread->t_cpc_set = NULL;
    509 		}
    510 	}
    511 
    512 	return (0);
    513 }
    514 
    515 int
    516 kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
    517 {
    518 	int i;
    519 
    520 	ASSERT(set != NULL);
    521 	ASSERT(set->ks_state & KCPC_SET_BOUND);
    522 	ASSERT(set->ks_ctx->kc_thread == curthread);
    523 	ASSERT(set->ks_ctx->kc_cpuid == -1);
    524 
    525 	if (index < 0 || index >= set->ks_nreqs)
    526 		return (EINVAL);
    527 
    528 	for (i = 0; i < set->ks_nreqs; i++)
    529 		if (set->ks_req[i].kr_index == index)
    530 			break;
    531 	ASSERT(i != set->ks_nreqs);
    532 
    533 	set->ks_req[i].kr_preset = preset;
    534 	return (0);
    535 }
    536 
    537 int
    538 kcpc_restart(kcpc_set_t *set)
    539 {
    540 	kcpc_ctx_t	*ctx = set->ks_ctx;
    541 	int		i;
    542 
    543 	ASSERT(set->ks_state & KCPC_SET_BOUND);
    544 	ASSERT(ctx->kc_thread == curthread);
    545 	ASSERT(ctx->kc_cpuid == -1);
    546 
    547 	kpreempt_disable();
    548 
    549 	/*
    550 	 * If the user is doing this on a running set, make sure the counters
    551 	 * are stopped first.
    552 	 */
    553 	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
    554 		pcbe_ops->pcbe_allstop();
    555 
    556 	for (i = 0; i < set->ks_nreqs; i++) {
    557 		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
    558 		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
    559 		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
    560 	}
    561 
    562 	/*
    563 	 * Ask the backend to program the hardware.
    564 	 */
    565 	ctx->kc_rawtick = KCPC_GET_TICK();
    566 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    567 	pcbe_ops->pcbe_program(ctx);
    568 	kpreempt_enable();
    569 
    570 	return (0);
    571 }
    572 
    573 /*
    574  * Caller must hold kcpc_cpuctx_lock.
    575  */
    576 int
    577 kcpc_enable(kthread_t *t, int cmd, int enable)
    578 {
    579 	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
    580 	kcpc_set_t	*set = t->t_cpc_set;
    581 	kcpc_set_t	*newset;
    582 	int		i;
    583 	int		flag;
    584 	int		err;
    585 
    586 	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));
    587 
    588 	if (ctx == NULL) {
    589 		/*
    590 		 * This thread has a set but no context; it must be a
    591 		 * CPU-bound set.
    592 		 */
    593 		ASSERT(t->t_cpc_set != NULL);
    594 		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
    595 		return (EINVAL);
    596 	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
    597 		return (EAGAIN);
    598 
    599 	if (cmd == CPC_ENABLE) {
    600 		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
    601 			return (EINVAL);
    602 		kpreempt_disable();
    603 		atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
    604 		kcpc_restore(ctx);
    605 		kpreempt_enable();
    606 	} else if (cmd == CPC_DISABLE) {
    607 		if (ctx->kc_flags & KCPC_CTX_FREEZE)
    608 			return (EINVAL);
    609 		kpreempt_disable();
    610 		kcpc_save(ctx);
    611 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
    612 		kpreempt_enable();
    613 	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
    614 		/*
    615 		 * Strategy for usr/sys: stop counters and update set's presets
    616 		 * with current counter values, unbind, update requests with
    617 		 * new config, then re-bind.
    618 		 */
    619 		flag = (cmd == CPC_USR_EVENTS) ?
    620 		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;
    621 
    622 		kpreempt_disable();
    623 		atomic_or_uint(&ctx->kc_flags,
    624 		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
    625 		pcbe_ops->pcbe_allstop();
    626 		kpreempt_enable();
    627 		for (i = 0; i < set->ks_nreqs; i++) {
    628 			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
    629 			if (enable)
    630 				set->ks_req[i].kr_flags |= flag;
    631 			else
    632 				set->ks_req[i].kr_flags &= ~flag;
    633 		}
    634 		newset = kcpc_dup_set(set);
    635 		if (kcpc_unbind(set) != 0)
    636 			return (EINVAL);
    637 		t->t_cpc_set = newset;
    638 		if (kcpc_bind_thread(newset, t, &err) != 0) {
    639 			t->t_cpc_set = NULL;
    640 			kcpc_free_set(newset);
    641 			return (EINVAL);
    642 		}
    643 	} else
    644 		return (EINVAL);
    645 
    646 	return (0);
    647 }
    648 
    649 /*
    650  * Provide PCBEs with a way of obtaining the configs of every counter which will
    651  * be programmed together.
    652  *
    653  * If current is NULL, provide the first config.
    654  *
    655  * If data != NULL, caller wants to know where the data store associated with
    656  * the config we return is located.
    657  */
    658 void *
    659 kcpc_next_config(void *token, void *current, uint64_t **data)
    660 {
    661 	int		i;
    662 	kcpc_pic_t	*pic;
    663 	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;
    664 
    665 	if (current == NULL) {
    666 		/*
    667 		 * Client would like the first config, which may not be in
    668 		 * counter 0; we need to search through the counters for the
    669 		 * first config.
    670 		 */
    671 		for (i = 0; i < cpc_ncounters; i++)
    672 			if (ctx->kc_pics[i].kp_req != NULL)
    673 				break;
    674 		/*
    675 		 * There are no counters configured for the given context.
    676 		 */
    677 		if (i == cpc_ncounters)
    678 			return (NULL);
    679 	} else {
    680 		/*
    681 		 * There surely is a faster way to do this.
    682 		 */
    683 		for (i = 0; i < cpc_ncounters; i++) {
    684 			pic = &ctx->kc_pics[i];
    685 
    686 			if (pic->kp_req != NULL &&
    687 			    current == pic->kp_req->kr_config)
    688 				break;
    689 		}
    690 
    691 		/*
    692 		 * We found the current config at picnum i. Now search for the
    693 		 * next configured PIC.
    694 		 */
    695 		for (i++; i < cpc_ncounters; i++) {
    696 			pic = &ctx->kc_pics[i];
    697 			if (pic->kp_req != NULL)
    698 				break;
    699 		}
    700 
    701 		if (i == cpc_ncounters)
    702 			return (NULL);
    703 	}
    704 
    705 	if (data != NULL) {
    706 		*data = ctx->kc_pics[i].kp_req->kr_data;
    707 	}
    708 
    709 	return (ctx->kc_pics[i].kp_req->kr_config);
    710 }
    711 
    712 
    713 static kcpc_ctx_t *
    714 kcpc_ctx_alloc(void)
    715 {
    716 	kcpc_ctx_t	*ctx;
    717 	long		hash;
    718 
    719 	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), KM_SLEEP);
    720 
    721 	hash = CPC_HASH_CTX(ctx);
    722 	mutex_enter(&kcpc_ctx_llock[hash]);
    723 	ctx->kc_next = kcpc_ctx_list[hash];
    724 	kcpc_ctx_list[hash] = ctx;
    725 	mutex_exit(&kcpc_ctx_llock[hash]);
    726 
    727 	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
    728 	    cpc_ncounters, KM_SLEEP);
    729 
    730 	ctx->kc_cpuid = -1;
    731 
    732 	return (ctx);
    733 }
    734 
    735 /*
    736  * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
    737  * in the flags.
    738  */
    739 static void
    740 kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
    741 {
    742 	kcpc_set_t	*ks = ctx->kc_set, *cks;
    743 	int		i, j;
    744 	int		code;
    745 
    746 	ASSERT(ks != NULL);
    747 
    748 	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
    749 		return;
    750 
    751 	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
    752 	cks->ks_state &= ~KCPC_SET_BOUND;
    753 	cctx->kc_set = cks;
    754 	cks->ks_flags = ks->ks_flags;
    755 	cks->ks_nreqs = ks->ks_nreqs;
    756 	cks->ks_req = kmem_alloc(cks->ks_nreqs *
    757 	    sizeof (kcpc_request_t), KM_SLEEP);
    758 	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
    759 	    KM_SLEEP);
    760 	cks->ks_ctx = cctx;
    761 
    762 	for (i = 0; i < cks->ks_nreqs; i++) {
    763 		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
    764 		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
    765 		(void) strncpy(cks->ks_req[i].kr_event,
    766 		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
    767 		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
    768 		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
    769 		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
    770 		if (ks->ks_req[i].kr_nattrs > 0) {
    771 			cks->ks_req[i].kr_attr =
    772 			    kmem_alloc(ks->ks_req[i].kr_nattrs *
    773 			    sizeof (kcpc_attr_t), KM_SLEEP);
    774 		}
    775 		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
    776 			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
    777 			    ks->ks_req[i].kr_attr[j].ka_name,
    778 			    CPC_MAX_ATTR_LEN);
    779 			cks->ks_req[i].kr_attr[j].ka_val =
    780 			    ks->ks_req[i].kr_attr[j].ka_val;
    781 		}
    782 	}
    783 	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
    784 		kcpc_invalidate_config(cctx);
    785 
    786 	mutex_enter(&cks->ks_lock);
    787 	cks->ks_state |= KCPC_SET_BOUND;
    788 	cv_signal(&cks->ks_condv);
    789 	mutex_exit(&cks->ks_lock);
    790 }
    791 
    792 
    793 static void
    794 kcpc_ctx_free(kcpc_ctx_t *ctx)
    795 {
    796 	kcpc_ctx_t	**loc;
    797 	long		hash = CPC_HASH_CTX(ctx);
    798 
    799 	mutex_enter(&kcpc_ctx_llock[hash]);
    800 	loc = &kcpc_ctx_list[hash];
    801 	ASSERT(*loc != NULL);
    802 	while (*loc != ctx)
    803 		loc = &(*loc)->kc_next;
    804 	*loc = ctx->kc_next;
    805 	mutex_exit(&kcpc_ctx_llock[hash]);
    806 
    807 	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
    808 	cv_destroy(&ctx->kc_condv);
    809 	mutex_destroy(&ctx->kc_lock);
    810 	kmem_free(ctx, sizeof (*ctx));
    811 }
    812 
    813 /*
    814  * Generic interrupt handler used on hardware that generates
    815  * overflow interrupts.
    816  *
    817  * Note: executed at high-level interrupt context!
    818  */
    819 /*ARGSUSED*/
    820 kcpc_ctx_t *
    821 kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
    822 {
    823 	kcpc_ctx_t	*ctx;
    824 	kthread_t	*t = curthread;
    825 	int		i;
    826 
    827 	/*
    828 	 * On both x86 and UltraSPARC, we may deliver the high-level
    829 	 * interrupt in kernel mode, just after we've started to run an
    830 	 * interrupt thread.  (That's because the hardware helpfully
    831 	 * delivers the overflow interrupt some random number of cycles
    832 	 * after the instruction that caused the overflow by which time
    833 	 * we're in some part of the kernel, not necessarily running on
    834 	 * the right thread).
    835 	 *
    836 	 * Check for this case here -- find the pinned thread
    837 	 * that was running when the interrupt went off.
    838 	 */
    839 	if (t->t_flag & T_INTR_THREAD) {
    840 		klwp_t *lwp;
    841 
    842 		atomic_add_32(&kcpc_intrctx_count, 1);
    843 
    844 		/*
    845 		 * Note that t_lwp is always set to point at the underlying
    846 		 * thread, thus this will work in the presence of nested
    847 		 * interrupts.
    848 		 */
    849 		ctx = NULL;
    850 		if ((lwp = t->t_lwp) != NULL) {
    851 			t = lwptot(lwp);
    852 			ctx = t->t_cpc_ctx;
    853 		}
    854 	} else
    855 		ctx = t->t_cpc_ctx;
    856 
    857 	if (ctx == NULL) {
    858 		/*
    859 		 * This can easily happen if we're using the counters in
    860 		 * "shared" mode, for example, and an overflow interrupt
    861 		 * occurs while we are running cpustat.  In that case, the
    862 		 * bound thread that has the context that belongs to this
    863 		 * CPU is almost certainly sleeping (if it was running on
    864 		 * the CPU we'd have found it above), and the actual
    865 		 * interrupted thread has no knowledge of performance counters!
    866 		 */
    867 		ctx = curthread->t_cpu->cpu_cpc_ctx;
    868 		if (ctx != NULL) {
    869 			/*
    870 			 * Return the bound context for this CPU to
    871 			 * the interrupt handler so that it can synchronously
    872 			 * sample the hardware counters and restart them.
    873 			 */
    874 			return (ctx);
    875 		}
    876 
    877 		/*
    878 		 * As long as the overflow interrupt really is delivered early
    879 		 * enough after trapping into the kernel to avoid switching
    880 		 * threads, we must always be able to find the cpc context,
    881 		 * or something went terribly wrong i.e. we ended up
    882 		 * running a passivated interrupt thread, a kernel
    883 		 * thread or we interrupted idle, all of which are Very Bad.
    884 		 */
    885 		if (kcpc_nullctx_panic)
    886 			panic("null cpc context, thread %p", (void *)t);
    887 		atomic_add_32(&kcpc_nullctx_count, 1);
    888 	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
    889 		/*
    890 		 * Schedule an ast to sample the counters, which will
    891 		 * propagate any overflow into the virtualized performance
    892 		 * counter(s), and may deliver a signal.
    893 		 */
    894 		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
    895 		/*
    896 		 * If a counter has overflowed which was counting on behalf of
    897 		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
    898 		 * process a signal.
    899 		 */
    900 		for (i = 0; i < cpc_ncounters; i++) {
    901 			if (ctx->kc_pics[i].kp_req != NULL &&
    902 			    bitmap & (1 << i) &&
    903 			    ctx->kc_pics[i].kp_req->kr_flags &
    904 			    CPC_OVF_NOTIFY_EMT) {
    905 				/*
    906 				 * A signal has been requested for this PIC, so
    907 				 * so freeze the context. The interrupt handler
    908 				 * has already stopped the counter hardware.
    909 				 */
    910 				atomic_or_uint(&ctx->kc_flags, KCPC_CTX_FREEZE);
    911 				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
    912 				    KCPC_PIC_OVERFLOWED);
    913 			}
    914 		}
    915 		aston(t);
    916 	}
    917 	return (NULL);
    918 }
    919 
    920 /*
    921  * The current thread context had an overflow interrupt; we're
    922  * executing here in high-level interrupt context.
    923  */
    924 /*ARGSUSED*/
    925 uint_t
    926 kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
    927 {
    928 	kcpc_ctx_t	*ctx;
    929 	uint64_t	bitmap;
    930 
    931 	if (pcbe_ops == NULL ||
    932 	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
    933 		return (DDI_INTR_UNCLAIMED);
    934 
    935 	/*
    936 	 * Prevent any further interrupts.
    937 	 */
    938 	pcbe_ops->pcbe_allstop();
    939 
    940 	/*
    941 	 * Invoke the "generic" handler.
    942 	 *
    943 	 * If the interrupt has occurred in the context of an lwp owning
    944 	 * the counters, then the handler posts an AST to the lwp to
    945 	 * trigger the actual sampling, and optionally deliver a signal or
    946 	 * restart the counters, on the way out of the kernel using
    947 	 * kcpc_hw_overflow_ast() (see below).
    948 	 *
    949 	 * On the other hand, if the handler returns the context to us
    950 	 * directly, then it means that there are no other threads in
    951 	 * the middle of updating it, no AST has been posted, and so we
    952 	 * should sample the counters here, and restart them with no
    953 	 * further fuss.
    954 	 */
    955 	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
    956 		uint64_t curtick = KCPC_GET_TICK();
    957 
    958 		ctx->kc_hrtime = gethrtime_waitfree();
    959 		ctx->kc_vtick += curtick - ctx->kc_rawtick;
    960 		ctx->kc_rawtick = curtick;
    961 		pcbe_ops->pcbe_sample(ctx);
    962 		pcbe_ops->pcbe_program(ctx);
    963 	}
    964 
    965 	return (DDI_INTR_CLAIMED);
    966 }
    967 
    968 /*
    969  * Called from trap() when processing the ast posted by the high-level
    970  * interrupt handler.
    971  */
    972 int
    973 kcpc_overflow_ast()
    974 {
    975 	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
    976 	int		i;
    977 	int		found = 0;
    978 	uint64_t	curtick = KCPC_GET_TICK();
    979 
    980 	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */
    981 
    982 	/*
    983 	 * An overflow happened: sample the context to ensure that
    984 	 * the overflow is propagated into the upper bits of the
    985 	 * virtualized 64-bit counter(s).
    986 	 */
    987 	kpreempt_disable();
    988 	ctx->kc_hrtime = gethrtime_waitfree();
    989 	pcbe_ops->pcbe_sample(ctx);
    990 	kpreempt_enable();
    991 
    992 	ctx->kc_vtick += curtick - ctx->kc_rawtick;
    993 
    994 	/*
    995 	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
    996 	 * if that pic generated an overflow and if the request it was counting
    997 	 * on behalf of had CPC_OVERFLOW_REQUEST specified. We go through all
    998 	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
    999 	 * found any overflowed pics, keep the context frozen and return true
   1000 	 * (thus causing a signal to be sent).
   1001 	 */
   1002 	for (i = 0; i < cpc_ncounters; i++) {
   1003 		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
   1004 			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
   1005 			    ~KCPC_PIC_OVERFLOWED);
   1006 			found = 1;
   1007 		}
   1008 	}
   1009 	if (found)
   1010 		return (1);
   1011 
   1012 	/*
   1013 	 * Otherwise, re-enable the counters and continue life as before.
   1014 	 */
   1015 	kpreempt_disable();
   1016 	atomic_and_uint(&ctx->kc_flags, ~KCPC_CTX_FREEZE);
   1017 	pcbe_ops->pcbe_program(ctx);
   1018 	kpreempt_enable();
   1019 	return (0);
   1020 }
   1021 
   1022 /*
   1023  * Called when switching away from current thread.
   1024  */
   1025 static void
   1026 kcpc_save(kcpc_ctx_t *ctx)
   1027 {
   1028 	if (ctx->kc_flags & KCPC_CTX_INVALID) {
   1029 		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)
   1030 			return;
   1031 		/*
   1032 		 * This context has been invalidated but the counters have not
   1033 		 * been stopped. Stop them here and mark the context stopped.
   1034 		 */
   1035 		pcbe_ops->pcbe_allstop();
   1036 		atomic_or_uint(&ctx->kc_flags, KCPC_CTX_INVALID_STOPPED);
   1037 		return;
   1038 	}
   1039 
   1040 	pcbe_ops->pcbe_allstop();
   1041 	if (ctx->kc_flags & KCPC_CTX_FREEZE)
   1042 		return;
   1043 
   1044 	/*
   1045 	 * Need to sample for all reqs into each req's current mpic.
   1046 	 */
   1047 	ctx->kc_hrtime = gethrtime();
   1048 	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
   1049 	pcbe_ops->pcbe_sample(ctx);
   1050 }
   1051 
   1052 static void
   1053 kcpc_restore(kcpc_ctx_t *ctx)
   1054 {
   1055 	mutex_enter(&ctx->kc_lock);
   1056 	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
   1057 	    KCPC_CTX_INVALID)
   1058 		/*
   1059 		 * The context is invalidated but has not been marked stopped.
   1060 		 * We mark it as such here because we will not start the
   1061 		 * counters during this context switch.
   1062 		 */
   1063 		ctx->kc_flags |= KCPC_CTX_INVALID_STOPPED;
   1064 
   1065 
   1066 	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
   1067 		mutex_exit(&ctx->kc_lock);
   1068 		return;
   1069 	}
   1070 
   1071 	/*
   1072 	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
   1073 	 * ctx & set related memory objects being freed without us knowing.
   1074 	 * This can happen if an agent thread is executing a kcpc_unbind(),
   1075 	 * with this thread as the target, whilst we're concurrently doing a
   1076 	 * restorectx() during, for example, a proc_exit().  Effectively, by
   1077 	 * doing this, we're asking kcpc_free() to cv_wait() until
   1078 	 * kcpc_restore() has completed.
   1079 	 */
   1080 	ctx->kc_flags |= KCPC_CTX_RESTORE;
   1081 	mutex_exit(&ctx->kc_lock);
   1082 
   1083 	/*
   1084 	 * While programming the hardware, the counters should be stopped. We
   1085 	 * don't do an explicit pcbe_allstop() here because they should have
   1086 	 * been stopped already by the last consumer.
   1087 	 */
   1088 	ctx->kc_rawtick = KCPC_GET_TICK();
   1089 	pcbe_ops->pcbe_program(ctx);
   1090 
   1091 	/*
   1092 	 * Wake the agent thread if it's waiting in kcpc_free().
   1093 	 */
   1094 	mutex_enter(&ctx->kc_lock);
   1095 	ctx->kc_flags &= ~KCPC_CTX_RESTORE;
   1096 	cv_signal(&ctx->kc_condv);
   1097 	mutex_exit(&ctx->kc_lock);
   1098 }
   1099 
   1100 /*
   1101  * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the
   1102  * following context operators to the idle thread on each CPU. They stop the
   1103  * counters when the idle thread is switched on, and they start them again when
   1104  * it is switched off.
   1105  */
   1106 
   1107 /*ARGSUSED*/
   1108 void
   1109 kcpc_idle_save(struct cpu *cp)
   1110 {
   1111 	/*
   1112 	 * The idle thread shouldn't be run anywhere else.
   1113 	 */
   1114 	ASSERT(CPU == cp);
   1115 
   1116 	/*
   1117 	 * We must hold the CPU's context lock to ensure the context isn't freed
   1118 	 * while we're looking at it.
   1119 	 */
   1120 	mutex_enter(&cp->cpu_cpc_ctxlock);
   1121 
   1122 	if ((cp->cpu_cpc_ctx == NULL) ||
   1123 	    (cp-&