Home | History | Annotate | Download | only in io
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * CPU Performance Counter system calls and device driver.
     28  *
     29  * This module uses a combination of thread context operators, and
     30  * thread-specific data to export CPU performance counters
     31  * via both a system call and a driver interface.
     32  *
     33  * There are three access methods exported - the 'shared' device
     34  * and the 'private' and 'agent' variants of the system call.
     35  *
     36  * The shared device treats the performance counter registers as
     37  * a processor metric, regardless of the work scheduled on them.
     38  * The private system call treats the performance counter registers
     39  * as a property of a single lwp.  This is achieved by using the
     40  * thread context operators to virtualize the contents of the
     41  * performance counter registers between lwps.
     42  *
     43  * The agent method is like the private method, except that it must
     44  * be accessed via /proc's agent lwp to allow the counter context of
     45  * other threads to be examined safely.
     46  *
     47  * The shared usage fundamentally conflicts with the agent and private usage;
     48  * almost all of the complexity of the module is needed to allow these two
     49  * models to co-exist in a reasonable way.
     50  */
     51 
     52 #include <sys/types.h>
     53 #include <sys/file.h>
     54 #include <sys/errno.h>
     55 #include <sys/open.h>
     56 #include <sys/cred.h>
     57 #include <sys/conf.h>
     58 #include <sys/stat.h>
     59 #include <sys/processor.h>
     60 #include <sys/cpuvar.h>
     61 #include <sys/disp.h>
     62 #include <sys/kmem.h>
     63 #include <sys/modctl.h>
     64 #include <sys/ddi.h>
     65 #include <sys/sunddi.h>
     66 #include <sys/nvpair.h>
     67 #include <sys/policy.h>
     68 #include <sys/machsystm.h>
     69 #include <sys/cpc_impl.h>
     70 #include <sys/cpc_pcbe.h>
     71 #include <sys/kcpc.h>
     72 
/*
 * Forward declarations for the file-local helpers used by both the system
 * call and the ioctl entry points before their definitions appear.
 */
static int kcpc_copyin_set(kcpc_set_t **set, void *ubuf, size_t len);
static int kcpc_verify_set(kcpc_set_t *set);
static uint32_t kcpc_nvlist_npairs(nvlist_t *list);

/*
 * Generic attributes supported regardless of processor.
 *
 * ATTRLIST is appended to the PCBE-provided attribute list by the
 * CPC_LIST_ATTRS command; SEPARATOR is inserted between the two lists only
 * when the PCBE list is non-empty.
 */

#define	ATTRLIST "picnum"
#define	SEPARATOR ","
     83 
/*
 * System call to access CPU performance counters.
 *
 * cmd selects the operation; the meaning of udata1..udata3 depends on cmd and
 * is described at each case below.  lwpid is only consulted when the caller
 * is the process's /proc agent lwp, in which case it names the target lwp;
 * otherwise the operation applies to the calling thread.
 *
 * Returns 0 on success for most commands (CPC_NPIC and CPC_CAPS return the
 * requested value directly).  Failures are reported through set_errno().
 */
static int
cpc(int cmd, id_t lwpid, void *udata1, void *udata2, void *udata3)
{
	kthread_t	*t;
	int		error;
	int		size;
	const char	*str;
	int		code;

	/*
	 * This CPC syscall should only be loaded if it found a PCBE to use.
	 */
	ASSERT(pcbe_ops != NULL);

	if (curproc->p_agenttp == curthread) {
		/*
		 * Only if /proc is invoking this system call from
		 * the agent thread do we allow the caller to examine
		 * the contexts of other lwps in the process.  And
		 * because we know we're the agent, we know we don't
		 * have to grab p_lock because no-one else can change
		 * the state of the process.
		 */
		if ((t = idtot(curproc, lwpid)) == NULL || t == curthread)
			return (set_errno(ESRCH));
		ASSERT(t->t_tid == lwpid && ttolwp(t) != NULL);
	} else
		t = curthread;

	/*
	 * CPC_SAMPLE and CPC_RELE dereference t->t_cpc_set without any further
	 * check in their cases below, so reject them here when no set is bound.
	 */
	if (t->t_cpc_set == NULL && (cmd == CPC_SAMPLE || cmd == CPC_RELE))
		return (set_errno(EINVAL));

	switch (cmd) {
	case CPC_BIND:
		/*
		 * udata1 = pointer to packed nvlist buffer
		 * udata2 = size of packed nvlist buffer
		 * udata3 = User addr to return error subcode in.
		 */

		/*
		 * The lock is held as reader for the whole bind; every exit
		 * path below drops it before returning.  Binding is refused
		 * while the shared device is open (kcpc_cpuctx != 0) or DTrace
		 * is using the counters.
		 */
		rw_enter(&kcpc_cpuctx_lock, RW_READER);
		if (kcpc_cpuctx || dtrace_cpc_in_use) {
			rw_exit(&kcpc_cpuctx_lock);
			return (set_errno(EAGAIN));
		}

		if (kcpc_hw_lwp_hook() != 0) {
			rw_exit(&kcpc_cpuctx_lock);
			return (set_errno(EACCES));
		}

		/*
		 * An LWP may only have one set bound to it at a time; if there
		 * is a set bound to this LWP already, we unbind it here.
		 */
		if (t->t_cpc_set != NULL)
			(void) kcpc_unbind(t->t_cpc_set);
		ASSERT(t->t_cpc_set == NULL);

		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
		    (size_t)udata2)) != 0) {
			rw_exit(&kcpc_cpuctx_lock);
			return (set_errno(error));
		}

		/*
		 * kcpc_verify_set() returns a subcode rather than an errno;
		 * the subcode is copied out to udata3 and the call itself
		 * fails with EINVAL.
		 */
		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
			rw_exit(&kcpc_cpuctx_lock);
			kcpc_free_set(t->t_cpc_set);
			t->t_cpc_set = NULL;
			if (copyout(&error, udata3, sizeof (error)) == -1)
				return (set_errno(EFAULT));
			return (set_errno(EINVAL));
		}

		if ((error = kcpc_bind_thread(t->t_cpc_set, t, &code)) != 0) {
			rw_exit(&kcpc_cpuctx_lock);
			kcpc_free_set(t->t_cpc_set);
			t->t_cpc_set = NULL;
			/*
			 * EINVAL and EACCES are the only errors with more
			 * specific subcodes.
			 */
			if ((error == EINVAL || error == EACCES) &&
			    copyout(&code, udata3, sizeof (code)) == -1)
				return (set_errno(EFAULT));
			return (set_errno(error));
		}

		rw_exit(&kcpc_cpuctx_lock);
		return (0);
	case CPC_SAMPLE:
		/*
		 * udata1 = pointer to user's buffer
		 * udata2 = pointer to user's hrtime
		 * udata3 = pointer to user's tick
		 */
		/*
		 * We only allow thread-bound sets to be sampled via the
		 * syscall, so if this set has a CPU-bound context, return an
		 * error.
		 */
		if (t->t_cpc_set->ks_ctx->kc_cpuid != -1)
			return (set_errno(EINVAL));
		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
		    udata3)) != 0)
			return (set_errno(error));

		return (0);
	case CPC_PRESET:
	case CPC_RESTART:
		/*
		 * These are valid only if this lwp has a bound set.
		 */
		if (t->t_cpc_set == NULL)
			return (set_errno(EINVAL));
		if (cmd == CPC_PRESET) {
			/*
			 * The preset is shipped up to us from userland in two
			 * parts. This lets us handle 64-bit values from 32-bit
			 * and 64-bit applications in the same manner.
			 *
			 * udata1 = index of request to preset
			 * udata2 = new 64-bit preset (most sig. 32 bits)
			 * udata3 = new 64-bit preset (least sig. 32 bits)
			 */
			if ((error = kcpc_preset(t->t_cpc_set, (intptr_t)udata1,
			    ((uint64_t)(uintptr_t)udata2 << 32ULL) |
			    (uint64_t)(uintptr_t)udata3)) != 0)
				return (set_errno(error));
		} else {
			/*
			 * udata[1-3] = unused
			 */
			if ((error = kcpc_restart(t->t_cpc_set)) != 0)
				return (set_errno(error));
		}
		return (0);
	case CPC_ENABLE:
	case CPC_DISABLE:
		/*
		 * Both commands pass 0 as the third argument of kcpc_enable();
		 * only CPC_USR_EVENTS/CPC_SYS_EVENTS forward the caller's
		 * udata1.
		 */
		udata1 = 0;
		/*FALLTHROUGH*/
	case CPC_USR_EVENTS:
	case CPC_SYS_EVENTS:
		/*
		 * These commands may only be applied to the calling thread
		 * itself, never to another lwp via the agent.
		 */
		if (t != curthread || t->t_cpc_set == NULL)
			return (set_errno(EINVAL));
		/*
		 * Provided for backwards compatibility with CPCv1.
		 *
		 * Stop the counters and record the current counts. Use the
		 * counts as the preset to rebind a new set with the requests
		 * reconfigured as requested.
		 *
		 * udata1: 1 == enable; 0 == disable
		 * udata{2,3}: unused
		 */
		rw_enter(&kcpc_cpuctx_lock, RW_READER);
		if ((error = kcpc_enable(t,
		    cmd, (int)(uintptr_t)udata1)) != 0) {
			rw_exit(&kcpc_cpuctx_lock);
			return (set_errno(error));
		}
		rw_exit(&kcpc_cpuctx_lock);
		return (0);
	case CPC_NPIC:
		/* The counter count is the syscall's return value. */
		return (cpc_ncounters);
	case CPC_CAPS:
		/* The PCBE capability word is the syscall's return value. */
		return (pcbe_ops->pcbe_caps);
	case CPC_EVLIST_SIZE:
	case CPC_LIST_EVENTS:
		/*
		 * udata1 = pointer to user's int or buffer
		 * udata2 = picnum
		 * udata3 = unused
		 */
		if ((uintptr_t)udata2 >= cpc_ncounters)
			return (set_errno(EINVAL));

		/* Size includes the terminating NUL. */
		size = strlen(
		    pcbe_ops->pcbe_list_events((uintptr_t)udata2)) + 1;

		if (cmd == CPC_EVLIST_SIZE) {
			if (suword32(udata1, size) == -1)
				return (set_errno(EFAULT));
		} else {
			/*
			 * The caller is expected to have sized its buffer with
			 * a prior CPC_EVLIST_SIZE call; no length is passed in.
			 */
			if (copyout(
			    pcbe_ops->pcbe_list_events((uintptr_t)udata2),
			    udata1, size) == -1)
				return (set_errno(EFAULT));
		}
		return (0);
	case CPC_ATTRLIST_SIZE:
	case CPC_LIST_ATTRS:
		/*
		 * udata1 = pointer to user's int or buffer
		 * udata2 = unused
		 * udata3 = unused
		 *
		 * attrlist size is length of PCBE-supported attributes, plus
		 * room for "picnum\0" plus an optional ',' separator char.
		 *
		 * NOTE(review): sizeof (SEPARATOR ATTRLIST) already counts
		 * both the ',' and the terminating NUL, so the value computed
		 * here is an upper bound on the bytes copied out below rather
		 * than the exact length.
		 */
		str = pcbe_ops->pcbe_list_attrs();
		size = strlen(str) + sizeof (SEPARATOR ATTRLIST) + 1;
		if (str[0] != '\0')
			/*
			 * A ',' separator character is necessary.
			 */
			size += 1;

		if (cmd == CPC_ATTRLIST_SIZE) {
			if (suword32(udata1, size) == -1)
				return (set_errno(EFAULT));
		} else {
			/*
			 * Copyout the PCBE attributes, and then append the
			 * generic attribute list (with separator if necessary).
			 */
			if (copyout(str, udata1, strlen(str)) == -1)
				return (set_errno(EFAULT));
			if (str[0] != '\0') {
				if (copyout(SEPARATOR ATTRLIST,
				    ((char *)udata1) + strlen(str),
				    strlen(SEPARATOR ATTRLIST) + 1)
				    == -1)
					return (set_errno(EFAULT));
			} else
				if (copyout(ATTRLIST,
				    (char *)udata1 + strlen(str),
				    strlen(ATTRLIST) + 1) == -1)
					return (set_errno(EFAULT));
		}
		return (0);
	case CPC_IMPL_NAME:
	case CPC_CPUREF:
		/*
		 * udata1 = pointer to user's buffer
		 * udata2 = unused
		 * udata3 = unused
		 */
		if (cmd == CPC_IMPL_NAME) {
			str = pcbe_ops->pcbe_impl_name();
			ASSERT(strlen(str) < CPC_MAX_IMPL_NAME);
		} else {
			str = pcbe_ops->pcbe_cpuref();
			ASSERT(strlen(str) < CPC_MAX_CPUREF);
		}

		/* Copy the string out including its terminating NUL. */
		if (copyout(str, udata1, strlen(str) + 1) != 0)
			return (set_errno(EFAULT));
		return (0);
	case CPC_INVALIDATE:
		kcpc_invalidate(t);
		return (0);
	case CPC_RELE:
		/* t->t_cpc_set was checked for NULL before the switch. */
		if ((error = kcpc_unbind(t->t_cpc_set)) != 0)
			return (set_errno(error));
		return (0);
	default:
		return (set_errno(EINVAL));
	}
}
    347 
    348 /*
    349  * The 'shared' device allows direct access to the
    350  * performance counter control register of the current CPU.
    351  * The major difference between the contexts created here and those
    352  * above is that the context handlers are -not- installed, thus
    353  * no context switching behaviour occurs.
    354  *
    355  * Because they manipulate per-cpu state, these ioctls can
    356  * only be invoked from a bound lwp, by a caller with the cpc_cpu privilege
    357  * who can open the relevant entry in /devices (the act of holding it open
    358  * causes other uses of the counters to be suspended).
    359  *
    360  * Note that for correct results, the caller -must- ensure that
    361  * all existing per-lwp contexts are either inactive or marked invalid;
    362  * that's what the open routine does.
    363  */
    364 /*ARGSUSED*/
    365 static int
    366 kcpc_ioctl(dev_t dev, int cmd, intptr_t data, int flags, cred_t *cr, int *rvp)
    367 {
    368 	kthread_t	*t = curthread;
    369 	processorid_t	cpuid;
    370 	void		*udata1 = NULL;
    371 	void		*udata2 = NULL;
    372 	void		*udata3 = NULL;
    373 	int		error;
    374 	int		code;
    375 
    376 	STRUCT_DECL(__cpc_args, args);
    377 
    378 	STRUCT_INIT(args, flags);
    379 
    380 	if (curthread->t_bind_cpu != getminor(dev))
    381 		return (EAGAIN);  /* someone unbound it? */
    382 
    383 	cpuid = getminor(dev);
    384 
    385 	if (cmd == CPCIO_BIND || cmd == CPCIO_SAMPLE) {
    386 		if (copyin((void *)data, STRUCT_BUF(args),
    387 		    STRUCT_SIZE(args)) == -1)
    388 			return (EFAULT);
    389 
    390 		udata1 = STRUCT_FGETP(args, udata1);
    391 		udata2 = STRUCT_FGETP(args, udata2);
    392 		udata3 = STRUCT_FGETP(args, udata3);
    393 	}
    394 
    395 	switch (cmd) {
    396 	case CPCIO_BIND:
    397 		/*
    398 		 * udata1 = pointer to packed nvlist buffer
    399 		 * udata2 = size of packed nvlist buffer
    400 		 * udata3 = User addr to return error subcode in.
    401 		 */
    402 		if (t->t_cpc_set != NULL) {
    403 			(void) kcpc_unbind(t->t_cpc_set);
    404 			ASSERT(t->t_cpc_set == NULL);
    405 		}
    406 
    407 		if ((error = kcpc_copyin_set(&t->t_cpc_set, udata1,
    408 		    (size_t)udata2)) != 0) {
    409 			return (error);
    410 		}
    411 
    412 		if ((error = kcpc_verify_set(t->t_cpc_set)) != 0) {
    413 			kcpc_free_set(t->t_cpc_set);
    414 			t->t_cpc_set = NULL;
    415 			if (copyout(&error, udata3, sizeof (error)) == -1)
    416 				return (EFAULT);
    417 			return (EINVAL);
    418 		}
    419 
    420 		if ((error = kcpc_bind_cpu(t->t_cpc_set, cpuid, &code)) != 0) {
    421 			kcpc_free_set(t->t_cpc_set);
    422 			t->t_cpc_set = NULL;
    423 			/*
    424 			 * Subcodes are only returned for EINVAL and EACCESS.
    425 			 */
    426 			if ((error == EINVAL || error == EACCES) &&
    427 			    copyout(&code, udata3, sizeof (code)) == -1)
    428 				return (EFAULT);
    429 			return (error);
    430 		}
    431 
    432 		return (0);
    433 	case CPCIO_SAMPLE:
    434 		/*
    435 		 * udata1 = pointer to user's buffer
    436 		 * udata2 = pointer to user's hrtime
    437 		 * udata3 = pointer to user's tick
    438 		 */
    439 		/*
    440 		 * Only CPU-bound sets may be sampled via the ioctl(). If this
    441 		 * set has no CPU-bound context, return an error.
    442 		 */
    443 		if (t->t_cpc_set == NULL)
    444 			return (EINVAL);
    445 		if ((error = kcpc_sample(t->t_cpc_set, udata1, udata2,
    446 		    udata3)) != 0)
    447 			return (error);
    448 		return (0);
    449 	case CPCIO_RELE:
    450 		if (t->t_cpc_set == NULL)
    451 			return (EINVAL);
    452 		return (kcpc_unbind(t->t_cpc_set));
    453 	default:
    454 		return (EINVAL);
    455 	}
    456 }
    457 
/*
 * The device supports multiple opens, but only one open
 * is allowed per processor.  This is to enable multiple
 * instances of tools looking at different processors.
 *
 * KCPC_MINOR_SHARED is the minor number of the "shared" node created by
 * kcpc_attach().  kcpc_open() accepts only this minor and then rewrites the
 * dev_t so that its minor number is the id of the CPU the caller is bound to.
 */
#define	KCPC_MINOR_SHARED		((minor_t)0x3fffful)

/*
 * One bit per CPU id that currently has the device open.  Allocated by the
 * first kcpc_open() and freed by the last kcpc_close(); both modify it while
 * holding kcpc_cpuctx_lock as writer.
 */
static ulong_t *kcpc_cpumap;		/* bitmap of cpus */
    466 
    467 /*ARGSUSED1*/
    468 static int
    469 kcpc_open(dev_t *dev, int flags, int otyp, cred_t *cr)
    470 {
    471 	processorid_t	cpuid;
    472 	int		error;
    473 
    474 	ASSERT(pcbe_ops != NULL);
    475 
    476 	if ((error = secpolicy_cpc_cpu(cr)) != 0)
    477 		return (error);
    478 	if (getminor(*dev) != KCPC_MINOR_SHARED)
    479 		return (ENXIO);
    480 	if ((cpuid = curthread->t_bind_cpu) == PBIND_NONE)
    481 		return (EINVAL);
    482 	if (cpuid > max_cpuid)
    483 		return (EINVAL);
    484 
    485 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
    486 	if (++kcpc_cpuctx == 1) {
    487 		ASSERT(kcpc_cpumap == NULL);
    488 
    489 		/*
    490 		 * Bail out if DTrace is already using the counters.
    491 		 */
    492 		if (dtrace_cpc_in_use) {
    493 			kcpc_cpuctx--;
    494 			rw_exit(&kcpc_cpuctx_lock);
    495 			return (EAGAIN);
    496 		}
    497 		kcpc_cpumap = kmem_zalloc(BT_SIZEOFMAP(max_cpuid + 1),
    498 		    KM_SLEEP);
    499 		/*
    500 		 * When this device is open for processor-based contexts,
    501 		 * no further lwp-based contexts can be created.
    502 		 *
    503 		 * Since this is the first open, ensure that all existing
    504 		 * contexts are invalidated.
    505 		 */
    506 		kcpc_invalidate_all();
    507 	} else if (BT_TEST(kcpc_cpumap, cpuid)) {
    508 		kcpc_cpuctx--;
    509 		rw_exit(&kcpc_cpuctx_lock);
    510 		return (EAGAIN);
    511 	} else if (kcpc_hw_cpu_hook(cpuid, kcpc_cpumap) != 0) {
    512 		kcpc_cpuctx--;
    513 		rw_exit(&kcpc_cpuctx_lock);
    514 		return (EACCES);
    515 	}
    516 	BT_SET(kcpc_cpumap, cpuid);
    517 	rw_exit(&kcpc_cpuctx_lock);
    518 
    519 	*dev = makedevice(getmajor(*dev), (minor_t)cpuid);
    520 
    521 	return (0);
    522 }
    523 
    524 /*ARGSUSED1*/
    525 static int
    526 kcpc_close(dev_t dev, int flags, int otyp, cred_t *cr)
    527 {
    528 	rw_enter(&kcpc_cpuctx_lock, RW_WRITER);
    529 	BT_CLEAR(kcpc_cpumap, getminor(dev));
    530 	if (--kcpc_cpuctx == 0) {
    531 		kmem_free(kcpc_cpumap, BT_SIZEOFMAP(max_cpuid + 1));
    532 		kcpc_cpumap = NULL;
    533 	}
    534 	ASSERT(kcpc_cpuctx >= 0);
    535 	rw_exit(&kcpc_cpuctx_lock);
    536 
    537 	return (0);
    538 }
    539 
/*
 * Sane boundaries on the size of packed lists. In bytes.
 * kcpc_copyin_set() rejects any user-supplied length outside this range with
 * EINVAL before allocating a kernel buffer for it.
 */
#define	CPC_MIN_PACKSIZE 4
#define	CPC_MAX_PACKSIZE 10000

/*
 * Sane boundary on the number of requests a set can contain.
 * Enforced by kcpc_copyin_set() on the "reqs" array of the unpacked nvlist.
 */
#define	CPC_MAX_NREQS 100

/*
 * Sane boundary on the number of attributes a request can contain.
 * Enforced by kcpc_copyin_set() on each request's "cr_attr" nvlist, after
 * any "picnum" attribute has been removed from it.
 */
#define	CPC_MAX_ATTRS 50
    555 
    556 /*
    557  * Copy in a packed nvlist from the user and create a request set out of it.
    558  * If successful, return 0 and store a pointer to the set we've created. Returns
    559  * error code on error.
    560  */
    561 int
    562 kcpc_copyin_set(kcpc_set_t **inset, void *ubuf, size_t len)
    563 {
    564 	kcpc_set_t	*set;
    565 	int		i;
    566 	int		j;
    567 	char		*packbuf;
    568 
    569 	nvlist_t	*nvl;
    570 	nvpair_t	*nvp = NULL;
    571 
    572 	nvlist_t	*attrs;
    573 	nvpair_t	*nvp_attr;
    574 	kcpc_attr_t	*attrp;
    575 
    576 	nvlist_t	**reqlist;
    577 	uint_t		nreqs;
    578 	uint64_t	uint64;
    579 	uint32_t	uint32;
    580 	uint32_t	setflags = (uint32_t)-1;
    581 	char		*string;
    582 	char		*name;
    583 
    584 	if (len < CPC_MIN_PACKSIZE || len > CPC_MAX_PACKSIZE)
    585 		return (EINVAL);
    586 
    587 	packbuf = kmem_alloc(len, KM_SLEEP);
    588 
    589 	if (copyin(ubuf, packbuf, len) == -1) {
    590 		kmem_free(packbuf, len);
    591 		return (EFAULT);
    592 	}
    593 
    594 	if (nvlist_unpack(packbuf, len, &nvl, KM_SLEEP) != 0) {
    595 		kmem_free(packbuf, len);
    596 		return (EINVAL);
    597 	}
    598 
    599 	/*
    600 	 * The nvlist has been unpacked so there is no need for the packed
    601 	 * representation from this point on.
    602 	 */
    603 	kmem_free(packbuf, len);
    604 
    605 	i = 0;
    606 	while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) {
    607 		switch (nvpair_type(nvp)) {
    608 		case DATA_TYPE_UINT32:
    609 			if (strcmp(nvpair_name(nvp), "flags") != 0 ||
    610 			    nvpair_value_uint32(nvp, &setflags) != 0) {
    611 				nvlist_free(nvl);
    612 				return (EINVAL);
    613 			}
    614 			break;
    615 		case DATA_TYPE_NVLIST_ARRAY:
    616 			if (strcmp(nvpair_name(nvp), "reqs") != 0 ||
    617 			    nvpair_value_nvlist_array(nvp, &reqlist,
    618 			    &nreqs) != 0) {
    619 				nvlist_free(nvl);
    620 				return (EINVAL);
    621 			}
    622 			break;
    623 		default:
    624 			nvlist_free(nvl);
    625 			return (EINVAL);
    626 		}
    627 		i++;
    628 	}
    629 
    630 	/*
    631 	 * There should be two members in the top-level nvlist:
    632 	 * an array of nvlists consisting of the requests, and flags.
    633 	 * Anything else is an invalid set.
    634 	 */
    635 	if (i != 2) {
    636 		nvlist_free(nvl);
    637 		return (EINVAL);
    638 	}
    639 
    640 	if (nreqs > CPC_MAX_NREQS) {
    641 		nvlist_free(nvl);
    642 		return (EINVAL);
    643 	}
    644 
    645 	/*
    646 	 * The requests are now stored in the nvlist array at reqlist.
    647 	 * Note that the use of kmem_zalloc() to alloc the kcpc_set_t means
    648 	 * we don't need to call the init routines for ks_lock and ks_condv.
    649 	 */
    650 	set = kmem_zalloc(sizeof (kcpc_set_t), KM_SLEEP);
    651 	set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
    652 	    nreqs, KM_SLEEP);
    653 	set->ks_nreqs = nreqs;
    654 	/*
    655 	 * If the nvlist didn't contain a flags member, setflags was initialized
    656 	 * with an illegal value and this set will fail sanity checks later on.
    657 	 */
    658 	set->ks_flags = setflags;
    659 	/*
    660 	 * Initialize bind/unbind set synchronization.
    661 	 */
    662 	set->ks_state &= ~KCPC_SET_BOUND;
    663 
    664 	/*
    665 	 * Build the set up one request at a time, always keeping it self-
    666 	 * consistent so we can give it to kcpc_free_set() if we need to back
    667 	 * out and return and error.
    668 	 */
    669 	for (i = 0; i < nreqs; i++) {
    670 		nvp = NULL;
    671 		set->ks_req[i].kr_picnum = -1;
    672 		while ((nvp = nvlist_next_nvpair(reqlist[i], nvp)) != NULL) {
    673 			name = nvpair_name(nvp);
    674 			switch (nvpair_type(nvp)) {
    675 			case DATA_TYPE_UINT32:
    676 				if (nvpair_value_uint32(nvp, &uint32) == EINVAL)
    677 					goto inval;
    678 				if (strcmp(name, "cr_flags") == 0)
    679 					set->ks_req[i].kr_flags = uint32;
    680 				if (strcmp(name, "cr_index") == 0)
    681 					set->ks_req[i].kr_index = uint32;
    682 				break;
    683 			case DATA_TYPE_UINT64:
    684 				if (nvpair_value_uint64(nvp, &uint64) == EINVAL)
    685 					goto inval;
    686 				if (strcmp(name, "cr_preset") == 0)
    687 					set->ks_req[i].kr_preset = uint64;
    688 				break;
    689 			case DATA_TYPE_STRING:
    690 				if (nvpair_value_string(nvp, &string) == EINVAL)
    691 					goto inval;
    692 				if (strcmp(name, "cr_event") == 0)
    693 					(void) strncpy(set->ks_req[i].kr_event,
    694 					    string, CPC_MAX_EVENT_LEN);
    695 				break;
    696 			case DATA_TYPE_NVLIST:
    697 				if (strcmp(name, "cr_attr") != 0)
    698 					goto inval;
    699 				if (nvpair_value_nvlist(nvp, &attrs) == EINVAL)
    700 					goto inval;
    701 				nvp_attr = NULL;
    702 				/*
    703 				 * If the picnum has been specified as an
    704 				 * attribute, consume that attribute here and
    705 				 * remove it from the list of attributes.
    706 				 */
    707 				if (nvlist_lookup_uint64(attrs, "picnum",
    708 				    &uint64) == 0) {
    709 					if (nvlist_remove(attrs, "picnum",
    710 					    DATA_TYPE_UINT64) != 0)
    711 						panic("nvlist %p faulty",
    712 						    (void *)attrs);
    713 					set->ks_req[i].kr_picnum = uint64;
    714 				}
    715 
    716 				if ((set->ks_req[i].kr_nattrs =
    717 				    kcpc_nvlist_npairs(attrs)) == 0)
    718 					break;
    719 
    720 				if (set->ks_req[i].kr_nattrs > CPC_MAX_ATTRS)
    721 					goto inval;
    722 
    723 				set->ks_req[i].kr_attr =
    724 				    kmem_alloc(set->ks_req[i].kr_nattrs *
    725 				    sizeof (kcpc_attr_t), KM_SLEEP);
    726 				j = 0;
    727 
    728 				while ((nvp_attr = nvlist_next_nvpair(attrs,
    729 				    nvp_attr)) != NULL) {
    730 					attrp = &set->ks_req[i].kr_attr[j];
    731 
    732 					if (nvpair_type(nvp_attr) !=
    733 					    DATA_TYPE_UINT64)
    734 						goto inval;
    735 
    736 					(void) strncpy(attrp->ka_name,
    737 					    nvpair_name(nvp_attr),
    738 					    CPC_MAX_ATTR_LEN);
    739 
    740 					if (nvpair_value_uint64(nvp_attr,
    741 					    &(attrp->ka_val)) == EINVAL)
    742 						goto inval;
    743 					j++;
    744 				}
    745 				ASSERT(j == set->ks_req[i].kr_nattrs);
    746 			default:
    747 				break;
    748 			}
    749 		}
    750 	}
    751 
    752 	nvlist_free(nvl);
    753 	*inset = set;
    754 	return (0);
    755 
    756 inval:
    757 	nvlist_free(nvl);
    758 	kcpc_free_set(set);
    759 	return (EINVAL);
    760 }
    761 
    762 /*
    763  * Count the number of nvpairs in the supplied nvlist.
    764  */
    765 static uint32_t
    766 kcpc_nvlist_npairs(nvlist_t *list)
    767 {
    768 	nvpair_t *nvp = NULL;
    769 	uint32_t n = 0;
    770 
    771 	while ((nvp = nvlist_next_nvpair(list, nvp)) != NULL)
    772 		n++;
    773 
    774 	return (n);
    775 }
    776 
    777 /*
    778  * Performs sanity checks on the given set.
    779  * Returns 0 if the set checks out OK.
    780  * Returns a detailed error subcode, or -1 if there is no applicable subcode.
    781  */
    782 static int
    783 kcpc_verify_set(kcpc_set_t *set)
    784 {
    785 	kcpc_request_t	*rp;
    786 	int		i;
    787 	uint64_t	bitmap = 0;
    788 	int		n;
    789 
    790 	if (set->ks_nreqs > cpc_ncounters)
    791 		return (-1);
    792 
    793 	if (CPC_SET_VALID_FLAGS(set->ks_flags) == 0)
    794 		return (-1);
    795 
    796 	for (i = 0; i < set->ks_nreqs; i++) {
    797 		rp = &set->ks_req[i];
    798 
    799 		/*
    800 		 * The following comparison must cast cpc_ncounters to an int,
    801 		 * because kr_picnum will be -1 if the request didn't explicitly
    802 		 * choose a PIC.
    803 		 */
    804 		if (rp->kr_picnum >= (int)cpc_ncounters)
    805 			return (CPC_INVALID_PICNUM);
    806 
    807 		/*
    808 		 * Of the pics whose physical picnum has been specified, make
    809 		 * sure each PIC appears only once in set.
    810 		 */
    811 		if ((n = set->ks_req[i].kr_picnum) != -1) {
    812 			if ((bitmap & (1 << n)) != 0)
    813 				return (-1);
    814 			bitmap |= (1 << n);
    815 		}
    816 
    817 		/*
    818 		 * Make sure the requested index falls within the range of all
    819 		 * requests.
    820 		 */
    821 		if (rp->kr_index < 0 || rp->kr_index >= set->ks_nreqs)
    822 			return (-1);
    823 
    824 		/*
    825 		 * Make sure there are no unknown flags.
    826 		 */
    827 		if (KCPC_REQ_VALID_FLAGS(rp->kr_flags) == 0)
    828 			return (CPC_REQ_INVALID_FLAGS);
    829 	}
    830 
    831 	return (0);
    832 }
    833 
/*
 * Character device entry points for the shared device.  Only open, close and
 * ioctl are implemented; the remaining operations are stubbed out with
 * nodev/nochpoll.
 */
static struct cb_ops cb_ops = {
	kcpc_open,	/* open */
	kcpc_close,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	kcpc_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,	/* presumably cb_prop_op; see cb_ops(9S) */
	NULL,		/* presumably the streamtab slot; see cb_ops(9S) */
	D_NEW | D_MP	/* presumably the driver flags; see cb_ops(9S) */
};
    851 
/*
 * Probe entry point.  Always reports success; devi is not examined.
 */
/*ARGSUSED*/
static int
kcpc_probe(dev_info_t *devi)
{
	return (DDI_PROBE_SUCCESS);
}
    858 
    859 static dev_info_t *kcpc_devi;
    860 
    861 static int
    862 kcpc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
    863 {
    864 	if (cmd != DDI_ATTACH)
    865 		return (DDI_FAILURE);
    866 	kcpc_devi = devi;
    867 	return (ddi_create_minor_node(devi, "shared", S_IFCHR,
    868 	    KCPC_MINOR_SHARED, DDI_PSEUDO, 0));
    869 }
    870 
    871 /*ARGSUSED*/
    872 static int
    873 kcpc_getinfo(dev_info_t *devi, ddi_info_cmd_t cmd, void *arg, void **result)
    874 {
    875 	switch (cmd) {
    876 	case DDI_INFO_DEVT2DEVINFO:
    877 		switch (getminor((dev_t)arg)) {
    878 		case KCPC_MINOR_SHARED:
    879 			*result = kcpc_devi;
    880 			return (DDI_SUCCESS);
    881 		default:
    882 			break;
    883 		}
    884 		break;
    885 	case DDI_INFO_DEVT2INSTANCE:
    886 		*result = 0;
    887 		return (DDI_SUCCESS);
    888 	default:
    889 		break;
    890 	}
    891 
    892 	return (DDI_FAILURE);
    893 }
    894 
/*
 * Device operations for the pseudo-driver.  Detach and reset are nodev;
 * no bus operations are provided.
 */
static struct dev_ops dev_ops = {
	DEVO_REV,
	0,
	kcpc_getinfo,
	nulldev,		/* identify */
	kcpc_probe,
	kcpc_attach,
	nodev,			/* detach */
	nodev,			/* reset */
	&cb_ops,
	(struct bus_ops *)0,
	NULL,
	ddi_quiesce_not_needed,		/* quiesce */
};

/*
 * Linkage element describing the driver half of this module.
 */
static struct modldrv modldrv = {
	&mod_driverops,
	"cpc sampling driver",
	&dev_ops
};

/*
 * System call table entry for cpc().  The leading 5 matches the number of
 * parameters cpc() takes.
 */
static struct sysent cpc_sysent = {
	5,
	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
	cpc
};

/*
 * Linkage element describing the system call half of this module.
 */
static struct modlsys modlsys = {
	&mod_syscallops,
	"cpc sampling system call",
	&cpc_sysent
};

#ifdef _SYSCALL32_IMPL
/*
 * Linkage element for the 32-bit system call; it shares cpc_sysent with the
 * native entry.
 */
static struct modlsys modlsys32 = {
	&mod_syscallops32,
	"32-bit cpc sampling system call",
	&cpc_sysent
};
#endif

/*
 * The module exports the driver and the system call(s) together, so a single
 * mod_install()/mod_remove() covers all of them.
 */
static struct modlinkage modl = {
	MODREV_1,
	&modldrv,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
};
    944 
    945 static void
    946 kcpc_init(void)
    947 {
    948 	long hash;
    949 
    950 	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
    951 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
    952 		mutex_init(&kcpc_ctx_llock[hash],
    953 		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);
    954 }
    955 
    956 static void
    957 kcpc_fini(void)
    958 {
    959 	long hash;
    960 
    961 	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
    962 		mutex_destroy(&kcpc_ctx_llock[hash]);
    963 	rw_destroy(&kcpc_cpuctx_lock);
    964 }
    965 
    966 int
    967 _init(void)
    968 {
    969 	int ret;
    970 
    971 	if (kcpc_hw_load_pcbe() != 0)
    972 		return (ENOTSUP);
    973 
    974 	kcpc_init();
    975 	if ((ret = mod_install(&modl)) != 0)
    976 		kcpc_fini();
    977 	return (ret);
    978 }
    979 
    980 int
    981 _fini(void)
    982 {
    983 	int ret;
    984 
    985 	if ((ret = mod_remove(&modl)) == 0)
    986 		kcpc_fini();
    987 	return (ret);
    988 }
    989 
    990 int
    991 _info(struct modinfo *mi)
    992 {
    993 	return (mod_info(&modl, mi));
    994 }
    995