Home | History | Annotate | Download | only in syscall
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"@(#)lgrpsys.c	1.12	06/10/24 SMI"
     28 
     29 /*
     30  * lgroup system calls
     31  */
     32 
     33 #include <sys/types.h>
     34 #include <sys/errno.h>
     35 #include <sys/sunddi.h>
     36 #include <sys/systm.h>
     37 #include <sys/mman.h>
     38 #include <sys/cpupart.h>
     39 #include <sys/lgrp.h>
     40 #include <sys/lgrp_user.h>
     41 #include <sys/promif.h>		/* for prom_printf() */
     42 #include <sys/sysmacros.h>
     43 
     44 #include <vm/as.h>
     45 
     46 
/*
 * Bit definitions for the per-address words returned through mi_validity.
 * meminfo() sets VALID_ADDR when the input address itself could be
 * evaluated, and ORs in (VALID_REQ << j) when the j'th entry of mi_info_req
 * was answered for that address.
 */
#define	VALID_ADDR	1
#define	VALID_REQ	2
     50 
     51 /*
     52  * run through the given number of addresses and requests and return the
     53  * corresponding memory information for each address
     54  */
     55 static int
     56 meminfo(int addr_count, struct meminfo *mip)
     57 {
     58 	size_t		in_size, out_size, req_size, val_size;
     59 	struct as	*as;
     60 	struct hat	*hat;
     61 	int		i, j, out_idx, info_count;
     62 	lgrp_t		*lgrp;
     63 	pfn_t		pfn;
     64 	ssize_t		pgsz;
     65 	int		*req_array, *val_array;
     66 	uint64_t	*in_array, *out_array;
     67 	uint64_t	addr, paddr;
     68 	uintptr_t	vaddr;
     69 	int		ret = 0;
     70 	struct meminfo minfo;
     71 #if defined(_SYSCALL32_IMPL)
     72 	struct meminfo32 minfo32;
     73 #endif
     74 
     75 	/*
     76 	 * Make sure that there is at least one address to translate and
     77 	 * limit how many virtual addresses the kernel can do per call
     78 	 */
     79 	if (addr_count < 1)
     80 		return (set_errno(EINVAL));
     81 	else if (addr_count > MAX_MEMINFO_CNT)
     82 		addr_count = MAX_MEMINFO_CNT;
     83 
     84 	if (get_udatamodel() == DATAMODEL_NATIVE) {
     85 		if (copyin(mip, &minfo, sizeof (struct meminfo)))
     86 			return (set_errno(EFAULT));
     87 	}
     88 #if defined(_SYSCALL32_IMPL)
     89 	else {
     90 		bzero(&minfo, sizeof (minfo));
     91 		if (copyin(mip, &minfo32, sizeof (struct meminfo32)))
     92 			return (set_errno(EFAULT));
     93 		minfo.mi_inaddr = (const uint64_t *)(uintptr_t)
     94 		    minfo32.mi_inaddr;
     95 		minfo.mi_info_req = (const uint_t *)(uintptr_t)
     96 		    minfo32.mi_info_req;
     97 		minfo.mi_info_count = minfo32.mi_info_count;
     98 		minfo.mi_outdata = (uint64_t *)(uintptr_t)
     99 		    minfo32.mi_outdata;
    100 		minfo.mi_validity = (uint_t *)(uintptr_t)
    101 		    minfo32.mi_validity;
    102 	}
    103 #endif
    104 	/*
    105 	 * all the input parameters have been copied in:-
    106 	 * addr_count - number of input addresses
    107 	 * minfo.mi_inaddr - array of input addresses
    108 	 * minfo.mi_info_req - array of types of information requested
    109 	 * minfo.mi_info_count - no. of pieces of info requested for each addr
    110 	 * minfo.mi_outdata - array into which the results are placed
    111 	 * minfo.mi_validity -  array containing bitwise result codes; 0th bit
    112 	 *			evaluates validity of corresponding input
    113 	 *			address, 1st bit validity of response to first
    114 	 *			member of info_req, etc.
    115 	 */
    116 
    117 	/* make sure mi_info_count is within limit */
    118 	info_count = minfo.mi_info_count;
    119 	if (info_count < 1 || info_count > MAX_MEMINFO_REQ)
    120 		return (set_errno(EINVAL));
    121 
    122 	/*
    123 	 * allocate buffer in_array for the input addresses and copy them in
    124 	 */
    125 	in_size = sizeof (uint64_t) * addr_count;
    126 	in_array = kmem_alloc(in_size, KM_SLEEP);
    127 	if (copyin(minfo.mi_inaddr, in_array, in_size)) {
    128 		kmem_free(in_array, in_size);
    129 		return (set_errno(EFAULT));
    130 	}
    131 
    132 	/*
    133 	 * allocate buffer req_array for the input info_reqs and copy them in
    134 	 */
    135 	req_size = sizeof (uint_t) * info_count;
    136 	req_array = kmem_alloc(req_size, KM_SLEEP);
    137 	if (copyin(minfo.mi_info_req, req_array, req_size)) {
    138 		kmem_free(req_array, req_size);
    139 		kmem_free(in_array, in_size);
    140 		return (set_errno(EFAULT));
    141 	}
    142 
    143 	/*
    144 	 * allocate buffer out_array which holds the results and will have
    145 	 * to be copied out later
    146 	 */
    147 	out_size = sizeof (uint64_t) * addr_count * info_count;
    148 	out_array = kmem_alloc(out_size, KM_SLEEP);
    149 
    150 	/*
    151 	 * allocate buffer val_array which holds the validity bits and will
    152 	 * have to be copied out later
    153 	 */
    154 	val_size = sizeof (uint_t) * addr_count;
    155 	val_array = kmem_alloc(val_size, KM_SLEEP);
    156 
    157 	if ((req_array[0] & MEMINFO_MASK) == MEMINFO_PLGRP) {
    158 		/* find the corresponding lgroup for each physical address */
    159 		for (i = 0; i < addr_count; i++) {
    160 			paddr = in_array[i];
    161 			pfn = btop(paddr);
    162 			lgrp = lgrp_pfn_to_lgrp(pfn);
    163 			if (lgrp) {
    164 				out_array[i] = lgrp->lgrp_id;
    165 				val_array[i] = VALID_ADDR | VALID_REQ;
    166 			} else {
    167 				out_array[i] = NULL;
    168 				val_array[i] = 0;
    169 			}
    170 		}
    171 	} else {
    172 		/* get the corresponding memory info for each virtual address */
    173 		as = curproc->p_as;
    174 
    175 		AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
    176 		hat = as->a_hat;
    177 		for (i = out_idx = 0; i < addr_count; i++, out_idx +=
    178 		    info_count) {
    179 			addr = in_array[i];
    180 			vaddr = (uintptr_t)(addr & ~PAGEOFFSET);
    181 			if (!as_segat(as, (caddr_t)vaddr)) {
    182 				val_array[i] = 0;
    183 				continue;
    184 			}
    185 			val_array[i] = VALID_ADDR;
    186 			pfn = hat_getpfnum(hat, (caddr_t)vaddr);
    187 			if (pfn != PFN_INVALID) {
    188 				paddr = (uint64_t)((pfn << PAGESHIFT) |
    189 					(addr & PAGEOFFSET));
    190 				for (j = 0; j < info_count; j++) {
    191 					switch (req_array[j] & MEMINFO_MASK) {
    192 					case MEMINFO_VPHYSICAL:
    193 						/*
    194 						 * return the physical address
    195 						 * corresponding to the input
    196 						 * virtual address
    197 						 */
    198 						out_array[out_idx + j] = paddr;
    199 						val_array[i] |= VALID_REQ << j;
    200 						break;
    201 					case MEMINFO_VLGRP:
    202 						/*
    203 						 * return the lgroup of physical
    204 						 * page corresponding to the
    205 						 * input virtual address
    206 						 */
    207 						lgrp = lgrp_pfn_to_lgrp(pfn);
    208 						if (lgrp) {
    209 							out_array[out_idx + j] =
    210 								lgrp->lgrp_id;
    211 							val_array[i] |=
    212 								VALID_REQ << j;
    213 						}
    214 						break;
    215 					case MEMINFO_VPAGESIZE:
    216 						/*
    217 						 * return the size of physical
    218 						 * page corresponding to the
    219 						 * input virtual address
    220 						 */
    221 						pgsz = hat_getpagesize(hat,
    222 							(caddr_t)vaddr);
    223 						if (pgsz != -1) {
    224 							out_array[out_idx + j] =
    225 									pgsz;
    226 							val_array[i] |=
    227 								VALID_REQ << j;
    228 						}
    229 						break;
    230 					case MEMINFO_VREPLCNT:
    231 						/*
    232 						 * for future use:-
    233 						 * return the no. replicated
    234 						 * physical pages corresponding
    235 						 * to the input virtual address,
    236 						 * so it is always 0 at the
    237 						 * moment
    238 						 */
    239 						out_array[out_idx + j] = 0;
    240 						val_array[i] |= VALID_REQ << j;
    241 						break;
    242 					case MEMINFO_VREPL:
    243 						/*
    244 						 * for future use:-
    245 						 * return the nth physical
    246 						 * replica of the specified
    247 						 * virtual address
    248 						 */
    249 						break;
    250 					case MEMINFO_VREPL_LGRP:
    251 						/*
    252 						 * for future use:-
    253 						 * return the lgroup of nth
    254 						 * physical replica of the
    255 						 * specified virtual address
    256 						 */
    257 						break;
    258 					case MEMINFO_PLGRP:
    259 						/*
    260 						 * this is for physical address
    261 						 * only, shouldn't mix with
    262 						 * virtual address
    263 						 */
    264 						break;
    265 					default:
    266 						break;
    267 					}
    268 				}
    269 			}
    270 		}
    271 		AS_LOCK_EXIT(as, &as->a_lock);
    272 	}
    273 
    274 	/* copy out the results and validity bits and free the buffers */
    275 	if ((copyout(out_array, minfo.mi_outdata, out_size) != 0) ||
    276 		(copyout(val_array, minfo.mi_validity, val_size) != 0))
    277 		ret = set_errno(EFAULT);
    278 
    279 	kmem_free(in_array, in_size);
    280 	kmem_free(out_array, out_size);
    281 	kmem_free(req_array, req_size);
    282 	kmem_free(val_array, val_size);
    283 
    284 	return (ret);
    285 }
    286 
    287 
    288 /*
    289  * Initialize lgroup affinities for thread
    290  */
    291 void
    292 lgrp_affinity_init(lgrp_affinity_t **bufaddr)
    293 {
    294 	if (bufaddr)
    295 		*bufaddr = NULL;
    296 }
    297 
    298 
    299 /*
    300  * Free lgroup affinities for thread and set to NULL
    301  * just in case thread gets recycled
    302  */
    303 void
    304 lgrp_affinity_free(lgrp_affinity_t **bufaddr)
    305 {
    306 	if (bufaddr && *bufaddr) {
    307 		kmem_free(*bufaddr, nlgrpsmax * sizeof (lgrp_affinity_t));
    308 		*bufaddr = NULL;
    309 	}
    310 }
    311 
    312 
    313 #define	P_ANY	-2	/* cookie specifying any ID */
    314 
    315 
    316 /*
    317  * Find LWP with given ID in specified process and get its affinity for
    318  * specified lgroup
    319  */
    320 lgrp_affinity_t
    321 lgrp_affinity_get_thread(proc_t *p, id_t lwpid, lgrp_id_t lgrp)
    322 {
    323 	lgrp_affinity_t aff;
    324 	int		found;
    325 	kthread_t	*t;
    326 
    327 	ASSERT(MUTEX_HELD(&p->p_lock));
    328 
    329 	aff = LGRP_AFF_NONE;
    330 	found = 0;
    331 	t = p->p_tlist;
    332 	/*
    333 	 * The process may be executing in proc_exit() and its p->p_list may be
    334 	 * already NULL.
    335 	 */
    336 	if (t == NULL)
    337 		return (set_errno(ESRCH));
    338 
    339 	do {
    340 		if (t->t_tid == lwpid || lwpid == P_ANY) {
    341 			thread_lock(t);
    342 			/*
    343 			 * Check to see whether caller has permission to set
    344 			 * affinity for LWP
    345 			 */
    346 			if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
    347 				thread_unlock(t);
    348 				return (set_errno(EPERM));
    349 			}
    350 
    351 			if (t->t_lgrp_affinity)
    352 				aff = t->t_lgrp_affinity[lgrp];
    353 			thread_unlock(t);
    354 			found = 1;
    355 			break;
    356 		}
    357 	} while ((t = t->t_forw) != p->p_tlist);
    358 	if (!found)
    359 		aff = set_errno(ESRCH);
    360 
    361 	return (aff);
    362 }
    363 
    364 
    365 /*
    366  * Get lgroup affinity for given LWP
    367  */
    368 lgrp_affinity_t
    369 lgrp_affinity_get(lgrp_affinity_args_t *ap)
    370 {
    371 	lgrp_affinity_t		aff;
    372 	lgrp_affinity_args_t	args;
    373 	id_t			id;
    374 	idtype_t		idtype;
    375 	lgrp_id_t		lgrp;
    376 	proc_t			*p;
    377 	kthread_t		*t;
    378 
    379 	/*
    380 	 * Copyin arguments
    381 	 */
    382 	if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0)
    383 		return (set_errno(EFAULT));
    384 
    385 	id = args.id;
    386 	idtype = args.idtype;
    387 	lgrp = args.lgrp;
    388 
    389 	/*
    390 	 * Check for invalid lgroup
    391 	 */
    392 	if (lgrp < 0 || lgrp == LGRP_NONE)
    393 		return (set_errno(EINVAL));
    394 
    395 	/*
    396 	 * Check for existing lgroup
    397 	 */
    398 	if (lgrp > lgrp_alloc_max)
    399 		return (set_errno(ESRCH));
    400 
    401 	/*
    402 	 * Get lgroup affinity for given LWP or process
    403 	 */
    404 	switch (idtype) {
    405 
    406 	case P_LWPID:
    407 		/*
    408 		 * LWP in current process
    409 		 */
    410 		p = curproc;
    411 		mutex_enter(&p->p_lock);
    412 		if (id != P_MYID)	/* different thread */
    413 			aff = lgrp_affinity_get_thread(p, id, lgrp);
    414 		else {			/* current thread */
    415 			aff = LGRP_AFF_NONE;
    416 			t = curthread;
    417 			thread_lock(t);
    418 			if (t->t_lgrp_affinity)
    419 				aff = t->t_lgrp_affinity[lgrp];
    420 			thread_unlock(t);
    421 		}
    422 		mutex_exit(&p->p_lock);
    423 		break;
    424 
    425 	case P_PID:
    426 		/*
    427 		 * Process
    428 		 */
    429 		mutex_enter(&pidlock);
    430 
    431 		if (id == P_MYID)
    432 			p = curproc;
    433 		else {
    434 			p = prfind(id);
    435 			if (p == NULL) {
    436 				mutex_exit(&pidlock);
    437 				return (set_errno(ESRCH));
    438 			}
    439 		}
    440 
    441 		mutex_enter(&p->p_lock);
    442 		aff = lgrp_affinity_get_thread(p, P_ANY, lgrp);
    443 		mutex_exit(&p->p_lock);
    444 
    445 		mutex_exit(&pidlock);
    446 		break;
    447 
    448 	default:
    449 		aff = set_errno(EINVAL);
    450 		break;
    451 	}
    452 
    453 	return (aff);
    454 }
    455 
    456 
    457 /*
    458  * Find lgroup for which this thread has most affinity in specified partition
    459  * starting from home lgroup unless specified starting lgroup is preferred
    460  */
    461 lpl_t *
    462 lgrp_affinity_best(kthread_t *t, struct cpupart *cpupart, lgrp_id_t start,
    463     boolean_t prefer_start)
    464 {
    465 	lgrp_affinity_t	*affs;
    466 	lgrp_affinity_t	best_aff;
    467 	lpl_t		*best_lpl;
    468 	lgrp_id_t	finish;
    469 	lgrp_id_t	home;
    470 	lgrp_id_t	lgrpid;
    471 	lpl_t		*lpl;
    472 
    473 	ASSERT(t != NULL);
    474 	ASSERT((MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0) ||
    475 	    (MUTEX_HELD(&ttoproc(t)->p_lock) && THREAD_LOCK_HELD(t)));
    476 	ASSERT(cpupart != NULL);
    477 
    478 	if (t->t_lgrp_affinity == NULL)
    479 		return (NULL);
    480 
    481 	affs = t->t_lgrp_affinity;
    482 
    483 	/*
    484 	 * Thread bound to CPU
    485 	 */
    486 	if (t->t_bind_cpu != PBIND_NONE) {
    487 		cpu_t	*cp;
    488 
    489 		/*
    490 		 * Find which lpl has most affinity among leaf lpl directly
    491 		 * containing CPU and its ancestor lpls
    492 		 */
    493 		cp = cpu[t->t_bind_cpu];
    494 
    495 		best_lpl = lpl = cp->cpu_lpl;
    496 		best_aff = affs[best_lpl->lpl_lgrpid];
    497 		while (lpl->lpl_parent != NULL) {
    498 			lpl = lpl->lpl_parent;
    499 			lgrpid = lpl->lpl_lgrpid;
    500 			if (affs[lgrpid] > best_aff) {
    501 				best_lpl = lpl;
    502 				best_aff = affs[lgrpid];
    503 			}
    504 		}
    505 		return (best_lpl);
    506 	}
    507 
    508 	/*
    509 	 * Start searching from home lgroup unless given starting lgroup is
    510 	 * preferred or home lgroup isn't in given pset.  Use root lgroup as
    511 	 * starting point if both home and starting lgroups aren't in given
    512 	 * pset.
    513 	 */
    514 	ASSERT(start >= 0 && start <= lgrp_alloc_max);
    515 	home = t->t_lpl->lpl_lgrpid;
    516 	if (!prefer_start && LGRP_CPUS_IN_PART(home, cpupart))
    517 		lgrpid = home;
    518 	else if (start != LGRP_NONE && LGRP_CPUS_IN_PART(start, cpupart))
    519 		lgrpid = start;
    520 	else
    521 		lgrpid = LGRP_ROOTID;
    522 
    523 	best_lpl = &cpupart->cp_lgrploads[lgrpid];
    524 	best_aff = affs[lgrpid];
    525 	finish = lgrpid;
    526 	do {
    527 		/*
    528 		 * Skip any lgroups that don't have CPU resources
    529 		 * in this processor set.
    530 		 */
    531 		if (!LGRP_CPUS_IN_PART(lgrpid, cpupart)) {
    532 			if (++lgrpid > lgrp_alloc_max)
    533 				lgrpid = 0;	/* wrap the search */
    534 			continue;
    535 		}
    536 
    537 		/*
    538 		 * Find lgroup with most affinity
    539 		 */
    540 		lpl = &cpupart->cp_lgrploads[lgrpid];
    541 		if (affs[lgrpid] > best_aff) {
    542 			best_aff = affs[lgrpid];
    543 			best_lpl = lpl;
    544 		}
    545 
    546 		if (++lgrpid > lgrp_alloc_max)
    547 			lgrpid = 0;	/* wrap the search */
    548 
    549 	} while (lgrpid != finish);
    550 
    551 	/*
    552 	 * No lgroup (in this pset) with any affinity
    553 	 */
    554 	if (best_aff == LGRP_AFF_NONE)
    555 		return (NULL);
    556 
    557 	lgrpid = best_lpl->lpl_lgrpid;
    558 	ASSERT(LGRP_CPUS_IN_PART(lgrpid, cpupart) && best_lpl->lpl_ncpu > 0);
    559 
    560 	return (best_lpl);
    561 }
    562 
    563 
    564 /*
    565  * Set thread's affinity for given lgroup
    566  */
    567 int
    568 lgrp_affinity_set_thread(kthread_t *t, lgrp_id_t lgrp, lgrp_affinity_t aff,
    569     lgrp_affinity_t **aff_buf)
    570 {
    571 	lgrp_affinity_t	*affs;
    572 	lgrp_id_t	best;
    573 	lpl_t		*best_lpl;
    574 	lgrp_id_t	home;
    575 	int		retval;
    576 
    577 	ASSERT(t != NULL);
    578 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
    579 
    580 	retval = 0;
    581 
    582 	thread_lock(t);
    583 
    584 	/*
    585 	 * Check to see whether caller has permission to set affinity for
    586 	 * thread
    587 	 */
    588 	if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
    589 		thread_unlock(t);
    590 		return (set_errno(EPERM));
    591 	}
    592 
    593 	if (t->t_lgrp_affinity == NULL) {
    594 		if (aff == LGRP_AFF_NONE) {
    595 			thread_unlock(t);
    596 			return (0);
    597 		}
    598 		ASSERT(aff_buf != NULL && *aff_buf != NULL);
    599 		t->t_lgrp_affinity = *aff_buf;
    600 		*aff_buf = NULL;
    601 	}
    602 
    603 	affs = t->t_lgrp_affinity;
    604 	affs[lgrp] = aff;
    605 
    606 	/*
    607 	 * Find lgroup for which thread has most affinity,
    608 	 * starting with lgroup for which affinity being set
    609 	 */
    610 	best_lpl = lgrp_affinity_best(t, t->t_cpupart, lgrp, B_TRUE);
    611 
    612 	/*
    613 	 * Rehome if found lgroup with more affinity than home or lgroup for
    614 	 * which affinity is being set has same affinity as home
    615 	 */
    616 	home = t->t_lpl->lpl_lgrpid;
    617 	if (best_lpl != NULL && best_lpl != t->t_lpl) {
    618 		best = best_lpl->lpl_lgrpid;
    619 		if (affs[best] > affs[home] || (affs[best] == affs[home] &&
    620 		    best == lgrp))
    621 			lgrp_move_thread(t, best_lpl, 1);
    622 	}
    623 
    624 	thread_unlock(t);
    625 
    626 	return (retval);
    627 }
    628 
    629 
    630 /*
    631  * Set process' affinity for specified lgroup
    632  */
    633 int
    634 lgrp_affinity_set_proc(proc_t *p, lgrp_id_t lgrp, lgrp_affinity_t aff,
    635     lgrp_affinity_t **aff_buf_array)
    636 {
    637 	lgrp_affinity_t	*buf;
    638 	int		err = 0;
    639 	int		i;
    640 	int		retval;
    641 	kthread_t	*t;
    642 
    643 	ASSERT(MUTEX_HELD(&pidlock) && MUTEX_HELD(&p->p_lock));
    644 	ASSERT(aff_buf_array != NULL);
    645 
    646 	i = 0;
    647 	t = p->p_tlist;
    648 	if (t != NULL) {
    649 		do {
    650 			/*
    651 			 * Set lgroup affinity for thread
    652 			 */
    653 			buf = aff_buf_array[i];
    654 			retval = lgrp_affinity_set_thread(t, lgrp, aff, &buf);
    655 
    656 			if (err == 0 && retval != 0)
    657 				err = retval;
    658 
    659 			/*
    660 			 * Advance pointer to next buffer
    661 			 */
    662 			if (buf == NULL) {
    663 				ASSERT(i < p->p_lwpcnt);
    664 				aff_buf_array[i] = NULL;
    665 				i++;
    666 			}
    667 
    668 		} while ((t = t->t_forw) != p->p_tlist);
    669 	}
    670 	return (err);
    671 }
    672 
    673 
    674 /*
    675  * Set LWP's or process' affinity for specified lgroup
    676  *
    677  * When setting affinities, pidlock, process p_lock, and thread_lock()
    678  * need to be held in that order to protect target thread's pset, process,
    679  * process contents, and thread contents.  thread_lock() does splhigh(),
    680  * so it ends up having similiar effect as kpreempt_disable(), so it will
    681  * protect calls to lgrp_move_thread() and lgrp_choose() from pset changes.
    682  */
    683 int
    684 lgrp_affinity_set(lgrp_affinity_args_t *ap)
    685 {
    686 	lgrp_affinity_t		aff;
    687 	lgrp_affinity_t		*aff_buf;
    688 	lgrp_affinity_args_t	args;
    689 	id_t			id;
    690 	idtype_t		idtype;
    691 	lgrp_id_t		lgrp;
    692 	int			nthreads;
    693 	proc_t			*p;
    694 	int			retval;
    695 
    696 	/*
    697 	 * Copyin arguments
    698 	 */
    699 	if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0)
    700 		return (set_errno(EFAULT));
    701 
    702 	idtype = args.idtype;
    703 	id = args.id;
    704 	lgrp = args.lgrp;
    705 	aff = args.aff;
    706 
    707 	/*
    708 	 * Check for invalid lgroup
    709 	 */
    710 	if (lgrp < 0 || lgrp == LGRP_NONE)
    711 		return (set_errno(EINVAL));
    712 
    713 	/*
    714 	 * Check for existing lgroup
    715 	 */
    716 	if (lgrp > lgrp_alloc_max)
    717 		return (set_errno(ESRCH));
    718 
    719 	/*
    720 	 * Check for legal affinity
    721 	 */
    722 	if (aff != LGRP_AFF_NONE && aff != LGRP_AFF_WEAK &&
    723 	    aff != LGRP_AFF_STRONG)
    724 		return (set_errno(EINVAL));
    725 
    726 	/*
    727 	 * Must be process or LWP ID
    728 	 */
    729 	if (idtype != P_LWPID && idtype != P_PID)
    730 		return (set_errno(EINVAL));
    731 
    732 	/*
    733 	 * Set given LWP's or process' affinity for specified lgroup
    734 	 */
    735 	switch (idtype) {
    736 
    737 	case P_LWPID:
    738 		/*
    739 		 * Allocate memory for thread's lgroup affinities
    740 		 * ahead of time w/o holding locks
    741 		 */
    742 		aff_buf = kmem_zalloc(nlgrpsmax * sizeof (lgrp_affinity_t),
    743 		    KM_SLEEP);
    744 
    745 		p = curproc;
    746 
    747 		/*
    748 		 * Set affinity for thread
    749 		 */
    750 		mutex_enter(&p->p_lock);
    751 		if (id == P_MYID) {		/* current thread */
    752 			retval = lgrp_affinity_set_thread(curthread, lgrp, aff,
    753 			    &aff_buf);
    754 		} else if (p->p_tlist == NULL) {
    755 			retval = set_errno(ESRCH);
    756 		} else {			/* other thread */
    757 			int		found = 0;
    758 			kthread_t	*t;
    759 
    760 			t = p->p_tlist;
    761 			do {
    762 				if (t->t_tid == id) {
    763 					retval = lgrp_affinity_set_thread(t,
    764 					    lgrp, aff, &aff_buf);
    765 					found = 1;
    766 					break;
    767 				}
    768 			} while ((t = t->t_forw) != p->p_tlist);
    769 			if (!found)
    770 				retval = set_errno(ESRCH);
    771 		}
    772 		mutex_exit(&p->p_lock);
    773 
    774 		/*
    775 		 * Free memory for lgroup affinities,
    776 		 * since thread didn't need it
    777 		 */
    778 		if (aff_buf)
    779 			kmem_free(aff_buf,
    780 			    nlgrpsmax * sizeof (lgrp_affinity_t));
    781 
    782 		break;
    783 
    784 	case P_PID:
    785 
    786 		do {
    787 			lgrp_affinity_t	**aff_buf_array;
    788 			int		i;
    789 			size_t		size;
    790 
    791 			/*
    792 			 * Get process
    793 			 */
    794 			mutex_enter(&pidlock);
    795 
    796 			if (id == P_MYID)
    797 				p = curproc;
    798 			else
    799 				p = prfind(id);
    800 
    801 			if (p == NULL) {
    802 				mutex_exit(&pidlock);
    803 				return (set_errno(ESRCH));
    804 			}
    805 
    806 			/*
    807 			 * Get number of threads in process
    808 			 *
    809 			 * NOTE: Only care about user processes,
    810 			 *	 so p_lwpcnt should be number of threads.
    811 			 */
    812 			mutex_enter(&p->p_lock);
    813 			nthreads = p->p_lwpcnt;
    814 			mutex_exit(&p->p_lock);
    815 
    816 			mutex_exit(&pidlock);
    817 
    818 			if (nthreads < 1)
    819 				return (set_errno(ESRCH));
    820 
    821 			/*
    822 			 * Preallocate memory for lgroup affinities for
    823 			 * each thread in process now to avoid holding
    824 			 * any locks.  Allocate an array to hold a buffer
    825 			 * for each thread.
    826 			 */
    827 			aff_buf_array = kmem_zalloc(nthreads *
    828 			    sizeof (lgrp_affinity_t *), KM_SLEEP);
    829 
    830 			size = nlgrpsmax * sizeof (lgrp_affinity_t);
    831 			for (i = 0; i < nthreads; i++)
    832 				aff_buf_array[i] = kmem_zalloc(size, KM_SLEEP);
    833 
    834 			mutex_enter(&pidlock);
    835 
    836 			/*
    837 			 * Get process again since dropped locks to allocate
    838 			 * memory (except current process)
    839 			 */
    840 			if (id != P_MYID)
    841 				p = prfind(id);
    842 
    843 			/*
    844 			 * Process went away after we dropped locks and before
    845 			 * reacquiring them, so drop locks, free memory, and
    846 			 * return.
    847 			 */
    848 			if (p == NULL) {
    849 				mutex_exit(&pidlock);
    850 				for (i = 0; i < nthreads; i++)
    851 					kmem_free(aff_buf_array[i], size);
    852 				kmem_free(aff_buf_array,
    853 				    nthreads * sizeof (lgrp_affinity_t *));
    854 				return (set_errno(ESRCH));
    855 			}
    856 
    857 			mutex_enter(&p->p_lock);
    858 
    859 			/*
    860 			 * See whether number of threads is same
    861 			 * If not, drop locks, free memory, and try again
    862 			 */
    863 			if (nthreads != p->p_lwpcnt) {
    864 				mutex_exit(&p->p_lock);
    865 				mutex_exit(&pidlock);
    866 				for (i = 0; i < nthreads; i++)
    867 					kmem_free(aff_buf_array[i], size);
    868 				kmem_free(aff_buf_array,
    869 				    nthreads * sizeof (lgrp_affinity_t *));
    870 				continue;
    871 			}
    872 
    873 			/*
    874 			 * Set lgroup affinity for threads in process
    875 			 */
    876 			retval = lgrp_affinity_set_proc(p, lgrp, aff,
    877 			    aff_buf_array);
    878 
    879 			mutex_exit(&p->p_lock);
    880 			mutex_exit(&pidlock);
    881 
    882 			/*
    883 			 * Free any leftover memory, since some threads may
    884 			 * have already allocated memory and set lgroup
    885 			 * affinities before
    886 			 */
    887 			for (i = 0; i < nthreads; i++)
    888 				if (aff_buf_array[i] != NULL)
    889 					kmem_free(aff_buf_array[i], size);
    890 			kmem_free(aff_buf_array,
    891 			    nthreads * sizeof (lgrp_affinity_t *));
    892 
    893 			break;
    894 
    895 		} while (nthreads != p->p_lwpcnt);
    896 
    897 		break;
    898 
    899 	default:
    900 		retval = set_errno(EINVAL);
    901 		break;
    902 	}
    903 
    904 	return (retval);
    905 }
    906 
    907 
    908 /*
    909  * Return the latest generation number for the lgroup hierarchy
    910  * with the given view
    911  */
    912 lgrp_gen_t
    913 lgrp_generation(lgrp_view_t view)
    914 {
    915 	cpupart_t	*cpupart;
    916 	uint_t		gen;
    917 
    918 	kpreempt_disable();
    919 
    920 	/*
    921 	 * Determine generation number for given view
    922 	 */
    923 	if (view == LGRP_VIEW_OS)
    924 		/*
    925 		 * Return generation number of lgroup hierarchy for OS view
    926 		 */
    927 		gen = lgrp_gen;
    928 	else {
    929 		/*
    930 		 * For caller's view, use generation numbers for lgroup
    931 		 * hierarchy and caller's pset
    932 		 * NOTE: Caller needs to check for change in pset ID
    933 		 */
    934 		cpupart = curthread->t_cpupart;
    935 		ASSERT(cpupart);
    936 		gen = lgrp_gen + cpupart->cp_gen;
    937 	}
    938 
    939 	kpreempt_enable();
    940 
    941 	return (gen);
    942 }
    943 
    944 
    945 lgrp_id_t
    946 lgrp_home_thread(kthread_t *t)
    947 {
    948 	lgrp_id_t	home;
    949 
    950 	ASSERT(t != NULL);
    951 	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
    952 
    953 	thread_lock(t);
    954 
    955 	/*
    956 	 * Check to see whether caller has permission to set affinity for
    957 	 * thread
    958 	 */
    959 	if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
    960 		thread_unlock(t);
    961 		return (set_errno(EPERM));
    962 	}
    963 
    964 	home = lgrp_home_id(t);
    965 
    966 	thread_unlock(t);
    967 	return (home);
    968 }
    969 
    970 
    971 /*
    972  * Get home lgroup of given process or thread
    973  */
    974 lgrp_id_t
    975 lgrp_home_get(idtype_t idtype, id_t id)
    976 {
    977 	proc_t		*p;
    978 	lgrp_id_t	retval;
    979 	kthread_t	*t;
    980 
    981 	/*
    982 	 * Get home lgroup of given LWP or process
    983 	 */
    984 	switch (idtype) {
    985 
    986 	case P_LWPID:
    987 		p = curproc;
    988 
    989 		/*
    990 		 * Set affinity for thread
    991 		 */
    992 		mutex_enter(&p->p_lock);
    993 		if (id == P_MYID) {		/* current thread */
    994 			retval = lgrp_home_thread(curthread);
    995 		} else if (p->p_tlist == NULL) {
    996 			retval = set_errno(ESRCH);
    997 		} else {			/* other thread */
    998 			int	found = 0;
    999 
   1000 			t = p->p_tlist;
   1001 			do {
   1002 				if (t->t_tid == id) {
   1003 					retval = lgrp_home_thread(t);
   1004 					found = 1;
   1005 					break;
   1006 				}
   1007 			} while ((t = t->t_forw) != p->p_tlist);
   1008 			if (!found)
   1009 				retval = set_errno(ESRCH);
   1010 		}
   1011 		mutex_exit(&p->p_lock);
   1012 		break;
   1013 
   1014 	case P_PID:
   1015 		/*
   1016 		 * Get process
   1017 		 */
   1018 		mutex_enter(&pidlock);
   1019 
   1020 		if (id == P_MYID)
   1021 			p = curproc;
   1022 		else
   1023 			p = prfind(id);
   1024 
   1025 		if (p == NULL) {
   1026 			mutex_exit(&pidlock);
   1027 			return (set_errno(ESRCH));
   1028 		}
   1029 
   1030 		mutex_enter(&p->p_lock);
   1031 		t = p->p_tlist;
   1032 		if (t == NULL)
   1033 			retval = set_errno(ESRCH);
   1034 		else
   1035 			retval = lgrp_home_thread(t);
   1036 		mutex_exit(&p->p_lock);
   1037 
   1038 		mutex_exit(&pidlock);
   1039 
   1040 		break;
   1041 
   1042 	default:
   1043 		retval = set_errno(EINVAL);
   1044 		break;
   1045 	}
   1046 
   1047 	return (retval);
   1048 }
   1049 
   1050 
   1051 /*
   1052  * Return latency between "from" and "to" lgroups
   1053  *
   1054  * This latency number can only be used for relative comparison
   1055  * between lgroups on the running system, cannot be used across platforms,
   1056  * and may not reflect the actual latency.  It is platform and implementation
   1057  * specific, so platform gets to decide its value.  It would be nice if the
   1058  * number was at least proportional to make comparisons more meaningful though.
   1059  */
   1060 int
   1061 lgrp_latency(lgrp_id_t from, lgrp_id_t to)
   1062 {
   1063 	lgrp_t		*from_lgrp;
   1064 	int		i;
   1065 	int		latency;
   1066 	int		latency_max;
   1067 	lgrp_t		*to_lgrp;
   1068 
   1069 	ASSERT(MUTEX_HELD(&cpu_lock));
   1070 
   1071 	if (from < 0 || to < 0)
   1072 		return (set_errno(EINVAL));
   1073 
   1074 	if (from > lgrp_alloc_max || to > lgrp_alloc_max)
   1075 		return (set_errno(ESRCH));
   1076 
   1077 	from_lgrp = lgrp_table[from];
   1078 	to_lgrp = lgrp_table[to];
   1079 
   1080 	if (!LGRP_EXISTS(from_lgrp) || !LGRP_EXISTS(to_lgrp)) {
   1081 		return (set_errno(ESRCH));
   1082 	}
   1083 
   1084 	/*
   1085 	 * Get latency for same lgroup
   1086 	 */
   1087 	if (from == to) {
   1088 		latency = from_lgrp->lgrp_latency;
   1089 		return (latency);
   1090 	}
   1091 
   1092 	/*
   1093 	 * Get latency between leaf lgroups
   1094 	 */
   1095 	if (from_lgrp->lgrp_childcnt == 0 && to_lgrp->lgrp_childcnt == 0)
   1096 		return (lgrp_plat_latency(from_lgrp->lgrp_plathand,
   1097 		    to_lgrp->lgrp_plathand));
   1098 
   1099 	/*
   1100 	 * Determine max latency between resources in two lgroups
   1101 	 */
   1102 	latency_max = 0;
   1103 	for (i = 0; i <= lgrp_alloc_max; i++) {
   1104 		lgrp_t	*from_rsrc;
   1105 		int	j;
   1106 		lgrp_t	*to_rsrc;
   1107 
   1108 		from_rsrc = lgrp_table[i];
   1109 		if (!LGRP_EXISTS(from_rsrc) ||
   1110 		    !klgrpset_ismember(from_lgrp->lgrp_set[LGRP_RSRC_CPU], i))
   1111 			continue;
   1112 
   1113 		for (j = 0; j <= lgrp_alloc_max; j++) {
   1114 			to_rsrc = lgrp_table[j];
   1115 			if (!LGRP_EXISTS(to_rsrc) ||
   1116 			    klgrpset_ismember(to_lgrp->lgrp_set[LGRP_RSRC_MEM],
   1117 			    j) == 0)
   1118 				continue;
   1119 			latency = lgrp_plat_latency(from_rsrc->lgrp_plathand,
   1120 			    to_rsrc->lgrp_plathand);
   1121 			if (latency > latency_max)
   1122 				latency_max = latency;
   1123 		}
   1124 	}
   1125 	return (latency_max);
   1126 }
   1127 
   1128 
   1129 /*
   1130  * Return lgroup interface version number
   1131  * 0 - none
   1132  * 1 - original
   1133  * 2 - lgrp_latency_cookie() and lgrp_resources() added
   1134  */
   1135 int
   1136 lgrp_version(int version)
   1137 {
   1138 	/*
   1139 	 * Return LGRP_VER_NONE when requested version isn't supported
   1140 	 */
   1141 	if (version < LGRP_VER_NONE || version > LGRP_VER_CURRENT)
   1142 		return (LGRP_VER_NONE);
   1143 
   1144 	/*
   1145 	 * Return current version when LGRP_VER_NONE passed in
   1146 	 */
   1147 	if (version == LGRP_VER_NONE)
   1148 		return (LGRP_VER_CURRENT);
   1149 
   1150 	/*
   1151 	 * Otherwise, return supported version.
   1152 	 */
   1153 	return (version);
   1154 }
   1155 
   1156 
   1157 /*
 * Snapshot of lgroup hierarchy
   1159  *
   1160  * One snapshot is kept and is based on the kernel's native data model, so
   1161  * a 32-bit snapshot is kept for the 32-bit kernel and a 64-bit one for the
   1162  * 64-bit kernel.  If a 32-bit user wants a snapshot from the 64-bit kernel,
   1163  * the kernel generates a 32-bit snapshot from the data in its 64-bit snapshot.
   1164  *
   1165  * The format is defined by lgroup snapshot header and the layout of
   1166  * the snapshot in memory is as follows:
   1167  * 1) lgroup snapshot header
   1168  *    - specifies format of snapshot
   1169  *    - defined by lgrp_snapshot_header_t
   1170  * 2) lgroup info array
   1171  *    - contains information about each lgroup
   1172  *    - one element for each lgroup
   1173  *    - each element is defined by lgrp_info_t
   1174  * 3) lgroup CPU ID array
   1175  *    - contains list (array) of CPU IDs for each lgroup
   1176  *    - lgrp_info_t points into array and specifies how many CPUs belong to
   1177  *      given lgroup
   1178  * 4) lgroup parents array
   1179  *    - contains lgroup bitmask of parents for each lgroup
   1180  *    - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
   1181  * 5) lgroup children array
   1182  *    - contains lgroup bitmask of children for each lgroup
   1183  *    - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
   1184  * 6) lgroup resources array
   1185  *    - contains lgroup bitmask of resources for each lgroup
   1186  *    - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
   1187  * 7) lgroup latency table
   1188  *    - contains latency from each lgroup to each of other lgroups
   1189  *
   1190  * NOTE:  Must use nlgrpsmax for per lgroup data structures because lgroups
   1191  *	  may be sparsely allocated.
   1192  */
/*
 * Pointer to the kernel's lgroup snapshot; initialized to NULL.  The 32-bit
 * caller path in lgrp_snapshot() treats the snapshot as up to date when this
 * pointer is non-NULL and its ss_gen equals lgrp_gen.
 */
lgrp_snapshot_header_t	*lgrp_snap = NULL;	/* lgroup snapshot */
/* NOTE(review): presumably serializes access to lgrp_snap - confirm */
static kmutex_t		lgrp_snap_lock;		/* snapshot lock */
   1195 
   1196 
   1197 /*
   1198  * Take a snapshot of lgroup hierarchy and return size of buffer
   1199  * needed to hold snapshot
   1200  */
   1201 static int
   1202 lgrp_snapshot(void)
   1203 {
   1204 	size_t		bitmask_size;
   1205 	size_t		bitmasks_size;
   1206 	size_t		bufsize;
   1207 	int		cpu_index;
   1208 	size_t		cpuids_size;
   1209 	int		i;
   1210 	int		j;
   1211 	size_t		info_size;
   1212 	size_t		lats_size;
   1213 	ulong_t		*lgrp_children;
   1214 	processorid_t	*lgrp_cpuids;
   1215 	lgrp_info_t	*lgrp_info;
   1216 	int		**lgrp_lats;
   1217 	ulong_t		*lgrp_parents;
   1218 	ulong_t		*lgrp_rsets;
   1219 	ulong_t		*lgrpset;
   1220 	int		snap_ncpus;
   1221 	int		snap_nlgrps;
   1222 	int		snap_nlgrpsmax;
   1223 	size_t		snap_hdr_size;
   1224 #ifdef	_SYSCALL32_IMPL
   1225 	model_t		model = DATAMODEL_NATIVE;
   1226 
   1227 	/*
   1228 	 * Have up-to-date snapshot, so check to see whether caller is 32-bit
   1229 	 * program and need to return size of 32-bit snapshot now.
   1230 	 */
   1231 	model = get_udatamodel();
   1232 	if (model == DATAMODEL_ILP32 && lgrp_snap &&
   1233 	    lgrp_snap->ss_gen == lgrp_gen) {
   1234 
   1235 		snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
   1236 
   1237 		/*
   1238 		 * Calculate size of buffer needed for 32-bit snapshot,
   1239 		 * rounding up size of each object to allow for alignment
   1240 		 * of next object in buffer.
   1241 		 */
   1242 		snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
   1243 		    sizeof (caddr32_t));
   1244 		info_size =
   1245 		    P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
   1246 		    sizeof (processorid_t));
   1247 		cpuids_size =
   1248 		    P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t),
   1249 		    sizeof (ulong_t));
   1250 
   1251 		/*
   1252 		 * lgroup bitmasks needed for parents, children, and resources
   1253 		 * for each lgroup and pset lgroup set
   1254 		 */
   1255 		bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
   1256 		bitmasks_size = (((2 + LGRP_RSRC_COUNT) *
   1257 		    snap_nlgrpsmax) + 1) * bitmask_size;
   1258 
   1259 		/*
   1260 		 * Size of latency table and buffer
   1261 		 */
   1262 		lats_size = snap_nlgrpsmax * sizeof (caddr32_t) +
   1263 		    snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int);
   1264 
   1265 		bufsize =