Home | History | Annotate | Download | only in os
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Basic NUMA support in terms of locality groups
     28  *
     29  * Solaris needs to know which CPUs, memory, etc. are near each other to
     30  * provide good performance on NUMA machines by optimizing for locality.
     31  * In order to do this, a new abstraction called a "locality group (lgroup)"
     32  * has been introduced to keep track of which CPU-like and memory-like hardware
     33  * resources are close to each other.  Currently, latency is the only measure
     34  * used to determine how to group hardware resources into lgroups, but this
     35  * does not limit the groupings to be based solely on latency.  Other factors
     36  * may be used to determine the groupings in the future.
     37  *
     38  * Lgroups are organized into a hierarchy or topology that represents the
     39  * latency topology of the machine.  There is always at least a root lgroup in
     40  * the system.  It represents all the hardware resources in the machine at a
     41  * latency big enough that any hardware resource can at least access any other
     42  * hardware resource within that latency.  A Uniform Memory Access (UMA)
     43  * machine is represented with one lgroup (the root).  In contrast, a NUMA
     44  * machine is represented at least by the root lgroup and some number of leaf
     45  * lgroups where the leaf lgroups contain the hardware resources within the
     46  * least latency of each other and the root lgroup still contains all the
     47  * resources in the machine.  Some number of intermediate lgroups may exist
     48  * which represent more levels of locality than just the local latency of the
     49  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
     50  * (eg. root and intermediate lgroups) contain the next nearest resources to
     51  * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
     52  * to the root lgroup shows the hardware resources from closest to farthest
     53  * from the leaf lgroup such that each successive ancestor lgroup contains
     54  * the next nearest resources at the next level of locality from the previous.
     55  *
     56  * The kernel uses the lgroup abstraction to know how to allocate resources
     57  * near a given process/thread.  At fork() and lwp/thread_create() time, a
     58  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
     59  * with the lowest load average.  Binding to a processor or processor set will
     60  * change the home lgroup for a thread.  The scheduler has been modified to try
     61  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
     62  * allocation is lgroup aware too, so memory will be allocated from the current
     63  * thread's home lgroup if possible.  If the desired resources are not
     64  * available, the kernel traverses the lgroup hierarchy going to the parent
     65  * lgroup to find resources at the next level of locality until it reaches the
     66  * root lgroup.
     67  */
     68 
     69 #include <sys/lgrp.h>
     70 #include <sys/lgrp_user.h>
     71 #include <sys/types.h>
     72 #include <sys/mman.h>
     73 #include <sys/param.h>
     74 #include <sys/var.h>
     75 #include <sys/thread.h>
     76 #include <sys/cpuvar.h>
     77 #include <sys/cpupart.h>
     78 #include <sys/kmem.h>
     79 #include <vm/seg.h>
     80 #include <vm/seg_kmem.h>
     81 #include <vm/seg_spt.h>
     82 #include <vm/seg_vn.h>
     83 #include <vm/as.h>
     84 #include <sys/atomic.h>
     85 #include <sys/systm.h>
     86 #include <sys/errno.h>
     87 #include <sys/cmn_err.h>
     88 #include <sys/kstat.h>
     89 #include <sys/sysmacros.h>
     90 #include <sys/pg.h>
     91 #include <sys/promif.h>
     92 #include <sys/sdt.h>
     93 
     94 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
     95 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
     96 				/* indexed by lgrp_id */
     97 int	nlgrps;			/* number of lgroups in machine */
     98 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
     99 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
    100 
    101 /*
    102  * Kstat data for lgroups.
    103  *
    104  * Actual kstat data is collected in lgrp_stats array.
    105  * The lgrp_kstat_data array of named kstats is used to extract data from
    106  * lgrp_stats and present it to kstat framework. It is protected from parallel
    107  * modifications by lgrp_kstat_mutex. This may cause some contention when
    108  * several kstat commands run in parallel but this is not the
    109  * performance-critical path.
    110  */
    111 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
    112 
    113 /*
    114  * Declare kstat names statically for enums as defined in the header file.
    115  */
    116 LGRP_KSTAT_NAMES;
    117 
    118 static void	lgrp_kstat_init(void);
    119 static int	lgrp_kstat_extract(kstat_t *, int);
    120 static void	lgrp_kstat_reset(lgrp_id_t);
    121 
    122 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
    123 static kmutex_t lgrp_kstat_mutex;
    124 
    125 
    126 /*
    127  * max number of lgroups supported by the platform
    128  */
    129 int	nlgrpsmax = 0;
    130 
    131 /*
    132  * The root lgroup. Represents the set of resources at the system wide
    133  * level of locality.
    134  */
    135 lgrp_t		*lgrp_root = NULL;
    136 
    137 /*
    138  * During system bootstrap cp_default does not contain the list of lgrp load
    139  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
    140  * on-line when cp_default is initialized by cpupart_initialize_default().
    141  * Configuring CPU0 may create a two-level topology with root and one leaf node
    142  * containing CPU0. This topology is initially constructed in a special
    143  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
    144  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
    145  * for all lpl operations until cp_default is fully constructed.
    146  *
    147  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
    148  * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
    149  * the first element of lpl_bootstrap_list.
    150  *
    151  * CPUs that are added to the system, but have not yet been assigned to an
    152  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
    153  * on some architectures (x86) it's possible for the slave CPU startup thread
    154  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
    155  */
    156 #define	LPL_BOOTSTRAP_SIZE 2
    157 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
    158 lpl_t		*lpl_bootstrap;
    159 static lpl_t	*lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
    160 static int	lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
    161 
    162 /*
    163  * If cp still references the bootstrap lpl, it has not yet been added to
    164  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
    165  * a thread is trying to allocate memory close to a CPU that has no lgrp.
    166  */
    167 #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
    168 
    169 static lgrp_t	lroot;
    170 
    171 /*
    172  * Size, in bytes, beyond which random memory allocation policy is applied
    173  * to non-shared memory.  Default is the maximum size, so random memory
    174  * allocation won't be used for non-shared memory by default.
    175  */
    176 size_t	lgrp_privm_random_thresh = (size_t)(-1);
    177 
    178 /* the maximum effect that a single thread can have on its lgroup's load */
    179 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
    180 	((lgrp_loadavg_max_effect) / (ncpu))
    181 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
    182 
    183 
    184 /*
    185  * Size, in bytes, beyond which random memory allocation policy is applied to
    186  * shared memory.  Default is 8MB (2 ISM pages).
    187  */
    188 size_t	lgrp_shm_random_thresh = 8*1024*1024;
    189 
    190 /*
    191  * Whether to do processor set aware memory allocation by default
    192  */
    193 int	lgrp_mem_pset_aware = 0;
    194 
    195 /*
    196  * Set the default memory allocation policy for root lgroup
    197  */
    198 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
    199 
    200 /*
    201  * Set the default memory allocation policy.  For most platforms,
    202  * next touch is sufficient, but some platforms may wish to override
    203  * this.
    204  */
    205 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
    206 
    207 
    208 /*
    209  * lgroup CPU event handlers
    210  */
    211 static void	lgrp_cpu_init(struct cpu *);
    212 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
    213 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
    214 
    215 /*
    216  * lgroup memory event handlers
    217  */
    218 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
    219 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
    220 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
    221 
    222 /*
    223  * lgroup CPU partition event handlers
    224  */
    225 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
    226 static void	lgrp_part_del_cpu(struct cpu *);
    227 
    228 /*
    229  * lgroup framework initialization
    230  */
    231 static void	lgrp_main_init(void);
    232 static void	lgrp_main_mp_init(void);
    233 static void	lgrp_root_init(void);
    234 static void	lgrp_setup(void);
    235 
    236 /*
    237  * lpl topology
    238  */
    239 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
    240 static void	lpl_clear(lpl_t *);
    241 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
    242 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
    243 static void	lpl_rset_add(lpl_t *, lpl_t *);
    244 static void	lpl_rset_del(lpl_t *, lpl_t *);
    245 static int	lpl_rset_contains(lpl_t *, lpl_t *);
    246 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
    247 static void	lpl_child_update(lpl_t *, struct cpupart *);
    248 static int	lpl_pick(lpl_t *, lpl_t *);
    249 static void	lpl_verify_wrapper(struct cpupart *);
    250 
    251 /*
    252  * defines for lpl topology verifier return codes
    253  */
    254 
    255 #define	LPL_TOPO_CORRECT			0
    256 #define	LPL_TOPO_PART_HAS_NO_LPL		-1
    257 #define	LPL_TOPO_CPUS_NOT_EMPTY			-2
    258 #define	LPL_TOPO_LGRP_MISMATCH			-3
    259 #define	LPL_TOPO_MISSING_PARENT			-4
    260 #define	LPL_TOPO_PARENT_MISMATCH		-5
    261 #define	LPL_TOPO_BAD_CPUCNT			-6
    262 #define	LPL_TOPO_RSET_MISMATCH			-7
    263 #define	LPL_TOPO_LPL_ORPHANED			-8
    264 #define	LPL_TOPO_LPL_BAD_NCPU			-9
    265 #define	LPL_TOPO_RSET_MSSNG_LF			-10
    266 #define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
    267 #define	LPL_TOPO_NONLEAF_HAS_CPUS		-12
    268 #define	LPL_TOPO_LGRP_NOT_LEAF			-13
    269 #define	LPL_TOPO_BAD_RSETCNT			-14
    270 
    271 /*
    272  * Return whether lgroup optimizations should be enabled on this system
    273  */
    274 int
    275 lgrp_optimizations(void)
    276 {
    277 	/*
    278 	 * System must have more than 2 lgroups to enable lgroup optimizations
    279 	 *
    280 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
    281 	 * with one child lgroup containing all the resources. A 2 lgroup
    282 	 * system with a root lgroup directly containing CPUs or memory might
    283 	 * need lgroup optimizations with its child lgroup, but there
    284 	 * isn't such a machine for now....
    285 	 */
    286 	if (nlgrps > 2)
    287 		return (1);
    288 
    289 	return (0);
    290 }
    291 
    292 /*
    293  * Setup root lgroup
    294  */
    295 static void
    296 lgrp_root_init(void)
    297 {
    298 	lgrp_handle_t	hand;
    299 	int		i;
    300 	lgrp_id_t	id;
    301 
    302 	/*
    303 	 * Create the "root" lgroup
    304 	 */
    305 	ASSERT(nlgrps == 0);
    306 	id = nlgrps++;
    307 
    308 	lgrp_root = &lroot;
    309 
    310 	lgrp_root->lgrp_cpu = NULL;
    311 	lgrp_root->lgrp_mnodes = 0;
    312 	lgrp_root->lgrp_nmnodes = 0;
    313 	hand = lgrp_plat_root_hand();
    314 	lgrp_root->lgrp_plathand = hand;
    315 
    316 	lgrp_root->lgrp_id = id;
    317 	lgrp_root->lgrp_cpucnt = 0;
    318 	lgrp_root->lgrp_childcnt = 0;
    319 	klgrpset_clear(lgrp_root->lgrp_children);
    320 	klgrpset_clear(lgrp_root->lgrp_leaves);
    321 	lgrp_root->lgrp_parent = NULL;
    322 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
    323 
    324 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
    325 		klgrpset_clear(lgrp_root->lgrp_set[i]);
    326 
    327 	lgrp_root->lgrp_kstat = NULL;
    328 
    329 	lgrp_table[id] = lgrp_root;
    330 
    331 	/*
    332 	 * Setup initial lpl list for CPU0 and initial t0 home.
    333 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
    334 	 * all topology operations until cp_default is initialized at which
    335 	 * point t0.t_lpl will be updated.
    336 	 */
    337 	lpl_bootstrap = lpl_bootstrap_list;
    338 	t0.t_lpl = lpl_bootstrap;
    339 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
    340 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
    341 
    342 	/*
    343 	 * Set up the bootstrap rset
    344 	 * Since the bootstrap toplogy has just the root, and a leaf,
    345 	 * the rset contains just the leaf, and both lpls can use the same rset
    346 	 */
    347 	lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
    348 	lpl_bootstrap_list[0].lpl_rset_sz = 1;
    349 	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
    350 	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
    351 
    352 	lpl_bootstrap_list[1].lpl_rset_sz = 1;
    353 	lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
    354 	lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;
    355 
    356 	cp_default.cp_lgrploads = lpl_bootstrap;
    357 }
    358 
    359 /*
    360  * Initialize the lgroup framework and allow the platform to do the same
    361  *
    362  * This happens in stages during boot and is all funnelled through this routine
    363  * (see definition of lgrp_init_stages_t to see what happens at each stage and
    364  * when)
    365  */
    366 void
    367 lgrp_init(lgrp_init_stages_t stage)
    368 {
    369 	/*
    370 	 * Initialize the platform
    371 	 */
    372 	lgrp_plat_init(stage);
    373 
    374 	switch (stage) {
    375 	case LGRP_INIT_STAGE1:
    376 		/*
    377 		 * Set max number of lgroups supported on this platform which
    378 		 * must be less than the max number of lgroups supported by the
    379 		 * common lgroup framework (eg. NLGRPS_MAX is max elements in
    380 		 * lgrp_table[], etc.)
    381 		 */
    382 		nlgrpsmax = lgrp_plat_max_lgrps();
    383 		ASSERT(nlgrpsmax <= NLGRPS_MAX);
    384 		break;
    385 
    386 	case LGRP_INIT_STAGE2:
    387 		lgrp_setup();
    388 		break;
    389 
    390 	case LGRP_INIT_STAGE4:
    391 		lgrp_main_init();
    392 		break;
    393 
    394 	case LGRP_INIT_STAGE5:
    395 		lgrp_main_mp_init();
    396 		break;
    397 
    398 	default:
    399 		break;
    400 	}
    401 }
    402 
    403 /*
    404  * Create the root and cpu0's lgroup, and set t0's home.
    405  */
    406 static void
    407 lgrp_setup(void)
    408 {
    409 	/*
    410 	 * Setup the root lgroup
    411 	 */
    412 	lgrp_root_init();
    413 
    414 	/*
    415 	 * Add cpu0 to an lgroup
    416 	 */
    417 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
    418 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
    419 }
    420 
    421 /*
    422  * true when lgrp initialization has been completed.
    423  */
    424 int	lgrp_initialized = 0;
    425 
    426 /*
    427  * True when lgrp topology is constructed.
    428  */
    429 int	lgrp_topo_initialized = 0;
    430 
    431 /*
    432  * Init routine called after startup(), /etc/system has been processed,
    433  * and cpu0 has been added to an lgroup.
    434  */
    435 static void
    436 lgrp_main_init(void)
    437 {
    438 	cpu_t		*cp = CPU;
    439 	lgrp_id_t	lgrpid;
    440 	int		i;
    441 	extern void	pg_cpu0_reinit();
    442 
    443 	/*
    444 	 * Enforce a valid lgrp_mem_default_policy
    445 	 */
    446 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
    447 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
    448 	    (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
    449 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
    450 
    451 	/*
    452 	 * See if mpo should be disabled.
    453 	 * This may happen in the case of null proc LPA on Starcat.
    454 	 * The platform won't be able to detect null proc LPA until after
    455 	 * cpu0 and memory have already been added to lgroups.
    456 	 * When and if it is detected, the Starcat platform will return
    457 	 * a different platform handle for cpu0 which is what we check for
    458 	 * here. If mpo should be disabled move cpu0 to it's rightful place
    459 	 * (the root), and destroy the remaining lgroups. This effectively
    460 	 * provides an UMA lgroup topology.
    461 	 */
    462 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
    463 	if (lgrp_table[lgrpid]->lgrp_plathand !=
    464 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
    465 		lgrp_part_del_cpu(cp);
    466 		lgrp_cpu_fini(cp, lgrpid);
    467 
    468 		lgrp_cpu_init(cp);
    469 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
    470 
    471 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
    472 
    473 		/*
    474 		 * Notify the PG subsystem that the CPU's lgrp
    475 		 * association has changed
    476 		 */
    477 		pg_cpu0_reinit();
    478 
    479 		/*
    480 		 * Destroy all lgroups except for root
    481 		 */
    482 		for (i = 0; i <= lgrp_alloc_max; i++) {
    483 			if (LGRP_EXISTS(lgrp_table[i]) &&
    484 			    lgrp_table[i] != lgrp_root)
    485 				lgrp_destroy(lgrp_table[i]);
    486 		}
    487 
    488 		/*
    489 		 * Fix up root to point at itself for leaves and resources
    490 		 * and not have any children
    491 		 */
    492 		lgrp_root->lgrp_childcnt = 0;
    493 		klgrpset_clear(lgrp_root->lgrp_children);
    494 		klgrpset_clear(lgrp_root->lgrp_leaves);
    495 		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
    496 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
    497 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
    498 	}
    499 
    500 	/*
    501 	 * Initialize kstats framework.
    502 	 */
    503 	lgrp_kstat_init();
    504 	/*
    505 	 * cpu0 is finally where it should be, so create it's lgroup's kstats
    506 	 */
    507 	mutex_enter(&cpu_lock);
    508 	lgrp_kstat_create(cp);
    509 	mutex_exit(&cpu_lock);
    510 
    511 	lgrp_initialized = 1;
    512 }
    513 
    514 /*
    515  * Finish lgrp initialization after all CPUS are brought on-line.
    516  * This routine is called after start_other_cpus().
    517  */
    518 static void
    519 lgrp_main_mp_init(void)
    520 {
    521 	klgrpset_t changed;
    522 
    523 	/*
    524 	 * Update lgroup topology (if necessary)
    525 	 */
    526 	klgrpset_clear(changed);
    527 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
    528 	lgrp_topo_initialized = 1;
    529 }
    530 
    531 /*
    532  * Change latency of lgroup with specified lgroup platform handle (if one is
    533  * given) or change all lgroups with old latency to new latency
    534  */
    535 void
    536 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
    537     u_longlong_t newtime)
    538 {
    539 	lgrp_t		*lgrp;
    540 	int		i;
    541 
    542 	for (i = 0; i <= lgrp_alloc_max; i++) {
    543 		lgrp = lgrp_table[i];
    544 
    545 		if (!LGRP_EXISTS(lgrp))
    546 			continue;
    547 
    548 		if ((hand == LGRP_NULL_HANDLE &&
    549 		    lgrp->lgrp_latency == oldtime) ||
    550 		    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
    551 			lgrp->lgrp_latency = (int)newtime;
    552 	}
    553 }
    554 
    555 /*
    556  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
    557  */
    558 void
    559 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
    560 {
    561 	klgrpset_t	changed;
    562 	cpu_t		*cp;
    563 	lgrp_id_t	id;
    564 	int		rc;
    565 
    566 	switch (event) {
    567 	/*
    568 	 * The following (re)configuration events are common code
    569 	 * initiated. lgrp_plat_config() is called here to inform the
    570 	 * platform of the reconfiguration event.
    571 	 */
    572 	case LGRP_CONFIG_CPU_ADD:
    573 		cp = (cpu_t *)resource;
    574 
    575 		/*
    576 		 * Initialize the new CPU's lgrp related next/prev
    577 		 * links, and give it a bootstrap lpl so that it can
    578 		 * survive should it need to enter the dispatcher.
    579 		 */
    580 		cp->cpu_next_lpl = cp;
    581 		cp->cpu_prev_lpl = cp;
    582 		cp->cpu_next_lgrp = cp;
    583 		cp->cpu_prev_lgrp = cp;
    584 		cp->cpu_lpl = lpl_bootstrap;
    585 
    586 		lgrp_plat_config(event, resource);
    587 		atomic_add_32(&lgrp_gen, 1);
    588 
    589 		break;
    590 	case LGRP_CONFIG_CPU_DEL:
    591 		lgrp_plat_config(event, resource);
    592 		atomic_add_32(&lgrp_gen, 1);
    593 
    594 		break;
    595 	case LGRP_CONFIG_CPU_ONLINE:
    596 		cp = (cpu_t *)resource;
    597 		lgrp_cpu_init(cp);
    598 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
    599 		rc = lpl_topo_verify(cp->cpu_part);
    600 		if (rc != LPL_TOPO_CORRECT) {
    601 			panic("lpl_topo_verify failed: %d", rc);
    602 		}
    603 		lgrp_plat_config(event, resource);
    604 		atomic_add_32(&lgrp_gen, 1);
    605 
    606 		break;
    607 	case LGRP_CONFIG_CPU_OFFLINE:
    608 		cp = (cpu_t *)resource;
    609 		id = cp->cpu_lpl->lpl_lgrpid;
    610 		lgrp_part_del_cpu(cp);
    611 		lgrp_cpu_fini(cp, id);
    612 		rc = lpl_topo_verify(cp->cpu_part);
    613 		if (rc != LPL_TOPO_CORRECT) {
    614 			panic("lpl_topo_verify failed: %d", rc);
    615 		}
    616 		lgrp_plat_config(event, resource);
    617 		atomic_add_32(&lgrp_gen, 1);
    618 
    619 		break;
    620 	case LGRP_CONFIG_CPUPART_ADD:
    621 		cp = (cpu_t *)resource;
    622 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
    623 		rc = lpl_topo_verify(cp->cpu_part);
    624 		if (rc != LPL_TOPO_CORRECT) {
    625 			panic("lpl_topo_verify failed: %d", rc);
    626 		}
    627 		lgrp_plat_config(event, resource);
    628 
    629 		break;
    630 	case LGRP_CONFIG_CPUPART_DEL:
    631 		cp = (cpu_t *)resource;
    632 		lgrp_part_del_cpu((cpu_t *)resource);
    633 		rc = lpl_topo_verify(cp->cpu_part);
    634 		if (rc != LPL_TOPO_CORRECT) {
    635 			panic("lpl_topo_verify failed: %d", rc);
    636 		}
    637 		lgrp_plat_config(event, resource);
    638 
    639 		break;
    640 	/*
    641 	 * The following events are initiated by the memnode
    642 	 * subsystem.
    643 	 */
    644 	case LGRP_CONFIG_MEM_ADD:
    645 		lgrp_mem_init((int)resource, where, B_FALSE);
    646 		atomic_add_32(&lgrp_gen, 1);
    647 
    648 		break;
    649 	case LGRP_CONFIG_MEM_DEL:
    650 		lgrp_mem_fini((int)resource, where, B_FALSE);
    651 		atomic_add_32(&lgrp_gen, 1);
    652 
    653 		break;
    654 	case LGRP_CONFIG_MEM_RENAME: {
    655 		lgrp_config_mem_rename_t *ren_arg =
    656 		    (lgrp_config_mem_rename_t *)where;
    657 
    658 		lgrp_mem_rename((int)resource,
    659 		    ren_arg->lmem_rename_from,
    660 		    ren_arg->lmem_rename_to);
    661 		atomic_add_32(&lgrp_gen, 1);
    662 
    663 		break;
    664 	}
    665 	case LGRP_CONFIG_GEN_UPDATE:
    666 		atomic_add_32(&lgrp_gen, 1);
    667 
    668 		break;
    669 	case LGRP_CONFIG_FLATTEN:
    670 		if (where == 0)
    671 			lgrp_topo_levels = (int)resource;
    672 		else
    673 			(void) lgrp_topo_flatten(resource,
    674 			    lgrp_table, lgrp_alloc_max, &changed);
    675 
    676 		break;
    677 	/*
    678 	 * Update any lgroups with old latency to new latency
    679 	 */
    680 	case LGRP_CONFIG_LAT_CHANGE_ALL:
    681 		lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
    682 		    (u_longlong_t)where);
    683 
    684 		break;
    685 	/*
    686 	 * Update lgroup with specified lgroup platform handle to have
    687 	 * new latency
    688 	 */
    689 	case LGRP_CONFIG_LAT_CHANGE:
    690 		lgrp_latency_change((lgrp_handle_t)resource, 0,
    691 		    (u_longlong_t)where);
    692 
    693 		break;
    694 	case LGRP_CONFIG_NOP:
    695 
    696 		break;
    697 	default:
    698 		break;
    699 	}
    700 
    701 }
    702 
    703 /*
    704  * Called to add lgrp info into cpu structure from cpu_add_unit;
    705  * do not assume cpu is in cpu[] yet!
    706  *
    707  * CPUs are brought online with all other CPUs paused so we can't
    708  * allocate memory or we could deadlock the system, so we rely on
    709  * the platform to statically allocate as much space as we need
    710  * for the lgrp structs and stats.
    711  */
    712 static void
    713 lgrp_cpu_init(struct cpu *cp)
    714 {
    715 	klgrpset_t	changed;
    716 	int		count;
    717 	lgrp_handle_t	hand;
    718 	int		first_cpu;
    719 	lgrp_t		*my_lgrp;
    720 	lgrp_id_t	lgrpid;
    721 	struct cpu	*cptr;
    722 
    723 	/*
    724 	 * This is the first time through if the resource set
    725 	 * for the root lgroup is empty. After cpu0 has been
    726 	 * initially added to an lgroup, the root's CPU resource
    727 	 * set can never be empty, since the system's last CPU
    728 	 * cannot be offlined.
    729 	 */
    730 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
    731 		/*
    732 		 * First time through.
    733 		 */
    734 		first_cpu = 1;
    735 	} else {
    736 		/*
    737 		 * If cpu0 needs to move lgroups, we may come
    738 		 * through here again, at which time cpu_lock won't
    739 		 * be held, and lgrp_initialized will be false.
    740 		 */
    741 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
    742 		ASSERT(cp->cpu_part != NULL);
    743 		first_cpu = 0;
    744 	}
    745 
    746 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
    747 	my_lgrp = lgrp_hand_to_lgrp(hand);
    748 
    749 	if (my_lgrp == NULL) {
    750 		/*
    751 		 * Create new lgrp and add it to lgroup topology
    752 		 */
    753 		my_lgrp = lgrp_create();
    754 		my_lgrp->lgrp_plathand = hand;
    755 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
    756 		lgrpid = my_lgrp->lgrp_id;
    757 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
    758 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
    759 
    760 		count = 0;
    761 		klgrpset_clear(changed);
    762 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
    763 		    &changed);
    764 		/*
    765 		 * May have added new intermediate lgroups, so need to add
    766 		 * resources other than CPUs which are added below
    767 		 */
    768 		(void) lgrp_mnode_update(changed, NULL);
    769 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
    770 	    > 0) {
    771 		/*
    772 		 * Leaf lgroup was created, but latency wasn't available
    773 		 * then.  So, set latency for it and fill in rest of lgroup
    774 		 * topology  now that we know how far it is from other leaf
    775 		 * lgroups.
    776 		 */
    777 		lgrpid = my_lgrp->lgrp_id;
    778 		klgrpset_clear(changed);
    779 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
    780 		    lgrpid))
    781 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
    782 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
    783 		    &changed);
    784 
    785 		/*
    786 		 * May have added new intermediate lgroups, so need to add
    787 		 * resources other than CPUs which are added below
    788 		 */
    789 		(void) lgrp_mnode_update(changed, NULL);
    790 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
    791 	    my_lgrp->lgrp_id)) {
    792 		int	i;
    793 
    794 		/*
    795 		 * Update existing lgroup and lgroups containing it with CPU
    796 		 * resource
    797 		 */
    798 		lgrpid = my_lgrp->lgrp_id;
    799 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
    800 		for (i = 0; i <= lgrp_alloc_max; i++) {
    801 			lgrp_t		*lgrp;
    802 
    803 			lgrp = lgrp_table[i];
    804 			if (!LGRP_EXISTS(lgrp) ||
    805 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
    806 				continue;
    807 
    808 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
    809 		}
    810 	}
    811 
    812 	lgrpid = my_lgrp->lgrp_id;
    813 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
    814 
    815 	/*
    816 	 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
    817 	 * end up in lpl for lgroup 0 whether it is supposed to be in there or
    818 	 * not since none of lgroup IDs in the lpl's have been set yet.
    819 	 */
    820 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
    821 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
    822 
    823 	/*
    824 	 * link the CPU into the lgrp's CPU list
    825 	 */
    826 	if (my_lgrp->lgrp_cpucnt == 0) {
    827 		my_lgrp->lgrp_cpu = cp;
    828 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
    829 	} else {
    830 		cptr = my_lgrp->lgrp_cpu;
    831 		cp->cpu_next_lgrp = cptr;
    832 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
    833 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
    834 		cptr->cpu_prev_lgrp = cp;
    835 	}
    836 	my_lgrp->lgrp_cpucnt++;
    837 }
    838 
    839 lgrp_t *
    840 lgrp_create(void)
    841 {
    842 	lgrp_t		*my_lgrp;
    843 	lgrp_id_t	lgrpid;
    844 	int		i;
    845 
    846 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
    847 
    848 	/*
    849 	 * Find an open slot in the lgroup table and recycle unused lgroup
    850 	 * left there if any
    851 	 */
    852 	my_lgrp = NULL;
    853 	if (lgrp_alloc_hint == -1)
    854 		/*
    855 		 * Allocate from end when hint not set yet because no lgroups
    856 		 * have been deleted yet
    857 		 */
    858 		lgrpid = nlgrps++;
    859 	else {
    860 		/*
    861 		 * Start looking for next open slot from hint and leave hint
    862 		 * at slot allocated
    863 		 */
    864 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
    865 			my_lgrp = lgrp_table[i];
    866 			if (!LGRP_EXISTS(my_lgrp)) {
    867 				lgrpid = i;
    868 				nlgrps++;
    869 				break;
    870 			}
    871 		}
    872 		lgrp_alloc_hint = lgrpid;
    873 	}
    874 
    875 	/*
    876 	 * Keep track of max lgroup ID allocated so far to cut down on searches
    877 	 */
    878 	if (lgrpid > lgrp_alloc_max)
    879 		lgrp_alloc_max = lgrpid;
    880 
    881 	/*
    882 	 * Need to allocate new lgroup if next open slot didn't have one
    883 	 * for recycling
    884 	 */
    885 	if (my_lgrp == NULL)
    886 		my_lgrp = lgrp_plat_alloc(lgrpid);
    887 
    888 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
    889 		panic("Too many lgrps for platform (%d)", nlgrps);
    890 
    891 	my_lgrp->lgrp_id = lgrpid;
    892 	my_lgrp->lgrp_latency = 0;
    893 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
    894 	my_lgrp->lgrp_parent = NULL;
    895 	my_lgrp->lgrp_childcnt = 0;
    896 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
    897 	my_lgrp->lgrp_nmnodes = 0;
    898 	klgrpset_clear(my_lgrp->lgrp_children);
    899 	klgrpset_clear(my_lgrp->lgrp_leaves);
    900 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
    901 		klgrpset_clear(my_lgrp->lgrp_set[i]);
    902 
    903 	my_lgrp->lgrp_cpu = NULL;
    904 	my_lgrp->lgrp_cpucnt = 0;
    905 
    906 	if (my_lgrp->lgrp_kstat != NULL)
    907 		lgrp_kstat_reset(lgrpid);
    908 
    909 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
    910 
    911 	return (my_lgrp);
    912 }
    913 
    914 void
    915 lgrp_destroy(lgrp_t *lgrp)
    916 {
    917 	int		i;
    918 
    919 	/*
    920 	 * Unless this lgroup is being destroyed on behalf of
    921 	 * the boot CPU, cpu_lock must be held
    922 	 */
    923 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
    924 
    925 	if (nlgrps == 1)
    926 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
    927 
    928 	if (!LGRP_EXISTS(lgrp))
    929 		return;
    930 
    931 	/*
    932 	 * Set hint to lgroup being deleted and try to keep lower numbered
    933 	 * hints to facilitate finding empty slots
    934 	 */
    935 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
    936 		lgrp_alloc_hint = lgrp->lgrp_id;
    937 
    938 	/*
    939 	 * Mark this lgroup to be recycled by setting its lgroup ID to
    940 	 * LGRP_NONE and clear relevant fields
    941 	 */
    942 	lgrp->lgrp_id = LGRP_NONE;
    943 	lgrp->lgrp_latency = 0;
    944 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
    945 	lgrp->lgrp_parent = NULL;
    946 	lgrp->lgrp_childcnt = 0;
    947 
    948 	klgrpset_clear(lgrp->lgrp_children);
    949 	klgrpset_clear(lgrp->lgrp_leaves);
    950 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
    951 		klgrpset_clear(lgrp->lgrp_set[i]);
    952 
    953 	lgrp->lgrp_mnodes = (mnodeset_t)0;
    954 	lgrp->lgrp_nmnodes = 0;
    955 
    956 	lgrp->lgrp_cpu = NULL;
    957 	lgrp->lgrp_cpucnt = 0;
    958 
    959 	nlgrps--;
    960 }
    961 
    962 /*
    963  * Initialize kstat data. Called from lgrp intialization code.
    964  */
    965 static void
    966 lgrp_kstat_init(void)
    967 {
    968 	lgrp_stat_t	stat;
    969 
    970 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
    971 
    972 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
    973 		kstat_named_init(&lgrp_kstat_data[stat],
    974 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
    975 }
    976 
    977 /*
    978  * initialize an lgrp's kstats if needed
    979  * called with cpu_lock held but not with cpus paused.
    980  * we don't tear these down now because we don't know about
    981  * memory leaving the lgrp yet...
    982  */
    983 
    984 void
    985 lgrp_kstat_create(cpu_t *cp)
    986 {
    987 	kstat_t		*lgrp_kstat;
    988 	lgrp_id_t	lgrpid;
    989 	lgrp_t		*my_lgrp;
    990 
    991 	ASSERT(MUTEX_HELD(&cpu_lock));
    992 
    993 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
    994 	my_lgrp = lgrp_table[lgrpid];
    995 
    996 	if (my_lgrp->lgrp_kstat != NULL)
    997 		return; /* already initialized */
    998 
    999 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
   1000 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
   1001 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
   1002 
   1003 	if (lgrp_kstat != NULL) {
   1004 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
   1005 		lgrp_kstat->ks_private = my_lgrp;
   1006 		lgrp_kstat->ks_data = &lgrp_kstat_data;
   1007 		lgrp_kstat->ks_update = lgrp_kstat_extract;
   1008 		my_lgrp->lgrp_kstat = lgrp_kstat;
   1009 		kstat_install(lgrp_kstat);
   1010 	}
   1011 }
   1012 
/*
 * Placeholder for tearing down an lgroup's kstats.
 *
 * For now this only asserts that cpu_lock is held and ignores "cp".  It will
 * do real work once we manage to remove lgroups that are no longer used.
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
	ASSERT(MUTEX_HELD(&cpu_lock));
}
   1023 
   1024 /*
   1025  * Called when a CPU is off-lined.
   1026  */
   1027 static void
   1028 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
   1029 {
   1030 	lgrp_t *my_lgrp;
   1031 	struct cpu *prev;
   1032 	struct cpu *next;
   1033 
   1034 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
   1035 
   1036 	prev = cp->cpu_prev_lgrp;
   1037 	next = cp->cpu_next_lgrp;
   1038 
   1039 	prev->cpu_next_lgrp = next;
   1040 	next->cpu_prev_lgrp = prev;
   1041 
   1042 	/*
   1043 	 * just because I'm paranoid doesn't mean...
   1044 	 */
   1045 
   1046 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
   1047 
   1048 	my_lgrp = lgrp_table[lgrpid];
   1049 	my_lgrp->lgrp_cpucnt--;
   1050 
   1051 	/*
   1052 	 * Removing last CPU in lgroup, so update lgroup topology
   1053 	 */
   1054 	if (my_lgrp->lgrp_cpucnt == 0) {
   1055 		klgrpset_t	changed;
   1056 		int		count;
   1057 		int		i;
   1058 
   1059 		my_lgrp->lgrp_cpu = NULL;
   1060 
   1061 		/*
   1062 		 * Remove this lgroup from its lgroup CPU resources and remove
   1063 		 * lgroup from lgroup topology if it doesn't have any more
   1064 		 * resources in it now
   1065 		 */
   1066 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
   1067 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
   1068 			count = 0;
   1069 			klgrpset_clear(changed);
   1070 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
   1071 			    lgrp_alloc_max + 1, &changed);
   1072 			return;
   1073 		}
   1074 
   1075 		/*
   1076 		 * This lgroup isn't empty, so just remove it from CPU
   1077 		 * resources of any lgroups that contain it as such
   1078 		 */
   1079 		for (i = 0; i <= lgrp_alloc_max; i++) {
   1080 			lgrp_t		*lgrp;
   1081 
   1082 			lgrp = lgrp_table[i];
   1083 			if (!LGRP_EXISTS(lgrp) ||
   1084 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
   1085 			    lgrpid))
   1086 				continue;
   1087 
   1088 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
   1089 		}
   1090 		return;
   1091 	}
   1092 
   1093 	if (my_lgrp->lgrp_cpu == cp)
   1094 		my_lgrp->lgrp_cpu = next;
   1095 
   1096 }
   1097 
   1098 /*
   1099  * Update memory nodes in target lgroups and return ones that get changed
   1100  */
   1101 int
   1102 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
   1103 {
   1104 	int	count;
   1105 	int	i;
   1106 	int	j;
   1107 	lgrp_t	*lgrp;
   1108 	lgrp_t	*lgrp_rsrc;
   1109 
   1110 	count = 0;
   1111 	if (changed)
   1112 		klgrpset_clear(*changed);
   1113 
   1114 	if (klgrpset_isempty(target))
   1115 		return (0);
   1116 
   1117 	/*
   1118 	 * Find each lgroup in target lgroups
   1119 	 */
   1120 	for (i = 0; i <= lgrp_alloc_max; i++) {
   1121 		/*
   1122 		 * Skip any lgroups that don't exist or aren't in target group
   1123 		 */
   1124 		lgrp = lgrp_table[i];
   1125 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
   1126 			continue;
   1127 		}
   1128 
   1129 		/*
   1130 		 * Initialize memnodes for intermediate lgroups to 0
   1131 		 * and update them from scratch since they may have completely
   1132 		 * changed
   1133 		 */
   1134 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
   1135 			lgrp->lgrp_mnodes = (mnodeset_t)0;
   1136 			lgrp->lgrp_nmnodes = 0;
   1137 		}
   1138 
   1139 		/*
   1140 		 * Update memory nodes of of target lgroup with memory nodes
   1141 		 * from each lgroup in its lgroup memory resource set
   1142 		 */
   1143 		for (j = 0; j <= lgrp_alloc_max; j++) {
   1144 			int	k;
   1145 
   1146 			/*
   1147 			 * Skip any lgroups that don't exist or aren't in
   1148 			 * memory resources of target lgroup
   1149 			 */
   1150 			lgrp_rsrc = lgrp_table[j];
   1151 			if (!LGRP_EXISTS(lgrp_rsrc) ||
   1152 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
   1153 			    j))
   1154 				continue;
   1155 
   1156 			/*
   1157 			 * Update target lgroup's memnodes to include memnodes
   1158 			 * of this lgroup
   1159 			 */
   1160 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
   1161 				mnodeset_t	mnode_mask;
   1162 
   1163 				mnode_mask = (mnodeset_t)1 << k;
   1164 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
   1165 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
   1166 					lgrp->lgrp_mnodes |= mnode_mask;
   1167 					lgrp->lgrp_nmnodes++;
   1168 				}
   1169 			}
   1170 			count++;
   1171 			if (changed)
   1172 				klgrpset_add(*changed, lgrp->lgrp_id);
   1173 		}
   1174 	}
   1175 
   1176 	return (count);
   1177 }
   1178 
/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the destination
	 * node.  B_TRUE marks both steps as part of a copy-rename.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}
   1224 
   1225 /*
   1226  * Called to indicate that the lgrp with platform handle "hand" now
   1227  * contains the memory identified by "mnode".
   1228  *
   1229  * LOCKING for this routine is a bit tricky. Usually it is called without
   1230  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
   1231  * callers. During DR of the board containing the caged memory it may be called
   1232  * with cpu_lock already held and CPUs paused.
   1233  *
   1234  * If the insertion is part of the DR copy-rename and the inserted mnode (and
   1235  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
   1236  * dealing with the special case of DR copy-rename described in
   1237  * lgrp_mem_rename().
   1238  */
   1239 void
   1240 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
   1241 {
   1242 	klgrpset_t	changed;
   1243 	int		count;
   1244 	int		i;
   1245 	lgrp_t		*my_lgrp;
   1246 	lgrp_id_t	lgrpid;
   1247 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
   1248 	boolean_t	drop_lock = B_FALSE;
   1249 	boolean_t	need_synch = B_FALSE;
   1250 
   1251 	/*
   1252 	 * Grab CPU lock (if we haven't already)
   1253 	 */
   1254 	if (!MUTEX_HELD(&cpu_lock)) {
   1255 		mutex_enter(&cpu_lock);
   1256 		drop_lock = B_TRUE;
   1257 	}
   1258 
   1259 	/*
   1260 	 * This routine may be called from a context where we already
   1261 	 * hold cpu_lock, and have already paused cpus.
   1262 	 */
   1263 	if (!cpus_paused())
   1264 		need_synch = B_TRUE;
   1265 
   1266 	/*
   1267 	 * Check if this mnode is already configured and return immediately if
   1268 	 * it is.
   1269 	 *
   1270 	 * NOTE: in special case of copy-rename of the only remaining mnode,
   1271 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
   1272 	 * recognize this case and continue as usual, but skip the update to
   1273 	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
   1274 	 * in topology, temporarily introduced by lgrp_mem_fini().
   1275 	 */
   1276 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
   1277 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
   1278 		if (drop_lock)
   1279 			mutex_exit(&cpu_lock);
   1280 		return;
   1281 	}
   1282 
   1283 	/*
   1284 	 * Update lgroup topology with new memory resources, keeping track of
   1285 	 * which lgroups change
   1286 	 */
   1287 	count = 0;
   1288 	klgrpset_clear(changed);
   1289 	my_lgrp = lgrp_hand_to_lgrp(hand);
   1290 	if (my_lgrp == NULL) {
   1291 		/* new lgrp */
   1292 		my_lgrp = lgrp_create();
   1293 		lgrpid = my_lgrp->lgrp_id;
   1294 		my_lgrp->lgrp_plathand = hand;
   1295 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
   1296 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
   1297 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
   1298 
   1299 		if (need_synch)
   1300 			pause_cpus(NULL);
   1301 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
   1302 		    &changed);
   1303 		if (need_synch)
   1304 			start_cpus();
   1305 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
   1306 	    > 0) {
   1307 		/*
   1308 		 * Leaf lgroup was created, but latency wasn't available
   1309 		 * then.  So, set latency for it and fill in rest of lgroup
   1310 		 * topology  now that we know how far it is from other leaf
   1311 		 * lgroups.
   1312 		 */
   1313 		klgrpset_clear(changed);
   1314 		lgrpid = my_lgrp->lgrp_id;
   1315 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
   1316 		    lgrpid))
   1317 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
   1318 		if (need_synch)
   1319 			pause_cpus(NULL);
   1320 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
   1321 		    &changed);
   1322 		if (need_synch)
   1323 			start_cpus();
   1324 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
   1325 	    my_lgrp->lgrp_id)) {
   1326 		/*
   1327 		 * Add new lgroup memory resource to existing lgroup
   1328 		 */
   1329 		lgrpid = my_lgrp->lgrp_id;
   1330 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
   1331 		klgrpset_add(changed, lgrpid);
   1332 		count++;
   1333 		for (i = 0; i <= lgrp_alloc_max; i++) {
   1334 			lgrp_t		*lgrp;
   1335 
   1336 			lgrp = lgrp_table[i];
   1337 			if (!LGRP_EXISTS(lgrp) ||
   1338 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
   1339 				continue;
   1340 
   1341 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
   1342 			klgrpset_add(changed, lgrp->lgrp_id);
   1343 			count++;
   1344 		}
   1345 	}
   1346 
   1347 	/*
   1348 	 * Add memory node to lgroup and remove lgroup from ones that need
   1349 	 * to be updated
   1350 	 */
   1351 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
   1352 		my_lgrp->lgrp_mnodes |= mnodes_mask;
   1353 		my_lgrp->lgrp_nmnodes++;
   1354 	}
   1355 	klgrpset_del(changed, lgrpid);
   1356 
   1357 	/*
   1358 	 * Update memory node information for all lgroups that changed and
   1359 	 * contain new memory node as a resource
   1360 	 */
   1361 	if (count)
   1362 		(void) lgrp_mnode_update(changed, NULL);
   1363 
   1364 	if (drop_lock)
   1365 		mutex_exit(&cpu_lock);
   1366 }
   1367 
   1368 /*
   1369  * Called to indicate that the lgroup associated with the platform
   1370  * handle "hand" no longer contains given memory node
   1371  *
   1372  * LOCKING for this routine is a bit tricky. Usually it is called without
   1373  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
   1374  * callers. During DR of the board containing the caged memory it may be called
   1375  * with cpu_lock already held and CPUs paused.
   1376  *
   1377  * If the deletion is part of the DR copy-rename and the deleted mnode is the
   1378  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
   1379  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
   1380  * the same mnode back into the topology. See lgrp_mem_rename() and
   1381  * lgrp_mem_init() for additional details.
   1382  */
   1383 void
   1384 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
   1385 {
   1386 	klgrpset_t	changed;
   1387 	int		count;
   1388 	int		i;
   1389 	lgrp_t		*my_lgrp;
   1390 	lgrp_id_t	lgrpid;
   1391 	mnodeset_t	mnodes_mask;
   1392 	boolean_t	drop_lock = B_FALSE;
   1393 	boolean_t	need_synch = B_FALSE;
   1394 
   1395 	/*
   1396 	 * Grab CPU lock (if we haven't already)
   1397 	 */
   1398 	if (!MUTEX_HELD(&cpu_lock)) {
   1399 		mutex_enter(&cpu_lock);
   1400 		drop_lock = B_TRUE;
   1401 	}
   1402 
   1403 	/*
   1404 	 * This routine may be called from a context where we already
   1405 	 * hold cpu_lock and have already paused cpus.
   1406 	 */
   1407 	if (!cpus_paused())
   1408 		need_synch = B_TRUE;
   1409 
   1410 	my_lgrp = lgrp_hand_to_lgrp(hand);
   1411 
   1412 	/*
   1413 	 * The lgrp *must* be pre-existing
   1414 	 */
   1415 	ASSERT(my_lgrp != NULL);
   1416 
   1417 	/*
   1418 	 * Delete memory node from lgroups which contain it
   1419 	 */
   1420 	mnodes_mask = ((mnodeset_t)1 << mnode);
   1421 	for (i = 0; i <= lgrp_alloc_max; i++) {
   1422 		lgrp_t *lgrp = lgrp_table[i];
   1423 		/*
   1424 		 * Skip any non-existent lgroups and any lgroups that don't
   1425 		 * contain leaf lgroup of memory as a memory resource
   1426 		 */
   1427 		if (!LGRP_EXISTS(lgrp) ||
   1428 		    !(lgrp->lgrp_mnodes & mnodes_mask))
   1429 			continue;
   1430 
   1431 		/*
   1432 		 * Avoid removing the last mnode from the root in the DR
   1433 		 * copy-rename case. See lgrp_mem_rename() for details.
   1434 		 */
   1435 		if (is_copy_rename &&
   1436 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
   1437 			continue;
   1438 
   1439 		/*
   1440 		 * Remove memory node from lgroup.
   1441 		 */
   1442 		lgrp->lgrp_mnodes &= ~mnodes_mask;
   1443 		lgrp->lgrp_nmnodes--;
   1444 		ASSERT(lgrp->lgrp_nmnodes >= 0);
   1445 	}
   1446 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
   1447 
   1448 	/*
   1449 	 * Don't need to update lgroup topology if this lgroup still has memory.
   1450 	 *
   1451 	 * In the special case of DR copy-rename with the only mnode being
   1452 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
   1453 	 * still need to update the lgroup topology.
   1454 	 */
   1455 	if ((my_lgrp->lgrp_nmnodes > 0) &&
   1456 	    !(is_copy_rename && (my_lgrp == lgrp_root) &&
   1457 	    (my_lgrp->lgrp_mnodes == mnodes_mask))) {
   1458 		if (drop_lock)
   1459 			mutex_exit(&cpu_lock);
   1460 		return;
   1461 	}
   1462 
   1463 	/*
   1464 	 * This lgroup does not contain any memory now
   1465 	 */
   1466 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
   1467 
   1468 	/*
   1469 	 * Remove this lgroup from lgroup topology if it does not contain any
   1470 	 * resources now
   1471 	 */
   1472 	lgrpid = my_lgrp->lgrp_id;
   1473 	count = 0;
   1474 	klgrpset_clear(changed);
   1475 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
   1476 		/*
   1477 		 * Delete lgroup when no more resources
   1478 		 */
   1479 		if (need_synch)
   1480 			pause_cpus(NULL);
   1481 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
   1482 		    lgrp_alloc_max + 1, &changed);
   1483 		ASSERT(count > 0);
   1484 		if (need_synch)
   1485 			start_cpus();
   1486 	} else {
   1487 		/*
   1488 		 * Remove lgroup from memory resources of any lgroups that
   1489 		 * contain it as such
   1490 		 */
   1491 		for (i = 0; i <= lgrp_alloc_max; i++) {
   1492 			lgrp_t		*lgrp;
   1493 
   1494 			lgrp = lgrp_table[i];
   1495 			if (!LGRP_EXISTS(lgrp) ||
   1496 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
   1497 			    lgrpid))
   1498 				continue;
   1499 
   1500 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
   1501 		}
   1502 	}
   1503 	if (drop_lock)
   1504 		mutex_exit(&cpu_lock);
   1505 }
   1506 
   1507 /*
   1508  * Return lgroup with given platform handle
   1509  */
   1510 lgrp_t *
   1511 lgrp_hand_to_lgrp(lgrp_handle_t hand)
   1512 {
   1513 	int	i;
   1514 	lgrp_t	*lgrp;
   1515 
   1516 	if (hand == LGRP_NULL_HANDLE)
   1517 		return (NULL);
   1518 
   1519 	for (i = 0; i <= lgrp_alloc_max; i++) {
   1520 		lgrp = lgrp_table[i];
   1521 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
   1522 			return (lgrp);
   1523 	}
   1524 	return (NULL);
   1525 }
   1526 
   1527 /*
   1528  * Return the home lgroup of the current thread.
   1529  * We must do this with kernel preemption disabled, since we don't want our
   1530  * thread to be re-homed while we're poking around with its lpl, and the lpl
   1531  * should never be NULL.
   1532  *
   1533  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
   1534  * is enabled because of DR.  Callers can use disable kernel preemption
   1535  * around this call to guarantee that the lgroup will be valid beyond this
   1536  * routine, since kernel preemption can be recursive.
   1537  */
   1538 lgrp_t *
   1539 lgrp_home_lgrp(void)
   1540 {
   1541 	lgrp_t	*lgrp;
   1542 	lpl_t	*lpl;
   1543 
   1544 	kpreempt_disable();
   1545 
   1546 	lpl = curthread->t_lpl;
   1547 	ASSERT(lpl != NULL);
   1548 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
   1549 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
   1550 	lgrp = lgrp_table[lpl->lpl_lgrpid];
   1551 
   1552 	kpreempt_enable();
   1553 
   1554 	return (lgrp);
   1555 }
   1556 
   1557 /*
   1558  * Return ID of home lgroup for given thread
   1559  * (See comments for lgrp_home_lgrp() for special care and handling
   1560  * instructions)
   1561  */
   1562 lgrp_id_t
   1563 lgrp_home_id(kthread_t *t)
   1564 {
   1565 	lgrp_id_t	lgrp;
   1566 	lpl_t		*lpl;
   1567 
   1568 	ASSERT(t != NULL);
   1569 	/*
   1570 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
   1571 	 * cannot since the HAT layer can call into this routine to
   1572 	 * determine the locality for its data structures in the context
   1573 	 * of a page fault.
   1574 	 */
   1575 
   1576 	kpreempt_disable();
   1577 
   1578 	lpl = t->t_lpl;
   1579 	ASSERT(lpl != NULL);
   1580 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
   1581 	lgrp = lpl->lpl_lgrpid;
   1582 
   1583 	kpreempt_enable();
   1584 
   1585 	return (lgrp);
   1586 }
   1587 
   1588 /*
   1589  * Return lgroup containing the physical memory for the given page frame number
   1590  */
   1591 lgrp_t *
   1592 lgrp_pfn_to_lgrp(pfn_t pfn)
   1593 {
   1594 	lgrp_handle_t	hand;
   1595 	int		i;
   1596 	lgrp_t		*lgrp;
   1597 
   1598 	hand = lgrp_plat_pfn_to_hand(pfn);
   1599 	if (hand != LGRP_NULL_HANDLE)
   1600 		for (i = 0; i <= lgrp_alloc_max; i++) {
   1601 			lgrp = lgrp_table[i];
   1602 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
   1603 				return (lgrp);
   1604 		}
   1605 	return (NULL);
   1606 }
   1607 
   1608 /*
   1609  * Return lgroup containing the physical memory for the given page frame number
   1610  */
   1611 lgrp_t *
   1612 lgrp_phys_to_lgrp(u_longlong_t physaddr)
   1613 {
   1614 	lgrp_handle_t	hand;
   1615 	int		i;
   1616 	lgrp_t		*lgrp;
   1617 	pfn_t		pfn;
   1618 
   1619 	pfn = btop(physaddr);
   1620 	hand = lgrp_plat_pfn_to_hand(pfn);
   1621 	if (hand != LGRP_NULL_HANDLE)
   1622 		for (i = 0; i <= lgrp_alloc_max; i++) {
   1623 			lgrp = lgrp_table[i];
   1624 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
   1625 				return (lgrp);
   1626 		}
   1627 	return (NULL);
   1628 }
   1629 
   1630 /*
   1631  * Return the leaf lgroup containing the given CPU
   1632  *
   1633  * The caller needs to take precautions necessary to prevent
   1634  * "cpu", and it's lpl from going away across a call to this function.
   1635  * hint: kpreempt_disable()/kpreempt_enable()
   1636  */
   1637 static lgrp_t *
   1638 lgrp_cpu_to_lgrp(cpu_t *cpu)
   1639 {
   1640 	return (cpu->cpu_lpl->lpl_lgrp);
   1641 }
   1642 
   1643 /*
   1644  * Return the sum of the partition loads in an lgrp divided by
   1645  * the number of CPUs in the lgrp.  This is our best approximation
   1646  * of an 'lgroup load average' for a useful per-lgroup kstat.
   1647  */
   1648 static uint64_t
   1649 lgrp_sum_loadavgs(lgrp_t *lgrp)
   1650 {
   1651 	cpu_t *cpu;
   1652 	int ncpu;
   1653 	uint64_t loads = 0;
   1654 
   1655 	mutex_enter(&cpu_lock);
   1656 
   1657 	cpu = lgrp->lgrp_cpu;
   1658 	ncpu = lgrp->lgrp_cpucnt;
   1659 
   1660 	if (cpu == NULL || ncpu == 0) {
   1661 		mutex_exit(&cpu_lock);
   1662 		return (0ull);
   1663 	}
   1664 
   1665 	do {
   1666 		loads += cpu->cpu_lpl->lpl_loadavg;
   1667 		cpu = cpu->cpu_next_lgrp;
   1668 	} while (cpu != lgrp->lgrp_cpu);
   1669 
   1670 	mutex_exit(&cpu_lock);
   1671 
   1672 	return (loads / ncpu);
   1673 }
   1674 
   1675 void
   1676 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
   1677 {
   1678 	struct lgrp_stats *pstats;
   1679 
   1680 	/*
   1681 	 * Verify that the caller isn't trying to add to
   1682 	 * a statistic for an lgroup that has gone away
   1683 	 */
   1684 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
   1685 		return;
   1686 
   1687 	pstats = &lgrp_stats[lgrpid];
   1688 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
   1689 }
   1690 
   1691 int64_t
   1692 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
   1693 {
   1694 	uint64_t val;
   1695 	struct lgrp_stats *pstats;
   1696 
   1697 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
   1698 		return ((int64_t)0);
   1699 
   1700 	pstats = &lgrp_stats[lgrpid];
   1701 	LGRP_STAT_READ(pstats, stat, val);
   1702 	return (val);
   1703 }
   1704 
   1705 /*
   1706  * Reset all kstats for lgrp specified by its lgrpid.
   1707  */
   1708 static void
   1709 lgrp_kstat_reset(lgrp_id_t lgrpid)
   1710 {
   1711 	lgrp_stat_t stat;
   1712 
   1713 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
   1714 		return;
   1715 
   1716 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
   1717 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
   1718 	}
   1719 }
   1720 
   1721 /*
   1722  * Collect all per-lgrp statistics for the lgrp associated with this
   1723  * kstat, and store them in the ks_data array.
   1724  *
   1725  * The superuser can reset all the running counter statistics for an
   1726  * lgrp by writing to any of the lgrp's stats.
   1727  */
   1728 static int
   1729 lgrp_kstat_extract(kstat_t *ksp, int rw)
   1730 {
   1731 	lgrp_stat_t		stat;
   1732 	struct kstat_named	*ksd;
   1733 	lgrp_t			*lgrp;
   1734 	lgrp_id_t		lgrpid;
   1735 
   1736 	lgrp = (lgrp_t *)ksp->ks_private;
   1737 
   1738 	ksd = (struct kstat_named *)ksp->ks_data;
   1739 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
   1740 
   1741 	lgrpid = lgrp->lgrp_id;
   1742 
   1743 	if (lgrpid == LGRP_NONE) {
   1744 		/*
   1745 		 * Return all zeroes as stats for freed lgrp.
   1746 		 */
   1747 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
   1748 			ksd[stat].value.i64 = 0;
   1749 		}
   1750 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
   1751 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
   1752 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
   1753 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
   1754 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
   1755 	} else if (rw != KSTAT_WRITE) {
   1756 		/*
   1757 		 * Handle counter stats
   1758 		 */
   1759 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
   1760 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
   1761 		}
   1762 
   1763 		/*
   1764 		 * Handle kernel data snapshot stats
   1765 		 */
   1766 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
   1767 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
   1768 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
   1769 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
   1770 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
   1771 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
   1772 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
   1773 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
   1774 		ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
   1775 		    lgrp_loadavg_max_effect;
   1776 	} else {
   1777 		lgrp_kstat_reset(lgrpid);
   1778 	}
   1779 
   1780 	return (0);
   1781 }
   1782 
   1783 int
   1784 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
   1785 {
   1786 	cpu_t	*cp;
   1787 
   1788 	mutex_enter(&cpu_lock);
   1789 
   1790 	if ((cp = cpu_get(id)) == NULL) {
   1791 		mutex_exit(&cpu_lock);
   1792 		return (EINVAL);
   1793 	}
   1794 
   1795 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
   1796 		mutex_exit(&cpu_lock);
   1797 		return (EINVAL);
   1798 	}
   1799 
   1800 	ASSERT(cp->cpu_lpl != NULL);
   1801 
   1802 	*lp = cp->cpu_lpl->lpl_lgrpid;
   1803 
   1804 	mutex_exit(&cpu_lock);
   1805 
   1806 	return (0);
   1807 }
   1808 
   1809 int
   1810 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
   1811 {
   1812 	cpu_t *cp;
   1813 
   1814 	mutex_enter(&cpu_lock);
   1815 
   1816 	if ((cp = cpu_get(id)) == NULL) {
   1817 		mutex_exit(&cpu_lock);
   1818 		return (EINVAL);
   1819 	}
   1820 
   1821 	ASSERT(cp->cpu_lpl != NULL);
   1822 
   1823 	*lp = cp->cpu_lpl->lpl_loadavg;
   1824 
   1825 	mutex_exit(&cpu_lock);
   1826 
   1827 	return (0);
   1828 }
   1829 
   1830 /*
   1831  * Add a resource named by lpl_leaf to rset of lpl_target
   1832  *
   1833  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
   1834  * resource. It is adjusted here, as this is presently the only place that we
   1835  * can be certain a resource addition has succeeded.
   1836  *
   1837  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
   1838  * list in order until it reaches a NULL.  (This list is required to be NULL
   1839  * terminated, too).  This is done so that we can mark start pos + 1, so that
   1840  * each lpl is traversed sequentially, but in a different order.  We hope this
   1841  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
   1842  */
   1843 
   1844 void
   1845 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
   1846 {
   1847 	int		i;
   1848 	int		entry_slot = 0;
   1849 
   1850 	/* return if leaf is already present */
   1851 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
   1852 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
   1853 			return;
   1854 		}
   1855 
   1856 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
   1857 		    lpl_leaf->lpl_lgrpid) {
   1858 			break;
   1859 		}
   1860 	}
   1861 
   1862 	/* insert leaf, update counts */
   1863 	entry_slot = i;
   1864 	i = lpl_target->lpl_nrset++;
   1865 
   1866 	/*
   1867 	 * Start at the end of the rset array and work backwards towards the
   1868 	 * slot into which the new lpl will be inserted. This effectively
   1869 	 * preserves the current ordering by scooting everybody over one entry,
   1870 	 * and placing the new entry into the space created.
   1871 	 */
   1872 	while (i-- > entry_slot) {
   1873 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
   1874 		lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
   1875 		    i + 1;
   1876 	}
   1877 
   1878 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
   1879 	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;
   1880 
   1881 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
   1882 }
   1883 
   1884 /*
   1885  * Update each of lpl_parent's children with a reference to their parent.
   1886  * The lgrp topology is used as the reference since it is fully
   1887  * consistent and correct at this point.
   1888  * This should be called after any potential change in lpl_parent's
   1889  * rset.
   1890  */
   1891 static void
   1892 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
   1893 {
   1894 	klgrpset_t	children;
   1895 	int		i;
   1896 
   1897 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
   1898 	if (klgrpset_isempty(children))
   1899 		return; /* nothing to do */
   1900 
   1901 	for (i = 0; i <= lgrp_alloc_max; i++) {
   1902 		if (klgrpset_ismember(children, i)) {
   1903 			/*
   1904 			 * (Re)set the parent. It may be incorrect if
   1905 			 * lpl_parent is new in the topology.
   1906 			 */
   1907 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
   1908 		}
   1909 	}
   1910 }
   1911 
   1912 /*
   1913  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
   1914  *
   1915  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
   1916  * resource. The values are adjusted here, as this is the only place that we can
   1917  * be certain a resource was successfully deleted.
   1918  */
   1919 void
   1920 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
   1921 {
   1922 	int i;
   1923 	lpl_t *leaf;
   1924 
   1925 	if (lpl_target->lpl_nrset == 0)
   1926 		return;
   1927 
   1928 	/* find leaf in intermediate node */
   1929 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
   1930 		if (lpl_target->lpl_rset[i] == lpl_leaf)
   1931 			break;
   1932 	}
   1933 
   1934 	/* return if leaf not found */
   1935 	if (lpl_target->lpl_rset[i] != lpl_leaf)
   1936 		return;
   1937 
   1938 	/* prune leaf, compress array */
   1939 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
   1940 	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
   1941 	lpl_target->lpl_ncpu--;
   1942 	do {
   1943 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
   1944 		/*
   1945 		 * Update the lgrp id <=> rset mapping
   1946 		 */
   1947 		if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
   1948 			lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
   1949 		}
   1950 	} while (i++ < lpl_target->lpl_nrset);
   1951 }
   1952 
   1953 /*
   1954  * Check to see if the resource set of the target lpl contains the
   1955  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
   1956  */
   1957 
   1958 int
   1959 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
   1960 {
   1961 	int i;
   1962 
   1963 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
   1964 		if (lpl_target->lpl_rset[i] == lpl_leaf)
   1965 			return (1);
   1966 	}
   1967 
   1968 	return (0);
   1969 }
   1970 
   1971 /*
   1972  * Called when we change cpu lpl membership.  This increments or decrements the
   1973  * per-cpu counter in every lpl in which our leaf appears.
   1974  */
   1975 void
   1976 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
   1977 {
   1978 	cpupart_t	*cpupart;
   1979 	lgrp_t		*lgrp_leaf;
   1980 	lgrp_t		*lgrp_cur;
   1981 	lpl_t		*lpl_leaf;
   1982 	lpl_t		*lpl_cur;
   1983 	int		i;
   1984 
   1985 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
   1986 
   1987 	cpupart = cp->cpu_part;
   1988 	lpl_leaf = cp->cpu_lpl;
   1989 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
   1990 
   1991 	for (i = 0; i <= lgrp_alloc_max; i++) {
   1992 		lgrp_cur = lgrp_table[i];
   1993 
   1994 		/*
   1995 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
   1996 		 * for the cpu in question, or if the current lgrp and leaf
   1997 		 * don't share the same resources.
   1998 		 */
   1999 
   2000 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
   2001 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
   2002 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
   2003 			continue;
   2004 
   2005 
   2006 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
   2007 
   2008 		if (lpl_cur->lpl_nrset > 0) {
   2009 			if (act == LPL_INCREMENT) {
   2010 				lpl_cur->lpl_ncpu++;
   2011 			} else if (act == LPL_DECREMENT) {
   2012 				lpl_cur->lpl_ncpu--;
   2013 			}
   2014 		}
   2015 	}
   2016 }
   2017 
   2018 /*
   2019  * Initialize lpl with given resources and specified lgrp
   2020  */
   2021 void
   2022 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
   2023 {
   2024 	lpl->lpl_lgrpid = lgrp->lgrp_id;
   2025 	lpl->lpl_loadavg = 0;
   2026 	if (lpl == lpl_leaf)
   2027 		lpl->lpl_ncpu = 1;
   2028 	else
   2029 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
   2030 	lpl->lpl_nrset = 1;
   2031 	lpl->lpl_rset[0] = lpl_leaf;
   2032 	lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
   2033 	lpl->lpl_lgrp = lgrp;
   2034 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
   2035 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
   2036 }
   2037 
   2038 /*
   2039  * Clear an unused lpl
   2040  */
   2041 void
   2042 lpl_clear(lpl_t *lpl)
   2043 {
   2044 	/*
   2045 	 * Clear out all fields in the lpl except:
   2046 	 *    lpl_lgrpid - to facilitate debugging
   2047 	 *    lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
   2048 	 *
   2049 	 * Note that the lpl's rset and id2rset mapping are cleared as well.
   2050 	 */
   2051 	lpl->lpl_loadavg = 0;
   2052 	lpl->lpl_ncpu = 0;
   2053 	lpl->lpl_lgrp = NULL;
   2054 	lpl->lpl_parent = NULL;
   2055 	lpl->lpl_cpus = NULL;
   2056 	lpl->lpl_nrset = 0;
   2057 	lpl->lpl_homed_time = 0;
   2058 	bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
   2059 	bzero(lpl->lpl_id2rset,
   2060 	    sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
   2061 }
   2062 
   2063 /*
   2064  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
   2065  * is in sync with the lgroup topology in the system.  The lpl topology may not
   2066  * make full use of all of the lgroup topology, but this checks to make sure
   2067  * that for the parts that it does use, it has correctly understood the
   2068  * relationships that exist. This function returns
   2069  * 0 if the topology is correct, and a non-zero error code, for non-debug
   2070  * kernels if incorrect.  Asserts are spread throughout the code to aid in
   2071  * debugging on a DEBUG kernel.
   2072  */
   2073 int
   2074 lpl_topo_verify(cpupart_t *cpupart)
   2075 {
   2076 	lgrp_t		*lgrp;
   2077 	lpl_t		*lpl;
   2078 	klgrpset_t	rset;
   2079 	klgrpset_t	cset;
   2080 	cpu_t		*cpu;
   2081 	cpu_t		*cp_start;
   2082 	int		i;
   2083 	int		j;
   2084 	int		sum;
   2085 
   2086 	/* topology can't be incorrect if it doesn't exist */
   2087 	if (!lgrp_topo_initialized || !lgrp_initialized)
   2088 		return (LPL_TOPO_CORRECT);
   2089 
   2090 	ASSERT(cpupart != NULL);
   2091 
   2092 	for (i = 0; i <= lgrp_alloc_max; i++) {
   2093 		lgrp = lgrp_table[i];
   2094 		lpl = NULL;
   2095 		/* make sure lpls are allocated */
   2096 		ASSERT(cpupart->cp_lgrploads);
   2097 		if (!cpupart->cp_lgrploads)
   2098 			return (LPL_TOPO_PART_HAS_NO_LPL);
   2099 
   2100 		lpl = &cpupart->cp_lgrploads[i];
   2101 		/* make sure our index is good */
   2102 		ASSERT(i < cpupart->cp_nlgrploads);
   2103 
   2104 		/* if lgroup doesn't exist, make sure lpl is empty */
   2105 		if (!LGRP_EXISTS(lgrp)) {
   2106 			ASSERT(lpl->lpl_ncpu == 0);
   2107 			if (lpl->lpl_ncpu > 0) {
   2108 				return (LPL_TOPO_CPUS_NOT_EMPTY);
   2109 			} else {
   2110 				continue;
   2111 			}
   2112 		}
   2113 
   2114 		/* verify that lgroup and lpl are identically numbered */
   2115 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
   2116 
   2117 		/* if lgroup isn't in our partition, make sure lpl is empty */
   2118 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
   2119 		    cpupart->cp_lgrpset)) {
   2120 			ASSERT(lpl->lpl_ncpu == 0);
   2121 			if (lpl->lpl_ncpu > 0) {
   2122 				return (LPL_TOPO_CPUS_NOT_EMPTY);
   2123 			}
   2124 			/*
   2125 			 * lpl is empty, and lgroup isn't in partition.  verify
   2126 			 * that lpl doesn't show up in anyone else's rsets (in
   2127 			 * this partition, anyway)
   2128 			 */
   2129 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
   2130 				lpl_t *i_lpl; /* lpl we're iterating over */
   2131 
   2132 				i_lpl = &cpupart->cp_lgrploads[j];
   2133 
   2134 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
   2135 				if (lpl_rset_contains(i_lpl, lpl)) {
   2136 					return (LPL_TOPO_LPL_ORPHANED);
   2137 				}
   2138 			}
   2139 			/* lgroup is empty, and everything is ok. continue */
   2140 			continue;
   2141 		}
   2142 
   2143 
   2144 		/* lgroup is in this partition, now check it against lpl */
   2145 
   2146 		/* do both have matching lgrps? */
   2147 		ASSERT(lgrp == lpl->lpl_lgrp);
   2148 		if (lgrp != lpl->lpl_lgrp) {
   2149 			return (LPL_TOPO_LGRP_MISMATCH);
   2150 		}
   2151 
   2152 		/* do the parent lgroups exist and do they match? */
   2153 		if (lgrp->lgrp_parent) {
   2154 			ASSERT(lpl->lpl_parent);
   2155 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
   2156 			    lpl->lpl_parent->lpl_lgrpid);
   2157 
   2158 			if (!lpl->lpl_parent) {
   2159 				return (LPL_TOPO_MISSING_PARENT);
   2160 			} else if (lgrp->lgrp_parent->lgrp_id !=
   2161 			    lpl->lpl_parent->lpl_lgrpid) {
   2162 				return (LPL_TOPO_PARENT_MISMATCH);
   2163 			}
   2164 		}
   2165 
   2166 		/* only leaf lgroups keep a cpucnt, only check leaves */
   2167 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
   2168 
   2169 			/* verify that lgrp is also a leaf */
   2170 			ASSERT((lgrp->lgrp_childcnt == 0) &&
   2171 			    (klgrpset_ismember(lgrp->lgrp_leaves,
   2172 			    lpl->lpl_lgrpid)));
   2173 
   2174 			if ((lgrp->lgrp_childcnt > 0) ||
   2175 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
   2176 			    lpl->lpl_lgrpid))) {
   2177 				return (LPL_TOPO_LGRP_NOT_LEAF);
   2178 			}
   2179 
   2180 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
   2181 			    (lpl->lpl_ncpu > 0));
   2182 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
   2183 			    (lpl->lpl_ncpu <= 0)) {
   2184 				return (LPL_TOPO_BAD_CPUCNT);
   2185 			}
   2186 
   2187 			/*
   2188 			 * Check that lpl_ncpu also matches the number of
   2189 			 * cpus in the lpl's linked list.  This only exists in
   2190 			 * leaves, but they should always match.
   2191 			 */
   2192 			j = 0;
   2193 			cpu = cp_start = lpl->lpl_cpus;
   2194 			while (cpu != NULL) {
   2195 				j++;
   2196 
   2197 				/* check to make sure cpu's lpl is leaf lpl */
   2198 				ASSERT(cpu->cpu_lpl == lpl);
   2199 				if (cpu->cpu_lpl != lpl) {
   2200 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
   2201 				}
   2202 
   2203 				/* check next cpu */
   2204 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
   2205 					continue;
   2206 				} else {
   2207 					cpu = NULL;
   2208 				}
   2209 			}
   2210 
   2211 			ASSERT(j == lpl->lpl_ncpu);
   2212 			if (j != lpl->lpl_ncpu) {
   2213 				return (LPL_TOPO_LPL_BAD_NCPU);
   2214 			}
   2215 
   2216 			/*
   2217 			 * Also, check that leaf lpl is contained in all
   2218 			 * intermediate lpls that name the leaf as a descendant
   2219 			 */
   2220 			for (j = 0; j <= lgrp_alloc_max; j++) {
   2221 				klgrpset_t intersect;
   2222 				lgrp_t *lgrp_cand;
   2223 				lpl_t *lpl_cand;
   2224 
   2225 				lgrp_cand = lgrp_table[j];
   2226 				intersect = klgrpset_intersects(
   2227 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
   2228 				    cpupart->cp_lgrpset);
   2229 
   2230 				if (!LGRP_EXISTS(lgrp_cand) ||
   2231 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
   2232 				    cpupart->cp_lgrpset) ||
   2233 				    (intersect == 0))
   2234 					continue;
   2235 
   2236 				lpl_cand =
   2237 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
   2238 
   2239 				if (klgrpset_ismember(intersect,
   2240 				    lgrp->lgrp_id)) {
   2241 					ASSERT(lpl_rset_contains(lpl_cand,
   2242 					    lpl));
   2243 
   2244 					if (!lpl_rset_contains(lpl_cand, lpl)) {
   2245 						return (LPL_TOPO_RSET_MSSNG_LF);
   2246 					}
   2247 				}
   2248 			}
   2249 
   2250 		} else { /* non-leaf specific checks */
   2251 
   2252 			/*
   2253 			 * Non-leaf lpls should have lpl_cpus == NULL
   2254 			 * verify that this is so
   2255 			 */
   2256 			ASSERT(lpl->lpl_cpus == NULL);
   2257 			if (lpl->lpl_cpus != NULL) {
   2258 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
   2259 			}
   2260 
   2261 			/*
   2262 			 * verify that the sum of the cpus in the leaf resources
   2263 			 * is equal to the total ncpu in the intermediate
   2264 			 */
   2265 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
   2266 				sum += lpl->lpl_rset[j]->lpl_ncpu;
   2267 			}
   2268 
   2269 			ASSERT(sum == lpl->lpl_ncpu);
   2270 			if (sum != lpl->lpl_ncpu) {
   2271 				return (LPL_TOPO_LPL_BAD_NCPU);
   2272 			}
   2273 		}
   2274 
   2275 		/*
   2276 		 * Check the rset of the lpl in question.  Make sure that each
   2277 		 * rset contains a subset of the resources in
   2278 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
   2279 		 * sure that each rset doesn't include resources that are
   2280 		 * outside of that set.  (Which would be resources somehow not
   2281 		 * accounted for).
   2282 		 */
   2283 		klgrpset_clear(rset);
   2284 		for (j = 0; j < lpl->lpl_nrset; j++) {
   2285 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
   2286 		}
   2287 		klgrpset_copy(cset, rset);
   2288 		/* make sure lpl rset matches lgrp rset */
   2289 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
   2290 		/* make sure rset is contained with in partition, too */
   2291 		klgrpset_diff(cset, cpupart->cp_lgrpset);
   2292 
   2293 		ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
   2294 		if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
   2295 			return (LPL_TOPO_RSET_MISMATCH);
   2296 		}
   2297 
   2298 		/*
   2299 		 * check to make sure lpl_nrset matches the number of rsets
   2300 		 * contained in the lpl
   2301 		 */
   2302 		for (j = 0; j < lpl->lpl_nrset; j++) {
   2303 			if (lpl->lpl_rset[j] == NULL)
   2304 				break;
   2305 		}
   2306 
   2307 		ASSERT(j == lpl->lpl_nrset);
   2308 		if (j != lpl->lpl_nrset) {
   2309 			return (LPL_TOPO_BAD_RSETCNT);
   2310 		}
   2311 
   2312 	}
   2313 	return (LPL_TOPO_CORRECT);
   2314 }
   2315 
   2316 /*
   2317  * Flatten lpl topology to given number of levels.  This is presently only
   2318  * implemented for a flatten to 2 levels, which will prune out the intermediates
   2319  * and home the leaf lpls to the root lpl.
   2320  */
   2321 int
   2322 lpl_topo_flatten(int levels)
   2323 {
   2324 	int		i;
   2325 	uint_t		sum;
   2326 	lgrp_t		*lgrp_cur;
   2327 	lpl_t		*lpl_cur;
   2328 	lpl_t		*lpl_root;
   2329 	cpupart_t	*cp;
   2330 
   2331 	if (levels != 2)
   2332 		return (0);
   2333 
   2334 	/* called w/ cpus paused - grab no locks! */
   2335 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
   2336 	    !lgrp_initialized);
   2337 
   2338 	cp = cp_list_head;
   2339 	do {
   2340 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
   2341 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
   2342 
   2343 		for (i = 0; i <= lgrp_alloc_max; i++) {
   2344 			lgrp_cur = lgrp_table[i];
   2345 			lpl_cur = &cp->cp_lgrploads[i];
   2346 
   2347 			if ((lgrp_cur == lgrp_root) ||
   2348 			    (!LGRP_EXISTS(lgrp_cur) &&
   2349 			    (lpl_cur->lpl_ncpu == 0)))
   2350 				continue;
   2351 
   2352 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
   2353 				/*
   2354 				 * this should be a deleted intermediate, so
   2355 				 * clear it
   2356 				 */
   2357 				lpl_clear(lpl_cur);
   2358 			} else if ((lpl_cur->lpl_nrset == 1) &&
   2359 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
   2360 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
   2361 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
   2362 				/*
   2363 				 * this is a leaf whose parent was deleted, or
   2364 				 * whose parent had their lgrp deleted.  (And
   2365 				 * whose parent will soon be deleted).  Point
   2366 				 * this guy back to the root lpl.
   2367 				 */
   2368 				lpl_cur->lpl_parent = lpl_root;
   2369 				lpl_rset_add(lpl_root, lpl_cur);
   2370 			}
   2371 
   2372 		}
   2373 
   2374 		/*
   2375 		 * Now that we're done, make sure the count on the root lpl is
   2376 		 * correct, and update the hints of the children for the sake of
   2377 		 * thoroughness
   2378 		 */
   2379 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
   2380 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
   2381 		}
   2382 		lpl_root->lpl_ncpu = sum;
   2383 		lpl_child_update(lpl_root, cp);
   2384 
   2385 		cp = cp->cp_next;
   2386 	} while (cp != cp_list_head);
   2387 
   2388 	return (levels);
   2389 }
   2390 
   2391 /*
   2392  * Insert a lpl into the resource hierarchy and create any additional lpls that
   2393  * are necessary to represent the varying states of locality for the cpu
   2394  * resources newly added to the partition.
   2395  *
   2396  * This routine is clever enough that it can correctly add resources from the
   2397  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
   2398  * those for which the lpl is a leaf as opposed to simply a named equally local
   2399  * resource).  The one special case that needs additional processing is when a
   2400  * new intermediate lpl is introduced.  Since the main loop only traverses
   2401  * looking to add the leaf resource where it does not yet exist, additional work
   2402  * is necessary to add other leaf resources that may need to exist in the newly
   2403  * created intermediate.  This is performed by the second inner loop, and is
   2404  * only done when the check for more than one overlapping resource succeeds.
   2405  */
   2406 
   2407 void
   2408 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
   2409 {
   2410 	int		i;
   2411 	int		j;
   2412 	int		rset_num_intersect;
   2413 	lgrp_t		*lgrp_cur;
   2414 	lpl_t		*lpl_cur;
   2415 	lpl_t		*lpl_parent;
   2416 	lgrp_id_t	parent_id;
   2417 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
   2418 
   2419 	for (i = 0; i <= lgrp_alloc_max; i++) {
   2420 		lgrp_cur = lgrp_table[i];
   2421 
   2422 		/*
   2423 		 * Don't insert if the lgrp isn't there, if the leaf isn't
   2424 		 * contained within the current lgrp, or if the current lgrp has
   2425 		 * no leaves in this partition
   2426 		 */
   2427 
   2428 		if (!LGRP_EXISTS(lgrp_cur) ||
   2429 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
   2430 		    lpl_leaf->lpl_lgrpid) ||
   2431 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
   2432 		    cpupart->cp_lgrpset))
   2433 			continue;
   2434 
   2435 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
   2436 		if (lgrp_cur->lgrp_parent != NULL) {
   2437 			/* if lgrp has a parent, assign it properly */
   2438 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
   2439 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
   2440 		} else {
   2441 			/* if not, make sure parent ptr gets set to null */
   2442 			lpl_parent = NULL;
   2443 		}
   2444 
   2445 		if (lpl_cur == lpl_leaf) {
   2446 			/*
   2447 			 * Almost all leaf state was initialized elsewhere.  The
   2448 			 * only thing left to do is to set the parent.
   2449 			 */
   2450 			lpl_cur->lpl_parent = lpl_parent;
   2451 			continue;
   2452 		}
   2453 
   2454 		lpl_clear(lpl_cur);
   2455 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
   2456 
   2457 		lpl_cur->lpl_parent = lpl_parent;
   2458 
   2459 		/* does new lpl need to be populated with other resources? */
   2460 		rset_intersect =
   2461 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
   2462 		    cpupart->cp_lgrpset);
   2463 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
   2464 
   2465 		if (rset_num_intersect > 1) {
   2466 			/*
   2467 			 * If so, figure out what lpls have resources that
   2468 			 * intersect this one, and add them.
   2469 			 */
   2470 			for (j = 0; j <= lgrp_alloc_max; j++) {
   2471 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
   2472 				lpl_t	*lpl_cand;	/* candidate lpl */
   2473 
   2474 				lgrp_cand = lgrp_table[j];
   2475 				if (!LGRP_EXISTS(lgrp_cand) ||
   2476 				    !klgrpset_ismember(rset_intersect,
   2477 				    lgrp_cand->lgrp_id))
   2478 					continue;
   2479 				lpl_cand =
   2480 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
   2481 				lpl_rset_add(lpl_cur, lpl_cand);
   2482 			}
   2483 		}
   2484 		/*
   2485 		 * This lpl's rset has changed. Update the hint in it's
   2486 		 * children.
   2487 		 */
   2488 		lpl_child_update(lpl_cur, cpupart);
   2489 	}
   2490 }
   2491 
   2492 /*
   2493  * remove a lpl from the hierarchy of resources, clearing its state when
   2494  * finished.  If the lpls at the intermediate levels of the hierarchy have no
   2495  * remaining resources, or no longer name a leaf resource in the cpu-partition,
   2496  * delete them as well.
   2497  */
   2498 
   2499 void
   2500 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
   2501 {
   2502 	int		i;
   2503 	lgrp_t		*lgrp_cur;
   2504 	lpl_t		*lpl_cur;
   2505 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
   2506 
   2507 	for (i = 0; i <= lgrp_alloc_max; i++) {
   2508 		lgrp_cur = lgrp_table[i];
   2509 
   2510 		/*
   2511 		 * Don't attempt to remove from lgrps that aren't there, that
   2512 		 * don't contain our leaf, or from the leaf itself. (We do that
   2513 		 * later)
   2514 		 */
   2515 
   2516 		if (!LGRP_EXISTS(lgrp_cur))
   2517 			continue;
   2518 
   2519 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
   2520 
   2521 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
   2522 		    lpl_leaf->lpl_lgrpid) ||
   2523 		    (lpl_cur == lpl_leaf)) {
   2524 			continue;
   2525 		}
   2526 
   2527 		/*
   2528 		 * This is a slightly sleazy simplification in that we have
   2529 		 * already marked the cp_lgrpset as no longer containing the
   2530 		 * leaf we've deleted.  Any lpls that pass the above checks
   2531 		 * based upon lgrp membership but not necessarily cpu-part
   2532 		 * membership also get cleared by the checks below.  Currently
   2533 		 * this is harmless, as the lpls should be empty anyway.
   2534 		 *
   2535 		 * In particular, we want to preserve lpls that have additional
   2536 		 * leaf resources, even though we don't yet have a processor
   2537 		 * architecture that represents resources this way.
   2538 		 */
   2539 
   2540 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
   2541 		    cpupart->cp_lgrpset);
   2542 
   2543 		lpl_rset_del(lpl_cur, lpl_leaf);
   2544 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
   2545 			lpl_clear(lpl_cur);
   2546 		} else {
   2547 			/*
   2548 			 * Update this lpl's children
   2549 			 */
   2550 			lpl_child_update(lpl_cur, cpupart);
   2551 		}
   2552 	}
   2553 	lpl_clear(lpl_leaf);
   2554 }
   2555 
   2556 /*
   2557  * add a cpu to a partition in terms of lgrp load avg bookkeeping
   2558  *
   2559  * The lpl (cpu partition load average information) is now arranged in a
   2560  * hierarchical fashion whereby resources that are closest, ie. most local, to
   2561  * the cpu in question are considered to be leaves in a tree of resources.
   2562  * There are two general cases for cpu addition:
   2563  *
   2564  * 1. A lpl structure that contains resources already in the hierarchy tree.
   2565  * In this case, all of the associated lpl relationships have been defined, and
   2566  * all that is necessary is that we link the new cpu into the per-lpl list of
   2567  * cpus, and increment the ncpu count of all places where this cpu resource will
   2568  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
   2569  * pushing is accomplished by this routine.
   2570  *
   2571  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
   2572  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
   2573  * construct the hierarchy of state necessary to name its more distant
   2574  * resources, if they should exist.  The leaf structure is initialized by this
   2575  * routine, as is the cpu-partition state for the lgrp membership.  This routine
   2576  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
   2577  * and builds all of the "ancestral" state necessary to identify resources at
   2578  * differing levels of locality.
   2579  */
   2580 void
   2581 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
   2582 {
   2583 	cpupart_t	*cpupart;
   2584 	lgrp_t		*lgrp_leaf;
   2585 	lpl_t		*lpl_leaf;
   2586 
   2587 	/* called sometimes w/ cpus paused - grab no locks */
   2588 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
   2589 
   2590 	cpupart = cp->cpu_part;
   2591 	lgrp_leaf = lgrp_table[lgrpid];
   2592 
   2593 	/* don't add non-existent lgrp */
   2594 	ASSERT(LGRP_EXISTS(lgrp_leaf));
   2595 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
   2596 	cp->cpu_lpl = lpl_leaf;
   2597 
   2598 	/* only leaf lpls contain cpus */
   2599 
   2600 	if (lpl_leaf->lpl_ncpu++ == 0) {
   2601 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
   2602 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
   2603 		lpl_leaf_insert(lpl_leaf, cpupart);
   2604 	} else {
   2605 		/*
   2606 		 * the lpl should already exist in the parent, so just update
   2607 		 * the count of available CPUs
   2608 		 */
   2609 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
   2610 	}
   2611 
   2612 	/* link cpu into list of cpus in lpl */
   2613 
   2614 	if (lpl_leaf->lpl_cpus) {
   2615 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
   2616 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
   2617 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
   2618 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
   2619 	} else {
   2620 		/*
   2621 		 * We increment ncpu immediately after we create a new leaf
   2622 		 * lpl, so assert that ncpu == 1 for the case where we don't
   2623 		 * have any cpu pointers yet.
   2624 		 */
   2625 		ASSERT(lpl_leaf->lpl_ncpu == 1);
   2626 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
   2627 	}
   2628 
   2629 }
   2630 
   2631 
   2632 /*
   2633  * remove a cpu from a partition in terms of lgrp load avg bookkeeping
   2634  *
   2635  * The lpl (cpu partition load average information) is now arranged in a
   2636  * hierarchical fashion whereby resources that are closest, ie. most local, to
   2637  * the cpu in question are considered to be leaves in a tree of resources.
   2638  * There are two removal cases in question:
   2639  *
   2640  * 1. Removal of the resource in the leaf leaves other resources remaining in
   2641  * that leaf.  (Another cpu still exists at this level of locality).  In this
   2642  * case, the count of available cpus is decremented in all associated lpls by
   2643  * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
   2644  * from the per-cpu lpl list.
   2645  *
   2646  * 2. Removal of the resource results in the lpl containing no resources.  (It's
   2647  * empty)  In this case, all of what has occurred for the first step must take
   2648  * place; however, additionally we must remove the lpl structure itself, prune
   2649  * out any stranded lpls that do not directly name a leaf resource, and mark the
   2650  * cpu partition in question as no longer containing resources from the lgrp of
   2651  * the lpl that has been deleted.  Cpu-partition changes are handled by this
   2652  * method, but the lpl_leaf_remove function deals with the details of pruning
   2653  * out the empty lpl and any of its orphaned direct ancestors.
   2654  */
   2655 void
   2656 lgrp_part_del_cpu(cpu_t *cp)
   2657 {
   2658 	lpl_t		*lpl;
   2659 	lpl_t		*leaf_lpl;
   2660 	lgrp_t		*lgrp_leaf;
   2661 
   2662 	/* called sometimes w/ cpus paused - grab no locks */
   2663 
   2664 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
   2665 
   2666 	lpl = leaf_lpl = cp->cpu_lpl;
   2667 	lgrp_leaf = leaf_lpl->lpl_lgrp;
   2668 
   2669 	/* don't delete a leaf that isn't there */
   2670 	ASSERT(LGRP_EXISTS(lgrp_leaf));
   2671 
   2672 	/* no double-deletes */
   2673 	ASSERT(lpl->lpl_ncpu);
   2674 	if (--lpl->lpl_ncpu == 0) {
   2675 		/*
   2676 		 * This was the last cpu in this lgroup for this partition,
   2677 		 * clear its bit in the partition's lgroup bitmask
   2678 		 */
   2679 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
   2680 
   2681 		/* eliminate remaning lpl link pointers in cpu, lpl */
   2682 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
   2683 
   2684 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
   2685 	} else {
   2686 
   2687 		/* unlink cpu from lists of cpus in lpl */
   2688 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
   2689 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
   2690 		if (lpl->lpl_cpus == cp) {
   2691 			lpl->lpl_cpus = cp->cpu_next_lpl;
   2692 		}
   2693 
   2694 		/*
   2695 		 * Update the cpu count in the lpls associated with parent
   2696 		 * lgroups.
   2697 		 */
   2698 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
   2699 
   2700 	}
   2701 	/* clear cpu's lpl ptr when we're all done */
   2702 	cp->cpu_lpl = NULL;
   2703 }
   2704 
   2705 /*
   2706  * Recompute load average for the specified partition/lgrp fragment.
   2707  *
   2708  * We rely on the fact that this routine is called from the clock thread
   2709  * at a point before the clock thread can block (i.e. before its first
   2710  * lock request).  Since the clock thread can not be preempted (since it
   2711  * runs at highest priority), we know that cpu partitions can not change
   2712  * (since doing so would require either the repartition requester or the
   2713  * cpu_pause thread to run on this cpu), so we can update the cpu's load
   2714  * without grabbing cpu_lock.
   2715  */
   2716 void
   2717 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
   2718 {
   2719 	uint_t		ncpu;
   2720 	int64_t		old, new, f;
   2721 
   2722 	/*
   2723 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
   2724 	 */
   2725 	static short expval[] = {
   2726 	    0, 3196, 1618, 1083,
   2727 	    814, 652, 543, 466,
   2728 	    408, 363, 326, 297,
   2729 	    272, 251, 233, 218,
   2730 	    204, 192, 181, 172,
   2731 	    163, 155, 148, 142,
   2732 	    136, 130, 125, 121,
   2733 	    116, 112, 109, 105
   2734 	};
   2735 
   2736 	/* ASSERT (called from clock level) */
   2737 
   2738 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
   2739 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
   2740 		return;
   2741 	}
   2742 
   2743 	for (;;) {
   2744 
   2745 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
   2746 			f = expval[1]/ncpu; /* good approx. for large ncpu */
   2747 		else
   2748 			f = expval[ncpu];
   2749 
   2750 		/*
   2751 		 * Modify the load average atomically to avoid losing
   2752 		 * anticipatory load updates (see lgrp_move_thread()).
   2753 		 */
   2754 		if (ageflag) {
   2755 			/*
   2756 			 * We're supposed to both update and age the load.
   2757 			 * This happens 10 times/sec. per cpu.  We do a
   2758 			 * little hoop-jumping to avoid integer overflow.
   2759 			 */
   2760 			int64_t		q, r;
   2761 
   2762 			do {
   2763 				old = new = lpl->lpl_loadavg;
   2764 				q = (old  >> 16) << 7;
   2765 				r = (old  & 0xffff) << 7;
   2766 				new += ((long long)(nrcpus - q) * f -
   2767 				    ((r * f) >> 16)) >> 7;
   2768 
   2769 				/*
   2770 				 * Check for overflow
   2771 				 */
   2772 				if (new > LGRP_LOADAVG_MAX)
   2773 					new = LGRP_LOADAVG_MAX;
   2774 				else if (new < 0)
   2775 					new = 0;
   2776 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
   2777 			    new) != old);
   2778 		} else {
   2779 			/*
   2780 			 * We're supposed to update the load, but not age it.
   2781 			 * This option is used to update the load (which either
   2782 			 * has already been aged in this 1/10 sec. interval or
   2783 			 * soon will be) to account for a remotely executing
   2784 			 * thread.
   2785 			 */
   2786 			do {
   2787 				old = new = lpl->lpl_loadavg;
   2788 				new += f;
   2789 				/*
   2790 				 * Check for overflow
   2791 				 * Underflow not possible here
   2792 				 */
   2793 				if (new < old)
   2794 					new = LGRP_LOADAVG_MAX;
   2795 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
   2796 			    new) != old);
   2797 		}
   2798 
   2799 		/*
   2800 		 * Do the same for this lpl's parent
   2801 		 */
   2802 		if ((lpl = lpl->lpl_parent) == NULL)
   2803 			break;
   2804 		ncpu = lpl->lpl_ncpu;
   2805 	}
   2806 }
   2807 
   2808 /*
   2809  * Initialize lpl topology in the target based on topology currently present in
   2810  * lpl_bootstrap.
   2811  *
   2812  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
   2813  * initialize cp_default list of lpls. Up to this point all topology operations
   2814  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
   2815  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
   2816  * `target' points to the list of lpls in cp_default and `size' is the size of
   2817  * this list.
   2818  *
   2819  * This function walks the lpl topology in lpl_bootstrap and does for things:
   2820  *
   2821  * 1) Copies all fields from lpl_bootstrap to the target.
   2822  *
   2823  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
   2824  *
   2825  * 3) Updates lpl_parent pointers to point to the lpls in the target list
   2826  *    instead of lpl_bootstrap.
   2827  *
   2828  * 4) Updates pointers in the resource list of the target to point to the lpls
   2829  *    in the target list instead of lpl_bootstrap.
   2830  *
   2831  * After lpl_topo_bootstrap() completes, target contains the same information
   2832  * that would be present there if it were used during boot instead of
   2833  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
   2834  * and it is bzeroed.
   2835  */
   2836 void
   2837 lpl_topo_bootstrap(lpl_t *target, int size)
   2838 {
   2839 	lpl_t	*lpl = lpl_bootstrap;
   2840 	lpl_t	*target_lpl = target;
   2841 	lpl_t	**rset;
   2842 	int	*id2rset;
   2843 	int	sz;
   2844 	int	howmany;
   2845 	int	id;
   2846 	int	i;
   2847 
   2848 	/*
   2849 	 * The only target that should be passed here is cp_default lpl list.
   2850 	 */
   2851 	ASSERT(target == cp_default.cp_lgrploads);
   2852 	ASSERT(size == cp_default.cp_nlgrploads);
   2853 	ASSERT(!lgrp_topo_initialized);
   2854 	ASSERT(ncpus == 1);
   2855 
   2856 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
   2857 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
   2858 		/*
   2859 		 * Copy all fields from lpl, except for the rset,
   2860 		 * lgrp id <=> rset mapping storage,
   2861 		 * and amount of storage
   2862 		 */
   2863 		rset = target_lpl->lpl_rset;
   2864 		id2rset = target_lpl->lpl_id2rset;
   2865 		sz = target_lpl->lpl_rset_sz;
   2866 
   2867 		*target_lpl = *lpl;
   2868 
   2869 		target_lpl->lpl_rset_sz = sz;
   2870 		target_lpl->lpl_rset = rset;
   2871 		target_lpl->lpl_id2rset = id2rset;
   2872 
   2873 		/*
   2874 		 * Substitute CPU0 lpl pointer with one relative to target.
   2875 		 */
   2876 		if (lpl->lpl_cpus == CPU) {
   2877 			ASSERT(CPU->cpu_lpl == lpl);
   2878 			CPU->cpu_lpl = target_lpl;
   2879 		}
   2880 
   2881 		/*
   2882 		 * Substitute parent information with parent relative to target.
   2883 		 */
   2884 		if (lpl->lpl_parent != NULL)
   2885 			target_lpl->lpl_parent = (lpl_t *)
   2886 			    (((uintptr_t)lpl->lpl_parent -
   2887 			    (uintptr_t)lpl_bootstrap) +
   2888 			    (uintptr_t)target);
   2889 
   2890 		/*
   2891 		 * Walk over resource set substituting pointers relative to
   2892 		 * lpl_bootstrap's rset to pointers relative to target's
   2893 		 */
   2894 		ASSERT(lpl->lpl_nrset <= 1);
   2895 
   2896 		for (id = 0; id < lpl->lpl_nrset; id++) {
   2897 			if (lpl->lpl_rset[id] != NULL) {
   2898 				target_lpl->lpl_rset[id] = (lpl_t *)
   2899 				    (((uintptr_t)lpl->lpl_rset[id] -
   2900 				    (uintptr_t)lpl_bootstrap) +
   2901 				    (uintptr_t)target);
   2902 			}
   2903 			target_lpl->lpl_id2rset[id] =
   2904 			    lpl->lpl_id2rset[id];
   2905 		}
   2906 	}
   2907 
   2908 	/*
   2909 	 * Clean up the bootstrap lpls since we have switched over to the
   2910 	 * actual lpl array in the default cpu partition.
   2911 	 *
   2912 	 * We still need to keep one empty lpl around for newly starting
   2913 	 * slave CPUs to reference should they need to make it through the
   2914 	 * dispatcher prior to their lgrp/lpl initialization.
   2915 	 *
   2916 	 * The lpl related dispatcher code has been designed to work properly
   2917 	 * (and without extra checks) for this special case of a zero'ed
   2918 	 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
   2919 	 * with lgrpid 0 and an empty resource set. Iteration over the rset
   2920 	 * array by the dispatcher is also NULL terminated for this reason.
   2921 	 *
   2922 	 * This provides the desired behaviour for an uninitialized CPU.
   2923 	 * It shouldn't see any other CPU to either dispatch to or steal
   2924 	 * from until it is properly initialized.
   2925 	 */
   2926 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
   2927 	bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
   2928 	bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));
   2929 
   2930 	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
   2931 	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
   2932 }
   2933 
/*
 * If the lowest load among the lgroups a process' threads are currently
 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
 * expanding the process to a new lgroup.
 *
 * lgrp_expand_proc_thresh is the tunable; LGRP_EXPAND_PROC_THRESH(ncpu)
 * is the value actually compared in lgrp_choose(), i.e. the tunable divided
 * by the number of cpus in the candidate lpl.  `ncpu' must be non-zero.
 */
#define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;

#define	LGRP_EXPAND_PROC_THRESH(ncpu) \
	((lgrp_expand_proc_thresh) / (ncpu))
   2944 
/*
 * A process will be expanded to a new lgroup only if the difference between
 * the lowest load on the lgroups the process' threads are currently spread
 * across and the lowest load on the other lgroups in the process' partition
 * is greater than lgrp_expand_proc_diff.
 *
 * lgrp_expand_proc_diff is the tunable; LGRP_EXPAND_PROC_DIFF(ncpu) is the
 * value actually used in lgrp_choose(), i.e. the tunable divided by the
 * number of cpus in the candidate lpl.  `ncpu' must be non-zero.
 */
#define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;

#define	LGRP_EXPAND_PROC_DIFF(ncpu) \
	((lgrp_expand_proc_diff) / (ncpu))
   2956 
   2957 /*
   2958  * The loadavg tolerance accounts for "noise" inherent in the load, which may
   2959  * be present due to impreciseness of the load average decay algorithm.
   2960  *
   2961  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
   2962  * tolerance is scaled by the number of cpus in the lgroup just like
   2963  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
   2964  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
   2965  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
   2966  */
   2967 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
   2968 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
   2969 	((lgrp_loadavg_tolerance) / ncpu)
   2970 
/*
 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
 * average is above this threshold.  Leaving it at UINT32_MAX turns the
 * check off (lgrp_choose() tests for that value explicitly).
 */
uint32_t	lgrp_load_thresh = UINT32_MAX;
   2976 
/*
 * lgrp_choose() will try to skip any lgroups with less memory
 * than this free when choosing a home lgroup.  The value is compared with
 * the result of lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 0 means that no
 * lgroup is skipped for lack of free memory.
 */
pgcnt_t	lgrp_mem_free_thresh = 0;
   2982 
/*
 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 * one based on one of the following policies:
 * - Random selection
 * - Pseudo round robin placement
 * - Longest time since a thread was last placed
 *
 * In lgrp_choose() the round robin policy starts the search at the lgroup
 * following the partition's cp_lgrp_hint; every other value (including
 * values not listed here) starts at a member lgroup picked from gethrtime().
 * LGRP_CHOOSE_TIME additionally lets lpl_pick() prefer the lpl with the
 * older lpl_homed_time when two loads are within the tolerance.
 */
#define	LGRP_CHOOSE_RANDOM	1
#define	LGRP_CHOOSE_RR		2
#define	LGRP_CHOOSE_TIME	3

int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
   2995 
   2996 /*
   2997  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
   2998  * be bound to a CPU or processor set.
   2999  *
   3000  * Arguments:
   3001  *	t		The thread
   3002  *	cpupart		The partition the thread belongs to.
   3003  *
   3004  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
   3005  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
   3006  *	 partitions changing out from under us and assumes that given thread is
   3007  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
   3008  *	 disabled, so don't grab any locks because we should never block under
   3009  *	 those conditions.
   3010  */
   3011 lpl_t *
   3012 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
   3013 {
   3014 	lgrp_load_t	bestload, bestrload;
   3015 	int		lgrpid_offset, lgrp_count;
   3016 	lgrp_id_t	lgrpid, lgrpid_start;
   3017 	lpl_t		*lpl, *bestlpl, *bestrlpl;
   3018 	klgrpset_t	lgrpset;
   3019 	proc_t		*p;
   3020 
   3021 	ASSERT(t != NULL);
   3022 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
   3023 	    THREAD_LOCK_HELD(t));
   3024 	ASSERT(cpupart != NULL);
   3025 
   3026 	p = t->t_procp;
   3027 
   3028 	/* A process should always be in an active partition */
   3029 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
   3030 
   3031 	bestlpl = bestrlpl = NULL;
   3032 	bestload = bestrload = LGRP_LOADAVG_MAX;
   3033 	lgrpset = cpupart->cp_lgrpset;
   3034 
   3035 	switch (lgrp_choose_policy) {
   3036 	case LGRP_CHOOSE_RR:
   3037 		lgrpid = cpupart->cp_lgrp_hint;
   3038 		do {
   3039 			if (++lgrpid > lgrp_alloc_max)
   3040 				lgrpid = 0;
   3041 		} while (!klgrpset_ismember(lgrpset, lgrpid));
   3042 
   3043 		break;
   3044 	default:
   3045 	case LGRP_CHOOSE_TIME:
   3046 	case LGRP_CHOOSE_RANDOM:
   3047 		klgrpset_nlgrps(lgrpset, lgrp_count);
   3048 		lgrpid_offset =
   3049 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
   3050 		for (lgrpid = 0; ; lgrpid++) {
   3051 			if (klgrpset_ismember(lgrpset, lgrpid)) {
   3052 				if (--lgrpid_offset == 0)
   3053 					break;
   3054 			}
   3055 		}
   3056 		break;
   3057 	}
   3058 
   3059 	lgrpid_start = lgrpid;
   3060 
   3061 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
   3062 	    lgrp_id_t, cpupart->cp_lgrp_hint);
   3063 
   3064 	/*
   3065 	 * Use lgroup affinities (if any) to choose best lgroup
   3066 	 *
   3067 	 * NOTE: Assumes that thread is protected from going away and its
   3068 	 *	 lgroup affinities won't change (ie. p_lock, or
   3069 	 *	 thread_lock() being held and/or CPUs paused)
   3070 	 */
   3071 	if (t->t_lgrp_affinity) {
   3072 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
   3073 		if (lpl != NULL)
   3074 			return (lpl);
   3075 	}
   3076 
   3077 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
   3078 
   3079 	do {
   3080 		pgcnt_t	npgs;
   3081 
   3082 		/*
   3083 		 * Skip any lgroups outside of thread's pset
   3084 		 */
   3085 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
   3086 			if (++lgrpid > lgrp_alloc_max)
   3087 				lgrpid = 0;	/* wrap the search */
   3088 			continue;
   3089 		}
   3090 
   3091 		/*
   3092 		 * Skip any non-leaf lgroups
   3093 		 */
   3094 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
   3095 			continue;
   3096 
   3097 		/*
   3098 		 * Skip any lgroups without enough free memory
   3099 		 * (when threshold set to nonzero positive value)
   3100 		 */
   3101 		if (lgrp_mem_free_thresh > 0) {
   3102 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
   3103 			if (npgs < lgrp_mem_free_thresh) {
   3104 				if (++lgrpid > lgrp_alloc_max)
   3105 					lgrpid = 0;	/* wrap the search */
   3106 				continue;
   3107 			}
   3108 		}
   3109 
   3110 		lpl = &cpupart->cp_lgrploads[lgrpid];
   3111 		if (klgrpset_isempty(p->p_lgrpset) ||
   3112 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
   3113 			/*
   3114 			 * Either this is a new process or the process already
   3115 			 * has threads on this lgrp, so this is a preferred
   3116 			 * lgroup for the thread.
   3117 			 */
   3118 			if (bestlpl == NULL ||
   3119 			    lpl_pick(lpl, bestlpl)) {
   3120 				bestload = lpl->lpl_loadavg;
   3121 				bestlpl = lpl;
   3122 			}
   3123 		} else {
   3124 			/*
   3125 			 * The process doesn't have any threads on this lgrp,
   3126 			 * but we're willing to consider this lgrp if the load
   3127 			 * difference is big enough to justify splitting up
   3128 			 * the process' threads.
   3129 			 */
   3130 			if (bestrlpl == NULL ||
   3131 			    lpl_pick(lpl, bestrlpl)) {
   3132 				bestrload = lpl->lpl_loadavg;
   3133 				bestrlpl = lpl;
   3134 			}
   3135 		}
   3136 		if (++lgrpid > lgrp_alloc_max)
   3137 			lgrpid = 0;	/* wrap the search */
   3138 	} while (lgrpid != lgrpid_start);
   3139 
   3140 	/*
   3141 	 * Return root lgroup if threshold isn't set to maximum value and
   3142 	 * lowest lgroup load average more than a certain threshold
   3143 	 */
   3144 	if (lgrp_load_thresh != UINT32_MAX &&
   3145 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
   3146 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
   3147 
   3148 	/*
   3149 	 * If all the lgroups over which the thread's process is spread are
   3150 	 * heavily loaded, or otherwise undesirable, we'll consider placing
   3151 	 * the thread on one of the other leaf lgroups in the thread's
   3152 	 * partition.
   3153 	 */
   3154 	if ((bestlpl == NULL) ||
   3155 	    ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
   3156 	    (bestrload < bestload) &&	/* paranoid about wraparound */
   3157 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
   3158 	    bestload))) {
   3159 		bestlpl = bestrlpl;
   3160 	}
   3161 
   3162 	if (bestlpl == NULL) {
   3163 		/*
   3164 		 * No lgroup looked particularly good, but we still
   3165 		 * have to pick something. Go with the randomly selected
   3166 		 * legal lgroup we started with above.
   3167 		 */
   3168 		bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
   3169 	}
   3170 
   3171 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
   3172 	bestlpl->lpl_homed_time = gethrtime_unscaled();
   3173 
   3174 	ASSERT(bestlpl->lpl_ncpu > 0);
   3175 	return (bestlpl);
   3176 }
   3177 
   3178 /*
   3179  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
   3180  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
   3181  */
   3182 static int
   3183 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
   3184 {
   3185 	lgrp_load_t	l1, l2;
   3186 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
   3187 
   3188 	l1 = lpl1->lpl_loadavg;
   3189 	l2 = lpl2->lpl_loadavg;
   3190 
   3191 	if ((l1 + tolerance < l2) && (l1 < l2)) {
   3192 		/* lpl1 is significantly less loaded than lpl2 */
   3193 		return (1);
   3194 	}
   3195 
   3196 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
   3197 	    l1 + tolerance >= l2 && l1 < l2 &&
   3198 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
   3199 		/*
   3200 		 * lpl1's load is within the tolerance of lpl2. We're
   3201 		 * willing to consider it be to better however if
   3202 		 * it has been longer since we last homed a thread there
   3203 		 */
   3204 		return (1);
   3205 	}
   3206 
   3207 	return (0);
   3208 }
   3209 
   3210 /*
   3211  * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a
   3212  * process that uses text replication changed home lgrp. This info is used by
   3213  * segvn asyncronous thread to detect if it needs to recheck what lgrps
   3214  * should be used for text replication.
   3215  */
   3216 static uint64_t lgrp_trthr_moves = 0;
   3217 
   3218 uint64_t
   3219 lgrp_get_trthr_migrations(void)
   3220 {
   3221 	return (lgrp_trthr_moves);
   3222 }
   3223 
   3224 void
   3225 lgrp_update_trthr_migrations(uint64_t incr)
   3226 {
   3227 	atomic_add_64(&lgrp_trthr_moves, incr);
   3228 }
   3229 
/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely huge (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 *
 * lgrp_move_thread() adds lgrp_min_nsec to the thread's t_anttime and
 * compares the sum with gethrtime(); a thread that leaves its lgroup before
 * that time has its anticipatory load removed again.
 */
#define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
   3238 
   3239 /*
   3240  * Routine to change a thread's lgroup affiliation.  This routine updates
   3241  * the thread's kthread_t struct and its process' proc_t struct to note the
   3242  * thread's new lgroup affiliation, and its lgroup affinities.
   3243  *
   3244  * Note that this is the only routine that modifies a thread's t_lpl field,
   3245  * and that adds in or removes anticipatory load.
   3246  *
   3247  * If the thread is exiting, newlpl is NULL.
   3248  *
   3249  * Locking:
   3250  * The following lock must be held on entry:
   3251  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
   3252  *		doesn't get removed from t's partition
   3253  *
   3254  * This routine is not allowed to grab any locks, since it may be called
   3255  * with cpus paused (such as from cpu_offline).
   3256  */
   3257 void
   3258 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
   3259 {
   3260 	proc_t		*p;
   3261 	lpl_t		*lpl, *oldlpl;
   3262 	lgrp_id_t	oldid;
   3263 	kthread_t	*tp;
   3264 	uint_t		ncpu;
   3265 	lgrp_load_t	old, new;
   3266 
   3267 	ASSERT(t);
   3268 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
   3269 	    THREAD_LOCK_HELD(t));
   3270 
   3271 	/*
   3272 	 * If not changing lpls, just return
   3273 	 */
   3274 	if ((oldlpl = t->t_lpl) == newlpl)
   3275 		return;
   3276 
   3277 	/*
   3278 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
   3279 	 * associated with process 0 rather than with its original process).
   3280 	 */
   3281 	if (t->t_proc_flag & TP_LWPEXIT) {
   3282 		if (newlpl != NULL) {
   3283 			t->t_lpl = newlpl;
   3284 		}
   3285 		return;
   3286 	}
   3287 
   3288 	p = ttoproc(t);
   3289 
   3290 	/*
   3291 	 * If the thread had a previous lgroup, update its process' p_lgrpset
   3292 	 * to account for it being moved from its old lgroup.
   3293 	 */
   3294 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
   3295 	    (p->p_tlist != NULL)) {
   3296 		oldid = oldlpl->lpl_lgrpid;
   3297 
   3298 		if (newlpl != NULL)
   3299 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
   3300 
   3301 		if ((do_lgrpset_delete) &&
   3302 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
   3303 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
   3304 				/*
   3305 				 * Check if a thread other than the thread
   3306 				 * that's moving is assigned to the same
   3307 				 * lgroup as the thread that's moving.  Note
   3308 				 * that we have to compare lgroup IDs, rather
   3309 				 * than simply comparing t_lpl's, since the
   3310 				 * threads may belong to different partitions
   3311 				 * but be assigned to the same lgroup.
   3312 				 */
   3313 				ASSERT(tp->t_lpl != NULL);
   3314 
   3315 				if ((tp != t) &&
   3316 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
   3317 					/*
   3318 					 * Another thread is assigned to the
   3319 					 * same lgroup as the thread that's
   3320 					 * moving, p_lgrpset doesn't change.
   3321 					 */
   3322 					break;
   3323 				} else if (tp == p->p_tlist) {
   3324 					/*
   3325 					 * No other thread is assigned to the
   3326 					 * same lgroup as the exiting thread,
   3327 					 * clear the lgroup's bit in p_lgrpset.
   3328 					 */
   3329 					klgrpset_del(p->p_lgrpset, oldid);
   3330 					break;
   3331 				}
   3332 			}
   3333 		}
   3334 
   3335 		/*
   3336 		 * If this thread was assigned to its old lgroup for such a
   3337 		 * short amount of time that the anticipatory load that was
   3338 		 * added on its behalf has aged very little, remove that
   3339 		 * anticipatory load.
   3340 		 */
   3341 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
   3342 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
   3343 			lpl = oldlpl;
   3344 			for (;;) {
   3345 				do {
   3346 					old = new = lpl->lpl_loadavg;
   3347 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
   3348 					if (new > old) {
   3349 						/*
   3350 						 * this can happen if the load
   3351 						 * average was aged since we
   3352 						 * added in the anticipatory
   3353 						 * load
   3354 						 */
   3355 						new = 0;
   3356 					}
   3357 				} while (cas32(
   3358 				    (lgrp_load_t *)&lpl->lpl_loadavg, old,
   3359 				    new) != old);
   3360 
   3361 				lpl = lpl->lpl_parent;
   3362 				if (lpl == NULL)
   3363 					break;
   3364 
   3365 				ncpu = lpl->lpl_ncpu;
   3366 				ASSERT(ncpu > 0);
   3367 			}
   3368 		}
   3369 	}
   3370 	/*
   3371 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
   3372 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
   3373 	 * to its new lgroup to account for its move to its new lgroup.
   3374 	 */
   3375 	if (newlpl != NULL) {
   3376 		/*
   3377 		 * This thread is moving to a new lgroup
   3378 		 */
   3379 		t->t_lpl = newlpl;
   3380 		if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
   3381 			p->p_t1_lgrpid = newlpl->lpl_lgrpid;
   3382 			membar_producer();
   3383 			if (p->p_tr_lgrpid != LGRP_NONE &&
   3384 			    p->p_tr_lgrpid != p->p_t1_lgrpid) {
   3385 				lgrp_update_trthr_migrations(1);
   3386 			}
   3387 		}
   3388 
   3389 		/*
   3390 		 * Reflect move in load average of new lgroup
   3391 		 * unless it is root lgroup
   3392 		 */
   3393 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
   3394 			return;
   3395 
   3396 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
   3397 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
   3398 		}
   3399 
   3400 		/*
   3401 		 * It'll take some time for the load on the new lgroup
   3402 		 * to reflect this thread's placement on it.  We'd
   3403 		 * like not, however, to have all threads between now
   3404 		 * and then also piling on to this lgroup.  To avoid
   3405 		 * this pileup, we anticipate the load this thread
   3406 		 * will generate on its new lgroup.  The goal is to
   3407 		 * make the lgroup's load appear as though the thread
   3408 		 * had been there all along.  We're very conservative
   3409 		 * in calculating this anticipatory load, we assume
   3410 		 * the worst case case (100% CPU-bound thread).  This
   3411 		 * may be modified in the future to be more accurate.
   3412 		 */
   3413 		lpl = newlpl;
   3414 		for (;;) {
   3415 			ncpu = lpl->lpl_ncpu;
   3416 			ASSERT(ncpu > 0);
   3417 			do {
   3418 				old = new = lpl->lpl_loadavg;
   3419 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
   3420 				/*
   3421 				 * Check for overflow
   3422 				 * Underflow not possible here
   3423 				 */
   3424 				if (new < old)
   3425 					new = UINT32_MAX;
   3426 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
   3427 			    new) != old);
   3428 
   3429 			lpl = lpl->lpl_parent;
   3430 			if (lpl == NULL)
   3431 				break;
   3432 		}
   3433 		t->t_anttime = gethrtime();
   3434 	}
   3435 }
   3436 
   3437 /*
   3438  * Return lgroup memory allocation policy given advice from madvise(3C)
   3439  */
   3440 lgrp_mem_policy_t
   3441 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
   3442 {
   3443 	switch (advice) {
   3444 	case MADV_ACCESS_LWP:
   3445 		return (LGRP_MEM_POLICY_NEXT);
   3446 	case MADV_ACCESS_MANY:
   3447 		return (LGRP_MEM_POLICY_RANDOM);
   3448 	default:
   3449 		return (lgrp_mem_policy_default(size, type));
   3450 	}
   3451 }
   3452 
   3453 /*
   3454  * Figure out default policy
   3455  */
   3456 lgrp_mem_policy_t
   3457 lgrp_mem_policy_default(size_t size, int type)
   3458 {
   3459 	cpupart_t		*cp;
   3460 	lgrp_mem_policy_t	policy;
   3461 	size_t			pset_mem_size;
   3462 
   3463 	/*
   3464 	 * Randomly allocate memory across lgroups for shared memory
   3465 	 * beyond a certain threshold
   3466 	 */
   3467 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
   3468 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
   3469 		/*
   3470 		 * Get total memory size of current thread's pset
   3471 		 */
   3472 		kpreempt_disable();
   3473 		cp = curthread->t_cpupart;
   3474 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
   3475 		kpreempt_enable();
   3476 
   3477 		/*
   3478 		 * Choose policy to randomly allocate memory across
   3479 		 * lgroups in pset if it will fit and is not default
   3480 		 * partition.  Otherwise, allocate memory randomly
   3481 		 * across machine.
   3482 		 */
   3483 		if (lgrp_mem_pset_aware && size < pset_mem_size)
   3484 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
   3485 		else
   3486 			policy = LGRP_MEM_POLICY_RANDOM;
   3487 	} else
   3488 		/*
   3489 		 * Apply default policy for private memory and
   3490 		 * shared memory under the respective random
   3491 		 * threshold.
   3492 		 */
   3493 		policy = lgrp_mem_default_policy;
   3494 
   3495 	return (policy);
   3496 }
   3497 
   3498 /*
   3499  * Get memory allocation policy for this segment
   3500  */
   3501 lgrp_mem_policy_info_t *
   3502 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
   3503 {
   3504 	lgrp_mem_policy_info_t	*policy_info;
   3505 	extern struct seg_ops	segspt_ops;
   3506 	extern struct seg_ops	segspt_shmops;
   3507 
   3508 	/*
   3509 	 * This is for binary compatibility to protect against third party
   3510 	 * segment drivers which haven't recompiled to allow for
   3511 	 * SEGOP_GETPOLICY()
   3512 	 */
   3513 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
   3514 	    seg->s_ops != &segspt_shmops)
   3515 		return (NULL);
   3516 
   3517 	policy_info = NULL;
   3518 	if (seg->s_ops->getpolicy != NULL)
   3519 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
   3520 
   3521 	return (policy_info);
   3522 }
   3523 
   3524 /*
   3525  * Set policy for allocating private memory given desired policy, policy info,
   3526  * size in bytes of memory that policy is being applied.
   3527  * Return 0 if policy wasn't set already and 1 if policy was set already
   3528  */
   3529 int
   3530 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
   3531     lgrp_mem_policy_info_t *policy_info, size_t size)
   3532 {
   3533 
   3534 	ASSERT(policy_info != NULL);
   3535 
   3536 	if (policy == LGRP_MEM_POLICY_DEFAULT)
   3537 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
   3538 
   3539 	/*
   3540 	 * Policy set already?
   3541 	 */
   3542 	if (policy == policy_info->mem_policy)
   3543 		return (1);
   3544 
   3545 	/*
   3546 	 * Set policy
   3547 	 */
   3548 	policy_info->mem_policy = policy;
   3549 	policy_info->mem_lgrpid = LGRP_NONE;
   3550 
   3551 	return (0);
   3552 }
   3553 
   3554 
   3555 /*
   3556  * Get shared memory allocation policy with given tree and offset
   3557  */
   3558 lgrp_mem_policy_info_t *
   3559 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
   3560     u_offset_t vn_off)
   3561 {
   3562 	u_offset_t		off;
   3563 	lgrp_mem_policy_info_t	*policy_info;
   3564 	lgrp_shm_policy_seg_t	*policy_seg;
   3565 	lgrp_shm_locality_t	*shm_locality;
   3566 	avl_tree_t		*tree;
   3567 	avl_index_t		where;
   3568 
   3569 	/*
   3570 	 * Get policy segment tree from anon_map or vnode and use specified
   3571 	 * anon index or vnode offset as offset
   3572 	 *
   3573 	 * Assume that no lock needs to be held on anon_map or vnode, since
   3574 	 * they should be protected by their reference count which must be
   3575 	 * nonzero for an existing segment
   3576 	 */
   3577 	if (amp) {
   3578 		ASSERT(amp->refcnt != 0);
   3579 		shm_locality = amp->locality;
   3580 		if (shm_locality == NULL)
   3581 			return (NULL);
   3582 		tree = shm_locality->loc_tree;
   3583 		off = ptob(anon_index);
   3584 	} else if (vp) {
   3585 		shm_locality = vp->v_locality;
   3586 		if (shm_locality == NULL)
   3587 			return (NULL);
   3588 		ASSERT(shm_locality->loc_count != 0);
   3589 		tree = shm_locality->loc_tree;
   3590 		off = vn_off;
   3591 	}
   3592 
   3593 	if (tree == NULL)
   3594 		return (NULL);
   3595 
   3596 	/*
   3597 	 * Lookup policy segment for offset into shared object and return
   3598 	 * policy info
   3599 	 */
   3600 	rw_enter(&shm_locality->loc_lock, RW_READER);
   3601 	policy_info = NULL;
   3602 	policy_seg = avl_find(tree, &off, &where);
   3603 	if (policy_seg)
   3604 		policy_info = &policy_seg->shm_policy;
   3605 	rw_exit(&shm_locality->loc_lock);
   3606 
   3607 	return (policy_info);
   3608 }
   3609 
/*
 * Default memory allocation policy for kernel segmap pages.
 * lgrp_mem_choose() starts from this policy when the segment it is given
 * is segkmap.
 */
lgrp_mem_policy_t	lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
   3614 
   3615 /*
   3616  * Return lgroup to use for allocating memory
   3617  * given the segment and address
   3618  *
   3619  * There isn't any mutual exclusion that exists between calls
   3620  * to this routine and DR, so this routine and whomever calls it
   3621  * should be mindful of the possibility that the lgrp returned
   3622  * may be deleted. If this happens, dereferences of the lgrp
   3623  * pointer will still be safe, but the resources in the lgrp will
   3624  * be gone, and LGRP_EXISTS() will no longer be true.
   3625  */
   3626 lgrp_t *
   3627 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
   3628 {
   3629 	int			i;
   3630 	lgrp_t			*lgrp;
   3631 	klgrpset_t		lgrpset;
   3632 	int			lgrps_spanned;
   3633 	unsigned long		off;
   3634 	lgrp_mem_policy_t	policy;
   3635 	lgrp_mem_policy_info_t	*policy_info;
   3636 	ushort_t		random;
   3637 	int			stat = 0;
   3638 	extern struct seg	*segkmap;
   3639 
   3640 	/*
   3641 	 * Just return null if the lgrp framework hasn't finished
   3642 	 * initializing or if this is a UMA machine.
   3643 	 */
   3644 	if (nlgrps == 1 || !lgrp_initialized)
   3645 		return (lgrp_root);
   3646 
   3647 	/*
   3648 	 * Get memory allocation policy for this segment
   3649 	 */
   3650 	policy = lgrp_mem_default_policy;
   3651 	if (seg != NULL) {
   3652 		if (seg->s_as == &kas) {
   3653 			if (seg == segkmap)
   3654 				policy = lgrp_segmap_default_policy;
   3655 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
   3656 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
   3657 				policy = LGRP_MEM_POLICY_RANDOM;
   3658 		} else {
   3659 			policy_info = lgrp_mem_policy_get(seg, vaddr);
   3660 			if (policy_info != NULL) {
   3661 				policy = policy_info->mem_policy;
   3662 				if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
   3663 					lgrp_id_t id = policy_info->mem_lgrpid;
   3664 					ASSERT(id != LGRP_NONE);
   3665 					ASSERT(id < NLGRPS_MAX);
   3666 					lgrp = lgrp_table[id];
   3667 					if (!LGRP_EXISTS(lgrp)) {
   3668 						policy = LGRP_MEM_POLICY_NEXT;
   3669 					} else {
   3670 						lgrp_stat_add(id,
   3671 						    LGRP_NUM_NEXT_SEG, 1);
   3672 						return (lgrp);
   3673 					}
   3674 				}
   3675 			}
   3676 		}
   3677 	}
   3678 	lgrpset = 0;
   3679 
   3680 	/*
   3681 	 * Initialize lgroup to home by default
   3682 	 */
   3683 	lgrp = lgrp_home_lgrp();
   3684 
   3685 	/*
   3686 	 * When homing threads on root lgrp, override default memory
   3687 	 * allocation policies with root lgroup memory allocation policy
   3688 	 */
   3689 	if (lgrp == lgrp_root)
   3690 		policy = lgrp_mem_policy_root;
   3691 
   3692 	/*
   3693 	 * Implement policy
   3694 	 */
   3695 	switch (policy) {
   3696 	case LGRP_MEM_POLICY_NEXT_CPU:
   3697 
   3698 		/*
   3699 		 * Return lgroup of current CPU which faulted on memory
   3700 		 * If the CPU isn't currently in an lgrp, then opt to
   3701 		 * allocate from the root.
   3702 		 *
   3703 		 * Kernel preemption needs to be disabled here to prevent
   3704 		 * the current CPU from going away before lgrp is found.
   3705 		 */
   3706 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
   3707 			lgrp = lgrp_root;
   3708 		} else {
   3709 			kpreempt_disable();
   3710 			lgrp = lgrp_cpu_to_lgrp(CPU);
   3711 			kpreempt_enable();
   3712 		}
   3713 		break;
   3714 
   3715 	case LGRP_MEM_POLICY_NEXT:
   3716 	case LGRP_MEM_POLICY_DEFAULT:
   3717 	default:
   3718 
   3719 		/*
   3720 		 * Just return current thread's home lgroup
   3721 		 * for default policy (next touch)
   3722 		 * If the thread is homed to the root,
   3723 		 * then the default policy is random across lgroups.
   3724 		 * Fallthrough to the random case.
   3725 		 */
   3726 		if (lgrp != lgrp_root) {
   3727 			if (policy == LGRP_MEM_POLICY_NEXT)
   3728 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
   3729 			else
   3730 				lgrp_stat_add(lgrp->lgrp_id,
   3731 				    LGRP_NUM_DEFAULT, 1);
   3732 			break;
   3733 		}
   3734 		/* LINTED fallthrough on case statement */
   3735 	case LGRP_MEM_POLICY_RANDOM:
   3736 
   3737 		/*
   3738 		 * Return a random leaf lgroup with memory
   3739 		 */
   3740 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
   3741 		/*
   3742 		 * Count how many lgroups are spanned
   3743 		 */
   3744 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
   3745 
   3746 		/*
   3747 		 * There may be no memnodes in the root lgroup during DR copy
   3748 		 * rename on a system with only two boards (memnodes)
   3749 		 * configured. In this case just return the root lgrp.
   3750 		 */
   3751 		if (lgrps_spanned == 0) {
   3752 			lgrp = lgrp_root;
   3753 			break;
   3754 		}
   3755 
   3756 		/*
   3757 		 * Pick a random offset within lgroups spanned
   3758 		 * and return lgroup at that offset
   3759 		 */
   3760 		random = (ushort_t)gethrtime() >> 4;
   3761 		off = random % lgrps_spanned;
   3762 		ASSERT(off <= lgrp_alloc_max);
   3763 
   3764 		for (i = 0; i <= lgrp_alloc_max; i++) {
   3765 			if (!klgrpset_ismember(lgrpset, i))
   3766 				continue;
   3767 			if (off)
   3768 				off--;
   3769 			else {
   3770 				lgrp = lgrp_table[i];
   3771 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
   3772 				    1);
   3773 				break;
   3774 			}
   3775 		}
   3776 		break;
   3777 
   3778 	case LGRP_MEM_POLICY_RANDOM_PROC:
   3779 
   3780 		/*
   3781 		 * Grab copy of bitmask of lgroups spanned by
   3782 		 * this process
   3783 		 */
   3784 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
   3785 		stat = LGRP_NUM_RANDOM_PROC;
   3786 
   3787 		/* LINTED fallthrough on case statement */
   3788 	case LGRP_MEM_POLICY_RANDOM_PSET:
   3789 
   3790 		if (!stat)
   3791 			stat = LGRP_NUM_RANDOM_PSET;
   3792 
   3793 		if (klgrpset_isempty(lgrpset)) {
   3794 			/*
   3795 			 * Grab copy of bitmask of lgroups spanned by
   3796 			 * this processor set
   3797 			 */
   3798 			kpreempt_disable();
   3799 			klgrpset_copy(lgrpset,
   3800 			    curthread->t_cpupart->cp_lgrpset);
   3801 			kpreempt_enable();
   3802 		}
   3803 
   3804 		/*
   3805 		 * Count how many lgroups are spanned
   3806 		 */
   3807 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
   3808 		ASSERT(lgrps_spanned <= nlgrps);
   3809 
   3810 		/*
   3811 		 * Probably lgrps_spanned should be always non-zero, but to be
   3812 		 * on the safe side we return lgrp_root if it is empty.
   3813 		 */
   3814 		if (lgrps_spanned == 0) {
   3815 			lgrp = lgrp_root;
   3816 			break;
   3817 		}
   3818 
   3819 		/*
   3820 		 * Pick a random offset within lgroups spanned
   3821 		 * and return lgroup at that offset
   3822 		 */
   3823 		random = (ushort_t)gethrtime() >> 4;
   3824 		off = random % lgrps_spanned;
   3825 		ASSERT(off <= lgrp_alloc_max);
   3826 
   3827 		for (i = 0; i <= lgrp_alloc_max; i++) {
   3828 			if (!klgrpset_ismember(lgrpset, i))
   3829 				continue;
   3830 			if (off)
   3831 				off--;
   3832 			else {
   3833 				lgrp = lgrp_table[i];
   3834 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
   3835 				    1);
   3836 				break;
   3837 			}
   3838 		}
   3839 		break;
   3840 
   3841 	case LGRP_MEM_POLICY_ROUNDROBIN:
   3842 
   3843 		/*
   3844 		 * Use offset within segment to determine
   3845 		 * offset from home lgroup to choose for
   3846 		 * next lgroup to allocate memory from
   3847 		 */
   3848 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
   3849 		    (lgrp_alloc_max + 1);
   3850 
   3851 		kpreempt_disable();
   3852 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
   3853 		i = lgrp->lgrp_id;
   3854 		kpreempt_enable();
   3855 
   3856 		while (off > 0) {
   3857 			i = (i + 1) % (lgrp_alloc_max + 1);
   3858 			lgrp = lgrp_table[i];
   3859 			if (klgrpset_ismember(lgrpset, i))
   3860 				off--;
   3861 		}
   3862 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
   3863 
   3864 		break;
   3865 	}
   3866 
   3867 	ASSERT(lgrp != NULL);
   3868 	return (lgrp);
   3869 }
   3870 
   3871 /*
   3872  * Return the number of pages in an lgroup
   3873  *
   3874  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
   3875  *	 could cause tests that rely on the numat driver to fail....
   3876  */
   3877 pgcnt_t
   3878 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
   3879 {
   3880 	lgrp_t *lgrp;
   3881 
   3882 	lgrp = lgrp_table[lgrpid];
   3883 	if (!LGRP_EXISTS(lgrp) ||
   3884 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
   3885 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
   3886 		return (0);
   3887 
   3888 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
   3889 }
   3890 
   3891 /*
   3892  * Initialize lgroup shared memory allocation policy support
   3893  */
   3894 void
   3895 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
   3896 {
   3897 	lgrp_shm_locality_t	*shm_locality;
   3898 
   3899 	/*
   3900 	 * Initialize locality field in anon_map
   3901 	 * Don't need any locks because this is called when anon_map is
   3902 	 * allocated, but not used anywhere yet.
   3903 	 */
   3904 	if (amp) {
   3905 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   3906 		if (amp->locality == NULL) {
   3907 			/*
   3908 			 * Allocate and initialize shared memory locality info
   3909 			 * and set anon_map locality pointer to it
   3910 			 * Drop lock across kmem_alloc(KM_SLEEP)
   3911 			 */
   3912 			ANON_LOCK_EXIT(&amp->a_rwlock);
   3913 			shm_locality = kmem_alloc(sizeof (*shm_locality),
   3914 			    KM_SLEEP);
   3915 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
   3916 			    NULL);
   3917 			shm_locality->loc_count = 1;	/* not used for amp */
   3918 			shm_locality->loc_tree = NULL;
   3919 
   3920 			/*
   3921 			 * Reacquire lock and check to see whether anyone beat
   3922 			 * us to initializing the locality info
   3923 			 */
   3924 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
   3925 			if (amp->locality != NULL) {
   3926 				rw_destroy(&shm_locality->loc_lock);
   3927 				kmem_free(shm_locality,
   3928 				    sizeof (*shm_locality));
   3929 			} else
   3930 				amp->locality = shm_locality;
   3931 		}
   3932 		ANON_LOCK_EXIT(&amp->a_rwlock);
   3933 		return;
   3934 	}
   3935 
   3936 	/*
   3937 	 * Allocate shared vnode policy info if vnode is not locality aware yet
   3938 	 */
   3939 	mutex_enter(&vp->v_lock);
   3940 	if ((vp->v_flag & V_LOCALITY) == 0) {
   3941 		/*
   3942 		 * Allocate and initialize shared memory locality info
   3943 		 */
   3944 		mutex_exit(&vp->v_lock);
   3945 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
   3946 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
   3947 		shm_locality->loc_count = 1;
   3948 		shm_locality->loc_tree = NULL;
   3949 
   3950 		/*
   3951 		 * Point vnode locality field at shared vnode policy info
   3952 		 * and set locality aware flag in vnode
   3953 		 */
   3954 		mutex_enter(&vp->v_lock);
   3955 		if ((vp->v_flag & V_LOCALITY) == 0) {
   3956 			vp->v_locality = shm_locality;
   3957 			vp->v_flag |= V_LOCALITY;
   3958 		} else {
   3959 			/*
   3960 			 * Lost race so free locality info and increment count.
   3961 			 */
   3962 			rw_destroy(&shm_locality->loc_lock);
   3963 			kmem_free(shm_locality, sizeof (*shm_locality));
   3964 			shm_locality = vp->v_locality;
   3965 			shm_locality->loc_count++;
   3966 		}
   3967 		mutex_exit(&vp->v_lock);
   3968 
   3969 		return;
   3970 	}
   3971 
   3972 	/*
   3973 	 * Increment reference count of number of segments mapping this vnode
   3974 	 * shared
   3975 	 */
   3976 	shm_locality = vp->v_locality;
   3977 	shm_locality->loc_count++;
   3978 	mutex_exit(&vp->v_lock);
   3979 }
   3980 
   3981 /*
   3982  * Destroy the given shared memory policy segment tree
   3983  */
   3984 void
   3985 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
   3986 {
   3987 	lgrp_shm_policy_seg_t	*cur;
   3988 	lgrp_shm_policy_seg_t	*next;
   3989 
   3990 	if (tree == NULL)
   3991 		return;
   3992 
   3993 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
   3994 	while (cur != NULL) {
   3995 		next = AVL_NEXT(tree, cur);
   3996 		avl_remove(tree, cur);
   3997 		kmem_free(cur, sizeof (*cur));
   3998 		cur = next;
   3999 	}
   4000 	kmem_free(tree, sizeof (avl_tree_t));
   4001 }
   4002 
   4003 /*
   4004  * Uninitialize lgroup shared memory allocation policy support
   4005  */
   4006 void
   4007 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
   4008 {
   4009 	lgrp_shm_locality_t	*shm_locality;
   4010 
   4011 	/*
   4012 	 * For anon_map, deallocate shared memory policy tree and
   4013 	 * zero locality field
   4014 	 * Don't need any locks because anon_map is being freed
   4015 	 */
   4016 	if (amp) {
   4017 		if (amp->locality == NULL)
   4018 			return;
   4019 		shm_locality = amp->locality;
   4020 		shm_locality->loc_count = 0;	/* not really used for amp */
   4021 		rw_destroy(&shm_locality->loc_lock);
   4022 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
   4023 		kmem_free(shm_locality, sizeof (*shm_locality));
   4024 		amp->locality = 0;
   4025 		return;
   4026 	}
   4027 
   4028 	/*
   4029 	 * For vnode, decrement reference count of segments mapping this vnode
   4030 	 * shared and delete locality info if reference count drops to 0
   4031 	 */
   4032 	mutex_enter(&vp->v_lock);
   4033 	shm_locality = vp->v_locality;
   4034 	shm_locality->loc_count--;
   4035 
   4036 	if (shm_locality->loc_count == 0) {
   4037 		rw_destroy(&shm_locality->loc_lock);
   4038 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
   4039 		kmem_free(shm_locality, sizeof (*shm_locality));
   4040 		vp->v_locality = 0;
   4041 		vp->v_flag &= ~V_LOCALITY;
   4042 	}
   4043 	mutex_exit(&vp->v_lock);
   4044 }
   4045 
   4046 /*
   4047  * Compare two shared memory policy segments
   4048  * Used by AVL tree code for searching
   4049  */
   4050 int
   4051 lgrp_shm_policy_compar(const void *x, const void *y)
   4052 {
   4053 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
   4054 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
   4055 
   4056 	if (a->shm_off < b->shm_off)
   4057 		return (-1);
   4058 	if (a->shm_off >= b->shm_off + b->shm_size)
   4059 		return (1);
   4060 	return (0);
   4061 }
   4062 
   4063 /*
   4064  * Concatenate seg1 with seg2 and remove seg2
   4065  */
   4066 static int
   4067 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
   4068     lgrp_shm_policy_seg_t *seg2)
   4069 {
   4070 	if (!seg1 || !seg2 ||
   4071 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
   4072 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
   4073 		return (-1);
   4074 
   4075 	seg1->shm_size += seg2->shm_size;
   4076 	avl_remove(tree, seg2);
   4077 	kmem_free(seg2, sizeof (*seg2));
   4078 	return (0);
   4079 }
   4080 
   4081 /*
   4082  * Split segment at given offset and return rightmost (uppermost) segment
   4083  * Assumes that there are no overlapping segments
   4084  */
   4085 static lgrp_shm_policy_seg_t *
   4086 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
   4087     u_offset_t off)
   4088 {
   4089 	lgrp_shm_policy_seg_t	*newseg;
   4090 	avl_index_t		where;
   4091 
   4092 	ASSERT(seg != NULL);
   4093 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
   4094 
   4095 	if (!seg || off < seg->shm_off || off > seg->shm_off +
   4096 	    seg->shm_size)
   4097 		return (NULL);
   4098 
   4099 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
   4100 		return (seg);
   4101 
   4102 	/*
   4103 	 * Adjust size of left segment and allocate new (right) segment
   4104 	 */
   4105 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
   4106 	newseg->shm_policy = seg->shm_policy;
   4107 	newseg->shm_off = off;
   4108 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
   4109 	seg->shm_size = off - seg->shm_off;
   4110 
   4111 	/*
   4112 	 * Find where to insert new segment in AVL tree and insert it
   4113 	 */
   4114 	(void) avl_find(tree, &off, &where);
   4115 	avl_insert(tree, newseg, where);
   4116 
   4117 	return (newseg);
   4118 }
   4119 
   4120 /*
   4121  * Set shared memory allocation policy on specified shared object at given
   4122  * offset and length
   4123  *
   4124  * Return 0 if policy wasn't set already, 1 if policy was set already, and
   4125  * -1 if can't set policy.
   4126  */
   4127 int
   4128 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
   4129     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
   4130 {
   4131 	u_offset_t		eoff;
   4132 	lgrp_shm_policy_seg_t	*next;
   4133 	lgrp_shm_policy_seg_t	*newseg;
   4134 	u_offset_t		off;
   4135 	u_offset_t		oldeoff;
   4136 	lgrp_shm_policy_seg_t	*prev;
   4137 	int			retval;
   4138 	lgrp_shm_policy_seg_t	*seg;
   4139 	lgrp_shm_locality_t	*shm_locality;
   4140 	avl_tree_t		*tree;
   4141 	avl_index_t		where;
   4142 
   4143 	ASSERT(amp || vp);
   4144 	ASSERT((len & PAGEOFFSET) == 0);
   4145 
   4146 	if (len == 0)
   4147 		return (-1);
   4148 
   4149 	retval = 0;
   4150 
   4151 	/*
   4152 	 * Get locality info and starting offset into shared object
   4153 	 * Try anon map first and then vnode
   4154 	 * Assume that no locks need to be held on anon_map or vnode, since
   4155 	 * it should be protected by its reference count which must be nonzero
   4156 	 * for an existing segment.
   4157 	 */
   4158 	if (amp) {
   4159 		/*
   4160 		 * Get policy info from anon_map
   4161 		 *
   4162 		 */
   4163 		ASSERT(amp->refcnt != 0);
   4164 		if (amp->locality == NULL)
   4165 			lgrp_shm_policy_init(amp, NULL);
   4166 		shm_locality = amp->locality;
   4167 		off = ptob(anon_index);
   4168 	} else if (vp) {
   4169 		/*
   4170 		 * Get policy info from vnode
   4171 		 */
   4172 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
   4173 			lgrp_shm_policy_init(NULL, vp);
   4174 		shm_locality = vp->v_locality;
   4175 		ASSERT(shm_locality->loc_count != 0);
   4176 		off = vn_off;
   4177 	} else
   4178 		return (-1);
   4179 
   4180 	ASSERT((off & PAGEOFFSET) == 0);
   4181 
   4182 	/*
   4183 	 * Figure out default policy
   4184 	 */
   4185 	if (policy == LGRP_MEM_POLICY_DEFAULT)
   4186 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
   4187 
   4188 	/*
   4189 	 * Create AVL tree if there isn't one yet
   4190 	 * and set locality field to point at it
   4191 	 */
   4192 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
   4193 	tree = shm_locality->loc_tree;
   4194 	if (!tree) {
   4195 		rw_exit(&shm_locality->loc_lock);
   4196 
   4197 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
   4198 
   4199 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
   4200 		if (shm_locality->loc_tree == NULL) {
   4201 			avl_create(tree, lgrp_shm_policy_compar,
   4202 			    sizeof (lgrp_shm_policy_seg_t),
   4203 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
   4204 			shm_locality->loc_tree = tree;
   4205 		} else {
   4206 			/*
   4207 			 * Another thread managed to set up the tree
   4208 			 * before we could. Free the tree we allocated
   4209 			 * and use the one that's already there.
   4210 			 */
   4211 			kmem_free(tree, sizeof (*tree));
   4212 			tree = shm_locality->loc_tree;
   4213 		}
   4214 	}
   4215 
   4216 	/*
   4217 	 * Set policy
   4218 	 *
   4219 	 * Need to maintain hold on writer's lock to keep tree from
   4220 	 * changing out from under us
   4221 	 */
   4222 	while (len != 0) {
   4223 		/*
   4224 		 * Find policy segment for specified offset into shared object
   4225 		 */
   4226 		seg = avl_find(tree, &off, &where);
   4227 
   4228 		/*
   4229 		 * Didn't find any existing segment that contains specified
   4230 		 * offset, so allocate new segment, insert it, and concatenate
   4231 		 * with adjacent segments if possible
   4232 		 */
   4233 		if (seg == NULL) {
   4234 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
   4235 			    KM_SLEEP);
   4236 			newseg->shm_policy.mem_policy = policy;
   4237 			newseg->shm_policy.mem_lgrpid = LGRP_NONE;
   4238 			newseg->shm_off = off;
   4239 			avl_insert(tree, newseg, where);
   4240 
   4241 			/*
   4242 			 * Check to see whether new segment overlaps with next
   4243 			 * one, set length of new segment accordingly, and
   4244 			 * calculate remaining length and next offset
   4245 			 */
   4246 			seg = AVL_NEXT(tree, newseg);
   4247 			if (seg == NULL || off + len <= seg->shm_off) {
   4248 				newseg->shm_size = len;
   4249 				len = 0;
   4250 			} else {
   4251 				newseg->shm_size = seg->shm_off - off;
   4252 				off = seg->shm_off;
   4253 				len -= newseg->shm_size;
   4254 			}
   4255 
   4256 			/*
   4257 			 * Try to concatenate new segment with next and
   4258 			 * previous ones, since they might have the same policy
   4259 			 * now.  Grab previous and next segments first because
   4260 			 * they will change on concatenation.
   4261 			 */
   4262 			prev =  AVL_PREV(tree, newseg);
   4263 			next = AVL_NEXT(tree, newseg);
   4264 			(void) lgrp_shm_policy_concat(tree, newseg, next);
   4265 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
   4266 
   4267 			continue;
   4268 		}
   4269 
   4270 		eoff = off + len;
   4271 		oldeoff = seg->shm_off + seg->shm_size;
   4272 
   4273 		/*
   4274 		 * Policy set already?
   4275 		 */
   4276 		if (policy == seg->shm_policy.mem_policy) {
   4277 			/*
   4278 			 * Nothing left to do if offset and length
   4279 			 * fall within this segment
   4280 			 */
   4281 			if (eoff <= oldeoff) {
   4282 				retval = 1;
   4283 				break;
   4284 			} else {
   4285 				len = eoff - oldeoff;
   4286 				off = oldeoff;
   4287 				continue;
   4288 			}
   4289 		}
   4290 
   4291 		/*
   4292 		 * Specified offset and length match existing segment exactly
   4293 		 */
   4294 		if (off == seg->shm_off && len == seg->shm_size) {
   4295 			/*
   4296 			 * Set policy and update current length
   4297 			 */
   4298 			seg->shm_policy.mem_policy = policy;
   4299 			seg->shm_policy.mem_lgrpid = LGRP_NONE;
   4300 			len = 0;
   4301 
   4302 			/*
   4303 			 * Try concatenating new segment with previous and next
   4304 			 * segments, since they might have the same policy now.
   4305 			 * Grab previous and next segments first because they
   4306 			 * will change on concatenation.
   4307 			 */
   4308 			prev =  AVL_PREV(tree, seg);
   4309 			next = AVL_NEXT(tree, seg);
   4310 			(void) lgrp_shm_policy_concat(tree, seg, next);
   4311 			(void) lgrp_shm_policy_concat(tree, prev, seg);
   4312 		} else {
   4313 			/*
   4314 			 * Specified offset and length only apply to part of
   4315 			 * existing segment
   4316 			 */
   4317 
   4318 			/*
   4319 			 * New segment starts in middle of old one, so split
   4320 			 * new one off near beginning of old one
   4321 			 */
   4322 			newseg = NULL;
   4323 			if (off > seg->shm_off) {
   4324 				newseg = lgrp_shm_policy_split(tree, seg, off);
   4325 
   4326 				/*
   4327 				 * New segment ends where old one did, so try
   4328 				 * to concatenate with next segment
   4329 				 */
   4330 				if (eoff == oldeoff) {
   4331 					newseg->shm_policy.mem_policy = policy;
   4332 					newseg->shm_policy.mem_lgrpid =
   4333 					    LGRP_NONE;
   4334 					(void) lgrp_shm_policy_concat(tree,
   4335 					    newseg, AVL_NEXT(tree, newseg));
   4336 					break;
   4337 				}
   4338 			}
   4339 
   4340 			/*
   4341 			 * New segment ends before old one, so split off end of
   4342 			 * old one
   4343 			 */
   4344 			if (eoff < oldeoff) {
   4345 				if (newseg) {
   4346 					(void) lgrp_shm_policy_split(tree,
   4347 					    newseg, eoff);
   4348 					newseg->shm_policy.mem_policy = policy;
   4349 					newseg->shm_policy.mem_lgrpid =
   4350 					    LGRP_NONE;
   4351 				} else {
   4352 					(void) lgrp_shm_policy_split(tree, seg,
   4353 					    eoff);
   4354 					seg->shm_policy.mem_policy = policy;
   4355 					seg->shm_policy.mem_lgrpid = LGRP_NONE;
   4356 				}
   4357 
   4358 				if (off == seg->shm_off)
   4359 					(void) lgrp_shm_policy_concat(tree,
   4360 					    AVL_PREV(tree, seg), seg);
   4361 				break;
   4362 			}
   4363 
   4364 			/*
   4365 			 * Calculate remaining length and next offset
   4366 			 */
   4367 			len = eoff - oldeoff;
   4368 			off = oldeoff;
   4369 		}
   4370 	}
   4371 
   4372 	rw_exit(&shm_locality->loc_lock);
   4373 	return (retval);
   4374 }
   4375 
   4376 /*
   4377  * Return the best memnode from which to allocate memory given
   4378  * an lgroup.
   4379  *
   4380  * "c" is for cookie, which is good enough for me.
   4381  * It references a cookie struct that should be zero'ed to initialize.
   4382  * The cookie should live on the caller's stack.
   4383  *
   4384  * The routine returns -1 when:
   4385  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
   4386  *	- traverse is 1, and all the memnodes in the system have been
   4387  *	  returned.
   4388  */
   4389 int
   4390 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
   4391 {
   4392 	lgrp_t		*lp = c->lmc_lgrp;
   4393 	mnodeset_t	nodes = c->lmc_nodes;
   4394 	int		cnt = c->lmc_cnt;
   4395 	int		offset, mnode;
   4396 
   4397 	extern int	max_mem_nodes;
   4398 
   4399 	/*
   4400 	 * If the set is empty, and the caller is willing, traverse
   4401 	 * up the hierarchy until we find a non-empty set.
   4402 	 */
   4403 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
   4404 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
   4405 		    ((lp = lp->lgrp_parent) == NULL))
   4406 			return (-1);
   4407 
   4408 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
   4409 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
   4410 	}
   4411 
   4412 	/*
   4413 	 * Select a memnode by picking one at a "random" offset.
   4414 	 * Because of DR, memnodes can come and go at any time.
   4415 	 * This code must be able to cope with the possibility
   4416 	 * that the nodes count "cnt" is inconsistent with respect
   4417 	 * to the number of elements actually in "nodes", and
   4418 	 * therefore that the offset chosen could be greater than
   4419 	 * the number of elements in the set (some memnodes may
   4420 	 * have dissapeared just before cnt was read).
   4421 	 * If this happens, the search simply wraps back to the
   4422 	 * beginning of the set.
   4423 	 */
   4424 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
   4425 	offset = c->lmc_rand % cnt;
   4426 	do {
   4427 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
   4428 			if (nodes & ((mnodeset_t)1 << mnode))
   4429 				if (!offset--)
   4430 					break;
   4431 	} while (mnode >= max_mem_nodes);
   4432 
   4433 	/* Found a node. Store state before returning. */
   4434 	c->lmc_lgrp = lp;
   4435 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
   4436 	c->lmc_cnt = cnt - 1;
   4437 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
   4438 	c->lmc_ntried++;
   4439 
   4440 	return (mnode);
   4441 }
   4442