Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 #include <sys/systm.h>
     27 #include <sys/types.h>
     28 #include <sys/param.h>
     29 #include <sys/thread.h>
     30 #include <sys/cpuvar.h>
     31 #include <sys/cpupart.h>
     32 #include <sys/kmem.h>
     33 #include <sys/cmn_err.h>
     34 #include <sys/kstat.h>
     35 #include <sys/processor.h>
     36 #include <sys/disp.h>
     37 #include <sys/group.h>
     38 #include <sys/pghw.h>
     39 #include <sys/bitset.h>
     40 #include <sys/lgrp.h>
     41 #include <sys/cmt.h>
     42 #include <sys/cpu_pm.h>
     43 
     44 /*
     45  * CMT scheduler / dispatcher support
     46  *
     47  * This file implements CMT scheduler support using Processor Groups.
     48  * The CMT processor group class creates and maintains the CMT class
     49  * specific processor group pg_cmt_t.
     50  *
     51  * ---------------------------- <-- pg_cmt_t *
     52  * | pghw_t                   |
     53  * ----------------------------
     54  * | CMT class specific data  |
     55  * | - hierarchy linkage      |
     56  * | - CMT load balancing data|
     57  * | - active CPU group/bitset|
     58  * ----------------------------
     59  *
     60  * The scheduler/dispatcher leverages knowledge of the performance
     61  * relevant CMT sharing relationships existing between cpus to implement
     62  * optimized affinity, load balancing, and coalescence policies.
     63  *
     64  * Load balancing policy seeks to improve performance by minimizing
      65  * contention over shared processor resources / facilities. Affinity
     66  * policies seek to improve cache and TLB utilization. Coalescence
     67  * policies improve resource utilization and ultimately power efficiency.
     68  *
     69  * The CMT PGs created by this class are already arranged into a
     70  * hierarchy (which is done in the pghw layer). To implement the top-down
     71  * CMT load balancing algorithm, the CMT PGs additionally maintain
     72  * parent, child and sibling hierarchy relationships.
      73  * Parent PGs always contain a superset of their children's resources,
     74  * each PG can have at most one parent, and siblings are the group of PGs
     75  * sharing the same parent.
     76  *
     77  * On UMA based systems, the CMT load balancing algorithm begins by balancing
     78  * load across the group of top level PGs in the system hierarchy.
     79  * On NUMA systems, the CMT load balancing algorithm balances load across the
     80  * group of top level PGs in each leaf lgroup...but for root homed threads,
     81  * is willing to balance against all the top level PGs in the system.
     82  *
     83  * Groups of top level PGs are maintained to implement the above, one for each
     84  * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
     85  * root lgroup) that contains all the top level PGs in the system.
     86  */
/* Head of the list of cmt_lgrp_t structures maintained by this file. */
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */

/*
 * The cmt_lgrp_t the boot CPU was first placed in. It is recorded by
 * pg_cmt_cpu_init() and consulted by pg_cmt_cpu_fini() for the
 * null_proc_lpa special case.
 */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */

/*
 * The cmt_lgrp_t created for the platform's root lgroup handle; its
 * cl_pgs group holds the top level PGs used when balancing across the
 * whole system. Created lazily by pg_cmt_cpu_init().
 */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

/*
 * Non-zero until pg_cmt_cpu_init() has completed once, at which point the
 * boot CPU's lgrp has been cached in cpu0_lgrp.
 */
static int		is_cpu0 = 1; /* true if this is boot CPU context */
     93 
/*
 * Array of hardware sharing relationships that are blacklisted, indexed by
 * pghw_type_t. An entry equal to 1 marks the relationship as blacklisted.
 *
 * PGs are still created for blacklisted sharing relationships (see
 * pg_cmt_cpu_init()), but they are given CMT_NO_POLICY so that no CMT
 * scheduling optimizations are performed against them.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
    100 
/*
 * Set this to non-zero to disable CMT scheduling.
 * This must be done via kmdb -d, as /etc/system will be too late.
 *
 * When set, pg_cmt_class_init() does not register the class and the
 * class callbacks in this file return without doing any work.
 */
int			cmt_sched_disabled = 0;
    106 
/*
 * Status codes for CMT lineage validation, as returned by
 * pg_cmt_lineage_validate().
 *
 * pg_cmt_cpu_init() continues building the CPU's hierarchy only when the
 * status is CMT_LINEAGE_VALID or CMT_LINEAGE_REPAIRED; every other status
 * is treated as an unrecoverable error and causes it to return early.
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;
    119 
/*
 * Status of the current lineage under construction.
 * The caller must hold cpu_lock to change this.
 * It starts out as CMT_LINEAGE_VALID.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
    125 
/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 *
 * A hardware sharing relationship is considered "suspect" exactly when
 * it is a power management domain.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)
    131 
/*
 * Macro to test if PG is managed by the CMT PG class.
 * Evaluates to non-zero when the PG's class id matches the id this class
 * was assigned at registration (pg_cmt_class_id).
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/* Class id assigned by pg_class_register() in pg_cmt_class_init() */
static pg_cid_t		pg_cmt_class_id;		/* PG class id */
    138 
    139 static pg_t		*pg_cmt_alloc();
    140 static void		pg_cmt_free(pg_t *);
    141 static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
    142 static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
    143 static void		pg_cmt_cpu_active(cpu_t *);
    144 static void		pg_cmt_cpu_inactive(cpu_t *);
    145 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
    146 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
    147 static char		*pg_cmt_policy_name(pg_t *);
    148 static void		pg_cmt_hier_sort(pg_cmt_t **, int);
    149 static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
    150 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
    151 static int		pg_cmt_hw(pghw_type_t);
    152 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
    153 static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
    154 static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
    155 			    kthread_t *, kthread_t *);
    156 static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
    157 			    kthread_t *, kthread_t *);
    158 static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
    159 static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
    160 			    cpu_pg_t *);
    161 
    162 
/*
 * CMT PG ops
 *
 * Operations vector handed to pg_class_register() by pg_cmt_class_init().
 * The initializer is positional, so the order of the entries below must
 * match the member order of struct pg_ops. No handler is supplied for the
 * cpupart_out slot.
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};
    179 
    180 /*
    181  * Initialize the CMT PG class
    182  */
    183 void
    184 pg_cmt_class_init(void)
    185 {
    186 	if (cmt_sched_disabled)
    187 		return;
    188 
    189 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
    190 }
    191 
    192 /*
    193  * Called to indicate a new CPU has started up so
    194  * that either t0 or the slave startup thread can
    195  * be accounted for.
    196  */
    197 void
    198 pg_cmt_cpu_startup(cpu_t *cp)
    199 {
    200 	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
    201 	    cp->cpu_thread);
    202 }
    203 
    204 /*
    205  * Return non-zero if thread can migrate between "from" and "to"
    206  * without a performance penalty
    207  */
    208 int
    209 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
    210 {
    211 	if (from->cpu_physid->cpu_cacheid ==
    212 	    to->cpu_physid->cpu_cacheid)
    213 		return (1);
    214 	return (0);
    215 }
    216 
    217 /*
    218  * CMT class specific PG allocation
    219  */
    220 static pg_t *
    221 pg_cmt_alloc(void)
    222 {
    223 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
    224 }
    225 
    226 /*
    227  * Class specific PG de-allocation
    228  */
    229 static void
    230 pg_cmt_free(pg_t *pg)
    231 {
    232 	ASSERT(pg != NULL);
    233 	ASSERT(IS_CMT_PG(pg));
    234 
    235 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
    236 }
    237 
    238 /*
    239  * Given a hardware sharing relationship, return which dispatcher
    240  * policies should be implemented to optimize performance and efficiency
    241  */
    242 static pg_cmt_policy_t
    243 pg_cmt_policy(pghw_type_t hw)
    244 {
    245 	pg_cmt_policy_t p;
    246 
    247 	/*
    248 	 * Give the platform a chance to override the default
    249 	 */
    250 	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
    251 		return (p);
    252 
    253 	switch (hw) {
    254 	case PGHW_IPIPE:
    255 	case PGHW_FPU:
    256 	case PGHW_PROCNODE:
    257 	case PGHW_CHIP:
    258 		return (CMT_BALANCE);
    259 	case PGHW_CACHE:
    260 		return (CMT_AFFINITY);
    261 	case PGHW_POW_ACTIVE:
    262 	case PGHW_POW_IDLE:
    263 		return (CMT_BALANCE);
    264 	default:
    265 		return (CMT_NO_POLICY);
    266 	}
    267 }
    268 
    269 /*
    270  * Rank the importance of optimizing for the pg1 relationship vs.
    271  * the pg2 relationship.
    272  */
    273 static pg_cmt_t *
    274 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
    275 {
    276 	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
    277 	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
    278 
    279 	/*
    280 	 * A power domain is only important if CPUPM is enabled.
    281 	 */
    282 	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
    283 		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
    284 			return (pg2);
    285 		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
    286 			return (pg1);
    287 	}
    288 
    289 	/*
    290 	 * Otherwise, ask the platform
    291 	 */
    292 	if (pg_plat_hw_rank(hw1, hw2) == hw1)
    293 		return (pg1);
    294 	else
    295 		return (pg2);
    296 }
    297 
    298 /*
    299  * Initialize CMT callbacks for the given PG
    300  */
    301 static void
    302 cmt_callback_init(pg_t *pg)
    303 {
    304 	/*
    305 	 * Stick with the default callbacks if there isn't going to be
    306 	 * any CMT thread placement optimizations implemented.
    307 	 */
    308 	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
    309 		return;
    310 
    311 	switch (((pghw_t *)pg)->pghw_hw) {
    312 	case PGHW_POW_ACTIVE:
    313 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
    314 		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
    315 		break;
    316 	default:
    317 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
    318 
    319 	}
    320 }
    321 
    322 /*
    323  * Promote PG above it's current parent.
    324  * This is only legal if PG has an equal or greater number of CPUs than its
    325  * parent.
    326  *
    327  * This routine operates on the CPU specific processor group data (for the CPUs
    328  * in the PG being promoted), and may be invoked from a context where one CPU's
    329  * PG data is under construction. In this case the argument "pgdata", if not
    330  * NULL, is a reference to the CPU's under-construction PG data.
    331  */
    332 static void
    333 cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
    334 {
    335 	pg_cmt_t	*parent;
    336 	group_t		*children;
    337 	cpu_t		*cpu;
    338 	group_iter_t	iter;
    339 	pg_cpu_itr_t	cpu_iter;
    340 	int		r;
    341 	int		err;
    342 
    343 	ASSERT(MUTEX_HELD(&cpu_lock));
    344 
    345 	parent = pg->cmt_parent;
    346 	if (parent == NULL) {
    347 		/*
    348 		 * Nothing to do
    349 		 */
    350 		return;
    351 	}
    352 
    353 	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
    354 
    355 	/*
    356 	 * We're changing around the hierarchy, which is actively traversed
    357 	 * by the dispatcher. Pause CPUS to ensure exclusivity.
    358 	 */
    359 	pause_cpus(NULL);
    360 
    361 	/*
    362 	 * If necessary, update the parent's sibling set, replacing parent
    363 	 * with PG.
    364 	 */
    365 	if (parent->cmt_siblings) {
    366 		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
    367 		    != -1) {
    368 			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
    369 			ASSERT(r != -1);
    370 		}
    371 	}
    372 
    373 	/*
    374 	 * If the parent is at the top of the hierarchy, replace it's entry
    375 	 * in the root lgroup's group of top level PGs.
    376 	 */
    377 	if (parent->cmt_parent == NULL &&
    378 	    parent->cmt_siblings != &cmt_root->cl_pgs) {
    379 		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
    380 		    != -1) {
    381 			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
    382 			ASSERT(r != -1);
    383 		}
    384 	}
    385 
    386 	/*
    387 	 * We assume (and therefore assert) that the PG being promoted is an
    388 	 * only child of it's parent. Update the parent's children set
    389 	 * replacing PG's entry with the parent (since the parent is becoming
    390 	 * the child). Then have PG and the parent swap children sets.
    391 	 */
    392 	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
    393 	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
    394 		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
    395 		ASSERT(r != -1);
    396 	}
    397 
    398 	children = pg->cmt_children;
    399 	pg->cmt_children = parent->cmt_children;
    400 	parent->cmt_children = children;
    401 
    402 	/*
    403 	 * Update the sibling references for PG and it's parent
    404 	 */
    405 	pg->cmt_siblings = parent->cmt_siblings;
    406 	parent->cmt_siblings = pg->cmt_children;
    407 
    408 	/*
    409 	 * Update any cached lineages in the per CPU pg data.
    410 	 */
    411 	PG_CPU_ITR_INIT(pg, cpu_iter);
    412 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
    413 		int		idx;
    414 		pg_cmt_t	*cpu_pg;
    415 		cpu_pg_t	*pgd;	/* CPU's PG data */
    416 
    417 		/*
    418 		 * The CPU's whose lineage is under construction still
    419 		 * references the bootstrap CPU PG data structure.
    420 		 */
    421 		if (pg_cpu_is_bootstrapped(cpu))
    422 			pgd = pgdata;
    423 		else
    424 			pgd = cpu->cpu_pg;
    425 
    426 		/*
    427 		 * Iterate over the CPU's PGs updating the children
    428 		 * of the PG being promoted, since they have a new parent.
    429 		 */
    430 		group_iter_init(&iter);
    431 		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
    432 			if (cpu_pg->cmt_parent == pg) {
    433 				cpu_pg->cmt_parent = parent;
    434 			}
    435 		}
    436 
    437 		/*
    438 		 * Update the CMT load balancing lineage
    439 		 */
    440 		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
    441 			/*
    442 			 * Unless this is the CPU who's lineage is being
    443 			 * constructed, the PG being promoted should be
    444 			 * in the lineage.
    445 			 */
    446 			ASSERT(pg_cpu_is_bootstrapped(cpu));
    447 			continue;
    448 		}
    449 
    450 		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);
    451 		ASSERT(idx > 0);
    452 
    453 		/*
    454 		 * Have the child and the parent swap places in the CPU's
    455 		 * lineage
    456 		 */
    457 		group_remove_at(&pgd->cmt_pgs, idx);
    458 		group_remove_at(&pgd->cmt_pgs, idx - 1);
    459 		err = group_add_at(&pgd->cmt_pgs, parent, idx);
    460 		ASSERT(err == 0);
    461 		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
    462 		ASSERT(err == 0);
    463 	}
    464 
    465 	/*
    466 	 * Update the parent references for PG and it's parent
    467 	 */
    468 	pg->cmt_parent = parent->cmt_parent;
    469 	parent->cmt_parent = pg;
    470 
    471 	start_cpus();
    472 }
    473 
/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * While this routine runs, cp->cpu_pg (which the dispatcher uses to access
 * the CPU's PG data) references a "bootstrap" structure. pg_cmt_cpu_init()
 * and the routines it calls must be careful to operate only on the "pgdata"
 * argument, and not cp->cpu_pg.
 *
 * The caller must hold cpu_lock. This routine does nothing when CMT
 * scheduling is disabled.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	/*
	 * cpu_cmt_hier collects the PGs that will form this CPU's load
	 * balancing lineage; "levels" counts how many have been collected.
	 */
	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset: the group
		 * must be able to hold every CPU in the PG, and the bitset
		 * must be able to index this CPU's sequence id.
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	/* Make room in the CPU's lineage group for every collected level */
	group_expand(cmt_pgs, levels);

	/* The root cmt_lgrp_t is created the first time through */
	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy,
	 * creating it if this is the first CPU seen in that lgrp.
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 *
	 * Whenever a promotion reorganizes the hierarchy, the traversal
	 * restarts from the top of the lineage.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		/*
		 * cpu_cmt_hier is sorted ascending, while cmt_pgs is stored
		 * in the reverse order, hence the inverted index.
		 */
		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		/* The first (smallest) PG anchors the CPU's lineage */
		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			/*
			 * Top of the lineage: the PG has no parent, and its
			 * siblings are the lgrp's top level PGs. The root
			 * lgrp's count is bumped as well when this lgrp is
			 * not itself the root.
			 */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		/*
		 * Grow the sibling group (and the root's group of top level
		 * PGs) to match the updated counts.
		 */
		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization: remember the boot CPU's lgrp */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}
    742 
/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * Across this function's invocation, cp->cpu_pg (which the dispatcher uses
 * to access the CPU's PG data) references a "bootstrap" structure.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 *
 * This routine does nothing when CMT scheduling is disabled.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in.  This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing.
	 * The walk starts at the bottom of the lineage and follows the
	 * parent links upwards.
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty (this CPU is its last),
		 * destroy its children group, and remove its reference
		 * from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				/*
				 * A PG whose siblings are the lgrp's top
				 * level PGs is accounted for in the lgrp;
				 * any other PG is accounted for in its
				 * parent.
				 */
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}
    862 
    863 /*
    864  * Class callback when a CPU is entering a cpu partition
    865  */
    866 static void
    867 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
    868 {
    869 	group_t		*pgs;
    870 	pg_t		*pg;
    871 	group_iter_t	i;
    872 
    873 	ASSERT(MUTEX_HELD(&cpu_lock));
    874 
    875 	if (cmt_sched_disabled)
    876 		return;
    877 
    878 	pgs = &cp->cpu_pg->pgs;
    879 
    880 	/*
    881 	 * Ensure that the new partition's PG bitset
    882 	 * is large enough for all CMT PG's to which cp
    883 	 * belongs
    884 	 */
    885 	group_iter_init(&i);
    886 	while ((pg = group_iterate(pgs, &i)) != NULL) {
    887 		if (IS_CMT_PG(pg) == 0)
    888 			continue;
    889 
    890 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
    891 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
    892 	}
    893 }
    894 
    895 /*
    896  * Class callback when a CPU is actually moving partitions
    897  */
    898 static void
    899 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
    900 {
    901 	cpu_t		*cpp;
    902 	group_t		*pgs;
    903 	pg_t		*pg;
    904 	group_iter_t	pg_iter;
    905 	pg_cpu_itr_t	cpu_iter;
    906 	boolean_t	found;
    907 
    908 	ASSERT(MUTEX_HELD(&cpu_lock));
    909 
    910 	if (cmt_sched_disabled)
    911 		return;
    912 
    913 	pgs = &cp->cpu_pg->pgs;
    914 	group_iter_init(&pg_iter);
    915 
    916 	/*
    917 	 * Iterate over the CPUs CMT PGs
    918 	 */
    919 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
    920 
    921 		if (IS_CMT_PG(pg) == 0)
    922 			continue;
    923 
    924 		/*
    925 		 * Add the PG to the bitset in the new partition.
    926 		 */
    927 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
    928 
    929 		/*
    930 		 * Remove the PG from the bitset in the old partition
    931 		 * if the last of the PG's CPUs have left.
    932 		 */
    933 		found = B_FALSE;
    934 		PG_CPU_ITR_INIT(pg, cpu_iter);
    935 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
    936 			if (cpp == cp)
    937 				continue;
    938 			if (CPU_ACTIVE(cpp) &&
    939 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
    940 				found = B_TRUE;
    941 				break;
    942 			}
    943 		}
    944 		if (!found)
    945 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
    946 	}
    947 }
    948 
    949 /*
    950  * Class callback when a CPU becomes active (online)
    951  *
    952  * This is called in a context where CPUs are paused
    953  */
    954 static void
    955 pg_cmt_cpu_active(cpu_t *cp)
    956 {
    957 	int		err;
    958 	group_iter_t	i;
    959 	pg_cmt_t	*pg;
    960 	group_t		*pgs;
    961 
    962 	ASSERT(MUTEX_HELD(&cpu_lock));
    963 
    964 	if (cmt_sched_disabled)
    965 		return;
    966 
    967 	pgs = &cp->cpu_pg->pgs;
    968 	group_iter_init(&i);
    969 
    970 	/*
    971 	 * Iterate over the CPU's PGs
    972 	 */
    973 	while ((pg = group_iterate(pgs, &i)) != NULL) {
    974 
    975 		if (IS_CMT_PG(pg) == 0)
    976 			continue;
    977 
    978 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
    979 		ASSERT(err == 0);
    980 
    981 		/*
    982 		 * If this is the first active CPU in the PG, and it
    983 		 * represents a hardware sharing relationship over which
    984 		 * CMT load balancing is performed, add it as a candidate
    985 		 * for balancing with it's siblings.
    986 		 */
    987 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
    988 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
    989 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
    990 			ASSERT(err == 0);
    991 
    992 			/*
    993 			 * If this is a top level PG, add it as a balancing
    994 			 * candidate when balancing within the root lgroup.
    995 			 */
    996 			if (pg->cmt_parent == NULL &&
    997 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
    998 				err = group_add(&cmt_root->cl_pgs, pg,
    999 				    GRP_NORESIZE);
   1000 				ASSERT(err == 0);
   1001 			}
   1002 		}
   1003 
   1004 		/*
   1005 		 * Notate the CPU in the PGs active CPU bitset.
   1006 		 * Also notate the PG as being active in it's associated
   1007 		 * partition
   1008 		 */
   1009 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
   1010 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
   1011 	}
   1012 }
   1013 
   1014 /*
   1015  * Class callback when a CPU goes inactive (offline)
   1016  *
   1017  * This is called in a context where CPUs are paused
   1018  */
   1019 static void
   1020 pg_cmt_cpu_inactive(cpu_t *cp)
   1021 {
   1022 	int		err;
   1023 	group_t		*pgs;
   1024 	pg_cmt_t	*pg;
   1025 	cpu_t		*cpp;
   1026 	group_iter_t	i;
   1027 	pg_cpu_itr_t	cpu_itr;
   1028 	boolean_t	found;
   1029 
   1030 	ASSERT(MUTEX_HELD(&cpu_lock));
   1031 
   1032 	if (cmt_sched_disabled)
   1033 		return;
   1034 
   1035 	pgs = &cp->cpu_pg->pgs;
   1036 	group_iter_init(&i);
   1037 
   1038 	while ((pg = group_iterate(pgs, &i)) != NULL) {
   1039 
   1040 		if (IS_CMT_PG(pg) == 0)
   1041 			continue;
   1042 
   1043 		/*
   1044 		 * Remove the CPU from the CMT PGs active CPU group
   1045 		 * bitmap
   1046 		 */
   1047 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
   1048 		ASSERT(err == 0);
   1049 
   1050 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
   1051 
   1052 		/*
   1053 		 * If there are no more active CPUs in this PG over which
   1054 		 * load was balanced, remove it as a balancing candidate.
   1055 		 */
   1056 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
   1057 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
   1058 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
   1059 			ASSERT(err == 0);
   1060 
   1061 			if (pg->cmt_parent == NULL &&
   1062 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
   1063 				err = group_remove(&cmt_root->cl_pgs, pg,
   1064 				    GRP_NORESIZE);
   1065 				ASSERT(err == 0);
   1066 			}
   1067 		}
   1068 
   1069 		/*
   1070 		 * Assert the number of active CPUs does not exceed
   1071 		 * the total number of CPUs in the PG
   1072 		 */
   1073 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
   1074 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
   1075 
   1076 		/*
   1077 		 * Update the PG bitset in the CPU's old partition
   1078 		 */
   1079 		found = B_FALSE;
   1080 		PG_CPU_ITR_INIT(pg, cpu_itr);
   1081 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
   1082 			if (cpp == cp)
   1083 				continue;
   1084 			if (CPU_ACTIVE(cpp) &&
   1085 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
   1086 				found = B_TRUE;
   1087 				break;
   1088 			}
   1089 		}
   1090 		if (!found) {
   1091 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
   1092 			    ((pg_t *)pg)->pg_id);
   1093 		}
   1094 	}
   1095 }
   1096 
   1097 /*
   1098  * Return non-zero if the CPU belongs in the given PG
   1099  */
   1100 static int
   1101 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
   1102 {
   1103 	cpu_t	*pg_cpu;
   1104 
   1105 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
   1106 
   1107 	ASSERT(pg_cpu != NULL);
   1108 
   1109 	/*
   1110 	 * The CPU belongs if, given the nature of the hardware sharing
   1111 	 * relationship represented by the PG, the CPU has that
   1112 	 * relationship with some other CPU already in the PG
   1113 	 */
   1114 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
   1115 		return (1);
   1116 
   1117 	return (0);
   1118 }
   1119 
   1120 /*
   1121  * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
   1122  */
   1123 static void
   1124 pg_cmt_hier_sort(pg_cmt_t **hier, int size)
   1125 {
   1126 	int		i, j, inc, sz;
   1127 	int		start, end;
   1128 	pg_t		*tmp;
   1129 	pg_t		**h = (pg_t **)hier;
   1130 
   1131 	/*
   1132 	 * First sort by number of CPUs
   1133 	 */
   1134 	inc = size / 2;
   1135 	while (inc > 0) {
   1136 		for (i = inc; i < size; i++) {
   1137 			j = i;
   1138 			tmp = h[i];
   1139 			while ((j >= inc) &&
   1140 			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
   1141 				h[j] = h[j - inc];
   1142 				j = j - inc;
   1143 			}
   1144 			h[j] = tmp;
   1145 		}
   1146 		if (inc == 2)
   1147 			inc = 1;
   1148 		else
   1149 			inc = (inc * 5) / 11;
   1150 	}
   1151 
   1152 	/*
   1153 	 * Break ties by asking the platform.
   1154 	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
   1155 	 */
   1156 	for (start = 0; start < size; start++) {
   1157 
   1158 		/*
   1159 		 * Find various contiguous sets of elements,
   1160 		 * in the array, with the same number of cpus
   1161 		 */
   1162 		end = start;
   1163 		sz = PG_NUM_CPUS(h[start]);
   1164 		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
   1165 			end++;
   1166 		/*
   1167 		 * Sort each such set of the array by rank
   1168 		 */
   1169 		for (i = start + 1; i < end; i++) {
   1170 			j = i - 1;
   1171 			tmp = h[i];
   1172 			while (j >= start &&
   1173 			    pg_cmt_hier_rank(hier[j],
   1174 			    (pg_cmt_t *)tmp) == hier[j]) {
   1175 				h[j + 1] = h[j];
   1176 				j--;
   1177 			}
   1178 			h[j + 1] = tmp;
   1179 		}
   1180 	}
   1181 }
   1182 
   1183 /*
   1184  * Return a cmt_lgrp_t * given an lgroup handle.
   1185  */
   1186 static cmt_lgrp_t *
   1187 pg_cmt_find_lgrp(lgrp_handle_t hand)
   1188 {
   1189 	cmt_lgrp_t	*lgrp;
   1190 
   1191 	ASSERT(MUTEX_HELD(&cpu_lock));
   1192 
   1193 	lgrp = cmt_lgrps;
   1194 	while (lgrp != NULL) {
   1195 		if (lgrp->cl_hand == hand)
   1196 			break;
   1197 		lgrp = lgrp->cl_next;
   1198 	}
   1199 	return (lgrp);
   1200 }
   1201 
   1202 /*
   1203  * Create a cmt_lgrp_t with the specified handle.
   1204  */
   1205 static cmt_lgrp_t *
   1206 pg_cmt_lgrp_create(lgrp_handle_t hand)
   1207 {
   1208 	cmt_lgrp_t	*lgrp;
   1209 
   1210 	ASSERT(MUTEX_HELD(&cpu_lock));
   1211 
   1212 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
   1213 
   1214 	lgrp->cl_hand = hand;
   1215 	lgrp->cl_npgs = 0;
   1216 	lgrp->cl_next = cmt_lgrps;
   1217 	cmt_lgrps = lgrp;
   1218 	group_create(&lgrp->cl_pgs);
   1219 
   1220 	return (lgrp);
   1221 }
   1222 
   1223 /*
   1224  * Interfaces to enable and disable power aware dispatching
   1225  * The caller must be holding cpu_lock.
   1226  *
   1227  * Return 0 on success and -1 on failure.
   1228  */
   1229 int
   1230 cmt_pad_enable(pghw_type_t type)
   1231 {
   1232 	group_t		*hwset;
   1233 	group_iter_t	iter;
   1234 	pg_cmt_t	*pg;
   1235 
   1236 	ASSERT(PGHW_IS_PM_DOMAIN(type));
   1237 	ASSERT(MUTEX_HELD(&cpu_lock));
   1238 
   1239 	if ((hwset = pghw_set_lookup(type)) == NULL ||
   1240 	    cmt_hw_blacklisted[type]) {
   1241 		/*
   1242 		 * Unable to find any instances of the specified type
   1243 		 * of power domain, or the power domains have been blacklisted.
   1244 		 */
   1245 		return (-1);
   1246 	}
   1247 
   1248 	/*
   1249 	 * Iterate over the power domains, setting the default dispatcher
   1250 	 * policy for power/performance optimization.
   1251 	 *
   1252 	 * Simply setting the policy isn't enough in the case where the power
   1253 	 * domain is an only child of another PG. Because the dispatcher walks
   1254 	 * the PG hierarchy in a top down fashion, the higher up PG's policy
   1255 	 * will dominate. So promote the power domain above it's parent if both
   1256 	 * PG and it's parent have the same CPUs to ensure it's policy
   1257 	 * dominates.
   1258 	 */
   1259 	group_iter_init(&iter);
   1260 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
   1261 		/*
   1262 		 * If the power domain is an only child to a parent
   1263 		 * not implementing the same policy, promote the child
   1264 		 * above the parent to activate the policy.
   1265 		 */
   1266 		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
   1267 		while ((pg->cmt_parent != NULL) &&
   1268 		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
   1269 		    (PG_NUM_CPUS((pg_t *)pg) ==
   1270 		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
   1271 			cmt_hier_promote(pg, NULL);
   1272 		}
   1273 	}
   1274 
   1275 	return (0);
   1276 }
   1277 
   1278 int
   1279 cmt_pad_disable(pghw_type_t type)
   1280 {
   1281 	group_t		*hwset;
   1282 	group_iter_t	iter;
   1283 	pg_cmt_t	*pg;
   1284 	pg_cmt_t	*child;
   1285 
   1286 	ASSERT(PGHW_IS_PM_DOMAIN(type));
   1287 	ASSERT(MUTEX_HELD(&cpu_lock));
   1288 
   1289 	if ((hwset = pghw_set_lookup(type)) == NULL) {
   1290 		/*
   1291 		 * Unable to find any instances of the specified type of
   1292 		 * power domain.
   1293 		 */
   1294 		return (-1);
   1295 	}
   1296 	/*
   1297 	 * Iterate over the power domains, setting the default dispatcher
   1298 	 * policy for performance optimization (load balancing).
   1299 	 */
   1300 	group_iter_init(&iter);
   1301 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
   1302 
   1303 		/*
   1304 		 * If the power domain has an only child that implements
   1305 		 * policy other than load balancing, promote the child
   1306 		 * above the power domain to ensure it's policy dominates.
   1307 		 */
   1308 		if (pg->cmt_children != NULL &&
   1309 		    GROUP_SIZE(pg->cmt_children) == 1) {
   1310 			child = GROUP_ACCESS(pg->cmt_children, 0);
   1311 			if ((child->cmt_policy & CMT_BALANCE) == 0) {
   1312 				cmt_hier_promote(child, NULL);
   1313 			}
   1314 		}
   1315 		pg->cmt_policy = CMT_BALANCE;
   1316 	}
   1317 	return (0);
   1318 }
   1319 
   1320 /* ARGSUSED */
   1321 static void
   1322 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
   1323 		    kthread_t *new)
   1324 {
   1325 	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;
   1326 
   1327 	if (old == cp->cpu_idle_thread) {
   1328 		atomic_add_32(&cmt_pg->cmt_utilization, 1);
   1329 	} else if (new == cp->cpu_idle_thread) {
   1330 		atomic_add_32(&cmt_pg->cmt_utilization, -1);
   1331 	}
   1332 }
   1333 
   1334 /*
   1335  * Macro to test whether a thread is currently runnable on a CPU in a PG.
   1336  */
   1337 #define	THREAD_RUNNABLE_IN_PG(t, pg)					\
   1338 	((t)->t_state == TS_RUN &&					\
   1339 	    (t)->t_disp_queue->disp_cpu &&				\
   1340 	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
   1341 	    (t)->t_disp_queue->disp_cpu->cpu_seqid))
   1342 
   1343 static void
   1344 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
   1345     kthread_t *new)
   1346 {
   1347 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
   1348 	cpupm_domain_t	*dom;
   1349 	uint32_t	u;
   1350 
   1351 	if (old == cp->cpu_idle_thread) {
   1352 		ASSERT(new != cp->cpu_idle_thread);
   1353 		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
   1354 		if (u == 1) {
   1355 			/*
   1356 			 * Notify the CPU power manager that the domain
   1357 			 * is non-idle.
   1358 			 */
   1359 			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
   1360 			cpupm_utilization_event(cp, now, dom,
   1361 			    CPUPM_DOM_BUSY_FROM_IDLE);
   1362 		}
   1363 	} else if (new == cp->cpu_idle_thread) {
   1364 		ASSERT(old != cp->cpu_idle_thread);
   1365 		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
   1366 		if (u == 0) {
   1367 			/*
   1368 			 * The domain is idle, notify the CPU power
   1369 			 * manager.
   1370 			 *
   1371 			 * Avoid notifying if the thread is simply migrating
   1372 			 * between CPUs in the domain.
   1373 			 */
   1374 			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
   1375 				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
   1376 				cpupm_utilization_event(cp, now, dom,
   1377 				    CPUPM_DOM_IDLE_FROM_BUSY);
   1378 			}
   1379 		}
   1380 	}
   1381 }
   1382 
   1383 /* ARGSUSED */
   1384 static void
   1385 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
   1386 {
   1387 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
   1388 	cpupm_domain_t	*dom;
   1389 
   1390 	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
   1391 	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
   1392 }
   1393 
   1394 /*
   1395  * Return the name of the CMT scheduling policy
   1396  * being implemented across this PG
   1397  */
   1398 static char *
   1399 pg_cmt_policy_name(pg_t *pg)
   1400 {
   1401 	pg_cmt_policy_t policy;
   1402 
   1403 	policy = ((pg_cmt_t *)pg)->cmt_policy;
   1404 
   1405 	if (policy & CMT_AFFINITY) {
   1406 		if (policy & CMT_BALANCE)
   1407 			return ("Load Balancing & Affinity");
   1408 		else if (policy & CMT_COALESCE)
   1409 			return ("Load Coalescence & Affinity");
   1410 		else
   1411 			return ("Affinity");
   1412 	} else {
   1413 		if (policy & CMT_BALANCE)
   1414 			return ("Load Balancing");
   1415 		else if (policy & CMT_COALESCE)
   1416 			return ("Load Coalescence");
   1417 		else
   1418 			return ("None");
   1419 	}
   1420 }
   1421 
   1422 /*
   1423  * Prune PG, and all other instances of PG's hardware sharing relationship
   1424  * from the CMT PG hierarchy.
   1425  *
   1426  * This routine operates on the CPU specific processor group data (for the CPUs
   1427  * in the PG being pruned), and may be invoked from a context where one CPU's
   1428  * PG data is under construction. In this case the argument "pgdata", if not
   1429  * NULL, is a reference to the CPU's under-construction PG data.
   1430  */
   1431 static int
   1432 pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
   1433 {
   1434 	group_t		*hwset, *children;
   1435 	int		i, j, r, size = *sz;
   1436 	group_iter_t	hw_iter, child_iter;
   1437 	pg_cpu_itr_t	cpu_iter;
   1438 	pg_cmt_t	*pg, *child;
   1439 	cpu_t		*cpu;
   1440 	int		cap_needed;
   1441 	pghw_type_t	hw;
   1442 
   1443 	ASSERT(MUTEX_HELD(&cpu_lock));
   1444 
   1445 	hw = ((pghw_t *)pg_bad)->pghw_hw;
   1446 
   1447 	if (hw == PGHW_POW_ACTIVE) {
   1448 		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
   1449 		    "Event Based CPUPM Unavailable");
   1450 	} else if (hw == PGHW_POW_IDLE) {
   1451 		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
   1452 		    "Dispatcher assisted CPUPM disabled.");
   1453 	}
   1454 
   1455 	/*
   1456 	 * Find and eliminate the PG from the lineage.
   1457 	 */
   1458 	for (i = 0; i < size; i++) {
   1459 		if (lineage[i] == pg_bad) {
   1460 			for (j = i; j < size - 1; j++)
   1461 				lineage[j] = lineage[j + 1];
   1462 			*sz = size - 1;
   1463 			break;
   1464 		}
   1465 	}
   1466 
   1467 	/*
   1468 	 * We'll prune all instances of the hardware sharing relationship
   1469 	 * represented by pg. But before we do that (and pause CPUs) we need
   1470 	 * to ensure the hierarchy's groups are properly sized.
   1471 	 */
   1472 	hwset = pghw_set_lookup(hw);
   1473 
   1474 	/*
   1475 	 * Blacklist the hardware so future processor groups of this type won't
   1476 	 * participate in CMT thread placement.
   1477 	 *
   1478 	 * XXX
   1479 	 * For heterogeneous system configurations, this might be overkill.
   1480 	 * We may only need to blacklist the illegal PGs, and other instances
   1481 	 * of this hardware sharing relationship may be ok.
   1482 	 */
   1483 	cmt_hw_blacklisted[hw] = 1;
   1484 
   1485 	/*
   1486 	 * For each of the PGs being pruned, ensure sufficient capacity in
   1487 	 * the siblings set for the PG's children
   1488 	 */
   1489 	group_iter_init(&hw_iter);
   1490 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
   1491 		/*
   1492 		 * PG is being pruned, but if it is bringing up more than
   1493 		 * one child, ask for more capacity in the siblings group.
   1494 		 */
   1495 		cap_needed = 0;
   1496 		if (pg->cmt_children &&
   1497 		    GROUP_SIZE(pg->cmt_children) > 1) {
   1498 			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
   1499 
   1500 			group_expand(pg->cmt_siblings,
   1501 			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);
   1502 
   1503 			/*
   1504 			 * If this is a top level group, also ensure the
   1505 			 * capacity in the root lgrp level CMT grouping.
   1506 			 */
   1507 			if (pg->cmt_parent == NULL &&
   1508 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
   1509 				group_expand(&cmt_root->cl_pgs,
   1510 				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
   1511 				cmt_root->cl_npgs += cap_needed;
   1512 			}
   1513 		}
   1514 	}
   1515 
   1516 	/*
   1517 	 * We're operating on the PG hierarchy. Pause CPUs to ensure
   1518 	 * exclusivity with respect to the dispatcher.
   1519 	 */
   1520 	pause_cpus(NULL);
   1521 
   1522 	/*
   1523 	 * Prune all PG instances of the hardware sharing relationship
   1524 	 * represented by pg.
   1525 	 */
   1526 	group_iter_init(&hw_iter);
   1527 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
   1528 
   1529 		/*
   1530 		 * Remove PG from it's group of siblings, if it's there.
   1531 		 */
   1532 		if (pg->cmt_siblings) {
   1533 			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
   1534 		}
   1535 		if (pg->cmt_parent == NULL &&
   1536 		    pg->cmt_siblings != &cmt_root->cl_pgs) {
   1537 			(void) group_remove(&cmt_root->cl_pgs, pg,
   1538 			    GRP_NORESIZE);
   1539 		}
   1540 
   1541 		/*
   1542 		 * Indicate that no CMT policy will be implemented across
   1543 		 * this PG.
   1544 		 */
   1545 		pg->cmt_policy = CMT_NO_POLICY;
   1546 
   1547 		/*
   1548 		 * Move PG's children from it's children set to it's parent's
   1549 		 * children set. Note that the parent's children set, and PG's
   1550 		 * siblings set are the same thing.
   1551 		 *
   1552 		 * Because we are iterating over the same group that we are
   1553 		 * operating on (removing the children), first add all of PG's
   1554 		 * children to the parent's children set, and once we are done
   1555 		 * iterating, empty PG's children set.
   1556 		 */
   1557 		if (pg->cmt_children != NULL) {
   1558 			children = pg->cmt_children;
   1559 
   1560 			group_iter_init(&child_iter);
   1561 			while ((child = group_iterate(children, &child_iter))
   1562 			    != NULL) {
   1563 				if (pg->cmt_siblings != NULL) {
   1564 					r = group_add(pg->cmt_siblings, child,
   1565 					    GRP_NORESIZE);
   1566 					ASSERT(r == 0);
   1567 
   1568 					if (pg->cmt_parent == NULL &&
   1569 					    pg->cmt_siblings !=
   1570 					    &cmt_root->cl_pgs) {
   1571 						r = group_add(&cmt_root->cl_pgs,
   1572 						    child, GRP_NORESIZE);
   1573 						ASSERT(r == 0);
   1574 					}
   1575 				}
   1576 			}
   1577 			group_empty(pg->cmt_children);
   1578 		}
   1579 
   1580 		/*
   1581 		 * Reset the callbacks to the defaults
   1582 		 */
   1583 		pg_callback_set_defaults((pg_t *)pg);
   1584 
   1585 		/*
   1586 		 * Update all the CPU lineages in each of PG's CPUs
   1587 		 */
   1588 		PG_CPU_ITR_INIT(pg, cpu_iter);
   1589 		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
   1590 			pg_cmt_t	*cpu_pg;
   1591 			group_iter_t	liter;	/* Iterator for the lineage */
   1592 			cpu_pg_t	*cpd;	/* CPU's PG data */
   1593 
   1594 			/*
   1595 			 * The CPU's lineage is under construction still
   1596 			 * references the bootstrap CPU PG data structure.
   1597 			 */
   1598 			if (pg_cpu_is_bootstrapped(cpu))
   1599 				cpd = pgdata;
   1600 			else
   1601 				cpd = cpu->cpu_pg;
   1602 
   1603 			/*
   1604 			 * Iterate over the CPU's PGs updating the children
   1605 			 * of the PG being promoted, since they have a new
   1606 			 * parent and siblings set.
   1607 			 */
   1608 			group_iter_init(&liter);
   1609 			while ((cpu_pg = group_iterate(&cpd->pgs,
   1610 			    &liter)) != NULL) {
   1611 				if (cpu_pg->cmt_parent == pg) {
   1612 					cpu_pg->cmt_parent = pg->cmt_parent;
   1613 					cpu_pg->cmt_siblings = pg->cmt_siblings;
   1614 				}
   1615 			}
   1616 
   1617 			/*
   1618 			 * Update the CPU's lineages
   1619 			 *
   1620 			 * Remove the PG from the CPU's group used for CMT
   1621 			 * scheduling.
   1622 			 */
   1623 			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
   1624 		}
   1625 	}
   1626 	start_cpus();
   1627 	return (0);
   1628 }
   1629 
   1630 /*
   1631  * Disable CMT scheduling
   1632  */
   1633 static void
   1634 pg_cmt_disable(void)
   1635 {
   1636 	cpu_t		*cpu;
   1637 
   1638 	ASSERT(MUTEX_HELD(&cpu_lock));
   1639 
   1640 	pause_cpus(NULL);
   1641 	cpu = cpu_list;
   1642 
   1643 	do {
   1644 		if (cpu->cpu_pg)
   1645 			group_empty(&cpu->cpu_pg->cmt_pgs);
   1646 	} while ((cpu = cpu->cpu_next) != cpu_list);
   1647 
   1648 	cmt_sched_disabled = 1;
   1649 	start_cpus();
   1650 	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
   1651 }
   1652 
   1653 /*
   1654  * CMT lineage validation
   1655  *
   1656  * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
   1657  * of the PGs in a CPU's lineage. This is necessary because it's possible that
   1658  * some groupings (power domain groupings in particular) may be defined by
   1659  * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
   1660  * possible to integrate those groupings into the CMT PG hierarchy, if doing
   1661  * so would violate the subset invariant of the hierarchy, which says that
   1662  * a PG must be subset of its parent (if it has one).
   1663  *
   1664  * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
   1665  * would result in a violation of this invariant. If a violation is found,
   1666  * and the PG is of a grouping type whose definition is known to originate from
   1667  * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
   1668  * PG (and all other instances of PG's sharing relationship type) from the
   1669  * hierarchy. Further, future instances of that sharing relationship type won't
   1670  * be instantiated. If the grouping definition doesn't originate from suspect
   1671  * sources, then pg_cmt_disable() will be invoked to log an error, and disable
   1672  * CMT scheduling altogether.
   1673  *
   1674  * This routine is invoked after the CPU has been added to the PGs in which
   1675  * it belongs, but before those PGs have been added to (or had their place
   1676  * adjusted in) the CMT PG hierarchy.
   1677  *
   1678  * The first argument is the CPUs PG lineage (essentially an array of PGs in
   1679  * which the CPU belongs) that has already been sorted in ascending order
   1680  * by CPU count. Some of the PGs in the CPUs lineage may already have other
   1681  * CPUs in them, and have already been integrated into the CMT hierarchy.
   1682  *
   1683  * The addition of this new CPU to these pre-existing PGs means that those
   1684  * PGs may need to be promoted up in the hierarchy to satisfy the subset
   1685  * invariant. In addition to testing the subset invariant for the lineage,
   1686  * this routine also verifies that the addition of the new CPU to the
   1687  * existing PGs wouldn't cause the subset invariant to be violated in
   1688  * the existing lineages.
   1689  *
   1690  * This routine will normally return one of the following:
   1691  * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
   1692  * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
   1693  *
   1694  * Otherwise, this routine will return a value indicating which error it
   1695  * was unable to recover from (and set cmt_lineage_status along the way).
   1696  *
   1697  *
   1698  * This routine operates on the CPU specific processor group data (for the CPU
   1699  * whose lineage is being validated), which is under-construction.
   1700  * "pgdata" is a reference to the CPU's under-construction PG data.
   1701  * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
   1702  */
   1703 static cmt_lineage_validation_t
   1704 pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
   1705 {
   1706 	int		i, j, size;
   1707 	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp;
   1708 	cpu_t		*cp;
   1709 	pg_cpu_itr_t	cpu_iter;
   1710 	lgrp_handle_t	lgrp;
   1711 
   1712 	ASSERT(MUTEX_HELD(&cpu_lock));
   1713 
   1714 revalidate:
   1715 	size = *sz;
   1716 	pg_bad = NULL;
   1717 	lgrp = LGRP_NULL_HANDLE;
   1718 	for (i = 0; i < size; i++) {
   1719 
   1720 		pg = lineage[i];
   1721 		if (i < size - 1)
   1722 			pg_next = lineage[i + 1];
   1723 		else
   1724 			pg_next = NULL;
   1725 
   1726 		/*
   1727 		 * We assume that the lineage has already been sorted
   1728 		 * by the number of CPUs. In fact, we depend on it.
   1729 		 */
   1730 		ASSERT(pg_next == NULL ||
   1731 		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));
   1732 
   1733 		/*
   1734 		 * Check to make sure that the existing parent of PG (if any)
   1735 		 * is either in the PG's lineage, or the PG has more CPUs than
   1736 		 * its existing parent and can and should be promoted above its
   1737 		 * parent.
   1738 		 *
   1739 		 * Since the PG topology is in the middle of being changed, we
   1740 		 * need to check whether the PG's existing parent (if any) is
   1741 		 * part of its lineage (and therefore should contain the new
   1742 		 * CPU). If not, it means that the addition of the new CPU
   1743 		 * should have made this PG have more CPUs than its parent, and
   1744 		 * this PG should be promoted to be above its existing parent
   1745 		 * now. We need to verify all of this to defend against a buggy
   1746 		 * BIOS giving bad power domain CPU groupings. Sigh.
   1747 		 */
   1748 		if (pg->cmt_parent) {
   1749 			/*
   1750 			 * Determine if cmt_parent is in this lineage
   1751 			 */
   1752 			for (j = 0; j < size; j++) {
   1753 				pg_tmp = lineage[j];
   1754 				if (pg_tmp == pg->cmt_parent)
   1755 					break;
   1756 			}
   1757 			if (pg_tmp != pg->cmt_parent) {
   1758 				/*
   1759 				 * cmt_parent is not in the lineage, verify
   1760 				 * it is a proper subset of PG.
   1761 				 */
   1762 				if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >=
   1763 				    PG_NUM_CPUS((pg_t *)pg)) {
   1764 					/*
   1765 					 * Not a proper subset if pg has less
   1766 					 * CPUs than cmt_parent...
   1767 					 */
   1768 					cmt_lineage_status =
   1769 					    CMT_LINEAGE_NON_PROMOTABLE;
   1770 					goto handle_error;
   1771 				}
   1772 			}
   1773 		}
   1774 
   1775 		/*
   1776 		 * Walk each of the CPUs in the PGs group and perform
   1777 		 * consistency checks along the way.
   1778 		 */
   1779 		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
   1780 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
   1781 			/*
   1782 			 * Verify that there aren't any CPUs contained in PG
   1783 			 * that the next PG in the lineage (which is larger
   1784 			 * or same size) doesn't also contain.
   1785 			 */
   1786 			if (pg_next != NULL &&
   1787 			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
   1788 				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
   1789 				goto handle_error;
   1790 			}
   1791 
   1792 			/*
   1793 			 * Verify that all the CPUs in the PG are in the same
   1794 			 * lgroup.
   1795 			 */
   1796 			if (lgrp == LGRP_NULL_HANDLE) {
   1797 				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
   1798 			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
   1799 				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
   1800 				goto handle_error;
   1801 			}
   1802 		}
   1803 	}
   1804 
   1805 handle_error:
   1806 	/*
   1807 	 * Some of these validation errors can result when the CPU grouping
   1808 	 * information is derived from buggy sources (for example, incorrect
   1809 	 * ACPI tables on x86 systems).
   1810 	 *
   1811 	 * We'll try to recover in such cases by pruning out the illegal
   1812 	 * groupings from the PG hierarchy, which means that we won't optimize
   1813 	 * for those levels, but we will for the remaining ones.
   1814 	 */
   1815 	switch (cmt_lineage_status) {
   1816 	case CMT_LINEAGE_VALID:
   1817 	case CMT_LINEAGE_REPAIRED:
   1818 		break;
   1819 	case CMT_LINEAGE_PG_SPANS_LGRPS:
   1820 		/*
   1821 		 * We've detected a PG whose CPUs span lgroups.
   1822 		 *
   1823 		 * This isn't supported, as the dispatcher isn't allowed to
   1824 		 * to do CMT thread placement across lgroups, as this would
   1825 		 * conflict with policies implementing MPO thread affinity.
   1826 		 *
   1827 		 * If the PG is of a sharing relationship type known to
   1828 		 * legitimately span lgroups, specify that no CMT thread
   1829 		 * placement policy should be implemented, and prune the PG
   1830 		 * from the existing CMT PG hierarchy.
   1831 		 *
   1832 		 * Otherwise, fall though to the case below for handling.
   1833 		 */
   1834 		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
   1835 			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
   1836 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
   1837 				goto revalidate;
   1838 			}
   1839 		}
   1840 		/*LINTED*/
   1841 	case CMT_LINEAGE_NON_PROMOTABLE:
   1842 		/*
   1843 		 * We've detected a PG that already exists in another CPU's
   1844 		 * lineage that cannot cannot legally be promoted into place
   1845 		 * without breaking the invariants of the hierarchy.
   1846 		 */
   1847 		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
   1848 			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
   1849 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
   1850 				goto revalidate;
   1851 			}
   1852 		}
   1853 		/*
   1854 		 * Something went wrong trying to prune out the bad level.
   1855 		 * Disable CMT scheduling altogether.
   1856 		 */
   1857 		pg_cmt_disable();
   1858 		break;
   1859 	case CMT_LINEAGE_NON_CONCENTRIC:
   1860 		/*
   1861 		 * We've detected a non-concentric PG lineage, which means that
   1862 		 * there's a PG in the lineage that has CPUs that the next PG
   1863 		 * over in the lineage (which is the same size or larger)
   1864 		 * doesn't have.
   1865 		 *
   1866 		 * In this case, we examine the two PGs to see if either
   1867 		 * grouping is defined by potentially buggy sources.
   1868 		 *
   1869 		 * If one has fewer CPUs than the other, and contains CPUs
   1870 		 * not found in the parent, and it is an untrusted enumeration,
   1871 		 * then prune it. If both have the same number of CPUs, then
   1872 		 * prune the one that is untrusted.
   1873 		 *
   1874 		 * This process repeats until we have a concentric lineage,
   1875 		 * or we would have to prune out a level derived from what we
   1876 		 * thought was a reliable source, in which case CMT scheduling
   1877 		 * is disabled altogether.
   1878 		 */
   1879 		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
   1880 		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
   1881 			pg_bad = pg;
   1882 		} else if (PG_NUM_CPUS((pg_t *)pg) ==
   1883 		    PG_NUM_CPUS((pg_t *)pg_next)) {
   1884 			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
   1885 				pg_bad = pg_next;
   1886 			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
   1887 				pg_bad = pg;
   1888 			}
   1889 		}
   1890 		if (pg_bad) {
   1891 			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
   1892 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
   1893 				goto revalidate;
   1894 			}
   1895 		}
   1896 		/*
   1897 		 * Something went wrong trying to identify and/or prune out
   1898 		 * the bad level. Disable CMT scheduling altogether.
   1899 		 */
   1900 		pg_cmt_disable();
   1901 		break;
   1902 	default:
   1903 		/*
   1904 		 * If we're here, we've encountered a validation error for
   1905 		 * which we don't know how to recover. In this case, disable
   1906 		 * CMT scheduling altogether.
   1907 		 */
   1908 		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
   1909 		pg_cmt_disable();
   1910 	}
   1911 	return (cmt_lineage_status);
   1912 }
   1913