Home | History | Annotate | Download | only in disp
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     27 /*	  All Rights Reserved  	*/
     28 
     29 
     30 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.30 */
     31 
     32 #include <sys/types.h>
     33 #include <sys/param.h>
     34 #include <sys/sysmacros.h>
     35 #include <sys/signal.h>
     36 #include <sys/user.h>
     37 #include <sys/systm.h>
     38 #include <sys/sysinfo.h>
     39 #include <sys/var.h>
     40 #include <sys/errno.h>
     41 #include <sys/cmn_err.h>
     42 #include <sys/debug.h>
     43 #include <sys/inline.h>
     44 #include <sys/disp.h>
     45 #include <sys/class.h>
     46 #include <sys/bitmap.h>
     47 #include <sys/kmem.h>
     48 #include <sys/cpuvar.h>
     49 #include <sys/vtrace.h>
     50 #include <sys/tnf.h>
     51 #include <sys/cpupart.h>
     52 #include <sys/lgrp.h>
     53 #include <sys/pg.h>
     54 #include <sys/cmt.h>
     55 #include <sys/bitset.h>
     56 #include <sys/schedctl.h>
     57 #include <sys/atomic.h>
     58 #include <sys/dtrace.h>
     59 #include <sys/sdt.h>
     60 #include <sys/archsystm.h>
     61 
     62 #include <vm/as.h>
     63 
     64 #define	BOUND_CPU	0x1
     65 #define	BOUND_PARTITION	0x2
     66 #define	BOUND_INTR	0x4
     67 
     68 /* Dispatch queue allocation structure and functions */
     69 struct disp_queue_info {
     70 	disp_t	*dp;
     71 	dispq_t *olddispq;
     72 	dispq_t *newdispq;
     73 	ulong_t	*olddqactmap;
     74 	ulong_t	*newdqactmap;
     75 	int	oldnglobpris;
     76 };
     77 static void	disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
     78     disp_t *dp);
     79 static void	disp_dq_assign(struct disp_queue_info *dptr, int numpris);
     80 static void	disp_dq_free(struct disp_queue_info *dptr);
     81 
     82 /* platform-specific routine to call when processor is idle */
     83 static void	generic_idle_cpu();
     84 void		(*idle_cpu)() = generic_idle_cpu;
     85 
     86 /* routines invoked when a CPU enters/exits the idle loop */
     87 static void	idle_enter();
     88 static void	idle_exit();
     89 
     90 /* platform-specific routine to call when thread is enqueued */
     91 static void	generic_enq_thread(cpu_t *, int);
     92 void		(*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
     93 
     94 pri_t	kpreemptpri;		/* priority where kernel preemption applies */
     95 pri_t	upreemptpri = 0; 	/* priority where normal preemption applies */
     96 pri_t	intr_pri;		/* interrupt thread priority base level */
     97 
     98 #define	KPQPRI	-1 		/* pri where cpu affinity is dropped for kpq */
     99 pri_t	kpqpri = KPQPRI; 	/* can be set in /etc/system */
    100 disp_t	cpu0_disp;		/* boot CPU's dispatch queue */
    101 disp_lock_t	swapped_lock;	/* lock swapped threads and swap queue */
    102 int	nswapped;		/* total number of swapped threads */
    103 void	disp_swapped_enq(kthread_t *tp);
    104 static void	disp_swapped_setrun(kthread_t *tp);
    105 static void	cpu_resched(cpu_t *cp, pri_t tpri);
    106 
    107 /*
    108  * If this is set, only interrupt threads will cause kernel preemptions.
    109  * This is done by changing the value of kpreemptpri.  kpreemptpri
    110  * will either be the max sysclass pri + 1 or the min interrupt pri.
    111  */
    112 int	only_intr_kpreempt;
    113 
    114 extern void set_idle_cpu(int cpun);
    115 extern void unset_idle_cpu(int cpun);
    116 static void setkpdq(kthread_t *tp, int borf);
    117 #define	SETKP_BACK	0
    118 #define	SETKP_FRONT	1
    119 /*
    120  * Parameter that determines how recently a thread must have run
    121  * on the CPU to be considered loosely-bound to that CPU to reduce
    122  * cold cache effects.  The interval is in hertz.
    123  */
    124 #define	RECHOOSE_INTERVAL 3
    125 int	rechoose_interval = RECHOOSE_INTERVAL;
    126 static cpu_t	*cpu_choose(kthread_t *, pri_t);
    127 
    128 /*
    129  * Parameter that determines how long (in nanoseconds) a thread must
    130  * be sitting on a run queue before it can be stolen by another CPU
    131  * to reduce migrations.  The interval is in nanoseconds.
    132  *
    133  * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
    134  * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
    135  * here indicating it is uninitiallized.
    136  * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
    137  *
    138  */
    139 #define	NOSTEAL_UNINITIALIZED	(-1)
    140 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
    141 extern void cmp_set_nosteal_interval(void);
    142 
    143 id_t	defaultcid;	/* system "default" class; see dispadmin(1M) */
    144 
    145 disp_lock_t	transition_lock;	/* lock on transitioning threads */
    146 disp_lock_t	stop_lock;		/* lock on stopped threads */
    147 
    148 static void	cpu_dispqalloc(int numpris);
    149 
    150 /*
    151  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
    152  * a thread because it was sitting on its run queue for a very short
    153  * period of time.
    154  */
    155 #define	T_DONTSTEAL	(kthread_t *)(-1) /* returned by disp_getwork/getbest */
    156 
    157 static kthread_t	*disp_getwork(cpu_t *to);
    158 static kthread_t	*disp_getbest(disp_t *from);
    159 static kthread_t	*disp_ratify(kthread_t *tp, disp_t *kpq);
    160 
    161 void	swtch_to(kthread_t *);
    162 
    163 /*
    164  * dispatcher and scheduler initialization
    165  */
    166 
    167 /*
    168  * disp_setup - Common code to calculate and allocate dispatcher
    169  *		variables and structures based on the maximum priority.
    170  */
    171 static void
    172 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
    173 {
    174 	pri_t	newnglobpris;
    175 
    176 	ASSERT(MUTEX_HELD(&cpu_lock));
    177 
    178 	newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
    179 
    180 	if (newnglobpris > oldnglobpris) {
    181 		/*
    182 		 * Allocate new kp queues for each CPU partition.
    183 		 */
    184 		cpupart_kpqalloc(newnglobpris);
    185 
    186 		/*
    187 		 * Allocate new dispatch queues for each CPU.
    188 		 */
    189 		cpu_dispqalloc(newnglobpris);
    190 
    191 		/*
    192 		 * compute new interrupt thread base priority
    193 		 */
    194 		intr_pri = maxglobpri;
    195 		if (only_intr_kpreempt) {
    196 			kpreemptpri = intr_pri + 1;
    197 			if (kpqpri == KPQPRI)
    198 				kpqpri = kpreemptpri;
    199 		}
    200 		v.v_nglobpris = newnglobpris;
    201 	}
    202 }
    203 
    204 /*
    205  * dispinit - Called to initialize all loaded classes and the
    206  *	      dispatcher framework.
    207  */
    208 void
    209 dispinit(void)
    210 {
    211 	id_t	cid;
    212 	pri_t	maxglobpri;
    213 	pri_t	cl_maxglobpri;
    214 
    215 	maxglobpri = -1;
    216 
    217 	/*
    218 	 * Initialize transition lock, which will always be set.
    219 	 */
    220 	DISP_LOCK_INIT(&transition_lock);
    221 	disp_lock_enter_high(&transition_lock);
    222 	DISP_LOCK_INIT(&stop_lock);
    223 
    224 	mutex_enter(&cpu_lock);
    225 	CPU->cpu_disp->disp_maxrunpri = -1;
    226 	CPU->cpu_disp->disp_max_unbound_pri = -1;
    227 
    228 	/*
    229 	 * Initialize the default CPU partition.
    230 	 */
    231 	cpupart_initialize_default();
    232 	/*
    233 	 * Call the class specific initialization functions for
    234 	 * all pre-installed schedulers.
    235 	 *
    236 	 * We pass the size of a class specific parameter
    237 	 * buffer to each of the initialization functions
    238 	 * to try to catch problems with backward compatibility
    239 	 * of class modules.
    240 	 *
    241 	 * For example a new class module running on an old system
    242 	 * which didn't provide sufficiently large parameter buffers
    243 	 * would be bad news. Class initialization modules can check for
    244 	 * this and take action if they detect a problem.
    245 	 */
    246 
    247 	for (cid = 0; cid < nclass; cid++) {
    248 		sclass_t	*sc;
    249 
    250 		sc = &sclass[cid];
    251 		if (SCHED_INSTALLED(sc)) {
    252 			cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
    253 			    &sc->cl_funcs);
    254 			if (cl_maxglobpri > maxglobpri)
    255 				maxglobpri = cl_maxglobpri;
    256 		}
    257 	}
    258 	kpreemptpri = (pri_t)v.v_maxsyspri + 1;
    259 	if (kpqpri == KPQPRI)
    260 		kpqpri = kpreemptpri;
    261 
    262 	ASSERT(maxglobpri >= 0);
    263 	disp_setup(maxglobpri, 0);
    264 
    265 	mutex_exit(&cpu_lock);
    266 
    267 	/*
    268 	 * Platform specific sticky scheduler setup.
    269 	 */
    270 	if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
    271 		cmp_set_nosteal_interval();
    272 
    273 	/*
    274 	 * Get the default class ID; this may be later modified via
    275 	 * dispadmin(1M).  This will load the class (normally TS) and that will
    276 	 * call disp_add(), which is why we had to drop cpu_lock first.
    277 	 */
    278 	if (getcid(defaultclass, &defaultcid) != 0) {
    279 		cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
    280 		    defaultclass);
    281 	}
    282 }
    283 
    284 /*
    285  * disp_add - Called with class pointer to initialize the dispatcher
    286  *	      for a newly loaded class.
    287  */
    288 void
    289 disp_add(sclass_t *clp)
    290 {
    291 	pri_t	maxglobpri;
    292 	pri_t	cl_maxglobpri;
    293 
    294 	mutex_enter(&cpu_lock);
    295 	/*
    296 	 * Initialize the scheduler class.
    297 	 */
    298 	maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
    299 	cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
    300 	if (cl_maxglobpri > maxglobpri)
    301 		maxglobpri = cl_maxglobpri;
    302 
    303 	/*
    304 	 * Save old queue information.  Since we're initializing a
    305 	 * new scheduling class which has just been loaded, then
    306 	 * the size of the dispq may have changed.  We need to handle
    307 	 * that here.
    308 	 */
    309 	disp_setup(maxglobpri, v.v_nglobpris);
    310 
    311 	mutex_exit(&cpu_lock);
    312 }
    313 
    314 
    315 /*
    316  * For each CPU, allocate new dispatch queues
    317  * with the stated number of priorities.
    318  */
    319 static void
    320 cpu_dispqalloc(int numpris)
    321 {
    322 	cpu_t	*cpup;
    323 	struct disp_queue_info	*disp_mem;
    324 	int i, num;
    325 
    326 	ASSERT(MUTEX_HELD(&cpu_lock));
    327 
    328 	disp_mem = kmem_zalloc(NCPU *
    329 	    sizeof (struct disp_queue_info), KM_SLEEP);
    330 
    331 	/*
    332 	 * This routine must allocate all of the memory before stopping
    333 	 * the cpus because it must not sleep in kmem_alloc while the
    334 	 * CPUs are stopped.  Locks they hold will not be freed until they
    335 	 * are restarted.
    336 	 */
    337 	i = 0;
    338 	cpup = cpu_list;
    339 	do {
    340 		disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
    341 		i++;
    342 		cpup = cpup->cpu_next;
    343 	} while (cpup != cpu_list);
    344 	num = i;
    345 
    346 	pause_cpus(NULL);
    347 	for (i = 0; i < num; i++)
    348 		disp_dq_assign(&disp_mem[i], numpris);
    349 	start_cpus();
    350 
    351 	/*
    352 	 * I must free all of the memory after starting the cpus because
    353 	 * I can not risk sleeping in kmem_free while the cpus are stopped.
    354 	 */
    355 	for (i = 0; i < num; i++)
    356 		disp_dq_free(&disp_mem[i]);
    357 
    358 	kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
    359 }
    360 
    361 static void
    362 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t	*dp)
    363 {
    364 	dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
    365 	dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
    366 	    sizeof (long), KM_SLEEP);
    367 	dptr->dp = dp;
    368 }
    369 
    370 static void
    371 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
    372 {
    373 	disp_t	*dp;
    374 
    375 	dp = dptr->dp;
    376 	dptr->olddispq = dp->disp_q;
    377 	dptr->olddqactmap = dp->disp_qactmap;
    378 	dptr->oldnglobpris = dp->disp_npri;
    379 
    380 	ASSERT(dptr->oldnglobpris < numpris);
    381 
    382 	if (dptr->olddispq != NULL) {
    383 		/*
    384 		 * Use kcopy because bcopy is platform-specific
    385 		 * and could block while we might have paused the cpus.
    386 		 */
    387 		(void) kcopy(dptr->olddispq, dptr->newdispq,
    388 		    dptr->oldnglobpris * sizeof (dispq_t));
    389 		(void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
    390 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
    391 		    sizeof (long));
    392 	}
    393 	dp->disp_q = dptr->newdispq;
    394 	dp->disp_qactmap = dptr->newdqactmap;
    395 	dp->disp_q_limit = &dptr->newdispq[numpris];
    396 	dp->disp_npri = numpris;
    397 }
    398 
    399 static void
    400 disp_dq_free(struct disp_queue_info *dptr)
    401 {
    402 	if (dptr->olddispq != NULL)
    403 		kmem_free(dptr->olddispq,
    404 		    dptr->oldnglobpris * sizeof (dispq_t));
    405 	if (dptr->olddqactmap != NULL)
    406 		kmem_free(dptr->olddqactmap,
    407 		    ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
    408 }
    409 
    410 /*
    411  * For a newly created CPU, initialize the dispatch queue.
    412  * This is called before the CPU is known through cpu[] or on any lists.
    413  */
    414 void
    415 disp_cpu_init(cpu_t *cp)
    416 {
    417 	disp_t	*dp;
    418 	dispq_t	*newdispq;
    419 	ulong_t	*newdqactmap;
    420 
    421 	ASSERT(MUTEX_HELD(&cpu_lock));	/* protect dispatcher queue sizes */
    422 
    423 	if (cp == cpu0_disp.disp_cpu)
    424 		dp = &cpu0_disp;
    425 	else
    426 		dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
    427 	bzero(dp, sizeof (disp_t));
    428 	cp->cpu_disp = dp;
    429 	dp->disp_cpu = cp;
    430 	dp->disp_maxrunpri = -1;
    431 	dp->disp_max_unbound_pri = -1;
    432 	DISP_LOCK_INIT(&cp->cpu_thread_lock);
    433 	/*
    434 	 * Allocate memory for the dispatcher queue headers
    435 	 * and the active queue bitmap.
    436 	 */
    437 	newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
    438 	newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
    439 	    sizeof (long), KM_SLEEP);
    440 	dp->disp_q = newdispq;
    441 	dp->disp_qactmap = newdqactmap;
    442 	dp->disp_q_limit = &newdispq[v.v_nglobpris];
    443 	dp->disp_npri = v.v_nglobpris;
    444 }
    445 
    446 void
    447 disp_cpu_fini(cpu_t *cp)
    448 {
    449 	ASSERT(MUTEX_HELD(&cpu_lock));
    450 
    451 	disp_kp_free(cp->cpu_disp);
    452 	if (cp->cpu_disp != &cpu0_disp)
    453 		kmem_free(cp->cpu_disp, sizeof (disp_t));
    454 }
    455 
    456 /*
    457  * Allocate new, larger kpreempt dispatch queue to replace the old one.
    458  */
    459 void
    460 disp_kp_alloc(disp_t *dq, pri_t npri)
    461 {
    462 	struct disp_queue_info	mem_info;
    463 
    464 	if (npri > dq->disp_npri) {
    465 		/*
    466 		 * Allocate memory for the new array.
    467 		 */
    468 		disp_dq_alloc(&mem_info, npri, dq);
    469 
    470 		/*
    471 		 * We need to copy the old structures to the new
    472 		 * and free the old.
    473 		 */
    474 		disp_dq_assign(&mem_info, npri);
    475 		disp_dq_free(&mem_info);
    476 	}
    477 }
    478 
    479 /*
    480  * Free dispatch queue.
    481  * Used for the kpreempt queues for a removed CPU partition and
    482  * for the per-CPU queues of deleted CPUs.
    483  */
    484 void
    485 disp_kp_free(disp_t *dq)
    486 {
    487 	struct disp_queue_info	mem_info;
    488 
    489 	mem_info.olddispq = dq->disp_q;
    490 	mem_info.olddqactmap = dq->disp_qactmap;
    491 	mem_info.oldnglobpris = dq->disp_npri;
    492 	disp_dq_free(&mem_info);
    493 }
    494 
    495 /*
    496  * End dispatcher and scheduler initialization.
    497  */
    498 
    499 /*
    500  * See if there's anything to do other than remain idle.
    501  * Return non-zero if there is.
    502  *
    503  * This function must be called with high spl, or with
    504  * kernel preemption disabled to prevent the partition's
    505  * active cpu list from changing while being traversed.
    506  *
    507  */
    508 int
    509 disp_anywork(void)
    510 {
    511 	cpu_t   *cp = CPU;
    512 	cpu_t   *ocp;
    513 
    514 	if (cp->cpu_disp->disp_nrunnable != 0)
    515 		return (1);
    516 
    517 	if (!(cp->cpu_flags & CPU_OFFLINE)) {
    518 		if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
    519 			return (1);
    520 
    521 		/*
    522 		 * Work can be taken from another CPU if:
    523 		 *	- There is unbound work on the run queue
    524 		 *	- That work isn't a thread undergoing a
    525 		 *	- context switch on an otherwise empty queue.
    526 		 *	- The CPU isn't running the idle loop.
    527 		 */
    528 		for (ocp = cp->cpu_next_part; ocp != cp;
    529 		    ocp = ocp->cpu_next_part) {
    530 			ASSERT(CPU_ACTIVE(ocp));
    531 
    532 			if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
    533 			    !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
    534 			    ocp->cpu_disp->disp_nrunnable == 1) &&
    535 			    ocp->cpu_dispatch_pri != -1)
    536 				return (1);
    537 		}
    538 	}
    539 	return (0);
    540 }
    541 
    542 /*
    543  * Called when CPU enters the idle loop
    544  */
    545 static void
    546 idle_enter()
    547 {
    548 	cpu_t		*cp = CPU;
    549 
    550 	new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
    551 	CPU_STATS_ADDQ(cp, sys, idlethread, 1);
    552 	set_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
    553 }
    554 
    555 /*
    556  * Called when CPU exits the idle loop
    557  */
    558 static void
    559 idle_exit()
    560 {
    561 	cpu_t		*cp = CPU;
    562 
    563 	new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
    564 	unset_idle_cpu(cp->cpu_id);	/* arch-dependent hook */
    565 }
    566 
    567 /*
    568  * Idle loop.
    569  */
    570 void
    571 idle()
    572 {
    573 	struct cpu	*cp = CPU;		/* pointer to this CPU */
    574 	kthread_t	*t;			/* taken thread */
    575 
    576 	idle_enter();
    577 
    578 	/*
    579 	 * Uniprocessor version of idle loop.
    580 	 * Do this until notified that we're on an actual multiprocessor.
    581 	 */
    582 	while (ncpus == 1) {
    583 		if (cp->cpu_disp->disp_nrunnable == 0) {
    584 			(*idle_cpu)();
    585 			continue;
    586 		}
    587 		idle_exit();
    588 		swtch();
    589 
    590 		idle_enter(); /* returned from swtch */
    591 	}
    592 
    593 	/*
    594 	 * Multiprocessor idle loop.
    595 	 */
    596 	for (;;) {
    597 		/*
    598 		 * If CPU is completely quiesced by p_online(2), just wait
    599 		 * here with minimal bus traffic until put online.
    600 		 */
    601 		while (cp->cpu_flags & CPU_QUIESCED)
    602 			(*idle_cpu)();
    603 
    604 		if (cp->cpu_disp->disp_nrunnable != 0) {
    605 			idle_exit();
    606 			swtch();
    607 		} else {
    608 			if (cp->cpu_flags & CPU_OFFLINE)
    609 				continue;
    610 			if ((t = disp_getwork(cp)) == NULL) {
    611 				if (cp->cpu_chosen_level != -1) {
    612 					disp_t *dp = cp->cpu_disp;
    613 					disp_t *kpq;
    614 
    615 					disp_lock_enter(&dp->disp_lock);
    616 					/*
    617 					 * Set kpq under lock to prevent
    618 					 * migration between partitions.
    619 					 */
    620 					kpq = &cp->cpu_part->cp_kp_queue;
    621 					if (kpq->disp_maxrunpri == -1)
    622 						cp->cpu_chosen_level = -1;
    623 					disp_lock_exit(&dp->disp_lock);
    624 				}
    625 				(*idle_cpu)();
    626 				continue;
    627 			}
    628 			/*
    629 			 * If there was a thread but we couldn't steal
    630 			 * it, then keep trying.
    631 			 */
    632 			if (t == T_DONTSTEAL)
    633 				continue;
    634 			idle_exit();
    635 			swtch_to(t);
    636 		}
    637 		idle_enter(); /* returned from swtch/swtch_to */
    638 	}
    639 }
    640 
    641 
    642 /*
    643  * Preempt the currently running thread in favor of the highest
    644  * priority thread.  The class of the current thread controls
    645  * where it goes on the dispatcher queues. If panicking, turn
    646  * preemption off.
    647  */
    648 void
    649 preempt()
    650 {
    651 	kthread_t 	*t = curthread;
    652 	klwp_t 		*lwp = ttolwp(curthread);
    653 
    654 	if (panicstr)
    655 		return;
    656 
    657 	TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
    658 
    659 	thread_lock(t);
    660 
    661 	if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
    662 		/*
    663 		 * this thread has already been chosen to be run on
    664 		 * another CPU. Clear kprunrun on this CPU since we're
    665 		 * already headed for swtch().
    666 		 */
    667 		CPU->cpu_kprunrun = 0;
    668 		thread_unlock_nopreempt(t);
    669 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
    670 	} else {
    671 		if (lwp != NULL)
    672 			lwp->lwp_ru.nivcsw++;
    673 		CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
    674 		THREAD_TRANSITION(t);
    675 		CL_PREEMPT(t);
    676 		DTRACE_SCHED(preempt);
    677 		thread_unlock_nopreempt(t);
    678 
    679 		TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
    680 
    681 		swtch();		/* clears CPU->cpu_runrun via disp() */
    682 	}
    683 }
    684 
    685 extern kthread_t *thread_unpin();
    686 
    687 /*
    688  * disp() - find the highest priority thread for this processor to run, and
    689  * set it in TS_ONPROC state so that resume() can be called to run it.
    690  */
    691 static kthread_t *
    692 disp()
    693 {
    694 	cpu_t		*cpup;
    695 	disp_t		*dp;
    696 	kthread_t	*tp;
    697 	dispq_t		*dq;
    698 	int		maxrunword;
    699 	pri_t		pri;
    700 	disp_t		*kpq;
    701 
    702 	TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
    703 
    704 	cpup = CPU;
    705 	/*
    706 	 * Find the highest priority loaded, runnable thread.
    707 	 */
    708 	dp = cpup->cpu_disp;
    709 
    710 reschedule:
    711 	/*
    712 	 * If there is more important work on the global queue with a better
    713 	 * priority than the maximum on this CPU, take it now.
    714 	 */
    715 	kpq = &cpup->cpu_part->cp_kp_queue;
    716 	while ((pri = kpq->disp_maxrunpri) >= 0 &&
    717 	    pri >= dp->disp_maxrunpri &&
    718 	    (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
    719 	    (tp = disp_getbest(kpq)) != NULL) {
    720 		if (disp_ratify(tp, kpq) != NULL) {
    721 			TRACE_1(TR_FAC_DISP, TR_DISP_END,
    722 			    "disp_end:tid %p", tp);
    723 			return (tp);
    724 		}
    725 	}
    726 
    727 	disp_lock_enter(&dp->disp_lock);
    728 	pri = dp->disp_maxrunpri;
    729 
    730 	/*
    731 	 * If there is nothing to run, look at what's runnable on other queues.
    732 	 * Choose the idle thread if the CPU is quiesced.
    733 	 * Note that CPUs that have the CPU_OFFLINE flag set can still run
    734 	 * interrupt threads, which will be the only threads on the CPU's own
    735 	 * queue, but cannot run threads from other queues.
    736 	 */
    737 	if (pri == -1) {
    738 		if (!(cpup->cpu_flags & CPU_OFFLINE)) {
    739 			disp_lock_exit(&dp->disp_lock);
    740 			if ((tp = disp_getwork(cpup)) == NULL ||
    741 			    tp == T_DONTSTEAL) {
    742 				tp = cpup->cpu_idle_thread;
    743 				(void) splhigh();
    744 				THREAD_ONPROC(tp, cpup);
    745 				cpup->cpu_dispthread = tp;
    746 				cpup->cpu_dispatch_pri = -1;
    747 				cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
    748 				cpup->cpu_chosen_level = -1;
    749 			}
    750 		} else {
    751 			disp_lock_exit_high(&dp->disp_lock);
    752 			tp = cpup->cpu_idle_thread;
    753 			THREAD_ONPROC(tp, cpup);
    754 			cpup->cpu_dispthread = tp;
    755 			cpup->cpu_dispatch_pri = -1;
    756 			cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
    757 			cpup->cpu_chosen_level = -1;
    758 		}
    759 		TRACE_1(TR_FAC_DISP, TR_DISP_END,
    760 		    "disp_end:tid %p", tp);
    761 		return (tp);
    762 	}
    763 
    764 	dq = &dp->disp_q[pri];
    765 	tp = dq->dq_first;
    766 
    767 	ASSERT(tp != NULL);
    768 	ASSERT(tp->t_schedflag & TS_LOAD);	/* thread must be swapped in */
    769 
    770 	DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
    771 
    772 	/*
    773 	 * Found it so remove it from queue.
    774 	 */
    775 	dp->disp_nrunnable--;
    776 	dq->dq_sruncnt--;
    777 	if ((dq->dq_first = tp->t_link) == NULL) {
    778 		ulong_t	*dqactmap = dp->disp_qactmap;
    779 
    780 		ASSERT(dq->dq_sruncnt == 0);
    781 		dq->dq_last = NULL;
    782 
    783 		/*
    784 		 * The queue is empty, so the corresponding bit needs to be
    785 		 * turned off in dqactmap.   If nrunnable != 0 just took the
    786 		 * last runnable thread off the
    787 		 * highest queue, so recompute disp_maxrunpri.
    788 		 */
    789 		maxrunword = pri >> BT_ULSHIFT;
    790 		dqactmap[maxrunword] &= ~BT_BIW(pri);
    791 
    792 		if (dp->disp_nrunnable == 0) {
    793 			dp->disp_max_unbound_pri = -1;
    794 			dp->disp_maxrunpri = -1;
    795 		} else {
    796 			int ipri;
    797 
    798 			ipri = bt_gethighbit(dqactmap, maxrunword);
    799 			dp->disp_maxrunpri = ipri;
    800 			if (ipri < dp->disp_max_unbound_pri)
    801 				dp->disp_max_unbound_pri = ipri;
    802 		}
    803 	} else {
    804 		tp->t_link = NULL;
    805 	}
    806 
    807 	/*
    808 	 * Set TS_DONT_SWAP flag to prevent another processor from swapping
    809 	 * out this thread before we have a chance to run it.
    810 	 * While running, it is protected against swapping by t_lock.
    811 	 */
    812 	tp->t_schedflag |= TS_DONT_SWAP;
    813 	cpup->cpu_dispthread = tp;		/* protected by spl only */
    814 	cpup->cpu_dispatch_pri = pri;
    815 	ASSERT(pri == DISP_PRIO(tp));
    816 	thread_onproc(tp, cpup);  		/* set t_state to TS_ONPROC */
    817 	disp_lock_exit_high(&dp->disp_lock);	/* drop run queue lock */
    818 
    819 	ASSERT(tp != NULL);
    820 	TRACE_1(TR_FAC_DISP, TR_DISP_END,
    821 	    "disp_end:tid %p", tp);
    822 
    823 	if (disp_ratify(tp, kpq) == NULL)
    824 		goto reschedule;
    825 
    826 	return (tp);
    827 }
    828 
    829 /*
    830  * swtch()
    831  *	Find best runnable thread and run it.
    832  *	Called with the current thread already switched to a new state,
    833  *	on a sleep queue, run queue, stopped, and not zombied.
    834  *	May be called at any spl level less than or equal to LOCK_LEVEL.
    835  *	Always drops spl to the base level (spl0()).
    836  */
    837 void
    838 swtch()
    839 {
    840 	kthread_t	*t = curthread;
    841 	kthread_t	*next;
    842 	cpu_t		*cp;
    843 
    844 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
    845 
    846 	if (t->t_flag & T_INTR_THREAD)
    847 		cpu_intr_swtch_enter(t);
    848 
    849 	if (t->t_intr != NULL) {
    850 		/*
    851 		 * We are an interrupt thread.  Setup and return
    852 		 * the interrupted thread to be resumed.
    853 		 */
    854 		(void) splhigh();	/* block other scheduler action */
    855 		cp = CPU;		/* now protected against migration */
    856 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
    857 		CPU_STATS_ADDQ(cp, sys, pswitch, 1);
    858 		CPU_STATS_ADDQ(cp, sys, intrblk, 1);
    859 		next = thread_unpin();
    860 		TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
    861 		resume_from_intr(next);
    862 	} else {
    863 #ifdef	DEBUG
    864 		if (t->t_state == TS_ONPROC &&
    865 		    t->t_disp_queue->disp_cpu == CPU &&
    866 		    t->t_preempt == 0) {
    867 			thread_lock(t);
    868 			ASSERT(t->t_state != TS_ONPROC ||
    869 			    t->t_disp_queue->disp_cpu != CPU ||
    870 			    t->t_preempt != 0);	/* cannot migrate */
    871 			thread_unlock_nopreempt(t);
    872 		}
    873 #endif	/* DEBUG */
    874 		cp = CPU;
    875 		next = disp();		/* returns with spl high */
    876 		ASSERT(CPU_ON_INTR(cp) == 0);	/* not called with PIL > 10 */
    877 
    878 		/* OK to steal anything left on run queue */
    879 		cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
    880 
    881 		if (next != t) {
    882 			if (t == cp->cpu_idle_thread) {
    883 				PG_NRUN_UPDATE(cp, 1);
    884 			} else if (next == cp->cpu_idle_thread) {
    885 				PG_NRUN_UPDATE(cp, -1);
    886 			}
    887 
    888 			/*
    889 			 * If t was previously in the TS_ONPROC state,
    890 			 * setfrontdq and setbackdq won't have set its t_waitrq.
    891 			 * Since we now finally know that we're switching away
    892 			 * from this thread, set its t_waitrq if it is on a run
    893 			 * queue.
    894 			 */
    895 			if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
    896 				t->t_waitrq = gethrtime_unscaled();
    897 			}
    898 
    899 			/*
    900 			 * restore mstate of thread that we are switching to
    901 			 */
    902 			restore_mstate(next);
    903 
    904 			CPU_STATS_ADDQ(cp, sys, pswitch, 1);
    905 			cp->cpu_last_swtch = t->t_disp_time = lbolt;
    906 			TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
    907 
    908 			if (dtrace_vtime_active)
    909 				dtrace_vtime_switch(next);
    910 
    911 			resume(next);
    912 			/*
    913 			 * The TR_RESUME_END and TR_SWTCH_END trace points
    914 			 * appear at the end of resume(), because we may not
    915 			 * return here
    916 			 */
    917 		} else {
    918 			if (t->t_flag & T_INTR_THREAD)
    919 				cpu_intr_swtch_exit(t);
    920 
    921 			DTRACE_SCHED(remain__cpu);
    922 			TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
    923 			(void) spl0();
    924 		}
    925 	}
    926 }
    927 
    928 /*
    929  * swtch_from_zombie()
    930  *	Special case of swtch(), which allows checks for TS_ZOMB to be
    931  *	eliminated from normal resume.
    932  *	Find best runnable thread and run it.
    933  *	Called with the current thread zombied.
    934  *	Zombies cannot migrate, so CPU references are safe.
    935  */
    936 void
    937 swtch_from_zombie()
    938 {
    939 	kthread_t	*next;
    940 	cpu_t		*cpu = CPU;
    941 
    942 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
    943 
    944 	ASSERT(curthread->t_state == TS_ZOMB);
    945 
    946 	next = disp();			/* returns with spl high */
    947 	ASSERT(CPU_ON_INTR(CPU) == 0);	/* not called with PIL > 10 */
    948 	CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
    949 	ASSERT(next != curthread);
    950 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
    951 
    952 	if (next == cpu->cpu_idle_thread)
    953 		PG_NRUN_UPDATE(cpu, -1);
    954 
    955 	restore_mstate(next);
    956 
    957 	if (dtrace_vtime_active)
    958 		dtrace_vtime_switch(next);
    959 
    960 	resume_from_zombie(next);
    961 	/*
    962 	 * The TR_RESUME_END and TR_SWTCH_END trace points
    963 	 * appear at the end of resume(), because we certainly will not
    964 	 * return here
    965 	 */
    966 }
    967 
    968 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
    969 
    970 /*
    971  * search_disp_queues()
    972  *	Search the given dispatch queues for thread tp.
    973  *	Return 1 if tp is found, otherwise return 0.
    974  */
    975 static int
    976 search_disp_queues(disp_t *dp, kthread_t *tp)
    977 {
    978 	dispq_t		*dq;
    979 	dispq_t		*eq;
    980 
    981 	disp_lock_enter_high(&dp->disp_lock);
    982 
    983 	for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
    984 		kthread_t	*rp;
    985 
    986 		ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
    987 
    988 		for (rp = dq->dq_first; rp; rp = rp->t_link)
    989 			if (tp == rp) {
    990 				disp_lock_exit_high(&dp->disp_lock);
    991 				return (1);
    992 			}
    993 	}
    994 	disp_lock_exit_high(&dp->disp_lock);
    995 
    996 	return (0);
    997 }
    998 
    999 /*
   1000  * thread_on_queue()
   1001  *	Search all per-CPU dispatch queues and all partition-wide kpreempt
   1002  *	queues for thread tp. Return 1 if tp is found, otherwise return 0.
   1003  */
   1004 static int
   1005 thread_on_queue(kthread_t *tp)
   1006 {
   1007 	cpu_t		*cp;
   1008 	struct cpupart	*part;
   1009 
   1010 	ASSERT(getpil() >= DISP_LEVEL);
   1011 
   1012 	/*
   1013 	 * Search the per-CPU dispatch queues for tp.
   1014 	 */
   1015 	cp = CPU;
   1016 	do {
   1017 		if (search_disp_queues(cp->cpu_disp, tp))
   1018 			return (1);
   1019 	} while ((cp = cp->cpu_next_onln) != CPU);
   1020 
   1021 	/*
   1022 	 * Search the partition-wide kpreempt queues for tp.
   1023 	 */
   1024 	part = CPU->cpu_part;
   1025 	do {
   1026 		if (search_disp_queues(&part->cp_kp_queue, tp))
   1027 			return (1);
   1028 	} while ((part = part->cp_next) != CPU->cpu_part);
   1029 
   1030 	return (0);
   1031 }
   1032 
   1033 #else
   1034 
   1035 #define	thread_on_queue(tp)	0	/* ASSERT must be !thread_on_queue */
   1036 
#endif	/* DEBUG && (DISP_DEBUG || lint) */
   1038 
   1039 /*
   1040  * like swtch(), but switch to a specified thread taken from another CPU.
   1041  *	called with spl high..
   1042  */
   1043 void
   1044 swtch_to(kthread_t *next)
   1045 {
   1046 	cpu_t			*cp = CPU;
   1047 
   1048 	TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
   1049 
   1050 	/*
   1051 	 * Update context switch statistics.
   1052 	 */
   1053 	CPU_STATS_ADDQ(cp, sys, pswitch, 1);
   1054 
   1055 	TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
   1056 
   1057 	if (curthread == cp->cpu_idle_thread)
   1058 		PG_NRUN_UPDATE(cp, 1);
   1059 
   1060 	/* OK to steal anything left on run queue */
   1061 	cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
   1062 
   1063 	/* record last execution time */
   1064 	cp->cpu_last_swtch = curthread->t_disp_time = lbolt;
   1065 
   1066 	/*
   1067 	 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
   1068 	 * won't have set its t_waitrq.  Since we now finally know that we're
   1069 	 * switching away from this thread, set its t_waitrq if it is on a run
   1070 	 * queue.
   1071 	 */
   1072 	if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
   1073 		curthread->t_waitrq = gethrtime_unscaled();
   1074 	}
   1075 
   1076 	/* restore next thread to previously running microstate */
   1077 	restore_mstate(next);
   1078 
   1079 	if (dtrace_vtime_active)
   1080 		dtrace_vtime_switch(next);
   1081 
   1082 	resume(next);
   1083 	/*
   1084 	 * The TR_RESUME_END and TR_SWTCH_END trace points
   1085 	 * appear at the end of resume(), because we may not
   1086 	 * return here
   1087 	 */
   1088 }
   1089 
   1090 
   1091 
   1092 #define	CPU_IDLING(pri)	((pri) == -1)
   1093 
   1094 static void
   1095 cpu_resched(cpu_t *cp, pri_t tpri)
   1096 {
   1097 	int	call_poke_cpu = 0;
   1098 	pri_t   cpupri = cp->cpu_dispatch_pri;
   1099 
   1100 	if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
   1101 		TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
   1102 		    "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
   1103 		if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
   1104 			cp->cpu_runrun = 1;
   1105 			aston(cp->cpu_dispthread);
   1106 			if (tpri < kpreemptpri && cp != CPU)
   1107 				call_poke_cpu = 1;
   1108 		}
   1109 		if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
   1110 			cp->cpu_kprunrun = 1;
   1111 			if (cp != CPU)
   1112 				call_poke_cpu = 1;
   1113 		}
   1114 	}
   1115 
   1116 	/*
   1117 	 * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
   1118 	 */
   1119 	membar_enter();
   1120 
   1121 	if (call_poke_cpu)
   1122 		poke_cpu(cp->cpu_id);
   1123 }
   1124 
   1125 /*
   1126  * Perform multi-level CMT load balancing of running threads.
   1127  * tp is the thread being enqueued
   1128  * cp is the hint CPU (chosen by cpu_choose()).
   1129  */
   1130 static cpu_t *
   1131 cmt_balance(kthread_t *tp, cpu_t *cp)
   1132 {
   1133 	int		hint, i, cpu, nsiblings;
   1134 	int		self = 0;
   1135 	group_t		*cmt_pgs, *siblings;
   1136 	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
   1137 	int		pg_nrun, tpg_nrun;
   1138 	int		level = 0;
   1139 	cpu_t		*newcp;
   1140 
   1141 	ASSERT(THREAD_LOCK_HELD(tp));
   1142 
   1143 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
   1144 
   1145 	if (GROUP_SIZE(cmt_pgs) == 0)
   1146 		return (cp);	/* nothing to do */
   1147