1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 30 #pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.30 */ 31 32 #include <sys/types.h> 33 #include <sys/param.h> 34 #include <sys/sysmacros.h> 35 #include <sys/signal.h> 36 #include <sys/user.h> 37 #include <sys/systm.h> 38 #include <sys/sysinfo.h> 39 #include <sys/var.h> 40 #include <sys/errno.h> 41 #include <sys/cmn_err.h> 42 #include <sys/debug.h> 43 #include <sys/inline.h> 44 #include <sys/disp.h> 45 #include <sys/class.h> 46 #include <sys/bitmap.h> 47 #include <sys/kmem.h> 48 #include <sys/cpuvar.h> 49 #include <sys/vtrace.h> 50 #include <sys/tnf.h> 51 #include <sys/cpupart.h> 52 #include <sys/lgrp.h> 53 #include <sys/pg.h> 54 #include <sys/cmt.h> 55 #include <sys/bitset.h> 56 #include <sys/schedctl.h> 57 #include <sys/atomic.h> 58 #include <sys/dtrace.h> 59 #include <sys/sdt.h> 60 #include <sys/archsystm.h> 61 62 #include <vm/as.h> 63 64 #define BOUND_CPU 0x1 65 #define BOUND_PARTITION 0x2 66 #define BOUND_INTR 0x4 67 68 /* Dispatch queue allocation structure and functions */ 69 struct disp_queue_info { 70 disp_t *dp; 71 dispq_t *olddispq; 72 dispq_t *newdispq; 73 ulong_t *olddqactmap; 74 ulong_t *newdqactmap; 75 int oldnglobpris; 76 }; 77 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris, 78 disp_t *dp); 79 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris); 80 static void disp_dq_free(struct disp_queue_info *dptr); 81 82 /* platform-specific routine to call when processor is idle */ 83 static void generic_idle_cpu(); 84 void (*idle_cpu)() = generic_idle_cpu; 85 86 /* routines invoked when a CPU enters/exits the idle loop */ 87 static void idle_enter(); 88 static void idle_exit(); 89 90 /* platform-specific routine to call when thread is enqueued */ 91 static void generic_enq_thread(cpu_t *, int); 92 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread; 93 94 pri_t kpreemptpri; /* priority where kernel preemption applies */ 95 pri_t upreemptpri = 0; /* priority where normal preemption applies */ 96 pri_t intr_pri; /* interrupt thread priority base level */ 97 98 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */ 99 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */ 100 disp_t cpu0_disp; /* boot CPU's dispatch queue */ 101 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */ 102 int nswapped; /* total number of swapped threads */ 103 void disp_swapped_enq(kthread_t *tp); 104 static void disp_swapped_setrun(kthread_t *tp); 105 static void cpu_resched(cpu_t *cp, pri_t tpri); 106 107 /* 108 * If this is set, only interrupt threads will cause kernel preemptions. 109 * This is done by changing the value of kpreemptpri. kpreemptpri 110 * will either be the max sysclass pri + 1 or the min interrupt pri. 111 */ 112 int only_intr_kpreempt; 113 114 extern void set_idle_cpu(int cpun); 115 extern void unset_idle_cpu(int cpun); 116 static void setkpdq(kthread_t *tp, int borf); 117 #define SETKP_BACK 0 118 #define SETKP_FRONT 1 119 /* 120 * Parameter that determines how recently a thread must have run 121 * on the CPU to be considered loosely-bound to that CPU to reduce 122 * cold cache effects. The interval is in hertz. 123 */ 124 #define RECHOOSE_INTERVAL 3 125 int rechoose_interval = RECHOOSE_INTERVAL; 126 static cpu_t *cpu_choose(kthread_t *, pri_t); 127 128 /* 129 * Parameter that determines how long (in nanoseconds) a thread must 130 * be sitting on a run queue before it can be stolen by another CPU 131 * to reduce migrations. The interval is in nanoseconds. 132 * 133 * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval() 134 * to an appropriate value. nosteal_nsec is set to NOSTEAL_UNINITIALIZED 135 * here indicating it is uninitiallized. 136 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'. 137 * 138 */ 139 #define NOSTEAL_UNINITIALIZED (-1) 140 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED; 141 extern void cmp_set_nosteal_interval(void); 142 143 id_t defaultcid; /* system "default" class; see dispadmin(1M) */ 144 145 disp_lock_t transition_lock; /* lock on transitioning threads */ 146 disp_lock_t stop_lock; /* lock on stopped threads */ 147 148 static void cpu_dispqalloc(int numpris); 149 150 /* 151 * This gets returned by disp_getwork/disp_getbest if we couldn't steal 152 * a thread because it was sitting on its run queue for a very short 153 * period of time. 154 */ 155 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */ 156 157 static kthread_t *disp_getwork(cpu_t *to); 158 static kthread_t *disp_getbest(disp_t *from); 159 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq); 160 161 void swtch_to(kthread_t *); 162 163 /* 164 * dispatcher and scheduler initialization 165 */ 166 167 /* 168 * disp_setup - Common code to calculate and allocate dispatcher 169 * variables and structures based on the maximum priority. 170 */ 171 static void 172 disp_setup(pri_t maxglobpri, pri_t oldnglobpris) 173 { 174 pri_t newnglobpris; 175 176 ASSERT(MUTEX_HELD(&cpu_lock)); 177 178 newnglobpris = maxglobpri + 1 + LOCK_LEVEL; 179 180 if (newnglobpris > oldnglobpris) { 181 /* 182 * Allocate new kp queues for each CPU partition. 183 */ 184 cpupart_kpqalloc(newnglobpris); 185 186 /* 187 * Allocate new dispatch queues for each CPU. 188 */ 189 cpu_dispqalloc(newnglobpris); 190 191 /* 192 * compute new interrupt thread base priority 193 */ 194 intr_pri = maxglobpri; 195 if (only_intr_kpreempt) { 196 kpreemptpri = intr_pri + 1; 197 if (kpqpri == KPQPRI) 198 kpqpri = kpreemptpri; 199 } 200 v.v_nglobpris = newnglobpris; 201 } 202 } 203 204 /* 205 * dispinit - Called to initialize all loaded classes and the 206 * dispatcher framework. 207 */ 208 void 209 dispinit(void) 210 { 211 id_t cid; 212 pri_t maxglobpri; 213 pri_t cl_maxglobpri; 214 215 maxglobpri = -1; 216 217 /* 218 * Initialize transition lock, which will always be set. 219 */ 220 DISP_LOCK_INIT(&transition_lock); 221 disp_lock_enter_high(&transition_lock); 222 DISP_LOCK_INIT(&stop_lock); 223 224 mutex_enter(&cpu_lock); 225 CPU->cpu_disp->disp_maxrunpri = -1; 226 CPU->cpu_disp->disp_max_unbound_pri = -1; 227 228 /* 229 * Initialize the default CPU partition. 230 */ 231 cpupart_initialize_default(); 232 /* 233 * Call the class specific initialization functions for 234 * all pre-installed schedulers. 235 * 236 * We pass the size of a class specific parameter 237 * buffer to each of the initialization functions 238 * to try to catch problems with backward compatibility 239 * of class modules. 240 * 241 * For example a new class module running on an old system 242 * which didn't provide sufficiently large parameter buffers 243 * would be bad news. Class initialization modules can check for 244 * this and take action if they detect a problem. 245 */ 246 247 for (cid = 0; cid < nclass; cid++) { 248 sclass_t *sc; 249 250 sc = &sclass[cid]; 251 if (SCHED_INSTALLED(sc)) { 252 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ, 253 &sc->cl_funcs); 254 if (cl_maxglobpri > maxglobpri) 255 maxglobpri = cl_maxglobpri; 256 } 257 } 258 kpreemptpri = (pri_t)v.v_maxsyspri + 1; 259 if (kpqpri == KPQPRI) 260 kpqpri = kpreemptpri; 261 262 ASSERT(maxglobpri >= 0); 263 disp_setup(maxglobpri, 0); 264 265 mutex_exit(&cpu_lock); 266 267 /* 268 * Platform specific sticky scheduler setup. 269 */ 270 if (nosteal_nsec == NOSTEAL_UNINITIALIZED) 271 cmp_set_nosteal_interval(); 272 273 /* 274 * Get the default class ID; this may be later modified via 275 * dispadmin(1M). This will load the class (normally TS) and that will 276 * call disp_add(), which is why we had to drop cpu_lock first. 277 */ 278 if (getcid(defaultclass, &defaultcid) != 0) { 279 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'", 280 defaultclass); 281 } 282 } 283 284 /* 285 * disp_add - Called with class pointer to initialize the dispatcher 286 * for a newly loaded class. 287 */ 288 void 289 disp_add(sclass_t *clp) 290 { 291 pri_t maxglobpri; 292 pri_t cl_maxglobpri; 293 294 mutex_enter(&cpu_lock); 295 /* 296 * Initialize the scheduler class. 297 */ 298 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1); 299 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs); 300 if (cl_maxglobpri > maxglobpri) 301 maxglobpri = cl_maxglobpri; 302 303 /* 304 * Save old queue information. Since we're initializing a 305 * new scheduling class which has just been loaded, then 306 * the size of the dispq may have changed. We need to handle 307 * that here. 308 */ 309 disp_setup(maxglobpri, v.v_nglobpris); 310 311 mutex_exit(&cpu_lock); 312 } 313 314 315 /* 316 * For each CPU, allocate new dispatch queues 317 * with the stated number of priorities. 318 */ 319 static void 320 cpu_dispqalloc(int numpris) 321 { 322 cpu_t *cpup; 323 struct disp_queue_info *disp_mem; 324 int i, num; 325 326 ASSERT(MUTEX_HELD(&cpu_lock)); 327 328 disp_mem = kmem_zalloc(NCPU * 329 sizeof (struct disp_queue_info), KM_SLEEP); 330 331 /* 332 * This routine must allocate all of the memory before stopping 333 * the cpus because it must not sleep in kmem_alloc while the 334 * CPUs are stopped. Locks they hold will not be freed until they 335 * are restarted. 336 */ 337 i = 0; 338 cpup = cpu_list; 339 do { 340 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp); 341 i++; 342 cpup = cpup->cpu_next; 343 } while (cpup != cpu_list); 344 num = i; 345 346 pause_cpus(NULL); 347 for (i = 0; i < num; i++) 348 disp_dq_assign(&disp_mem[i], numpris); 349 start_cpus(); 350 351 /* 352 * I must free all of the memory after starting the cpus because 353 * I can not risk sleeping in kmem_free while the cpus are stopped. 354 */ 355 for (i = 0; i < num; i++) 356 disp_dq_free(&disp_mem[i]); 357 358 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info)); 359 } 360 361 static void 362 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp) 363 { 364 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP); 365 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) * 366 sizeof (long), KM_SLEEP); 367 dptr->dp = dp; 368 } 369 370 static void 371 disp_dq_assign(struct disp_queue_info *dptr, int numpris) 372 { 373 disp_t *dp; 374 375 dp = dptr->dp; 376 dptr->olddispq = dp->disp_q; 377 dptr->olddqactmap = dp->disp_qactmap; 378 dptr->oldnglobpris = dp->disp_npri; 379 380 ASSERT(dptr->oldnglobpris < numpris); 381 382 if (dptr->olddispq != NULL) { 383 /* 384 * Use kcopy because bcopy is platform-specific 385 * and could block while we might have paused the cpus. 386 */ 387 (void) kcopy(dptr->olddispq, dptr->newdispq, 388 dptr->oldnglobpris * sizeof (dispq_t)); 389 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap, 390 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * 391 sizeof (long)); 392 } 393 dp->disp_q = dptr->newdispq; 394 dp->disp_qactmap = dptr->newdqactmap; 395 dp->disp_q_limit = &dptr->newdispq[numpris]; 396 dp->disp_npri = numpris; 397 } 398 399 static void 400 disp_dq_free(struct disp_queue_info *dptr) 401 { 402 if (dptr->olddispq != NULL) 403 kmem_free(dptr->olddispq, 404 dptr->oldnglobpris * sizeof (dispq_t)); 405 if (dptr->olddqactmap != NULL) 406 kmem_free(dptr->olddqactmap, 407 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long)); 408 } 409 410 /* 411 * For a newly created CPU, initialize the dispatch queue. 412 * This is called before the CPU is known through cpu[] or on any lists. 413 */ 414 void 415 disp_cpu_init(cpu_t *cp) 416 { 417 disp_t *dp; 418 dispq_t *newdispq; 419 ulong_t *newdqactmap; 420 421 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */ 422 423 if (cp == cpu0_disp.disp_cpu) 424 dp = &cpu0_disp; 425 else 426 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP); 427 bzero(dp, sizeof (disp_t)); 428 cp->cpu_disp = dp; 429 dp->disp_cpu = cp; 430 dp->disp_maxrunpri = -1; 431 dp->disp_max_unbound_pri = -1; 432 DISP_LOCK_INIT(&cp->cpu_thread_lock); 433 /* 434 * Allocate memory for the dispatcher queue headers 435 * and the active queue bitmap. 436 */ 437 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP); 438 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) * 439 sizeof (long), KM_SLEEP); 440 dp->disp_q = newdispq; 441 dp->disp_qactmap = newdqactmap; 442 dp->disp_q_limit = &newdispq[v.v_nglobpris]; 443 dp->disp_npri = v.v_nglobpris; 444 } 445 446 void 447 disp_cpu_fini(cpu_t *cp) 448 { 449 ASSERT(MUTEX_HELD(&cpu_lock)); 450 451 disp_kp_free(cp->cpu_disp); 452 if (cp->cpu_disp != &cpu0_disp) 453 kmem_free(cp->cpu_disp, sizeof (disp_t)); 454 } 455 456 /* 457 * Allocate new, larger kpreempt dispatch queue to replace the old one. 458 */ 459 void 460 disp_kp_alloc(disp_t *dq, pri_t npri) 461 { 462 struct disp_queue_info mem_info; 463 464 if (npri > dq->disp_npri) { 465 /* 466 * Allocate memory for the new array. 467 */ 468 disp_dq_alloc(&mem_info, npri, dq); 469 470 /* 471 * We need to copy the old structures to the new 472 * and free the old. 473 */ 474 disp_dq_assign(&mem_info, npri); 475 disp_dq_free(&mem_info); 476 } 477 } 478 479 /* 480 * Free dispatch queue. 481 * Used for the kpreempt queues for a removed CPU partition and 482 * for the per-CPU queues of deleted CPUs. 483 */ 484 void 485 disp_kp_free(disp_t *dq) 486 { 487 struct disp_queue_info mem_info; 488 489 mem_info.olddispq = dq->disp_q; 490 mem_info.olddqactmap = dq->disp_qactmap; 491 mem_info.oldnglobpris = dq->disp_npri; 492 disp_dq_free(&mem_info); 493 } 494 495 /* 496 * End dispatcher and scheduler initialization. 497 */ 498 499 /* 500 * See if there's anything to do other than remain idle. 501 * Return non-zero if there is. 502 * 503 * This function must be called with high spl, or with 504 * kernel preemption disabled to prevent the partition's 505 * active cpu list from changing while being traversed. 506 * 507 */ 508 int 509 disp_anywork(void) 510 { 511 cpu_t *cp = CPU; 512 cpu_t *ocp; 513 514 if (cp->cpu_disp->disp_nrunnable != 0) 515 return (1); 516 517 if (!(cp->cpu_flags & CPU_OFFLINE)) { 518 if (CP_MAXRUNPRI(cp->cpu_part) >= 0) 519 return (1); 520 521 /* 522 * Work can be taken from another CPU if: 523 * - There is unbound work on the run queue 524 * - That work isn't a thread undergoing a 525 * - context switch on an otherwise empty queue. 526 * - The CPU isn't running the idle loop. 527 */ 528 for (ocp = cp->cpu_next_part; ocp != cp; 529 ocp = ocp->cpu_next_part) { 530 ASSERT(CPU_ACTIVE(ocp)); 531 532 if (ocp->cpu_disp->disp_max_unbound_pri != -1 && 533 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) && 534 ocp->cpu_disp->disp_nrunnable == 1) && 535 ocp->cpu_dispatch_pri != -1) 536 return (1); 537 } 538 } 539 return (0); 540 } 541 542 /* 543 * Called when CPU enters the idle loop 544 */ 545 static void 546 idle_enter() 547 { 548 cpu_t *cp = CPU; 549 550 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled()); 551 CPU_STATS_ADDQ(cp, sys, idlethread, 1); 552 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 553 } 554 555 /* 556 * Called when CPU exits the idle loop 557 */ 558 static void 559 idle_exit() 560 { 561 cpu_t *cp = CPU; 562 563 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled()); 564 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */ 565 } 566 567 /* 568 * Idle loop. 569 */ 570 void 571 idle() 572 { 573 struct cpu *cp = CPU; /* pointer to this CPU */ 574 kthread_t *t; /* taken thread */ 575 576 idle_enter(); 577 578 /* 579 * Uniprocessor version of idle loop. 580 * Do this until notified that we're on an actual multiprocessor. 581 */ 582 while (ncpus == 1) { 583 if (cp->cpu_disp->disp_nrunnable == 0) { 584 (*idle_cpu)(); 585 continue; 586 } 587 idle_exit(); 588 swtch(); 589 590 idle_enter(); /* returned from swtch */ 591 } 592 593 /* 594 * Multiprocessor idle loop. 595 */ 596 for (;;) { 597 /* 598 * If CPU is completely quiesced by p_online(2), just wait 599 * here with minimal bus traffic until put online. 600 */ 601 while (cp->cpu_flags & CPU_QUIESCED) 602 (*idle_cpu)(); 603 604 if (cp->cpu_disp->disp_nrunnable != 0) { 605 idle_exit(); 606 swtch(); 607 } else { 608 if (cp->cpu_flags & CPU_OFFLINE) 609 continue; 610 if ((t = disp_getwork(cp)) == NULL) { 611 if (cp->cpu_chosen_level != -1) { 612 disp_t *dp = cp->cpu_disp; 613 disp_t *kpq; 614 615 disp_lock_enter(&dp->disp_lock); 616 /* 617 * Set kpq under lock to prevent 618 * migration between partitions. 619 */ 620 kpq = &cp->cpu_part->cp_kp_queue; 621 if (kpq->disp_maxrunpri == -1) 622 cp->cpu_chosen_level = -1; 623 disp_lock_exit(&dp->disp_lock); 624 } 625 (*idle_cpu)(); 626 continue; 627 } 628 /* 629 * If there was a thread but we couldn't steal 630 * it, then keep trying. 631 */ 632 if (t == T_DONTSTEAL) 633 continue; 634 idle_exit(); 635 swtch_to(t); 636 } 637 idle_enter(); /* returned from swtch/swtch_to */ 638 } 639 } 640 641 642 /* 643 * Preempt the currently running thread in favor of the highest 644 * priority thread. The class of the current thread controls 645 * where it goes on the dispatcher queues. If panicking, turn 646 * preemption off. 647 */ 648 void 649 preempt() 650 { 651 kthread_t *t = curthread; 652 klwp_t *lwp = ttolwp(curthread); 653 654 if (panicstr) 655 return; 656 657 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start"); 658 659 thread_lock(t); 660 661 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) { 662 /* 663 * this thread has already been chosen to be run on 664 * another CPU. Clear kprunrun on this CPU since we're 665 * already headed for swtch(). 666 */ 667 CPU->cpu_kprunrun = 0; 668 thread_unlock_nopreempt(t); 669 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 670 } else { 671 if (lwp != NULL) 672 lwp->lwp_ru.nivcsw++; 673 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1); 674 THREAD_TRANSITION(t); 675 CL_PREEMPT(t); 676 DTRACE_SCHED(preempt); 677 thread_unlock_nopreempt(t); 678 679 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end"); 680 681 swtch(); /* clears CPU->cpu_runrun via disp() */ 682 } 683 } 684 685 extern kthread_t *thread_unpin(); 686 687 /* 688 * disp() - find the highest priority thread for this processor to run, and 689 * set it in TS_ONPROC state so that resume() can be called to run it. 690 */ 691 static kthread_t * 692 disp() 693 { 694 cpu_t *cpup; 695 disp_t *dp; 696 kthread_t *tp; 697 dispq_t *dq; 698 int maxrunword; 699 pri_t pri; 700 disp_t *kpq; 701 702 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start"); 703 704 cpup = CPU; 705 /* 706 * Find the highest priority loaded, runnable thread. 707 */ 708 dp = cpup->cpu_disp; 709 710 reschedule: 711 /* 712 * If there is more important work on the global queue with a better 713 * priority than the maximum on this CPU, take it now. 714 */ 715 kpq = &cpup->cpu_part->cp_kp_queue; 716 while ((pri = kpq->disp_maxrunpri) >= 0 && 717 pri >= dp->disp_maxrunpri && 718 (cpup->cpu_flags & CPU_OFFLINE) == 0 && 719 (tp = disp_getbest(kpq)) != NULL) { 720 if (disp_ratify(tp, kpq) != NULL) { 721 TRACE_1(TR_FAC_DISP, TR_DISP_END, 722 "disp_end:tid %p", tp); 723 return (tp); 724 } 725 } 726 727 disp_lock_enter(&dp->disp_lock); 728 pri = dp->disp_maxrunpri; 729 730 /* 731 * If there is nothing to run, look at what's runnable on other queues. 732 * Choose the idle thread if the CPU is quiesced. 733 * Note that CPUs that have the CPU_OFFLINE flag set can still run 734 * interrupt threads, which will be the only threads on the CPU's own 735 * queue, but cannot run threads from other queues. 736 */ 737 if (pri == -1) { 738 if (!(cpup->cpu_flags & CPU_OFFLINE)) { 739 disp_lock_exit(&dp->disp_lock); 740 if ((tp = disp_getwork(cpup)) == NULL || 741 tp == T_DONTSTEAL) { 742 tp = cpup->cpu_idle_thread; 743 (void) splhigh(); 744 THREAD_ONPROC(tp, cpup); 745 cpup->cpu_dispthread = tp; 746 cpup->cpu_dispatch_pri = -1; 747 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 748 cpup->cpu_chosen_level = -1; 749 } 750 } else { 751 disp_lock_exit_high(&dp->disp_lock); 752 tp = cpup->cpu_idle_thread; 753 THREAD_ONPROC(tp, cpup); 754 cpup->cpu_dispthread = tp; 755 cpup->cpu_dispatch_pri = -1; 756 cpup->cpu_runrun = cpup->cpu_kprunrun = 0; 757 cpup->cpu_chosen_level = -1; 758 } 759 TRACE_1(TR_FAC_DISP, TR_DISP_END, 760 "disp_end:tid %p", tp); 761 return (tp); 762 } 763 764 dq = &dp->disp_q[pri]; 765 tp = dq->dq_first; 766 767 ASSERT(tp != NULL); 768 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */ 769 770 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp); 771 772 /* 773 * Found it so remove it from queue. 774 */ 775 dp->disp_nrunnable--; 776 dq->dq_sruncnt--; 777 if ((dq->dq_first = tp->t_link) == NULL) { 778 ulong_t *dqactmap = dp->disp_qactmap; 779 780 ASSERT(dq->dq_sruncnt == 0); 781 dq->dq_last = NULL; 782 783 /* 784 * The queue is empty, so the corresponding bit needs to be 785 * turned off in dqactmap. If nrunnable != 0 just took the 786 * last runnable thread off the 787 * highest queue, so recompute disp_maxrunpri. 788 */ 789 maxrunword = pri >> BT_ULSHIFT; 790 dqactmap[maxrunword] &= ~BT_BIW(pri); 791 792 if (dp->disp_nrunnable == 0) { 793 dp->disp_max_unbound_pri = -1; 794 dp->disp_maxrunpri = -1; 795 } else { 796 int ipri; 797 798 ipri = bt_gethighbit(dqactmap, maxrunword); 799 dp->disp_maxrunpri = ipri; 800 if (ipri < dp->disp_max_unbound_pri) 801 dp->disp_max_unbound_pri = ipri; 802 } 803 } else { 804 tp->t_link = NULL; 805 } 806 807 /* 808 * Set TS_DONT_SWAP flag to prevent another processor from swapping 809 * out this thread before we have a chance to run it. 810 * While running, it is protected against swapping by t_lock. 811 */ 812 tp->t_schedflag |= TS_DONT_SWAP; 813 cpup->cpu_dispthread = tp; /* protected by spl only */ 814 cpup->cpu_dispatch_pri = pri; 815 ASSERT(pri == DISP_PRIO(tp)); 816 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */ 817 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */ 818 819 ASSERT(tp != NULL); 820 TRACE_1(TR_FAC_DISP, TR_DISP_END, 821 "disp_end:tid %p", tp); 822 823 if (disp_ratify(tp, kpq) == NULL) 824 goto reschedule; 825 826 return (tp); 827 } 828 829 /* 830 * swtch() 831 * Find best runnable thread and run it. 832 * Called with the current thread already switched to a new state, 833 * on a sleep queue, run queue, stopped, and not zombied. 834 * May be called at any spl level less than or equal to LOCK_LEVEL. 835 * Always drops spl to the base level (spl0()). 836 */ 837 void 838 swtch() 839 { 840 kthread_t *t = curthread; 841 kthread_t *next; 842 cpu_t *cp; 843 844 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 845 846 if (t->t_flag & T_INTR_THREAD) 847 cpu_intr_swtch_enter(t); 848 849 if (t->t_intr != NULL) { 850 /* 851 * We are an interrupt thread. Setup and return 852 * the interrupted thread to be resumed. 853 */ 854 (void) splhigh(); /* block other scheduler action */ 855 cp = CPU; /* now protected against migration */ 856 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 857 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 858 CPU_STATS_ADDQ(cp, sys, intrblk, 1); 859 next = thread_unpin(); 860 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 861 resume_from_intr(next); 862 } else { 863 #ifdef DEBUG 864 if (t->t_state == TS_ONPROC && 865 t->t_disp_queue->disp_cpu == CPU && 866 t->t_preempt == 0) { 867 thread_lock(t); 868 ASSERT(t->t_state != TS_ONPROC || 869 t->t_disp_queue->disp_cpu != CPU || 870 t->t_preempt != 0); /* cannot migrate */ 871 thread_unlock_nopreempt(t); 872 } 873 #endif /* DEBUG */ 874 cp = CPU; 875 next = disp(); /* returns with spl high */ 876 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */ 877 878 /* OK to steal anything left on run queue */ 879 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 880 881 if (next != t) { 882 if (t == cp->cpu_idle_thread) { 883 PG_NRUN_UPDATE(cp, 1); 884 } else if (next == cp->cpu_idle_thread) { 885 PG_NRUN_UPDATE(cp, -1); 886 } 887 888 /* 889 * If t was previously in the TS_ONPROC state, 890 * setfrontdq and setbackdq won't have set its t_waitrq. 891 * Since we now finally know that we're switching away 892 * from this thread, set its t_waitrq if it is on a run 893 * queue. 894 */ 895 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) { 896 t->t_waitrq = gethrtime_unscaled(); 897 } 898 899 /* 900 * restore mstate of thread that we are switching to 901 */ 902 restore_mstate(next); 903 904 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 905 cp->cpu_last_swtch = t->t_disp_time = lbolt; 906 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 907 908 if (dtrace_vtime_active) 909 dtrace_vtime_switch(next); 910 911 resume(next); 912 /* 913 * The TR_RESUME_END and TR_SWTCH_END trace points 914 * appear at the end of resume(), because we may not 915 * return here 916 */ 917 } else { 918 if (t->t_flag & T_INTR_THREAD) 919 cpu_intr_swtch_exit(t); 920 921 DTRACE_SCHED(remain__cpu); 922 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end"); 923 (void) spl0(); 924 } 925 } 926 } 927 928 /* 929 * swtch_from_zombie() 930 * Special case of swtch(), which allows checks for TS_ZOMB to be 931 * eliminated from normal resume. 932 * Find best runnable thread and run it. 933 * Called with the current thread zombied. 934 * Zombies cannot migrate, so CPU references are safe. 935 */ 936 void 937 swtch_from_zombie() 938 { 939 kthread_t *next; 940 cpu_t *cpu = CPU; 941 942 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 943 944 ASSERT(curthread->t_state == TS_ZOMB); 945 946 next = disp(); /* returns with spl high */ 947 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */ 948 CPU_STATS_ADDQ(CPU, sys, pswitch, 1); 949 ASSERT(next != curthread); 950 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 951 952 if (next == cpu->cpu_idle_thread) 953 PG_NRUN_UPDATE(cpu, -1); 954 955 restore_mstate(next); 956 957 if (dtrace_vtime_active) 958 dtrace_vtime_switch(next); 959 960 resume_from_zombie(next); 961 /* 962 * The TR_RESUME_END and TR_SWTCH_END trace points 963 * appear at the end of resume(), because we certainly will not 964 * return here 965 */ 966 } 967 968 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint)) 969 970 /* 971 * search_disp_queues() 972 * Search the given dispatch queues for thread tp. 973 * Return 1 if tp is found, otherwise return 0. 974 */ 975 static int 976 search_disp_queues(disp_t *dp, kthread_t *tp) 977 { 978 dispq_t *dq; 979 dispq_t *eq; 980 981 disp_lock_enter_high(&dp->disp_lock); 982 983 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) { 984 kthread_t *rp; 985 986 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL); 987 988 for (rp = dq->dq_first; rp; rp = rp->t_link) 989 if (tp == rp) { 990 disp_lock_exit_high(&dp->disp_lock); 991 return (1); 992 } 993 } 994 disp_lock_exit_high(&dp->disp_lock); 995 996 return (0); 997 } 998 999 /* 1000 * thread_on_queue() 1001 * Search all per-CPU dispatch queues and all partition-wide kpreempt 1002 * queues for thread tp. Return 1 if tp is found, otherwise return 0. 1003 */ 1004 static int 1005 thread_on_queue(kthread_t *tp) 1006 { 1007 cpu_t *cp; 1008 struct cpupart *part; 1009 1010 ASSERT(getpil() >= DISP_LEVEL); 1011 1012 /* 1013 * Search the per-CPU dispatch queues for tp. 1014 */ 1015 cp = CPU; 1016 do { 1017 if (search_disp_queues(cp->cpu_disp, tp)) 1018 return (1); 1019 } while ((cp = cp->cpu_next_onln) != CPU); 1020 1021 /* 1022 * Search the partition-wide kpreempt queues for tp. 1023 */ 1024 part = CPU->cpu_part; 1025 do { 1026 if (search_disp_queues(&part->cp_kp_queue, tp)) 1027 return (1); 1028 } while ((part = part->cp_next) != CPU->cpu_part); 1029 1030 return (0); 1031 } 1032 1033 #else 1034 1035 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */ 1036 1037 #endif /* DEBUG */ 1038 1039 /* 1040 * like swtch(), but switch to a specified thread taken from another CPU. 1041 * called with spl high.. 1042 */ 1043 void 1044 swtch_to(kthread_t *next) 1045 { 1046 cpu_t *cp = CPU; 1047 1048 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start"); 1049 1050 /* 1051 * Update context switch statistics. 1052 */ 1053 CPU_STATS_ADDQ(cp, sys, pswitch, 1); 1054 1055 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start"); 1056 1057 if (curthread == cp->cpu_idle_thread) 1058 PG_NRUN_UPDATE(cp, 1); 1059 1060 /* OK to steal anything left on run queue */ 1061 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL; 1062 1063 /* record last execution time */ 1064 cp->cpu_last_swtch = curthread->t_disp_time = lbolt; 1065 1066 /* 1067 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq 1068 * won't have set its t_waitrq. Since we now finally know that we're 1069 * switching away from this thread, set its t_waitrq if it is on a run 1070 * queue. 1071 */ 1072 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) { 1073 curthread->t_waitrq = gethrtime_unscaled(); 1074 } 1075 1076 /* restore next thread to previously running microstate */ 1077 restore_mstate(next); 1078 1079 if (dtrace_vtime_active) 1080 dtrace_vtime_switch(next); 1081 1082 resume(next); 1083 /* 1084 * The TR_RESUME_END and TR_SWTCH_END trace points 1085 * appear at the end of resume(), because we may not 1086 * return here 1087 */ 1088 } 1089 1090 1091 1092 #define CPU_IDLING(pri) ((pri) == -1) 1093 1094 static void 1095 cpu_resched(cpu_t *cp, pri_t tpri) 1096 { 1097 int call_poke_cpu = 0; 1098 pri_t cpupri = cp->cpu_dispatch_pri; 1099 1100 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) { 1101 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED, 1102 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri); 1103 if (tpri >= upreemptpri && cp->cpu_runrun == 0) { 1104 cp->cpu_runrun = 1; 1105 aston(cp->cpu_dispthread); 1106 if (tpri < kpreemptpri && cp != CPU) 1107 call_poke_cpu = 1; 1108 } 1109 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) { 1110 cp->cpu_kprunrun = 1; 1111 if (cp != CPU) 1112 call_poke_cpu = 1; 1113 } 1114 } 1115 1116 /* 1117 * Propagate cpu_runrun, and cpu_kprunrun to global visibility. 1118 */ 1119 membar_enter(); 1120 1121 if (call_poke_cpu) 1122 poke_cpu(cp->cpu_id); 1123 } 1124 1125 /* 1126 * Perform multi-level CMT load balancing of running threads. 1127 * tp is the thread being enqueued 1128 * cp is the hint CPU (chosen by cpu_choose()). 1129 */ 1130 static cpu_t * 1131 cmt_balance(kthread_t *tp, cpu_t *cp) 1132 { 1133 int hint, i, cpu, nsiblings; 1134 int self = 0; 1135 group_t *cmt_pgs, *siblings; 1136 pg_cmt_t *pg, *pg_tmp, *tpg = NULL; 1137 int pg_nrun, tpg_nrun; 1138 int level = 0; 1139 cpu_t *newcp; 1140 1141 ASSERT(THREAD_LOCK_HELD(tp)); 1142 1143 cmt_pgs = &cp->cpu_pg->cmt_pgs; 1144 1145 if (GROUP_SIZE(cmt_pgs) == 0) 1146 return (cp); /* nothing to do */ 1147