1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "@(#)cpu.c 1.190 08/01/03 SMI" 27 28 /* 29 * Architecture-independent CPU control functions. 
30 */ 31 32 #include <sys/types.h> 33 #include <sys/param.h> 34 #include <sys/var.h> 35 #include <sys/thread.h> 36 #include <sys/cpuvar.h> 37 #include <sys/kstat.h> 38 #include <sys/uadmin.h> 39 #include <sys/systm.h> 40 #include <sys/errno.h> 41 #include <sys/cmn_err.h> 42 #include <sys/procset.h> 43 #include <sys/processor.h> 44 #include <sys/debug.h> 45 #include <sys/cpupart.h> 46 #include <sys/lgrp.h> 47 #include <sys/pset.h> 48 #include <sys/pghw.h> 49 #include <sys/kmem.h> 50 #include <sys/kmem_impl.h> /* to set per-cpu kmem_cache offset */ 51 #include <sys/atomic.h> 52 #include <sys/callb.h> 53 #include <sys/vtrace.h> 54 #include <sys/cyclic.h> 55 #include <sys/bitmap.h> 56 #include <sys/nvpair.h> 57 #include <sys/pool_pset.h> 58 #include <sys/msacct.h> 59 #include <sys/time.h> 60 #include <sys/archsystm.h> 61 #if defined(__x86) 62 #include <sys/x86_archext.h> 63 #endif 64 65 extern int mp_cpu_start(cpu_t *); 66 extern int mp_cpu_stop(cpu_t *); 67 extern int mp_cpu_poweron(cpu_t *); 68 extern int mp_cpu_poweroff(cpu_t *); 69 extern int mp_cpu_configure(int); 70 extern int mp_cpu_unconfigure(int); 71 extern void mp_cpu_faulted_enter(cpu_t *); 72 extern void mp_cpu_faulted_exit(cpu_t *); 73 74 extern int cmp_cpu_to_chip(processorid_t cpuid); 75 #ifdef __sparcv9 76 extern char *cpu_fru_fmri(cpu_t *cp); 77 #endif 78 79 static void cpu_add_active_internal(cpu_t *cp); 80 static void cpu_remove_active(cpu_t *cp); 81 static void cpu_info_kstat_create(cpu_t *cp); 82 static void cpu_info_kstat_destroy(cpu_t *cp); 83 static void cpu_stats_kstat_create(cpu_t *cp); 84 static void cpu_stats_kstat_destroy(cpu_t *cp); 85 86 static int cpu_sys_stats_ks_update(kstat_t *ksp, int rw); 87 static int cpu_vm_stats_ks_update(kstat_t *ksp, int rw); 88 static int cpu_stat_ks_update(kstat_t *ksp, int rw); 89 static int cpu_state_change_hooks(int, cpu_setup_t, cpu_setup_t); 90 91 /* 92 * cpu_lock protects ncpus, ncpus_online, cpu_flag, cpu_list, cpu_active, 93 * and dispatch queue 
reallocations. The lock ordering with respect to
94  * related locks is:
95  *
96  *	cpu_lock --> thread_free_lock ---> p_lock ---> thread_lock()
97  *
98  * Warning: Certain sections of code do not use the cpu_lock when
99  * traversing the cpu_list (e.g. mutex_vector_enter(), clock()). Since
100  * all cpus are paused during modifications to this list, a solution
101  * to protect the list is too either disable kernel preemption while
102  * walking the list, *or* recheck the cpu_next pointer at each
103  * iteration in the loop. Note that in no cases can any cached
104  * copies of the cpu pointers be kept as they may become invalid.
105  */
106 kmutex_t cpu_lock;
107 cpu_t *cpu_list; /* list of all CPUs */
108 cpu_t *clock_cpu_list; /* used by clock to walk CPUs */
109 cpu_t *cpu_active; /* list of active CPUs */
/*
 * cpu_available is file-private; in this file it is consulted through
 * CPU_IN_SET() by cpu_pause_start() and cpu_get_all().
 */
110 static cpuset_t cpu_available; /* set of available CPUs */
111 cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */
112
113 /*
114  * max_ncpus keeps the max cpus the system can have. Initially
115  * it's NCPU, but since most archs scan the devtree for cpus
116  * fairly early on during boot, the real max can be known before
117  * ncpus is set (useful for early NCPU based allocations).
118  */
119 int max_ncpus = NCPU;
120 /*
121  * Platforms that set max_ncpus to the maximum number of cpus that can be
122  * dynamically added will set boot_max_ncpus to the number of cpus found
123  * at device tree scan time during boot. It is initialized to -1 here.
124  */
125 int boot_max_ncpus = -1;
126 /*
127  * Maximum possible CPU id. This can never be >= NCPU since NCPU is
128  * used to size arrays that are indexed by CPU id.
129  */
130 processorid_t max_cpuid = NCPU - 1;
131
/*
 * Both counters start at 1. Per the locking comment that precedes the
 * cpu_lock definition, ncpus and ncpus_online are protected by cpu_lock.
 */
132 int ncpus = 1;
133 int ncpus_online = 1;
134
135 /*
136  * CPU that we're trying to offline. Protected by cpu_lock.
     * thread_nomigrate() reads this without cpu_lock as its "offline
     * target" hint when deciding whether to grant a weak binding.
137  */
138 cpu_t *cpu_inmotion;
139
140 /*
141  * Can be raised to suppress further weakbinding, which are instead
142  * satisfied by disabling preemption. 
Must be raised/lowered under cpu_lock,
143  * while individual thread weakbinding synchronisation is done under thread
144  * lock.
145  */
146 int weakbindingbarrier;
147
148 /*
149  * Variables used in pause_cpus().
     *
     * safe_list[] holds one state byte per CPU id. The values stored to it
     * in this file are PAUSE_IDLE, PAUSE_READY, PAUSE_WAIT, PAUSE_DIE and
     * PAUSE_DEAD; it is written by pause_cpus(), start_cpus(), cpu_pause(),
     * cpu_pause_start() and cpu_pause_free().
150  */
151 static volatile char safe_list[NCPU];
152
153 static struct _cpu_pause_info {
154 	int cp_spl; /* spl saved in pause_cpus() */
155 	volatile int cp_go; /* Go signal sent after all ready */
156 	int cp_count; /* # of CPUs to pause */
157 	ksema_t cp_sem; /* synch pause_cpus & cpu_pause */
158 	kthread_id_t cp_paused; /* set to curthread by pause_cpus(), NULL after start_cpus() */
159 } cpu_pause_info;
160
/*
 * Handshake used when a pause thread is torn down: cpu_pause() stores
 * PAUSE_DEAD and broadcasts pause_free_cv under pause_free_mutex, and
 * cpu_pause_free() waits on the pair for that store.
 */
161 static kmutex_t pause_free_mutex;
162 static kcondvar_t pause_free_cv;
163
/*
 * Optional hook. When non-NULL, cpu_pause() calls it with the CPU index
 * (cast to void *) after raising the spl and before mach_cpu_pause().
 */
164 void *(*cpu_pause_func)(void *) = NULL;
165
166
/*
 * Name/type template for the per-CPU "sys" named kstat. The initializer
 * lists one entry per struct member, in member order, and every entry is
 * typed KSTAT_DATA_UINT64.
 * NOTE(review): the code that copies this template into a kstat is not in
 * this part of the file; presumably cpu_stats_kstat_create() and
 * cpu_sys_stats_ks_update() - confirm.
 */
167 static struct cpu_sys_stats_ks_data {
168 	kstat_named_t cpu_ticks_idle;
169 	kstat_named_t cpu_ticks_user;
170 	kstat_named_t cpu_ticks_kernel;
171 	kstat_named_t cpu_ticks_wait;
172 	kstat_named_t cpu_nsec_idle;
173 	kstat_named_t cpu_nsec_user;
174 	kstat_named_t cpu_nsec_kernel;
175 	kstat_named_t cpu_nsec_intr;
176 	kstat_named_t cpu_load_intr;
177 	kstat_named_t wait_ticks_io;
178 	kstat_named_t bread;
179 	kstat_named_t bwrite;
180 	kstat_named_t lread;
181 	kstat_named_t lwrite;
182 	kstat_named_t phread;
183 	kstat_named_t phwrite;
184 	kstat_named_t pswitch;
185 	kstat_named_t trap;
186 	kstat_named_t intr;
187 	kstat_named_t syscall;
188 	kstat_named_t sysread;
189 	kstat_named_t syswrite;
190 	kstat_named_t sysfork;
191 	kstat_named_t sysvfork;
192 	kstat_named_t sysexec;
193 	kstat_named_t readch;
194 	kstat_named_t writech;
195 	kstat_named_t rcvint;
196 	kstat_named_t xmtint;
197 	kstat_named_t mdmint;
198 	kstat_named_t rawch;
199 	kstat_named_t canch;
200 	kstat_named_t outch;
201 	kstat_named_t msg;
202 	kstat_named_t sema;
203 	kstat_named_t namei;
204 	kstat_named_t ufsiget;
205 	kstat_named_t ufsdirblk;
206 	kstat_named_t ufsipage;
207 	kstat_named_t ufsinopage;
208 	kstat_named_t procovf;
209 	kstat_named_t intrthread;
210 	kstat_named_t intrblk;
211 	kstat_named_t intrunpin;
212 	kstat_named_t idlethread;
213 	kstat_named_t inv_swtch;
214 	kstat_named_t nthreads;
215 	kstat_named_t cpumigrate;
216 	kstat_named_t xcalls;
217 	kstat_named_t mutex_adenters;
218 	kstat_named_t rw_rdfails;
219 	kstat_named_t rw_wrfails;
220 	kstat_named_t modload;
221 	kstat_named_t modunload;
222 	kstat_named_t bawrite;
223 	kstat_named_t iowait;
224 } cpu_sys_stats_ks_data_template = {
225 	{ "cpu_ticks_idle", KSTAT_DATA_UINT64 },
226 	{ "cpu_ticks_user", KSTAT_DATA_UINT64 },
227 	{ "cpu_ticks_kernel", KSTAT_DATA_UINT64 },
228 	{ "cpu_ticks_wait", KSTAT_DATA_UINT64 },
229 	{ "cpu_nsec_idle", KSTAT_DATA_UINT64 },
230 	{ "cpu_nsec_user", KSTAT_DATA_UINT64 },
231 	{ "cpu_nsec_kernel", KSTAT_DATA_UINT64 },
232 	{ "cpu_nsec_intr", KSTAT_DATA_UINT64 },
233 	{ "cpu_load_intr", KSTAT_DATA_UINT64 },
234 	{ "wait_ticks_io", KSTAT_DATA_UINT64 },
235 	{ "bread", KSTAT_DATA_UINT64 },
236 	{ "bwrite", KSTAT_DATA_UINT64 },
237 	{ "lread", KSTAT_DATA_UINT64 },
238 	{ "lwrite", KSTAT_DATA_UINT64 },
239 	{ "phread", KSTAT_DATA_UINT64 },
240 	{ "phwrite", KSTAT_DATA_UINT64 },
241 	{ "pswitch", KSTAT_DATA_UINT64 },
242 	{ "trap", KSTAT_DATA_UINT64 },
243 	{ "intr", KSTAT_DATA_UINT64 },
244 	{ "syscall", KSTAT_DATA_UINT64 },
245 	{ "sysread", KSTAT_DATA_UINT64 },
246 	{ "syswrite", KSTAT_DATA_UINT64 },
247 	{ "sysfork", KSTAT_DATA_UINT64 },
248 	{ "sysvfork", KSTAT_DATA_UINT64 },
249 	{ "sysexec", KSTAT_DATA_UINT64 },
250 	{ "readch", KSTAT_DATA_UINT64 },
251 	{ "writech", KSTAT_DATA_UINT64 },
252 	{ "rcvint", KSTAT_DATA_UINT64 },
253 	{ "xmtint", KSTAT_DATA_UINT64 },
254 	{ "mdmint", KSTAT_DATA_UINT64 },
255 	{ "rawch", KSTAT_DATA_UINT64 },
256 	{ "canch", KSTAT_DATA_UINT64 },
257 	{ "outch", KSTAT_DATA_UINT64 },
258 	{ "msg", KSTAT_DATA_UINT64 },
259 	{ "sema", KSTAT_DATA_UINT64 },
260 	{ "namei", KSTAT_DATA_UINT64 },
261 	{ "ufsiget", KSTAT_DATA_UINT64 },
262 	{ "ufsdirblk", KSTAT_DATA_UINT64 },
263 	{ "ufsipage", KSTAT_DATA_UINT64 },
264 	{ "ufsinopage", KSTAT_DATA_UINT64 },
265 	{ "procovf", KSTAT_DATA_UINT64 },
266 	{ "intrthread", KSTAT_DATA_UINT64 },
267 	{ "intrblk", KSTAT_DATA_UINT64 },
268 	{ "intrunpin", KSTAT_DATA_UINT64 },
269 	{ "idlethread", KSTAT_DATA_UINT64 },
270 	{ "inv_swtch", KSTAT_DATA_UINT64 },
271 	{ "nthreads", KSTAT_DATA_UINT64 },
272 	{ "cpumigrate", KSTAT_DATA_UINT64 },
273 	{ "xcalls", KSTAT_DATA_UINT64 },
274 	{ "mutex_adenters", KSTAT_DATA_UINT64 },
275 	{ "rw_rdfails", KSTAT_DATA_UINT64 },
276 	{ "rw_wrfails", KSTAT_DATA_UINT64 },
277 	{ "modload", KSTAT_DATA_UINT64 },
278 	{ "modunload", KSTAT_DATA_UINT64 },
279 	{ "bawrite", KSTAT_DATA_UINT64 },
280 	{ "iowait", KSTAT_DATA_UINT64 },
281 };
282
/*
 * Name/type template for the per-CPU "vm" named kstat, laid out the same
 * way as the sys template above: one KSTAT_DATA_UINT64 entry per member,
 * in member order.
 */
283 static struct cpu_vm_stats_ks_data {
284 	kstat_named_t pgrec;
285 	kstat_named_t pgfrec;
286 	kstat_named_t pgin;
287 	kstat_named_t pgpgin;
288 	kstat_named_t pgout;
289 	kstat_named_t pgpgout;
290 	kstat_named_t swapin;
291 	kstat_named_t pgswapin;
292 	kstat_named_t swapout;
293 	kstat_named_t pgswapout;
294 	kstat_named_t zfod;
295 	kstat_named_t dfree;
296 	kstat_named_t scan;
297 	kstat_named_t rev;
298 	kstat_named_t hat_fault;
299 	kstat_named_t as_fault;
300 	kstat_named_t maj_fault;
301 	kstat_named_t cow_fault;
302 	kstat_named_t prot_fault;
303 	kstat_named_t softlock;
304 	kstat_named_t kernel_asflt;
305 	kstat_named_t pgrrun;
306 	kstat_named_t execpgin;
307 	kstat_named_t execpgout;
308 	kstat_named_t execfree;
309 	kstat_named_t anonpgin;
310 	kstat_named_t anonpgout;
311 	kstat_named_t anonfree;
312 	kstat_named_t fspgin;
313 	kstat_named_t fspgout;
314 	kstat_named_t fsfree;
315 } cpu_vm_stats_ks_data_template = {
316 	{ "pgrec", KSTAT_DATA_UINT64 },
317 	{ "pgfrec", KSTAT_DATA_UINT64 },
318 	{ "pgin", KSTAT_DATA_UINT64 },
319 	{ "pgpgin", KSTAT_DATA_UINT64 },
320 	{ "pgout", KSTAT_DATA_UINT64 },
321 	{ "pgpgout", KSTAT_DATA_UINT64 },
322 	{ "swapin", KSTAT_DATA_UINT64 },
323 	{ "pgswapin", KSTAT_DATA_UINT64 },
324 	{ "swapout", KSTAT_DATA_UINT64 },
325 	{ "pgswapout", KSTAT_DATA_UINT64 },
326 	{ "zfod", KSTAT_DATA_UINT64 },
327 	{ "dfree", KSTAT_DATA_UINT64 },
328 	{ "scan", KSTAT_DATA_UINT64 },
329 	{ "rev", KSTAT_DATA_UINT64 },
330 	{ "hat_fault", KSTAT_DATA_UINT64 },
331 	{ "as_fault", KSTAT_DATA_UINT64 },
332 	{ "maj_fault", KSTAT_DATA_UINT64 },
333 	{ "cow_fault", KSTAT_DATA_UINT64 },
334 	{ "prot_fault", KSTAT_DATA_UINT64 },
335 	{ "softlock", KSTAT_DATA_UINT64 },
336 	{ "kernel_asflt", KSTAT_DATA_UINT64 },
337 	{ "pgrrun", KSTAT_DATA_UINT64 },
338 	{ "execpgin", KSTAT_DATA_UINT64 },
339 	{ "execpgout", KSTAT_DATA_UINT64 },
340 	{ "execfree", KSTAT_DATA_UINT64 },
341 	{ "anonpgin", KSTAT_DATA_UINT64 },
342 	{ "anonpgout", KSTAT_DATA_UINT64 },
343 	{ "anonfree", KSTAT_DATA_UINT64 },
344 	{ "fspgin", KSTAT_DATA_UINT64 },
345 	{ "fspgout", KSTAT_DATA_UINT64 },
346 	{ "fsfree", KSTAT_DATA_UINT64 },
347 };
348
349 /*
350  * Force the specified thread to migrate to the appropriate processor.
351  * Called with thread lock held, returns with it dropped.
     *
     * tp == curthread: the thread goes through THREAD_TRANSITION() and
     * CL_SETRUN(), its lock is released with thread_unlock_nopreempt(),
     * and swtch() is called.
     *
     * tp != curthread: a thread in TS_ONPROC is passed to cpu_surrender();
     * a thread in TS_RUN is taken off its queue with dispdeq() and requeued
     * with setbackdq(). A thread in any other state is only unlocked.
352  */
353 static void
354 force_thread_migrate(kthread_id_t tp)
355 {
356 	ASSERT(THREAD_LOCK_HELD(tp));
357 	if (tp == curthread) {
358 		THREAD_TRANSITION(tp);
359 		CL_SETRUN(tp);
360 		thread_unlock_nopreempt(tp);
361 		swtch();
362 	} else {
363 		if (tp->t_state == TS_ONPROC) {
364 			cpu_surrender(tp);
365 		} else if (tp->t_state == TS_RUN) {
366 			(void) dispdeq(tp);
367 			setbackdq(tp);
368 		}
369 		thread_unlock(tp);
370 	}
371 }
372
373 /*
374  * Set affinity for a specified CPU.
375  * A reference count is incremented and the affinity is held until the
376  * reference count is decremented to zero by thread_affinity_clear().
377  * This is so regions of code requiring affinity can be nested.
378  * Caller needs to ensure that cpu_id remains valid, which can be
379  * done by holding cpu_lock across this call, unless the caller
380  * specifies CPU_CURRENT in which case the cpu_lock will be acquired
381  * by thread_affinity_set and CPU->cpu_id will be the target CPU. 
382  */
/*
 * Parameters: t is the thread to bind; cpu_id is the target CPU id, or
 * CPU_CURRENT to bind to the CPU the caller is running on.
 *
 * Panics if t already holds an affinity (t_affinitycnt > 0) for a
 * different CPU. Otherwise t_affinitycnt is incremented, t_bound_cpu is
 * set, and the thread is pushed through force_thread_migrate() unless it
 * is curthread already running on the target CPU.
 */
383 void
384 thread_affinity_set(kthread_id_t t, int cpu_id)
385 {
386 	cpu_t *cp;
387 	int c;
388
389 	ASSERT(!(t == curthread && t->t_weakbound_cpu != NULL));
390
	/* c remembers the caller's request so cpu_lock is dropped at the end */
391 	if ((c = cpu_id) == CPU_CURRENT) {
392 		mutex_enter(&cpu_lock);
393 		cpu_id = CPU->cpu_id;
394 	}
395 	/*
396 	 * We should be asserting that cpu_lock is held here, but
397 	 * the NCA code doesn't acquire it. The following assert
398 	 * should be uncommented when the NCA code is fixed.
399 	 *
400 	 * ASSERT(MUTEX_HELD(&cpu_lock));
401 	 */
402 	ASSERT((cpu_id >= 0) && (cpu_id < NCPU));
403 	cp = cpu[cpu_id];
404 	ASSERT(cp != NULL); /* user must provide a good cpu_id */
405 	/*
406 	 * If there is already a hard affinity requested, and this affinity
407 	 * conflicts with that, panic.
408 	 */
409 	thread_lock(t);
410 	if (t->t_affinitycnt > 0 && t->t_bound_cpu != cp) {
411 		panic("affinity_set: setting %p but already bound to %p",
412 		    (void *)cp, (void *)t->t_bound_cpu);
413 	}
414 	t->t_affinitycnt++;
415 	t->t_bound_cpu = cp;
416
417 	/*
418 	 * Make sure we're running on the right CPU.
419 	 */
420 	if (cp != t->t_cpu || t != curthread) {
421 		force_thread_migrate(t); /* drops thread lock */
422 	} else {
423 		thread_unlock(t);
424 	}
425
426 	if (c == CPU_CURRENT)
427 		mutex_exit(&cpu_lock);
428 }
429
430 /*
431  * Wrapper for backward compatibility.
     * Applies thread_affinity_set() to curthread.
432  */
433 void
434 affinity_set(int cpu_id)
435 {
436 	thread_affinity_set(curthread, cpu_id);
437 }
438
439 /*
440  * Decrement the affinity reservation count and if it becomes zero,
441  * clear the CPU affinity for the current thread, or set it to the user's
442  * software binding request.
     *
     * The user's request is read from t->t_bind_cpu. With PBIND_NONE the
     * hard binding is dropped and the thread is migrated only if the CPU it
     * is on belongs to a different partition than t->t_cpupart; otherwise
     * t_bound_cpu becomes cpu[t_bind_cpu] and the thread is migrated if it
     * is not already there. force_thread_migrate() releases the thread
     * lock, which is why those paths return without thread_unlock().
443  */
444 void
445 thread_affinity_clear(kthread_id_t t)
446 {
447 	register processorid_t binding;
448
449 	thread_lock(t);
450 	if (--t->t_affinitycnt == 0) {
451 		if ((binding = t->t_bind_cpu) == PBIND_NONE) {
452 			/*
453 			 * Adjust disp_max_unbound_pri if necessary.
454 			 */
455 			disp_adjust_unbound_pri(t);
456 			t->t_bound_cpu = NULL;
457 			if (t->t_cpu->cpu_part != t->t_cpupart) {
458 				force_thread_migrate(t);
459 				return; /* lock already dropped by force_thread_migrate() */
460 			}
461 		} else {
462 			t->t_bound_cpu = cpu[binding];
463 			/*
464 			 * Make sure the thread is running on the bound CPU.
465 			 */
466 			if (t->t_cpu != t->t_bound_cpu) {
467 				force_thread_migrate(t);
468 				return; /* already dropped lock */
469 			}
470 		}
471 	}
472 	thread_unlock(t);
473 }
474
475 /*
476  * Wrapper for backward compatibility.
     * Applies thread_affinity_clear() to curthread.
477  */
478 void
479 affinity_clear(void)
480 {
481 	thread_affinity_clear(curthread);
482 }
483
484 /*
485  * Weak cpu affinity. Bind to the "current" cpu for short periods
486  * of time during which the thread must not block (but may be preempted).
487  * Use this instead of kpreempt_disable() when it is only "no migration"
488  * rather than "no preemption" semantics that are required - disabling
489  * preemption holds higher priority threads off of cpu and if the
490  * operation that is protected is more than momentary this is not good
491  * for realtime etc.
492  *
493  * Weakly bound threads will not prevent a cpu from being offlined -
494  * we'll only run them on the cpu to which they are weakly bound but
495  * (because they do not block) we'll always be able to move them on to
496  * another cpu at offline time if we give them just a short moment to
497  * run during which they will unbind. To give a cpu a chance of offlining,
498  * however, we require a barrier to weak bindings that may be raised for a
499  * given cpu (offline/move code may set this and then wait a short time for
500  * existing weak bindings to drop); the cpu_inmotion pointer is that barrier.
501  *
502  * There are few restrictions on the calling context of thread_nomigrate.
503  * The caller must not hold the thread lock. Calls may be nested.
504  *
505  * After weakbinding a thread must not perform actions that may block. 
506  * In particular it must not call thread_affinity_set; calling that when
507  * already weakbound is nonsensical anyway.
508  *
509  * If curthread is prevented from migrating for other reasons
510  * (kernel preemption disabled; high pil; strongly bound; interrupt thread)
511  * then the weak binding will succeed even if this cpu is the target of an
512  * offline/move request.
513  */
/*
 * State kept in curthread by this function:
 *   t_nomigrate > 0   nesting depth of granted weak bindings
 *   t_nomigrate < 0   nesting depth of requests satisfied by leaving
 *                     kernel preemption disabled instead
 *   t_weakbound_cpu   the cpu the thread is weakly bound to
 * Each call is undone by a matching thread_allowmigrate().
 */
514 void
515 thread_nomigrate(void)
516 {
517 	cpu_t *cp;
518 	kthread_id_t t = curthread;
519
	/* re-entered after a forced trip through the dispatcher, see the else arm below */
520 again:
521 	kpreempt_disable();
522 	cp = CPU;
523
524 	/*
525 	 * A highlevel interrupt must not modify t_nomigrate or
526 	 * t_weakbound_cpu of the thread it has interrupted. A lowlevel
527 	 * interrupt thread cannot migrate and we can avoid the
528 	 * thread_lock call below by short-circuiting here. In either
529 	 * case we can just return since no migration is possible and
530 	 * the condition will persist (ie, when we test for these again
531 	 * in thread_allowmigrate they can't have changed). Migration
532 	 * is also impossible if we're at or above DISP_LEVEL pil.
533 	 */
534 	if (CPU_ON_INTR(cp) || t->t_flag & T_INTR_THREAD ||
535 	    getpil() >= DISP_LEVEL) {
536 		kpreempt_enable();
537 		return;
538 	}
539
540 	/*
541 	 * We must be consistent with existing weak bindings. Since we
542 	 * may be interrupted between the increment of t_nomigrate and
543 	 * the store to t_weakbound_cpu below we cannot assume that
544 	 * t_weakbound_cpu will be set if t_nomigrate is. Note that we
545 	 * cannot assert t_weakbound_cpu == t_bind_cpu since that is not
546 	 * always the case.
547 	 */
548 	if (t->t_nomigrate && t->t_weakbound_cpu && t->t_weakbound_cpu != cp) {
549 		if (!panicstr)
550 			panic("thread_nomigrate: binding to %p but already "
551 			    "bound to %p", (void *)cp,
552 			    (void *)t->t_weakbound_cpu);
553 	}
554
555 	/*
556 	 * At this point we have preemption disabled and we don't yet hold
557 	 * the thread lock. So it's possible that somebody else could
558 	 * set t_bind_cpu here and not be able to force us across to the
559 	 * new cpu (since we have preemption disabled).
560 	 */
561 	thread_lock(curthread);
562
563 	/*
564 	 * If further weak bindings are being (temporarily) suppressed then
565 	 * we'll settle for disabling kernel preemption (which assures
566 	 * no migration provided the thread does not block which it is
567 	 * not allowed to if using thread_nomigrate). We must remember
568 	 * this disposition so we can take appropriate action in
569 	 * thread_allowmigrate. If this is a nested call and the
570 	 * thread is already weakbound then fall through as normal.
571 	 * We remember the decision to settle for kpreempt_disable through
572 	 * negative nesting counting in t_nomigrate. Once a thread has had one
573 	 * weakbinding request satisfied in this way any further (nested)
574 	 * requests will continue to be satisfied in the same way,
575 	 * even if weak bindings have recommenced.
576 	 */
	/* i.e. (t_nomigrate < 0) || (weakbindingbarrier && t_nomigrate == 0) */
577 	if (t->t_nomigrate < 0 || weakbindingbarrier && t->t_nomigrate == 0) {
578 		--t->t_nomigrate;
579 		thread_unlock(curthread);
580 		return; /* with kpreempt_disable still active */
581 	}
582
583 	/*
584 	 * We hold thread_lock so t_bind_cpu cannot change. We could,
585 	 * however, be running on a different cpu to which we are t_bound_cpu
586 	 * to (as explained above). If we grant the weak binding request
587 	 * in that case then the dispatcher must favour our weak binding
588 	 * over our strong (in which case, just as when preemption is
589 	 * disabled, we can continue to run on a cpu other than the one to
590 	 * which we are strongbound; the difference in this case is that
591 	 * this thread can be preempted and so can appear on the dispatch
592 	 * queues of a cpu other than the one it is strongbound to).
593 	 *
594 	 * If the cpu we are running on does not appear to be a current
595 	 * offline target (we check cpu_inmotion to determine this - since
596 	 * we don't hold cpu_lock we may not see a recent store to that,
597 	 * so it's possible that we at times can grant a weak binding to a
598 	 * cpu that is an offline target, but that one request will not
599 	 * prevent the offline from succeeding) then we will always grant
600 	 * the weak binding request. This includes the case above where
601 	 * we grant a weakbinding not commensurate with our strong binding.
602 	 *
603 	 * If our cpu does appear to be an offline target then we're inclined
604 	 * not to grant the weakbinding request just yet - we'd prefer to
605 	 * migrate to another cpu and grant the request there. The
606 	 * exceptions are those cases where going through preemption code
607 	 * will not result in us changing cpu:
608 	 *
609 	 * . interrupts have already bypassed this case (see above)
610 	 * . we are already weakbound to this cpu (dispatcher code will
611 	 *   always return us to the weakbound cpu)
612 	 * . preemption was disabled even before we disabled it above
613 	 * . we are strongbound to this cpu (if we're strongbound to
614 	 *   another and not yet running there the trip through the
615 	 *   dispatcher will move us to the strongbound cpu and we
616 	 *   will grant the weak binding there)
617 	 */
618 	if (cp != cpu_inmotion || t->t_nomigrate > 0 || t->t_preempt > 1 ||
619 	    t->t_bound_cpu == cp) {
620 		/*
621 		 * Don't be tempted to store to t_weakbound_cpu only on
622 		 * the first nested bind request - if we're interrupted
623 		 * after the increment of t_nomigrate and before the
624 		 * store to t_weakbound_cpu and the interrupt calls
625 		 * thread_nomigrate then the assertion in thread_allowmigrate
626 		 * would fail.
627 		 */
628 		t->t_nomigrate++;
629 		t->t_weakbound_cpu = cp;
630 		membar_producer();
631 		thread_unlock(curthread);
632 		/*
633 		 * Now that we have dropped the thread_lock another thread
634 		 * can set our t_weakbound_cpu, and will try to migrate us
635 		 * to the strongbound cpu (which will not be prevented by
636 		 * preemption being disabled since we're about to enable
637 		 * preemption). We have granted the weakbinding to the current
638 		 * cpu, so again we are in the position that it is possible
639 		 * that our weak and strong bindings differ. Again this
640 		 * is catered for by dispatcher code which will favour our
641 		 * weak binding.
642 		 */
643 		kpreempt_enable();
644 	} else {
645 		/*
646 		 * Move to another cpu before granting the request by
647 		 * forcing this thread through preemption code. When we
648 		 * get to set{front,back}dq called from CL_PREEMPT()
649 		 * cpu_choose() will be used to select a cpu to queue
650 		 * us on - that will see cpu_inmotion and take
651 		 * steps to avoid returning us to this cpu.
652 		 */
653 		cp->cpu_kprunrun = 1;
654 		thread_unlock(curthread);
655 		kpreempt_enable(); /* will call preempt() */
656 		goto again;
657 	}
658 }
659
/*
 * Undo one level of thread_nomigrate() for curthread.
 *
 * Returns immediately in the same contexts in which thread_nomigrate()
 * returned without recording anything (interrupt context, interrupt
 * thread, or pil at or above DISP_LEVEL). A negative t_nomigrate is moved
 * one step toward zero and one level of preemption disabling is released.
 * Otherwise t_nomigrate is decremented and, on reaching zero,
 * t_weakbound_cpu is cleared.
 */
660 void
661 thread_allowmigrate(void)
662 {
663 	kthread_id_t t = curthread;
664
665 	ASSERT(t->t_weakbound_cpu == CPU ||
666 	    (t->t_nomigrate < 0 && t->t_preempt > 0) ||
667 	    CPU_ON_INTR(CPU) || t->t_flag & T_INTR_THREAD ||
668 	    getpil() >= DISP_LEVEL);
669
670 	if (CPU_ON_INTR(CPU) || (t->t_flag & T_INTR_THREAD) ||
671 	    getpil() >= DISP_LEVEL)
672 		return;
673
674 	if (t->t_nomigrate < 0) {
675 		/*
676 		 * This thread was granted "weak binding" in the
677 		 * stronger form of kernel preemption disabling.
678 		 * Undo a level of nesting for both t_nomigrate
679 		 * and t_preempt.
680 		 */
681 		++t->t_nomigrate;
682 		kpreempt_enable();
683 	} else if (--t->t_nomigrate == 0) {
684 		/*
685 		 * Time to drop the weak binding. We need to cater
686 		 * for the case where we're weakbound to a different
687 		 * cpu than that to which we're strongbound (a very
688 		 * temporary arrangement that must only persist until
689 		 * weak binding drops). We don't acquire thread_lock
690 		 * here so even as this code executes t_bound_cpu
691 		 * may be changing. So we disable preemption and
692 		 * a) in the case that t_bound_cpu changes while we
693 		 * have preemption disabled kprunrun will be set
694 		 * asynchronously, and b) if before disabling
695 		 * preemption we were already on a different cpu to
696 		 * our t_bound_cpu then we set kprunrun ourselves
697 		 * to force a trip through the dispatcher when
698 		 * preemption is enabled.
699 		 */
700 		kpreempt_disable();
701 		if (t->t_bound_cpu &&
702 		    t->t_weakbound_cpu != t->t_bound_cpu)
703 			CPU->cpu_kprunrun = 1;
704 		t->t_weakbound_cpu = NULL;
705 		membar_producer();
706 		kpreempt_enable();
707 	}
708 }
709
710 /*
711  * weakbinding_stop can be used to temporarily cause weakbindings made
712  * with thread_nomigrate to be satisfied through the stronger action of
713  * kpreempt_disable. weakbinding_start recommences normal weakbinding.
     * Both require cpu_lock to be held; they set and clear
     * weakbindingbarrier, which thread_nomigrate() tests under thread lock.
714  */
715
716 void
717 weakbinding_stop(void)
718 {
719 	ASSERT(MUTEX_HELD(&cpu_lock));
720 	weakbindingbarrier = 1;
721 	membar_producer(); /* make visible before subsequent thread_lock */
722 }
723
724 void
725 weakbinding_start(void)
726 {
727 	ASSERT(MUTEX_HELD(&cpu_lock));
728 	weakbindingbarrier = 0;
729 }
730
731 /*
732  * This routine is called to place the CPUs in a safe place so that
733  * one of them can be taken off line or placed on line. What we are
734  * trying to do here is prevent a thread from traversing the list
735  * of active CPUs while we are changing it or from getting placed on
736  * the run queue of a CPU that has just gone off line. We do this by
737  * creating a thread with the highest possible prio for each CPU and
738  * having it call this routine. 
The advantage of this method is that
739  * we can eliminate all checks for CPU_ACTIVE in the disp routines.
740  * This makes disp faster at the expense of making p_online() slower
741  * which is a good trade off.
742  */
/*
 * index is the CPU id this pause thread was created for (cpu_pause_alloc()
 * passes cp->cpu_id as the thread argument); it selects the thread's slot
 * in safe_list[].
 *
 * Each pass of the loop: store PAUSE_READY in the slot, post cp_sem, spin
 * until cp_go is set, raise the spl, run the optional cpu_pause_func hook,
 * call mach_cpu_pause(), then restore the spl and swtch() away. The loop
 * ends once the slot holds PAUSE_DIE (stored by cpu_pause_free()); the
 * slot is then set to PAUSE_DEAD under pause_free_mutex and pause_free_cv
 * is broadcast so cpu_pause_free() can proceed.
 */
743 static void
744 cpu_pause(int index)
745 {
746 	int s;
747 	struct _cpu_pause_info *cpi = &cpu_pause_info;
748 	volatile char *safe = &safe_list[index];
749 	long lindex = index; /* long copy of index, cast to void * for the hook below */
750
751 	ASSERT((curthread->t_bound_cpu != NULL) || (*safe == PAUSE_DIE));
752
753 	while (*safe != PAUSE_DIE) {
754 		*safe = PAUSE_READY;
755 		membar_enter(); /* make sure stores are flushed */
756 		sema_v(&cpi->cp_sem); /* signal requesting thread */
757
758 		/*
759 		 * Wait here until all pause threads are running. That
760 		 * indicates that it's safe to do the spl. Until
761 		 * cpu_pause_info.cp_go is set, we don't want to spl
762 		 * because that might block clock interrupts needed
763 		 * to preempt threads on other CPUs.
764 		 */
765 		while (cpi->cp_go == 0)
766 			;
767 		/*
768 		 * Even though we are at the highest disp prio, we need
769 		 * to block out all interrupts below LOCK_LEVEL so that
770 		 * an intr doesn't come in, wake up a thread, and call
771 		 * setbackdq/setfrontdq.
772 		 */
773 		s = splhigh();
774 		/*
775 		 * if cpu_pause_func() has been set then call it using
776 		 * index as the argument, currently only used by
777 		 * cpr_suspend_cpus(). This function is used as the
778 		 * code to execute on the "paused" cpu's when a machine
779 		 * comes out of a sleep state and CPU's were powered off.
780 		 * (could also be used for hotplugging CPU's).
781 		 */
782 		if (cpu_pause_func != NULL)
783 			(*cpu_pause_func)((void *)lindex);
784
		/*
		 * NOTE(review): the PAUSE_WAIT store that pause_cpus() waits
		 * for is not made in this function; presumably
		 * mach_cpu_pause() stores it and holds the CPU until
		 * start_cpus() resets the slot - confirm against the
		 * platform code.
		 */
785 		mach_cpu_pause(safe);
786
787 		splx(s);
788 		/*
789 		 * Waiting is at an end. Switch out of cpu_pause
790 		 * loop and resume useful work.
791 		 */
792 		swtch();
793 	}
794
795 	mutex_enter(&pause_free_mutex);
796 	*safe = PAUSE_DEAD;
797 	cv_broadcast(&pause_free_cv);
798 	mutex_exit(&pause_free_mutex);
799 }
800
801 /*
802  * Allow the cpus to start running again. 
803  */
/*
 * Counterpart of pause_cpus(): requires cpu_lock and a prior pause_cpus()
 * (cp_paused set). It resets every safe_list[] slot to PAUSE_IDLE and then
 * undoes, in reverse order, what pause_cpus() did on the calling CPU:
 * affinity_clear(), splx() of the saved spl, kpreempt_enable().
 */
804 void
805 start_cpus()
806 {
807 	int i;
808
809 	ASSERT(MUTEX_HELD(&cpu_lock));
810 	ASSERT(cpu_pause_info.cp_paused);
811 	cpu_pause_info.cp_paused = NULL;
812 	for (i = 0; i < NCPU; i++)
813 		safe_list[i] = PAUSE_IDLE;
814 	membar_enter(); /* make sure stores are flushed */
815 	affinity_clear();
816 	splx(cpu_pause_info.cp_spl);
817 	kpreempt_enable();
818 }
819
820 /*
821  * Allocate a pause thread for a CPU.
     *
     * The thread is created in TS_STOPPED with cpu_pause() as its entry
     * point and the CPU id as its argument, at priority v.v_nglobpris - 1.
     * It is then bound to cp (t_bound_cpu, t_disp_queue, t_affinitycnt = 1,
     * t_preempt = 1) and recorded in cp->cpu_pause_thread.
822  */
823 static void
824 cpu_pause_alloc(cpu_t *cp)
825 {
826 	kthread_id_t t;
827 	long cpun = cp->cpu_id;
828
829 	/*
830 	 * Note, v.v_nglobpris will not change value as long as I hold
831 	 * cpu_lock.
832 	 */
833 	t = thread_create(NULL, 0, cpu_pause, (void *)cpun,
834 	    0, &p0, TS_STOPPED, v.v_nglobpris - 1);
835 	thread_lock(t);
836 	t->t_bound_cpu = cp;
837 	t->t_disp_queue = cp->cpu_disp;
838 	t->t_affinitycnt = 1;
839 	t->t_preempt = 1;
840 	thread_unlock(t);
841 	cp->cpu_pause_thread = t;
842 	/*
843 	 * Registering a thread in the callback table is usually done
844 	 * in the initialization code of the thread. In this
845 	 * case, we do it right after thread creation because the
846 	 * thread itself may never run, and we need to register the
847 	 * fact that it is safe for cpr suspend.
848 	 */
849 	CALLB_CPR_INIT_SAFE(t, "cpu_pause");
850 }
851
852 /*
853  * Free a pause thread for a CPU.
     *
     * Requires cpu_lock. Does nothing if cp has no pause thread. Otherwise
     * the thread is unbound, pointed at the current CPU, given priority
     * v.v_nglobpris - 1 and queued to run with its safe_list[] slot set to
     * PAUSE_DIE; this function then blocks on pause_free_cv until
     * cpu_pause() reports PAUSE_DEAD, resets the slot to PAUSE_IDLE and
     * clears cp->cpu_pause_thread.
854  */
855 static void
856 cpu_pause_free(cpu_t *cp)
857 {
858 	kthread_id_t t;
859 	int cpun = cp->cpu_id;
860
861 	ASSERT(MUTEX_HELD(&cpu_lock));
862 	/*
863 	 * We have to get the thread and tell him to die.
864 	 */
865 	if ((t = cp->cpu_pause_thread) == NULL) {
866 		ASSERT(safe_list[cpun] == PAUSE_IDLE);
867 		return;
868 	}
869 	thread_lock(t);
870 	t->t_cpu = CPU; /* disp gets upset if last cpu is quiesced. */
871 	t->t_bound_cpu = NULL; /* Must un-bind; cpu may not be running. */
872 	t->t_pri = v.v_nglobpris - 1;
873 	ASSERT(safe_list[cpun] == PAUSE_IDLE);
874 	safe_list[cpun] = PAUSE_DIE;
875 	THREAD_TRANSITION(t);
876 	setbackdq(t);
877 	thread_unlock_nopreempt(t);
878
879 	/*
880 	 * If we don't wait for the thread to actually die, it may try to
881 	 * run on the wrong cpu as part of an actual call to pause_cpus().
882 	 */
883 	mutex_enter(&pause_free_mutex);
884 	while (safe_list[cpun] != PAUSE_DEAD) {
885 		cv_wait(&pause_free_cv, &pause_free_mutex);
886 	}
887 	mutex_exit(&pause_free_mutex);
888 	safe_list[cpun] = PAUSE_IDLE;
889
890 	cp->cpu_pause_thread = NULL;
891 }
892
893 /*
894  * Initialize basic structures for pausing CPUs.
     * Initializes cp_sem with a count of 0 and allocates the pause thread
     * for the CPU this is called on.
895  */
896 void
897 cpu_pause_init()
898 {
899 	sema_init(&cpu_pause_info.cp_sem, 0, NULL, SEMA_DEFAULT, NULL);
900 	/*
901 	 * Create initial CPU pause thread.
902 	 */
903 	cpu_pause_alloc(CPU);
904 }
905
906 /*
907  * Start the threads used to pause another CPU.
     *
     * cpu_id is the CPU the caller stays on; it is never paused. For every
     * CPU id below NCPU the slot in safe_list[] is set to PAUSE_WAIT up
     * front when the CPU is not in cpu_available, is cpu_id itself, or is
     * not (CPU_READY and not CPU_QUIESCED). Every other CPU has its pause
     * thread re-prioritised and queued with setbackdq().
     *
     * Returns the number of pause threads that were queued.
908  */
909 static int
910 cpu_pause_start(processorid_t cpu_id)
911 {
912 	int i;
913 	int cpu_count = 0;
914
915 	for (i = 0; i < NCPU; i++) {
916 		cpu_t *cp;
917 		kthread_id_t t;
918
		/* cp is only dereferenced after the cpu_available check below */
919 		cp = cpu[i];
920 		if (!CPU_IN_SET(cpu_available, i) || (i == cpu_id)) {
921 			safe_list[i] = PAUSE_WAIT;
922 			continue;
923 		}
924
925 		/*
926 		 * Skip CPU if it is quiesced or not yet started.
927 		 */
928 		if ((cp->cpu_flags & (CPU_QUIESCED | CPU_READY)) != CPU_READY) {
929 			safe_list[i] = PAUSE_WAIT;
930 			continue;
931 		}
932
933 		/*
934 		 * Start this CPU's pause thread.
935 		 */
936 		t = cp->cpu_pause_thread;
937 		thread_lock(t);
938 		/*
939 		 * Reset the priority, since nglobpris may have
940 		 * changed since the thread was created, if someone
941 		 * has loaded the RT (or some other) scheduling
942 		 * class.
943 		 */
944 		t->t_pri = v.v_nglobpris - 1;
945 		THREAD_TRANSITION(t);
946 		setbackdq(t);
947 		thread_unlock_nopreempt(t);
948 		++cpu_count;
949 	}
950 	return (cpu_count);
951 }
952
953
954 /*
955  * Pause all of the CPUs except the one we are on by creating a high
956  * priority thread bound to those CPUs. 
957  *
958  * Note that one must be extremely careful regarding code
959  * executed while CPUs are paused. Since a CPU may be paused
960  * while a thread scheduling on that CPU is holding an adaptive
961  * lock, code executed with CPUs paused must not acquire adaptive
962  * (or low-level spin) locks. Also, such code must not block,
963  * since the thread that is supposed to initiate the wakeup may
964  * never run.
965  *
966  * With a few exceptions, the restrictions on code executed with CPUs
967  * paused match those for code executed at high-level interrupt
968  * context.
969  */
/*
 * off_cp is compared against the CPU the caller is running on; when they
 * match, the caller binds itself to off_cp->cpu_next_part instead of the
 * current CPU before pausing the others.
 *
 * Requires cpu_lock, and no pause may already be in effect (cp_paused must
 * be NULL). On return preemption is disabled, the caller holds a CPU
 * affinity, the spl has been raised with splhigh() (saved in cp_spl) and
 * cp_paused is curthread; start_cpus() undoes all of this.
 */
970 void
971 pause_cpus(cpu_t *off_cp)
972 {
973 	processorid_t cpu_id;
974 	int i;
975 	struct _cpu_pause_info *cpi = &cpu_pause_info;
976
977 	ASSERT(MUTEX_HELD(&cpu_lock));
978 	ASSERT(cpi->cp_paused == NULL);
979 	cpi->cp_count = 0;
980 	cpi->cp_go = 0;
981 	for (i = 0; i < NCPU; i++)
982 		safe_list[i] = PAUSE_IDLE;
983 	kpreempt_disable();
984
985 	/*
986 	 * If running on the cpu that is going offline, get off it.
987 	 * This is so that it won't be necessary to rechoose a CPU
988 	 * when done.
989 	 */
990 	if (CPU == off_cp)
991 		cpu_id = off_cp->cpu_next_part->cpu_id;
992 	else
993 		cpu_id = CPU->cpu_id;
994 	affinity_set(cpu_id);
995
996 	/*
997 	 * Start the pause threads and record how many were started
998 	 */
999 	cpi->cp_count = cpu_pause_start(cpu_id);
1000
1001 	/*
1002 	 * Now wait for all CPUs to be running the pause thread.
1003 	 */
1004 	while (cpi->cp_count > 0) {
1005 		/*
1006 		 * Spin reading the count without grabbing the disp
1007 		 * lock to make sure we don't prevent the pause
1008 		 * threads from getting the lock.
1009 		 */
1010 		while (sema_held(&cpi->cp_sem))
1011 			;
		/* one successful sema_tryp() per sema_v() posted by a pause thread */
1012 		if (sema_tryp(&cpi->cp_sem))
1013 			--cpi->cp_count;
1014 	}
1015 	cpi->cp_go = 1; /* all have reached cpu_pause */
1016
1017 	/*
1018 	 * Now wait for all CPUs to spl. (Transition from PAUSE_READY
1019 	 * to PAUSE_WAIT.)
	 * Slots of CPUs that cpu_pause_start() skipped were already set to
	 * PAUSE_WAIT there, so they do not hold this loop up.
1020 	 */
1021 	for (i = 0; i < NCPU; i++) {
1022 		while (safe_list[i] != PAUSE_WAIT)
1023 			;
1024 	}
1025 	cpi->cp_spl = splhigh(); /* block dispatcher on this CPU */
1026 	cpi->cp_paused = curthread;
1027 }
1028
1029 /*
1030  * Check whether the current thread has CPUs paused
      * Returns 1 if a pause is in effect (asserting that curthread is the
      * one that requested it), 0 otherwise.
1031  */
1032 int
1033 cpus_paused(void)
1034 {
1035 	if (cpu_pause_info.cp_paused != NULL) {
1036 		ASSERT(cpu_pause_info.cp_paused == curthread);
1037 		return (1);
1038 	}
1039 	return (0);
1040 }
1041
/*
 * Look up a CPU by id with no zone filtering. Requires cpu_lock. Returns
 * NULL when cpun is negative, is NCPU or larger, or is not a member of
 * cpu_available; otherwise returns cpu[cpun].
 */
1042 static cpu_t *
1043 cpu_get_all(processorid_t cpun)
1044 {
1045 	ASSERT(MUTEX_HELD(&cpu_lock));
1046
1047 	if (cpun >= NCPU || cpun < 0 || !CPU_IN_SET(cpu_available, cpun))
1048 		return (NULL);
1049 	return (cpu[cpun]);
1050 }
1051
1052 /*
1053  * Check whether cpun is a valid processor id and whether it should be
1054  * visible from the current zone. If it is, return a pointer to the
1055  * associated CPU structure.
      *
      * Requires cpu_lock. Returns NULL for an id rejected by cpu_get_all(),
      * and also when the caller is not in the global zone, pool_pset_enabled()
      * is true, and the zone's pset differs from the pset the CPU is in.
1056  */
1057 cpu_t *
1058 cpu_get(processorid_t cpun)
1059 {
1060 	cpu_t *c;
1061
1062 	ASSERT(MUTEX_HELD(&cpu_lock));
1063 	c = cpu_get_all(cpun);
1064 	if (c != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
1065 	    zone_pset_get(curproc->p_zone) != cpupart_query_cpu(c))
1066 		return (NULL);
1067 	return (c);
1068 }
1069
1070 /*
1071  * The following functions should be used to check CPU states in the kernel.
1072  * They should be invoked with cpu_lock held. Kernel subsystems interested
1073  * in CPU states should *not* use cpu_get_state() and various P_ONLINE/etc
1074  * states. Those are for user-land (and system call) use only.
1075  */
1076
1077 /*
1078  * Determine whether the CPU is online and handling interrupts.
      * Thin wrapper that applies cpu_flagged_online() to cpu->cpu_flags.
1079  */
1080 int
1081 cpu_is_online(cpu_t *cpu)
1082 {
1083 	ASSERT(MUTEX_HELD(&cpu_lock));
1084 	return (cpu_flagged_online(cpu->cpu_flags));
1085 }
1086
1087 /*
1088  * Determine whether the CPU is offline (this includes spare and faulted). 
1089 */ 1090 int 1091 cpu_is_offline(cpu_t *cpu) 1092 { 1093 ASSERT(MUTEX_HELD(&cpu_lock)); 1094 return (cpu_flagged_offline(cpu->cpu_flags)); 1095 } 1096 1097 /* 1098 * Determine whether the CPU is powered off. 1099 */ 1100 int 1101 cpu_is_poweredoff(cpu_t *cpu) 1102 { 1103 ASSERT(MUTEX_HELD(&cpu_lock)); 1104 return (cpu_flagged_poweredoff(cpu->cpu_flags)); 1105 } 1106 1107 /* 1108 * Determine whether the CPU is handling interrupts. 1109 */ 1110 int 1111 cpu_is_nointr(cpu_t *cpu) 1112 { 1113 ASSERT(MUTEX_HELD(&cpu_lock)); 1114 return (cpu_flagged_nointr(cpu->cpu_flags)); 1115 } 1116 1117 /* 1118 * Determine whether the CPU is active (scheduling threads). 1119 */ 1120 int 1121 cpu_is_active(cpu_t *cpu) 1122 { 1123 ASSERT(MUTEX_HELD(&cpu_lock)); 1124 return (cpu_flagged_active(cpu->cpu_flags)); 1125 } 1126 1127 /* 1128 * Same as above, but these require cpu_flags instead of cpu_t pointers. 1129 */ 1130 int 1131 cpu_flagged_online(cpu_flag_t cpu_flags) 1132 { 1133 return (cpu_flagged_active(cpu_flags) && 1134 (cpu_flags & CPU_ENABLE)); 1135 } 1136 1137 int 1138 cpu_flagged_offline(cpu_flag_t cpu_flags) 1139 { 1140 return (((cpu_flags & CPU_POWEROFF) == 0) && 1141 ((cpu_flags & (CPU_READY | CPU_OFFLINE)) != CPU_READY)); 1142 } 1143 1144 int 1145 cpu_flagged_poweredoff(cpu_flag_t cpu_flags) 1146 { 1147 return ((cpu_flags & CPU_POWEROFF) == CPU_POWEROFF); 1148 } 1149 1150 int 1151 cpu_flagged_nointr(cpu_flag_t cpu_flags) 1152 { 1153 return (cpu_flagged_active(cpu_flags) && 1154 (cpu_flags & CPU_ENABLE) == 0); 1155 } 1156 1157 int 1158 cpu_flagged_active(cpu_flag_t cpu_flags) 1159 { 1160 return (((cpu_flags & (CPU_POWEROFF | CPU_FAULTED | CPU_SPARE)) == 0) && 1161 ((cpu_flags & (CPU_READY | CPU_OFFLINE)) == CPU_READY)); 1162 } 1163 1164 /* 1165 * Bring the indicated CPU online. 1166 */ 1167 int 1168 cpu_online(cpu_t *cp) 1169 { 1170 int error = 0; 1171 1172 /* 1173 * Handle on-line request. 
1174 * This code must put the new CPU on the active list before 1175 * starting it because it will not be paused, and will start 1176 * using the active list immediately. The real start occurs 1177 * when the CPU_QUIESCED flag is turned off. 1178 */ 1179 1180 ASSERT(MUTEX_HELD(&cpu_lock)); 1181 1182 /* 1183 * Put all the cpus into a known safe place. 1184 * No mutexes can be entered while CPUs are paused. 1185 */ 1186 error = mp_cpu_start(cp); /* arch-dep hook */ 1187 if (error == 0) { 1188 pg_cpupart_in(cp, cp->cpu_part); 1189 pause_cpus(NULL); 1190 cpu_add_active_internal(cp); 1191 if (cp->cpu_flags & CPU_FAULTED) { 1192 cp->cpu_flags &= ~CPU_FAULTED; 1193 mp_cpu_faulted_exit(cp); 1194 } 1195 cp->cpu_flags &= ~(CPU_QUIESCED | CPU_OFFLINE | CPU_FROZEN | 1196 CPU_SPARE); 1197 start_cpus(); 1198 cpu_stats_kstat_create(cp); 1199 cpu_create_intrstat(cp); 1200 lgrp_kstat_create(cp); 1201 cpu_state_change_notify(cp->cpu_id, CPU_ON); 1202 cpu_intr_enable(cp); /* arch-dep hook */ 1203 cpu_set_state(cp); 1204 cyclic_online(cp); 1205 poke_cpu(cp->cpu_id); 1206 }