1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 #pragma ident "@(#)lwp_sobj.c 1.77 07/06/17 SMI" 31 32 #include <sys/param.h> 33 #include <sys/types.h> 34 #include <sys/sysmacros.h> 35 #include <sys/systm.h> 36 #include <sys/cred.h> 37 #include <sys/user.h> 38 #include <sys/errno.h> 39 #include <sys/file.h> 40 #include <sys/proc.h> 41 #include <sys/prsystm.h> 42 #include <sys/kmem.h> 43 #include <sys/sobject.h> 44 #include <sys/fault.h> 45 #include <sys/procfs.h> 46 #include <sys/watchpoint.h> 47 #include <sys/time.h> 48 #include <sys/cmn_err.h> 49 #include <sys/machlock.h> 50 #include <sys/debug.h> 51 #include <sys/synch.h> 52 #include <sys/synch32.h> 53 #include <sys/mman.h> 54 #include <sys/class.h> 55 #include <sys/schedctl.h> 56 #include <sys/sleepq.h> 57 #include <sys/policy.h> 58 #include <sys/tnf_probe.h> 59 #include <sys/lwpchan_impl.h> 60 #include <sys/turnstile.h> 61 #include <sys/atomic.h> 62 #include <sys/lwp_timer_impl.h> 63 #include <sys/lwp_upimutex_impl.h> 64 #include <vm/as.h> 65 #include <sys/sdt.h> 66 67 static kthread_t *lwpsobj_owner(caddr_t); 68 static void lwp_unsleep(kthread_t *t); 69 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip); 70 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg); 71 72 extern int lwp_cond_signal(lwp_cond_t *cv); 73 74 /* 75 * Maximum number of user prio inheritance locks that can be held by a thread. 76 * Used to limit kmem for each thread. This is a per-thread limit that 77 * can be administered on a system wide basis (using /etc/system). 78 * 79 * Also, when a limit, say maxlwps is added for numbers of lwps within a 80 * process, the per-thread limit automatically becomes a process-wide limit 81 * of maximum number of held upi locks within a process: 82 * maxheldupimx = maxnestupimx * maxlwps; 83 */ 84 static uint32_t maxnestupimx = 2000; 85 86 /* 87 * The sobj_ops vector exports a set of functions needed when a thread 88 * is asleep on a synchronization object of this type. 89 */ 90 static sobj_ops_t lwp_sobj_ops = { 91 SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri 92 }; 93 94 static kthread_t *lwpsobj_pi_owner(upimutex_t *up); 95 96 static sobj_ops_t lwp_sobj_pi_ops = { 97 SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep, 98 turnstile_change_pri 99 }; 100 101 static sleepq_head_t lwpsleepq[NSLEEPQ]; 102 upib_t upimutextab[UPIMUTEX_TABSIZE]; 103 104 #define LWPCHAN_LOCK_SHIFT 10 /* 1024 locks for each pool */ 105 #define LWPCHAN_LOCK_SIZE (1 << LWPCHAN_LOCK_SHIFT) 106 107 /* 108 * We know that both lc_wchan and lc_wchan0 are addresses that most 109 * likely are 8-byte aligned, so we shift off the low-order 3 bits. 110 * 'pool' is either 0 or 1. 111 */ 112 #define LWPCHAN_LOCK_HASH(X, pool) \ 113 (((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \ 114 (LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0)) 115 116 static kmutex_t lwpchanlock[2 * LWPCHAN_LOCK_SIZE]; 117 118 /* 119 * Is this a POSIX threads user-level lock requiring priority inheritance? 120 */ 121 #define UPIMUTEX(type) ((type) & LOCK_PRIO_INHERIT) 122 123 static sleepq_head_t * 124 lwpsqhash(lwpchan_t *lwpchan) 125 { 126 uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0; 127 return (&lwpsleepq[SQHASHINDEX(x)]); 128 } 129 130 /* 131 * Lock an lwpchan. 132 * Keep this in sync with lwpchan_unlock(), below. 133 */ 134 static void 135 lwpchan_lock(lwpchan_t *lwpchan, int pool) 136 { 137 uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0; 138 mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]); 139 } 140 141 /* 142 * Unlock an lwpchan. 143 * Keep this in sync with lwpchan_lock(), above. 144 */ 145 static void 146 lwpchan_unlock(lwpchan_t *lwpchan, int pool) 147 { 148 uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0; 149 mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]); 150 } 151 152 /* 153 * Delete mappings from the lwpchan cache for pages that are being 154 * unmapped by as_unmap(). Given a range of addresses, "start" to "end", 155 * all mappings within the range are deleted from the lwpchan cache. 156 */ 157 void 158 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end) 159 { 160 lwpchan_data_t *lcp; 161 lwpchan_hashbucket_t *hashbucket; 162 lwpchan_hashbucket_t *endbucket; 163 lwpchan_entry_t *ent; 164 lwpchan_entry_t **prev; 165 caddr_t addr; 166 167 mutex_enter(&p->p_lcp_lock); 168 lcp = p->p_lcp; 169 hashbucket = lcp->lwpchan_cache; 170 endbucket = hashbucket + lcp->lwpchan_size; 171 for (; hashbucket < endbucket; hashbucket++) { 172 if (hashbucket->lwpchan_chain == NULL) 173 continue; 174 mutex_enter(&hashbucket->lwpchan_lock); 175 prev = &hashbucket->lwpchan_chain; 176 /* check entire chain */ 177 while ((ent = *prev) != NULL) { 178 addr = ent->lwpchan_addr; 179 if (start <= addr && addr < end) { 180 *prev = ent->lwpchan_next; 181 if (ent->lwpchan_pool == LWPCHAN_MPPOOL && 182 (ent->lwpchan_type & LOCK_ROBUST)) 183 lwp_mutex_cleanup(ent, LOCK_UNMAPPED); 184 kmem_free(ent, sizeof (*ent)); 185 atomic_add_32(&lcp->lwpchan_entries, -1); 186 } else { 187 prev = &ent->lwpchan_next; 188 } 189 } 190 mutex_exit(&hashbucket->lwpchan_lock); 191 } 192 mutex_exit(&p->p_lcp_lock); 193 } 194 195 /* 196 * Given an lwpchan cache pointer and a process virtual address, 197 * return a pointer to the corresponding lwpchan hash bucket. 198 */ 199 static lwpchan_hashbucket_t * 200 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr) 201 { 202 uint_t i; 203 204 /* 205 * All user-level sync object addresses are 8-byte aligned. 206 * Ignore the lowest 3 bits of the address and use the 207 * higher-order 2*lwpchan_bits bits for the hash index. 208 */ 209 addr >>= 3; 210 i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask; 211 return (lcp->lwpchan_cache + i); 212 } 213 214 /* 215 * (Re)allocate the per-process lwpchan cache. 216 */ 217 static void 218 lwpchan_alloc_cache(proc_t *p, uint_t bits) 219 { 220 lwpchan_data_t *lcp; 221 lwpchan_data_t *old_lcp; 222 lwpchan_hashbucket_t *hashbucket; 223 lwpchan_hashbucket_t *endbucket; 224 lwpchan_hashbucket_t *newbucket; 225 lwpchan_entry_t *ent; 226 lwpchan_entry_t *next; 227 uint_t count; 228 229 ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS); 230 231 lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP); 232 lcp->lwpchan_bits = bits; 233 lcp->lwpchan_size = 1 << lcp->lwpchan_bits; 234 lcp->lwpchan_mask = lcp->lwpchan_size - 1; 235 lcp->lwpchan_entries = 0; 236 lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size * 237 sizeof (lwpchan_hashbucket_t), KM_SLEEP); 238 lcp->lwpchan_next_data = NULL; 239 240 mutex_enter(&p->p_lcp_lock); 241 if ((old_lcp = p->p_lcp) != NULL) { 242 if (old_lcp->lwpchan_bits >= bits) { 243 /* someone beat us to it */ 244 mutex_exit(&p->p_lcp_lock); 245 kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size * 246 sizeof (lwpchan_hashbucket_t)); 247 kmem_free(lcp, sizeof (lwpchan_data_t)); 248 return; 249 } 250 /* 251 * Acquire all of the old hash table locks. 252 */ 253 hashbucket = old_lcp->lwpchan_cache; 254 endbucket = hashbucket + old_lcp->lwpchan_size; 255 for (; hashbucket < endbucket; hashbucket++) 256 mutex_enter(&hashbucket->lwpchan_lock); 257 /* 258 * Move all of the old hash table entries to the 259 * new hash table. The new hash table has not yet 260 * been installed so we don't need any of its locks. 261 */ 262 count = 0; 263 hashbucket = old_lcp->lwpchan_cache; 264 for (; hashbucket < endbucket; hashbucket++) { 265 ent = hashbucket->lwpchan_chain; 266 while (ent != NULL) { 267 next = ent->lwpchan_next; 268 newbucket = lwpchan_bucket(lcp, 269 (uintptr_t)ent->lwpchan_addr); 270 ent->lwpchan_next = newbucket->lwpchan_chain; 271 newbucket->lwpchan_chain = ent; 272 ent = next; 273 count++; 274 } 275 hashbucket->lwpchan_chain = NULL; 276 } 277 lcp->lwpchan_entries = count; 278 } 279 280 /* 281 * Retire the old hash table. We can't actually kmem_free() it 282 * now because someone may still have a pointer to it. Instead, 283 * we link it onto the new hash table's list of retired hash tables. 284 * The new hash table is double the size of the previous one, so 285 * the total size of all retired hash tables is less than the size 286 * of the new one. exit() and exec() free the retired hash tables 287 * (see lwpchan_destroy_cache(), below). 288 */ 289 lcp->lwpchan_next_data = old_lcp; 290 291 /* 292 * As soon as we store the new lcp, future locking operations will 293 * use it. Therefore, we must ensure that all the state we've just 294 * established reaches global visibility before the new lcp does. 295 */ 296 membar_producer(); 297 p->p_lcp = lcp; 298 299 if (old_lcp != NULL) { 300 /* 301 * Release all of the old hash table locks. 302 */ 303 hashbucket = old_lcp->lwpchan_cache; 304 for (; hashbucket < endbucket; hashbucket++) 305 mutex_exit(&hashbucket->lwpchan_lock); 306 } 307 mutex_exit(&p->p_lcp_lock); 308 } 309 310 /* 311 * Deallocate the lwpchan cache, and any dynamically allocated mappings. 312 * Called when the process exits or execs. All lwps except one have 313 * exited so we need no locks here. 314 */ 315 void 316 lwpchan_destroy_cache(int exec) 317 { 318 proc_t *p = curproc; 319 lwpchan_hashbucket_t *hashbucket; 320 lwpchan_hashbucket_t *endbucket; 321 lwpchan_data_t *lcp; 322 lwpchan_entry_t *ent; 323 lwpchan_entry_t *next; 324 uint16_t lockflg; 325 326 lcp = p->p_lcp; 327 p->p_lcp = NULL; 328 329 lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD; 330 hashbucket = lcp->lwpchan_cache; 331 endbucket = hashbucket + lcp->lwpchan_size; 332 for (; hashbucket < endbucket; hashbucket++) { 333 ent = hashbucket->lwpchan_chain; 334 hashbucket->lwpchan_chain = NULL; 335 while (ent != NULL) { 336 next = ent->lwpchan_next; 337 if (ent->lwpchan_pool == LWPCHAN_MPPOOL && 338 (ent->lwpchan_type & LOCK_ROBUST)) 339 lwp_mutex_cleanup(ent, lockflg); 340 kmem_free(ent, sizeof (*ent)); 341 ent = next; 342 } 343 } 344 345 while (lcp != NULL) { 346 lwpchan_data_t *next_lcp = lcp->lwpchan_next_data; 347 kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size * 348 sizeof (lwpchan_hashbucket_t)); 349 kmem_free(lcp, sizeof (lwpchan_data_t)); 350 lcp = next_lcp; 351 } 352 } 353 354 /* 355 * Return zero when there is an entry in the lwpchan cache for the 356 * given process virtual address and non-zero when there is not. 357 * The returned non-zero value is the current length of the 358 * hash chain plus one. The caller holds the hash bucket lock. 359 */ 360 static uint_t 361 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan, 362 lwpchan_hashbucket_t *hashbucket) 363 { 364 lwpchan_entry_t *ent; 365 uint_t count = 1; 366 367 for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) { 368 if (ent->lwpchan_addr == addr) { 369 if (ent->lwpchan_type != type || 370 ent->lwpchan_pool != pool) { 371 /* 372 * This shouldn't happen, but might if the 373 * process reuses its memory for different 374 * types of sync objects. We test first 375 * to avoid grabbing the memory cache line. 376 */ 377 ent->lwpchan_type = (uint16_t)type; 378 ent->lwpchan_pool = (uint16_t)pool; 379 } 380 *lwpchan = ent->lwpchan_lwpchan; 381 return (0); 382 } 383 count++; 384 } 385 return (count); 386 } 387 388 /* 389 * Return the cached lwpchan mapping if cached, otherwise insert 390 * a virtual address to lwpchan mapping into the cache. 391 */ 392 static int 393 lwpchan_get_mapping(struct as *as, caddr_t addr, 394 int type, lwpchan_t *lwpchan, int pool) 395 { 396 proc_t *p = curproc; 397 lwpchan_data_t *lcp; 398 lwpchan_hashbucket_t *hashbucket; 399 lwpchan_entry_t *ent; 400 memid_t memid; 401 uint_t count; 402 uint_t bits; 403 404 top: 405 /* initialize the lwpchan cache, if necesary */ 406 if ((lcp = p->p_lcp) == NULL) { 407 lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS); 408 goto top; 409 } 410 hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr); 411 mutex_enter(&hashbucket->lwpchan_lock); 412 if (lcp != p->p_lcp) { 413 /* someone resized the lwpchan cache; start over */ 414 mutex_exit(&hashbucket->lwpchan_lock); 415 goto top; 416 } 417 if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) { 418 /* it's in the cache */ 419 mutex_exit(&hashbucket->lwpchan_lock); 420 return (1); 421 } 422 mutex_exit(&hashbucket->lwpchan_lock); 423 if (as_getmemid(as, addr, &memid) != 0) 424 return (0); 425 lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0]; 426 lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1]; 427 ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP); 428 mutex_enter(&hashbucket->lwpchan_lock); 429 if (lcp != p->p_lcp) { 430 /* someone resized the lwpchan cache; start over */ 431 mutex_exit(&hashbucket->lwpchan_lock); 432 kmem_free(ent, sizeof (*ent)); 433 goto top; 434 } 435 count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket); 436 if (count == 0) { 437 /* someone else added this entry to the cache */ 438 mutex_exit(&hashbucket->lwpchan_lock); 439 kmem_free(ent, sizeof (*ent)); 440 return (1); 441 } 442 if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */ 443 (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) { 444 /* hash chain too long; reallocate the hash table */ 445 mutex_exit(&hashbucket->lwpchan_lock); 446 kmem_free(ent, sizeof (*ent)); 447 lwpchan_alloc_cache(p, bits + 1); 448 goto top; 449 } 450 ent->lwpchan_addr = addr; 451 ent->lwpchan_type = (uint16_t)type; 452 ent->lwpchan_pool = (uint16_t)pool; 453 ent->lwpchan_lwpchan = *lwpchan; 454 ent->lwpchan_next = hashbucket->lwpchan_chain; 455 hashbucket->lwpchan_chain = ent; 456 atomic_add_32(&lcp->lwpchan_entries, 1); 457 mutex_exit(&hashbucket->lwpchan_lock); 458 return (1); 459 } 460 461 /* 462 * Return a unique pair of identifiers that corresponds to a 463 * synchronization object's virtual address. Process-shared 464 * sync objects usually get vnode/offset from as_getmemid(). 465 */ 466 static int 467 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool) 468 { 469 /* 470 * If the lwp synch object is defined to be process-private, 471 * we just make the first field of the lwpchan be 'as' and 472 * the second field be the synch object's virtual address. 473 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.) 474 * The lwpchan cache is used only for process-shared objects. 475 */ 476 if (!(type & USYNC_PROCESS)) { 477 lwpchan->lc_wchan0 = (caddr_t)as; 478 lwpchan->lc_wchan = addr; 479 return (1); 480 } 481 482 return (lwpchan_get_mapping(as, addr, type, lwpchan, pool)); 483 } 484 485 static void 486 lwp_block(lwpchan_t *lwpchan) 487 { 488 kthread_t *t = curthread; 489 klwp_t *lwp = ttolwp(t); 490 sleepq_head_t *sqh; 491 492 thread_lock(t); 493 t->t_flag |= T_WAKEABLE; 494 t->t_lwpchan = *lwpchan; 495 t->t_sobj_ops = &lwp_sobj_ops; 496 t->t_release = 0; 497 sqh = lwpsqhash(lwpchan); 498 disp_lock_enter_high(&sqh->sq_lock); 499 CL_SLEEP(t); 500 DTRACE_SCHED(sleep); 501 THREAD_SLEEP(t, &sqh->sq_lock); 502 sleepq_insert(&sqh->sq_queue, t); 503 thread_unlock(t); 504 lwp->lwp_asleep = 1; 505 lwp->lwp_sysabort = 0; 506 lwp->lwp_ru.nvcsw++; 507 (void) new_mstate(curthread, LMS_SLEEP); 508 } 509 510 static kthread_t * 511 lwpsobj_pi_owner(upimutex_t *up) 512 { 513 return (up->upi_owner); 514 } 515 516 static struct upimutex * 517 upi_get(upib_t *upibp, lwpchan_t *lcp) 518 { 519 struct upimutex *upip; 520 521 for (upip = upibp->upib_first; upip != NULL; 522 upip = upip->upi_nextchain) { 523 if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 && 524 upip->upi_lwpchan.lc_wchan == lcp->lc_wchan) 525 break; 526 } 527 return (upip); 528 } 529 530 static void 531 upi_chain_add(upib_t *upibp, struct upimutex *upimutex) 532 { 533 ASSERT(MUTEX_HELD(&upibp->upib_lock)); 534 535 /* 536 * Insert upimutex at front of list. Maybe a bit unfair 537 * but assume that not many lwpchans hash to the same 538 * upimutextab bucket, i.e. the list of upimutexes from 539 * upib_first is not too long. 540 */ 541 upimutex->upi_nextchain = upibp->upib_first; 542 upibp->upib_first = upimutex; 543 } 544 545 static void 546 upi_chain_del(upib_t *upibp, struct upimutex *upimutex) 547 { 548 struct upimutex **prev; 549 550 ASSERT(MUTEX_HELD(&upibp->upib_lock)); 551 552 prev = &upibp->upib_first; 553 while (*prev != upimutex) { 554 prev = &(*prev)->upi_nextchain; 555 } 556 *prev = upimutex->upi_nextchain; 557 upimutex->upi_nextchain = NULL; 558 } 559 560 /* 561 * Add upimutex to chain of upimutexes held by curthread. 562 * Returns number of upimutexes held by curthread. 563 */ 564 static uint32_t 565 upi_mylist_add(struct upimutex *upimutex) 566 { 567 kthread_t *t = curthread; 568 569 /* 570 * Insert upimutex at front of list of upimutexes owned by t. This 571 * would match typical LIFO order in which nested locks are acquired 572 * and released. 573 */ 574 upimutex->upi_nextowned = t->t_upimutex; 575 t->t_upimutex = upimutex; 576 t->t_nupinest++; 577 ASSERT(t->t_nupinest > 0); 578 return (t->t_nupinest); 579 } 580 581 /* 582 * Delete upimutex from list of upimutexes owned by curthread. 583 */ 584 static void 585 upi_mylist_del(struct upimutex *upimutex) 586 { 587 kthread_t *t = curthread; 588 struct upimutex **prev; 589 590 /* 591 * Since the order in which nested locks are acquired and released, 592 * is typically LIFO, and typical nesting levels are not too deep, the 593 * following should not be expensive in the general case. 594 */ 595 prev = &t->t_upimutex; 596 while (*prev != upimutex) { 597 prev = &(*prev)->upi_nextowned; 598 } 599 *prev = upimutex->upi_nextowned; 600 upimutex->upi_nextowned = NULL; 601 ASSERT(t->t_nupinest > 0); 602 t->t_nupinest--; 603 } 604 605 /* 606 * Returns true if upimutex is owned. Should be called only when upim points 607 * to kmem which cannot disappear from underneath. 608 */ 609 static int 610 upi_owned(upimutex_t *upim) 611 { 612 return (upim->upi_owner == curthread); 613 } 614 615 /* 616 * Returns pointer to kernel object (upimutex_t *) if lp is owned. 617 */ 618 static struct upimutex * 619 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type) 620 { 621 lwpchan_t lwpchan; 622 upib_t *upibp; 623 struct upimutex *upimutex; 624 625 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type, 626 &lwpchan, LWPCHAN_MPPOOL)) 627 return (NULL); 628 629 upibp = &UPI_CHAIN(lwpchan); 630 mutex_enter(&upibp->upib_lock); 631 upimutex = upi_get(upibp, &lwpchan); 632 if (upimutex == NULL || upimutex->upi_owner != curthread) { 633 mutex_exit(&upibp->upib_lock); 634 return (NULL); 635 } 636 mutex_exit(&upibp->upib_lock); 637 return (upimutex); 638 } 639 640 /* 641 * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if 642 * no lock hand-off occurrs. 643 */ 644 static void 645 upimutex_unlock(struct upimutex *upimutex, uint16_t flag) 646 { 647 turnstile_t *ts; 648 upib_t *upibp; 649 kthread_t *newowner; 650 651 upi_mylist_del(upimutex); 652 upibp = upimutex->upi_upibp; 653 mutex_enter(&upibp->upib_lock); 654 if (upimutex->upi_waiter != 0) { /* if waiters */ 655 ts = turnstile_lookup(upimutex); 656 if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) { 657 /* hand-off lock to highest prio waiter */ 658 newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first; 659 upimutex->upi_owner = newowner; 660 if (ts->ts_waiters == 1) 661 upimutex->upi_waiter = 0; 662 turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner); 663 mutex_exit(&upibp->upib_lock); 664 return; 665 } else if (ts != NULL) { 666 /* LOCK_NOTRECOVERABLE: wakeup all */ 667 turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL); 668 } else { 669 /* 670 * Misleading w bit. Waiters might have been 671 * interrupted. No need to clear the w bit (upimutex 672 * will soon be freed). Re-calculate PI from existing 673 * waiters. 674 */ 675 turnstile_exit(upimutex); 676 turnstile_pi_recalc(); 677 } 678 } 679 /* 680 * no waiters, or LOCK_NOTRECOVERABLE. 681 * remove from the bucket chain of upi mutexes. 682 * de-allocate kernel memory (upimutex). 683 */ 684 upi_chain_del(upimutex->upi_upibp, upimutex); 685 mutex_exit(&upibp->upib_lock); 686 kmem_free(upimutex, sizeof (upimutex_t)); 687 } 688 689 static int 690 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp) 691 { 692 label_t ljb; 693 int error = 0; 694 lwpchan_t lwpchan; 695 uint16_t flag; 696 upib_t *upibp; 697 volatile struct upimutex *upimutex = NULL; 698 turnstile_t *ts; 699 uint32_t nupinest; 700 volatile int upilocked = 0; 701 702 if (on_fault(&ljb)) { 703 if (upilocked) 704 upimutex_unlock((upimutex_t *)upimutex, 0); 705 error = EFAULT; 706 goto out; 707 } 708 /* 709 * The apparent assumption made in implementing other _lwp_* synch 710 * primitives, is that get_lwpchan() does not return a unique cookie 711 * for the case where 2 processes (one forked from the other) point 712 * at the same underlying object, which is typed USYNC_PROCESS, but 713 * mapped MAP_PRIVATE, since the object has not yet been written to, 714 * in the child process. 715 * 716 * Since get_lwpchan() has been fixed, it is not necessary to do the 717 * dummy writes to force a COW fault as in other places (which should 718 * be fixed). 719 */ 720 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type, 721 &lwpchan, LWPCHAN_MPPOOL)) { 722 error = EFAULT; 723 goto out; 724 } 725 upibp = &UPI_CHAIN(lwpchan); 726 retry: 727 mutex_enter(&upibp->upib_lock); 728 upimutex = upi_get(upibp, &lwpchan); 729 if (upimutex == NULL) { 730 /* lock available since lwpchan has no upimutex */ 731 upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP); 732 upi_chain_add(upibp, (upimutex_t *)upimutex); 733 upimutex->upi_owner = curthread; /* grab lock */ 734 upimutex->upi_upibp = upibp; 735 upimutex->upi_vaddr = lp; 736 upimutex->upi_lwpchan = lwpchan; 737 mutex_exit(&upibp->upib_lock); 738 nupinest = upi_mylist_add((upimutex_t *)upimutex); 739 upilocked = 1; 740 fuword16_noerr(&lp->mutex_flag, &flag); 741 if (nupinest > maxnestupimx && 742 secpolicy_resource(CRED()) != 0) { 743 upimutex_unlock((upimutex_t *)upimutex, flag); 744 error = ENOMEM; 745 goto out; 746 } 747 if (flag & LOCK_NOTRECOVERABLE) { 748 /* 749 * Since the setting of LOCK_NOTRECOVERABLE 750 * was done under the high-level upi mutex, 751 * in lwp_upimutex_unlock(), this flag needs to 752 * be checked while holding the upi mutex. 753 * If set, this thread should return without 754 * the lock held, and with the right error code. 755 */ 756 upimutex_unlock((upimutex_t *)upimutex, flag); 757 upilocked = 0; 758 error = ENOTRECOVERABLE; 759 } else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) { 760 if (flag & LOCK_OWNERDEAD) 761 error = EOWNERDEAD; 762 else if (type & USYNC_PROCESS_ROBUST) 763 error = ELOCKUNMAPPED; 764 else 765 error = EOWNERDEAD; 766 } 767 goto out; 768 } 769 /* 770 * If a upimutex object exists, it must have an owner. 771 * This is due to lock hand-off, and release of upimutex when no 772 * waiters are present at unlock time, 773 */ 774 ASSERT(upimutex->upi_owner != NULL); 775 if (upimutex->upi_owner == curthread) { 776 /* 777 * The user wrapper can check if the mutex type is 778 * ERRORCHECK: if not, it should stall at user-level. 779 * If so, it should return the error code. 780 */ 781 mutex_exit(&upibp->upib_lock); 782 error = EDEADLK; 783 goto out; 784 } 785 if (try == UPIMUTEX_TRY) { 786 mutex_exit(&upibp->upib_lock); 787 error = EBUSY; 788 goto out; 789 } 790 /* 791 * Block for the lock. 792 * Put the lwp in an orderly state for debugging. 793 * Calling prstop() has to be done here, and not in 794 * turnstile_block(), since the preceding call to 795 * turnstile_lookup() raises the PIL to a level 796 * at which calls to prstop() should not be made. 797 */ 798 if ((error = lwptp->lwpt_time_error) != 0) { 799 /* 800 * The SUSV3 Posix spec is very clear that we 801 * should get no error from validating the 802 * timer until we would actually sleep. 803 */ 804 mutex_exit(&upibp->upib_lock); 805 goto out; 806 } 807 prstop(PR_REQUESTED, 0); 808 if (lwptp->lwpt_tsp != NULL) { 809 /* 810 * If we successfully queue the timeout 811 * (lwp_timer_enqueue() returns zero), 812 * then don't drop t_delay_lock until we are 813 * on the sleep queue (in turnstile_block()). 814 * Otherwise we will get an immediate timeout 815 * when we attempt to sleep in turnstile_block(). 816 */ 817 mutex_enter(&curthread->t_delay_lock); 818 if (lwp_timer_enqueue(lwptp) != 0) 819 mutex_exit(&curthread->t_delay_lock); 820 } 821 /* 822 * Now, set the waiter bit and block for the lock in turnstile_block(). 823 * No need to preserve the previous wbit since a lock try is not 824 * attempted after setting the wait bit. Wait bit is set under 825 * the upib_lock, which is not released until the turnstile lock 826 * is acquired. Say, the upimutex is L: 827 * 828 * 1. upib_lock is held so the waiter does not have to retry L after 829 * setting the wait bit: since the owner has to grab the upib_lock 830 * to unlock L, it will certainly see the wait bit set. 831 * 2. upib_lock is not released until the turnstile lock is acquired. 832 * This is the key to preventing a missed wake-up. Otherwise, the 833 * owner could acquire the upib_lock, and the tc_lock, to call 834 * turnstile_wakeup(). All this, before the waiter gets tc_lock 835 * to sleep in turnstile_block(). turnstile_wakeup() will then not 836 * find this waiter, resulting in the missed wakeup. 837 * 3. The upib_lock, being a kernel mutex, cannot be released while 838 * holding the tc_lock (since mutex_exit() could need to acquire 839 * the same tc_lock)...and so is held when calling turnstile_block(). 840 * The address of upib_lock is passed to turnstile_block() which 841 * releases it after releasing all turnstile locks, and before going 842 * to sleep in swtch(). 843 * 4. The waiter value cannot be a count of waiters, because a waiter 844 * can be interrupted. The interrupt occurs under the tc_lock, at 845 * which point, the upib_lock cannot be locked, to decrement waiter 846 * count. So, just treat the waiter state as a bit, not a count. 847 */ 848 ts = turnstile_lookup((upimutex_t *)upimutex); 849 upimutex->upi_waiter = 1; 850 error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex, 851 &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp); 852 /* 853 * Hand-off implies that we wakeup holding the lock, except when: 854 * - deadlock is detected 855 * - lock is not recoverable 856 * - we got an interrupt or timeout 857 * If we wake up due to an interrupt or timeout, we may 858 * or may not be holding the lock due to mutex hand-off. 859 * Use lwp_upimutex_owned() to check if we do hold the lock. 860 */ 861 if (error != 0) { 862 if ((error == EINTR || error == ETIME) && 863 (upimutex = lwp_upimutex_owned(lp, type))) { 864 /* 865 * Unlock and return - the re-startable syscall will 866 * try the lock again if we got EINTR. 867 */ 868 (void) upi_mylist_add((upimutex_t *)upimutex); 869 upimutex_unlock((upimutex_t *)upimutex, 0); 870 } 871 /* 872 * The only other possible error is EDEADLK. If so, upimutex 873 * is valid, since its owner is deadlocked with curthread. 874 */ 875 ASSERT(error == EINTR || error == ETIME || 876 (error == EDEADLK && !upi_owned((upimutex_t *)upimutex))); 877 ASSERT(!lwp_upimutex_owned(lp, type)); 878 goto out; 879 } 880 if (lwp_upimutex_owned(lp, type)) { 881 ASSERT(lwp_upimutex_owned(lp, type) == upimutex); 882 nupinest = upi_mylist_add((upimutex_t *)upimutex); 883 upilocked = 1; 884 } 885 /* 886 * Now, need to read the user-level lp->mutex_flag to do the following: 887 * 888 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED 889 * should be returned. 890 * - if lock isn't held, check if ENOTRECOVERABLE should 891 * be returned. 892 * 893 * Now, either lp->mutex_flag is readable or it's not. If not 894 * readable, the on_fault path will cause a return with EFAULT 895 * as it should. If it is readable, the state of the flag 896 * encodes the robustness state of the lock: 897 * 898 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD 899 * or LOCK_UNMAPPED setting will influence the return code 900 * appropriately. If the upimutex is not locked here, this 901 * could be due to a spurious wake-up or a NOTRECOVERABLE 902 * event. The flag's setting can be used to distinguish 903 * between these two events. 904 */ 905 fuword16_noerr(&lp->mutex_flag, &flag); 906 if (upilocked) { 907 /* 908 * If the thread wakes up from turnstile_block with the lock 909 * held, the flag could not be set to LOCK_NOTRECOVERABLE, 910 * since it would not have been handed-off the lock. 911 * So, no need to check for this case. 912 */ 913 if (nupinest > maxnestupimx && 914 secpolicy_resource(CRED()) != 0) { 915 upimutex_unlock((upimutex_t *)upimutex, flag); 916 upilocked = 0; 917 error = ENOMEM; 918 } else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) { 919 if (flag & LOCK_OWNERDEAD) 920 error = EOWNERDEAD; 921 else if (type & USYNC_PROCESS_ROBUST) 922 error = ELOCKUNMAPPED; 923 else 924 error = EOWNERDEAD; 925 } 926 } else { 927 /* 928 * Wake-up without the upimutex held. Either this is a 929 * spurious wake-up (due to signals, forkall(), whatever), or 930 * it is a LOCK_NOTRECOVERABLE robustness event. The setting 931 * of the mutex flag can be used to distinguish between the 932 * two events. 933 */ 934 if (flag & LOCK_NOTRECOVERABLE) { 935 error = ENOTRECOVERABLE; 936 } else { 937 /* 938 * Here, the flag could be set to LOCK_OWNERDEAD or 939 * not. In both cases, this is a spurious wakeup, 940 * since the upi lock is not held, but the thread 941 * has returned from turnstile_block(). 942 * 943 * The user flag could be LOCK_OWNERDEAD if, at the 944 * same time as curthread having been woken up 945 * spuriously, the owner (say Tdead) has died, marked 946 * the mutex flag accordingly, and handed off the lock 947 * to some other waiter (say Tnew). curthread just 948 * happened to read the flag while Tnew has yet to deal 949 * with the owner-dead event. 950 * 951 * In this event, curthread should retry the lock. 952 * If Tnew is able to cleanup the lock, curthread 953 * will eventually get the lock with a zero error code, 954 * If Tnew is unable to cleanup, its eventual call to 955 * unlock the lock will result in the mutex flag being 956 * set to LOCK_NOTRECOVERABLE, and the wake-up of 957 * all waiters, including curthread, which will then 958 * eventually return ENOTRECOVERABLE due to the above 959 * check. 960 * 961 * Of course, if the user-flag is not set with 962 * LOCK_OWNERDEAD, retrying is the thing to do, since 963 * this is definitely a spurious wakeup. 964 */ 965 goto retry; 966 } 967 } 968 969 out: 970 no_fault(); 971 return (error); 972 } 973 974 975 static int 976 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type) 977 { 978 label_t ljb; 979 int error = 0; 980 lwpchan_t lwpchan; 981 uint16_t flag; 982 upib_t *upibp; 983 volatile struct upimutex *upimutex = NULL; 984 volatile int upilocked = 0; 985 986 if (on_fault(&ljb)) { 987 if (upilocked) 988 upimutex_unlock((upimutex_t *)upimutex, 0); 989 error = EFAULT; 990 goto out; 991 } 992 if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type, 993 &lwpchan, LWPCHAN_MPPOOL)) { 994 error = EFAULT; 995 goto out; 996 } 997 upibp = &UPI_CHAIN(lwpchan); 998 mutex_enter(&upibp->upib_lock); 999 upimutex = upi_get(upibp, &lwpchan); 1000 /* 1001 * If the lock is not held, or the owner is not curthread, return 1002 * error. The user-level wrapper can return this error or stall, 1003 * depending on whether mutex is of ERRORCHECK type or not. 1004 */ 1005 if (upimutex == NULL || upimutex->upi_owner != curthread) { 1006 mutex_exit(&upibp->upib_lock); 1007 error = EPERM; 1008 goto out; 1009 } 1010 mutex_exit(&upibp->upib_lock); /* release for user memory access */ 1011 upilocked = 1; 1012 fuword16_noerr(&lp->mutex_flag, &flag); 1013 if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) { 1014 /* 1015 * transition mutex to the LOCK_NOTRECOVERABLE state. 1016 */ 1017 flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED); 1018 flag |= LOCK_NOTRECOVERABLE; 1019 suword16_noerr(&lp->mutex_flag, flag); 1020 } 1021 if (type & USYNC_PROCESS) 1022 suword32_noerr(&lp->mutex_ownerpid, 0); 1023 upimutex_unlock((upimutex_t *)upimutex, flag); 1024 upilocked = 0; 1025 out: 1026 no_fault(); 1027 return (error); 1028 } 1029 1030 /* 1031 * Clear the contents of a user-level mutex; return the flags. 1032 * Used only by upi_dead() and lwp_mutex_cleanup(), below. 1033 */ 1034 static uint16_t 1035 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg) 1036 { 1037 uint16_t flag; 1038 1039 fuword16_noerr(&lp->mutex_flag, &flag); 1040 if ((flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) == 0) { 1041 flag |= lockflg; 1042 suword16_noerr(&lp->mutex_flag, flag); 1043 } 1044 suword32_noerr((uint32_t *)&lp->mutex_owner, 0); 1045 suword32_noerr((uint32_t *)&lp->mutex_owner + 1, 0); 1046 suword32_noerr(&lp->mutex_ownerpid, 0); 1047 suword8_noerr(&lp->mutex_rcount, 0); 1048 1049 return (flag); 1050 } 1051 1052 /* 1053 * Mark user mutex state, corresponding to kernel upimutex, 1054 * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate 1055 */ 1056 static int 1057 upi_dead(upimutex_t *upip, uint16_t lockflg) 1058 { 1059 label_t ljb; 1060 int error = 0; 1061 lwp_mutex_t *lp; 1062 1063 if (