Home | History | Annotate | Download | only in syscall
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
     28 /*	  All Rights Reserved	*/
     29 
     30 #pragma ident	"@(#)lwp_sobj.c	1.77	07/06/17 SMI"
     31 
     32 #include <sys/param.h>
     33 #include <sys/types.h>
     34 #include <sys/sysmacros.h>
     35 #include <sys/systm.h>
     36 #include <sys/cred.h>
     37 #include <sys/user.h>
     38 #include <sys/errno.h>
     39 #include <sys/file.h>
     40 #include <sys/proc.h>
     41 #include <sys/prsystm.h>
     42 #include <sys/kmem.h>
     43 #include <sys/sobject.h>
     44 #include <sys/fault.h>
     45 #include <sys/procfs.h>
     46 #include <sys/watchpoint.h>
     47 #include <sys/time.h>
     48 #include <sys/cmn_err.h>
     49 #include <sys/machlock.h>
     50 #include <sys/debug.h>
     51 #include <sys/synch.h>
     52 #include <sys/synch32.h>
     53 #include <sys/mman.h>
     54 #include <sys/class.h>
     55 #include <sys/schedctl.h>
     56 #include <sys/sleepq.h>
     57 #include <sys/policy.h>
     58 #include <sys/tnf_probe.h>
     59 #include <sys/lwpchan_impl.h>
     60 #include <sys/turnstile.h>
     61 #include <sys/atomic.h>
     62 #include <sys/lwp_timer_impl.h>
     63 #include <sys/lwp_upimutex_impl.h>
     64 #include <vm/as.h>
     65 #include <sys/sdt.h>
     66 
     67 static kthread_t *lwpsobj_owner(caddr_t);
     68 static void lwp_unsleep(kthread_t *t);
     69 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
     70 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
     71 
     72 extern int lwp_cond_signal(lwp_cond_t *cv);
     73 
     74 /*
     75  * Maximum number of user prio inheritance locks that can be held by a thread.
     76  * Used to limit kmem for each thread. This is a per-thread limit that
     77  * can be administered on a system wide basis (using /etc/system).
     78  *
     79  * Also, when a limit, say maxlwps is added for numbers of lwps within a
     80  * process, the per-thread limit automatically becomes a process-wide limit
     81  * of maximum number of held upi locks within a process:
     82  *      maxheldupimx = maxnestupimx * maxlwps;
     83  */
     84 static uint32_t maxnestupimx = 2000;
     85 
     86 /*
     87  * The sobj_ops vector exports a set of functions needed when a thread
     88  * is asleep on a synchronization object of this type.
     89  */
     90 static sobj_ops_t lwp_sobj_ops = {
     91 	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
     92 };
     93 
     94 static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
     95 
     96 static sobj_ops_t lwp_sobj_pi_ops = {
     97 	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
     98 	turnstile_change_pri
     99 };
    100 
    101 static sleepq_head_t	lwpsleepq[NSLEEPQ];
    102 upib_t			upimutextab[UPIMUTEX_TABSIZE];
    103 
    104 #define	LWPCHAN_LOCK_SHIFT	10	/* 1024 locks for each pool */
    105 #define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)
    106 
    107 /*
    108  * We know that both lc_wchan and lc_wchan0 are addresses that most
    109  * likely are 8-byte aligned, so we shift off the low-order 3 bits.
    110  * 'pool' is either 0 or 1.
    111  */
    112 #define	LWPCHAN_LOCK_HASH(X, pool) \
    113 	(((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
    114 	(LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))
    115 
    116 static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
    117 
    118 /*
    119  * Is this a POSIX threads user-level lock requiring priority inheritance?
    120  */
    121 #define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
    122 
    123 static sleepq_head_t *
    124 lwpsqhash(lwpchan_t *lwpchan)
    125 {
    126 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
    127 	return (&lwpsleepq[SQHASHINDEX(x)]);
    128 }
    129 
    130 /*
    131  * Lock an lwpchan.
    132  * Keep this in sync with lwpchan_unlock(), below.
    133  */
    134 static void
    135 lwpchan_lock(lwpchan_t *lwpchan, int pool)
    136 {
    137 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
    138 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
    139 }
    140 
    141 /*
    142  * Unlock an lwpchan.
    143  * Keep this in sync with lwpchan_lock(), above.
    144  */
    145 static void
    146 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
    147 {
    148 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
    149 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
    150 }
    151 
    152 /*
    153  * Delete mappings from the lwpchan cache for pages that are being
    154  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
    155  * all mappings within the range are deleted from the lwpchan cache.
    156  */
    157 void
    158 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
    159 {
    160 	lwpchan_data_t *lcp;
    161 	lwpchan_hashbucket_t *hashbucket;
    162 	lwpchan_hashbucket_t *endbucket;
    163 	lwpchan_entry_t *ent;
    164 	lwpchan_entry_t **prev;
    165 	caddr_t addr;
    166 
    167 	mutex_enter(&p->p_lcp_lock);
    168 	lcp = p->p_lcp;
    169 	hashbucket = lcp->lwpchan_cache;
    170 	endbucket = hashbucket + lcp->lwpchan_size;
    171 	for (; hashbucket < endbucket; hashbucket++) {
    172 		if (hashbucket->lwpchan_chain == NULL)
    173 			continue;
    174 		mutex_enter(&hashbucket->lwpchan_lock);
    175 		prev = &hashbucket->lwpchan_chain;
    176 		/* check entire chain */
    177 		while ((ent = *prev) != NULL) {
    178 			addr = ent->lwpchan_addr;
    179 			if (start <= addr && addr < end) {
    180 				*prev = ent->lwpchan_next;
    181 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
    182 				    (ent->lwpchan_type & LOCK_ROBUST))
    183 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
    184 				kmem_free(ent, sizeof (*ent));
    185 				atomic_add_32(&lcp->lwpchan_entries, -1);
    186 			} else {
    187 				prev = &ent->lwpchan_next;
    188 			}
    189 		}
    190 		mutex_exit(&hashbucket->lwpchan_lock);
    191 	}
    192 	mutex_exit(&p->p_lcp_lock);
    193 }
    194 
    195 /*
    196  * Given an lwpchan cache pointer and a process virtual address,
    197  * return a pointer to the corresponding lwpchan hash bucket.
    198  */
    199 static lwpchan_hashbucket_t *
    200 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
    201 {
    202 	uint_t i;
    203 
    204 	/*
    205 	 * All user-level sync object addresses are 8-byte aligned.
    206 	 * Ignore the lowest 3 bits of the address and use the
    207 	 * higher-order 2*lwpchan_bits bits for the hash index.
    208 	 */
    209 	addr >>= 3;
    210 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
    211 	return (lcp->lwpchan_cache + i);
    212 }
    213 
    214 /*
    215  * (Re)allocate the per-process lwpchan cache.
    216  */
    217 static void
    218 lwpchan_alloc_cache(proc_t *p, uint_t bits)
    219 {
    220 	lwpchan_data_t *lcp;
    221 	lwpchan_data_t *old_lcp;
    222 	lwpchan_hashbucket_t *hashbucket;
    223 	lwpchan_hashbucket_t *endbucket;
    224 	lwpchan_hashbucket_t *newbucket;
    225 	lwpchan_entry_t *ent;
    226 	lwpchan_entry_t *next;
    227 	uint_t count;
    228 
    229 	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
    230 
    231 	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
    232 	lcp->lwpchan_bits = bits;
    233 	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
    234 	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
    235 	lcp->lwpchan_entries = 0;
    236 	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
    237 		sizeof (lwpchan_hashbucket_t), KM_SLEEP);
    238 	lcp->lwpchan_next_data = NULL;
    239 
    240 	mutex_enter(&p->p_lcp_lock);
    241 	if ((old_lcp = p->p_lcp) != NULL) {
    242 		if (old_lcp->lwpchan_bits >= bits) {
    243 			/* someone beat us to it */
    244 			mutex_exit(&p->p_lcp_lock);
    245 			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
    246 				sizeof (lwpchan_hashbucket_t));
    247 			kmem_free(lcp, sizeof (lwpchan_data_t));
    248 			return;
    249 		}
    250 		/*
    251 		 * Acquire all of the old hash table locks.
    252 		 */
    253 		hashbucket = old_lcp->lwpchan_cache;
    254 		endbucket = hashbucket + old_lcp->lwpchan_size;
    255 		for (; hashbucket < endbucket; hashbucket++)
    256 			mutex_enter(&hashbucket->lwpchan_lock);
    257 		/*
    258 		 * Move all of the old hash table entries to the
    259 		 * new hash table.  The new hash table has not yet
    260 		 * been installed so we don't need any of its locks.
    261 		 */
    262 		count = 0;
    263 		hashbucket = old_lcp->lwpchan_cache;
    264 		for (; hashbucket < endbucket; hashbucket++) {
    265 			ent = hashbucket->lwpchan_chain;
    266 			while (ent != NULL) {
    267 				next = ent->lwpchan_next;
    268 				newbucket = lwpchan_bucket(lcp,
    269 					(uintptr_t)ent->lwpchan_addr);
    270 				ent->lwpchan_next = newbucket->lwpchan_chain;
    271 				newbucket->lwpchan_chain = ent;
    272 				ent = next;
    273 				count++;
    274 			}
    275 			hashbucket->lwpchan_chain = NULL;
    276 		}
    277 		lcp->lwpchan_entries = count;
    278 	}
    279 
    280 	/*
    281 	 * Retire the old hash table.  We can't actually kmem_free() it
    282 	 * now because someone may still have a pointer to it.  Instead,
    283 	 * we link it onto the new hash table's list of retired hash tables.
    284 	 * The new hash table is double the size of the previous one, so
    285 	 * the total size of all retired hash tables is less than the size
    286 	 * of the new one.  exit() and exec() free the retired hash tables
    287 	 * (see lwpchan_destroy_cache(), below).
    288 	 */
    289 	lcp->lwpchan_next_data = old_lcp;
    290 
    291 	/*
    292 	 * As soon as we store the new lcp, future locking operations will
    293 	 * use it.  Therefore, we must ensure that all the state we've just
    294 	 * established reaches global visibility before the new lcp does.
    295 	 */
    296 	membar_producer();
    297 	p->p_lcp = lcp;
    298 
    299 	if (old_lcp != NULL) {
    300 		/*
    301 		 * Release all of the old hash table locks.
    302 		 */
    303 		hashbucket = old_lcp->lwpchan_cache;
    304 		for (; hashbucket < endbucket; hashbucket++)
    305 			mutex_exit(&hashbucket->lwpchan_lock);
    306 	}
    307 	mutex_exit(&p->p_lcp_lock);
    308 }
    309 
    310 /*
    311  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
    312  * Called when the process exits or execs.  All lwps except one have
    313  * exited so we need no locks here.
    314  */
    315 void
    316 lwpchan_destroy_cache(int exec)
    317 {
    318 	proc_t *p = curproc;
    319 	lwpchan_hashbucket_t *hashbucket;
    320 	lwpchan_hashbucket_t *endbucket;
    321 	lwpchan_data_t *lcp;
    322 	lwpchan_entry_t *ent;
    323 	lwpchan_entry_t *next;
    324 	uint16_t lockflg;
    325 
    326 	lcp = p->p_lcp;
    327 	p->p_lcp = NULL;
    328 
    329 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
    330 	hashbucket = lcp->lwpchan_cache;
    331 	endbucket = hashbucket + lcp->lwpchan_size;
    332 	for (; hashbucket < endbucket; hashbucket++) {
    333 		ent = hashbucket->lwpchan_chain;
    334 		hashbucket->lwpchan_chain = NULL;
    335 		while (ent != NULL) {
    336 			next = ent->lwpchan_next;
    337 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
    338 			    (ent->lwpchan_type & LOCK_ROBUST))
    339 				lwp_mutex_cleanup(ent, lockflg);
    340 			kmem_free(ent, sizeof (*ent));
    341 			ent = next;
    342 		}
    343 	}
    344 
    345 	while (lcp != NULL) {
    346 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
    347 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
    348 			sizeof (lwpchan_hashbucket_t));
    349 		kmem_free(lcp, sizeof (lwpchan_data_t));
    350 		lcp = next_lcp;
    351 	}
    352 }
    353 
    354 /*
    355  * Return zero when there is an entry in the lwpchan cache for the
    356  * given process virtual address and non-zero when there is not.
    357  * The returned non-zero value is the current length of the
    358  * hash chain plus one.  The caller holds the hash bucket lock.
    359  */
    360 static uint_t
    361 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
    362 	lwpchan_hashbucket_t *hashbucket)
    363 {
    364 	lwpchan_entry_t *ent;
    365 	uint_t count = 1;
    366 
    367 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
    368 		if (ent->lwpchan_addr == addr) {
    369 			if (ent->lwpchan_type != type ||
    370 			    ent->lwpchan_pool != pool) {
    371 				/*
    372 				 * This shouldn't happen, but might if the
    373 				 * process reuses its memory for different
    374 				 * types of sync objects.  We test first
    375 				 * to avoid grabbing the memory cache line.
    376 				 */
    377 				ent->lwpchan_type = (uint16_t)type;
    378 				ent->lwpchan_pool = (uint16_t)pool;
    379 			}
    380 			*lwpchan = ent->lwpchan_lwpchan;
    381 			return (0);
    382 		}
    383 		count++;
    384 	}
    385 	return (count);
    386 }
    387 
    388 /*
    389  * Return the cached lwpchan mapping if cached, otherwise insert
    390  * a virtual address to lwpchan mapping into the cache.
    391  */
    392 static int
    393 lwpchan_get_mapping(struct as *as, caddr_t addr,
    394 	int type, lwpchan_t *lwpchan, int pool)
    395 {
    396 	proc_t *p = curproc;
    397 	lwpchan_data_t *lcp;
    398 	lwpchan_hashbucket_t *hashbucket;
    399 	lwpchan_entry_t *ent;
    400 	memid_t	memid;
    401 	uint_t count;
    402 	uint_t bits;
    403 
    404 top:
    405 	/* initialize the lwpchan cache, if necesary */
    406 	if ((lcp = p->p_lcp) == NULL) {
    407 		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
    408 		goto top;
    409 	}
    410 	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
    411 	mutex_enter(&hashbucket->lwpchan_lock);
    412 	if (lcp != p->p_lcp) {
    413 		/* someone resized the lwpchan cache; start over */
    414 		mutex_exit(&hashbucket->lwpchan_lock);
    415 		goto top;
    416 	}
    417 	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
    418 		/* it's in the cache */
    419 		mutex_exit(&hashbucket->lwpchan_lock);
    420 		return (1);
    421 	}
    422 	mutex_exit(&hashbucket->lwpchan_lock);
    423 	if (as_getmemid(as, addr, &memid) != 0)
    424 		return (0);
    425 	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
    426 	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
    427 	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
    428 	mutex_enter(&hashbucket->lwpchan_lock);
    429 	if (lcp != p->p_lcp) {
    430 		/* someone resized the lwpchan cache; start over */
    431 		mutex_exit(&hashbucket->lwpchan_lock);
    432 		kmem_free(ent, sizeof (*ent));
    433 		goto top;
    434 	}
    435 	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
    436 	if (count == 0) {
    437 		/* someone else added this entry to the cache */
    438 		mutex_exit(&hashbucket->lwpchan_lock);
    439 		kmem_free(ent, sizeof (*ent));
    440 		return (1);
    441 	}
    442 	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
    443 	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
    444 		/* hash chain too long; reallocate the hash table */
    445 		mutex_exit(&hashbucket->lwpchan_lock);
    446 		kmem_free(ent, sizeof (*ent));
    447 		lwpchan_alloc_cache(p, bits + 1);
    448 		goto top;
    449 	}
    450 	ent->lwpchan_addr = addr;
    451 	ent->lwpchan_type = (uint16_t)type;
    452 	ent->lwpchan_pool = (uint16_t)pool;
    453 	ent->lwpchan_lwpchan = *lwpchan;
    454 	ent->lwpchan_next = hashbucket->lwpchan_chain;
    455 	hashbucket->lwpchan_chain = ent;
    456 	atomic_add_32(&lcp->lwpchan_entries, 1);
    457 	mutex_exit(&hashbucket->lwpchan_lock);
    458 	return (1);
    459 }
    460 
    461 /*
    462  * Return a unique pair of identifiers that corresponds to a
    463  * synchronization object's virtual address.  Process-shared
    464  * sync objects usually get vnode/offset from as_getmemid().
    465  */
    466 static int
    467 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
    468 {
    469 	/*
    470 	 * If the lwp synch object is defined to be process-private,
    471 	 * we just make the first field of the lwpchan be 'as' and
    472 	 * the second field be the synch object's virtual address.
    473 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
    474 	 * The lwpchan cache is used only for process-shared objects.
    475 	 */
    476 	if (!(type & USYNC_PROCESS)) {
    477 		lwpchan->lc_wchan0 = (caddr_t)as;
    478 		lwpchan->lc_wchan = addr;
    479 		return (1);
    480 	}
    481 
    482 	return (lwpchan_get_mapping(as, addr, type, lwpchan, pool));
    483 }
    484 
    485 static void
    486 lwp_block(lwpchan_t *lwpchan)
    487 {
    488 	kthread_t *t = curthread;
    489 	klwp_t *lwp = ttolwp(t);
    490 	sleepq_head_t *sqh;
    491 
    492 	thread_lock(t);
    493 	t->t_flag |= T_WAKEABLE;
    494 	t->t_lwpchan = *lwpchan;
    495 	t->t_sobj_ops = &lwp_sobj_ops;
    496 	t->t_release = 0;
    497 	sqh = lwpsqhash(lwpchan);
    498 	disp_lock_enter_high(&sqh->sq_lock);
    499 	CL_SLEEP(t);
    500 	DTRACE_SCHED(sleep);
    501 	THREAD_SLEEP(t, &sqh->sq_lock);
    502 	sleepq_insert(&sqh->sq_queue, t);
    503 	thread_unlock(t);
    504 	lwp->lwp_asleep = 1;
    505 	lwp->lwp_sysabort = 0;
    506 	lwp->lwp_ru.nvcsw++;
    507 	(void) new_mstate(curthread, LMS_SLEEP);
    508 }
    509 
    510 static kthread_t *
    511 lwpsobj_pi_owner(upimutex_t *up)
    512 {
    513 	return (up->upi_owner);
    514 }
    515 
    516 static struct upimutex *
    517 upi_get(upib_t *upibp, lwpchan_t *lcp)
    518 {
    519 	struct upimutex *upip;
    520 
    521 	for (upip = upibp->upib_first; upip != NULL;
    522 	    upip = upip->upi_nextchain) {
    523 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
    524 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
    525 			break;
    526 	}
    527 	return (upip);
    528 }
    529 
    530 static void
    531 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
    532 {
    533 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
    534 
    535 	/*
    536 	 * Insert upimutex at front of list. Maybe a bit unfair
    537 	 * but assume that not many lwpchans hash to the same
    538 	 * upimutextab bucket, i.e. the list of upimutexes from
    539 	 * upib_first is not too long.
    540 	 */
    541 	upimutex->upi_nextchain = upibp->upib_first;
    542 	upibp->upib_first = upimutex;
    543 }
    544 
    545 static void
    546 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
    547 {
    548 	struct upimutex **prev;
    549 
    550 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
    551 
    552 	prev = &upibp->upib_first;
    553 	while (*prev != upimutex) {
    554 		prev = &(*prev)->upi_nextchain;
    555 	}
    556 	*prev = upimutex->upi_nextchain;
    557 	upimutex->upi_nextchain = NULL;
    558 }
    559 
    560 /*
    561  * Add upimutex to chain of upimutexes held by curthread.
    562  * Returns number of upimutexes held by curthread.
    563  */
    564 static uint32_t
    565 upi_mylist_add(struct upimutex *upimutex)
    566 {
    567 	kthread_t *t = curthread;
    568 
    569 	/*
    570 	 * Insert upimutex at front of list of upimutexes owned by t. This
    571 	 * would match typical LIFO order in which nested locks are acquired
    572 	 * and released.
    573 	 */
    574 	upimutex->upi_nextowned = t->t_upimutex;
    575 	t->t_upimutex = upimutex;
    576 	t->t_nupinest++;
    577 	ASSERT(t->t_nupinest > 0);
    578 	return (t->t_nupinest);
    579 }
    580 
    581 /*
    582  * Delete upimutex from list of upimutexes owned by curthread.
    583  */
    584 static void
    585 upi_mylist_del(struct upimutex *upimutex)
    586 {
    587 	kthread_t *t = curthread;
    588 	struct upimutex **prev;
    589 
    590 	/*
    591 	 * Since the order in which nested locks are acquired and released,
    592 	 * is typically LIFO, and typical nesting levels are not too deep, the
    593 	 * following should not be expensive in the general case.
    594 	 */
    595 	prev = &t->t_upimutex;
    596 	while (*prev != upimutex) {
    597 		prev = &(*prev)->upi_nextowned;
    598 	}
    599 	*prev = upimutex->upi_nextowned;
    600 	upimutex->upi_nextowned = NULL;
    601 	ASSERT(t->t_nupinest > 0);
    602 	t->t_nupinest--;
    603 }
    604 
    605 /*
    606  * Returns true if upimutex is owned. Should be called only when upim points
    607  * to kmem which cannot disappear from underneath.
    608  */
    609 static int
    610 upi_owned(upimutex_t *upim)
    611 {
    612 	return (upim->upi_owner == curthread);
    613 }
    614 
    615 /*
    616  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
    617  */
    618 static struct upimutex *
    619 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
    620 {
    621 	lwpchan_t lwpchan;
    622 	upib_t *upibp;
    623 	struct upimutex *upimutex;
    624 
    625 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
    626 	    &lwpchan, LWPCHAN_MPPOOL))
    627 		return (NULL);
    628 
    629 	upibp = &UPI_CHAIN(lwpchan);
    630 	mutex_enter(&upibp->upib_lock);
    631 	upimutex = upi_get(upibp, &lwpchan);
    632 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
    633 		mutex_exit(&upibp->upib_lock);
    634 		return (NULL);
    635 	}
    636 	mutex_exit(&upibp->upib_lock);
    637 	return (upimutex);
    638 }
    639 
    640 /*
    641  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
    642  * no lock hand-off occurrs.
    643  */
    644 static void
    645 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
    646 {
    647 	turnstile_t *ts;
    648 	upib_t *upibp;
    649 	kthread_t *newowner;
    650 
    651 	upi_mylist_del(upimutex);
    652 	upibp = upimutex->upi_upibp;
    653 	mutex_enter(&upibp->upib_lock);
    654 	if (upimutex->upi_waiter != 0) { /* if waiters */
    655 		ts = turnstile_lookup(upimutex);
    656 		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
    657 			/* hand-off lock to highest prio waiter */
    658 			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
    659 			upimutex->upi_owner = newowner;
    660 			if (ts->ts_waiters == 1)
    661 				upimutex->upi_waiter = 0;
    662 			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
    663 			mutex_exit(&upibp->upib_lock);
    664 			return;
    665 		} else if (ts != NULL) {
    666 			/* LOCK_NOTRECOVERABLE: wakeup all */
    667 			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
    668 		} else {
    669 			/*
    670 			 * Misleading w bit. Waiters might have been
    671 			 * interrupted. No need to clear the w bit (upimutex
    672 			 * will soon be freed). Re-calculate PI from existing
    673 			 * waiters.
    674 			 */
    675 			turnstile_exit(upimutex);
    676 			turnstile_pi_recalc();
    677 		}
    678 	}
    679 	/*
    680 	 * no waiters, or LOCK_NOTRECOVERABLE.
    681 	 * remove from the bucket chain of upi mutexes.
    682 	 * de-allocate kernel memory (upimutex).
    683 	 */
    684 	upi_chain_del(upimutex->upi_upibp, upimutex);
    685 	mutex_exit(&upibp->upib_lock);
    686 	kmem_free(upimutex, sizeof (upimutex_t));
    687 }
    688 
    689 static int
    690 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
    691 {
    692 	label_t ljb;
    693 	int error = 0;
    694 	lwpchan_t lwpchan;
    695 	uint16_t flag;
    696 	upib_t *upibp;
    697 	volatile struct upimutex *upimutex = NULL;
    698 	turnstile_t *ts;
    699 	uint32_t nupinest;
    700 	volatile int upilocked = 0;
    701 
    702 	if (on_fault(&ljb)) {
    703 		if (upilocked)
    704 			upimutex_unlock((upimutex_t *)upimutex, 0);
    705 		error = EFAULT;
    706 		goto out;
    707 	}
    708 	/*
    709 	 * The apparent assumption made in implementing other _lwp_* synch
    710 	 * primitives, is that get_lwpchan() does not return a unique cookie
    711 	 * for the case where 2 processes (one forked from the other) point
    712 	 * at the same underlying object, which is typed USYNC_PROCESS, but
    713 	 * mapped MAP_PRIVATE, since the object has not yet been written to,
    714 	 * in the child process.
    715 	 *
    716 	 * Since get_lwpchan() has been fixed, it is not necessary to do the
    717 	 * dummy writes to force a COW fault as in other places (which should
    718 	 * be fixed).
    719 	 */
    720 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
    721 	    &lwpchan, LWPCHAN_MPPOOL)) {
    722 		error = EFAULT;
    723 		goto out;
    724 	}
    725 	upibp = &UPI_CHAIN(lwpchan);
    726 retry:
    727 	mutex_enter(&upibp->upib_lock);
    728 	upimutex = upi_get(upibp, &lwpchan);
    729 	if (upimutex == NULL)  {
    730 		/* lock available since lwpchan has no upimutex */
    731 		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
    732 		upi_chain_add(upibp, (upimutex_t *)upimutex);
    733 		upimutex->upi_owner = curthread; /* grab lock */
    734 		upimutex->upi_upibp = upibp;
    735 		upimutex->upi_vaddr = lp;
    736 		upimutex->upi_lwpchan = lwpchan;
    737 		mutex_exit(&upibp->upib_lock);
    738 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
    739 		upilocked = 1;
    740 		fuword16_noerr(&lp->mutex_flag, &flag);
    741 		if (nupinest > maxnestupimx &&
    742 		    secpolicy_resource(CRED()) != 0) {
    743 			upimutex_unlock((upimutex_t *)upimutex, flag);
    744 			error = ENOMEM;
    745 			goto out;
    746 		}
    747 		if (flag & LOCK_NOTRECOVERABLE) {
    748 			/*
    749 			 * Since the setting of LOCK_NOTRECOVERABLE
    750 			 * was done under the high-level upi mutex,
    751 			 * in lwp_upimutex_unlock(), this flag needs to
    752 			 * be checked while holding the upi mutex.
    753 			 * If set, this thread should return without
    754 			 * the lock held, and with the right error code.
    755 			 */
    756 			upimutex_unlock((upimutex_t *)upimutex, flag);
    757 			upilocked = 0;
    758 			error = ENOTRECOVERABLE;
    759 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
    760 			if (flag & LOCK_OWNERDEAD)
    761 				error = EOWNERDEAD;
    762 			else if (type & USYNC_PROCESS_ROBUST)
    763 				error = ELOCKUNMAPPED;
    764 			else
    765 				error = EOWNERDEAD;
    766 		}
    767 		goto out;
    768 	}
    769 	/*
    770 	 * If a upimutex object exists, it must have an owner.
    771 	 * This is due to lock hand-off, and release of upimutex when no
    772 	 * waiters are present at unlock time,
    773 	 */
    774 	ASSERT(upimutex->upi_owner != NULL);
    775 	if (upimutex->upi_owner == curthread) {
    776 		/*
    777 		 * The user wrapper can check if the mutex type is
    778 		 * ERRORCHECK: if not, it should stall at user-level.
    779 		 * If so, it should return the error code.
    780 		 */
    781 		mutex_exit(&upibp->upib_lock);
    782 		error = EDEADLK;
    783 		goto out;
    784 	}
    785 	if (try == UPIMUTEX_TRY) {
    786 		mutex_exit(&upibp->upib_lock);
    787 		error = EBUSY;
    788 		goto out;
    789 	}
    790 	/*
    791 	 * Block for the lock.
    792 	 * Put the lwp in an orderly state for debugging.
    793 	 * Calling prstop() has to be done here, and not in
    794 	 * turnstile_block(), since the preceding call to
    795 	 * turnstile_lookup() raises the PIL to a level
    796 	 * at which calls to prstop() should not be made.
    797 	 */
    798 	if ((error = lwptp->lwpt_time_error) != 0) {
    799 		/*
    800 		 * The SUSV3 Posix spec is very clear that we
    801 		 * should get no error from validating the
    802 		 * timer until we would actually sleep.
    803 		 */
    804 		mutex_exit(&upibp->upib_lock);
    805 		goto out;
    806 	}
    807 	prstop(PR_REQUESTED, 0);
    808 	if (lwptp->lwpt_tsp != NULL) {
    809 		/*
    810 		 * If we successfully queue the timeout
    811 		 * (lwp_timer_enqueue() returns zero),
    812 		 * then don't drop t_delay_lock until we are
    813 		 * on the sleep queue (in turnstile_block()).
    814 		 * Otherwise we will get an immediate timeout
    815 		 * when we attempt to sleep in turnstile_block().
    816 		 */
    817 		mutex_enter(&curthread->t_delay_lock);
    818 		if (lwp_timer_enqueue(lwptp) != 0)
    819 			mutex_exit(&curthread->t_delay_lock);
    820 	}
    821 	/*
    822 	 * Now, set the waiter bit and block for the lock in turnstile_block().
    823 	 * No need to preserve the previous wbit since a lock try is not
    824 	 * attempted after setting the wait bit. Wait bit is set under
    825 	 * the upib_lock, which is not released until the turnstile lock
    826 	 * is acquired. Say, the upimutex is L:
    827 	 *
    828 	 * 1. upib_lock is held so the waiter does not have to retry L after
    829 	 *    setting the wait bit: since the owner has to grab the upib_lock
    830 	 *    to unlock L, it will certainly see the wait bit set.
    831 	 * 2. upib_lock is not released until the turnstile lock is acquired.
    832 	 *    This is the key to preventing a missed wake-up. Otherwise, the
    833 	 *    owner could acquire the upib_lock, and the tc_lock, to call
    834 	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
    835 	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
    836 	 *    find this waiter, resulting in the missed wakeup.
    837 	 * 3. The upib_lock, being a kernel mutex, cannot be released while
    838 	 *    holding the tc_lock (since mutex_exit() could need to acquire
    839 	 *    the same tc_lock)...and so is held when calling turnstile_block().
    840 	 *    The address of upib_lock is passed to turnstile_block() which
    841 	 *    releases it after releasing all turnstile locks, and before going
    842 	 *    to sleep in swtch().
    843 	 * 4. The waiter value cannot be a count of waiters, because a waiter
    844 	 *    can be interrupted. The interrupt occurs under the tc_lock, at
    845 	 *    which point, the upib_lock cannot be locked, to decrement waiter
    846 	 *    count. So, just treat the waiter state as a bit, not a count.
    847 	 */
    848 	ts = turnstile_lookup((upimutex_t *)upimutex);
    849 	upimutex->upi_waiter = 1;
    850 	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
    851 	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
    852 	/*
    853 	 * Hand-off implies that we wakeup holding the lock, except when:
    854 	 *	- deadlock is detected
    855 	 *	- lock is not recoverable
    856 	 *	- we got an interrupt or timeout
    857 	 * If we wake up due to an interrupt or timeout, we may
    858 	 * or may not be holding the lock due to mutex hand-off.
    859 	 * Use lwp_upimutex_owned() to check if we do hold the lock.
    860 	 */
    861 	if (error != 0) {
    862 		if ((error == EINTR || error == ETIME) &&
    863 		    (upimutex = lwp_upimutex_owned(lp, type))) {
    864 			/*
    865 			 * Unlock and return - the re-startable syscall will
    866 			 * try the lock again if we got EINTR.
    867 			 */
    868 			(void) upi_mylist_add((upimutex_t *)upimutex);
    869 			upimutex_unlock((upimutex_t *)upimutex, 0);
    870 		}
    871 		/*
    872 		 * The only other possible error is EDEADLK.  If so, upimutex
    873 		 * is valid, since its owner is deadlocked with curthread.
    874 		 */
    875 		ASSERT(error == EINTR || error == ETIME ||
    876 		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
    877 		ASSERT(!lwp_upimutex_owned(lp, type));
    878 		goto out;
    879 	}
    880 	if (lwp_upimutex_owned(lp, type)) {
    881 		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
    882 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
    883 		upilocked = 1;
    884 	}
    885 	/*
    886 	 * Now, need to read the user-level lp->mutex_flag to do the following:
    887 	 *
    888 	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
    889 	 *   should be returned.
    890 	 * - if lock isn't held, check if ENOTRECOVERABLE should
    891 	 *   be returned.
    892 	 *
    893 	 * Now, either lp->mutex_flag is readable or it's not. If not
    894 	 * readable, the on_fault path will cause a return with EFAULT
    895 	 * as it should.  If it is readable, the state of the flag
    896 	 * encodes the robustness state of the lock:
    897 	 *
    898 	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
    899 	 * or LOCK_UNMAPPED setting will influence the return code
    900 	 * appropriately.  If the upimutex is not locked here, this
    901 	 * could be due to a spurious wake-up or a NOTRECOVERABLE
    902 	 * event.  The flag's setting can be used to distinguish
    903 	 * between these two events.
    904 	 */
    905 	fuword16_noerr(&lp->mutex_flag, &flag);
    906 	if (upilocked) {
    907 		/*
    908 		 * If the thread wakes up from turnstile_block with the lock
    909 		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
    910 		 * since it would not have been handed-off the lock.
    911 		 * So, no need to check for this case.
    912 		 */
    913 		if (nupinest > maxnestupimx &&
    914 		    secpolicy_resource(CRED()) != 0) {
    915 			upimutex_unlock((upimutex_t *)upimutex, flag);
    916 			upilocked = 0;
    917 			error = ENOMEM;
    918 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
    919 			if (flag & LOCK_OWNERDEAD)
    920 				error = EOWNERDEAD;
    921 			else if (type & USYNC_PROCESS_ROBUST)
    922 				error = ELOCKUNMAPPED;
    923 			else
    924 				error = EOWNERDEAD;
    925 		}
    926 	} else {
    927 		/*
    928 		 * Wake-up without the upimutex held. Either this is a
    929 		 * spurious wake-up (due to signals, forkall(), whatever), or
    930 		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
    931 		 * of the mutex flag can be used to distinguish between the
    932 		 * two events.
    933 		 */
    934 		if (flag & LOCK_NOTRECOVERABLE) {
    935 			error = ENOTRECOVERABLE;
    936 		} else {
    937 			/*
    938 			 * Here, the flag could be set to LOCK_OWNERDEAD or
    939 			 * not. In both cases, this is a spurious wakeup,
    940 			 * since the upi lock is not held, but the thread
    941 			 * has returned from turnstile_block().
    942 			 *
    943 			 * The user flag could be LOCK_OWNERDEAD if, at the
    944 			 * same time as curthread having been woken up
    945 			 * spuriously, the owner (say Tdead) has died, marked
    946 			 * the mutex flag accordingly, and handed off the lock
    947 			 * to some other waiter (say Tnew). curthread just
    948 			 * happened to read the flag while Tnew has yet to deal
    949 			 * with the owner-dead event.
    950 			 *
    951 			 * In this event, curthread should retry the lock.
    952 			 * If Tnew is able to cleanup the lock, curthread
    953 			 * will eventually get the lock with a zero error code.
    954 			 * If Tnew is unable to cleanup, its eventual call to
    955 			 * unlock the lock will result in the mutex flag being
    956 			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
    957 			 * all waiters, including curthread, which will then
    958 			 * eventually return ENOTRECOVERABLE due to the above
    959 			 * check.
    960 			 *
    961 			 * Of course, if the user-flag is not set with
    962 			 * LOCK_OWNERDEAD, retrying is the thing to do, since
    963 			 * this is definitely a spurious wakeup.
    964 			 */
    965 			goto retry;
    966 		}
    967 	}
    968 
    969 out:
    970 	no_fault();
    971 	return (error);
    972 }
    973 
    974 
    975 static int
    976 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
    977 {
    978 	label_t ljb;
    979 	int error = 0;
    980 	lwpchan_t lwpchan;
    981 	uint16_t flag;
    982 	upib_t *upibp;
    983 	volatile struct upimutex *upimutex = NULL;
    984 	volatile int upilocked = 0;
    985 
    986 	if (on_fault(&ljb)) {
    987 		if (upilocked)
    988 			upimutex_unlock((upimutex_t *)upimutex, 0);
    989 		error = EFAULT;
    990 		goto out;
    991 	}
    992 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
    993 	    &lwpchan, LWPCHAN_MPPOOL)) {
    994 		error = EFAULT;
    995 		goto out;
    996 	}
    997 	upibp = &UPI_CHAIN(lwpchan);
    998 	mutex_enter(&upibp->upib_lock);
    999 	upimutex = upi_get(upibp, &lwpchan);
   1000 	/*
   1001 	 * If the lock is not held, or the owner is not curthread, return
   1002 	 * error. The user-level wrapper can return this error or stall,
   1003 	 * depending on whether mutex is of ERRORCHECK type or not.
   1004 	 */
   1005 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
   1006 		mutex_exit(&upibp->upib_lock);
   1007 		error = EPERM;
   1008 		goto out;
   1009 	}
   1010 	mutex_exit(&upibp->upib_lock); /* release for user memory access */
   1011 	upilocked = 1;
   1012 	fuword16_noerr(&lp->mutex_flag, &flag);
   1013 	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
   1014 		/*
   1015 		 * transition mutex to the LOCK_NOTRECOVERABLE state.
   1016 		 */
   1017 		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
   1018 		flag |= LOCK_NOTRECOVERABLE;
   1019 		suword16_noerr(&lp->mutex_flag, flag);
   1020 	}
   1021 	if (type & USYNC_PROCESS)
   1022 		suword32_noerr(&lp->mutex_ownerpid, 0);
   1023 	upimutex_unlock((upimutex_t *)upimutex, flag);
   1024 	upilocked = 0;
   1025 out:
   1026 	no_fault();
   1027 	return (error);
   1028 }
   1029 
   1030 /*
   1031  * Clear the contents of a user-level mutex; return the flags.
   1032  * Used only by upi_dead() and lwp_mutex_cleanup(), below.
   1033  */
   1034 static uint16_t
   1035 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
   1036 {
   1037 	uint16_t flag;
   1038 
   1039 	fuword16_noerr(&lp->mutex_flag, &flag);
   1040 	if ((flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) == 0) {
   1041 		flag |= lockflg;
   1042 		suword16_noerr(&lp->mutex_flag, flag);
   1043 	}
   1044 	suword32_noerr((uint32_t *)&lp->mutex_owner, 0);
   1045 	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, 0);
   1046 	suword32_noerr(&lp->mutex_ownerpid, 0);
   1047 	suword8_noerr(&lp->mutex_rcount, 0);
   1048 
   1049 	return (flag);
   1050 }
   1051 
   1052 /*
   1053  * Mark user mutex state, corresponding to kernel upimutex,
   1054  * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
   1055  */
   1056 static int
   1057 upi_dead(upimutex_t *upip, uint16_t lockflg)
   1058 {
   1059 	label_t ljb;
   1060 	int error = 0;
   1061 	lwp_mutex_t *lp;
   1062 
   1063 	if (