Home | History | Annotate | Download | only in threads
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #include "lint.h"
     28 #include "thr_uberdata.h"
     29 #include <sys/rtpriocntl.h>
     30 #include <sys/sdt.h>
     31 #include <atomic.h>
     32 
     33 #if defined(THREAD_DEBUG)
     34 #define	INCR32(x)	(((x) != UINT32_MAX)? (x)++ : 0)
     35 #define	INCR(x)		((x)++)
     36 #define	DECR(x)		((x)--)
     37 #define	MAXINCR(m, x)	((m < ++x)? (m = x) : 0)
     38 #else
     39 #define	INCR32(x)
     40 #define	INCR(x)
     41 #define	DECR(x)
     42 #define	MAXINCR(m, x)
     43 #endif
     44 
/*
 * This mutex is initialized to be held by lwp#1.
 * It is used to block a thread that has returned from a mutex_lock()
 * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
 * The lock byte is actually taken in mutex_setup(), which panics if
 * stall_mutex cannot be acquired.
 */
mutex_t	stall_mutex = DEFAULTMUTEX;

/*
 * Forward declarations of file-local helpers.
 * mutex_queuelock_adaptive() is called by spin_lock_set() before its
 * definition appears.
 */
static int shared_mutex_held(mutex_t *);
static int mutex_queuelock_adaptive(mutex_t *);
static void mutex_wakeup_all(mutex_t *);
     55 
     56 /*
     57  * Lock statistics support functions.
     58  */
     59 void
     60 record_begin_hold(tdb_mutex_stats_t *msp)
     61 {
     62 	tdb_incr(msp->mutex_lock);
     63 	msp->mutex_begin_hold = gethrtime();
     64 }
     65 
     66 hrtime_t
     67 record_hold_time(tdb_mutex_stats_t *msp)
     68 {
     69 	hrtime_t now = gethrtime();
     70 
     71 	if (msp->mutex_begin_hold)
     72 		msp->mutex_hold_time += now - msp->mutex_begin_hold;
     73 	msp->mutex_begin_hold = 0;
     74 	return (now);
     75 }
     76 
     77 /*
     78  * Called once at library initialization.
     79  */
     80 void
     81 mutex_setup(void)
     82 {
     83 	if (set_lock_byte(&stall_mutex.mutex_lockw))
     84 		thr_panic("mutex_setup() cannot acquire stall_mutex");
     85 	stall_mutex.mutex_owner = (uintptr_t)curthread;
     86 }
     87 
/*
 * The default spin count of 1000 is experimentally determined.
 * On sun4u machines with any number of processors it could be raised
 * to 10,000 but that (experimentally) makes almost no difference.
 * The environment variable:
 *	_THREAD_ADAPTIVE_SPIN=count
 * can be used to override and set the count in the range [0 .. 1,000,000].
 */
int	thread_adaptive_spin = 1000;
/*
 * NOTE(review): presumably the cap on the number of threads allowed to
 * spin on one lock at a time (cf. spinners_incr()) -- confirm.
 */
uint_t	thread_max_spinners = 100;
/*
 * Non-zero enables the expensive per-thread checks in QVERIFY()
 * (THREAD_DEBUG builds only).
 */
int	thread_queue_verify = 0;
/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably the number of CPUs -- confirm.
 */
static	int	ncpus;

/*
 * Distinguish spinning for queue locks from spinning for regular locks.
 * We try harder to acquire queue locks by spinning.
 * The environment variable:
 *	_THREAD_QUEUE_SPIN=count
 * can be used to override and set the count in the range [0 .. 1,000,000].
 */
int	thread_queue_spin = 10000;

/*
 * Every attribute flag that may be OR-ed into a mutex type.
 * mutex_init() masks these bits off to recover the base type.
 */
#define	ALL_ATTRIBUTES				\
	(LOCK_RECURSIVE | LOCK_ERRORCHECK |	\
	LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT |	\
	LOCK_ROBUST)
    114 
    115 /*
    116  * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
    117  * augmented by zero or more the flags:
    118  *	LOCK_RECURSIVE
    119  *	LOCK_ERRORCHECK
    120  *	LOCK_PRIO_INHERIT
    121  *	LOCK_PRIO_PROTECT
    122  *	LOCK_ROBUST
    123  */
    124 #pragma weak _mutex_init = mutex_init
    125 /* ARGSUSED2 */
    126 int
    127 mutex_init(mutex_t *mp, int type, void *arg)
    128 {
    129 	int basetype = (type & ~ALL_ATTRIBUTES);
    130 	const pcclass_t *pccp;
    131 	int error = 0;
    132 	int ceil;
    133 
    134 	if (basetype == USYNC_PROCESS_ROBUST) {
    135 		/*
    136 		 * USYNC_PROCESS_ROBUST is a deprecated historical type.
    137 		 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
    138 		 * retain the USYNC_PROCESS_ROBUST flag so we can return
    139 		 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
    140 		 * mutexes will ever draw ELOCKUNMAPPED).
    141 		 */
    142 		type |= (USYNC_PROCESS | LOCK_ROBUST);
    143 		basetype = USYNC_PROCESS;
    144 	}
    145 
    146 	if (type & LOCK_PRIO_PROTECT)
    147 		pccp = get_info_by_policy(SCHED_FIFO);
    148 	if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
    149 	    (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
    150 	    == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
    151 	    ((type & LOCK_PRIO_PROTECT) &&
    152 	    ((ceil = *(int *)arg) < pccp->pcc_primin ||
    153 	    ceil > pccp->pcc_primax))) {
    154 		error = EINVAL;
    155 	} else if (type & LOCK_ROBUST) {
    156 		/*
    157 		 * Callers of mutex_init() with the LOCK_ROBUST attribute
    158 		 * are required to pass an initially all-zero mutex.
    159 		 * Multiple calls to mutex_init() are allowed; all but
    160 		 * the first return EBUSY.  A call to mutex_init() is
    161 		 * allowed to make an inconsistent robust lock consistent
    162 		 * (for historical usage, even though the proper interface
    163 		 * for this is mutex_consistent()).  Note that we use
    164 		 * atomic_or_16() to set the LOCK_INITED flag so as
    165 		 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
    166 		 */
    167 		if (!(mp->mutex_flag & LOCK_INITED)) {
    168 			mp->mutex_type = (uint8_t)type;
    169 			atomic_or_16(&mp->mutex_flag, LOCK_INITED);
    170 			mp->mutex_magic = MUTEX_MAGIC;
    171 		} else if (type != mp->mutex_type ||
    172 		    ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
    173 			error = EINVAL;
    174 		} else if (mutex_consistent(mp) != 0) {
    175 			error = EBUSY;
    176 		}
    177 		/* register a process robust mutex with the kernel */
    178 		if (basetype == USYNC_PROCESS)
    179 			register_lock(mp);
    180 	} else {
    181 		(void) memset(mp, 0, sizeof (*mp));
    182 		mp->mutex_type = (uint8_t)type;
    183 		mp->mutex_flag = LOCK_INITED;
    184 		mp->mutex_magic = MUTEX_MAGIC;
    185 	}
    186 
    187 	if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
    188 		mp->mutex_ceiling = ceil;
    189 	}
    190 
    191 	/*
    192 	 * This should be at the beginning of the function,
    193 	 * but for the sake of old broken applications that
    194 	 * do not have proper alignment for their mutexes
    195 	 * (and don't check the return code from mutex_init),
    196 	 * we put it here, after initializing the mutex regardless.
    197 	 */
    198 	if (error == 0 &&
    199 	    ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
    200 	    curthread->ul_misaligned == 0)
    201 		error = EINVAL;
    202 
    203 	return (error);
    204 }
    205 
    206 /*
    207  * Delete mp from list of ceiling mutexes owned by curthread.
    208  * Return 1 if the head of the chain was updated.
    209  */
    210 int
    211 _ceil_mylist_del(mutex_t *mp)
    212 {
    213 	ulwp_t *self = curthread;
    214 	mxchain_t **mcpp;
    215 	mxchain_t *mcp;
    216 
    217 	for (mcpp = &self->ul_mxchain;
    218 	    (mcp = *mcpp) != NULL;
    219 	    mcpp = &mcp->mxchain_next) {
    220 		if (mcp->mxchain_mx == mp) {
    221 			*mcpp = mcp->mxchain_next;
    222 			lfree(mcp, sizeof (*mcp));
    223 			return (mcpp == &self->ul_mxchain);
    224 		}
    225 	}
    226 	return (0);
    227 }
    228 
    229 /*
    230  * Add mp to the list of ceiling mutexes owned by curthread.
    231  * Return ENOMEM if no memory could be allocated.
    232  */
    233 int
    234 _ceil_mylist_add(mutex_t *mp)
    235 {
    236 	ulwp_t *self = curthread;
    237 	mxchain_t *mcp;
    238 
    239 	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
    240 		return (ENOMEM);
    241 	mcp->mxchain_mx = mp;
    242 	mcp->mxchain_next = self->ul_mxchain;
    243 	self->ul_mxchain = mcp;
    244 	return (0);
    245 }
    246 
    247 /*
    248  * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
    249  */
    250 static void
    251 set_rt_priority(ulwp_t *self, int prio)
    252 {
    253 	pcparms_t pcparm;
    254 
    255 	pcparm.pc_cid = self->ul_rtclassid;
    256 	((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
    257 	((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
    258 	(void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
    259 }
    260 
    261 /*
    262  * Inherit priority from ceiling.
    263  * This changes the effective priority, not the assigned priority.
    264  */
    265 void
    266 _ceil_prio_inherit(int prio)
    267 {
    268 	ulwp_t *self = curthread;
    269 
    270 	self->ul_epri = prio;
    271 	set_rt_priority(self, prio);
    272 }
    273 
    274 /*
    275  * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
    276  * if holding at least one ceiling lock.  If no ceiling locks are held at this
    277  * point, disinherit completely, reverting back to assigned priority.
    278  */
    279 void
    280 _ceil_prio_waive(void)
    281 {
    282 	ulwp_t *self = curthread;
    283 	mxchain_t *mcp = self->ul_mxchain;
    284 	int prio;
    285 
    286 	if (mcp == NULL) {
    287 		prio = self->ul_pri;
    288 		self->ul_epri = 0;
    289 	} else {
    290 		prio = mcp->mxchain_mx->mutex_ceiling;
    291 		self->ul_epri = prio;
    292 	}
    293 	set_rt_priority(self, prio);
    294 }
    295 
    296 /*
    297  * Clear the lock byte.  Retain the waiters byte and the spinners byte.
    298  * Return the old value of the lock word.
    299  */
    300 static uint32_t
    301 clear_lockbyte(volatile uint32_t *lockword)
    302 {
    303 	uint32_t old;
    304 	uint32_t new;
    305 
    306 	do {
    307 		old = *lockword;
    308 		new = old & ~LOCKMASK;
    309 	} while (atomic_cas_32(lockword, old, new) != old);
    310 
    311 	return (old);
    312 }
    313 
    314 /*
    315  * Same as clear_lockbyte(), but operates on mutex_lockword64.
    316  * The mutex_ownerpid field is cleared along with the lock byte.
    317  */
    318 static uint64_t
    319 clear_lockbyte64(volatile uint64_t *lockword64)
    320 {
    321 	uint64_t old;
    322 	uint64_t new;
    323 
    324 	do {
    325 		old = *lockword64;
    326 		new = old & ~LOCKMASK64;
    327 	} while (atomic_cas_64(lockword64, old, new) != old);
    328 
    329 	return (old);
    330 }
    331 
    332 /*
    333  * Similar to set_lock_byte(), which only tries to set the lock byte.
    334  * Here, we attempt to set the lock byte AND the mutex_ownerpid, keeping
    335  * the remaining bytes constant.  This atomic operation is required for the
    336  * correctness of process-shared robust locks, otherwise there would be
    337  * a window or vulnerability in which the lock byte had been set but the
    338  * mutex_ownerpid had not yet been set.  If the process were to die in
    339  * this window of vulnerability (due to some other thread calling exit()
    340  * or the process receiving a fatal signal), the mutex would be left locked
    341  * but without a process-ID to determine which process was holding the lock.
    342  * The kernel would then be unable to mark the robust mutex as LOCK_OWNERDEAD
    343  * when the process died.  For all other cases of process-shared locks, this
    344  * operation is just a convenience, for the sake of common code.
    345  *
    346  * This operation requires process-shared robust locks to be properly
    347  * aligned on an 8-byte boundary, at least on sparc machines, lest the
    348  * operation incur an alignment fault.  This is automatic when locks
    349  * are declared properly using the mutex_t or pthread_mutex_t data types
    350  * and the application does not allocate dynamic memory on less than an
    351  * 8-byte boundary.  See the 'horrible hack' comments below for cases
    352  * dealing with such broken applications.
    353  */
    354 static int
    355 set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
    356 {
    357 	uint64_t old;
    358 	uint64_t new;
    359 
    360 	old = *lockword64 & ~LOCKMASK64;
    361 	new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
    362 	if (atomic_cas_64(lockword64, old, new) == old)
    363 		return (LOCKCLEAR);
    364 
    365 	return (LOCKSET);
    366 }
    367 
    368 /*
    369  * Increment the spinners count in the mutex lock word.
    370  * Return 0 on success.  Return -1 if the count would overflow.
    371  */
    372 static int
    373 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
    374 {
    375 	uint32_t old;
    376 	uint32_t new;
    377 
    378 	do {
    379 		old = *lockword;
    380 		if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
    381 			return (-1);
    382 		new = old + (1 << SPINNERSHIFT);
    383 	} while (atomic_cas_32(lockword, old, new) != old);
    384 
    385 	return (0);
    386 }
    387 
    388 /*
    389  * Decrement the spinners count in the mutex lock word.
    390  * Return the new value of the lock word.
    391  */
    392 static uint32_t
    393 spinners_decr(volatile uint32_t *lockword)
    394 {
    395 	uint32_t old;
    396 	uint32_t new;
    397 
    398 	do {
    399 		new = old = *lockword;
    400 		if (new & SPINNERMASK)
    401 			new -= (1 << SPINNERSHIFT);
    402 	} while (atomic_cas_32(lockword, old, new) != old);
    403 
    404 	return (new);
    405 }
    406 
    407 /*
    408  * Non-preemptive spin locks.  Used by queue_lock().
    409  * No lock statistics are gathered for these locks.
    410  * No DTrace probes are provided for these locks.
    411  */
    412 void
    413 spin_lock_set(mutex_t *mp)
    414 {
    415 	ulwp_t *self = curthread;
    416 
    417 	no_preempt(self);
    418 	if (set_lock_byte(&mp->mutex_lockw) == 0) {
    419 		mp->mutex_owner = (uintptr_t)self;
    420 		return;
    421 	}
    422 	/*
    423 	 * Spin for a while, attempting to acquire the lock.
    424 	 */
    425 	INCR32(self->ul_spin_lock_spin);
    426 	if (mutex_queuelock_adaptive(mp) == 0 ||
    427 	    set_lock_byte(&mp->mutex_lockw) == 0) {
    428 		mp->mutex_owner = (uintptr_t)self;
    429 		return;
    430 	}
    431 	/*
    432 	 * Try harder if we were previously at a no premption level.
    433 	 */
    434 	if (self->ul_preempt > 1) {
    435 		INCR32(self->ul_spin_lock_spin2);
    436 		if (mutex_queuelock_adaptive(mp) == 0 ||
    437 		    set_lock_byte(&mp->mutex_lockw) == 0) {
    438 			mp->mutex_owner = (uintptr_t)self;
    439 			return;
    440 		}
    441 	}
    442 	/*
    443 	 * Give up and block in the kernel for the mutex.
    444 	 */
    445 	INCR32(self->ul_spin_lock_sleep);
    446 	(void) ___lwp_mutex_timedlock(mp, NULL, self);
    447 }
    448 
    449 void
    450 spin_lock_clear(mutex_t *mp)
    451 {
    452 	ulwp_t *self = curthread;
    453 
    454 	mp->mutex_owner = 0;
    455 	if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
    456 		(void) ___lwp_mutex_wakeup(mp, 0);
    457 		INCR32(self->ul_spin_lock_wakeup);
    458 	}
    459 	preempt(self);
    460 }
    461 
    462 /*
    463  * Allocate the sleep queue hash table.
    464  */
    465 void
    466 queue_alloc(void)
    467 {
    468 	ulwp_t *self = curthread;
    469 	uberdata_t *udp = self->ul_uberdata;
    470 	queue_head_t *qp;
    471 	void *data;
    472 	int i;
    473 
    474 	/*
    475 	 * No locks are needed; we call here only when single-threaded.
    476 	 */
    477 	ASSERT(self == udp->ulwp_one);
    478 	ASSERT(!udp->uberflags.uf_mt);
    479 	if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
    480 	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
    481 	    == MAP_FAILED)
    482 		thr_panic("cannot allocate thread queue_head table");
    483 	udp->queue_head = qp = (queue_head_t *)data;
    484 	for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
    485 		qp->qh_type = (i < QHASHSIZE)? MX : CV;
    486 		qp->qh_lock.mutex_flag = LOCK_INITED;
    487 		qp->qh_lock.mutex_magic = MUTEX_MAGIC;
    488 		qp->qh_hlist = &qp->qh_def_root;
    489 #if defined(THREAD_DEBUG)
    490 		qp->qh_hlen = 1;
    491 		qp->qh_hmax = 1;
    492 #endif
    493 	}
    494 }
    495 
#if defined(THREAD_DEBUG)

/*
 * Debugging: verify correctness of a sleep queue.
 *
 * The caller must own qp->qh_lock.  The cheap checks always run:
 * qp lies inside the queue_head[] table, every queue root on the hash
 * list has head and tail both set or both NULL, the number of roots
 * matches qh_hlen (and does not exceed qh_hmax), and qh_type agrees with
 * the half of the table that qp lives in.
 *
 * When thread_queue_verify is non-zero, every thread on every root is
 * also walked and checked, and the total is compared against qh_qlen.
 */
void
QVERIFY(queue_head_t *qp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	queue_root_t *qrp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	uint_t index;
	uint32_t cnt;
	char qtype;
	void *wchan;

	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
	/* first pass: count the queue roots on this head's hash list */
	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		cnt++;
		ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
		    (qrp->qr_head == NULL && qrp->qr_tail == NULL));
	}
	ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);
	/* the lower half of the table holds MX heads, the upper half CV */
	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
	ASSERT(qp->qh_type == qtype);
	if (!thread_queue_verify)
		return;
	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
	/* second pass: cnt now counts queued threads, not roots */
	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		for (prev = NULL, ulwp = qrp->qr_head; ulwp != NULL;
		    prev = ulwp, ulwp = ulwp->ul_link) {
			cnt++;
			/* a writer may only be preceded by another writer */
			if (ulwp->ul_writer)
				ASSERT(prev == NULL || prev->ul_writer);
			ASSERT(ulwp->ul_qtype == qtype);
			ASSERT(ulwp->ul_wchan != NULL);
			ASSERT(ulwp->ul_sleepq == qp);
			wchan = ulwp->ul_wchan;
			ASSERT(qrp->qr_wchan == wchan);
			/* the thread's wait channel must hash to this head */
			index = QUEUE_HASH(wchan, qtype);
			ASSERT(&udp->queue_head[index] == qp);
		}
		ASSERT(qrp->qr_tail == prev);
	}
	ASSERT(qp->qh_qlen == cnt);
}

#else	/* THREAD_DEBUG */

/* non-debug builds: queue verification compiles away to nothing */
#define	QVERIFY(qp)

#endif	/* THREAD_DEBUG */
    551 
    552 /*
    553  * Acquire a queue head.
    554  */
    555 queue_head_t *
    556 queue_lock(void *wchan, int qtype)
    557 {
    558 	uberdata_t *udp = curthread->ul_uberdata;
    559 	queue_head_t *qp;
    560 	queue_root_t *qrp;
    561 
    562 	ASSERT(qtype == MX || qtype == CV);
    563 
    564 	/*
    565 	 * It is possible that we could be called while still single-threaded.
    566 	 * If so, we call queue_alloc() to allocate the queue_head[] array.
    567 	 */
    568 	if ((qp = udp->queue_head) == NULL) {
    569 		queue_alloc();
    570 		qp = udp->queue_head;
    571 	}
    572 	qp += QUEUE_HASH(wchan, qtype);
    573 	spin_lock_set(&qp->qh_lock);
    574 	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
    575 		if (qrp->qr_wchan == wchan)
    576 			break;
    577 	if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
    578 		/* the default queue root is available; use it */
    579 		qrp = &qp->qh_def_root;
    580 		qrp->qr_wchan = wchan;
    581 		ASSERT(qrp->qr_next == NULL);
    582 		ASSERT(qrp->qr_tail == NULL &&
    583 		    qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
    584 	}
    585 	qp->qh_wchan = wchan;	/* valid until queue_unlock() is called */
    586 	qp->qh_root = qrp;	/* valid until queue_unlock() is called */
    587 	INCR32(qp->qh_lockcount);
    588 	QVERIFY(qp);
    589 	return (qp);
    590 }
    591 
/*
 * Release a queue head.
 * The queue is verified (THREAD_DEBUG builds only) while the caller
 * still owns the lock, and then the head's spin lock is dropped.
 */
void
queue_unlock(queue_head_t *qp)
{
	QVERIFY(qp);
	spin_lock_clear(&qp->qh_lock);
}
    601 
/*
 * For rwlock queueing, we must queue writers ahead of readers of the
 * same priority.  We do this by making writers appear to have a half
 * point higher priority for purposes of priority comparisons below.
 * Concretely, the real priority is doubled and ul_writer is added in
 * the low-order position.
 */
#define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
    608 
/*
 * Put ulwp on the sleep queue for the wait channel recorded in qp by the
 * preceding queue_lock() (qp->qh_wchan / qp->qh_root).
 *
 * The caller must own qp->qh_lock, and ulwp must not already be on qp.
 * If no queue root exists yet for the wait channel, the root embedded in
 * ulwp is initialized and pushed onto the head of qp's hash list.
 * The thread is inserted in priority order (see CMP_PRIO), with the
 * position among equal priorities chosen by the LIFO/FIFO policy below.
 * If force_fifo is non-zero, FIFO ordering is always used.
 */
void
enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
{
	queue_root_t *qrp;
	ulwp_t **ulwpp;
	ulwp_t *next;
	int pri = CMP_PRIO(ulwp);

	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
	ASSERT(ulwp->ul_sleepq != qp);

	if ((qrp = qp->qh_root) == NULL) {
		/* use the thread's queue root for the linkage */
		qrp = &ulwp->ul_queue_root;
		qrp->qr_next = qp->qh_hlist;
		qrp->qr_prev = NULL;
		qrp->qr_head = NULL;
		qrp->qr_tail = NULL;
		qrp->qr_wchan = qp->qh_wchan;
		qrp->qr_rtcount = 0;
		qrp->qr_qlen = 0;
		qrp->qr_qmax = 0;
		/* link the new root in front of the current list head */
		qp->qh_hlist->qr_prev = qrp;
		qp->qh_hlist = qrp;
		qp->qh_root = qrp;
		MAXINCR(qp->qh_hmax, qp->qh_hlen);
	}

	/*
	 * LIFO queue ordering is unfair and can lead to starvation,
	 * but it gives better performance for heavily contended locks.
	 * We use thread_queue_fifo (range is 0..8) to determine
	 * the frequency of FIFO vs LIFO queuing:
	 *	0 : every 256th time	(almost always LIFO)
	 *	1 : every 128th time
	 *	2 : every 64th  time
	 *	3 : every 32nd  time
	 *	4 : every 16th  time	(the default value, mostly LIFO)
	 *	5 : every 8th   time
	 *	6 : every 4th   time
	 *	7 : every 2nd   time
	 *	8 : every time		(never LIFO, always FIFO)
	 * Note that there is always some degree of FIFO ordering.
	 * This breaks live lock conditions that occur in applications
	 * that are written assuming (incorrectly) that threads acquire
	 * locks fairly, that is, in roughly round-robin order.
	 * In any event, the queue is maintained in kernel priority order.
	 *
	 * If force_fifo is non-zero, fifo queueing is forced.
	 * SUSV3 requires this for semaphores.
	 *
	 * The value actually consulted below is curthread->ul_queue_fifo.
	 * The test uses bitwise | rather than ||, so qh_qcnt is
	 * incremented even when force_fifo is set.
	 */
	if (qrp->qr_head == NULL) {
		/*
		 * The queue is empty.  LIFO/FIFO doesn't matter.
		 */
		ASSERT(qrp->qr_tail == NULL);
		ulwpp = &qrp->qr_head;
	} else if (force_fifo |
	    (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
		/*
		 * Enqueue after the last thread whose priority is greater
		 * than or equal to the priority of the thread being queued.
		 * Attempt first to go directly onto the tail of the queue.
		 */
		if (pri <= CMP_PRIO(qrp->qr_tail))
			ulwpp = &qrp->qr_tail->ul_link;
		else {
			for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
			    ulwpp = &next->ul_link)
				if (pri > CMP_PRIO(next))
					break;
		}
	} else {
		/*
		 * Enqueue before the first thread whose priority is less
		 * than or equal to the priority of the thread being queued.
		 * Hopefully we can go directly onto the head of the queue.
		 */
		for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
		    ulwpp = &next->ul_link)
			if (pri >= CMP_PRIO(next))
				break;
	}
	/* splice in at *ulwpp; inserting at the end makes ulwp the tail */
	if ((ulwp->ul_link = *ulwpp) == NULL)
		qrp->qr_tail = ulwp;
	*ulwpp = ulwp;

	ulwp->ul_sleepq = qp;
	ulwp->ul_wchan = qp->qh_wchan;
	ulwp->ul_qtype = qp->qh_type;
	/*
	 * Count the thread as a real-time waiter if its scheduling class
	 * (from schedctl) is the RT class or it holds priority-inheritance
	 * locks.  queue_slot() searches the whole queue while this count
	 * is non-zero.
	 */
	if ((ulwp->ul_schedctl != NULL &&
	    ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
	    ulwp->ul_pilocks) {
		ulwp->ul_rtqueued = 1;
		qrp->qr_rtcount++;
	}
	MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
	MAXINCR(qp->qh_qmax, qp->qh_qlen);
}
    708 
    709 /*
    710  * Helper function for queue_slot() and queue_slot_rt().
    711  * Try to find a non-suspended thread on the queue.
    712  */
    713 static ulwp_t **
    714 queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
    715 {
    716 	ulwp_t *ulwp;
    717 	ulwp_t **foundpp = NULL;
    718 	int priority = -1;
    719 	ulwp_t *prev;
    720 	int tpri;
    721 
    722 	for (prev = NULL;
    723 	    (ulwp = *ulwpp) != NULL;
    724 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
    725 		if (ulwp->ul_stop)	/* skip suspended threads */
    726 			continue;
    727 		tpri = rt? CMP_PRIO(ulwp) : 0;
    728 		if (tpri > priority) {
    729 			foundpp = ulwpp;
    730 			*prevp = prev;
    731 			priority = tpri;
    732 			if (!rt)
    733 				break;
    734 		}
    735 	}
    736 	return (foundpp);
    737 }
    738 
    739 /*
    740  * For real-time, we search the entire queue because the dispatch
    741  * (kernel) priorities may have changed since enqueueing.
    742  */
    743 static ulwp_t **
    744 queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
    745 {
    746 	ulwp_t **ulwpp = ulwpp_org;
    747 	ulwp_t *ulwp = *ulwpp;
    748 	ulwp_t **foundpp = ulwpp;
    749 	int priority = CMP_PRIO(ulwp);
    750 	ulwp_t *prev;
    751 	int tpri;
    752 
    753 	for (prev = ulwp, ulwpp = &ulwp->ul_link;
    754 	    (ulwp = *ulwpp) != NULL;
    755 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
    756 		tpri = CMP_PRIO(ulwp);
    757 		if (tpri > priority) {
    758 			foundpp = ulwpp;
    759 			*prevp = prev;
    760 			priority = tpri;
    761 		}
    762 	}
    763 	ulwp = *foundpp;
    764 
    765 	/*
    766 	 * Try not to return a suspended thread.
    767 	 * This mimics the old libthread's behavior.
    768 	 */
    769 	if (ulwp->ul_stop &&
    770 	    (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
    771 		foundpp = ulwpp;
    772 		ulwp = *foundpp;
    773 	}
    774 	ulwp->ul_rt = 1;
    775 	return (foundpp);
    776 }
    777 
    778 ulwp_t **
    779 queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
    780 {
    781 	queue_root_t *qrp;
    782 	ulwp_t **ulwpp;
    783 	ulwp_t *ulwp;
    784 	int rt;
    785 
    786 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
    787 
    788 	if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
    789 		*more = 0;
    790 		return (NULL);		/* no lwps on the queue */
    791 	}
    792 	rt = (qrp->qr_rtcount != 0);
    793 	*prevp = NULL;
    794 	if (ulwp->ul_link == NULL) {	/* only one lwp on the queue */
    795 		*more = 0;
    796 		ulwp->ul_rt = rt;
    797 		return (&qrp->qr_head);
    798 	}
    799 	*more = 1;
    800 
    801 	if (rt)		/* real-time queue */
    802 		return (queue_slot_rt(&qrp->qr_head, prevp));
    803 	/*
    804 	 * Try not to return a suspended thread.
    805 	 * This mimics the old libthread's behavior.
    806 	 */
    807 	if (ulwp->ul_stop &&
    808 	    (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
    809 		ulwp = *ulwpp;
    810 		ulwp->ul_rt = 0;
    811 		return (ulwpp);
    812 	}
    813 	/*
    814 	 * The common case; just pick the first thread on the queue.
    815 	 */
    816 	ulwp->ul_rt = 0;
    817 	return (&qrp->qr_head);
    818 }
    819 
/*
 * Common code for unlinking an lwp from a user-level sleep queue.
 *
 * ulwpp is the address of the link that points at the thread to remove
 * and prev is that thread's predecessor on the queue (NULL if it is
 * first).  The caller must own qp->qh_lock, and qp->qh_root must be the
 * queue root the thread is on.  The thread's ul_sleepq and ul_wchan are
 * NOT cleared here; callers do that.
 */
void
queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
{
	queue_root_t *qrp = qp->qh_root;
	queue_root_t *nqrp;
	ulwp_t *ulwp = *ulwpp;
	ulwp_t *next;

	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
	ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);

	DECR(qp->qh_qlen);
	DECR(qrp->qr_qlen);
	if (ulwp->ul_rtqueued) {
		ulwp->ul_rtqueued = 0;
		qrp->qr_rtcount--;
	}
	/* take the thread out of the singly-linked list */
	next = ulwp->ul_link;
	*ulwpp = next;
	ulwp->ul_link = NULL;
	if (qrp->qr_tail == ulwp)
		qrp->qr_tail = prev;
	if (qrp == &ulwp->ul_queue_root) {
		/*
		 * We can't continue to use the unlinked thread's
		 * queue root for the linkage.
		 */
		queue_root_t *qr_next = qrp->qr_next;
		queue_root_t *qr_prev = qrp->qr_prev;

		if (qrp->qr_tail) {
			/* switch to using the last thread's queue root */
			ASSERT(qrp->qr_qlen != 0);
			nqrp = &qrp->qr_tail->ul_queue_root;
			/* whole-struct copy: head, tail, wchan, counts, links */
			*nqrp = *qrp;
			if (qr_next)
				qr_next->qr_prev = nqrp;
			if (qr_prev)
				qr_prev->qr_next = nqrp;
			else
				qp->qh_hlist = nqrp;
			qp->qh_root = nqrp;
		} else {
			/* empty queue root; just delete from the hash list */
			ASSERT(qrp->qr_qlen == 0);
			if (qr_next)
				qr_next->qr_prev = qr_prev;
			if (qr_prev)
				qr_prev->qr_next = qr_next;
			else
				qp->qh_hlist = qr_next;
			qp->qh_root = NULL;
			DECR(qp->qh_hlen);
		}
	}
}
    879 
    880 ulwp_t *
    881 dequeue(queue_head_t *qp, int *more)
    882 {
    883 	ulwp_t **ulwpp;
    884 	ulwp_t *ulwp;
    885 	ulwp_t *prev;
    886 
    887 	if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
    888 		return (NULL);
    889 	ulwp = *ulwpp;
    890 	queue_unlink(qp, ulwpp, prev);
    891 	ulwp->ul_sleepq = NULL;
    892 	ulwp->ul_wchan = NULL;
    893 	return (ulwp);
    894 }
    895 
    896 /*
    897  * Return a pointer to the highest priority thread sleeping on wchan.
    898  */
    899 ulwp_t *
    900 queue_waiter(queue_head_t *qp)
    901 {
    902 	ulwp_t **ulwpp;
    903 	ulwp_t *prev;
    904 	int more;
    905 
    906 	if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
    907 		return (NULL);
    908 	return (*ulwpp);
    909 }
    910 
    911 int
    912 dequeue_self(queue_head_t *qp)
    913 {
    914 	ulwp_t *self = curthread;
    915 	queue_root_t *qrp;
    916 	ulwp_t **ulwpp;
    917 	ulwp_t *ulwp;
    918 	ulwp_t *prev;
    919 	int found = 0;
    920 
    921 	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
    922 
    923 	/* find self on the sleep queue */
    924 	if ((qrp = qp->qh_root) != NULL) {
    925 		for (prev = NULL, ulwpp = &qrp->qr_head;
    926 		    (ulwp = *ulwpp) != NULL;
    927 		    prev = ulwp, ulwpp = &ulwp->ul_link) {
    928 			if (ulwp == self) {
    929 				queue_unlink(qp, ulwpp, prev);
    930 				self->ul_cvmutex = NULL;
    931 				self->ul_sleepq = NULL;
    932 				self->ul_wchan = NULL;
    933 				found = 1;
    934 				break;
    935 			}
    936 		}
    937 	}
    938 
    939 	if (!found)
    940 		thr_panic("dequeue_self(): curthread not found on queue");
    941 
    942 	return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
    943 }
    944 
    945 /*
    946  * Called from call_user_handler() and _thrp_suspend() to take
    947  * ourself off of our sleep queue so we can grab locks.
    948  */
    949 void
    950 unsleep_self(void)
    951 {
    952 	ulwp_t *self = curthread;
    953 	queue_head_t *qp;
    954 
    955 	/*
    956 	 * Calling enter_critical()/exit_critical() here would lead
    957 	 * to recursion.  Just manipulate self->ul_critical directly.
    958 	 */
    959 	self->ul_critical++;
    960 	while (self->ul_sleepq != NULL) {
    961 		qp = queue_lock(self->ul_wchan, self->ul_qtype);
    962 		/*
    963 		 * We may have been moved from a CV queue to a
    964 		 * mutex queue while we were attempting queue_lock().
    965 		 * If so, just loop around and try again.
    966 		 * dequeue_self() clears self->ul_sleepq.
    967 		 */
    968 		if (qp == self->ul_sleepq)
    969 			(void) dequeue_self(qp);
    970 		queue_unlock(qp);
    971 	}
    972 	self->ul_writer = 0;
    973 	self->ul_critical--;
    974 }
    975 
    976 /*
    977  * Common code for calling the the ___lwp_mutex_timedlock() system call.
    978  * Returns with mutex_owner and mutex_ownerpid set correctly.
    979  */
    980 static int
    981 mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
    982 {
    983 	ulwp_t *self = curthread;
    984 	uberdata_t *udp = self->ul_uberdata;
    985 	int mtype = mp->mutex_type;
    986 	hrtime_t begin_sleep;
    987 	int acquired;
    988 	int error;
    989 
    990 	self->ul_sp = stkptr();
    991 	self->ul_wchan = mp;
    992 	if (__td_event_report(self, TD_SLEEP, udp)) {
    993 		self->ul_td_evbuf.eventnum = TD_SLEEP;
    994 		self->ul_td_evbuf.eventdata = mp;
    995 		tdb_event(TD_SLEEP, udp);
    996 	}
    997 	if (msp) {
    998 		tdb_incr(msp->mutex_sleep);
    999 		begin_sleep = gethrtime();
   1000 	}
   1001 
   1002 	DTRACE_PROBE1(plockstat, mutex__block, mp);
   1003 
   1004 	for (;;) {
   1005 		/*
   1006 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
   1007 		 * means we successfully acquired the lock.
   1008 		 */
   1009 		if ((error = ___lwp_mutex_timedlock(mp, tsp, self)) != 0 &&
   1010 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
   1011 			acquired = 0;
   1012 			break;
   1013 		}
   1014 
   1015 		if (mtype & USYNC_PROCESS) {
   1016 			/*
   1017 			 * Defend against forkall().  We may be the child,
   1018 			 * in which case we don't actually own the mutex.
   1019 			 */
   1020 			enter_critical(self);
   1021 			if (mp->mutex_ownerpid == udp->pid) {
   1022 				exit_critical(self);
   1023 				acquired = 1;
   1024 				break;
   1025 			}
   1026 			exit_critical(self);
   1027 		} else {
   1028 			acquired = 1;
   1029 			break;
   1030 		}
   1031 	}
   1032 
   1033 	if (msp)
   1034 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
   1035 	self->ul_wchan = NULL;
   1036 	self->ul_sp = 0;
   1037 
   1038 	if (acquired) {
   1039 		ASSERT(mp->mutex_owner == (uintptr_t)self);
   1040 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
   1041 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   1042 	} else {
   1043 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
   1044 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
   1045 	}
   1046 
   1047 	return (error);
   1048 }
   1049 
   1050 /*
   1051  * Common code for calling the ___lwp_mutex_trylock() system call.
   1052  * Returns with mutex_owner and mutex_ownerpid set correctly.
   1053  */
   1054 int
   1055 mutex_trylock_kernel(mutex_t *mp)
   1056 {
   1057 	ulwp_t *self = curthread;
   1058 	uberdata_t *udp = self->ul_uberdata;
   1059 	int mtype = mp->mutex_type;
   1060 	int error;
   1061 	int acquired;
   1062 
   1063 	for (;;) {
   1064 		/*
   1065 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
   1066 		 * means we successfully acquired the lock.
   1067 		 */
   1068 		if ((error = ___lwp_mutex_trylock(mp, self)) != 0 &&
   1069 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
   1070 			acquired = 0;
   1071 			break;
   1072 		}
   1073 
   1074 		if (mtype & USYNC_PROCESS) {
   1075 			/*
   1076 			 * Defend against forkall().  We may be the child,
   1077 			 * in which case we don't actually own the mutex.
   1078 			 */
   1079 			enter_critical(self);
   1080 			if (mp->mutex_ownerpid == udp->pid) {
   1081 				exit_critical(self);
   1082 				acquired = 1;
   1083 				break;
   1084 			}
   1085 			exit_critical(self);
   1086 		} else {
   1087 			acquired = 1;
   1088 			break;
   1089 		}
   1090 	}
   1091 
   1092 	if (acquired) {
   1093 		ASSERT(mp->mutex_owner == (uintptr_t)self);
   1094 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   1095 	} else if (error != EBUSY) {
   1096 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
   1097 	}
   1098 
   1099 	return (error);
   1100 }
   1101 
   1102 volatile sc_shared_t *
   1103 setup_schedctl(void)
   1104 {
   1105 	ulwp_t *self = curthread;
   1106 	volatile sc_shared_t *scp;
   1107 	sc_shared_t *tmp;
   1108 
   1109 	if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
   1110 	    !self->ul_vfork &&			/* not a child of vfork() */
   1111 	    !self->ul_schedctl_called) {	/* haven't been called before */
   1112 		enter_critical(self);
   1113 		self->ul_schedctl_called = &self->ul_uberdata->uberflags;
   1114 		if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
   1115 			self->ul_schedctl = scp = tmp;
   1116 		exit_critical(self);
   1117 	}
   1118 	/*
   1119 	 * Unless the call to setup_schedctl() is surrounded
   1120 	 * by enter_critical()/exit_critical(), the address
   1121 	 * we are returning could be invalid due to a forkall()
   1122 	 * having occurred in another thread.
   1123 	 */
   1124 	return (scp);
   1125 }
   1126 
   1127 /*
   1128  * Interfaces from libsched, incorporated into libc.
   1129  * libsched.so.1 is now a filter library onto libc.
   1130  */
   1131 #pragma weak schedctl_lookup = schedctl_init
   1132 schedctl_t *
   1133 schedctl_init(void)
   1134 {
   1135 	volatile sc_shared_t *scp = setup_schedctl();
   1136 	return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
   1137 }
   1138 
/*
 * No-op: the body is empty, so calling this has no effect.
 * NOTE(review): presumably retained only so that existing callers of
 * the libsched interface keep working -- confirm.
 */
void
schedctl_exit(void)
{
}
   1143 
   1144 /*
   1145  * Contract private interface for java.
   1146  * Set up the schedctl data if it doesn't exist yet.
   1147  * Return a pointer to the pointer to the schedctl data.
   1148  */
   1149 volatile sc_shared_t *volatile *
   1150 _thr_schedctl(void)
   1151 {
   1152 	ulwp_t *self = curthread;
   1153 	volatile sc_shared_t *volatile *ptr;
   1154 
   1155 	if (self->ul_vfork)
   1156 		return (NULL);
   1157 	if (*(ptr = &self->ul_schedctl) == NULL)
   1158 		(void) setup_schedctl();
   1159 	return (ptr);
   1160 }
   1161 
   1162 /*
   1163  * Block signals and attempt to block preemption.
   1164  * no_preempt()/preempt() must be used in pairs but can be nested.
   1165  */
   1166 void
   1167 no_preempt(ulwp_t *self)
   1168 {
   1169 	volatile sc_shared_t *scp;
   1170 
   1171 	if (self->ul_preempt++ == 0) {
   1172 		enter_critical(self);
   1173 		if ((scp = self->ul_schedctl) != NULL ||
   1174 		    (scp = setup_schedctl()) != NULL) {
   1175 			/*
   1176 			 * Save the pre-existing preempt value.
   1177 			 */
   1178 			self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
   1179 			scp->sc_preemptctl.sc_nopreempt = 1;
   1180 		}
   1181 	}
   1182 }
   1183 
   1184 /*
   1185  * Undo the effects of no_preempt().
   1186  */
   1187 void
   1188 preempt(ulwp_t *self)
   1189 {
   1190 	volatile sc_shared_t *scp;
   1191 
   1192 	ASSERT(self->ul_preempt > 0);
   1193 	if (--self->ul_preempt == 0) {
   1194 		if ((scp = self->ul_schedctl) != NULL) {
   1195 			/*
   1196 			 * Restore the pre-existing preempt value.
   1197 			 */
   1198 			scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
   1199 			if (scp->sc_preemptctl.sc_yield &&
   1200 			    scp->sc_preemptctl.sc_nopreempt == 0) {
   1201 				yield();
   1202 				if (scp->sc_preemptctl.sc_yield) {
   1203 					/*
   1204 					 * Shouldn't happen.  This is either
   1205 					 * a race condition or the thread
   1206 					 * just entered the real-time class.
   1207 					 */
   1208 					yield();
   1209 					scp->sc_preemptctl.sc_yield = 0;
   1210 				}
   1211 			}
   1212 		}
   1213 		exit_critical(self);
   1214 	}
   1215 }
   1216 
   1217 /*
   1218  * If a call to preempt() would cause the current thread to yield or to
   1219  * take deferred actions in exit_critical(), then unpark the specified
   1220  * lwp so it can run while we delay.  Return the original lwpid if the
   1221  * unpark was not performed, else return zero.  The tests are a repeat
   1222  * of some of the tests in preempt(), above.  This is a statistical
   1223  * optimization solely for cond_sleep_queue(), below.
   1224  */
   1225 static lwpid_t
   1226 preempt_unpark(ulwp_t *self, lwpid_t lwpid)
   1227 {
   1228 	volatile sc_shared_t *scp = self->ul_schedctl;
   1229 
   1230 	ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
   1231 	if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
   1232 	    (self->ul_curplease && self->ul_critical == 1)) {
   1233 		(void) __lwp_unpark(lwpid);
   1234 		lwpid = 0;
   1235 	}
   1236 	return (lwpid);
   1237 }
   1238 
   1239 /*
   1240  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
   1241  * If this fails, return EBUSY and let the caller deal with it.
   1242  * If this succeeds, return 0 with mutex_owner set to curthread.
   1243  */
   1244 static int
   1245 mutex_trylock_adaptive(mutex_t *mp, int tryhard)
   1246 {
   1247 	ulwp_t *self = curthread;
   1248 	int error = EBUSY;
   1249 	ulwp_t *ulwp;
   1250 	volatile sc_shared_t *scp;
   1251 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
   1252 	volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
   1253 	uint32_t new_lockword;
   1254 	int count = 0;
   1255 	int max_count;
   1256 	uint8_t max_spinners;
   1257 
   1258 	ASSERT(!(mp->mutex_type & USYNC_PROCESS));
   1259 
   1260 	if (MUTEX_OWNED(mp, self))
   1261 		return (EBUSY);
   1262 
   1263 	enter_critical(self);
   1264 
   1265 	/* short-cut, not definitive (see below) */
   1266 	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
   1267 		ASSERT(mp->mutex_type & LOCK_ROBUST);
   1268 		error = ENOTRECOVERABLE;
   1269 		goto done;
   1270 	}
   1271 
   1272 	/*
   1273 	 * Make one attempt to acquire the lock before
   1274 	 * incurring the overhead of the spin loop.
   1275 	 */
   1276 	if (set_lock_byte(lockp) == 0) {
   1277 		*ownerp = (uintptr_t)self;
   1278 		error = 0;
   1279 		goto done;
   1280 	}
   1281 	if (!tryhard)
   1282 		goto done;
   1283 	if (ncpus == 0)
   1284 		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
   1285 	if ((max_spinners = self->ul_max_spinners) >= ncpus)
   1286 		max_spinners = ncpus - 1;
   1287 	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
   1288 	if (max_count == 0)
   1289 		goto done;
   1290 
   1291 	/*
   1292 	 * This spin loop is unfair to lwps that have already dropped into
   1293 	 * the kernel to sleep.  They will starve on a highly-contended mutex.
   1294 	 * This is just too bad.  The adaptive spin algorithm is intended
   1295 	 * to allow programs with highly-contended locks (that is, broken
   1296 	 * programs) to execute with reasonable speed despite their contention.
   1297 	 * Being fair would reduce the speed of such programs and well-written
   1298 	 * programs will not suffer in any case.
   1299 	 */
   1300 	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
   1301 		goto done;
   1302 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
   1303 	for (count = 1; ; count++) {
   1304 		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
   1305 			*ownerp = (uintptr_t)self;
   1306 			error = 0;
   1307 			break;
   1308 		}
   1309 		if (count == max_count)
   1310 			break;
   1311 		SMT_PAUSE();
   1312 		/*
   1313 		 * Stop spinning if the mutex owner is not running on
   1314 		 * a processor; it will not drop the lock any time soon
   1315 		 * and we would just be wasting time to keep spinning.
   1316 		 *
   1317 		 * Note that we are looking at another thread (ulwp_t)
   1318 		 * without ensuring that the other thread does not exit.
   1319 		 * The scheme relies on ulwp_t structures never being
   1320 		 * deallocated by the library (the library employs a free
   1321 		 * list of ulwp_t structs that are reused when new threads
   1322 		 * are created) and on schedctl shared memory never being
   1323 		 * deallocated once created via __schedctl().
   1324 		 *
   1325 		 * Thus, the worst that can happen when the spinning thread
   1326 		 * looks at the owner's schedctl data is that it is looking
   1327 		 * at some other thread's schedctl data.  This almost never
   1328 		 * happens and is benign when it does.
   1329 		 */
   1330 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
   1331 		    ((scp = ulwp->ul_schedctl) == NULL ||
   1332 		    scp->sc_state != SC_ONPROC))
   1333 			break;
   1334 	}
   1335 	new_lockword = spinners_decr(&mp->mutex_lockword);
   1336 	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
   1337 		/*
   1338 		 * We haven't yet acquired the lock, the lock
   1339 		 * is free, and there are no other spinners.
   1340 		 * Make one final attempt to acquire the lock.
   1341 		 *
   1342 		 * This isn't strictly necessary since mutex_lock_queue()
   1343 		 * (the next action this thread will take if it doesn't
   1344 		 * acquire the lock here) makes one attempt to acquire
   1345 		 * the lock before putting the thread to sleep.
   1346 		 *
   1347 		 * If the next action for this thread (on failure here)
   1348 		 * were not to call mutex_lock_queue(), this would be
   1349 		 * necessary for correctness, to avoid ending up with an
   1350 		 * unheld mutex with waiters but no one to wake them up.
   1351 		 */
   1352 		if (set_lock_byte(lockp) == 0) {
   1353 			*ownerp = (uintptr_t)self;
   1354 			error = 0;
   1355 		}
   1356 		count++;
   1357 	}
   1358 
   1359 done:
   1360 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
   1361 		ASSERT(mp->mutex_type & LOCK_ROBUST);
   1362 		/*
   1363 		 * We shouldn't own the mutex.
   1364 		 * Just clear the lock; everyone has already been waked up.
   1365 		 */
   1366 		*ownerp = 0;
   1367 		(void) clear_lockbyte(&mp->mutex_lockword);
   1368 		error = ENOTRECOVERABLE;
   1369 	}
   1370 
   1371 	exit_critical(self);
   1372 
   1373 	if (error) {
   1374 		if (count) {
   1375 			DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
   1376 		}
   1377 		if (error != EBUSY) {
   1378 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
   1379 		}
   1380 	} else {
   1381 		if (count) {
   1382 			DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
   1383 		}
   1384 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
   1385 		if (mp->mutex_flag & LOCK_OWNERDEAD) {
   1386 			ASSERT(mp->mutex_type & LOCK_ROBUST);
   1387 			error = EOWNERDEAD;
   1388 		}
   1389 	}
   1390 
   1391 	return (error);
   1392 }
   1393 
   1394 /*
   1395  * Same as mutex_trylock_adaptive(), except specifically for queue locks.
   1396  * The owner field is not set here; the caller (spin_lock_set()) sets it.
   1397  */
   1398 static int
   1399 mutex_queuelock_adaptive(mutex_t *mp)
   1400 {
   1401 	ulwp_t *ulwp;
   1402 	volatile sc_shared_t *scp;
   1403 	volatile uint8_t *lockp;
   1404 	volatile uint64_t *ownerp;
   1405 	int count = curthread->ul_queue_spin;
   1406 
   1407 	ASSERT(mp->mutex_type == USYNC_THREAD);
   1408 
   1409 	if (count == 0)
   1410 		return (EBUSY);
   1411 
   1412 	lockp = (volatile uint8_t *)&mp->mutex_lockw;
   1413 	ownerp = (volatile uint64_t *)&mp->mutex_owner;
   1414 	while (--count >= 0) {
   1415 		if (*lockp == 0 && set_lock_byte(lockp) == 0)
   1416 			return (0);
   1417 		SMT_PAUSE();
   1418 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
   1419 		    ((scp = ulwp->ul_schedctl) == NULL ||
   1420 		    scp->sc_state != SC_ONPROC))
   1421 			break;
   1422 	}
   1423 
   1424 	return (EBUSY);
   1425 }
   1426 
   1427 /*
   1428  * Like mutex_trylock_adaptive(), but for process-shared mutexes.
   1429  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
   1430  * If this fails, return EBUSY and let the caller deal with it.
   1431  * If this succeeds, return 0 with mutex_owner set to curthread
   1432  * and mutex_ownerpid set to the current pid.
   1433  */
   1434 static int
   1435 mutex_trylock_process(mutex_t *mp, int tryhard)
   1436 {
   1437 	ulwp_t *self = curthread;
   1438 	uberdata_t *udp = self->ul_uberdata;
   1439 	int error = EBUSY;
   1440 	volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
   1441 	uint32_t new_lockword;
   1442 	int count = 0;
   1443 	int max_count;
   1444 	uint8_t max_spinners;
   1445 
   1446 #if defined(__sparc) && !defined(_LP64)
   1447 	/* horrible hack, necessary only on 32-bit sparc */
   1448 	int fix_alignment_problem =
   1449 	    (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
   1450 	    self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST));
   1451 #endif
   1452 
   1453 	ASSERT(mp->mutex_type & USYNC_PROCESS);
   1454 
   1455 	if (shared_mutex_held(mp))
   1456 		return (EBUSY);
   1457 
   1458 	enter_critical(self);
   1459 
   1460 	/* short-cut, not definitive (see below) */
   1461 	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
   1462 		ASSERT(mp->mutex_type & LOCK_ROBUST);
   1463 		error = ENOTRECOVERABLE;
   1464 		goto done;
   1465 	}
   1466 
   1467 	/*
   1468 	 * Make one attempt to acquire the lock before
   1469 	 * incurring the overhead of the spin loop.
   1470 	 */
   1471 #if defined(__sparc) && !defined(_LP64)
   1472 	/* horrible hack, necessary only on 32-bit sparc */
   1473 	if (fix_alignment_problem) {
   1474 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
   1475 			mp->mutex_ownerpid = udp->pid;
   1476 			mp->mutex_owner = (uintptr_t)self;
   1477 			error = 0;
   1478 			goto done;
   1479 		}
   1480 	} else
   1481 #endif
   1482 	if (set_lock_byte64(lockp, udp->pid) == 0) {
   1483 		mp->mutex_owner = (uintptr_t)self;
   1484 		/* mp->mutex_ownerpid was set by set_lock_byte64() */
   1485 		error = 0;
   1486 		goto done;
   1487 	}
   1488 	if (!tryhard)
   1489 		goto done;
   1490 	if (ncpus == 0)
   1491 		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
   1492 	if ((max_spinners = self->ul_max_spinners) >= ncpus)
   1493 		max_spinners = ncpus - 1;
   1494 	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
   1495 	if (max_count == 0)
   1496 		goto done;
   1497 
   1498 	/*
   1499 	 * This is a process-shared mutex.
   1500 	 * We cannot know if the owner is running on a processor.
   1501 	 * We just spin and hope that it is on a processor.
   1502 	 */
   1503 	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
   1504 		goto done;
   1505 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
   1506 	for (count = 1; ; count++) {
   1507 #if defined(__sparc) && !defined(_LP64)
   1508 		/* horrible hack, necessary only on 32-bit sparc */
   1509 		if (fix_alignment_problem) {
   1510 			if ((*lockp & LOCKMASK64) == 0 &&
   1511 			    set_lock_byte(&mp->mutex_lockw) == 0) {
   1512 				mp->mutex_ownerpid = udp->pid;
   1513 				mp->mutex_owner = (uintptr_t)self;
   1514 				error = 0;
   1515 				break;
   1516 			}
   1517 		} else
   1518 #endif
   1519 		if ((*lockp & LOCKMASK64) == 0 &&
   1520 		    set_lock_byte64(lockp, udp->pid) == 0) {
   1521 			mp->mutex_owner = (uintptr_t)self;
   1522 			/* mp->mutex_ownerpid was set by set_lock_byte64() */
   1523 			error = 0;
   1524 			break;
   1525 		}
   1526 		if (count == max_count)
   1527 			break;
   1528 		SMT_PAUSE();
   1529 	}
   1530 	new_lockword = spinners_decr(&mp->mutex_lockword);
   1531 	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
   1532 		/*
   1533 		 * We haven't yet acquired the lock, the lock
   1534 		 * is free, and there are no other spinners.
   1535 		 * Make one final attempt to acquire the lock.
   1536 		 *
   1537 		 * This isn't strictly necessary since mutex_lock_kernel()
   1538 		 * (the next action this thread will take if it doesn't
   1539 		 * acquire the lock here) makes one attempt to acquire
   1540 		 * the lock before putting the thread to sleep.
   1541 		 *
   1542 		 * If the next action for this thread (on failure here)
   1543 		 * were not to call mutex_lock_kernel(), this would be
   1544 		 * necessary for correctness, to avoid ending up with an
   1545 		 * unheld mutex with waiters but no one to wake them up.
   1546 		 */
   1547 #if defined(__sparc) && !defined(_LP64)
   1548 		/* horrible hack, necessary only on 32-bit sparc */
   1549 		if (fix_alignment_problem) {
   1550 			if (set_lock_byte(&mp->mutex_lockw) == 0) {
   1551 				mp->mutex_ownerpid = udp->pid;
   1552 				mp->mutex_owner = (uintptr_t)self;
   1553 				error = 0;
   1554 			}
   1555 		} else
   1556 #endif
   1557 		if (set_lock_byte64(lockp, udp->pid) == 0) {
   1558 			mp->mutex_owner = (uintptr_t)self;
   1559 			/* mp->mutex_ownerpid was set by set_lock_byte64() */
   1560 			error = 0;
   1561 		}
   1562 		count++;
   1563 	}
   1564 
   1565 done:
   1566 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
   1567 		ASSERT(mp->mutex_type & LOCK_ROBUST);
   1568 		/*
   1569 		 * We shouldn't own the mutex.
   1570 		 * Just clear the lock; everyone has already been waked up.
   1571 		 */
   1572 		mp->mutex_owner = 0;
   1573 		/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
   1574 		(void) clear_lockbyte64(&mp->mutex_lockword64);
   1575 		error = ENOTRECOVERABLE;
   1576 	}
   1577 
   1578 	exit_critical(self);
   1579 
   1580 	if (error) {
   1581 		if (count) {
   1582 			DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
   1583 		}
   1584 		if (error != EBUSY) {
   1585 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
   1586 		}
   1587 	} else {
   1588 		if (count) {
   1589 			DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
   1590 		}
   1591 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
   1592 		if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
   1593 			ASSERT(mp->mutex_type & LOCK_ROBUST);
   1594 			if (mp->mutex_flag & LOCK_OWNERDEAD)
   1595 				error = EOWNERDEAD;
   1596 			else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
   1597 				error = ELOCKUNMAPPED;
   1598 			else
   1599 				error = EOWNERDEAD;
   1600 		}
   1601 	}
   1602 
   1603 	return (error);
   1604 }
   1605 
   1606 /*
   1607  * Mutex wakeup code for releasing a USYNC_THREAD mutex.
   1608  * Returns the lwpid of the thread that was dequeued, if any.
   1609  * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
   1610  * to wake up the specified lwp.
   1611  */
   1612 static lwpid_t
   1613 mutex_wakeup(mutex_t *mp)
   1614 {
   1615 	lwpid_t lwpid = 0;
   1616 	int more;
   1617 	queue_head_t *qp;
   1618 	ulwp_t *ulwp;
   1619 
   1620 	/*
   1621 	 * Dequeue a waiter from the sleep queue.  Don't touch the mutex
   1622 	 * waiters bit if no one was found on the queue because the mutex
   1623 	 * might have been deallocated or reallocated for another purpose.
   1624 	 */
   1625 	qp = queue_lock(mp, MX);
   1626 	if ((ulwp = dequeue(qp, &more)) != NULL) {
   1627 		lwpid = ulwp->ul_lwpid;
   1628 		mp->mutex_waiters = more;
   1629 	}
   1630 	queue_unlock(qp);
   1631 	return (lwpid);
   1632 }
   1633 
   1634 /*
   1635  * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
   1636  */
   1637 static void
   1638 mutex_wakeup_all(mutex_t *mp)
   1639 {
   1640 	queue_head_t *qp;
   1641 	queue_root_t *qrp;
   1642 	int nlwpid = 0;
   1643 	int maxlwps = MAXLWPS;
   1644 	ulwp_t *ulwp;
   1645 	lwpid_t buffer[MAXLWPS];
   1646 	lwpid_t *lwpid = buffer;
   1647 
   1648 	/*
   1649 	 * Walk the list of waiters and prepare to wake up all of them.
   1650 	 * The waiters flag has already been cleared from the mutex.
   1651 	 *
   1652 	 * We keep track of lwpids that are to be unparked in lwpid[].
   1653 	 * __lwp_unpark_all() is called to unpark all of them after
   1654 	 * they have been removed from the sleep queue and the sleep
   1655 	 * queue lock has been dropped.  If we run out of space in our
   1656 	 * on-stack buffer, we need to allocate more but we can't call
   1657 	 * lmalloc() because we are holding a queue lock when the overflow
   1658 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
   1659 	 * either because the application may have allocated a small
   1660 	 * stack and we don't want to overrun the stack.  So we call
   1661 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
   1662 	 * system call directly since that path acquires no locks.
   1663 	 */
   1664 	qp = queue_lock(mp, MX);
   1665 	for (;;) {
   1666 		if ((qrp = qp->qh_root) == NULL ||
   1667 		    (ulwp = qrp->qr_head) == NULL)
   1668 			break;
   1669 		ASSERT(ulwp->ul_wchan == mp);
   1670 		queue_unlink(qp, &qrp->qr_head, NULL);
   1671 		ulwp->ul_sleepq = NULL;
   1672 		ulwp->ul_wchan = NULL;
   1673 		if (nlwpid == maxlwps)
   1674 			lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
   1675 		lwpid[nlwpid++] = ulwp->ul_lwpid;
   1676 	}
   1677 
   1678 	if (nlwpid == 0) {
   1679 		queue_unlock(qp);
   1680 	} else {
   1681 		mp->mutex_waiters = 0;
   1682 		no_preempt(curthread);
   1683 		queue_unlock(qp);
   1684 		if (nlwpid == 1)
   1685 			(void) __lwp_unpark(lwpid[0]);
   1686 		else
   1687 			(void) __lwp_unpark_all(lwpid, nlwpid);
   1688 		preempt(curthread);
   1689 	}
   1690 
   1691 	if (lwpid != buffer)
   1692 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
   1693 }
   1694 
   1695 /*
   1696  * Release a process-private mutex.
   1697  * As an optimization, if there are waiters but there are also spinners
   1698  * attempting to acquire the mutex, then don't bother waking up a waiter;
   1699  * one of the spinners will acquire the mutex soon and it would be a waste
   1700  * of resources to wake up some thread just to have it spin for a while
   1701  * and then possibly go back to sleep.  See mutex_trylock_adaptive().
   1702  */
   1703 static lwpid_t
   1704 mutex_unlock_queue(mutex_t *mp, int release_all)
   1705 {
   1706 	ulwp_t *self = curthread;
   1707 	lwpid_t lwpid = 0;
   1708 	uint32_t old_lockword;
   1709 
   1710 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
   1711 	sigoff(self);
   1712 	mp->mutex_owner = 0;
   1713 	old_lockword = clear_lockbyte(&mp->mutex_lockword);
   1714 	if ((old_lockword & WAITERMASK) &&
   1715 	    (release_all || (old_lockword & SPINNERMASK) == 0)) {
   1716 		no_preempt(self);	/* ensure a prompt wakeup */
   1717 		if (release_all)
   1718 			mutex_wakeup_all(mp);
   1719 		else
   1720 			lwpid = mutex_wakeup(mp);
   1721 		if (lwpid == 0)
   1722 			preempt(self);
   1723 	}
   1724 	sigon(self);
   1725 	return (lwpid);
   1726 }
   1727 
   1728 /*
   1729  * Like mutex_unlock_queue(), but for process-shared mutexes.
   1730  */
   1731 static void
   1732 mutex_unlock_process(mutex_t *mp, int release_all)
   1733 {
   1734 	ulwp_t *self = curthread;
   1735 	uint64_t old_lockword64;
   1736 
   1737 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
   1738 	sigoff(self);
   1739 	mp->mutex_owner = 0;
   1740 #if defined(__sparc) && !defined(_LP64)
   1741 	/* horrible hack, necessary only on 32-bit sparc */
   1742 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
   1743 	    self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST)) {
   1744 		uint32_t old_lockword;
   1745 		mp->mutex_ownerpid = 0;
   1746 		old_lockword = clear_lockbyte(&mp->mutex_lockword);
   1747 		if ((old_lockword & WAITERMASK) &&
   1748 		    (release_all || (old_lockword & SPINNERMASK) == 0)) {
   1749 			no_preempt(self);	/* ensure a prompt wakeup */
   1750 			(void) ___lwp_mutex_wakeup(mp, release_all);
   1751 			preempt(self);
   1752 		}
   1753 		sigon(self);
   1754 		return;
   1755 	}
   1756 #endif
   1757 	/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
   1758 	old_lockword64 = clear_lockbyte64(&mp->mutex_lockword64);
   1759 	if ((old_lockword64 & WAITERMASK64) &&
   1760 	    (release_all || (old_lockword64 & SPINNERMASK64) == 0)) {
   1761 		no_preempt(self);	/* ensure a prompt wakeup */
   1762 		(void) ___lwp_mutex_wakeup(mp, release_all);
   1763 		preempt(self);
   1764 	}
   1765 	sigon(self);
   1766 }
   1767 
   1768 void
   1769 stall(void)
   1770 {
   1771 	for (;;)
   1772 		(void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
   1773 }
   1774 
   1775 /*
   1776  * Acquire a USYNC_THREAD mutex via user-level sleep queues.
   1777  * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
   1778  * If successful, returns with mutex_owner set correctly.
   1779  */
   1780 int
   1781 mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
   1782 	timespec_t *tsp)
   1783 {
   1784 	uberdata_t *udp = curthread->ul_uberdata;
   1785 	queue_head_t *qp;
   1786 	hrtime_t begin_sleep;
   1787 	int error = 0;
   1788 
   1789 	self->ul_sp = stkptr();
   1790 	if (__td_event_report(self, TD_SLEEP, udp)) {
   1791 		self->ul_wchan = mp;
   1792 		self->ul_td_evbuf.eventnum = TD_SLEEP;
   1793 		self->ul_td_evbuf.eventdata = mp;
   1794 		tdb_event(TD_SLEEP, udp);
   1795 	}
   1796 	if (msp) {
   1797 		tdb_incr(msp->mutex_sleep);
   1798 		begin_sleep = gethrtime();
   1799 	}
   1800 
   1801 	DTRACE_PROBE1(plockstat, mutex__block, mp);
   1802 
   1803 	/*
   1804 	 * Put ourself on the sleep queue, and while we are
   1805 	 * unable to grab the lock, go park in the kernel.
   1806 	 * Take ourself off the sleep queue after we acquire the lock.
   1807 	 * The waiter bit can be set/cleared only while holding the queue lock.
   1808 	 */
   1809 	qp = queue_lock(mp, MX);
   1810 	enqueue(qp, self, 0);
   1811 	mp->mutex_waiters = 1;
   1812 	for (;;) {
   1813 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
   1814 			mp->mutex_owner = (uintptr_t)self;
   1815 			mp->mutex_waiters = dequeue_self(qp);
   1816 			break;
   1817 		}
   1818 		set_parking_flag(self, 1);
   1819 		queue_unlock(qp);
   1820 		/*
   1821 		 * __lwp_park() will return the residual time in tsp
   1822 		 * if we are unparked before the timeout expires.
   1823 		 */
   1824 		error = __lwp_park(tsp, 0);
   1825 		set_parking_flag(self, 0);
   1826 		/*
   1827 		 * We could have taken a signal or suspended ourself.
   1828 		 * If we did, then we removed ourself from the queue.
   1829 		 * Someone else may have removed us from the queue
   1830 		 * as a consequence of mutex_unlock().  We may have
   1831 		 * gotten a timeout from __lwp_park().  Or we may still
   1832 		 * be on the queue and this is just a spurious wakeup.
   1833 		 */
   1834 		qp = queue_lock(mp, MX);
   1835 		if (self->ul_sleepq == NULL) {
   1836 			if (error) {
   1837 				mp->mutex_waiters = queue_waiter(qp)? 1 : 0;
   1838 				if (error != EINTR)
   1839 					break;
   1840 				error = 0;
   1841 			}
   1842 			if (set_lock_byte(&mp->mutex_lockw) == 0) {
   1843 				mp->mutex_owner = (uintptr_t)self;
   1844 				break;
   1845 			}
   1846 			enqueue(qp, self, 0);
   1847 			mp->mutex_waiters = 1;
   1848 		}
   1849 		ASSERT(self->ul_sleepq == qp &&
   1850 		    self->ul_qtype == MX &&
   1851 		    self->ul_wchan == mp);
   1852 		if (error) {
   1853 			if (error != EINTR) {
   1854 				mp->mutex_waiters = dequeue_self(qp);
   1855 				break;
   1856 			}
   1857 			error = 0;
   1858 		}
   1859 	}
   1860 	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
   1861 	    self->ul_wchan == NULL);
   1862 	self->ul_sp = 0;
   1863 
   1864 	ASSERT(error == 0 || error == EINVAL || error == ETIME);
   1865 
   1866 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
   1867 		ASSERT(mp->mutex_type & LOCK_ROBUST);
   1868 		/*
   1869 		 * We shouldn't own the mutex.
   1870 		 * Just clear the lock; everyone has already been waked up.
   1871 		 */
   1872 		mp->mutex_owner = 0;
   1873 		(void) clear_lockbyte(&mp->mutex_lockword);
   1874 		error = ENOTRECOVERABLE;
   1875 	}
   1876 
   1877 	queue_unlock(qp);
   1878 
   1879 	if (msp)
   1880 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
   1881 
   1882 	if (error) {
   1883 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
   1884 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
   1885 	} else {
   1886 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
   1887 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   1888 		if (mp->mutex_flag & LOCK_OWNERDEAD) {
   1889 			ASSERT(mp->mutex_type & LOCK_ROBUST);
   1890 			error = EOWNERDEAD;
   1891 		}
   1892 	}
   1893 
   1894 	return (error);
   1895 }
   1896 
   1897 static int
   1898 mutex_recursion(mutex_t *mp, int mtype, int try)
   1899 {
   1900 	ASSERT(mutex_held(mp));
   1901 	ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
   1902 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
   1903 
   1904 	if (mtype & LOCK_RECURSIVE) {
   1905 		if (mp->mutex_rcount == RECURSION_MAX) {
   1906 			DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
   1907 			return (EAGAIN);
   1908 		}
   1909 		mp->mutex_rcount++;
   1910 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
   1911 		return (0);
   1912 	}
   1913 	if (try == MUTEX_LOCK) {
   1914 		DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
   1915 		return (EDEADLK);
   1916 	}
   1917 	return (EBUSY);
   1918 }
   1919 
   1920 /*
   1921  * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
   1922  * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
   1923  * We use tdb_hash_lock here and in the synch object tracking code in
   1924  * the tdb_agent.c file.  There is no conflict between these two usages.
   1925  */
   1926 void
   1927 register_lock(mutex_t *mp)
   1928 {
   1929 	uberdata_t *udp = curthread->ul_uberdata;
   1930 	uint_t hash = LOCK_HASH(mp);
   1931 	robust_t *rlp;
   1932 	robust_t *invalid;
   1933 	robust_t **rlpp;
   1934 	robust_t **table;
   1935 
   1936 	if ((table = udp->robustlocks) == NULL) {
   1937 		lmutex_lock(&udp->tdb_hash_lock);
   1938 		if ((table = udp->robustlocks) == NULL) {
   1939 			table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
   1940 			membar_producer();
   1941 			udp->robustlocks = table;
   1942 		}
   1943 		lmutex_unlock(&udp->tdb_hash_lock);
   1944 	}
   1945 	membar_consumer();
   1946 
   1947 	/*
   1948 	 * First search the registered table with no locks held.
   1949 	 * This is safe because the table never shrinks
   1950 	 * and we can only get a false negative.
   1951 	 */
   1952 	for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
   1953 		if (rlp->robust_lock == mp)	/* already registered */
   1954 			return;
   1955 	}
   1956 
   1957 	/*
   1958 	 * The lock was not found.
   1959 	 * Repeat the operation with tdb_hash_lock held.
   1960 	 */
   1961 	lmutex_lock(&udp->tdb_hash_lock);
   1962 
   1963 	invalid = NULL;
   1964 	for (rlpp = &table[hash];
   1965 	    (rlp = *rlpp) != NULL;
   1966 	    rlpp = &rlp->robust_next) {
   1967 		if (rlp->robust_lock == mp) {	/* already registered */
   1968 			lmutex_unlock(&udp->tdb_hash_lock);
   1969 			return;
   1970 		}
   1971 		/* remember the first invalid entry, if any */
   1972 		if (rlp->robust_lock == INVALID_ADDR && invalid == NULL)
   1973 			invalid = rlp;
   1974 	}
   1975 
   1976 	/*
   1977 	 * The lock has never been registered.
   1978 	 * Add it to the table and register it now.
   1979 	 */
   1980 	if ((rlp = invalid) != NULL) {
   1981 		/*
   1982 		 * Reuse the invalid entry we found above.
   1983 		 * The linkages are still correct.
   1984 		 */
   1985 		rlp->robust_lock = mp;
   1986 		membar_producer();
   1987 	} else {
   1988 		/*
   1989 		 * Allocate a new entry and add it to
   1990 		 * the hash table and to the global list.
   1991 		 */
   1992 		rlp = lmalloc(sizeof (*rlp));
   1993 		rlp->robust_lock = mp;
   1994 		rlp->robust_next = NULL;
   1995 		rlp->robust_list = udp->robustlist;
   1996 		udp->robustlist = rlp;
   1997 		membar_producer();
   1998 		*rlpp = rlp;
   1999 	}
   2000 
   2001 	lmutex_unlock(&udp->tdb_hash_lock);
   2002 
   2003 	(void) ___lwp_mutex_register(mp, &rlp->robust_lock);
   2004 }
   2005 
   2006 /*
   2007  * This is called in the child of fork()/forkall() to start over
   2008  * with a clean slate.  (Each process must register its own locks.)
   2009  * No locks are needed because all other threads are suspended or gone.
   2010  */
   2011 void
   2012 unregister_locks(void)
   2013 {
   2014 	uberdata_t *udp = curthread->ul_uberdata;
   2015 	robust_t **table;
   2016 	robust_t *rlp;
   2017 	robust_t *next;
   2018 
   2019 	/*
   2020 	 * Do this first, before calling lfree().
   2021 	 */
   2022 	table = udp->robustlocks;
   2023 	udp->robustlocks = NULL;
   2024 	rlp = udp->robustlist;
   2025 	udp->robustlist = NULL;
   2026 
   2027 	/*
   2028 	 * Do this by traversing the global list, not the hash table.
   2029 	 */
   2030 	while (rlp != NULL) {
   2031 		next = rlp->robust_list;
   2032 		lfree(rlp, sizeof (*rlp));
   2033 		rlp = next;
   2034 	}
   2035 	if (table != NULL)
   2036 		lfree(table, LOCKHASHSZ * sizeof (robust_t *));
   2037 }
   2038 
   2039 /*
   2040  * Returns with mutex_owner set correctly.
   2041  */
   2042 int
   2043 mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
   2044 {
   2045 	ulwp_t *self = curthread;
   2046 	uberdata_t *udp = self->ul_uberdata;
   2047 	int mtype = mp->mutex_type;
   2048 	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
   2049 	int error = 0;
   2050 	int noceil = try & MUTEX_NOCEIL;
   2051 	uint8_t ceil;
   2052 	int myprio;
   2053 
   2054 	try &= ~MUTEX_NOCEIL;
   2055 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
   2056 
   2057 	if (!self->ul_schedctl_called)
   2058 		(void) setup_schedctl();
   2059 
   2060 	if (msp && try == MUTEX_TRY)
   2061 		tdb_incr(msp->mutex_try);
   2062 
   2063 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_held(mp))
   2064 		return (mutex_recursion(mp, mtype, try));
   2065 
   2066 	if (self->ul_error_detection && try == MUTEX_LOCK &&
   2067 	    tsp == NULL && mutex_held(mp))
   2068 		lock_error(mp, "mutex_lock", NULL, NULL);
   2069 
   2070 	if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
   2071 		update_sched(self);
   2072 		if (self->ul_cid != self->ul_rtclassid) {
   2073 			DTRACE_PROBE2(plockstat, mutex__error, mp, EPERM);
   2074 			return (EPERM);
   2075 		}
   2076 		ceil = mp->mutex_ceiling;
   2077 		myprio = self->ul_epri? self->ul_epri : self->ul_pri;
   2078 		if (myprio > ceil) {
   2079 			DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
   2080 			return (EINVAL);
   2081 		}
   2082 		if ((error = _ceil_mylist_add(mp)) != 0) {
   2083 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
   2084 			return (error);
   2085 		}
   2086 		if (myprio < ceil)
   2087 			_ceil_prio_inherit(ceil);
   2088 	}
   2089 
   2090 	if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
   2091 	    == (USYNC_PROCESS | LOCK_ROBUST))
   2092 		register_lock(mp);
   2093 
   2094 	if (mtype & LOCK_PRIO_INHERIT) {
   2095 		/* go straight to the kernel */
   2096 		if (try == MUTEX_TRY)
   2097 			error = mutex_trylock_kernel(mp);
   2098 		else	/* MUTEX_LOCK */
   2099 			error = mutex_lock_kernel(mp, tsp, msp);
   2100 		/*
   2101 		 * The kernel never sets or clears the lock byte
   2102 		 * for LOCK_PRIO_INHERIT mutexes.
   2103 		 * Set it here for consistency.
   2104 		 */
   2105 		switch (error) {
   2106 		case 0:
   2107 			self->ul_pilocks++;
   2108 			mp->mutex_lockw = LOCKSET;
   2109 			break;
   2110 		case EOWNERDEAD:
   2111 		case ELOCKUNMAPPED:
   2112 			self->ul_pilocks++;
   2113 			mp->mutex_lockw = LOCKSET;
   2114 			/* FALLTHROUGH */
   2115 		case ENOTRECOVERABLE:
   2116 			ASSERT(mtype & LOCK_ROBUST);
   2117 			break;
   2118 		case EDEADLK:
   2119 			if (try == MUTEX_TRY) {
   2120 				error = EBUSY;
   2121 			} else if (tsp != NULL) {	/* simulate a timeout */
   2122 				/*
   2123 				 * Note: mutex_timedlock() never returns EINTR.
   2124 				 */
   2125 				timespec_t ts = *tsp;
   2126 				timespec_t rts;
   2127 
   2128 				while (__nanosleep(&ts, &rts) == EINTR)
   2129 					ts = rts;
   2130 				error = ETIME;
   2131 			} else {		/* simulate a deadlock */
   2132 				stall();
   2133 			}
   2134 			break;
   2135 		}
   2136 	} else if (mtype & USYNC_PROCESS) {
   2137 		error = mutex_trylock_process(mp, try == MUTEX_LOCK);
   2138 		if (error == EBUSY && try == MUTEX_LOCK)
   2139 			error = mutex_lock_kernel(mp, tsp, msp);
   2140 	} else {	/* USYNC_THREAD */
   2141 		error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
   2142 		if (error == EBUSY && try == MUTEX_LOCK)
   2143 			error = mutex_lock_queue(self, msp, mp, tsp);
   2144 	}
   2145 
   2146 	switch (error) {
   2147 	case 0:
   2148 	case EOWNERDEAD:
   2149 	case ELOCKUNMAPPED:
   2150 		if (mtype & LOCK_ROBUST)
   2151 			remember_lock(mp);
   2152 		if (msp)
   2153 			record_begin_hold(msp);
   2154 		break;
   2155 	default:
   2156 		if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
   2157 			(void) _ceil_mylist_del(mp);
   2158 			if (myprio < ceil)
   2159 				_ceil_prio_waive();
   2160 		}
   2161 		if (try == MUTEX_TRY) {
   2162 			if (msp)
   2163 				tdb_incr(msp->mutex_try_fail);
   2164 			if (__td_event_report(self, TD_LOCK_TRY, udp)) {
   2165 				self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
   2166 				tdb_event(TD_LOCK_TRY, udp);
   2167 			}
   2168 		}
   2169 		break;
   2170 	}
   2171 
   2172 	return (error);
   2173 }
   2174 
   2175 int
   2176 fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
   2177 {
   2178 	ulwp_t *self = curthread;
   2179 	uberdata_t *udp = self->ul_uberdata;
   2180 
   2181 	/*
   2182 	 * We know that USYNC_PROCESS is set in mtype and that
   2183 	 * zero, one, or both of the flags LOCK_RECURSIVE and
   2184 	 * LOCK_ERRORCHECK are set, and that no other flags are set.
   2185 	 */
   2186 	ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
   2187 	enter_critical(self);
   2188 #if defined(__sparc) && !defined(_LP64)
   2189 	/* horrible hack, necessary only on 32-bit sparc */
   2190 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
   2191 	    self->ul_misaligned) {
   2192 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
   2193 			mp->mutex_ownerpid = udp->pid;
   2194 			mp->mutex_owner = (uintptr_t)self;
   2195 			exit_critical(self);
   2196 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   2197 			return (0);
   2198 		}
   2199 	} else
   2200 #endif
   2201 	if (set_lock_byte64(&mp->mutex_lockword64, udp->pid) == 0) {
   2202 		mp->mutex_owner = (uintptr_t)self;
   2203 		/* mp->mutex_ownerpid was set by set_lock_byte64() */
   2204 		exit_critical(self);
   2205 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   2206 		return (0);
   2207 	}
   2208 	exit_critical(self);
   2209 
   2210 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
   2211 		return (mutex_recursion(mp, mtype, try));
   2212 
   2213 	if (try == MUTEX_LOCK) {
   2214 		if (mutex_trylock_process(mp, 1) == 0)
   2215 			return (0);
   2216 		return (mutex_lock_kernel(mp, tsp, NULL));
   2217 	}
   2218 
   2219 	if (__td_event_report(self, TD_LOCK_TRY, udp)) {
   2220 		self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
   2221 		tdb_event(TD_LOCK_TRY, udp);
   2222 	}
   2223 	return (EBUSY);
   2224 }
   2225 
   2226 static int
   2227 mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
   2228 {
   2229 	ulwp_t *self = curthread;
   2230 	int mtype = mp->mutex_type;
   2231 	uberflags_t *gflags;
   2232 
   2233 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
   2234 	    self->ul_error_detection && self->ul_misaligned == 0)
   2235 		lock_error(mp, "mutex_lock", NULL, "mutex is misaligned");
   2236 
   2237 	/*
   2238 	 * Optimize the case of USYNC_THREAD, including
   2239 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
   2240 	 * no error detection, no lock statistics,
   2241 	 * and the process has only a single thread.
   2242 	 * (Most likely a traditional single-threaded application.)
   2243 	 */
   2244 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
   2245 	    self->ul_uberdata->uberflags.uf_all) == 0) {
   2246 		/*
   2247 		 * Only one thread exists so we don't need an atomic operation.
   2248 		 * We do, however, need to protect against signals.
   2249 		 */
   2250 		if (mp->mutex_lockw == 0) {
   2251 			sigoff(self);
   2252 			mp->mutex_lockw = LOCKSET;
   2253 			mp->mutex_owner = (uintptr_t)self;
   2254 			sigon(self);
   2255 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   2256 			return (0);
   2257 		}
   2258 		if (mtype && MUTEX_OWNER(mp) == self)
   2259 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
   2260 		/*
   2261 		 * We have reached a deadlock, probably because the
   2262 		 * process is executing non-async-signal-safe code in
   2263 		 * a signal handler and is attempting to acquire a lock
   2264 		 * that it already owns.  This is not surprising, given
   2265 		 * bad programming practices over the years that has
   2266 		 * resulted in applications calling printf() and such
   2267 		 * in their signal handlers.  Unless the user has told
   2268 		 * us that the signal handlers are safe by setting:
   2269 		 *	export _THREAD_ASYNC_SAFE=1
   2270 		 * we return EDEADLK rather than actually deadlocking.
   2271 		 */
   2272 		if (tsp == NULL &&
   2273 		    MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
   2274 			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
   2275 			return (EDEADLK);
   2276 		}
   2277 	}
   2278 
   2279 	/*
   2280 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
   2281 	 * no error detection, and no lock statistics.
   2282 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
   2283 	 */
   2284 	if ((gflags = self->ul_schedctl_called) != NULL &&
   2285 	    (gflags->uf_trs_ted |
   2286 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
   2287 		if (mtype & USYNC_PROCESS)
   2288 			return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
   2289 		sigoff(self);
   2290 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
   2291 			mp->mutex_owner = (uintptr_t)self;
   2292 			sigon(self);
   2293 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   2294 			return (0);
   2295 		}
   2296 		sigon(self);
   2297 		if (mtype && MUTEX_OWNER(mp) == self)
   2298 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
   2299 		if (mutex_trylock_adaptive(mp, 1) != 0)
   2300 			return (mutex_lock_queue(self, NULL, mp, tsp));
   2301 		return (0);
   2302 	}
   2303 
   2304 	/* else do it the long way */
   2305 	return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
   2306 }
   2307 
   2308 #pragma weak pthread_mutex_lock = mutex_lock
   2309 #pragma weak _mutex_lock = mutex_lock
   2310 int
   2311 mutex_lock(mutex_t *mp)
   2312 {
   2313 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
   2314 	return (mutex_lock_impl(mp, NULL));
   2315 }
   2316 
   2317 int
   2318 pthread_mutex_timedlock(pthread_mutex_t *_RESTRICT_KYWD mp,
   2319 	const struct timespec *_RESTRICT_KYWD abstime)
   2320 {
   2321 	timespec_t tslocal;
   2322 	int error;
   2323 
   2324 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
   2325 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
   2326 	error = mutex_lock_impl((mutex_t *)mp, &tslocal);
   2327 	if (error == ETIME)
   2328 		error = ETIMEDOUT;
   2329 	return (error);
   2330 }
   2331 
   2332 int
   2333 pthread_mutex_reltimedlock_np(pthread_mutex_t *_RESTRICT_KYWD mp,
   2334 	const struct timespec *_RESTRICT_KYWD reltime)
   2335 {
   2336 	timespec_t tslocal;
   2337 	int error;
   2338 
   2339 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
   2340 	tslocal = *reltime;
   2341 	error = mutex_lock_impl((mutex_t *)mp, &tslocal);
   2342 	if (error == ETIME)
   2343 		error = ETIMEDOUT;
   2344 	return (error);
   2345 }
   2346 
   2347 #pragma weak pthread_mutex_trylock = mutex_trylock
   2348 int
   2349 mutex_trylock(mutex_t *mp)
   2350 {
   2351 	ulwp_t *self = curthread;
   2352 	uberdata_t *udp = self->ul_uberdata;
   2353 	int mtype = mp->mutex_type;
   2354 	uberflags_t *gflags;
   2355 
   2356 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
   2357 
   2358 	/*
   2359 	 * Optimize the case of USYNC_THREAD, including
   2360 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
   2361 	 * no error detection, no lock statistics,
   2362 	 * and the process has only a single thread.
   2363 	 * (Most likely a traditional single-threaded application.)
   2364 	 */
   2365 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
   2366 	    udp->uberflags.uf_all) == 0) {
   2367 		/*
   2368 		 * Only one thread exists so we don't need an atomic operation.
   2369 		 * We do, however, need to protect against signals.
   2370 		 */
   2371 		if (mp->mutex_lockw == 0) {
   2372 			sigoff(self);
   2373 			mp->mutex_lockw = LOCKSET;
   2374 			mp->mutex_owner = (uintptr_t)self;
   2375 			sigon(self);
   2376 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   2377 			return (0);
   2378 		}
   2379 		if (mtype && MUTEX_OWNER(mp) == self)
   2380 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
   2381 		return (EBUSY);
   2382 	}
   2383 
   2384 	/*
   2385 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
   2386 	 * no error detection, and no lock statistics.
   2387 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
   2388 	 */
   2389 	if ((gflags = self->ul_schedctl_called) != NULL &&
   2390 	    (gflags->uf_trs_ted |
   2391 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
   2392 		if (mtype & USYNC_PROCESS)
   2393 			return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
   2394 		sigoff(self);
   2395 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
   2396 			mp->mutex_owner = (uintptr_t)self;
   2397 			sigon(self);
   2398 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   2399 			return (0);
   2400 		}
   2401 		sigon(self);
   2402 		if (mtype && MUTEX_OWNER(mp) == self)
   2403 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
   2404 		if (__td_event_report(self, TD_LOCK_TRY, udp)) {
   2405 			self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
   2406 			tdb_event(TD_LOCK_TRY, udp);
   2407 		}
   2408 		return (EBUSY);
   2409 	}
   2410 
   2411 	/* else do it the long way */
   2412 	return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
   2413 }
   2414 
   2415 int
   2416 mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
   2417 {
   2418 	ulwp_t *self = curthread;
   2419 	uberdata_t *udp = self->ul_uberdata;
   2420 	int mtype = mp->mutex_type;
   2421 	tdb_mutex_stats_t *msp;
   2422 	int error = 0;
   2423 	int release_all;
   2424 	lwpid_t lwpid;
   2425 
   2426 	if ((mtype & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
   2427 	    !mutex_held(mp))
   2428 		return (EPERM);
   2429 
   2430 	if (self->ul_error_detection && !mutex_held(mp))
   2431 		lock_error(mp, "mutex_unlock", NULL, NULL);
   2432 
   2433 	if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
   2434 		mp->mutex_rcount--;
   2435 		DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
   2436 		return (0);
   2437 	}
   2438 
   2439 	if ((msp = MUTEX_STATS(mp, udp)) != NULL)
   2440 		(void) record_hold_time(msp);
   2441 
   2442 	if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
   2443 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
   2444 		ASSERT(mtype & LOCK_ROBUST);
   2445 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
   2446 		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
   2447 	}
   2448 	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
   2449 
   2450 	if (mtype & LOCK_PRIO_INHERIT) {
   2451 		no_preempt(self);
   2452 		mp->mutex_owner = 0;
   2453 		/* mp->mutex_ownerpid is cleared by ___lwp_mutex_unlock() */
   2454 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
   2455 		mp->mutex_lockw = LOCKCLEAR;
   2456 		self->ul_pilocks--;
   2457 		error = ___lwp_mutex_unlock(mp);
   2458 		preempt(self);
   2459 	} else if (mtype & USYNC_PROCESS) {
   2460 		mutex_unlock_process(mp, release_all);
   2461 	} else {	/* USYNC_THREAD */
   2462 		if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
   2463 			(void) __lwp_unpark(lwpid);
   2464 			preempt(self);
   2465 		}
   2466 	}
   2467 
   2468 	if (mtype & LOCK_ROBUST)
   2469 		forget_lock(mp);
   2470 
   2471 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
   2472 		_ceil_prio_waive();
   2473 
   2474 	return (error);
   2475 }
   2476 
   2477 #pragma weak pthread_mutex_unlock = mutex_unlock
   2478 #pragma weak _mutex_unlock = mutex_unlock
   2479 int
   2480 mutex_unlock(mutex_t *mp)
   2481 {
   2482 	ulwp_t *self = curthread;
   2483 	int mtype = mp->mutex_type;
   2484 	uberflags_t *gflags;
   2485 	lwpid_t lwpid;
   2486 	short el;
   2487 
   2488 	/*
   2489 	 * Optimize the case of USYNC_THREAD, including
   2490 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
   2491 	 * no error detection, no lock statistics,
   2492 	 * and the process has only a single thread.
   2493 	 * (Most likely a traditional single-threaded application.)
   2494 	 */
   2495 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
   2496 	    self->ul_uberdata->uberflags.uf_all) == 0) {
   2497 		if (mtype) {
   2498 			/*
   2499 			 * At this point we know that one or both of the
   2500 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
   2501 			 */
   2502 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
   2503 				return (EPERM);
   2504 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
   2505 				mp->mutex_rcount--;
   2506 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
   2507 				return (0);
   2508 			}
   2509 		}
   2510 		/*
   2511 		 * Only one thread exists so we don't need an atomic operation.
   2512 		 * Also, there can be no waiters.
   2513 		 */
   2514 		sigoff(self);
   2515 		mp->mutex_owner = 0;
   2516 		mp->mutex_lockword = 0;
   2517 		sigon(self);
   2518 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
   2519 		return (0);
   2520 	}
   2521 
   2522 	/*
   2523 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
   2524 	 * no error detection, and no lock statistics.
   2525 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
   2526 	 */
   2527 	if ((gflags = self->ul_schedctl_called) != NULL) {
   2528 		if (((el = gflags->uf_trs_ted) | mtype) == 0) {
   2529 fast_unlock:
   2530 			if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
   2531 				(void) __lwp_unpark(lwpid);
   2532 				preempt(self);
   2533 			}
   2534 			return (0);
   2535 		}
   2536 		if (el)		/* error detection or lock statistics */
   2537 			goto slow_unlock;
   2538 		if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
   2539 			/*
   2540 			 * At this point we know that one or both of the
   2541 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
   2542 			 */
   2543 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
   2544 				return (EPERM);
   2545 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
   2546 				mp->mutex_rcount--;
   2547 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
   2548 				return (0);
   2549 			}
   2550 			goto fast_unlock;
   2551 		}
   2552 		if ((mtype &
   2553 		    ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
   2554 			/*
   2555 			 * At this point we know that zero, one, or both of the
   2556 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
   2557 			 * that the USYNC_PROCESS flag is set.
   2558 			 */
   2559 			if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
   2560 				return (EPERM);
   2561 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
   2562 				mp->mutex_rcount--;
   2563 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
   2564 				return (0);
   2565 			}
   2566 			mutex_unlock_process(mp, 0);
   2567 			return (0);
   2568 		}
   2569 	}
   2570 
   2571 	/* else do it the long way */
   2572 slow_unlock:
   2573 	return (mutex_unlock_internal(mp, 0));
   2574 }
   2575 
   2576 /*
   2577  * Internally to the library, almost all mutex lock/unlock actions
   2578  * go through these lmutex_ functions, to protect critical regions.
   2579  * We replicate a bit of code from mutex_lock() and mutex_unlock()
   2580  * to make these functions faster since we know that the mutex type
   2581  * of all internal locks is USYNC_THREAD.  We also know that internal
   2582  * locking can never fail, so we panic if it does.
   2583  */
   2584 void
   2585 lmutex_lock(mutex_t *mp)
   2586 {
   2587 	ulwp_t *self = curthread;
   2588 	uberdata_t *udp = self->ul_uberdata;
   2589 
   2590 	ASSERT(mp->mutex_type == USYNC_THREAD);
   2591 
   2592 	enter_critical(self);
   2593 	/*
   2594 	 * Optimize the case of no lock statistics and only a single thread.
   2595 	 * (Most likely a traditional single-threaded application.)
   2596 	 */
   2597 	if (udp->uberflags.uf_all == 0) {
   2598 		/*
   2599 		 * Only one thread exists; the mutex must be free.
   2600 		 */
   2601 		ASSERT(mp->mutex_lockw == 0);
   2602 		mp->mutex_lockw = LOCKSET;
   2603 		mp->mutex_owner = (uintptr_t)self;
   2604 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   2605 	} else {
   2606 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
   2607 
   2608 		if (!self->ul_schedctl_called)
   2609 			(void) setup_schedctl();
   2610 
   2611 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
   2612 			mp->mutex_owner = (uintptr_t)self;
   2613 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   2614 		} else if (mutex_trylock_adaptive(mp, 1) != 0) {
   2615 			(void) mutex_lock_queue(self, msp, mp, NULL);
   2616 		}
   2617 
   2618 		if (msp)
   2619 			record_begin_hold(msp);
   2620 	}
   2621 }
   2622 
   2623 void
   2624 lmutex_unlock(mutex_t *mp)
   2625 {
   2626 	ulwp_t *self = curthread;
   2627 	uberdata_t *udp = self->ul_uberdata;
   2628 
   2629 	ASSERT(mp->mutex_type == USYNC_THREAD);
   2630 
   2631 	/*
   2632 	 * Optimize the case of no lock statistics and only a single thread.
   2633 	 * (Most likely a traditional single-threaded application.)
   2634 	 */
   2635 	if (udp->uberflags.uf_all == 0) {
   2636 		/*
   2637 		 * Only one thread exists so there can be no waiters.
   2638 		 */
   2639 		mp->mutex_owner = 0;
   2640 		mp->mutex_lockword = 0;
   2641 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
   2642 	} else {
   2643 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
   2644 		lwpid_t lwpid;
   2645 
   2646 		if (msp)
   2647 			(void) record_hold_time(msp);
   2648 		if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
   2649 			(void) __lwp_unpark(lwpid);
   2650 			preempt(self);
   2651 		}
   2652 	}
   2653 	exit_critical(self);
   2654 }
   2655 
   2656 /*
   2657  * For specialized code in libc, like the asynchronous i/o code,
   2658  * the following sig_*() locking primitives are used in order
   2659  * to make the code asynchronous signal safe.  Signals are
   2660  * deferred while locks acquired by these functions are held.
   2661  */
   2662 void
   2663 sig_mutex_lock(mutex_t *mp)
   2664 {
   2665 	ulwp_t *self = curthread;
   2666 
   2667 	sigoff(self);
   2668 	(void) mutex_lock(mp);
   2669 }
   2670 
   2671 void
   2672 sig_mutex_unlock(mutex_t *mp)
   2673 {
   2674 	ulwp_t *self = curthread;
   2675 
   2676 	(void) mutex_unlock(mp);
   2677 	sigon(self);
   2678 }
   2679 
   2680 int
   2681 sig_mutex_trylock(mutex_t *mp)
   2682 {
   2683 	ulwp_t *self = curthread;
   2684 	int error;
   2685 
   2686 	sigoff(self);
   2687 	if ((error = mutex_trylock(mp)) != 0)
   2688 		sigon(self);
   2689 	return (error);
   2690 }
   2691 
   2692 /*
   2693  * sig_cond_wait() is a cancellation point.
   2694  */
   2695 int
   2696 sig_cond_wait(cond_t *cv, mutex_t *mp)
   2697 {
   2698 	int error;
   2699 
   2700 	ASSERT(curthread->ul_sigdefer != 0);
   2701 	pthread_testcancel();
   2702 	error = __cond_wait(cv, mp);
   2703 	if (error == EINTR && curthread->ul_cursig) {
   2704 		sig_mutex_unlock(mp);
   2705 		/* take the deferred signal here */
   2706 		sig_mutex_lock(mp);
   2707 	}
   2708 	pthread_testcancel();
   2709 	return (error);
   2710 }
   2711 
   2712 /*
   2713  * sig_cond_reltimedwait() is a cancellation point.
   2714  */
   2715 int
   2716 sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
   2717 {
   2718 	int error;
   2719 
   2720 	ASSERT(curthread->ul_sigdefer != 0);
   2721 	pthread_testcancel();
   2722 	error = __cond_reltimedwait(cv, mp, ts);
   2723 	if (error == EINTR && curthread->ul_cursig) {
   2724 		sig_mutex_unlock(mp);
   2725 		/* take the deferred signal here */
   2726 		sig_mutex_lock(mp);
   2727 	}
   2728 	pthread_testcancel();
   2729 	return (error);
   2730 }
   2731 
   2732 /*
   2733  * For specialized code in libc, like the stdio code.
   2734  * the following cancel_safe_*() locking primitives are used in
   2735  * order to make the code cancellation-safe.  Cancellation is
   2736  * deferred while locks acquired by these functions are held.
   2737  */
   2738 void
   2739 cancel_safe_mutex_lock(mutex_t *mp)
   2740 {
   2741 	(void) mutex_lock(mp);
   2742 	curthread->ul_libc_locks++;
   2743 }
   2744 
   2745 int
   2746 cancel_safe_mutex_trylock(mutex_t *mp)
   2747 {
   2748 	int error;
   2749 
   2750 	if ((error = mutex_trylock(mp)) == 0)
   2751 		curthread->ul_libc_locks++;
   2752 	return (error);
   2753 }
   2754 
   2755 void
   2756 cancel_safe_mutex_unlock(mutex_t *mp)
   2757 {
   2758 	ulwp_t *self = curthread;
   2759 
   2760 	ASSERT(self->ul_libc_locks != 0);
   2761 
   2762 	(void) mutex_unlock(mp);
   2763 
   2764 	/*
   2765 	 * Decrement the count of locks held by cancel_safe_mutex_lock().
   2766 	 * If we are then in a position to terminate cleanly and
   2767 	 * if there is a pending cancellation and cancellation
   2768 	 * is not disabled and we received EINTR from a recent
   2769 	 * system call then perform the cancellation action now.
   2770 	 */
   2771 	if (--self->ul_libc_locks == 0 &&
   2772 	    !(self->ul_vfork | self->ul_nocancel |
   2773 	    self->ul_critical | self->ul_sigdefer) &&
   2774 	    cancel_active())
   2775 		pthread_exit(PTHREAD_CANCELED);
   2776 }
   2777 
   2778 static int
   2779 shared_mutex_held(mutex_t *mparg)
   2780 {
   2781 	/*
   2782 	 * The 'volatile' is necessary to make sure the compiler doesn't
   2783 	 * reorder the tests of the various components of the mutex.
   2784 	 * They must be tested in this order:
   2785 	 *	mutex_lockw
   2786 	 *	mutex_owner
   2787 	 *	mutex_ownerpid
   2788 	 * This relies on the fact that everywhere mutex_lockw is cleared,
   2789 	 * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
   2790 	 * is cleared, and that everywhere mutex_lockw is set, mutex_owner
   2791 	 * and mutex_ownerpid are set after mutex_lockw is set, and that
   2792 	 * mutex_lockw is set or cleared with a memory barrier.
   2793 	 */
   2794 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
   2795 	ulwp_t *self = curthread;
   2796 	uberdata_t *udp = self->ul_uberdata;
   2797 
   2798 	return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
   2799 }
   2800 
   2801 #pragma weak _mutex_held = mutex_held
   2802 int
   2803 mutex_held(mutex_t *mparg)
   2804 {
   2805 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
   2806 
   2807 	if (mparg->mutex_type & USYNC_PROCESS)
   2808 		return (shared_mutex_held(mparg));
   2809 	return (MUTEX_OWNED(mp, curthread));
   2810 }
   2811 
   2812 #pragma weak pthread_mutex_destroy = mutex_destroy
   2813 #pragma weak _mutex_destroy = mutex_destroy
   2814 int
   2815 mutex_destroy(mutex_t *mp)
   2816 {
   2817 	if (mp->mutex_type & USYNC_PROCESS)
   2818 		forget_lock(mp);
   2819 	(void) memset(mp, 0, sizeof (*mp));
   2820 	tdb_sync_obj_deregister(mp);
   2821 	return (0);
   2822 }
   2823 
   2824 #pragma weak pthread_mutex_consistent_np = mutex_consistent
   2825 #pragma weak pthread_mutex_consistent = mutex_consistent
   2826 int
   2827 mutex_consistent(mutex_t *mp)
   2828 {
   2829 	/*
   2830 	 * Do this only for an inconsistent, initialized robust lock
   2831 	 * that we hold.  For all other cases, return EINVAL.
   2832 	 */
   2833 	if (mutex_held(mp) &&
   2834 	    (mp->mutex_type & LOCK_ROBUST) &&
   2835 	    (mp->mutex_flag & LOCK_INITED) &&
   2836 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
   2837 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
   2838 		mp->mutex_rcount = 0;
   2839 		return (0);
   2840 	}
   2841 	return (EINVAL);
   2842 }
   2843 
   2844 /*
   2845  * Spin locks are separate from ordinary mutexes,
   2846  * but we use the same data structure for them.
   2847  */
   2848 
   2849 int
   2850 pthread_spin_init(pthread_spinlock_t *lock, int pshared)
   2851 {
   2852 	mutex_t *mp = (mutex_t *)lock;
   2853 
   2854 	(void) memset(mp, 0, sizeof (*mp));
   2855 	if (pshared == PTHREAD_PROCESS_SHARED)
   2856 		mp->mutex_type = USYNC_PROCESS;
   2857 	else
   2858 		mp->mutex_type = USYNC_THREAD;
   2859 	mp->mutex_flag = LOCK_INITED;
   2860 	mp->mutex_magic = MUTEX_MAGIC;
   2861 
   2862 	/*
   2863 	 * This should be at the beginning of the function,
   2864 	 * but for the sake of old broken applications that
   2865 	 * do not have proper alignment for their mutexes
   2866 	 * (and don't check the return code from pthread_spin_init),
   2867 	 * we put it here, after initializing the mutex regardless.
   2868 	 */
   2869 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
   2870 	    curthread->ul_misaligned == 0)
   2871 		return (EINVAL);
   2872 
   2873 	return (0);
   2874 }
   2875 
   2876 int
   2877 pthread_spin_destroy(pthread_spinlock_t *lock)
   2878 {
   2879 	(void) memset(lock, 0, sizeof (*lock));
   2880 	return (0);
   2881 }
   2882 
   2883 int
   2884 pthread_spin_trylock(pthread_spinlock_t *lock)
   2885 {
   2886 	mutex_t *mp = (mutex_t *)lock;
   2887 	ulwp_t *self = curthread;
   2888 	int error = 0;
   2889 
   2890 	no_preempt(self);
   2891 	if (set_lock_byte(&mp->mutex_lockw) != 0)
   2892 		error = EBUSY;
   2893 	else {
   2894 		mp->mutex_owner = (uintptr_t)self;
   2895 		if (mp->mutex_type == USYNC_PROCESS)
   2896 			mp->mutex_ownerpid = self->ul_uberdata->pid;
   2897 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   2898 	}
   2899 	preempt(self);
   2900 	return (error);
   2901 }
   2902 
   2903 int
   2904 pthread_spin_lock(pthread_spinlock_t *lock)
   2905 {
   2906 	mutex_t *mp = (mutex_t *)lock;
   2907 	ulwp_t *self = curthread;
   2908 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
   2909 	int count = 0;
   2910 
   2911 	ASSERT(!self->ul_critical || self->ul_bindflags);
   2912 
   2913 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
   2914 
   2915 	/*
   2916 	 * We don't care whether the owner is running on a processor.
   2917 	 * We just spin because that's what this interface requires.
   2918 	 */
   2919 	for (;;) {
   2920 		if (*lockp == 0) {	/* lock byte appears to be clear */
   2921 			no_preempt(self);
   2922 			if (set_lock_byte(lockp) == 0)
   2923 				break;
   2924 			preempt(self);
   2925 		}
   2926 		if (count < INT_MAX)
   2927 			count++;
   2928 		SMT_PAUSE();
   2929 	}
   2930 	mp->mutex_owner = (uintptr_t)self;
   2931 	if (mp->mutex_type == USYNC_PROCESS)
   2932 		mp->mutex_ownerpid = self->ul_uberdata->pid;
   2933 	preempt(self);
   2934 	if (count) {
   2935 		DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
   2936 	}
   2937 	DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
   2938 	return (0);
   2939 }
   2940 
/*
 * Release a spin lock.
 *
 * No check is made that the caller actually owns the lock.
 * Always returns 0.
 */
int
pthread_spin_unlock(pthread_spinlock_t *lock)
{
	mutex_t *mp = (mutex_t *)lock;
	ulwp_t *self = curthread;

	no_preempt(self);
	/*
	 * Clear the ownership fields before the lock word itself is
	 * cleared; shared_mutex_held() depends on this ordering.
	 */
	mp->mutex_owner = 0;
	mp->mutex_ownerpid = 0;
	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
	/* Atomically zero the lock word; its previous value is ignored. */
	(void) atomic_swap_32(&mp->mutex_lockword, 0);
	preempt(self);
	return (0);
}
   2955 
   2956 #define	INITIAL_LOCKS	8	/* initial size of ul_heldlocks.array */
   2957 
   2958 /*
   2959  * Find/allocate an entry for 'lock' in our array of held locks.
   2960  */
   2961 static mutex_t **
   2962 find_lock_entry(mutex_t *lock)
   2963 {
   2964 	ulwp_t *self = curthread;
   2965 	mutex_t **remembered = NULL;
   2966 	mutex_t **lockptr;
   2967 	uint_t nlocks;
   2968 
   2969 	if ((nlocks = self->ul_heldlockcnt) != 0)
   2970 		lockptr = self->ul_heldlocks.array;
   2971 	else {
   2972 		nlocks = 1;
   2973 		lockptr = &self->ul_heldlocks.single;
   2974 	}
   2975 
   2976 	for (; nlocks; nlocks--, lockptr++) {
   2977 		if (*lockptr == lock)
   2978 			return (lockptr);
   2979 		if (*lockptr == NULL && remembered == NULL)
   2980 			remembered = lockptr;
   2981 	}
   2982 	if (remembered != NULL) {
   2983 		*remembered = lock;
   2984 		return (remembered);
   2985 	}
   2986 
   2987 	/*
   2988 	 * No entry available.  Allocate more space, converting
   2989 	 * the single entry into an array of entries if necessary.
   2990 	 */
   2991 	if ((nlocks = self->ul_heldlockcnt) == 0) {
   2992 		/*
   2993 		 * Initial allocation of the array.
   2994 		 * Convert the single entry into an array.
   2995 		 */
   2996 		self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
   2997 		lockptr = lmalloc(nlocks * sizeof (mutex_t *));
   2998 		/*
   2999 		 * The single entry becomes the first entry in the array.
   3000 		 */
   3001 		*lockptr = self->ul_heldlocks.single;
   3002 		self->ul_heldlocks.array = lockptr;
   3003 		/*
   3004 		 * Return the next available entry in the array.
   3005 		 */
   3006 		*++lockptr = lock;
   3007 		return (lockptr);
   3008 	}
   3009 	/*
   3010 	 * Reallocate the array, double the size each time.
   3011 	 */
   3012 	lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
   3013 	(void) memcpy(lockptr, self->ul_heldlocks.array,
   3014 	    nlocks * sizeof (mutex_t *));
   3015 	lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
   3016 	self->ul_heldlocks.array = lockptr;
   3017 	self->ul_heldlockcnt *= 2;
   3018 	/*
   3019 	 * Return the next available entry in the newly allocated array.
   3020 	 */
   3021 	*(lockptr += nlocks) = lock;
   3022 	return (lockptr);
   3023 }
   3024 
   3025 /*
   3026  * Insert 'lock' into our list of held locks.
   3027  * Currently only used for LOCK_ROBUST mutexes.
   3028  */
   3029 void
   3030 remember_lock(mutex_t *lock)
   3031 {
   3032 	(void) find_lock_entry(lock);
   3033 }
   3034 
   3035 /*
   3036  * Remove 'lock' from our list of held locks.
   3037  * Currently only used for LOCK_ROBUST mutexes.
   3038  */
   3039 void
   3040 forget_lock(mutex_t *lock)
   3041 {
   3042 	*find_lock_entry(lock) = NULL;
   3043 }
   3044 
   3045 /*
   3046  * Free the array of held locks.
   3047  */
   3048 void
   3049 heldlock_free(ulwp_t *ulwp)
   3050 {
   3051 	uint_t nlocks;
   3052 
   3053 	if ((nlocks = ulwp->ul_heldlockcnt) != 0)
   3054 		lfree(ulwp->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
   3055 	ulwp->ul_heldlockcnt = 0;
   3056 	ulwp->ul_heldlocks.array = NULL;
   3057 }
   3058 
   3059 /*
   3060  * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
   3061  * Called from _thrp_exit() to deal with abandoned locks.
   3062  */
   3063 void
   3064 heldlock_exit(void)
   3065 {
   3066 	ulwp_t *self = curthread;
   3067 	mutex_t **lockptr;
   3068 	uint_t nlocks;
   3069 	mutex_t *mp;
   3070 
   3071 	if ((nlocks = self->ul_heldlockcnt) != 0)
   3072 		lockptr = self->ul_heldlocks.array;
   3073 	else {
   3074 		nlocks = 1;
   3075 		lockptr = &self->ul_heldlocks.single;
   3076 	}
   3077 
   3078 	for (; nlocks; nlocks--, lockptr++) {
   3079 		/*
   3080 		 * The kernel takes care of transitioning held
   3081 		 * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
   3082 		 * We avoid that case here.
   3083 		 */
   3084 		if ((mp = *lockptr) != NULL &&
   3085 		    mutex_held(mp) &&
   3086 		    (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
   3087 		    LOCK_ROBUST) {
   3088 			mp->mutex_rcount = 0;
   3089 			if (!(mp->mutex_flag & LOCK_UNMAPPED))
   3090 				mp->mutex_flag |= LOCK_OWNERDEAD;
   3091 			(void) mutex_unlock_internal(mp, 1);
   3092 		}
   3093 	}
   3094 
   3095 	heldlock_free(self);
   3096 }
   3097 
   3098 #pragma weak _cond_init = cond_init
   3099 /* ARGSUSED2 */
   3100 int
   3101 cond_init(cond_t *cvp, int type, void *arg)
   3102 {
   3103 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
   3104 		return (EINVAL);
   3105 	(void) memset(cvp, 0, sizeof (*cvp));
   3106 	cvp->cond_type = (uint16_t)type;
   3107 	cvp->cond_magic = COND_MAGIC;
   3108 
   3109 	/*
   3110 	 * This should be at the beginning of the function,
   3111 	 * but for the sake of old broken applications that
   3112 	 * do not have proper alignment for their condvars
   3113 	 * (and don't check the return code from cond_init),
   3114 	 * we put it here, after initializing the condvar regardless.
   3115 	 */
   3116 	if (((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1)) &&
   3117 	    curthread->ul_misaligned == 0)
   3118 		return (EINVAL);
   3119 
   3120 	return (0);
   3121 }
   3122 
/*
 * cond_sleep_queue(): utility function for cond_wait_queue().
 *
 * Go to sleep on a condvar sleep queue, expect to be waked up
 * by someone calling cond_signal() or cond_broadcast() or due
 * to receiving a UNIX signal or being cancelled, or just simply
 * due to a spurious wakeup (like someone calling forkall()).
 *
 * The associated mutex is *not* reacquired before returning.
 * That must be done by the caller of cond_sleep_queue().
 *
 * 'tsp', when non-NULL, is the timeout passed to __lwp_park().
 * A non-NULL 'tsp' also sets ul_cv_wake, in which case the mutex
 * sleep queue is never locked or examined by this function.
 *
 * Returns the value of the last __lwp_park() call.
 */
static int
cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
{
	ulwp_t *self = curthread;
	queue_head_t *qp;
	queue_head_t *mqp;	/* assigned and used only when !cv_wake */
	lwpid_t lwpid;
	int signalled;
	int error;
	int cv_wake;
	int release_all;

	/*
	 * Put ourself on the CV sleep queue, unlock the mutex, then
	 * park ourself and unpark a candidate lwp to grab the mutex.
	 * We must go onto the CV sleep queue before dropping the
	 * mutex in order to guarantee atomicity of the operation.
	 */
	self->ul_sp = stkptr();
	qp = queue_lock(cvp, CV);
	enqueue(qp, self, 0);
	cvp->cond_waiters_user = 1;
	self->ul_cvmutex = mp;
	self->ul_cv_wake = cv_wake = (tsp != NULL);
	self->ul_signalled = 0;
	/*
	 * Waiting while the mutex is still flagged LOCK_OWNERDEAD
	 * converts that flag to LOCK_NOTRECOVERABLE.
	 */
	if (mp->mutex_flag & LOCK_OWNERDEAD) {
		mp->mutex_flag &= ~LOCK_OWNERDEAD;
		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
	}
	/*
	 * NOTE(review): release_all presumably asks mutex_unlock_queue()
	 * to wake every waiter of an unrecoverable mutex -- confirm
	 * against mutex_unlock_queue().
	 */
	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
	lwpid = mutex_unlock_queue(mp, release_all);
	for (;;) {
		/* Each pass starts with qp locked; it is dropped here. */
		set_parking_flag(self, 1);
		queue_unlock(qp);
		if (lwpid != 0) {
			lwpid = preempt_unpark(self, lwpid);
			preempt(self);
		}
		/*
		 * We may have a deferred signal present,
		 * in which case we should return EINTR.
		 * Also, we may have received a SIGCANCEL; if so
		 * and we are cancelable we should return EINTR.
		 * We force an immediate EINTR return from
		 * __lwp_park() by turning our parking flag off.
		 */
		if (self->ul_cursig != 0 ||
		    (self->ul_cancelable && self->ul_cancel_pending))
			set_parking_flag(self, 0);
		/*
		 * __lwp_park() will return the residual time in tsp
		 * if we are unparked before the timeout expires.
		 */
		error = __lwp_park(tsp, lwpid);
		set_parking_flag(self, 0);
		lwpid = 0;	/* unpark the other lwp only once */
		/*
		 * We were waked up by cond_signal(), cond_broadcast(),
		 * by an interrupt or timeout (EINTR or ETIME),
		 * or we may just have gotten a spurious wakeup.
		 */
		qp = queue_lock(cvp, CV);
		if (!cv_wake)
			mqp = queue_lock(mp, MX);
		/* No longer on any sleep queue: the wait is over. */
		if (self->ul_sleepq == NULL)
			break;
		/*
		 * We are on either the condvar sleep queue or the
		 * mutex sleep queue.  Break out of the sleep if we
		 * were interrupted or we timed out (EINTR or ETIME).
		 * Else this is a spurious wakeup; continue the loop.
		 */
		if (!cv_wake && self->ul_sleepq == mqp) { /* mutex queue */
			if (error) {
				mp->mutex_waiters = dequeue_self(mqp);
				break;
			}
			tsp = NULL;	/* no more timeout */
		} else if (self->ul_sleepq == qp) {	/* condvar queue */
			if (error) {
				cvp->cond_waiters_user = dequeue_self(qp);
				break;
			}
			/*
			 * Else a spurious wakeup on the condvar queue.
			 * __lwp_park() has already adjusted the timeout.
			 */
		} else {
			thr_panic("cond_sleep_queue(): thread not on queue");
		}
		/* Going around again: keep qp locked, drop only mqp. */
		if (!cv_wake)
			queue_unlock(mqp);
	}

	/*
	 * Every break above leaves the loop with qp locked,
	 * and with mqp locked as well when !cv_wake.
	 */
	self->ul_sp = 0;
	self->ul_cv_wake = 0;
	ASSERT(self->ul_cvmutex == NULL);
	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
	    self->ul_wchan == NULL);

	/* Capture and reset the cond_signal() indication under qp. */
	signalled = self->ul_signalled;
	self->ul_signalled = 0;
	queue_unlock(qp);
	if (!cv_wake)
		queue_unlock(mqp);

	/*
	 * If we were concurrently cond_signal()d and any of:
	 * received a UNIX signal, were cancelled, or got a timeout,
	 * then perform another cond_signal() to avoid consuming it.
	 */
	if (error && signalled)
		(void) cond_signal(cvp);

	return (error);
}
   3250 
   3251 static void
   3252 cond_wait_check_alignment(cond_t *cvp, mutex_t *mp)
   3253 {
   3254 	if ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1))
   3255 		lock_error(mp, "cond_wait", cvp, "mutex is misaligned");
   3256 	if ((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1))
   3257 		lock_error(mp, "cond_wait", cvp, "condvar is misaligned");
   3258 }
   3259 
   3260 int
   3261 cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
   3262 {
   3263 	ulwp_t *self = curthread;
   3264 	int error;
   3265 	int merror;
   3266 
   3267 	if (self->ul_error_detection && self->ul_misaligned == 0)
   3268 		cond_wait_check_alignment(cvp, mp);
   3269 
   3270 	/*
   3271 	 * The old thread library was programmed to defer signals
   3272 	 * while in cond_wait() so that the associated mutex would
   3273 	 * be guaranteed to be held when the application signal
   3274 	 * handler was invoked.
   3275 	 *
   3276 	 * We do not behave this way by default; the state of the
   3277 	 * associated mutex in the signal handler is undefined.
   3278 	 *
   3279 	 * To accommodate applications that depend on the old
   3280 	 * behavior, the _THREAD_COND_WAIT_DEFER environment
   3281 	 * variable can be set to 1 and we will behave in the
   3282 	 * old way with respect to cond_wait().
   3283 	 */
   3284 	if (self->ul_cond_wait_defer)
   3285 		sigoff(self);
   3286 
   3287 	error = cond_sleep_queue(cvp, mp, tsp);
   3288 
   3289 	/*
   3290 	 * Reacquire the mutex.
   3291 	 */
   3292 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
   3293 		error = merror;
   3294 
   3295 	/*
   3296 	 * Take any deferred signal now, after we have reacquired the mutex.
   3297 	 */
   3298 	if (self->ul_cond_wait_defer)
   3299 		sigon(self);
   3300 
   3301 	return (error);
   3302 }
   3303 
/*
 * cond_sleep_kernel(): utility function for cond_wait_kernel().
 * See the comment ahead of cond_sleep_queue(), above.
 *
 * Gives up user-level ownership of 'mp' and sleeps in
 * ___lwp_cond_wait().  As with cond_sleep_queue(), the mutex is
 * not reacquired here.  Returns the ___lwp_cond_wait() result.
 */
static int
cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
{
	int mtype = mp->mutex_type;
	ulwp_t *self = curthread;
	int error;

	/*
	 * NOTE(review): presumably drops the priority-ceiling boost
	 * this mutex gave us while we do not hold it -- confirm against
	 * _ceil_mylist_del() and _ceil_prio_waive().
	 */
	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
		_ceil_prio_waive();

	self->ul_sp = stkptr();
	self->ul_wchan = cvp;
	/* Signals stay deferred until the sigon() after the wait. */
	sigoff(self);
	mp->mutex_owner = 0;
	/* mp->mutex_ownerpid is cleared by ___lwp_cond_wait() */
	if (mtype & LOCK_PRIO_INHERIT) {
		/* Clear the lock byte here and drop our PI-lock count. */
		mp->mutex_lockw = LOCKCLEAR;
		self->ul_pilocks--;
	}
	/*
	 * ___lwp_cond_wait() returns immediately with EINTR if
	 * set_parking_flag(self,0) is called on this lwp before it
	 * goes to sleep in the kernel.  sigacthandler() calls this
	 * when a deferred signal is noted.  This assures that we don't
	 * get stuck in ___lwp_cond_wait() with all signals blocked
	 * due to taking a deferred signal before going to sleep.
	 */
	set_parking_flag(self, 1);
	/* A signal or cancellation is already pending: don't sleep. */
	if (self->ul_cursig != 0 ||
	    (self->ul_cancelable && self->ul_cancel_pending))
		set_parking_flag(self, 0);
	error = ___lwp_cond_wait(cvp, mp, tsp, 1);
	set_parking_flag(self, 0);
	sigon(self);
	self->ul_sp = 0;
	self->ul_wchan = NULL;
	return (error);
}
   3346 
   3347 int
   3348 cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
   3349 {
   3350 	ulwp_t *self = curthread;
   3351 	int error;
   3352 	int merror;
   3353 
   3354 	if (self->ul_error_detection && self->ul_misaligned == 0)
   3355 		cond_wait_check_alignment(cvp, mp);
   3356 
   3357 	/*
   3358 	 * See the large comment in cond_wait_queue(), above.
   3359 	 */
   3360 	if (self->ul_cond_wait_defer)
   3361 		sigoff(self);
   3362 
   3363 	error = cond_sleep_kernel(cvp, mp, tsp);
   3364 
   3365 	/*
   3366 	 * Override the return code from ___lwp_cond_wait()
   3367 	 * with any non-zero return code from mutex_lock().
   3368 	 * This addresses robust lock failures in particular;
   3369 	 * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
   3370 	 * errors in order to take corrective action.
   3371 	 */
   3372 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
   3373 		error = merror;
   3374 
   3375 	/*
   3376 	 * Take any deferred signal now, after we have reacquired the mutex.
   3377 	 */
   3378 	if (self->ul_cond_wait_defer)
   3379 		sigon(self);
   3380 
   3381 	return (error);
   3382 }
   3383 
/*
 * Common code for cond_wait() and cond_timedwait()
 *
 * 'tsp' is NULL for an untimed wait; otherwise it is validated here
 * and passed on to the wait routine.  Handles event reporting,
 * statistics and error detection around the wait, and chooses between
 * cond_wait_kernel() and cond_wait_queue().
 *
 * Returns EINVAL for an invalid timeout, otherwise the result of
 * the wait routine that was called.
 */
int
cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
{
	int mtype = mp->mutex_type;
	hrtime_t begin_sleep = 0;
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
	uint8_t rcount;
	int error = 0;

	/*
	 * The SUSV3 Posix spec for pthread_cond_timedwait() states:
	 *	Except in the case of [ETIMEDOUT], all these error checks
	 *	shall act as if they were performed immediately at the
	 *	beginning of processing for the function and shall cause
	 *	an error return, in effect, prior to modifying the state
	 *	of the mutex specified by mutex or the condition variable
	 *	specified by cond.
	 * Therefore, we must return EINVAL now if the timeout is invalid.
	 */
	if (tsp != NULL &&
	    (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
		return (EINVAL);

	/* Report a TD_SLEEP event if one has been requested. */
	if (__td_event_report(self, TD_SLEEP, udp)) {
		self->ul_sp = stkptr();
		self->ul_wchan = cvp;
		self->ul_td_evbuf.eventnum = TD_SLEEP;
		self->ul_td_evbuf.eventdata = cvp;
		tdb_event(TD_SLEEP, udp);
		self->ul_sp = 0;
	}
	/* Count the wait, timed or untimed, when condvar stats exist. */
	if (csp) {
		if (tsp)
			tdb_incr(csp->cond_timedwait);
		else
			tdb_incr(csp->cond_wait);
	}
	/*
	 * Take the start-of-sleep timestamp from record_hold_time() when
	 * mutex stats exist, else from gethrtime() when only condvar
	 * stats exist.  It is consumed after the wait, below.
	 */
	if (msp)
		begin_sleep = record_hold_time(msp);
	else if (csp)
		begin_sleep = gethrtime();

	/*
	 * Optional sanity checks: the mutex must be held, a recursive
	 * mutex must have a zero recursion count, and the condvar and
	 * mutex must agree on process-shared versus process-private.
	 */
	if (self->ul_error_detection) {
		if (!mutex_held(mp))
			lock_error(mp, "cond_wait", cvp, NULL);
		if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
			lock_error(mp, "recursive mutex in cond_wait",
			    cvp, NULL);
		if (cvp->cond_type & USYNC_PROCESS) {
			if (!(mtype & USYNC_PROCESS))
				lock_error(mp, "cond_wait", cvp,
				    "condvar process-shared, "
				    "mutex process-private");
		} else {
			if (mtype & USYNC_PROCESS)
				lock_error(mp, "cond_wait", cvp,
				    "condvar process-private, "
				    "mutex process-shared");
		}
	}

	/*
	 * We deal with recursive mutexes by completely
	 * dropping the lock and restoring the recursion
	 * count after waking up.  This is arguably wrong,
	 * but it obeys the principle of least astonishment.
	 */
	rcount = mp->mutex_rcount;
	mp->mutex_rcount = 0;
	/*
	 * Sleep in the kernel if the mutex is process-shared,
	 * priority-inheriting or priority-protected, or if the condvar
	 * is process-shared; otherwise use the user-level sleep queues.
	 */
	if ((mtype &
	    (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
	    (cvp->cond_type & USYNC_PROCESS))
		error = cond_wait_kernel(cvp, mp, tsp);
	else
		error = cond_wait_queue(cvp, mp, tsp);
	mp->mutex_rcount = rcount;

	/* Accumulate the time spent asleep, and count any timeout. */
	if (csp) {
		hrtime_t lapse = gethrtime() - begin_sleep;
		if (tsp == NULL)
			csp->cond_wait_sleep_time += lapse;
		else {
			csp->cond_timedwait_sleep_time += lapse;
			if (error == ETIME)
				tdb_incr(csp->cond_timedwait_timeout);
		}
	}
	return (error);
}
   3479 
   3480 /*
   3481  * cond_wait() is a cancellation point but __cond_wait() is not.
   3482  * Internally, libc calls the non-cancellation version.
   3483  * Other libraries need to use pthread_setcancelstate(), as appropriate,
   3484  * since __cond_wait() is not exported from libc.
   3485  */
   3486 int
   3487 __cond_wait(cond_t *cvp, mutex_t *mp)
   3488 {
   3489 	ulwp_t *self = curthread;
   3490 	uberdata_t *udp = self->ul_uberdata;
   3491 	uberflags_t *gflags;
   3492 
   3493 	if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
   3494 	    !mutex_held(mp))
   3495 		return (EPERM);
   3496 
   3497 	/*
   3498 	 * Optimize the common case of USYNC_THREAD plus
   3499 	 * no error detection, no lock statistics, and no event tracing.
   3500 	 */
   3501 	if ((gflags = self->ul_schedctl_called) != NULL &&
   3502 	    (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
   3503 	    self->ul_td_events_enable |
   3504 	    udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
   3505 		return (cond_wait_queue(cvp, mp, NULL));
   3506 
   3507 	/*
   3508 	 * Else do it the long way.
   3509 	 */
   3510 	return (cond_wait_common(cvp, mp, NULL));
   3511 }
   3512 
   3513 #pragma weak _cond_wait = cond_wait
   3514 int
   3515 cond_wait(cond_t *cvp, mutex_t *mp)
   3516 {
   3517 	int error;
   3518 
   3519 	_cancelon();
   3520 	error = __cond_wait(cvp, mp);
   3521 	if (error == EINTR)
   3522 		_canceloff();
   3523 	else
   3524 		_canceloff_nocancel();
   3525 	return (error);
   3526 }
   3527 
   3528 /*
   3529  * pthread_cond_wait() is a cancellation point.
   3530  */
   3531 int
   3532 pthread_cond_wait(pthread_cond_t *_RESTRICT_KYWD cvp,
   3533 	pthread_mutex_t *_RESTRICT_KYWD mp)
   3534 {
   3535 	int error;
   3536 
   3537 	error = cond_wait((cond_t *)cvp, (mutex_t *)mp);
   3538 	return ((error == EINTR)? 0 : error);
   3539 }
   3540 
   3541 /*
   3542  * cond_timedwait() is a cancellation point but __cond_timedwait() is not.
   3543  */
   3544 int
   3545 __cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
   3546 {
   3547 	clockid_t clock_id = cvp->cond_clockid;
   3548 	timespec_t reltime;
   3549 	int error;
   3550 
   3551 	if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
   3552 	    !mutex_held(mp))
   3553 		return (EPERM);
   3554 
   3555 	if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
   3556 		clock_id = CLOCK_REALTIME;
   3557 	abstime_to_reltime(clock_id, abstime, &reltime);
   3558 	error = cond_wait_common(cvp, mp, &reltime);
   3559 	if (error == ETIME && clock_id == CLOCK_HIGHRES) {
   3560 		/*
   3561 		 * Don't return ETIME if we didn't really get a timeout.
   3562 		 * This can happen if we return because someone resets
   3563 		 * the system clock.  Just return zero in this case,
   3564 		 * giving a spurious wakeup but not a timeout.
   3565 		 */
   3566 		if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
   3567 		    abstime->tv_nsec > gethrtime())
   3568 			error = 0;
   3569 	}
   3570 	return (error);
   3571 }
   3572 
   3573 int
   3574 cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
   3575 {
   3576 	int error;
   3577 
   3578 	_cancelon();
   3579 	error = __cond_timedwait(cvp, mp, abstime);
   3580 	if (error == EINTR)
   3581 		_canceloff();
   3582 	else
   3583 		_canceloff_nocancel();
   3584 	return (error);
   3585 }
   3586 
   3587 /*
   3588  * pthread_cond_timedwait() is a cancellation point.
   3589  */
   3590 int
   3591 pthread_cond_timedwait(pthread_cond_t *_RESTRICT_KYWD cvp,
   3592 	pthread_mutex_t *_RESTRICT_KYWD mp,
   3593 	const struct timespec *_RESTRICT_KYWD abstime)
   3594 {
   3595 	int error;
   3596 
   3597 	error = cond_timedwait((cond_t *)cvp, (mutex_t *)mp, abstime);
   3598 	if (error == ETIME)
   3599 		error = ETIMEDOUT;
   3600 	else if (error == EINTR)
   3601 		error = 0;
   3602 	return (error);
   3603 }
   3604 
   3605 /*
   3606  * cond_reltimedwait() is a cancellation point but __cond_reltimedwait() is not.
   3607  */
   3608 int
   3609 __cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
   3610 {
   3611 	timespec_t tslocal = *reltime;
   3612 
   3613 	if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
   3614 	    !mutex_held(mp))
   3615 		return (EPERM);
   3616 
   3617 	return (cond_wait_common(cvp, mp, &tslocal));
   3618 }
   3619 
   3620 int
   3621 cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
   3622 {
   3623 	int error;
   3624 
   3625 	_cancelon();
   3626 	error = __cond_reltimedwait(cvp, mp, reltime);
   3627 	if (error == EINTR)
   3628 		_canceloff();
   3629 	else
   3630 		_canceloff_nocancel();
   3631 	return (error);
   3632 }
   3633 
   3634 int
   3635 pthread_cond_reltimedwait_np(pthread_cond_t *_RESTRICT_KYWD cvp,
   3636 	pthread_mutex_t *_RESTRICT_KYWD mp,
   3637 	const struct timespec *_RESTRICT_KYWD reltime)
   3638 {
   3639 	int error;
   3640 
   3641 	error = cond_reltimedwait((cond_t *)cvp, (mutex_t *)mp, reltime);
   3642 	if (error == ETIME)
   3643 		error = ETIMEDOUT;
   3644 	else if (error == EINTR)
   3645 		error = 0;
   3646 	return (error);
   3647 }
   3648 
#pragma weak pthread_cond_signal = cond_signal
#pragma weak _cond_signal = cond_signal
/*
 * Wake one waiter on the condition variable.
 *
 * Kernel-level waiters are signalled through _lwp_cond_signal().
 * One user-level waiter, if any, is taken off the condvar sleep
 * queue and either unparked or moved to the sleep queue of the
 * mutex it will reacquire.
 *
 * Returns the result of _lwp_cond_signal() if it was called, else 0.
 */
int
cond_signal(cond_t *cvp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
	int error = 0;
	int more;
	lwpid_t lwpid;
	queue_head_t *qp;
	mutex_t *mp;
	queue_head_t *mqp;
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev;

	if (csp)
		tdb_incr(csp->cond_signal);

	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
		error = _lwp_cond_signal(cvp);

	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
		return (error);

	/*
	 * Move someone from the condvar sleep queue to the mutex sleep
	 * queue for the mutex that he will acquire on being waked up.
	 * We can do this only if we own the mutex he will acquire.
	 * If we do not own the mutex, or if his ul_cv_wake flag
	 * is set, just dequeue and unpark him.
	 */
	qp = queue_lock(cvp, CV);
	ulwpp = queue_slot(qp, &prev, &more);
	/* 'more' reports whether waiters remain besides the one chosen. */
	cvp->cond_waiters_user = more;
	if (ulwpp == NULL) {	/* no one on the sleep queue */
		queue_unlock(qp);
		return (error);
	}
	ulwp = *ulwpp;

	/*
	 * Inform the thread that he was the recipient of a cond_signal().
	 * This lets him deal with cond_signal() and, concurrently,
	 * one or more of a cancellation, a UNIX signal, or a timeout.
	 * These latter conditions must not consume a cond_signal().
	 */
	ulwp->ul_signalled = 1;

	/*
	 * Dequeue the waiter but leave his ul_sleepq non-NULL
	 * while we move him to the mutex queue so that he can
	 * deal properly with spurious wakeups.
	 */
	queue_unlink(qp, ulwpp, prev);

	mp = ulwp->ul_cvmutex;		/* the mutex he will acquire */
	ulwp->ul_cvmutex = NULL;
	ASSERT(mp != NULL);

	if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
		/* just wake him up */
		lwpid = ulwp->ul_lwpid;
		no_preempt(self);
		/* Mark him as off every queue before dropping qp. */
		ulwp->ul_sleepq = NULL;
		ulwp->ul_wchan = NULL;
		queue_unlock(qp);
		(void) __lwp_unpark(lwpid);
		preempt(self);
	} else {
		/* move him to the mutex queue */
		mqp = queue_lock(mp, MX);
		enqueue(mqp, ulwp, 0);
		mp->mutex_waiters = 1;
		queue_unlock(mqp);
		queue_unlock(qp);
	}

	return (error);
}
   3731 
   3732 /*
   3733  * Utility function called by mutex_wakeup_all(), cond_broadcast(),
   3734  * and rw_queue_release() to (re)allocate a big buffer to hold the
   3735  * lwpids of all the threads to be set running after they are removed
   3736  * from their sleep queues.  Since we are holding a queue lock, we
   3737  * cannot call any function that might acquire a lock.  mmap(), munmap(),
   3738  * lwp_unpark_all() are simple system calls and are safe in this regard.
   3739  */
   3740 lwpid_t *
   3741 alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
   3742 {
   3743 	/*
   3744 	 * Allocate NEWLWPS ids on the first overflow.
   3745 	 * Double the allocation each time after that.
   3746 	 */
   3747 	int nlwpid = *nlwpid_ptr;
   3748 	int maxlwps = *maxlwps_ptr;
   3749 	int first_allocation;
   3750 	int newlwps;
   3751 	void *vaddr;
   3752 
   3753 	ASSERT(nlwpid == maxlwps);
   3754 
   3755 	first_allocation = (maxlwps == MAXLWPS);
   3756 	newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
   3757 	vaddr = mmap(NULL, newlwps * sizeof (lwpid_t),
   3758 	    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
   3759 
   3760 	if (vaddr == MAP_FAILED) {
   3761 		/*
   3762 		 * Let's hope this never happens.
   3763 		 * If it does, then we have a terrible
   3764 		 * thundering herd on our hands.
   3765 		 */
   3766 		(void) __lwp_unpark_all(lwpid, nlwpid);
   3767 		*nlwpid_ptr = 0;
   3768 	} else {
   3769 		(void) memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
   3770 		if (!first_allocation)
   3771 			(void) munmap((caddr_t)lwpid,
   3772 			    maxlwps * sizeof (lwpid_t));
   3773 		lwpid = vaddr;
   3774 		*maxlwps_ptr = newlwps;
   3775 	}
   3776 
   3777 	return (lwpid);
   3778 }
   3779 
   3780 #pragma weak pthread_cond_broadcast = cond_broadcast
   3781 #pragma weak _cond_broadcast = cond_broadcast
   3782 int
   3783 cond_broadcast(cond_t *cvp)
   3784 {
   3785 	ulwp_t *self = curthread;
   3786 	uberdata_t *udp = self->ul_uberdata;
   3787 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
   3788 	int error = 0;
   3789 	queue_head_t *qp;
   3790 	queue_root_t *qrp;
   3791 	mutex_t *mp;
   3792 	mutex_t *mp_cache = NULL;
   3793 	queue_head_t *mqp = NULL;
   3794 	ulwp_t *ulwp;
   3795 	int nlwpid = 0;
   3796 	int maxlwps = MAXLWPS;
   3797 	lwpid_t buffer[MAXLWPS];
   3798 	lwpid_t *lwpid = buffer;
   3799 
   3800 	if (csp)
   3801 		tdb_incr(csp->cond_broadcast);
   3802 
   3803 	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
   3804 		error = _lwp_cond_broadcast(cvp);
   3805 
   3806 	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
   3807 		return (error);
   3808 
   3809 	/*
   3810 	 * Move everyone from the condvar sleep queue to the mutex sleep
   3811 	 * queue for the mutex that they will acquire on being waked up.
   3812 	 * We can do this only if we own the mutex they will acquire.
   3813 	 * If we do not own the mutex, or if their ul_cv_wake flag
   3814 	 * is set, just dequeue and unpark them.
   3815 	 *
   3816 	 * We keep track of lwpids that are to be unparked in lwpid[].
   3817 	 * __lwp_unpark_all() is called to unpark all of them after
   3818 	 * they have been removed from the sleep queue and the sleep
   3819 	 * queue lock has been dropped.  If we run out of space in our
   3820 	 * on-stack buffer, we need to allocate more but we can't call
   3821 	 * lmalloc() because we are holding a queue lock when the overflow
   3822 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
   3823 	 * either because the application may have allocated a small
   3824 	 * stack and we don't want to overrun the stack.  So we call
   3825 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
   3826 	 * system call directly since that path acquires no locks.
   3827 	 */
   3828 	qp = queue_lock(cvp, CV);
   3829 	cvp->cond_waiters_user = 0;
   3830 	for (;;) {
   3831 		if ((qrp = qp->qh_root) == NULL ||
   3832 		    (ulwp = qrp->qr_head) == NULL)
   3833 			break;
   3834 		ASSERT(ulwp->ul_wchan == cvp);
   3835 		queue_unlink(qp, &qrp->qr_head, NULL);
   3836 		mp = ulwp->ul_cvmutex;		/* his mutex */
   3837 		ulwp->ul_cvmutex = NULL;
   3838 		ASSERT(mp != NULL);
   3839 		if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
   3840 			/* just wake him up */
   3841 			ulwp->ul_sleepq = NULL;
   3842 			ulwp->ul_wchan = NULL;
   3843 			if (nlwpid == maxlwps)
   3844 				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
   3845 			lwpid[nlwpid++] = ulwp->ul_lwpid;
   3846 		} else {
   3847 			/* move him to the mutex queue */
   3848 			if (mp != mp_cache) {
   3849 				mp_cache = mp;
   3850 				if (mqp != NULL)
   3851 					queue_unlock(mqp);
   3852 				mqp = queue_lock(mp, MX);
   3853 			}
   3854 			enqueue(mqp, ulwp, 0);
   3855 			mp->mutex_waiters = 1;
   3856 		}
   3857 	}
   3858 	if (mqp != NULL)
   3859 		queue_unlock(mqp);
   3860 	if (nlwpid == 0) {
   3861 		queue_unlock(qp);
   3862 	} else {
   3863 		no_preempt(self);
   3864 		queue_unlock(qp);
   3865 		if (nlwpid == 1)
   3866 			(void) __lwp_unpark(lwpid[0]);
   3867 		else
   3868 			(void) __lwp_unpark_all(lwpid, nlwpid);
   3869 		preempt(self);
   3870 	}
   3871 	if (lwpid != buffer)
   3872 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
   3873 	return (error);
   3874 }
   3875 
   3876 #pragma weak pthread_cond_destroy = cond_destroy
   3877 int
   3878 cond_destroy(cond_t *cvp)
   3879 {
   3880 	cvp->cond_magic = 0;
   3881 	tdb_sync_obj_deregister(cvp);
   3882 	return (0);
   3883 }
   3884 
   3885 #if defined(THREAD_DEBUG)
   3886 void
   3887 assert_no_libc_locks_held(void)
   3888 {
   3889 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
   3890 }
   3891 
   3892 /* protected by link_lock */
   3893 uint64_t spin_lock_spin;
   3894 uint64_t spin_lock_spin2;
   3895 uint64_t spin_lock_sleep;
   3896 uint64_t spin_lock_wakeup;
   3897 
   3898 /*
   3899  * Record spin lock statistics.
   3900  * Called by a thread exiting itself in thrp_exit().
   3901  * Also called via atexit() from the thread calling
   3902  * exit() to do all the other threads as well.
   3903  */
   3904 void
   3905 record_spin_locks(ulwp_t *ulwp)
   3906 {
   3907 	spin_lock_spin += ulwp->ul_spin_lock_spin;
   3908 	spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
   3909 	spin_lock_sleep += ulwp->ul_spin_lock_sleep;
   3910 	spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
   3911 	ulwp->ul_spin_lock_spin = 0;
   3912 	ulwp->ul_spin_lock_spin2 = 0;
   3913 	ulwp->ul_spin_lock_sleep = 0;
   3914 	ulwp->ul_spin_lock_wakeup = 0;
   3915 }
   3916 
   3917 /*
   3918  * atexit function:  dump the queue statistics to stderr.
   3919  */
   3920 #include <stdio.h>
   3921 void
   3922 dump_queue_statistics(void)
   3923 {
   3924 	uberdata_t *udp = curthread->ul_uberdata;
   3925 	queue_head_t *qp;
   3926 	int qn;
   3927 	uint64_t spin_lock_total = 0;
   3928 
   3929 	if (udp->queue_head == NULL || thread_queue_dump == 0)
   3930 		return;
   3931 
   3932 	if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
   3933 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
   3934 		return;
   3935 	for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
   3936 		if (qp->qh_lockcount == 0)
   3937 			continue;
   3938 		spin_lock_total += qp->qh_lockcount;
   3939 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
   3940 		    (u_longlong_t)qp->qh_lockcount,
   3941 		    qp->qh_qmax, qp->qh_hmax) < 0)
   3942 			return;
   3943 	}
   3944 
   3945 	if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
   3946 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
   3947 		return;
   3948 	for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
   3949 		if (qp->qh_lockcount == 0)
   3950 			continue;
   3951 		spin_lock_total += qp->qh_lockcount;
   3952 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
   3953 		    (u_longlong_t)qp->qh_lockcount,
   3954 		    qp->qh_qmax, qp->qh_hmax) < 0)
   3955 			return;
   3956 	}
   3957 
   3958 	(void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
   3959 	    (u_longlong_t)spin_lock_total);
   3960 	(void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
   3961 	    (u_longlong_t)spin_lock_spin);
   3962 	(void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
   3963 	    (u_longlong_t)spin_lock_spin2);
   3964 	(void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
   3965 	    (u_longlong_t)spin_lock_sleep);
   3966 	(void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
   3967 	    (u_longlong_t)spin_lock_wakeup);
   3968 }
   3969 #endif
   3970