Home | History | Annotate | Download | only in threads
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 
     22 /*
     23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     24  * Use is subject to license terms.
     25  */
     26 
     27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     28 
     29 #include "lint.h"
     30 #include "thr_uberdata.h"
     31 #include <sys/rtpriocntl.h>
     32 #include <sys/sdt.h>
     33 #include <atomic.h>
     34 
     35 #if defined(THREAD_DEBUG)
     36 #define	INCR32(x)	(((x) != UINT32_MAX)? (x)++ : 0)
     37 #define	INCR(x)		((x)++)
     38 #define	DECR(x)		((x)--)
     39 #define	MAXINCR(m, x)	((m < ++x)? (m = x) : 0)
     40 #else
     41 #define	INCR32(x)
     42 #define	INCR(x)
     43 #define	DECR(x)
     44 #define	MAXINCR(m, x)
     45 #endif
     46 
     47 /*
     48  * This mutex is initialized to be held by lwp#1.
     49  * It is used to block a thread that has returned from a mutex_lock()
     50  * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
     51  */
     52 mutex_t	stall_mutex = DEFAULTMUTEX;
     53 
     54 static int shared_mutex_held(mutex_t *);
     55 static int mutex_queuelock_adaptive(mutex_t *);
     56 static void mutex_wakeup_all(mutex_t *);
     57 
     58 /*
     59  * Lock statistics support functions.
     60  */
     61 void
     62 record_begin_hold(tdb_mutex_stats_t *msp)
     63 {
     64 	tdb_incr(msp->mutex_lock);
     65 	msp->mutex_begin_hold = gethrtime();
     66 }
     67 
     68 hrtime_t
     69 record_hold_time(tdb_mutex_stats_t *msp)
     70 {
     71 	hrtime_t now = gethrtime();
     72 
     73 	if (msp->mutex_begin_hold)
     74 		msp->mutex_hold_time += now - msp->mutex_begin_hold;
     75 	msp->mutex_begin_hold = 0;
     76 	return (now);
     77 }
     78 
     79 /*
     80  * Called once at library initialization.
     81  */
     82 void
     83 mutex_setup(void)
     84 {
     85 	if (set_lock_byte(&stall_mutex.mutex_lockw))
     86 		thr_panic("mutex_setup() cannot acquire stall_mutex");
     87 	stall_mutex.mutex_owner = (uintptr_t)curthread;
     88 }
     89 
     90 /*
     91  * The default spin count of 1000 is experimentally determined.
     92  * On sun4u machines with any number of processors it could be raised
     93  * to 10,000 but that (experimentally) makes almost no difference.
     94  * The environment variable:
     95  *	_THREAD_ADAPTIVE_SPIN=count
     96  * can be used to override and set the count in the range [0 .. 1,000,000].
     97  */
     98 int	thread_adaptive_spin = 1000;
     99 uint_t	thread_max_spinners = 100;
    100 int	thread_queue_verify = 0;
    101 static	int	ncpus;
    102 
    103 /*
    104  * Distinguish spinning for queue locks from spinning for regular locks.
    105  * We try harder to acquire queue locks by spinning.
    106  * The environment variable:
    107  *	_THREAD_QUEUE_SPIN=count
    108  * can be used to override and set the count in the range [0 .. 1,000,000].
    109  */
    110 int	thread_queue_spin = 10000;
    111 
    112 #define	ALL_ATTRIBUTES				\
    113 	(LOCK_RECURSIVE | LOCK_ERRORCHECK |	\
    114 	LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT |	\
    115 	LOCK_ROBUST)
    116 
    117 /*
    118  * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
    119  * augmented by zero or more the flags:
    120  *	LOCK_RECURSIVE
    121  *	LOCK_ERRORCHECK
    122  *	LOCK_PRIO_INHERIT
    123  *	LOCK_PRIO_PROTECT
    124  *	LOCK_ROBUST
    125  */
    126 #pragma weak _mutex_init = mutex_init
    127 /* ARGSUSED2 */
    128 int
    129 mutex_init(mutex_t *mp, int type, void *arg)
    130 {
    131 	int basetype = (type & ~ALL_ATTRIBUTES);
    132 	const pcclass_t *pccp;
    133 	int error = 0;
    134 	int ceil;
    135 
    136 	if (basetype == USYNC_PROCESS_ROBUST) {
    137 		/*
    138 		 * USYNC_PROCESS_ROBUST is a deprecated historical type.
    139 		 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
    140 		 * retain the USYNC_PROCESS_ROBUST flag so we can return
    141 		 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
    142 		 * mutexes will ever draw ELOCKUNMAPPED).
    143 		 */
    144 		type |= (USYNC_PROCESS | LOCK_ROBUST);
    145 		basetype = USYNC_PROCESS;
    146 	}
    147 
    148 	if (type & LOCK_PRIO_PROTECT)
    149 		pccp = get_info_by_policy(SCHED_FIFO);
    150 	if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
    151 	    (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
    152 	    == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
    153 	    ((type & LOCK_PRIO_PROTECT) &&
    154 	    ((ceil = *(int *)arg) < pccp->pcc_primin ||
    155 	    ceil > pccp->pcc_primax))) {
    156 		error = EINVAL;
    157 	} else if (type & LOCK_ROBUST) {
    158 		/*
    159 		 * Callers of mutex_init() with the LOCK_ROBUST attribute
    160 		 * are required to pass an initially all-zero mutex.
    161 		 * Multiple calls to mutex_init() are allowed; all but
    162 		 * the first return EBUSY.  A call to mutex_init() is
    163 		 * allowed to make an inconsistent robust lock consistent
    164 		 * (for historical usage, even though the proper interface
    165 		 * for this is mutex_consistent()).  Note that we use
    166 		 * atomic_or_16() to set the LOCK_INITED flag so as
    167 		 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
    168 		 */
    169 		if (!(mp->mutex_flag & LOCK_INITED)) {
    170 			mp->mutex_type = (uint8_t)type;
    171 			atomic_or_16(&mp->mutex_flag, LOCK_INITED);
    172 			mp->mutex_magic = MUTEX_MAGIC;
    173 		} else if (type != mp->mutex_type ||
    174 		    ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
    175 			error = EINVAL;
    176 		} else if (mutex_consistent(mp) != 0) {
    177 			error = EBUSY;
    178 		}
    179 		/* register a process robust mutex with the kernel */
    180 		if (basetype == USYNC_PROCESS)
    181 			register_lock(mp);
    182 	} else {
    183 		(void) memset(mp, 0, sizeof (*mp));
    184 		mp->mutex_type = (uint8_t)type;
    185 		mp->mutex_flag = LOCK_INITED;
    186 		mp->mutex_magic = MUTEX_MAGIC;
    187 	}
    188 
    189 	if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
    190 		mp->mutex_ceiling = ceil;
    191 	}
    192 
    193 	return (error);
    194 }
    195 
    196 /*
    197  * Delete mp from list of ceiling mutexes owned by curthread.
    198  * Return 1 if the head of the chain was updated.
    199  */
    200 int
    201 _ceil_mylist_del(mutex_t *mp)
    202 {
    203 	ulwp_t *self = curthread;
    204 	mxchain_t **mcpp;
    205 	mxchain_t *mcp;
    206 
    207 	for (mcpp = &self->ul_mxchain;
    208 	    (mcp = *mcpp) != NULL;
    209 	    mcpp = &mcp->mxchain_next) {
    210 		if (mcp->mxchain_mx == mp) {
    211 			*mcpp = mcp->mxchain_next;
    212 			lfree(mcp, sizeof (*mcp));
    213 			return (mcpp == &self->ul_mxchain);
    214 		}
    215 	}
    216 	return (0);
    217 }
    218 
    219 /*
    220  * Add mp to the list of ceiling mutexes owned by curthread.
    221  * Return ENOMEM if no memory could be allocated.
    222  */
    223 int
    224 _ceil_mylist_add(mutex_t *mp)
    225 {
    226 	ulwp_t *self = curthread;
    227 	mxchain_t *mcp;
    228 
    229 	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
    230 		return (ENOMEM);
    231 	mcp->mxchain_mx = mp;
    232 	mcp->mxchain_next = self->ul_mxchain;
    233 	self->ul_mxchain = mcp;
    234 	return (0);
    235 }
    236 
    237 /*
    238  * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
    239  */
    240 static void
    241 set_rt_priority(ulwp_t *self, int prio)
    242 {
    243 	pcparms_t pcparm;
    244 
    245 	pcparm.pc_cid = self->ul_rtclassid;
    246 	((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
    247 	((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
    248 	(void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
    249 }
    250 
    251 /*
    252  * Inherit priority from ceiling.
    253  * This changes the effective priority, not the assigned priority.
    254  */
    255 void
    256 _ceil_prio_inherit(int prio)
    257 {
    258 	ulwp_t *self = curthread;
    259 
    260 	self->ul_epri = prio;
    261 	set_rt_priority(self, prio);
    262 }
    263 
    264 /*
    265  * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
    266  * if holding at least one ceiling lock.  If no ceiling locks are held at this
    267  * point, disinherit completely, reverting back to assigned priority.
    268  */
    269 void
    270 _ceil_prio_waive(void)
    271 {
    272 	ulwp_t *self = curthread;
    273 	mxchain_t *mcp = self->ul_mxchain;
    274 	int prio;
    275 
    276 	if (mcp == NULL) {
    277 		prio = self->ul_pri;
    278 		self->ul_epri = 0;
    279 	} else {
    280 		prio = mcp->mxchain_mx->mutex_ceiling;
    281 		self->ul_epri = prio;
    282 	}
    283 	set_rt_priority(self, prio);
    284 }
    285 
    286 /*
    287  * Clear the lock byte.  Retain the waiters byte and the spinners byte.
    288  * Return the old value of the lock word.
    289  */
    290 static uint32_t
    291 clear_lockbyte(volatile uint32_t *lockword)
    292 {
    293 	uint32_t old;
    294 	uint32_t new;
    295 
    296 	do {
    297 		old = *lockword;
    298 		new = old & ~LOCKMASK;
    299 	} while (atomic_cas_32(lockword, old, new) != old);
    300 
    301 	return (old);
    302 }
    303 
    304 /*
    305  * Same as clear_lockbyte(), but operates on mutex_lockword64.
    306  * The mutex_ownerpid field is cleared along with the lock byte.
    307  */
    308 static uint64_t
    309 clear_lockbyte64(volatile uint64_t *lockword64)
    310 {
    311 	uint64_t old;
    312 	uint64_t new;
    313 
    314 	do {
    315 		old = *lockword64;
    316 		new = old & ~LOCKMASK64;
    317 	} while (atomic_cas_64(lockword64, old, new) != old);
    318 
    319 	return (old);
    320 }
    321 
    322 /*
    323  * Similar to set_lock_byte(), which only tries to set the lock byte.
    324  * Here, we attempt to set the lock byte AND the mutex_ownerpid,
    325  * keeping the remaining bytes constant.
    326  */
    327 static int
    328 set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
    329 {
    330 	uint64_t old;
    331 	uint64_t new;
    332 
    333 	old = *lockword64 & ~LOCKMASK64;
    334 	new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
    335 	if (atomic_cas_64(lockword64, old, new) == old)
    336 		return (LOCKCLEAR);
    337 
    338 	return (LOCKSET);
    339 }
    340 
    341 /*
    342  * Increment the spinners count in the mutex lock word.
    343  * Return 0 on success.  Return -1 if the count would overflow.
    344  */
    345 static int
    346 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
    347 {
    348 	uint32_t old;
    349 	uint32_t new;
    350 
    351 	do {
    352 		old = *lockword;
    353 		if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
    354 			return (-1);
    355 		new = old + (1 << SPINNERSHIFT);
    356 	} while (atomic_cas_32(lockword, old, new) != old);
    357 
    358 	return (0);
    359 }
    360 
    361 /*
    362  * Decrement the spinners count in the mutex lock word.
    363  * Return the new value of the lock word.
    364  */
    365 static uint32_t
    366 spinners_decr(volatile uint32_t *lockword)
    367 {
    368 	uint32_t old;
    369 	uint32_t new;
    370 
    371 	do {
    372 		new = old = *lockword;
    373 		if (new & SPINNERMASK)
    374 			new -= (1 << SPINNERSHIFT);
    375 	} while (atomic_cas_32(lockword, old, new) != old);
    376 
    377 	return (new);
    378 }
    379 
    380 /*
    381  * Non-preemptive spin locks.  Used by queue_lock().
    382  * No lock statistics are gathered for these locks.
    383  * No DTrace probes are provided for these locks.
    384  */
    385 void
    386 spin_lock_set(mutex_t *mp)
    387 {
    388 	ulwp_t *self = curthread;
    389 
    390 	no_preempt(self);
    391 	if (set_lock_byte(&mp->mutex_lockw) == 0) {
    392 		mp->mutex_owner = (uintptr_t)self;
    393 		return;
    394 	}
    395 	/*
    396 	 * Spin for a while, attempting to acquire the lock.
    397 	 */
    398 	INCR32(self->ul_spin_lock_spin);
    399 	if (mutex_queuelock_adaptive(mp) == 0 ||
    400 	    set_lock_byte(&mp->mutex_lockw) == 0) {
    401 		mp->mutex_owner = (uintptr_t)self;
    402 		return;
    403 	}
    404 	/*
    405 	 * Try harder if we were previously at a no premption level.
    406 	 */
    407 	if (self->ul_preempt > 1) {
    408 		INCR32(self->ul_spin_lock_spin2);
    409 		if (mutex_queuelock_adaptive(mp) == 0 ||
    410 		    set_lock_byte(&mp->mutex_lockw) == 0) {
    411 			mp->mutex_owner = (uintptr_t)self;
    412 			return;
    413 		}
    414 	}
    415 	/*
    416 	 * Give up and block in the kernel for the mutex.
    417 	 */
    418 	INCR32(self->ul_spin_lock_sleep);
    419 	(void) ___lwp_mutex_timedlock(mp, NULL);
    420 	mp->mutex_owner = (uintptr_t)self;
    421 }
    422 
    423 void
    424 spin_lock_clear(mutex_t *mp)
    425 {
    426 	ulwp_t *self = curthread;
    427 
    428 	mp->mutex_owner = 0;
    429 	if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
    430 		(void) ___lwp_mutex_wakeup(mp, 0);
    431 		INCR32(self->ul_spin_lock_wakeup);
    432 	}
    433 	preempt(self);
    434 }
    435 
    436 /*
    437  * Allocate the sleep queue hash table.
    438  */
    439 void
    440 queue_alloc(void)
    441 {
    442 	ulwp_t *self = curthread;
    443 	uberdata_t *udp = self->ul_uberdata;
    444 	queue_head_t *qp;
    445 	void *data;
    446 	int i;
    447 
    448 	/*
    449 	 * No locks are needed; we call here only when single-threaded.
    450 	 */
    451 	ASSERT(self == udp->ulwp_one);
    452 	ASSERT(!udp->uberflags.uf_mt);
    453 	if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
    454 	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
    455 	    == MAP_FAILED)
    456 		thr_panic("cannot allocate thread queue_head table");
    457 	udp->queue_head = qp = (queue_head_t *)data;
    458 	for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
    459 		qp->qh_type = (i < QHASHSIZE)? MX : CV;
    460 		qp->qh_lock.mutex_flag = LOCK_INITED;
    461 		qp->qh_lock.mutex_magic = MUTEX_MAGIC;
    462 		qp->qh_hlist = &qp->qh_def_root;
    463 #if defined(THREAD_DEBUG)
    464 		qp->qh_hlen = 1;
    465 		qp->qh_hmax = 1;
    466 #endif
    467 	}
    468 }
    469 
    470 #if defined(THREAD_DEBUG)
    471 
    472 /*
    473  * Debugging: verify correctness of a sleep queue.
    474  */
    475 void
    476 QVERIFY(queue_head_t *qp)
    477 {
    478 	ulwp_t *self = curthread;
    479 	uberdata_t *udp = self->ul_uberdata;
    480 	queue_root_t *qrp;
    481 	ulwp_t *ulwp;
    482 	ulwp_t *prev;
    483 	uint_t index;
    484 	uint32_t cnt;
    485 	char qtype;
    486 	void *wchan;
    487 
    488 	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
    489 	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
    490 	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
    491 		cnt++;
    492 		ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
    493 		    (qrp->qr_head == NULL && qrp->qr_tail == NULL));
    494 	}
    495 	ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);
    496 	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
    497 	ASSERT(qp->qh_type == qtype);
    498 	if (!thread_queue_verify)
    499 		return;
    500 	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
    501 	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
    502 		for (prev = NULL, ulwp = qrp->qr_head; ulwp != NULL;
    503 		    prev = ulwp, ulwp = ulwp->ul_link) {
    504 			cnt++;
    505 			if (ulwp->ul_writer)
    506 				ASSERT(prev == NULL || prev->ul_writer);
    507 			ASSERT(ulwp->ul_qtype == qtype);
    508 			ASSERT(ulwp->ul_wchan != NULL);
    509 			ASSERT(ulwp->ul_sleepq == qp);
    510 			wchan = ulwp->ul_wchan;
    511 			ASSERT(qrp->qr_wchan == wchan);
    512 			index = QUEUE_HASH(wchan, qtype);
    513 			ASSERT(&udp->queue_head[index] == qp);
    514 		}
    515 		ASSERT(qrp->qr_tail == prev);
    516 	}
    517 	ASSERT(qp->qh_qlen == cnt);
    518 }
    519 
    520 #else	/* THREAD_DEBUG */
    521 
    522 #define	QVERIFY(qp)
    523 
    524 #endif	/* THREAD_DEBUG */
    525 
    526 /*
    527  * Acquire a queue head.
    528  */
    529 queue_head_t *
    530 queue_lock(void *wchan, int qtype)
    531 {
    532 	uberdata_t *udp = curthread->ul_uberdata;
    533 	queue_head_t *qp;
    534 	queue_root_t *qrp;
    535 
    536 	ASSERT(qtype == MX || qtype == CV);
    537 
    538 	/*
    539 	 * It is possible that we could be called while still single-threaded.
    540 	 * If so, we call queue_alloc() to allocate the queue_head[] array.
    541 	 */
    542 	if ((qp = udp->queue_head) == NULL) {
    543 		queue_alloc();
    544 		qp = udp->queue_head;
    545 	}
    546 	qp += QUEUE_HASH(wchan, qtype);
    547 	spin_lock_set(&qp->qh_lock);
    548 	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
    549 		if (qrp->qr_wchan == wchan)
    550 			break;
    551 	if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
    552 		/* the default queue root is available; use it */
    553 		qrp = &qp->qh_def_root;
    554 		qrp->qr_wchan = wchan;
    555 		ASSERT(qrp->qr_next == NULL);
    556 		ASSERT(qrp->qr_tail == NULL &&
    557 		    qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
    558 	}
    559 	qp->qh_wchan = wchan;	/* valid until queue_unlock() is called */
    560 	qp->qh_root = qrp;	/* valid until queue_unlock() is called */
    561 	INCR32(qp->qh_lockcount);
    562 	QVERIFY(qp);
    563 	return (qp);
    564 }
    565 
    566 /*
    567  * Release a queue head.
    568  */
    569 void
    570 queue_unlock(queue_head_t *qp)
    571 {
    572 	QVERIFY(qp);
    573 	spin_lock_clear(&qp->qh_lock);
    574 }
    575 
    576 /*
    577  * For rwlock queueing, we must queue writers ahead of readers of the
    578  * same priority.  We do this by making writers appear to have a half
    579  * point higher priority for purposes of priority comparisons below.
    580  */
    581 #define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
    582 
    583 void
    584 enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
    585 {
    586 	queue_root_t *qrp;
    587 	ulwp_t **ulwpp;
    588 	ulwp_t *next;
    589 	int pri = CMP_PRIO(ulwp);
    590 
    591 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
    592 	ASSERT(ulwp->ul_sleepq != qp);
    593 
    594 	if ((qrp = qp->qh_root) == NULL) {
    595 		/* use the thread's queue root for the linkage */
    596 		qrp = &ulwp->ul_queue_root;
    597 		qrp->qr_next = qp->qh_hlist;
    598 		qrp->qr_prev = NULL;
    599 		qrp->qr_head = NULL;
    600 		qrp->qr_tail = NULL;
    601 		qrp->qr_wchan = qp->qh_wchan;
    602 		qrp->qr_rtcount = 0;
    603 		qrp->qr_qlen = 0;
    604 		qrp->qr_qmax = 0;
    605 		qp->qh_hlist->qr_prev = qrp;
    606 		qp->qh_hlist = qrp;
    607 		qp->qh_root = qrp;
    608 		MAXINCR(qp->qh_hmax, qp->qh_hlen);
    609 	}
    610 
    611 	/*
    612 	 * LIFO queue ordering is unfair and can lead to starvation,
    613 	 * but it gives better performance for heavily contended locks.
    614 	 * We use thread_queue_fifo (range is 0..8) to determine
    615 	 * the frequency of FIFO vs LIFO queuing:
    616 	 *	0 : every 256th time	(almost always LIFO)
    617 	 *	1 : every 128th time
    618 	 *	2 : every 64th  time
    619 	 *	3 : every 32nd  time
    620 	 *	4 : every 16th  time	(the default value, mostly LIFO)
    621 	 *	5 : every 8th   time
    622 	 *	6 : every 4th   time
    623 	 *	7 : every 2nd   time
    624 	 *	8 : every time		(never LIFO, always FIFO)
    625 	 * Note that there is always some degree of FIFO ordering.
    626 	 * This breaks live lock conditions that occur in applications
    627 	 * that are written assuming (incorrectly) that threads acquire
    628 	 * locks fairly, that is, in roughly round-robin order.
    629 	 * In any event, the queue is maintained in kernel priority order.
    630 	 *
    631 	 * If force_fifo is non-zero, fifo queueing is forced.
    632 	 * SUSV3 requires this for semaphores.
    633 	 */
    634 	if (qrp->qr_head == NULL) {
    635 		/*
    636 		 * The queue is empty.  LIFO/FIFO doesn't matter.
    637 		 */
    638 		ASSERT(qrp->qr_tail == NULL);
    639 		ulwpp = &qrp->qr_head;
    640 	} else if (force_fifo |
    641 	    (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
    642 		/*
    643 		 * Enqueue after the last thread whose priority is greater
    644 		 * than or equal to the priority of the thread being queued.
    645 		 * Attempt first to go directly onto the tail of the queue.
    646 		 */
    647 		if (pri <= CMP_PRIO(qrp->qr_tail))
    648 			ulwpp = &qrp->qr_tail->ul_link;
    649 		else {
    650 			for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
    651 			    ulwpp = &next->ul_link)
    652 				if (pri > CMP_PRIO(next))
    653 					break;
    654 		}
    655 	} else {
    656 		/*
    657 		 * Enqueue before the first thread whose priority is less
    658 		 * than or equal to the priority of the thread being queued.
    659 		 * Hopefully we can go directly onto the head of the queue.
    660 		 */
    661 		for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
    662 		    ulwpp = &next->ul_link)
    663 			if (pri >= CMP_PRIO(next))
    664 				break;
    665 	}
    666 	if ((ulwp->ul_link = *ulwpp) == NULL)
    667 		qrp->qr_tail = ulwp;
    668 	*ulwpp = ulwp;
    669 
    670 	ulwp->ul_sleepq = qp;
    671 	ulwp->ul_wchan = qp->qh_wchan;
    672 	ulwp->ul_qtype = qp->qh_type;
    673 	if ((ulwp->ul_schedctl != NULL &&
    674 	    ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
    675 	    ulwp->ul_pilocks) {
    676 		ulwp->ul_rtqueued = 1;
    677 		qrp->qr_rtcount++;
    678 	}
    679 	MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
    680 	MAXINCR(qp->qh_qmax, qp->qh_qlen);
    681 }
    682 
    683 /*
    684  * Helper function for queue_slot() and queue_slot_rt().
    685  * Try to find a non-suspended thread on the queue.
    686  */
    687 static ulwp_t **
    688 queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
    689 {
    690 	ulwp_t *ulwp;
    691 	ulwp_t **foundpp = NULL;
    692 	int priority = -1;
    693 	ulwp_t *prev;
    694 	int tpri;
    695 
    696 	for (prev = NULL;
    697 	    (ulwp = *ulwpp) != NULL;
    698 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
    699 		if (ulwp->ul_stop)	/* skip suspended threads */
    700 			continue;
    701 		tpri = rt? CMP_PRIO(ulwp) : 0;
    702 		if (tpri > priority) {
    703 			foundpp = ulwpp;
    704 			*prevp = prev;
    705 			priority = tpri;
    706 			if (!rt)
    707 				break;
    708 		}
    709 	}
    710 	return (foundpp);
    711 }
    712 
    713 /*
    714  * For real-time, we search the entire queue because the dispatch
    715  * (kernel) priorities may have changed since enqueueing.
    716  */
    717 static ulwp_t **
    718 queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
    719 {
    720 	ulwp_t **ulwpp = ulwpp_org;
    721 	ulwp_t *ulwp = *ulwpp;
    722 	ulwp_t **foundpp = ulwpp;
    723 	int priority = CMP_PRIO(ulwp);
    724 	ulwp_t *prev;
    725 	int tpri;
    726 
    727 	for (prev = ulwp, ulwpp = &ulwp->ul_link;
    728 	    (ulwp = *ulwpp) != NULL;
    729 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
    730 		tpri = CMP_PRIO(ulwp);
    731 		if (tpri > priority) {
    732 			foundpp = ulwpp;
    733 			*prevp = prev;
    734 			priority = tpri;
    735 		}
    736 	}
    737 	ulwp = *foundpp;
    738 
    739 	/*
    740 	 * Try not to return a suspended thread.
    741 	 * This mimics the old libthread's behavior.
    742 	 */
    743 	if (ulwp->ul_stop &&
    744 	    (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
    745 		foundpp = ulwpp;
    746 		ulwp = *foundpp;
    747 	}
    748 	ulwp->ul_rt = 1;
    749 	return (foundpp);
    750 }
    751 
    752 ulwp_t **
    753 queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
    754 {
    755 	queue_root_t *qrp;
    756 	ulwp_t **ulwpp;
    757 	ulwp_t *ulwp;
    758 	int rt;
    759 
    760 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
    761 
    762 	if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
    763 		*more = 0;
    764 		return (NULL);		/* no lwps on the queue */
    765 	}
    766 	rt = (qrp->qr_rtcount != 0);
    767 	*prevp = NULL;
    768 	if (ulwp->ul_link == NULL) {	/* only one lwp on the queue */
    769 		*more = 0;
    770 		ulwp->ul_rt = rt;
    771 		return (&qrp->qr_head);
    772 	}
    773 	*more = 1;
    774 
    775 	if (rt)		/* real-time queue */
    776 		return (queue_slot_rt(&qrp->qr_head, prevp));
    777 	/*
    778 	 * Try not to return a suspended thread.
    779 	 * This mimics the old libthread's behavior.
    780 	 */
    781 	if (ulwp->ul_stop &&
    782 	    (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
    783 		ulwp = *ulwpp;
    784 		ulwp->ul_rt = 0;
    785 		return (ulwpp);
    786 	}
    787 	/*
    788 	 * The common case; just pick the first thread on the queue.
    789 	 */
    790 	ulwp->ul_rt = 0;
    791 	return (&qrp->qr_head);
    792 }
    793 
    794 /*
    795  * Common code for unlinking an lwp from a user-level sleep queue.
    796  */
    797 void
    798 queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
    799 {
    800 	queue_root_t *qrp = qp->qh_root;
    801 	queue_root_t *nqrp;
    802 	ulwp_t *ulwp = *ulwpp;
    803 	ulwp_t *next;
    804 
    805 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
    806 	ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);
    807 
    808 	DECR(qp->qh_qlen);
    809 	DECR(qrp->qr_qlen);
    810 	if (ulwp->ul_rtqueued) {
    811 		ulwp->ul_rtqueued = 0;
    812 		qrp->qr_rtcount--;
    813 	}
    814 	next = ulwp->ul_link;
    815 	*ulwpp = next;
    816 	ulwp->ul_link = NULL;
    817 	if (qrp->qr_tail == ulwp)
    818 		qrp->qr_tail = prev;
    819 	if (qrp == &ulwp->ul_queue_root) {
    820 		/*
    821 		 * We can't continue to use the unlinked thread's
    822 		 * queue root for the linkage.
    823 		 */
    824 		queue_root_t *qr_next = qrp->qr_next;
    825 		queue_root_t *qr_prev = qrp->qr_prev;
    826 
    827 		if (qrp->qr_tail) {
    828 			/* switch to using the last thread's queue root */
    829 			ASSERT(qrp->qr_qlen != 0);
    830 			nqrp = &qrp->qr_tail->ul_queue_root;
    831 			*nqrp = *qrp;
    832 			if (qr_next)
    833 				qr_next->qr_prev = nqrp;
    834 			if (qr_prev)
    835 				qr_prev->qr_next = nqrp;
    836 			else
    837 				qp->qh_hlist = nqrp;
    838 			qp->qh_root = nqrp;
    839 		} else {
    840 			/* empty queue root; just delete from the hash list */
    841 			ASSERT(qrp->qr_qlen == 0);
    842 			if (qr_next)
    843 				qr_next->qr_prev = qr_prev;
    844 			if (qr_prev)
    845 				qr_prev->qr_next = qr_next;
    846 			else
    847 				qp->qh_hlist = qr_next;
    848 			qp->qh_root = NULL;
    849 			DECR(qp->qh_hlen);
    850 		}
    851 	}
    852 }
    853 
    854 ulwp_t *
    855 dequeue(queue_head_t *qp, int *more)
    856 {
    857 	ulwp_t **ulwpp;
    858 	ulwp_t *ulwp;
    859 	ulwp_t *prev;
    860 
    861 	if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
    862 		return (NULL);
    863 	ulwp = *ulwpp;
    864 	queue_unlink(qp, ulwpp, prev);
    865 	ulwp->ul_sleepq = NULL;
    866 	ulwp->ul_wchan = NULL;
    867 	return (ulwp);
    868 }
    869 
    870 /*
    871  * Return a pointer to the highest priority thread sleeping on wchan.
    872  */
    873 ulwp_t *
    874 queue_waiter(queue_head_t *qp)
    875 {
    876 	ulwp_t **ulwpp;
    877 	ulwp_t *prev;
    878 	int more;
    879 
    880 	if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
    881 		return (NULL);
    882 	return (*ulwpp);
    883 }
    884 
    885 int
    886 dequeue_self(queue_head_t *qp)
    887 {
    888 	ulwp_t *self = curthread;
    889 	queue_root_t *qrp;
    890 	ulwp_t **ulwpp;
    891 	ulwp_t *ulwp;
    892 	ulwp_t *prev;
    893 	int found = 0;
    894 
    895 	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
    896 
    897 	/* find self on the sleep queue */
    898 	if ((qrp = qp->qh_root) != NULL) {
    899 		for (prev = NULL, ulwpp = &qrp->qr_head;
    900 		    (ulwp = *ulwpp) != NULL;
    901 		    prev = ulwp, ulwpp = &ulwp->ul_link) {
    902 			if (ulwp == self) {
    903 				queue_unlink(qp, ulwpp, prev);
    904 				self->ul_cvmutex = NULL;
    905 				self->ul_sleepq = NULL;
    906 				self->ul_wchan = NULL;
    907 				found = 1;
    908 				break;
    909 			}
    910 		}
    911 	}
    912 
    913 	if (!found)
    914 		thr_panic("dequeue_self(): curthread not found on queue");
    915 
    916 	return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
    917 }
    918 
    919 /*
    920  * Called from call_user_handler() and _thrp_suspend() to take
    921  * ourself off of our sleep queue so we can grab locks.
    922  */
    923 void
    924 unsleep_self(void)
    925 {
    926 	ulwp_t *self = curthread;
    927 	queue_head_t *qp;
    928 
    929 	/*
    930 	 * Calling enter_critical()/exit_critical() here would lead
    931 	 * to recursion.  Just manipulate self->ul_critical directly.
    932 	 */
    933 	self->ul_critical++;
    934 	while (self->ul_sleepq != NULL) {
    935 		qp = queue_lock(self->ul_wchan, self->ul_qtype);
    936 		/*
    937 		 * We may have been moved from a CV queue to a
    938 		 * mutex queue while we were attempting queue_lock().
    939 		 * If so, just loop around and try again.
    940 		 * dequeue_self() clears self->ul_sleepq.
    941 		 */
    942 		if (qp == self->ul_sleepq)
    943 			(void) dequeue_self(qp);
    944 		queue_unlock(qp);
    945 	}
    946 	self->ul_writer = 0;
    947 	self->ul_critical--;
    948 }
    949 
    950 /*
    951  * Common code for calling the the ___lwp_mutex_timedlock() system call.
    952  * Returns with mutex_owner and mutex_ownerpid set correctly.
    953  */
    954 static int
    955 mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
    956 {
    957 	ulwp_t *self = curthread;
    958 	uberdata_t *udp = self->ul_uberdata;
    959 	int mtype = mp->mutex_type;
    960 	hrtime_t begin_sleep;
    961 	int acquired;
    962 	int error;
    963 
    964 	self->ul_sp = stkptr();
    965 	self->ul_wchan = mp;
    966 	if (__td_event_report(self, TD_SLEEP, udp)) {
    967 		self->ul_td_evbuf.eventnum = TD_SLEEP;
    968 		self->ul_td_evbuf.eventdata = mp;
    969 		tdb_event(TD_SLEEP, udp);
    970 	}
    971 	if (msp) {
    972 		tdb_incr(msp->mutex_sleep);
    973 		begin_sleep = gethrtime();
    974 	}
    975 
    976 	DTRACE_PROBE1(plockstat, mutex__block, mp);
    977 
    978 	for (;;) {
    979 		/*
    980 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
    981 		 * means we successfully acquired the lock.
    982 		 */
    983 		if ((error = ___lwp_mutex_timedlock(mp, tsp)) != 0 &&
    984 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
    985 			acquired = 0;
    986 			break;
    987 		}
    988 
    989 		if (mtype & USYNC_PROCESS) {
    990 			/*
    991 			 * Defend against forkall().  We may be the child,
    992 			 * in which case we don't actually own the mutex.
    993 			 */
    994 			enter_critical(self);
    995 			if (mp->mutex_ownerpid == udp->pid) {
    996 				mp->mutex_owner = (uintptr_t)self;
    997 				exit_critical(self);
    998 				acquired = 1;
    999 				break;
   1000 			}
   1001 			exit_critical(self);
   1002 		} else {
   1003 			mp->mutex_owner = (uintptr_t)self;
   1004 			acquired = 1;
   1005 			break;
   1006 		}
   1007 	}
   1008 	if (msp)
   1009 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
   1010 	self->ul_wchan = NULL;
   1011 	self->ul_sp = 0;
   1012 
   1013 	if (acquired) {
   1014 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
   1015 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   1016 	} else {
   1017 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
   1018 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
   1019 	}
   1020 
   1021 	return (error);
   1022 }
   1023 
   1024 /*
   1025  * Common code for calling the ___lwp_mutex_trylock() system call.
   1026  * Returns with mutex_owner and mutex_ownerpid set correctly.
   1027  */
   1028 int
   1029 mutex_trylock_kernel(mutex_t *mp)
   1030 {
   1031 	ulwp_t *self = curthread;
   1032 	uberdata_t *udp = self->ul_uberdata;
   1033 	int mtype = mp->mutex_type;
   1034 	int error;
   1035 	int acquired;
   1036 
   1037 	for (;;) {
   1038 		/*
   1039 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
   1040 		 * means we successfully acquired the lock.
   1041 		 */
   1042 		if ((error = ___lwp_mutex_trylock(mp)) != 0 &&
   1043 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
   1044 			acquired = 0;
   1045 			break;
   1046 		}
   1047 
   1048 		if (mtype & USYNC_PROCESS) {
   1049 			/*
   1050 			 * Defend against forkall().  We may be the child,
   1051 			 * in which case we don't actually own the mutex.
   1052 			 */
   1053 			enter_critical(self);
   1054 			if (mp->mutex_ownerpid == udp->pid) {
   1055 				mp->mutex_owner = (uintptr_t)self;
   1056 				exit_critical(self);
   1057 				acquired = 1;
   1058 				break;
   1059 			}
   1060 			exit_critical(self);
   1061 		} else {
   1062 			mp->mutex_owner = (uintptr_t)self;
   1063 			acquired = 1;
   1064 			break;
   1065 		}
   1066 	}
   1067 
   1068 	if (acquired) {
   1069 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
   1070 	} else if (error != EBUSY) {
   1071 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
   1072 	}
   1073 
   1074 	return (error);
   1075 }
   1076 
   1077 volatile sc_shared_t *
   1078 setup_schedctl(void)
   1079 {
   1080 	ulwp_t *self = curthread;
   1081 	volatile