Diffstat (limited to 'usr/src/lib/libc/port/threads/synch.c')
-rw-r--r-- usr/src/lib/libc/port/threads/synch.c 3194
1 file changed, 3194 insertions, 0 deletions
diff --git a/usr/src/lib/libc/port/threads/synch.c b/usr/src/lib/libc/port/threads/synch.c
new file mode 100644
index 0000000000..21ecb0a2b7
--- /dev/null
+++ b/usr/src/lib/libc/port/threads/synch.c
@@ -0,0 +1,3194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/sdt.h>
+
+#include "lint.h"
+#include "thr_uberdata.h"
+
+/*
+ * This mutex is initialized to be held by lwp#1.
+ * It is used to block a thread that has returned from a mutex_lock()
+ * of a PTHREAD_PRIO_INHERIT mutex with an unrecoverable error.
+ */
+mutex_t stall_mutex = DEFAULTMUTEX;
+
+static int shared_mutex_held(mutex_t *);
+
+/*
+ * Lock statistics support functions.
+ */
+void
+record_begin_hold(tdb_mutex_stats_t *msp)
+{
+ tdb_incr(msp->mutex_lock);
+ msp->mutex_begin_hold = gethrtime();
+}
+
+hrtime_t
+record_hold_time(tdb_mutex_stats_t *msp)
+{
+ hrtime_t now = gethrtime();
+
+ if (msp->mutex_begin_hold)
+ msp->mutex_hold_time += now - msp->mutex_begin_hold;
+ msp->mutex_begin_hold = 0;
+ return (now);
+}
+
+/*
+ * Called once at library initialization.
+ */
+void
+mutex_setup(void)
+{
+ if (set_lock_byte(&stall_mutex.mutex_lockw))
+ thr_panic("mutex_setup() cannot acquire stall_mutex");
+ stall_mutex.mutex_owner = (uintptr_t)curthread;
+}
+
+/*
+ * The default spin counts of 1000 and 500 are experimentally determined.
+ * On sun4u machines with any number of processors they could be raised
+ * to 10,000 but that (experimentally) makes almost no difference.
+ * The environment variables:
+ * _THREAD_ADAPTIVE_SPIN=count
+ * _THREAD_RELEASE_SPIN=count
+ * can be used to override and set the counts in the range [0 .. 1,000,000].
+ */
+int thread_adaptive_spin = 1000;
+uint_t thread_max_spinners = 100;
+int thread_release_spin = 500;
+int thread_queue_verify = 0;
+static int ncpus;
+
+/*
+ * Distinguish spinning for queue locks from spinning for regular locks.
+ * The environment variable:
+ * _THREAD_QUEUE_SPIN=count
+ * can be used to override and set the count in the range [0 .. 1,000,000].
+ * There is no release spin concept for queue locks.
+ */
+int thread_queue_spin = 1000;
+
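+/*
+ * Illustrative sketch, not part of this file: one way a tunable such
+ * as _THREAD_ADAPTIVE_SPIN could be read from the environment and
+ * clamped to the documented range [0 .. 1,000,000]. The helper name
+ * and the getenv()/strtol() parsing are assumptions for illustration;
+ * the real parsing lives elsewhere in libc.
+ */
+#if 0 /* example only */
+#include <stdlib.h>
+
+static int
+get_spin_tunable(const char *name, int dflt)
+{
+ const char *s = getenv(name);
+ char *end;
+ long val;
+
+ if (s == NULL || *s == '\0')
+ return (dflt);
+ val = strtol(s, &end, 10);
+ if (*end != '\0' || val < 0 || val > 1000000)
+ return (dflt); /* out of range: keep the default */
+ return ((int)val);
+}
+#endif
+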
+/*
+ * Use the otherwise-unused 'mutex_ownerpid' field of a USYNC_THREAD
+ * mutex to be a count of adaptive spins in progress.
+ */
+#define mutex_spinners mutex_ownerpid
+
+void
+_mutex_set_typeattr(mutex_t *mp, int attr)
+{
+ mp->mutex_type |= (uint8_t)attr;
+}
+
+/*
+ * 'type' can be one of USYNC_THREAD or USYNC_PROCESS, possibly
+ * augmented by the flags LOCK_RECURSIVE and/or LOCK_ERRORCHECK,
+ * or it can be USYNC_PROCESS_ROBUST with no extra flags.
+ */
+#pragma weak _private_mutex_init = __mutex_init
+#pragma weak mutex_init = __mutex_init
+#pragma weak _mutex_init = __mutex_init
+/* ARGSUSED2 */
+int
+__mutex_init(mutex_t *mp, int type, void *arg)
+{
+ int error;
+
+ switch (type & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) {
+ case USYNC_THREAD:
+ case USYNC_PROCESS:
+ (void) _memset(mp, 0, sizeof (*mp));
+ mp->mutex_type = (uint8_t)type;
+ mp->mutex_flag = LOCK_INITED;
+ error = 0;
+ break;
+ case USYNC_PROCESS_ROBUST:
+ if (type & (LOCK_RECURSIVE|LOCK_ERRORCHECK))
+ error = EINVAL;
+ else
+ error = ___lwp_mutex_init(mp, type);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ if (error == 0)
+ mp->mutex_magic = MUTEX_MAGIC;
+ return (error);
+}
+
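+/*
+ * Caller's view (illustrative, not part of this file): initializing a
+ * recursive, error-checking intraprocess mutex through the documented
+ * mutex_init(3C) entry point, which resolves to __mutex_init() above.
+ */
+#if 0 /* example only */
+#include <synch.h>
+
+static mutex_t example_lock;
+
+static int
+example_init(void)
+{
+ /* USYNC_THREAD augmented by the two optional flags */
+ return (mutex_init(&example_lock,
+ USYNC_THREAD | LOCK_RECURSIVE | LOCK_ERRORCHECK, NULL));
+}
+#endif
+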
+/*
+ * Delete mp from list of ceil mutexes owned by curthread.
+ * Return 1 if the head of the chain was updated.
+ */
+int
+_ceil_mylist_del(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ mxchain_t **mcpp;
+ mxchain_t *mcp;
+
+ mcpp = &self->ul_mxchain;
+ while ((*mcpp)->mxchain_mx != mp)
+ mcpp = &(*mcpp)->mxchain_next;
+ mcp = *mcpp;
+ *mcpp = mcp->mxchain_next;
+ lfree(mcp, sizeof (*mcp));
+ return (mcpp == &self->ul_mxchain);
+}
+
+/*
+ * Add mp to head of list of ceil mutexes owned by curthread.
+ * Return ENOMEM if no memory could be allocated.
+ */
+int
+_ceil_mylist_add(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ mxchain_t *mcp;
+
+ if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
+ return (ENOMEM);
+ mcp->mxchain_mx = mp;
+ mcp->mxchain_next = self->ul_mxchain;
+ self->ul_mxchain = mcp;
+ return (0);
+}
+
+/*
+ * Inherit priority from ceiling. The inheritance impacts the effective
+ * priority, not the assigned priority. See _thread_setschedparam_main().
+ */
+void
+_ceil_prio_inherit(int ceil)
+{
+ ulwp_t *self = curthread;
+ struct sched_param param;
+
+ (void) _memset(&param, 0, sizeof (param));
+ param.sched_priority = ceil;
+ if (_thread_setschedparam_main(self->ul_lwpid,
+ self->ul_policy, &param, PRIO_INHERIT)) {
+ /*
+ * Panic, since it is unclear what error code to return.
+ * If we ever do return the error codes of the routine
+ * called above, the man page must be updated...
+ */
+ thr_panic("_thread_setschedparam_main() fails");
+ }
+}
+
+/*
+ * Waive inherited ceiling priority. Inherit from head of owned ceiling locks
+ * if holding at least one ceiling lock. If no ceiling locks are held at this
+ * point, disinherit completely, reverting to the assigned priority.
+ */
+void
+_ceil_prio_waive(void)
+{
+ ulwp_t *self = curthread;
+ struct sched_param param;
+
+ (void) _memset(&param, 0, sizeof (param));
+ if (self->ul_mxchain == NULL) {
+ /*
+ * No ceil locks held. Zero the epri and revert to ul_pri.
+ * Since the thread's hash lock is not held, we cannot just
+ * read ul_pri here; the called routine does it for us...
+ */
+ param.sched_priority = self->ul_pri; /* ignored */
+ if (_thread_setschedparam_main(self->ul_lwpid,
+ self->ul_policy, &param, PRIO_DISINHERIT))
+ thr_panic("_thread_setschedparam_main() fails");
+ } else {
+ /*
+ * Set priority to that of the mutex at the head
+ * of the ceilmutex chain.
+ */
+ param.sched_priority =
+ self->ul_mxchain->mxchain_mx->mutex_ceiling;
+ if (_thread_setschedparam_main(self->ul_lwpid,
+ self->ul_policy, &param, PRIO_INHERIT))
+ thr_panic("_thread_setschedparam_main() fails");
+ }
+}
+
+/*
+ * Non-preemptive spin locks. Used by queue_lock().
+ * No lock statistics are gathered for these locks.
+ */
+void
+spin_lock_set(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+
+ no_preempt(self);
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ return;
+ }
+ /*
+ * Spin for a while, attempting to acquire the lock.
+ */
+ if (self->ul_spin_lock_spin != UINT_MAX)
+ self->ul_spin_lock_spin++;
+ if (mutex_queuelock_adaptive(mp) == 0 ||
+ set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ return;
+ }
+ /*
+ * Try harder if we were previously at a no-preemption level.
+ */
+ if (self->ul_preempt > 1) {
+ if (self->ul_spin_lock_spin2 != UINT_MAX)
+ self->ul_spin_lock_spin2++;
+ if (mutex_queuelock_adaptive(mp) == 0 ||
+ set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ return;
+ }
+ }
+ /*
+ * Give up and block in the kernel for the mutex.
+ */
+ if (self->ul_spin_lock_sleep != UINT_MAX)
+ self->ul_spin_lock_sleep++;
+ (void) ___lwp_mutex_timedlock(mp, NULL);
+ mp->mutex_owner = (uintptr_t)self;
+}
+
+void
+spin_lock_clear(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+
+ mp->mutex_owner = 0;
+ if (swap32(&mp->mutex_lockword, 0) & WAITERMASK) {
+ (void) ___lwp_mutex_wakeup(mp);
+ if (self->ul_spin_lock_wakeup != UINT_MAX)
+ self->ul_spin_lock_wakeup++;
+ }
+ preempt(self);
+}
+
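+/*
+ * Standalone model of the try/spin/block ladder above, a sketch only:
+ * C11 atomics stand in for set_lock_byte() and sched_yield() stands
+ * in for parking in the kernel; the model_ names are hypothetical.
+ */
+#if 0 /* example only */
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <sched.h>
+
+typedef struct {
+ atomic_flag locked;
+} model_spinlock_t;
+
+static bool
+model_try(model_spinlock_t *lk)
+{
+ /* true on acquisition, like set_lock_byte() returning zero */
+ return (!atomic_flag_test_and_set_explicit(&lk->locked,
+ memory_order_acquire));
+}
+
+static void
+model_lock(model_spinlock_t *lk, int max_spins)
+{
+ int count = 0;
+
+ while (!model_try(lk)) {
+ if (++count < max_spins)
+ continue; /* keep spinning */
+ (void) sched_yield(); /* stand-in for blocking in the kernel */
+ }
+}
+#endif
+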
+/*
+ * Allocate the sleep queue hash table.
+ */
+void
+queue_alloc(void)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ void *data;
+ int i;
+
+ /*
+ * No locks are needed; we call here only when single-threaded.
+ */
+ ASSERT(self == udp->ulwp_one);
+ ASSERT(!udp->uberflags.uf_mt);
+ if ((data = _private_mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
+ PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
+ == MAP_FAILED)
+ thr_panic("cannot allocate thread queue_head table");
+ udp->queue_head = (queue_head_t *)data;
+ for (i = 0; i < 2 * QHASHSIZE; i++)
+ udp->queue_head[i].qh_lock.mutex_magic = MUTEX_MAGIC;
+}
+
+#if defined(THREAD_DEBUG)
+
+/*
+ * Debugging: verify correctness of a sleep queue.
+ */
+void
+QVERIFY(queue_head_t *qp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ ulwp_t *ulwp;
+ ulwp_t *prev;
+ uint_t index;
+ uint32_t cnt = 0;
+ char qtype;
+ void *wchan;
+
+ ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
+ ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
+ ASSERT((qp->qh_head != NULL && qp->qh_tail != NULL) ||
+ (qp->qh_head == NULL && qp->qh_tail == NULL));
+ if (!thread_queue_verify)
+ return;
+ /* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
+ qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
+ for (prev = NULL, ulwp = qp->qh_head; ulwp != NULL;
+ prev = ulwp, ulwp = ulwp->ul_link, cnt++) {
+ ASSERT(ulwp->ul_qtype == qtype);
+ ASSERT(ulwp->ul_wchan != NULL);
+ ASSERT(ulwp->ul_sleepq == qp);
+ wchan = ulwp->ul_wchan;
+ index = QUEUE_HASH(wchan, qtype);
+ ASSERT(&udp->queue_head[index] == qp);
+ }
+ ASSERT(qp->qh_tail == prev);
+ ASSERT(qp->qh_qlen == cnt);
+}
+
+#else /* THREAD_DEBUG */
+
+#define QVERIFY(qp)
+
+#endif /* THREAD_DEBUG */
+
+/*
+ * Acquire a queue head.
+ */
+queue_head_t *
+queue_lock(void *wchan, int qtype)
+{
+ uberdata_t *udp = curthread->ul_uberdata;
+ queue_head_t *qp;
+
+ ASSERT(qtype == MX || qtype == CV);
+
+ /*
+ * It is possible that we could be called while still single-threaded.
+ * If so, we call queue_alloc() to allocate the queue_head[] array.
+ */
+ if ((qp = udp->queue_head) == NULL) {
+ queue_alloc();
+ qp = udp->queue_head;
+ }
+ qp += QUEUE_HASH(wchan, qtype);
+ spin_lock_set(&qp->qh_lock);
+ /*
+ * At one acquisition per nanosecond, qh_lockcount would take over
+ * 512 years to wrap. Were we to care about this, we could peg the
+ * value at UINT64_MAX.
+ */
+ qp->qh_lockcount++;
+ QVERIFY(qp);
+ return (qp);
+}
+
+/*
+ * Release a queue head.
+ */
+void
+queue_unlock(queue_head_t *qp)
+{
+ QVERIFY(qp);
+ spin_lock_clear(&qp->qh_lock);
+}
+
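+/*
+ * Sketch of the wchan hashing implied by QVERIFY() above: mutexes
+ * hash into heads [0 .. QHASHSIZE) and condvars into heads
+ * [QHASHSIZE .. 2*QHASHSIZE). The table size and the bit-mixing
+ * below are assumptions for illustration, not the libc definitions.
+ */
+#if 0 /* example only */
+#include <stdint.h>
+
+#define MODEL_QHASHSIZE 128 /* assumed power of two */
+
+static unsigned
+model_queue_hash(void *wchan, int is_cv)
+{
+ unsigned h = ((uintptr_t)wchan >> 3) & (MODEL_QHASHSIZE - 1);
+
+ return (h + (is_cv ? MODEL_QHASHSIZE : 0));
+}
+#endif
+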
+/*
+ * For rwlock queueing, we must queue writers ahead of readers of the
+ * same priority. We do this by making writers appear to have a half
+ * point higher priority for purposes of priority comparisons below.
+ */
+#define CMP_PRIO(ulwp) ((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
+
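+/*
+ * Worked example of the half-point trick above (a sketch with
+ * hypothetical model_ names): doubling the priority and adding 1 for
+ * a writer means a writer at priority 10 compares as 21, beating a
+ * reader at priority 10 (20) but losing to any thread at priority
+ * 11 (22 or 23).
+ */
+#if 0 /* example only */
+#include <assert.h>
+
+static int
+model_cmp_prio(int pri, int is_writer)
+{
+ return ((pri << 1) + is_writer);
+}
+
+static void
+model_cmp_prio_demo(void)
+{
+ assert(model_cmp_prio(10, 1) > model_cmp_prio(10, 0));
+ assert(model_cmp_prio(11, 0) > model_cmp_prio(10, 1));
+}
+#endif
+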
+void
+enqueue(queue_head_t *qp, ulwp_t *ulwp, void *wchan, int qtype)
+{
+ ulwp_t **ulwpp;
+ ulwp_t *next;
+ int pri = CMP_PRIO(ulwp);
+ int force_fifo = (qtype & FIFOQ);
+ int do_fifo;
+
+ qtype &= ~FIFOQ;
+ ASSERT(qtype == MX || qtype == CV);
+ ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
+ ASSERT(ulwp->ul_sleepq != qp);
+
+ /*
+ * LIFO queue ordering is unfair and can lead to starvation,
+ * but it gives better performance for heavily contended locks.
+ * We use thread_queue_fifo (range is 0..8) to determine
+ * the frequency of FIFO vs LIFO queuing:
+ * 0 : every 256th time (almost always LIFO)
+ * 1 : every 128th time
+ * 2 : every 64th time
+ * 3 : every 32nd time
+ * 4 : every 16th time (the default value, mostly LIFO)
+ * 5 : every 8th time
+ * 6 : every 4th time
+ * 7 : every 2nd time
+ * 8 : every time (never LIFO, always FIFO)
+ * Note that there is always some degree of FIFO ordering.
+ * This breaks livelock conditions that occur in applications
+ * that are written assuming (incorrectly) that threads acquire
+ * locks fairly, that is, in roughly round-robin order.
+ * (A standalone sketch of the frequency arithmetic follows
+ * this function.)
+ * In any event, the queue is maintained in priority order.
+ *
+ * If we are given the FIFOQ flag in qtype, FIFO queueing is forced.
+ * SUSV3 requires this for semaphores.
+ */
+ do_fifo = (force_fifo ||
+ ((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0);
+
+ if (qp->qh_head == NULL) {
+ /*
+ * The queue is empty. LIFO/FIFO doesn't matter.
+ */
+ ASSERT(qp->qh_tail == NULL);
+ ulwpp = &qp->qh_head;
+ } else if (do_fifo) {
+ /*
+ * Enqueue after the last thread whose priority is greater
+ * than or equal to the priority of the thread being queued.
+ * Attempt first to go directly onto the tail of the queue.
+ */
+ if (pri <= CMP_PRIO(qp->qh_tail))
+ ulwpp = &qp->qh_tail->ul_link;
+ else {
+ for (ulwpp = &qp->qh_head; (next = *ulwpp) != NULL;
+ ulwpp = &next->ul_link)
+ if (pri > CMP_PRIO(next))
+ break;
+ }
+ } else {
+ /*
+ * Enqueue before the first thread whose priority is less
+ * than or equal to the priority of the thread being queued.
+ * Hopefully we can go directly onto the head of the queue.
+ */
+ for (ulwpp = &qp->qh_head; (next = *ulwpp) != NULL;
+ ulwpp = &next->ul_link)
+ if (pri >= CMP_PRIO(next))
+ break;
+ }
+ if ((ulwp->ul_link = *ulwpp) == NULL)
+ qp->qh_tail = ulwp;
+ *ulwpp = ulwp;
+
+ ulwp->ul_sleepq = qp;
+ ulwp->ul_wchan = wchan;
+ ulwp->ul_qtype = qtype;
+ if (qp->qh_qmax < ++qp->qh_qlen)
+ qp->qh_qmax = qp->qh_qlen;
+}
+
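+/*
+ * Standalone sketch of the FIFO-frequency arithmetic used in
+ * enqueue() above: with ul_queue_fifo == 4 (the default), the low
+ * eight bits of (count << 4) are zero once every 16 enqueues, so
+ * one enqueue in 16 is FIFO.
+ */
+#if 0 /* example only */
+#include <stdio.h>
+
+static void
+fifo_freq_demo(void)
+{
+ unsigned qcnt = 0;
+ int fifo = 0;
+ int i;
+
+ for (i = 0; i < 256; i++) {
+ if (((++qcnt << 4) & 0xff) == 0)
+ fifo++;
+ }
+ (void) printf("%d of 256 enqueues were FIFO\n", fifo); /* 16 */
+}
+#endif
+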
+/*
+ * Return a pointer to the queue slot of the
+ * highest priority thread on the queue.
+ * On return, prevp, if not NULL, will contain a pointer
+ * to the thread's predecessor on the queue.
+ */
+static ulwp_t **
+queue_slot(queue_head_t *qp, void *wchan, int *more, ulwp_t **prevp)
+{
+ ulwp_t **ulwpp;
+ ulwp_t *ulwp;
+ ulwp_t *prev = NULL;
+ ulwp_t **suspp = NULL;
+ ulwp_t *susprev;
+
+ ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
+
+ /*
+ * Find a waiter on the sleep queue.
+ */
+ for (ulwpp = &qp->qh_head; (ulwp = *ulwpp) != NULL;
+ prev = ulwp, ulwpp = &ulwp->ul_link) {
+ if (ulwp->ul_wchan == wchan) {
+ if (!ulwp->ul_stop)
+ break;
+ /*
+ * Try not to return a suspended thread.
+ * This mimics the old libthread's behavior.
+ */
+ if (suspp == NULL) {
+ suspp = ulwpp;
+ susprev = prev;
+ }
+ }
+ }
+
+ if (ulwp == NULL && suspp != NULL) {
+ ulwp = *(ulwpp = suspp);
+ prev = susprev;
+ suspp = NULL;
+ }
+ if (ulwp == NULL) {
+ if (more != NULL)
+ *more = 0;
+ return (NULL);
+ }
+
+ if (prevp != NULL)
+ *prevp = prev;
+ if (more == NULL)
+ return (ulwpp);
+
+ /*
+ * Scan the remainder of the queue for another waiter.
+ */
+ if (suspp != NULL) {
+ *more = 1;
+ return (ulwpp);
+ }
+ for (ulwp = ulwp->ul_link; ulwp != NULL; ulwp = ulwp->ul_link) {
+ if (ulwp->ul_wchan == wchan) {
+ *more = 1;
+ return (ulwpp);
+ }
+ }
+
+ *more = 0;
+ return (ulwpp);
+}
+
+ulwp_t *
+dequeue(queue_head_t *qp, void *wchan, int *more)
+{
+ ulwp_t **ulwpp;
+ ulwp_t *ulwp;
+ ulwp_t *prev;
+
+ if ((ulwpp = queue_slot(qp, wchan, more, &prev)) == NULL)
+ return (NULL);
+
+ /*
+ * Dequeue the waiter.
+ */
+ ulwp = *ulwpp;
+ *ulwpp = ulwp->ul_link;
+ ulwp->ul_link = NULL;
+ if (qp->qh_tail == ulwp)
+ qp->qh_tail = prev;
+ qp->qh_qlen--;
+ ulwp->ul_sleepq = NULL;
+ ulwp->ul_wchan = NULL;
+
+ return (ulwp);
+}
+
+/*
+ * Return a pointer to the highest priority thread sleeping on wchan.
+ */
+ulwp_t *
+queue_waiter(queue_head_t *qp, void *wchan)
+{
+ ulwp_t **ulwpp;
+
+ if ((ulwpp = queue_slot(qp, wchan, NULL, NULL)) == NULL)
+ return (NULL);
+ return (*ulwpp);
+}
+
+uint8_t
+dequeue_self(queue_head_t *qp, void *wchan)
+{
+ ulwp_t *self = curthread;
+ ulwp_t **ulwpp;
+ ulwp_t *ulwp;
+ ulwp_t *prev = NULL;
+ int found = 0;
+ int more = 0;
+
+ ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
+
+ /* find self on the sleep queue */
+ for (ulwpp = &qp->qh_head; (ulwp = *ulwpp) != NULL;
+ prev = ulwp, ulwpp = &ulwp->ul_link) {
+ if (ulwp == self) {
+ /* dequeue ourself */
+ *ulwpp = self->ul_link;
+ if (qp->qh_tail == self)
+ qp->qh_tail = prev;
+ qp->qh_qlen--;
+ ASSERT(self->ul_wchan == wchan);
+ self->ul_cvmutex = NULL;
+ self->ul_sleepq = NULL;
+ self->ul_wchan = NULL;
+ self->ul_cv_wake = 0;
+ self->ul_link = NULL;
+ found = 1;
+ break;
+ }
+ if (ulwp->ul_wchan == wchan)
+ more = 1;
+ }
+
+ if (!found)
+ thr_panic("dequeue_self(): curthread not found on queue");
+
+ if (more)
+ return (1);
+
+ /* scan the remainder of the queue for another waiter */
+ for (ulwp = *ulwpp; ulwp != NULL; ulwp = ulwp->ul_link) {
+ if (ulwp->ul_wchan == wchan)
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Called from call_user_handler() and _thrp_suspend() to take
+ * ourself off our sleep queue so we can grab locks.
+ */
+void
+unsleep_self(void)
+{
+ ulwp_t *self = curthread;
+ queue_head_t *qp;
+
+ /*
+ * Calling enter_critical()/exit_critical() here would lead
+ * to recursion. Just manipulate self->ul_critical directly.
+ */
+ self->ul_critical++;
+ self->ul_writer = 0;
+ while (self->ul_sleepq != NULL) {
+ qp = queue_lock(self->ul_wchan, self->ul_qtype);
+ /*
+ * We may have been moved from a CV queue to a
+ * mutex queue while we were attempting queue_lock().
+ * If so, just loop around and try again.
+ * dequeue_self() clears self->ul_sleepq.
+ */
+ if (qp == self->ul_sleepq)
+ (void) dequeue_self(qp, self->ul_wchan);
+ queue_unlock(qp);
+ }
+ self->ul_critical--;
+}
+
+/*
+ * Common code for calling the ___lwp_mutex_timedlock() system call.
+ * Returns with mutex_owner and mutex_ownerpid set correctly.
+ */
+int
+mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ hrtime_t begin_sleep;
+ int error;
+
+ self->ul_sp = stkptr();
+ self->ul_wchan = mp;
+ if (__td_event_report(self, TD_SLEEP, udp)) {
+ self->ul_td_evbuf.eventnum = TD_SLEEP;
+ self->ul_td_evbuf.eventdata = mp;
+ tdb_event(TD_SLEEP, udp);
+ }
+ if (msp) {
+ tdb_incr(msp->mutex_sleep);
+ begin_sleep = gethrtime();
+ }
+
+ DTRACE_PROBE1(plockstat, mutex__block, mp);
+
+ for (;;) {
+ if ((error = ___lwp_mutex_timedlock(mp, tsp)) != 0) {
+ DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
+ DTRACE_PROBE2(plockstat, mutex__error, mp, error);
+ break;
+ }
+
+ if (mp->mutex_type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
+ /*
+ * Defend against forkall(). We may be the child,
+ * in which case we don't actually own the mutex.
+ */
+ enter_critical(self);
+ if (mp->mutex_ownerpid == udp->pid) {
+ mp->mutex_owner = (uintptr_t)self;
+ exit_critical(self);
+ DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp,
+ 0, 0);
+ break;
+ }
+ exit_critical(self);
+ } else {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ break;
+ }
+ }
+ if (msp)
+ msp->mutex_sleep_time += gethrtime() - begin_sleep;
+ self->ul_wchan = NULL;
+ self->ul_sp = 0;
+
+ return (error);
+}
+
+/*
+ * Common code for calling the ___lwp_mutex_trylock() system call.
+ * Returns with mutex_owner and mutex_ownerpid set correctly.
+ */
+int
+mutex_trylock_kernel(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ int error;
+
+ for (;;) {
+ if ((error = ___lwp_mutex_trylock(mp)) != 0) {
+ if (error != EBUSY) {
+ DTRACE_PROBE2(plockstat, mutex__error, mp,
+ error);
+ }
+ break;
+ }
+
+ if (mp->mutex_type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
+ /*
+ * Defend against forkall(). We may be the child,
+ * in which case we don't actually own the mutex.
+ */
+ enter_critical(self);
+ if (mp->mutex_ownerpid == udp->pid) {
+ mp->mutex_owner = (uintptr_t)self;
+ exit_critical(self);
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp,
+ 0, 0);
+ break;
+ }
+ exit_critical(self);
+ } else {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ break;
+ }
+ }
+
+ return (error);
+}
+
+volatile sc_shared_t *
+setup_schedctl(void)
+{
+ ulwp_t *self = curthread;
+ volatile sc_shared_t *scp;
+ sc_shared_t *tmp;
+
+ if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
+ !self->ul_vfork && /* not a child of vfork() */
+ !self->ul_schedctl_called) { /* haven't been called before */
+ enter_critical(self);
+ self->ul_schedctl_called = &self->ul_uberdata->uberflags;
+ if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
+ self->ul_schedctl = scp = tmp;
+ exit_critical(self);
+ }
+ /*
+ * Unless the call to setup_schedctl() is surrounded
+ * by enter_critical()/exit_critical(), the address
+ * we are returning could be invalid due to a forkall()
+ * having occurred in another thread.
+ */
+ return (scp);
+}
+
+/*
+ * Interfaces from libsched, incorporated into libc.
+ * libsched.so.1 is now a filter library onto libc.
+ */
+#pragma weak schedctl_lookup = _schedctl_init
+#pragma weak _schedctl_lookup = _schedctl_init
+#pragma weak schedctl_init = _schedctl_init
+schedctl_t *
+_schedctl_init(void)
+{
+ volatile sc_shared_t *scp = setup_schedctl();
+ return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
+}
+
+#pragma weak schedctl_exit = _schedctl_exit
+void
+_schedctl_exit(void)
+{
+}
+
+/*
+ * Contract private interface for java.
+ * Set up the schedctl data if it doesn't exist yet.
+ * Return a pointer to the pointer to the schedctl data.
+ */
+volatile sc_shared_t *volatile *
+_thr_schedctl(void)
+{
+ ulwp_t *self = curthread;
+ volatile sc_shared_t *volatile *ptr;
+
+ if (self->ul_vfork)
+ return (NULL);
+ if (*(ptr = &self->ul_schedctl) == NULL)
+ (void) setup_schedctl();
+ return (ptr);
+}
+
+/*
+ * Block signals and attempt to block preemption.
+ * no_preempt()/preempt() must be used in pairs but can be nested.
+ */
+void
+no_preempt(ulwp_t *self)
+{
+ volatile sc_shared_t *scp;
+
+ if (self->ul_preempt++ == 0) {
+ enter_critical(self);
+ if ((scp = self->ul_schedctl) != NULL ||
+ (scp = setup_schedctl()) != NULL) {
+ /*
+ * Save the pre-existing preempt value.
+ */
+ self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
+ scp->sc_preemptctl.sc_nopreempt = 1;
+ }
+ }
+}
+
+/*
+ * Undo the effects of no_preempt().
+ */
+void
+preempt(ulwp_t *self)
+{
+ volatile sc_shared_t *scp;
+
+ ASSERT(self->ul_preempt > 0);
+ if (--self->ul_preempt == 0) {
+ if ((scp = self->ul_schedctl) != NULL) {
+ /*
+ * Restore the pre-existing preempt value.
+ */
+ scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
+ if (scp->sc_preemptctl.sc_yield &&
+ scp->sc_preemptctl.sc_nopreempt == 0) {
+ lwp_yield();
+ if (scp->sc_preemptctl.sc_yield) {
+ /*
+ * Shouldn't happen. This is either
+ * a race condition or the thread
+ * just entered the real-time class.
+ */
+ lwp_yield();
+ scp->sc_preemptctl.sc_yield = 0;
+ }
+ }
+ }
+ exit_critical(self);
+ }
+}
+
+/*
+ * If a call to preempt() would cause the current thread to yield or to
+ * take deferred actions in exit_critical(), then unpark the specified
+ * lwp so it can run while we delay. Return the original lwpid if the
+ * unpark was not performed, else return zero. The tests are a repeat
+ * of some of the tests in preempt(), above. This is a statistical
+ * optimization solely for cond_sleep_queue(), below.
+ */
+static lwpid_t
+preempt_unpark(ulwp_t *self, lwpid_t lwpid)
+{
+ volatile sc_shared_t *scp = self->ul_schedctl;
+
+ ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
+ if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
+ (self->ul_curplease && self->ul_critical == 1)) {
+ (void) __lwp_unpark(lwpid);
+ lwpid = 0;
+ }
+ return (lwpid);
+}
+
+/*
+ * Spin for a while, trying to grab the lock. We know that we
+ * failed set_lock_byte(&mp->mutex_lockw) once before coming here.
+ * If this fails, return EBUSY and let the caller deal with it.
+ * If this succeeds, return 0 with mutex_owner set to curthread.
+ */
+int
+mutex_trylock_adaptive(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ ulwp_t *ulwp;
+ volatile sc_shared_t *scp;
+ volatile uint8_t *lockp;
+ volatile uint64_t *ownerp;
+ int count, max = self->ul_adaptive_spin;
+
+ ASSERT(!(mp->mutex_type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)));
+
+ if (max == 0 || (mp->mutex_spinners >= self->ul_max_spinners))
+ return (EBUSY);
+
+ lockp = (volatile uint8_t *)&mp->mutex_lockw;
+ ownerp = (volatile uint64_t *)&mp->mutex_owner;
+
+ DTRACE_PROBE1(plockstat, mutex__spin, mp);
+
+ /*
+ * This spin loop is unfair to lwps that have already dropped into
+ * the kernel to sleep. They will starve on a highly-contended mutex.
+ * This is just too bad. The adaptive spin algorithm is intended
+ * to allow programs with highly-contended locks (that is, broken
+ * programs) to execute with reasonable speed despite their contention.
+ * Being fair would reduce the speed of such programs and well-written
+ * programs will not suffer in any case.
+ */
+ enter_critical(self); /* protects ul_schedctl */
+ incr32(&mp->mutex_spinners);
+ for (count = 0; count < max; count++) {
+ if (*lockp == 0 && set_lock_byte(lockp) == 0) {
+ *ownerp = (uintptr_t)self;
+ decr32(&mp->mutex_spinners);
+ exit_critical(self);
+ DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
+ return (0);
+ }
+ SMT_PAUSE();
+ /*
+ * Stop spinning if the mutex owner is not running on
+ * a processor; it will not drop the lock any time soon
+ * and we would just be wasting time to keep spinning.
+ *
+ * Note that we are looking at another thread (ulwp_t)
+ * without ensuring that the other thread does not exit.
+ * The scheme relies on ulwp_t structures never being
+ * deallocated by the library (the library employs a free
+ * list of ulwp_t structs that are reused when new threads
+ * are created) and on schedctl shared memory never being
+ * deallocated once created via __schedctl().
+ *
+ * Thus, the worst that can happen when the spinning thread
+ * looks at the owner's schedctl data is that it is looking
+ * at some other thread's schedctl data. This almost never
+ * happens and is benign when it does.
+ */
+ if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
+ ((scp = ulwp->ul_schedctl) == NULL ||
+ scp->sc_state != SC_ONPROC))
+ break;
+ }
+ decr32(&mp->mutex_spinners);
+ exit_critical(self);
+
+ DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
+
+ return (EBUSY);
+}
+
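+/*
+ * Model of the owner-progress heuristic above, a sketch only: the
+ * spinner polls the lock word but gives up early when the owner is
+ * not running on a processor. The model_ types are simplified,
+ * hypothetical stand-ins for ulwp_t and the schedctl state.
+ */
+#if 0 /* example only */
+#include <stdatomic.h>
+#include <errno.h>
+
+#define MODEL_ONPROC 1 /* stands in for SC_ONPROC */
+
+typedef struct {
+ int sc_state;
+} model_sc_t;
+
+typedef struct {
+ atomic_int lockw;
+ model_sc_t *owner_sc; /* NULL or the owner's schedctl data */
+} model_amutex_t;
+
+static int
+model_trylock_adaptive(model_amutex_t *mp, int max)
+{
+ int count;
+
+ for (count = 0; count < max; count++) {
+ int expect = 0;
+
+ if (atomic_load(&mp->lockw) == 0 &&
+ atomic_compare_exchange_strong(&mp->lockw, &expect, 1))
+ return (0); /* acquired */
+ if (mp->owner_sc == NULL ||
+ mp->owner_sc->sc_state != MODEL_ONPROC)
+ break; /* owner off-cpu: stop wasting time */
+ }
+ return (EBUSY);
+}
+#endif
+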
+/*
+ * Same as mutex_trylock_adaptive(), except specifically for queue locks.
+ * The owner field is not set here; the caller (spin_lock_set()) sets it.
+ */
+int
+mutex_queuelock_adaptive(mutex_t *mp)
+{
+ ulwp_t *ulwp;
+ volatile sc_shared_t *scp;
+ volatile uint8_t *lockp;
+ volatile uint64_t *ownerp;
+ int count = curthread->ul_queue_spin;
+
+ ASSERT(mp->mutex_type == USYNC_THREAD);
+
+ if (count == 0)
+ return (EBUSY);
+
+ lockp = (volatile uint8_t *)&mp->mutex_lockw;
+ ownerp = (volatile uint64_t *)&mp->mutex_owner;
+ while (--count >= 0) {
+ if (*lockp == 0 && set_lock_byte(lockp) == 0)
+ return (0);
+ SMT_PAUSE();
+ if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
+ ((scp = ulwp->ul_schedctl) == NULL ||
+ scp->sc_state != SC_ONPROC))
+ break;
+ }
+
+ return (EBUSY);
+}
+
+/*
+ * Like mutex_trylock_adaptive(), but for process-shared mutexes.
+ * Spin for a while, trying to grab the lock. We know that we
+ * failed set_lock_byte(&mp->mutex_lockw) once before coming here.
+ * If this fails, return EBUSY and let the caller deal with it.
+ * If this succeeds, return 0 with mutex_owner set to curthread
+ * and mutex_ownerpid set to the current pid.
+ */
+int
+mutex_trylock_process(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ int count;
+ volatile uint8_t *lockp;
+ volatile uint64_t *ownerp;
+ volatile int32_t *pidp;
+ pid_t pid, newpid;
+ uint64_t owner, newowner;
+
+ if ((count = ncpus) == 0)
+ count = ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
+ count = (count > 1)? self->ul_adaptive_spin : 0;
+
+ ASSERT((mp->mutex_type & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) ==
+ USYNC_PROCESS);
+
+ if (count == 0)
+ return (EBUSY);
+
+ lockp = (volatile uint8_t *)&mp->mutex_lockw;
+ ownerp = (volatile uint64_t *)&mp->mutex_owner;
+ pidp = (volatile int32_t *)&mp->mutex_ownerpid;
+ owner = *ownerp;
+ pid = *pidp;
+ /*
+ * This is a process-shared mutex.
+ * We cannot know if the owner is running on a processor.
+ * We just spin and hope that it is on a processor.
+ */
+ while (--count >= 0) {
+ if (*lockp == 0) {
+ enter_critical(self);
+ if (set_lock_byte(lockp) == 0) {
+ *ownerp = (uintptr_t)self;
+ *pidp = udp->pid;
+ exit_critical(self);
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp,
+ 0, 0);
+ return (0);
+ }
+ exit_critical(self);
+ } else if ((newowner = *ownerp) == owner &&
+ (newpid = *pidp) == pid) {
+ SMT_PAUSE();
+ continue;
+ }
+ /*
+ * The owner of the lock changed; start the count over again.
+ * This may be too aggressive; it needs testing.
+ */
+ owner = newowner;
+ pid = newpid;
+ count = self->ul_adaptive_spin;
+ }
+
+ return (EBUSY);
+}
+
+/*
+ * Mutex wakeup code for releasing a USYNC_THREAD mutex.
+ * Returns the lwpid of the thread that was dequeued, if any.
+ * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
+ * to wake up the specified lwp.
+ */
+lwpid_t
+mutex_wakeup(mutex_t *mp)
+{
+ lwpid_t lwpid = 0;
+ queue_head_t *qp;
+ ulwp_t *ulwp;
+ int more;
+
+ /*
+ * Dequeue a waiter from the sleep queue. Don't touch the mutex
+ * waiters bit if no one was found on the queue because the mutex
+ * might have been deallocated or reallocated for another purpose.
+ */
+ qp = queue_lock(mp, MX);
+ if ((ulwp = dequeue(qp, mp, &more)) != NULL) {
+ lwpid = ulwp->ul_lwpid;
+ mp->mutex_waiters = (more? 1 : 0);
+ }
+ queue_unlock(qp);
+ return (lwpid);
+}
+
+/*
+ * Spin for a while, testing to see if the lock has been grabbed.
+ * If this fails, call mutex_wakeup() to release a waiter.
+ */
+lwpid_t
+mutex_unlock_queue(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ uint32_t *lockw = &mp->mutex_lockword;
+ lwpid_t lwpid;
+ volatile uint8_t *lockp;
+ volatile uint32_t *spinp;
+ int count;
+
+ /*
+ * We use the swap primitive to clear the lock, but we must
+ * atomically retain the waiters bit for the remainder of this
+ * code to work. We first check to see if the waiters bit is
+ * set and if so clear the lock by swapping in a word containing
+ * only the waiters bit. This could produce a false positive test
+ * for whether there are waiters that need to be woken up, but
+ * this just causes an extra call to mutex_wakeup() that does nothing.
+ * The opposite case is more delicate: If there are no waiters,
+ * we swap in a zero lock byte and a zero waiters bit. The result
+ * of the swap could indicate that there really was a waiter so in
+ * this case we go directly to mutex_wakeup() without performing
+ * any of the adaptive code because the waiter bit has been cleared
+ * and the adaptive code is unreliable in this case.
+ */
+ if (!(*lockw & WAITERMASK)) { /* no waiter exists right now */
+ mp->mutex_owner = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ if (!(swap32(lockw, 0) & WAITERMASK)) /* still no waiters */
+ return (0);
+ no_preempt(self); /* ensure a prompt wakeup */
+ lwpid = mutex_wakeup(mp);
+ } else {
+ no_preempt(self); /* ensure a prompt wakeup */
+ lockp = (volatile uint8_t *)&mp->mutex_lockw;
+ spinp = (volatile uint32_t *)&mp->mutex_spinners;
+ mp->mutex_owner = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ (void) swap32(lockw, WAITER); /* clear lock, retain waiter */
+
+ /*
+ * We spin here fewer times than mutex_trylock_adaptive().
+ * We are trying to balance two conflicting goals:
+ * 1. Avoid waking up anyone if a spinning thread
+ * grabs the lock.
+ * 2. Wake up a sleeping thread promptly to get on
+ * with useful work.
+ * We don't spin at all if there is no acquiring spinner
+ * (mp->mutex_spinners is non-zero if there are spinners).
+ */
+ for (count = self->ul_release_spin;
+ *spinp && count > 0; count--) {
+ /*
+ * There is a waiter that we will have to wake
+ * up unless someone else grabs the lock while
+ * we are busy spinning. Like the spin loop in
+ * mutex_trylock_adaptive(), this spin loop is
+ * unfair to lwps that have already dropped into
+ * the kernel to sleep. They will starve on a
+ * highly-contended mutex. Too bad.
+ */
+ if (*lockp != 0) { /* somebody grabbed the lock */
+ preempt(self);
+ return (0);
+ }
+ SMT_PAUSE();
+ }
+
+ /*
+ * No one grabbed the lock.
+ * Wake up some lwp that is waiting for it.
+ */
+ mp->mutex_waiters = 0;
+ lwpid = mutex_wakeup(mp);
+ }
+
+ if (lwpid == 0)
+ preempt(self);
+ return (lwpid);
+}
+
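+/*
+ * Model of the swap-based release above (sketch): atomic_exchange()
+ * stands in for swap32() and a single bit models the waiters byte.
+ * The old lockword value tells us, atomically with the release,
+ * whether a waiter existed.
+ */
+#if 0 /* example only */
+#include <stdatomic.h>
+
+#define MODEL_WAITER 0x00010000u /* stands in for WAITERMASK */
+
+static int
+model_release(atomic_uint *lockword, int keep_waiter)
+{
+ unsigned old = atomic_exchange(lockword,
+ keep_waiter ? MODEL_WAITER : 0);
+
+ return ((old & MODEL_WAITER) != 0); /* did a waiter exist? */
+}
+#endif
+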
+/*
+ * Like mutex_unlock_queue(), but for process-shared mutexes.
+ * We tested the waiters field before calling here and it was non-zero.
+ */
+void
+mutex_unlock_process(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ int count;
+ volatile uint8_t *lockp;
+
+ /*
+ * See the comments in mutex_unlock_queue(), above.
+ */
+ if ((count = ncpus) == 0)
+ count = ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
+ count = (count > 1)? self->ul_release_spin : 0;
+ no_preempt(self);
+ mp->mutex_owner = 0;
+ mp->mutex_ownerpid = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ if (count == 0) {
+ /* clear lock, test waiter */
+ if (!(swap32(&mp->mutex_lockword, 0) & WAITERMASK)) {
+ /* no waiters now */
+ preempt(self);
+ return;
+ }
+ } else {
+ /* clear lock, retain waiter */
+ (void) swap32(&mp->mutex_lockword, WAITER);
+ lockp = (volatile uint8_t *)&mp->mutex_lockw;
+ while (--count >= 0) {
+ if (*lockp != 0) {
+ /* somebody grabbed the lock */
+ preempt(self);
+ return;
+ }
+ SMT_PAUSE();
+ }
+ /*
+ * We must clear the waiters field before going
+ * to the kernel, else it could remain set forever.
+ */
+ mp->mutex_waiters = 0;
+ }
+ (void) ___lwp_mutex_wakeup(mp);
+ preempt(self);
+}
+
+/*
+ * Return the real priority of a thread.
+ */
+int
+real_priority(ulwp_t *ulwp)
+{
+ if (ulwp->ul_epri == 0)
+ return (ulwp->ul_mappedpri? ulwp->ul_mappedpri : ulwp->ul_pri);
+ return (ulwp->ul_emappedpri? ulwp->ul_emappedpri : ulwp->ul_epri);
+}
+
+void
+stall(void)
+{
+ for (;;)
+ (void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
+}
+
+/*
+ * Acquire a USYNC_THREAD mutex via user-level sleep queues.
+ * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
+ * Returns with mutex_owner set correctly.
+ */
+int
+mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
+ timespec_t *tsp)
+{
+ uberdata_t *udp = curthread->ul_uberdata;
+ queue_head_t *qp;
+ hrtime_t begin_sleep;
+ int error = 0;
+
+ self->ul_sp = stkptr();
+ if (__td_event_report(self, TD_SLEEP, udp)) {
+ self->ul_wchan = mp;
+ self->ul_td_evbuf.eventnum = TD_SLEEP;
+ self->ul_td_evbuf.eventdata = mp;
+ tdb_event(TD_SLEEP, udp);
+ }
+ if (msp) {
+ tdb_incr(msp->mutex_sleep);
+ begin_sleep = gethrtime();
+ }
+
+ DTRACE_PROBE1(plockstat, mutex__block, mp);
+
+ /*
+ * Put ourself on the sleep queue, and while we are
+ * unable to grab the lock, go park in the kernel.
+ * Take ourself off the sleep queue after we acquire the lock.
+ * The waiter bit can be set/cleared only while holding the queue lock.
+ */
+ qp = queue_lock(mp, MX);
+ enqueue(qp, self, mp, MX);
+ mp->mutex_waiters = 1;
+ for (;;) {
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ mp->mutex_waiters = dequeue_self(qp, mp);
+ break;
+ }
+ set_parking_flag(self, 1);
+ queue_unlock(qp);
+ /*
+ * __lwp_park() will return the residual time in tsp
+ * if we are unparked before the timeout expires.
+ */
+ if ((error = __lwp_park(tsp, 0)) == EINTR)
+ error = 0;
+ set_parking_flag(self, 0);
+ /*
+ * We could have taken a signal or suspended ourself.
+ * If we did, then we removed ourself from the queue.
+ * Someone else may have removed us from the queue
+ * as a consequence of mutex_unlock(). We may have
+ * gotten a timeout from __lwp_park(). Or we may still
+ * be on the queue and this is just a spurious wakeup.
+ */
+ qp = queue_lock(mp, MX);
+ if (self->ul_sleepq == NULL) {
+ if (error) {
+ DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
+ DTRACE_PROBE2(plockstat, mutex__error, mp,
+ error);
+ break;
+ }
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp,
+ 0, 0);
+ break;
+ }
+ enqueue(qp, self, mp, MX);
+ mp->mutex_waiters = 1;
+ }
+ ASSERT(self->ul_sleepq == qp &&
+ self->ul_qtype == MX &&
+ self->ul_wchan == mp);
+ if (error) {
+ mp->mutex_waiters = dequeue_self(qp, mp);
+ DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
+ DTRACE_PROBE2(plockstat, mutex__error, mp, error);
+ break;
+ }
+ }
+
+ ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
+ self->ul_wchan == NULL);
+ self->ul_sp = 0;
+
+ queue_unlock(qp);
+ if (msp)
+ msp->mutex_sleep_time += gethrtime() - begin_sleep;
+
+ ASSERT(error == 0 || error == EINVAL || error == ETIME);
+ return (error);
+}
+
+/*
+ * Returns with mutex_owner set correctly.
+ */
+int
+mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ int mtype = mp->mutex_type;
+ tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
+ int error = 0;
+
+ ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
+
+ if (!self->ul_schedctl_called)
+ (void) setup_schedctl();
+
+ if (msp && try == MUTEX_TRY)
+ tdb_incr(msp->mutex_try);
+
+ if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_is_held(mp)) {
+ if (mtype & LOCK_RECURSIVE) {
+ if (mp->mutex_rcount == RECURSION_MAX) {
+ error = EAGAIN;
+ } else {
+ mp->mutex_rcount++;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp,
+ 1, 0);
+ return (0);
+ }
+ } else if (try == MUTEX_TRY) {
+ return (EBUSY);
+ } else {
+ DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
+ return (EDEADLK);
+ }
+ }
+
+ if (self->ul_error_detection && try == MUTEX_LOCK &&
+ tsp == NULL && mutex_is_held(mp))
+ lock_error(mp, "mutex_lock", NULL, NULL);
+
+ if (mtype &
+ (USYNC_PROCESS_ROBUST|PTHREAD_PRIO_INHERIT|PTHREAD_PRIO_PROTECT)) {
+ uint8_t ceil;
+ int myprio;
+
+ if (mtype & PTHREAD_PRIO_PROTECT) {
+ ceil = mp->mutex_ceiling;
+ ASSERT(_validate_rt_prio(SCHED_FIFO, ceil) == 0);
+ myprio = real_priority(self);
+ if (myprio > ceil) {
+ DTRACE_PROBE2(plockstat, mutex__error, mp,
+ EINVAL);
+ return (EINVAL);
+ }
+ if ((error = _ceil_mylist_add(mp)) != 0) {
+ DTRACE_PROBE2(plockstat, mutex__error, mp,
+ error);
+ return (error);
+ }
+ if (myprio < ceil)
+ _ceil_prio_inherit(ceil);
+ }
+
+ if (mtype & PTHREAD_PRIO_INHERIT) {
+ /* go straight to the kernel */
+ if (try == MUTEX_TRY)
+ error = mutex_trylock_kernel(mp);
+ else /* MUTEX_LOCK */
+ error = mutex_lock_kernel(mp, tsp, msp);
+ /*
+ * The kernel never sets or clears the lock byte
+ * for PTHREAD_PRIO_INHERIT mutexes.
+ * Set it here for debugging consistency.
+ */
+ switch (error) {
+ case 0:
+ case EOWNERDEAD:
+ mp->mutex_lockw = LOCKSET;
+ break;
+ }
+ } else if (mtype & USYNC_PROCESS_ROBUST) {
+ /* go straight to the kernel */
+ if (try == MUTEX_TRY)
+ error = mutex_trylock_kernel(mp);
+ else /* MUTEX_LOCK */
+ error = mutex_lock_kernel(mp, tsp, msp);
+ } else { /* PTHREAD_PRIO_PROTECT */
+ /*
+ * Try once at user level before going to the kernel.
+ * If this is a process shared mutex then protect
+ * against forkall() while setting mp->mutex_ownerpid.
+ */
+ if (mtype & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
+ enter_critical(self);
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ mp->mutex_ownerpid = udp->pid;
+ exit_critical(self);
+ DTRACE_PROBE3(plockstat,
+ mutex__acquire, mp, 0, 0);
+ } else {
+ exit_critical(self);
+ error = EBUSY;
+ }
+ } else {
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat,
+ mutex__acquire, mp, 0, 0);
+ } else {
+ error = EBUSY;
+ }
+ }
+ if (error && try == MUTEX_LOCK)
+ error = mutex_lock_kernel(mp, tsp, msp);
+ }
+
+ if (error) {
+ if (mtype & PTHREAD_PRIO_INHERIT) {
+ switch (error) {
+ case EOWNERDEAD:
+ case ENOTRECOVERABLE:
+ if (mtype & PTHREAD_MUTEX_ROBUST_NP)
+ break;
+ if (error == EOWNERDEAD) {
+ /*
+ * We own the mutex; unlock it.
+ * It becomes ENOTRECOVERABLE.
+ * All waiters are woken up.
+ */
+ mp->mutex_owner = 0;
+ mp->mutex_ownerpid = 0;
+ DTRACE_PROBE2(plockstat,
+ mutex__release, mp, 0);
+ mp->mutex_lockw = LOCKCLEAR;
+ (void) ___lwp_mutex_unlock(mp);
+ }
+ /* FALLTHROUGH */
+ case EDEADLK:
+ if (try == MUTEX_LOCK)
+ stall();
+ error = EBUSY;
+ break;
+ }
+ }
+ if ((mtype & PTHREAD_PRIO_PROTECT) &&
+ error != EOWNERDEAD) {
+ (void) _ceil_mylist_del(mp);
+ if (myprio < ceil)
+ _ceil_prio_waive();
+ }
+ }
+ } else if (mtype & USYNC_PROCESS) {
+ /*
+ * This is a process shared mutex. Protect against
+ * forkall() while setting mp->mutex_ownerpid.
+ */
+ enter_critical(self);
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ mp->mutex_ownerpid = udp->pid;
+ exit_critical(self);
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ } else {
+ /* try a little harder */
+ exit_critical(self);
+ error = mutex_trylock_process(mp);
+ }
+ if (error && try == MUTEX_LOCK)
+ error = mutex_lock_kernel(mp, tsp, msp);
+ } else { /* USYNC_THREAD */
+ /* try once */
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ } else {
+ /* try a little harder if we don't own the mutex */
+ error = EBUSY;
+ if (MUTEX_OWNER(mp) != self)
+ error = mutex_trylock_adaptive(mp);
+ if (error && try == MUTEX_LOCK) /* go park */
+ error = mutex_lock_queue(self, msp, mp, tsp);
+ }
+ }
+
+ switch (error) {
+ case EOWNERDEAD:
+ case ELOCKUNMAPPED:
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ /* FALLTHROUGH */
+ case 0:
+ if (msp)
+ record_begin_hold(msp);
+ break;
+ default:
+ if (try == MUTEX_TRY) {
+ if (msp)
+ tdb_incr(msp->mutex_try_fail);
+ if (__td_event_report(self, TD_LOCK_TRY, udp)) {
+ self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
+ tdb_event(TD_LOCK_TRY, udp);
+ }
+ }
+ break;
+ }
+
+ return (error);
+}
+
+int
+fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+
+ /*
+ * We know that USYNC_PROCESS is set in mtype and that
+ * zero, one, or both of the flags LOCK_RECURSIVE and
+ * LOCK_ERRORCHECK are set, and that no other flags are set.
+ */
+ enter_critical(self);
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ mp->mutex_ownerpid = udp->pid;
+ exit_critical(self);
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ return (0);
+ }
+ exit_critical(self);
+
+ if ((mtype & ~USYNC_PROCESS) && shared_mutex_held(mp)) {
+ if (mtype & LOCK_RECURSIVE) {
+ if (mp->mutex_rcount == RECURSION_MAX)
+ return (EAGAIN);
+ mp->mutex_rcount++;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
+ return (0);
+ }
+ if (try == MUTEX_LOCK) {
+ DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
+ return (EDEADLK);
+ }
+ return (EBUSY);
+ }
+
+ /* try a little harder if we don't own the mutex */
+ if (!shared_mutex_held(mp) && mutex_trylock_process(mp) == 0)
+ return (0);
+
+ if (try == MUTEX_LOCK)
+ return (mutex_lock_kernel(mp, tsp, NULL));
+
+ if (__td_event_report(self, TD_LOCK_TRY, udp)) {
+ self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
+ tdb_event(TD_LOCK_TRY, udp);
+ }
+ return (EBUSY);
+}
+
+static int
+slow_lock(ulwp_t *self, mutex_t *mp, timespec_t *tsp)
+{
+ int error = 0;
+
+ if (MUTEX_OWNER(mp) == self || mutex_trylock_adaptive(mp) != 0)
+ error = mutex_lock_queue(self, NULL, mp, tsp);
+ return (error);
+}
+
+int
+mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ uberflags_t *gflags;
+ int mtype;
+
+ /*
+ * Optimize the case of USYNC_THREAD, including
+ * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
+ * no error detection, no lock statistics,
+ * and the process has only a single thread.
+ * (Most likely a traditional single-threaded application.)
+ */
+ if ((((mtype = mp->mutex_type) & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
+ udp->uberflags.uf_all) == 0) {
+ /*
+ * Only one thread exists so we don't need an atomic operation.
+ */
+ if (mp->mutex_lockw == 0) {
+ mp->mutex_lockw = LOCKSET;
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ return (0);
+ }
+ if (mtype && MUTEX_OWNER(mp) == self) {
+ /*
+ * LOCK_RECURSIVE, LOCK_ERRORCHECK, or both.
+ */
+ if (mtype & LOCK_RECURSIVE) {
+ if (mp->mutex_rcount == RECURSION_MAX)
+ return (EAGAIN);
+ mp->mutex_rcount++;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp,
+ 1, 0);
+ return (0);
+ }
+ DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
+ return (EDEADLK); /* LOCK_ERRORCHECK */
+ }
+ /*
+ * We have reached a deadlock, probably because the
+ * process is executing non-async-signal-safe code in
+ * a signal handler and is attempting to acquire a lock
+ * that it already owns. This is not surprising, given
+ * bad programming practices over the years that have
+ * resulted in applications calling printf() and such
+ * in their signal handlers. Unless the user has told
+ * us that the signal handlers are safe by setting:
+ * export _THREAD_ASYNC_SAFE=1
+ * we return EDEADLK rather than actually deadlocking.
+ */
+ if (tsp == NULL &&
+ MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
+ DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
+ return (EDEADLK);
+ }
+ }
+
+ /*
+ * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
+ * no error detection, and no lock statistics.
+ * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
+ */
+ if ((gflags = self->ul_schedctl_called) != NULL &&
+ (gflags->uf_trs_ted |
+ (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
+
+ if (mtype & USYNC_PROCESS)
+ return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
+
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ return (0);
+ }
+
+ if (mtype && MUTEX_OWNER(mp) == self) {
+ if (mtype & LOCK_RECURSIVE) {
+ if (mp->mutex_rcount == RECURSION_MAX)
+ return (EAGAIN);
+ mp->mutex_rcount++;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp,
+ 1, 0);
+ return (0);
+ }
+ DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
+ return (EDEADLK); /* LOCK_ERRORCHECK */
+ }
+
+ return (slow_lock(self, mp, tsp));
+ }
+
+ /* else do it the long way */
+ return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
+}
+
+#pragma weak _private_mutex_lock = __mutex_lock
+#pragma weak mutex_lock = __mutex_lock
+#pragma weak _mutex_lock = __mutex_lock
+#pragma weak pthread_mutex_lock = __mutex_lock
+#pragma weak _pthread_mutex_lock = __mutex_lock
+int
+__mutex_lock(mutex_t *mp)
+{
+ ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
+ return (mutex_lock_impl(mp, NULL));
+}
+
+#pragma weak pthread_mutex_timedlock = _pthread_mutex_timedlock
+int
+_pthread_mutex_timedlock(mutex_t *mp, const timespec_t *abstime)
+{
+ timespec_t tslocal;
+ int error;
+
+ ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
+ abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
+ error = mutex_lock_impl(mp, &tslocal);
+ if (error == ETIME)
+ error = ETIMEDOUT;
+ return (error);
+}
+
+#pragma weak pthread_mutex_reltimedlock_np = _pthread_mutex_reltimedlock_np
+int
+_pthread_mutex_reltimedlock_np(mutex_t *mp, const timespec_t *reltime)
+{
+ timespec_t tslocal;
+ int error;
+
+ ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
+ tslocal = *reltime;
+ error = mutex_lock_impl(mp, &tslocal);
+ if (error == ETIME)
+ error = ETIMEDOUT;
+ return (error);
+}
+
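+/*
+ * Caller's view (illustrative, not part of this file): the _np
+ * variant above takes a relative timeout, while
+ * pthread_mutex_timedlock() takes an absolute CLOCK_REALTIME time.
+ */
+#if 0 /* example only */
+#include <pthread.h>
+#include <time.h>
+
+static int
+lock_with_half_second_budget(pthread_mutex_t *mp)
+{
+ timespec_t rel = { 0, 500000000 }; /* 0.5 seconds, relative */
+
+ return (pthread_mutex_reltimedlock_np(mp, &rel));
+}
+#endif
+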
+static int
+slow_trylock(mutex_t *mp, ulwp_t *self)
+{
+ if (MUTEX_OWNER(mp) == self ||
+ mutex_trylock_adaptive(mp) != 0) {
+ uberdata_t *udp = self->ul_uberdata;
+
+ if (__td_event_report(self, TD_LOCK_TRY, udp)) {
+ self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
+ tdb_event(TD_LOCK_TRY, udp);
+ }
+ return (EBUSY);
+ }
+ return (0);
+}
+
+#pragma weak _private_mutex_trylock = __mutex_trylock
+#pragma weak mutex_trylock = __mutex_trylock
+#pragma weak _mutex_trylock = __mutex_trylock
+#pragma weak pthread_mutex_trylock = __mutex_trylock
+#pragma weak _pthread_mutex_trylock = __mutex_trylock
+int
+__mutex_trylock(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ uberflags_t *gflags;
+ int mtype;
+
+ ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
+ /*
+ * Optimize the case of USYNC_THREAD, including
+ * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
+ * no error detection, no lock statistics,
+ * and the process has only a single thread.
+ * (Most likely a traditional single-threaded application.)
+ */
+ if ((((mtype = mp->mutex_type) & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
+ udp->uberflags.uf_all) == 0) {
+ /*
+ * Only one thread exists so we don't need an atomic operation.
+ */
+ if (mp->mutex_lockw == 0) {
+ mp->mutex_lockw = LOCKSET;
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ return (0);
+ }
+ if (mtype && MUTEX_OWNER(mp) == self) {
+ if (mtype & LOCK_RECURSIVE) {
+ if (mp->mutex_rcount == RECURSION_MAX)
+ return (EAGAIN);
+ mp->mutex_rcount++;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp,
+ 1, 0);
+ return (0);
+ }
+ return (EDEADLK); /* LOCK_ERRORCHECK */
+ }
+ return (EBUSY);
+ }
+
+ /*
+ * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
+ * no error detection, and no lock statistics.
+ * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
+ */
+ if ((gflags = self->ul_schedctl_called) != NULL &&
+ (gflags->uf_trs_ted |
+ (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
+
+ if (mtype & USYNC_PROCESS)
+ return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
+
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ return (0);
+ }
+
+ if (mtype && MUTEX_OWNER(mp) == self) {
+ if (mtype & LOCK_RECURSIVE) {
+ if (mp->mutex_rcount == RECURSION_MAX)
+ return (EAGAIN);
+ mp->mutex_rcount++;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp,
+ 1, 0);
+ return (0);
+ }
+ return (EBUSY); /* LOCK_ERRORCHECK */
+ }
+
+ return (slow_trylock(mp, self));
+ }
+
+ /* else do it the long way */
+ return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
+}
+
+int
+mutex_unlock_internal(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ int mtype = mp->mutex_type;
+ tdb_mutex_stats_t *msp;
+ int error;
+ lwpid_t lwpid;
+
+ if ((mtype & LOCK_ERRORCHECK) && !mutex_is_held(mp))
+ return (EPERM);
+
+ if (self->ul_error_detection && !mutex_is_held(mp))
+ lock_error(mp, "mutex_unlock", NULL, NULL);
+
+ if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
+ mp->mutex_rcount--;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
+ return (0);
+ }
+
+ if ((msp = MUTEX_STATS(mp, udp)) != NULL)
+ (void) record_hold_time(msp);
+
+ if (mtype &
+ (USYNC_PROCESS_ROBUST|PTHREAD_PRIO_INHERIT|PTHREAD_PRIO_PROTECT)) {
+ no_preempt(self);
+ mp->mutex_owner = 0;
+ mp->mutex_ownerpid = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ if (mtype & PTHREAD_PRIO_INHERIT) {
+ mp->mutex_lockw = LOCKCLEAR;
+ error = ___lwp_mutex_unlock(mp);
+ } else if (mtype & USYNC_PROCESS_ROBUST) {
+ error = ___lwp_mutex_unlock(mp);
+ } else {
+ if (swap32(&mp->mutex_lockword, 0) & WAITERMASK)
+ (void) ___lwp_mutex_wakeup(mp);
+ error = 0;
+ }
+ if (mtype & PTHREAD_PRIO_PROTECT) {
+ if (_ceil_mylist_del(mp))
+ _ceil_prio_waive();
+ }
+ preempt(self);
+ } else if (mtype & USYNC_PROCESS) {
+ if (mp->mutex_lockword & WAITERMASK)
+ mutex_unlock_process(mp);
+ else {
+ mp->mutex_owner = 0;
+ mp->mutex_ownerpid = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ if (swap32(&mp->mutex_lockword, 0) & WAITERMASK) {
+ no_preempt(self);
+ (void) ___lwp_mutex_wakeup(mp);
+ preempt(self);
+ }
+ }
+ error = 0;
+ } else { /* USYNC_THREAD */
+ if ((lwpid = mutex_unlock_queue(mp)) != 0) {
+ (void) __lwp_unpark(lwpid);
+ preempt(self);
+ }
+ error = 0;
+ }
+
+ return (error);
+}
+
+#pragma weak _private_mutex_unlock = __mutex_unlock
+#pragma weak mutex_unlock = __mutex_unlock
+#pragma weak _mutex_unlock = __mutex_unlock
+#pragma weak pthread_mutex_unlock = __mutex_unlock
+#pragma weak _pthread_mutex_unlock = __mutex_unlock
+int
+__mutex_unlock(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ uberflags_t *gflags;
+ lwpid_t lwpid;
+ int mtype;
+ short el;
+
+ /*
+ * Optimize the case of USYNC_THREAD, including
+ * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
+ * no error detection, no lock statistics,
+ * and the process has only a single thread.
+ * (Most likely a traditional single-threaded application.)
+ */
+ if ((((mtype = mp->mutex_type) & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
+ udp->uberflags.uf_all) == 0) {
+ if (mtype) {
+ /*
+ * At this point we know that one or both of the
+ * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
+ */
+ if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
+ return (EPERM);
+ if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
+ mp->mutex_rcount--;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
+ return (0);
+ }
+ }
+ /*
+ * Only one thread exists so we don't need an atomic operation.
+ * Also, there can be no waiters.
+ */
+ mp->mutex_owner = 0;
+ mp->mutex_lockword = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ return (0);
+ }
+
+ /*
+ * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
+ * no error detection, and no lock statistics.
+ * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
+ */
+ if ((gflags = self->ul_schedctl_called) != NULL) {
+ if (((el = gflags->uf_trs_ted) | mtype) == 0) {
+fast_unlock:
+ if (!(mp->mutex_lockword & WAITERMASK)) {
+ /* no waiter exists right now */
+ mp->mutex_owner = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ if (swap32(&mp->mutex_lockword, 0) &
+ WAITERMASK) {
+ /* a waiter suddenly appeared */
+ no_preempt(self);
+ if ((lwpid = mutex_wakeup(mp)) != 0)
+ (void) __lwp_unpark(lwpid);
+ preempt(self);
+ }
+ } else if ((lwpid = mutex_unlock_queue(mp)) != 0) {
+ (void) __lwp_unpark(lwpid);
+ preempt(self);
+ }
+ return (0);
+ }
+ if (el) /* error detection or lock statistics */
+ goto slow_unlock;
+ if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
+ /*
+ * At this point we know that one or both of the
+ * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
+ */
+ if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
+ return (EPERM);
+ if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
+ mp->mutex_rcount--;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
+ return (0);
+ }
+ goto fast_unlock;
+ }
+ if ((mtype &
+ ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
+ /*
+ * At this point we know that zero, one, or both of the
+ * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
+ * that the USYNC_PROCESS flag is set.
+ */
+ if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
+ return (EPERM);
+ if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
+ mp->mutex_rcount--;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
+ return (0);
+ }
+ if (mp->mutex_lockword & WAITERMASK)
+ mutex_unlock_process(mp);
+ else {
+ mp->mutex_owner = 0;
+ mp->mutex_ownerpid = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ if (swap32(&mp->mutex_lockword, 0) &
+ WAITERMASK) {
+ no_preempt(self);
+ (void) ___lwp_mutex_wakeup(mp);
+ preempt(self);
+ }
+ }
+ return (0);
+ }
+ }
+
+ /* else do it the long way */
+slow_unlock:
+ return (mutex_unlock_internal(mp));
+}
+
+/*
+ * Internally to the library, almost all mutex lock/unlock actions
+ * go through these lmutex_ functions, to protect critical regions.
+ * We replicate a bit of code from __mutex_lock() and __mutex_unlock()
+ * to make these functions faster since we know that the mutex type
+ * of all internal locks is USYNC_THREAD. We also know that internal
+ * locking can never fail, so we panic if it does.
+ */
+void
+lmutex_lock(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+
+ ASSERT(mp->mutex_type == USYNC_THREAD);
+
+ enter_critical(self);
+ /*
+ * Optimize the case of no lock statistics and only a single thread.
+ * (Most likely a traditional single-threaded application.)
+ */
+ if (udp->uberflags.uf_all == 0) {
+ /*
+ * Only one thread exists; the mutex must be free.
+ */
+ ASSERT(mp->mutex_lockw == 0);
+ mp->mutex_lockw = LOCKSET;
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ } else {
+ tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
+
+ if (!self->ul_schedctl_called)
+ (void) setup_schedctl();
+
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ } else if (mutex_trylock_adaptive(mp) != 0) {
+ (void) mutex_lock_queue(self, msp, mp, NULL);
+ }
+
+ if (msp)
+ record_begin_hold(msp);
+ }
+}
+
+void
+lmutex_unlock(mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+
+ ASSERT(mp->mutex_type == USYNC_THREAD);
+
+ /*
+ * Optimize the case of no lock statistics and only a single thread.
+ * (Most likely a traditional single-threaded application.)
+ */
+ if (udp->uberflags.uf_all == 0) {
+ /*
+ * Only one thread exists so there can be no waiters.
+ */
+ mp->mutex_owner = 0;
+ mp->mutex_lockword = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ } else {
+ tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
+ lwpid_t lwpid;
+
+ if (msp)
+ (void) record_hold_time(msp);
+ if ((lwpid = mutex_unlock_queue(mp)) != 0) {
+ (void) __lwp_unpark(lwpid);
+ preempt(self);
+ }
+ }
+ exit_critical(self);
+}
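+
+/*
+ * A minimal sketch (not actual libc code) of the pattern the lmutex_
+ * functions above exist for: a libc-private USYNC_THREAD mutex guarding
+ * an internal data structure.  enter_critical()/exit_critical() keep
+ * the thread from being suspended while the lock is held.
+ *
+ *	static mutex_t hash_lock = DEFAULTMUTEX;	/* USYNC_THREAD */
+ *
+ *	lmutex_lock(&hash_lock);
+ *	... manipulate the libc-internal structure ...
+ *	lmutex_unlock(&hash_lock);
+ */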
+
+static int
+shared_mutex_held(mutex_t *mparg)
+{
+ /*
+ * There is an inherent data race in the current ownership design.
+ * The mutex_owner and mutex_ownerpid fields cannot be set or tested
+ * atomically as a pair. The original implementation tested each
+ * field just once, which left it exposed to trivial false positives
+ * when multiple multithreaded processes had thread addresses in
+ * common. To close the window to an acceptable level we now use a
+ * sequence of five tests: pid-thr-pid-thr-pid. This ensures that any
+ * single interruption will still leave one uninterrupted sequence of
+ * pid-thr-pid tests intact.
+ *
+ * It is assumed that all updates are always ordered thr-pid and that
+ * we have TSO hardware.
+ */
+ volatile mutex_t *mp = (volatile mutex_t *)mparg;
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+
+ if (mp->mutex_ownerpid != udp->pid)
+ return (0);
+
+ if (!MUTEX_OWNED(mp, self))
+ return (0);
+
+ if (mp->mutex_ownerpid != udp->pid)
+ return (0);
+
+ if (!MUTEX_OWNED(mp, self))
+ return (0);
+
+ if (mp->mutex_ownerpid != udp->pid)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * Some crufty old programs define their own version of _mutex_held()
+ * to be simply return(1). This breaks internal libc logic, so we
+ * define a private version for exclusive use by libc, mutex_is_held(),
+ * and also a new public function, __mutex_held(), to be used in new
+ * code to circumvent these crufty old programs.
+ */
+#pragma weak mutex_held = mutex_is_held
+#pragma weak _mutex_held = mutex_is_held
+#pragma weak __mutex_held = mutex_is_held
+int
+mutex_is_held(mutex_t *mp)
+{
+ if (mp->mutex_type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST))
+ return (shared_mutex_held(mp));
+ return (MUTEX_OWNED(mp, curthread));
+}
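+
+/*
+ * Illustrative sketch, not part of libc: __mutex_held() is the interface
+ * new code should use for ownership assertions, since it cannot be
+ * overridden by an application's own _mutex_held().  The function below
+ * is hypothetical caller code.
+ *
+ *	void
+ *	update_protected_data(mutex_t *mp)
+ *	{
+ *		assert(__mutex_held(mp));	/* caller must hold mp */
+ *		... modify the data protected by mp ...
+ *	}
+ */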
+
+#pragma weak _private_mutex_destroy = __mutex_destroy
+#pragma weak mutex_destroy = __mutex_destroy
+#pragma weak _mutex_destroy = __mutex_destroy
+#pragma weak pthread_mutex_destroy = __mutex_destroy
+#pragma weak _pthread_mutex_destroy = __mutex_destroy
+int
+__mutex_destroy(mutex_t *mp)
+{
+ mp->mutex_magic = 0;
+ mp->mutex_flag &= ~LOCK_INITED;
+ tdb_sync_obj_deregister(mp);
+ return (0);
+}
+
+/*
+ * Spin locks are separate from ordinary mutexes,
+ * but we use the same data structure for them.
+ */
+
+#pragma weak pthread_spin_init = _pthread_spin_init
+int
+_pthread_spin_init(pthread_spinlock_t *lock, int pshared)
+{
+ mutex_t *mp = (mutex_t *)lock;
+
+ (void) _memset(mp, 0, sizeof (*mp));
+ if (pshared == PTHREAD_PROCESS_SHARED)
+ mp->mutex_type = USYNC_PROCESS;
+ else
+ mp->mutex_type = USYNC_THREAD;
+ mp->mutex_flag = LOCK_INITED;
+ mp->mutex_magic = MUTEX_MAGIC;
+ return (0);
+}
+
+#pragma weak pthread_spin_destroy = _pthread_spin_destroy
+int
+_pthread_spin_destroy(pthread_spinlock_t *lock)
+{
+ (void) _memset(lock, 0, sizeof (*lock));
+ return (0);
+}
+
+#pragma weak pthread_spin_trylock = _pthread_spin_trylock
+int
+_pthread_spin_trylock(pthread_spinlock_t *lock)
+{
+ mutex_t *mp = (mutex_t *)lock;
+ ulwp_t *self = curthread;
+ int error = 0;
+
+ no_preempt(self);
+ if (set_lock_byte(&mp->mutex_lockw) != 0)
+ error = EBUSY;
+ else {
+ mp->mutex_owner = (uintptr_t)self;
+ if (mp->mutex_type == USYNC_PROCESS)
+ mp->mutex_ownerpid = self->ul_uberdata->pid;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ }
+ preempt(self);
+ return (error);
+}
+
+#pragma weak pthread_spin_lock = _pthread_spin_lock
+int
+_pthread_spin_lock(pthread_spinlock_t *lock)
+{
+ volatile uint8_t *lockp =
+ (volatile uint8_t *)&((mutex_t *)lock)->mutex_lockw;
+
+ ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
+ /*
+ * We don't care whether the owner is running on a processor.
+ * We just spin because that's what this interface requires.
+ */
+ for (;;) {
+ if (*lockp == 0) { /* lock byte appears to be clear */
+ if (_pthread_spin_trylock(lock) == 0)
+ return (0);
+ }
+ SMT_PAUSE();
+ }
+}
+
+#pragma weak pthread_spin_unlock = _pthread_spin_unlock
+int
+_pthread_spin_unlock(pthread_spinlock_t *lock)
+{
+ mutex_t *mp = (mutex_t *)lock;
+ ulwp_t *self = curthread;
+
+ no_preempt(self);
+ mp->mutex_owner = 0;
+ mp->mutex_ownerpid = 0;
+ DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
+ (void) swap32(&mp->mutex_lockword, 0);
+ preempt(self);
+ return (0);
+}
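+
+/*
+ * A usage sketch for the spin lock interfaces above (application code,
+ * not libc internals).  Spin locks suit only very short critical
+ * sections: as noted in _pthread_spin_lock(), a contender keeps
+ * spinning even when the owner is not running on a processor.
+ *
+ *	pthread_spinlock_t sl;
+ *
+ *	(void) pthread_spin_init(&sl, PTHREAD_PROCESS_PRIVATE);
+ *	(void) pthread_spin_lock(&sl);
+ *	counter++;			/* keep the held region tiny */
+ *	(void) pthread_spin_unlock(&sl);
+ *	(void) pthread_spin_destroy(&sl);
+ */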
+
+#pragma weak cond_init = _cond_init
+/* ARGSUSED2 */
+int
+_cond_init(cond_t *cvp, int type, void *arg)
+{
+ if (type != USYNC_THREAD && type != USYNC_PROCESS)
+ return (EINVAL);
+ (void) _memset(cvp, 0, sizeof (*cvp));
+ cvp->cond_type = (uint16_t)type;
+ cvp->cond_magic = COND_MAGIC;
+ return (0);
+}
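+
+/*
+ * Illustrative sketch, not part of libc: initializing a process-shared
+ * condvar with the Solaris threads interface above.  A USYNC_PROCESS
+ * condvar (and its associated mutex) must live in memory mapped into
+ * every participating process, e.g. a MAP_SHARED segment.
+ *
+ *	cond_t *cvp = ...;		/* in MAP_SHARED memory */
+ *
+ *	if (cond_init(cvp, USYNC_PROCESS, NULL) != 0)
+ *		... only USYNC_THREAD and USYNC_PROCESS are valid ...
+ */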
+
+/*
+ * cond_sleep_queue(): utility function for cond_wait_queue().
+ *
+ * Go to sleep on a condvar sleep queue, expect to be waked up
+ * by someone calling cond_signal() or cond_broadcast() or due
+ * to receiving a UNIX signal or being cancelled, or just simply
+ * due to a spurious wakeup (like someone calling forkall()).
+ *
+ * The associated mutex is *not* reacquired before returning.
+ * That must be done by the caller of cond_sleep_queue().
+ */
+int
+cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
+{
+ ulwp_t *self = curthread;
+ queue_head_t *qp;
+ queue_head_t *mqp;
+ lwpid_t lwpid;
+ int signalled;
+ int error;
+
+ /*
+ * Put ourself on the CV sleep queue, unlock the mutex, then
+ * park ourself and unpark a candidate lwp to grab the mutex.
+ * We must go onto the CV sleep queue before dropping the
+ * mutex in order to guarantee atomicity of the operation.
+ */
+ self->ul_sp = stkptr();
+ qp = queue_lock(cvp, CV);
+ enqueue(qp, self, cvp, CV);
+ cvp->cond_waiters_user = 1;
+ self->ul_cvmutex = mp;
+ self->ul_cv_wake = (tsp != NULL);
+ self->ul_signalled = 0;
+ lwpid = mutex_unlock_queue(mp);
+ for (;;) {
+ set_parking_flag(self, 1);
+ queue_unlock(qp);
+ if (lwpid != 0) {
+ lwpid = preempt_unpark(self, lwpid);
+ preempt(self);
+ }
+ /*
+ * We may have a deferred signal present,
+ * in which case we should return EINTR.
+ * Also, we may have received a SIGCANCEL; if so
+ * and we are cancelable we should return EINTR.
+ * We force an immediate EINTR return from
+ * __lwp_park() by turning our parking flag off.
+ */
+ if (self->ul_cursig != 0 ||
+ (self->ul_cancelable && self->ul_cancel_pending))
+ set_parking_flag(self, 0);
+ /*
+ * __lwp_park() will return the residual time in tsp
+ * if we are unparked before the timeout expires.
+ */
+ error = __lwp_park(tsp, lwpid);
+ set_parking_flag(self, 0);
+ lwpid = 0; /* unpark the other lwp only once */
+ /*
+ * We were waked up by cond_signal(), cond_broadcast(),
+ * by an interrupt or timeout (EINTR or ETIME),
+ * or we may just have gotten a spurious wakeup.
+ */
+ qp = queue_lock(cvp, CV);
+ mqp = queue_lock(mp, MX);
+ if (self->ul_sleepq == NULL)
+ break;
+ /*
+ * We are on either the condvar sleep queue or the
+ * mutex sleep queue. If we are on the mutex sleep
+ * queue, continue sleeping. If we are on the condvar
+ * sleep queue, break out of the sleep if we were
+ * interrupted or we timed out (EINTR or ETIME).
+ * Else this is a spurious wakeup; continue the loop.
+ */
+ if (self->ul_sleepq == mqp) /* mutex queue */
+ tsp = NULL;
+ else if (self->ul_sleepq == qp) { /* condvar queue */
+ if (error) {
+ cvp->cond_waiters_user = dequeue_self(qp, cvp);
+ break;
+ }
+ /*
+ * Else a spurious wakeup on the condvar queue.
+ * __lwp_park() has already adjusted the timeout.
+ */
+ } else {
+ thr_panic("cond_sleep_queue(): thread not on queue");
+ }
+ queue_unlock(mqp);
+ }
+
+ self->ul_sp = 0;
+ ASSERT(self->ul_cvmutex == NULL && self->ul_cv_wake == 0);
+ ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
+ self->ul_wchan == NULL);
+
+ signalled = self->ul_signalled;
+ self->ul_signalled = 0;
+ queue_unlock(qp);
+ queue_unlock(mqp);
+
+ /*
+ * If we were concurrently cond_signal()d and any of:
+ * received a UNIX signal, were cancelled, or got a timeout,
+ * then perform another cond_signal() to avoid consuming it.
+ */
+ if (error && signalled)
+ (void) cond_signal_internal(cvp);
+
+ return (error);
+}
+
+int
+cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp,
+ tdb_mutex_stats_t *msp)
+{
+ ulwp_t *self = curthread;
+ int error;
+
+ /*
+ * The old thread library was programmed to defer signals
+ * while in cond_wait() so that the associated mutex would
+ * be guaranteed to be held when the application signal
+ * handler was invoked.
+ *
+ * We do not behave this way by default; the state of the
+ * associated mutex in the signal handler is undefined.
+ *
+ * To accommodate applications that depend on the old
+ * behavior, the _THREAD_COND_WAIT_DEFER environment
+ * variable can be set to 1 and we will behave in the
+ * old way with respect to cond_wait().
+ */
+ if (self->ul_cond_wait_defer)
+ sigoff(self);
+
+ error = cond_sleep_queue(cvp, mp, tsp);
+
+ /*
+ * Reacquire the mutex.
+ */
+ if (set_lock_byte(&mp->mutex_lockw) == 0) {
+ mp->mutex_owner = (uintptr_t)self;
+ DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
+ } else if (mutex_trylock_adaptive(mp) != 0) {
+ (void) mutex_lock_queue(self, msp, mp, NULL);
+ }
+
+ if (msp)
+ record_begin_hold(msp);
+
+ /*
+ * Take any deferred signal now, after we have reacquired the mutex.
+ */
+ if (self->ul_cond_wait_defer)
+ sigon(self);
+
+ return (error);
+}
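+
+/*
+ * For example, a legacy application that assumes the mutex is held in
+ * its signal handler can be run unmodified by enabling the deferral
+ * behavior described above:
+ *
+ *	$ _THREAD_COND_WAIT_DEFER=1 ./legacy_app
+ *
+ * With deferral enabled, sigoff()/sigon() bracket the sleep, so a signal
+ * arriving during cond_wait() is taken only after the mutex has been
+ * reacquired.
+ */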
+
+/*
+ * cond_sleep_kernel(): utility function for cond_wait_kernel().
+ * See the comment ahead of cond_sleep_queue(), above.
+ */
+int
+cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
+{
+ int mtype = mp->mutex_type;
+ ulwp_t *self = curthread;
+ int error;
+
+ if (mtype & PTHREAD_PRIO_PROTECT) {
+ if (_ceil_mylist_del(mp))
+ _ceil_prio_waive();
+ }
+
+ self->ul_sp = stkptr();
+ self->ul_wchan = cvp;
+ mp->mutex_owner = 0;
+ mp->mutex_ownerpid = 0;
+ if (mtype & PTHREAD_PRIO_INHERIT)
+ mp->mutex_lockw = LOCKCLEAR;
+ /*
+ * ___lwp_cond_wait() returns immediately with EINTR if
+ * set_parking_flag(self,0) is called on this lwp before it
+ * goes to sleep in the kernel. sigacthandler() calls this
+ * when a deferred signal is noted. This assures that we don't
+ * get stuck in ___lwp_cond_wait() with all signals blocked
+ * due to taking a deferred signal before going to sleep.
+ */
+ set_parking_flag(self, 1);
+ if (self->ul_cursig != 0 ||
+ (self->ul_cancelable && self->ul_cancel_pending))
+ set_parking_flag(self, 0);
+ error = ___lwp_cond_wait(cvp, mp, tsp, 1);
+ set_parking_flag(self, 0);
+ self->ul_sp = 0;
+ self->ul_wchan = NULL;
+ return (error);
+}
+
+int
+cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
+{
+ ulwp_t *self = curthread;
+ int error;
+ int merror;
+
+ /*
+ * See the large comment in cond_wait_queue(), above.
+ */
+ if (self->ul_cond_wait_defer)
+ sigoff(self);
+
+ error = cond_sleep_kernel(cvp, mp, tsp);
+
+ /*
+ * Override the return code from ___lwp_cond_wait()
+ * with any non-zero return code from mutex_lock().
+ * This addresses robust lock failures in particular;
+ * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
+ * errors in order to take corrective action.
+ */
+ if ((merror = _private_mutex_lock(mp)) != 0)
+ error = merror;
+
+ /*
+ * Take any deferred signal now, after we have reacquired the mutex.
+ */
+ if (self->ul_cond_wait_defer)
+ sigon(self);
+
+ return (error);
+}
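+
+/*
+ * A hedged sketch of why the mutex_lock() override above matters to a
+ * caller waiting on a robust mutex (illustrative application logic
+ * only; the actual state-recovery step depends on the robust-mutex
+ * interfaces in use and is elided here):
+ *
+ *	switch (cond_wait(cvp, mp)) {
+ *	case 0:
+ *	case EINTR:
+ *		break;		/* recheck the predicate as usual */
+ *	case EOWNERDEAD:
+ *		... the previous owner died; repair shared state ...
+ *		break;
+ *	case ENOTRECOVERABLE:
+ *		... shared state cannot be repaired; give up ...
+ *		break;
+ *	}
+ */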
+
+/*
+ * Common code for _cond_wait() and _cond_timedwait()
+ */
+int
+cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
+{
+ int mtype = mp->mutex_type;
+ hrtime_t begin_sleep = 0;
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
+ tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
+ uint8_t rcount;
+ int error = 0;
+
+ /*
+ * The SUSV3 Posix spec for pthread_cond_timedwait() states:
+ * Except in the case of [ETIMEDOUT], all these error checks
+ * shall act as if they were performed immediately at the
+ * beginning of processing for the function and shall cause
+ * an error return, in effect, prior to modifying the state
+ * of the mutex specified by mutex or the condition variable
+ * specified by cond.
+ * Therefore, we must return EINVAL now if the timeout is invalid.
+ */
+ if (tsp != NULL &&
+ (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
+ return (EINVAL);
+
+ if (__td_event_report(self, TD_SLEEP, udp)) {
+ self->ul_sp = stkptr();
+ self->ul_wchan = cvp;
+ self->ul_td_evbuf.eventnum = TD_SLEEP;
+ self->ul_td_evbuf.eventdata = cvp;
+ tdb_event(TD_SLEEP, udp);
+ self->ul_sp = 0;
+ }
+ if (csp) {
+ if (tsp)
+ tdb_incr(csp->cond_timedwait);
+ else
+ tdb_incr(csp->cond_wait);
+ }
+ if (msp)
+ begin_sleep = record_hold_time(msp);
+ else if (csp)
+ begin_sleep = gethrtime();
+
+ if (self->ul_error_detection) {
+ if (!mutex_is_held(mp))
+ lock_error(mp, "cond_wait", cvp, NULL);
+ if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
+ lock_error(mp, "recursive mutex in cond_wait",
+ cvp, NULL);
+ if (cvp->cond_type & USYNC_PROCESS) {
+ if (!(mtype & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)))
+ lock_error(mp, "cond_wait", cvp,
+ "condvar process-shared, "
+ "mutex process-private");
+ } else {
+ if (mtype & (USYNC_PROCESS | USYNC_PROCESS_ROBUST))
+ lock_error(mp, "cond_wait", cvp,
+ "condvar process-private, "
+ "mutex process-shared");
+ }
+ }
+
+ /*
+ * We deal with recursive mutexes by completely
+ * dropping the lock and restoring the recursion
+ * count after waking up. This is arguably wrong,
+ * but it obeys the principle of least astonishment.
+ */
+ rcount = mp->mutex_rcount;
+ mp->mutex_rcount = 0;
+ if ((mtype & (USYNC_PROCESS | USYNC_PROCESS_ROBUST |
+ PTHREAD_PRIO_INHERIT | PTHREAD_PRIO_PROTECT)) |
+ (cvp->cond_type & USYNC_PROCESS))
+ error = cond_wait_kernel(cvp, mp, tsp);
+ else
+ error = cond_wait_queue(cvp, mp, tsp, msp);
+ mp->mutex_rcount = rcount;
+
+ if (csp) {
+ hrtime_t lapse = gethrtime() - begin_sleep;
+ if (tsp == NULL)
+ csp->cond_wait_sleep_time += lapse;
+ else {
+ csp->cond_timedwait_sleep_time += lapse;
+ if (error == ETIME)
+ tdb_incr(csp->cond_timedwait_timeout);
+ }
+ }
+ return (error);
+}
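+
+/*
+ * Illustrative, per the SUSV3 requirement quoted above: an invalid
+ * timeout fails up front, before the mutex or condvar is touched.
+ *
+ *	timespec_t ts;
+ *
+ *	ts.tv_sec = 1;
+ *	ts.tv_nsec = NANOSEC;	/* tv_nsec must be < NANOSEC: invalid */
+ *	error = cond_wait_common(cvp, mp, &ts);		/* EINVAL */
+ */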
+
+/*
+ * cond_wait() is a cancellation point but _cond_wait() is not.
+ * System libraries call the non-cancellation version.
+ * It is expected that only applications call the cancellation version.
+ */
+int
+_cond_wait(cond_t *cvp, mutex_t *mp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ uberflags_t *gflags;
+
+ /*
+ * Optimize the common case of USYNC_THREAD plus
+ * no error detection, no lock statistics, and no event tracing.
+ */
+ if ((gflags = self->ul_schedctl_called) != NULL &&
+ (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
+ self->ul_td_events_enable |
+ udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
+ return (cond_wait_queue(cvp, mp, NULL, NULL));
+
+ /*
+ * Else do it the long way.
+ */
+ return (cond_wait_common(cvp, mp, NULL));
+}
+
+int
+cond_wait(cond_t *cvp, mutex_t *mp)
+{
+ int error;
+
+ _cancelon();
+ error = _cond_wait(cvp, mp);
+ if (error == EINTR)
+ _canceloff();
+ else
+ _canceloff_nocancel();
+ return (error);
+}
+
+#pragma weak pthread_cond_wait = _pthread_cond_wait
+int
+_pthread_cond_wait(cond_t *cvp, mutex_t *mp)
+{
+ int error;
+
+ error = cond_wait(cvp, mp);
+ return ((error == EINTR)? 0 : error);
+}
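+
+/*
+ * The EINTR-to-zero mapping above is safe only because a correct caller
+ * treats every return as potentially spurious and rechecks its
+ * predicate.  A minimal sketch of the canonical pattern:
+ *
+ *	(void) pthread_mutex_lock(&m);
+ *	while (!work_ready)
+ *		(void) pthread_cond_wait(&cv, &m);
+ *	... consume the work while holding the mutex ...
+ *	(void) pthread_mutex_unlock(&m);
+ */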
+
+/*
+ * cond_timedwait() is a cancellation point but _cond_timedwait() is not.
+ * System libraries call the non-cancellation version.
+ * It is expected that only applications call the cancellation version.
+ */
+int
+_cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
+{
+ clockid_t clock_id = cvp->cond_clockid;
+ timespec_t reltime;
+ int error;
+
+ if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
+ clock_id = CLOCK_REALTIME;
+ abstime_to_reltime(clock_id, abstime, &reltime);
+ error = cond_wait_common(cvp, mp, &reltime);
+ if (error == ETIME && clock_id == CLOCK_HIGHRES) {
+ /*
+ * Don't return ETIME if we didn't really get a timeout.
+ * This can happen if we return because someone resets
+ * the system clock. Just return zero in this case,
+ * giving a spurious wakeup but not a timeout.
+ */
+ if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
+ abstime->tv_nsec > gethrtime())
+ error = 0;
+ }
+ return (error);
+}
+
+int
+cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
+{
+ int error;
+
+ _cancelon();
+ error = _cond_timedwait(cvp, mp, abstime);
+ if (error == EINTR)
+ _canceloff();
+ else
+ _canceloff_nocancel();
+ return (error);
+}
+
+#pragma weak pthread_cond_timedwait = _pthread_cond_timedwait
+int
+_pthread_cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
+{
+ int error;
+
+ error = cond_timedwait(cvp, mp, abstime);
+ if (error == ETIME)
+ error = ETIMEDOUT;
+ else if (error == EINTR)
+ error = 0;
+ return (error);
+}
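+
+/*
+ * Usage sketch (application code): the deadline passed to
+ * pthread_cond_timedwait() is an absolute wall-clock time.
+ *
+ *	struct timespec abstime;
+ *
+ *	(void) clock_gettime(CLOCK_REALTIME, &abstime);
+ *	abstime.tv_sec += 5;			/* five seconds from now */
+ *	(void) pthread_mutex_lock(&m);
+ *	while (!done) {
+ *		if (pthread_cond_timedwait(&cv, &m, &abstime) == ETIMEDOUT)
+ *			break;			/* deadline expired */
+ *	}
+ *	(void) pthread_mutex_unlock(&m);
+ */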
+
+/*
+ * cond_reltimedwait() is a cancellation point but _cond_reltimedwait()
+ * is not. System libraries call the non-cancellation version.
+ * It is expected that only applications call the cancellation version.
+ */
+int
+_cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
+{
+ timespec_t tslocal = *reltime;
+
+ return (cond_wait_common(cvp, mp, &tslocal));
+}
+
+#pragma weak cond_reltimedwait = _cond_reltimedwait_cancel
+int
+_cond_reltimedwait_cancel(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
+{
+ int error;
+
+ _cancelon();
+ error = _cond_reltimedwait(cvp, mp, reltime);
+ if (error == EINTR)
+ _canceloff();
+ else
+ _canceloff_nocancel();
+ return (error);
+}
+
+#pragma weak pthread_cond_reltimedwait_np = _pthread_cond_reltimedwait_np
+int
+_pthread_cond_reltimedwait_np(cond_t *cvp, mutex_t *mp,
+ const timespec_t *reltime)
+{
+ int error;
+
+ error = _cond_reltimedwait_cancel(cvp, mp, reltime);
+ if (error == ETIME)
+ error = ETIMEDOUT;
+ else if (error == EINTR)
+ error = 0;
+ return (error);
+}
+
+#pragma weak pthread_cond_signal = cond_signal_internal
+#pragma weak _pthread_cond_signal = cond_signal_internal
+#pragma weak cond_signal = cond_signal_internal
+#pragma weak _cond_signal = cond_signal_internal
+int
+cond_signal_internal(cond_t *cvp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
+ int error = 0;
+ queue_head_t *qp;
+ mutex_t *mp;
+ queue_head_t *mqp;
+ ulwp_t **ulwpp;
+ ulwp_t *ulwp;
+ ulwp_t *prev = NULL;
+ ulwp_t *next;
+ ulwp_t **suspp = NULL;
+ ulwp_t *susprev;
+
+ if (csp)
+ tdb_incr(csp->cond_signal);
+
+ if (cvp->cond_waiters_kernel) /* someone sleeping in the kernel? */
+ error = __lwp_cond_signal(cvp);
+
+ if (!cvp->cond_waiters_user) /* no one sleeping at user-level */
+ return (error);
+
+ /*
+ * Move someone from the condvar sleep queue to the mutex sleep
+ * queue for the mutex that he will acquire on being waked up.
+ * We can do this only if we own the mutex he will acquire.
+ * If we do not own the mutex, or if his ul_cv_wake flag
+ * is set, just dequeue and unpark him.
+ */
+ qp = queue_lock(cvp, CV);
+ for (ulwpp = &qp->qh_head; (ulwp = *ulwpp) != NULL;
+ prev = ulwp, ulwpp = &ulwp->ul_link) {
+ if (ulwp->ul_wchan == cvp) {
+ if (!ulwp->ul_stop)
+ break;
+ /*
+ * Try not to dequeue a suspended thread.
+ * This mimics the old libthread's behavior.
+ */
+ if (suspp == NULL) {
+ suspp = ulwpp;
+ susprev = prev;
+ }
+ }
+ }
+ if (ulwp == NULL && suspp != NULL) {
+ ulwp = *(ulwpp = suspp);
+ prev = susprev;
+ suspp = NULL;
+ }
+ if (ulwp == NULL) { /* no one on the sleep queue */
+ cvp->cond_waiters_user = 0;
+ queue_unlock(qp);
+ return (error);
+ }
+ /*
+ * Scan the remainder of the CV queue for another waiter.
+ */
+ if (suspp != NULL) {
+ next = *suspp;
+ } else {
+ for (next = ulwp->ul_link; next != NULL; next = next->ul_link)
+ if (next->ul_wchan == cvp)
+ break;
+ }
+ if (next == NULL)
+ cvp->cond_waiters_user = 0;
+
+ /*
+ * Inform the thread that he was the recipient of a cond_signal().
+ * This lets him deal with cond_signal() and, concurrently,
+ * one or more of a cancellation, a UNIX signal, or a timeout.
+ * These latter conditions must not consume a cond_signal().
+ */
+ ulwp->ul_signalled = 1;
+
+ /*
+ * Dequeue the waiter but leave his ul_sleepq non-NULL
+ * while we move him to the mutex queue so that he can
+ * deal properly with spurious wakeups.
+ */
+ *ulwpp = ulwp->ul_link;
+ if (qp->qh_tail == ulwp)
+ qp->qh_tail = prev;
+ qp->qh_qlen--;
+ ulwp->ul_link = NULL;
+
+ mp = ulwp->ul_cvmutex; /* the mutex he will acquire */
+ ulwp->ul_cvmutex = NULL;
+ ASSERT(mp != NULL);
+
+ if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
+ lwpid_t lwpid = ulwp->ul_lwpid;
+
+ no_preempt(self);
+ ulwp->ul_sleepq = NULL;
+ ulwp->ul_wchan = NULL;
+ ulwp->ul_cv_wake = 0;
+ queue_unlock(qp);
+ (void) __lwp_unpark(lwpid);
+ preempt(self);
+ } else {
+ mqp = queue_lock(mp, MX);
+ enqueue(mqp, ulwp, mp, MX);
+ mp->mutex_waiters = 1;
+ queue_unlock(mqp);
+ queue_unlock(qp);
+ }
+
+ return (error);
+}
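+
+/*
+ * A consequence of the queue transfer above, sketched as application
+ * code: signalling while holding the associated mutex lets libc move
+ * the waiter directly from the condvar queue to the mutex sleep queue,
+ * so the waiter is not unparked merely to block again on the mutex.
+ *
+ *	(void) pthread_mutex_lock(&m);
+ *	work_ready = 1;
+ *	(void) pthread_cond_signal(&cv);	/* waiter moves to mutex queue */
+ *	(void) pthread_mutex_unlock(&m);	/* waiter is unparked here */
+ */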
+
+#define MAXLWPS 128 /* max remembered lwpids before overflow */
+#define NEWLWPS 2048 /* max remembered lwpids at first overflow */
+
+#pragma weak pthread_cond_broadcast = cond_broadcast_internal
+#pragma weak _pthread_cond_broadcast = cond_broadcast_internal
+#pragma weak cond_broadcast = cond_broadcast_internal
+#pragma weak _cond_broadcast = cond_broadcast_internal
+int
+cond_broadcast_internal(cond_t *cvp)
+{
+ ulwp_t *self = curthread;
+ uberdata_t *udp = self->ul_uberdata;
+ tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
+ int error = 0;
+ queue_head_t *qp;
+ mutex_t *mp;
+ queue_head_t *mqp;
+ mutex_t *mp_cache = NULL;
+ queue_head_t *mqp_cache = NULL;
+ ulwp_t **ulwpp;
+ ulwp_t *ulwp;
+ ulwp_t *prev = NULL;
+ lwpid_t buffer[MAXLWPS];
+ lwpid_t *lwpid = buffer;
+ int nlwpid = 0;
+ int maxlwps = MAXLWPS;
+
+ if (csp)
+ tdb_incr(csp->cond_broadcast);
+
+ if (cvp->cond_waiters_kernel) /* someone sleeping in the kernel? */
+ error = __lwp_cond_broadcast(cvp);
+
+ if (!cvp->cond_waiters_user) /* no one sleeping at user-level */
+ return (error);
+
+ /*
+ * Move everyone from the condvar sleep queue to the mutex sleep
+ * queue for the mutex that they will acquire on being waked up.
+ * We can do this only if we own the mutex they will acquire.
+ * If we do not own the mutex, or if their ul_cv_wake flag
+ * is set, just dequeue and unpark them.
+ *
+ * We keep track of lwpids that are to be unparked in lwpid[].
+ * __lwp_unpark_all() is called to unpark all of them after
+ * they have been removed from the sleep queue and the sleep
+ * queue lock has been dropped. If we run out of space in our
+ * on-stack buffer, we need to allocate more but we can't call
+ * lmalloc() because we are holding a queue lock when the overflow
+ * occurs and lmalloc() acquires a lock. We can't use alloca()
+ * either because the application may have allocated a small stack
+ * and we don't want to overrun the stack. So we use the mmap()
+ * system call directly since that path acquires no locks.
+ */
+ qp = queue_lock(cvp, CV);
+ cvp->cond_waiters_user = 0;
+ ulwpp = &qp->qh_head;
+ while ((ulwp = *ulwpp) != NULL) {
+
+ if (ulwp->ul_wchan != cvp) {
+ prev = ulwp;
+ ulwpp = &ulwp->ul_link;
+ continue;
+ }
+
+ *ulwpp = ulwp->ul_link;
+ if (qp->qh_tail == ulwp)
+ qp->qh_tail = prev;
+ qp->qh_qlen--;
+ ulwp->ul_link = NULL;
+
+ mp = ulwp->ul_cvmutex; /* his mutex */
+ ulwp->ul_cvmutex = NULL;
+ ASSERT(mp != NULL);
+
+ if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
+ ulwp->ul_sleepq = NULL;
+ ulwp->ul_wchan = NULL;
+ ulwp->ul_cv_wake = 0;
+ if (nlwpid == maxlwps) {
+ /*
+ * Allocate NEWLWPS ids on the first overflow.
+ * Double the allocation each time after that.
+ */
+ int newlwps = (lwpid == buffer)? NEWLWPS :
+ 2 * maxlwps;
+ void *vaddr = _private_mmap(NULL,
+ newlwps * sizeof (lwpid_t),
+ PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANON, -1, (off_t)0);
+ if (vaddr == MAP_FAILED) {
+ /*
+ * Let's hope this never happens.
+ * If it does, then we have a terrible
+ * thundering herd on our hands.
+ */
+ (void) __lwp_unpark_all(lwpid, nlwpid);
+ nlwpid = 0;
+ } else {
+ (void) _memcpy(vaddr, lwpid,
+ maxlwps * sizeof (lwpid_t));
+ if (lwpid != buffer)
+ (void) _private_munmap(lwpid,
+ maxlwps * sizeof (lwpid_t));
+ lwpid = vaddr;
+ maxlwps = newlwps;
+ }
+ }
+ lwpid[nlwpid++] = ulwp->ul_lwpid;
+ } else {
+ if (mp != mp_cache) {
+ if (mqp_cache != NULL)
+ queue_unlock(mqp_cache);
+ mqp_cache = queue_lock(mp, MX);
+ mp_cache = mp;
+ }
+ mqp = mqp_cache;
+ enqueue(mqp, ulwp, mp, MX);
+ mp->mutex_waiters = 1;
+ }
+ }
+ if (mqp_cache != NULL)
+ queue_unlock(mqp_cache);
+ queue_unlock(qp);
+ if (nlwpid) {
+ if (nlwpid == 1)
+ (void) __lwp_unpark(lwpid[0]);
+ else
+ (void) __lwp_unpark_all(lwpid, nlwpid);
+ }
+ if (lwpid != buffer)
+ (void) _private_munmap(lwpid, maxlwps * sizeof (lwpid_t));
+
+ return (error);
+}
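+
+/*
+ * Usage sketch (application code): after a broadcast the waiters
+ * reacquire the mutex one at a time, so each must recheck the shared
+ * predicate on wakeup.
+ *
+ * Broadcaster:
+ *	(void) pthread_mutex_lock(&m);
+ *	shutting_down = 1;
+ *	(void) pthread_cond_broadcast(&cv);
+ *	(void) pthread_mutex_unlock(&m);
+ *
+ * Each waiter:
+ *	(void) pthread_mutex_lock(&m);
+ *	while (!shutting_down)
+ *		(void) pthread_cond_wait(&cv, &m);
+ *	(void) pthread_mutex_unlock(&m);
+ */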
+
+#pragma weak pthread_cond_destroy = _cond_destroy
+#pragma weak _pthread_cond_destroy = _cond_destroy
+#pragma weak cond_destroy = _cond_destroy
+int
+_cond_destroy(cond_t *cvp)
+{
+ cvp->cond_magic = 0;
+ tdb_sync_obj_deregister(cvp);
+ return (0);
+}
+
+#if defined(THREAD_DEBUG)
+void
+assert_no_libc_locks_held(void)
+{
+ ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
+}
+#endif
+
+/* protected by link_lock */
+uint64_t spin_lock_spin;
+uint64_t spin_lock_spin2;
+uint64_t spin_lock_sleep;
+uint64_t spin_lock_wakeup;
+
+/*
+ * Record spin lock statistics.
+ * Called by a thread exiting itself in thrp_exit().
+ * Also called via atexit() from the thread calling
+ * exit() to do all the other threads as well.
+ */
+void
+record_spin_locks(ulwp_t *ulwp)
+{
+ spin_lock_spin += ulwp->ul_spin_lock_spin;
+ spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
+ spin_lock_sleep += ulwp->ul_spin_lock_sleep;
+ spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
+ ulwp->ul_spin_lock_spin = 0;
+ ulwp->ul_spin_lock_spin2 = 0;
+ ulwp->ul_spin_lock_sleep = 0;
+ ulwp->ul_spin_lock_wakeup = 0;
+}
+
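+/*
+ * The dump below is gated by the thread_queue_dump tunable.  A sketch
+ * of enabling it from the environment (assuming a _THREAD_QUEUE_DUMP
+ * variable, by analogy with the other _THREAD_* tunables described
+ * earlier in this file):
+ *
+ *	$ _THREAD_QUEUE_DUMP=1 ./app	(statistics printed at exit)
+ */
+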
+/*
+ * atexit function: dump the queue statistics to stderr.
+ */
+#include <stdio.h>
+void
+dump_queue_statistics(void)
+{
+ uberdata_t *udp = curthread->ul_uberdata;
+ queue_head_t *qp;
+ int qn;
+ uint64_t spin_lock_total = 0;
+
+ if (udp->queue_head == NULL || thread_queue_dump == 0)
+ return;
+
+ if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
+ fprintf(stderr, "queue# lockcount max qlen\n") < 0)
+ return;
+ for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
+ if (qp->qh_lockcount == 0)
+ continue;
+ spin_lock_total += qp->qh_lockcount;
+ if (fprintf(stderr, "%5d %12llu%12u\n", qn,
+ (u_longlong_t)qp->qh_lockcount, qp->qh_qmax) < 0)
+ return;
+ }
+
+ if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
+ fprintf(stderr, "queue# lockcount max qlen\n") < 0)
+ return;
+ for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
+ if (qp->qh_lockcount == 0)
+ continue;
+ spin_lock_total += qp->qh_lockcount;
+ if (fprintf(stderr, "%5d %12llu%12u\n", qn,
+ (u_longlong_t)qp->qh_lockcount, qp->qh_qmax) < 0)
+ return;
+ }
+
+ (void) fprintf(stderr, "\n spin_lock_total = %10llu\n",
+ (u_longlong_t)spin_lock_total);
+ (void) fprintf(stderr, " spin_lock_spin = %10llu\n",
+ (u_longlong_t)spin_lock_spin);
+ (void) fprintf(stderr, " spin_lock_spin2 = %10llu\n",
+ (u_longlong_t)spin_lock_spin2);
+ (void) fprintf(stderr, " spin_lock_sleep = %10llu\n",
+ (u_longlong_t)spin_lock_sleep);
+ (void) fprintf(stderr, " spin_lock_wakeup = %10llu\n",
+ (u_longlong_t)spin_lock_wakeup);
+}