Diffstat (limited to 'usr/src/uts/common/disp/thread.c')
-rw-r--r-- | usr/src/uts/common/disp/thread.c | 372
1 files changed, 283 insertions, 89 deletions
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index 854b33798d..d576738e75 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -75,6 +75,11 @@
 #include <sys/cpucaps.h>
 #include <sys/kiconv.h>
 #include <sys/ctype.h>
+#include <sys/ht.h>
+
+#ifndef STACK_GROWTH_DOWN
+#error Stacks do not grow downward; 3b2 zombie attack detected!
+#endif
 
 struct kmem_cache *thread_cache;	/* cache of free threads */
 struct kmem_cache *lwp_cache;		/* cache of free lwps */
@@ -373,7 +378,7 @@ thread_create(
 		if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
 			cmn_err(CE_PANIC, "thread_create: proposed stack size"
 			    " too small to hold thread.");
-#ifdef STACK_GROWTH_DOWN
+
 		stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
 		stksize &= -PTR24_ALIGN;	/* make thread aligned */
 		t = (kthread_t *)(stk + stksize);
@@ -382,13 +387,6 @@ thread_create(
 			audit_thread_create(t);
 		t->t_stk = stk + stksize;
 		t->t_stkbase = stk;
-#else	/* stack grows to larger addresses */
-		stksize -= SA(sizeof (kthread_t));
-		t = (kthread_t *)(stk);
-		bzero(t, sizeof (kthread_t));
-		t->t_stk = stk + sizeof (kthread_t);
-		t->t_stkbase = stk + stksize + sizeof (kthread_t);
-#endif	/* STACK_GROWTH_DOWN */
 		t->t_flag |= T_TALLOCSTK;
 		t->t_swap = stk;
 	} else {
@@ -401,13 +399,8 @@ thread_create(
 		 * Initialize t_stk to the kernel stack pointer to use
 		 * upon entry to the kernel
 		 */
-#ifdef STACK_GROWTH_DOWN
 		t->t_stk = stk + stksize;
 		t->t_stkbase = stk;
-#else
-		t->t_stk = stk;	/* 3b2-like */
-		t->t_stkbase = stk + stksize;
-#endif	/* STACK_GROWTH_DOWN */
 	}
 
 	if (kmem_stackinfo != 0) {
@@ -487,15 +480,9 @@ thread_create(
 	curthread->t_prev = t;
 
 	/*
-	 * Threads should never have a NULL t_cpu pointer so assign it
-	 * here.  If the thread is being created with state TS_RUN a
-	 * better CPU may be chosen when it is placed on the run queue.
-	 *
-	 * We need to keep kernel preemption disabled when setting all
-	 * three fields to keep them in sync.  Also, always create in
-	 * the default partition since that's where kernel threads go
-	 * (if this isn't a kernel thread, t_cpupart will be changed
-	 * in lwp_create before setting the thread runnable).
+	 * We'll always create in the default partition since that's where
+	 * kernel threads go (we'll change this later if needed, in
+	 * lwp_create()).
 	 */
 	t->t_cpupart = &cp_default;
 
@@ -504,20 +491,23 @@ thread_create(
 	 * Since the kernel does not (presently) allocate its memory
 	 * in a locality aware fashion, the root is an appropriate home.
 	 * If this thread is later associated with an lwp, it will have
-	 * it's lgroup re-assigned at that time.
+	 * its lgroup re-assigned at that time.
 	 */
 	lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1);
 
 	/*
-	 * Inherit the current cpu.  If this cpu isn't part of the chosen
-	 * lgroup, a new cpu will be chosen by cpu_choose when the thread
-	 * is ready to run.
+	 * If the current CPU is in the default cpupart, use it.  Otherwise,
+	 * pick one that is; before entering the dispatcher code, we'll
+	 * make sure to keep the invariant that ->t_cpu is set.  (In fact, we
+	 * rely on this, in ht_should_run(), in the call tree of
+	 * disp_lowpri_cpu().)
 	 */
-	if (CPU->cpu_part == &cp_default)
+	if (CPU->cpu_part == &cp_default) {
 		t->t_cpu = CPU;
-	else
-		t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
-		    t->t_pri, NULL);
+	} else {
+		t->t_cpu = cp_default.cp_cpulist;
+		t->t_cpu = disp_lowpri_cpu(t->t_cpu, t, t->t_pri);
+	}
 
 	t->t_disp_queue = t->t_cpu->cpu_disp;
 	kpreempt_enable();
@@ -590,6 +580,9 @@ thread_exit(void)
 	if ((t->t_proc_flag & TP_ZTHREAD) != 0)
 		cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
 
+	if ((t->t_flag & T_SPLITSTK) != 0)
+		cmn_err(CE_PANIC, "thread_exit: called when stack is split");
+
 	tsd_exit();		/* Clean up this thread's TSD */
 
 	kcpc_passivate();	/* clean up performance counter state */
@@ -870,12 +863,12 @@ thread_zone_destroy(zoneid_t zoneid, void *unused)
 
 	/*
 	 * Guard against race condition in mutex_owner_running:
-	 * 	thread=owner(mutex)
-	 * 	<interrupt>
-	 * 			thread exits mutex
-	 * 			thread exits
-	 * 			thread reaped
-	 * 			thread struct freed
+	 *	thread=owner(mutex)
+	 *	<interrupt>
+	 *			thread exits mutex
+	 *			thread exits
+	 *			thread reaped
+	 *			thread struct freed
 	 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
 	 * A cross call to all cpus will cause the interrupt handler
 	 * to reset the PC if it is in mutex_owner_running, refreshing
@@ -932,12 +925,12 @@ thread_reaper()
 
 	/*
 	 * Guard against race condition in mutex_owner_running:
-	 * 	thread=owner(mutex)
-	 * 	<interrupt>
-	 * 			thread exits mutex
-	 * 			thread exits
-	 * 			thread reaped
-	 * 			thread struct freed
+	 *	thread=owner(mutex)
+	 *	<interrupt>
+	 *			thread exits mutex
+	 *			thread exits
+	 *			thread reaped
+	 *			thread struct freed
 	 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
 	 * A cross call to all cpus will cause the interrupt handler
 	 * to reset the PC if it is in mutex_owner_running, refreshing
@@ -1055,8 +1048,44 @@ installctx(
 	ctx->exit_op = exit;
 	ctx->free_op = free;
 	ctx->arg = arg;
-	ctx->next = t->t_ctx;
+	ctx->save_ts = 0;
+	ctx->restore_ts = 0;
+
+	/*
+	 * Keep ctxops in a doubly-linked list to allow traversal in both
+	 * directions.  Using only the newest-to-oldest ordering was adequate
+	 * previously, but reversing the order for restore_op actions is
+	 * necessary if later-added ctxops depends on earlier ones.
+	 *
+	 * One example of such a dependency:  Hypervisor software handling the
+	 * guest FPU expects that it save FPU state prior to host FPU handling
+	 * and consequently handle the guest logic _after_ the host FPU has
+	 * been restored.
+	 *
+	 * The t_ctx member points to the most recently added ctxop or is NULL
+	 * if no ctxops are associated with the thread.  The 'next' pointers
+	 * form a loop of the ctxops in newest-to-oldest order.  The 'prev'
+	 * pointers form a loop in the reverse direction, where t_ctx->prev is
+	 * the oldest entry associated with the thread.
+	 *
+	 * The protection of kpreempt_disable is required to safely perform the
+	 * list insertion, since there are inconsistent states between some of
+	 * the pointer assignments.
+	 */
+	kpreempt_disable();
+	if (t->t_ctx == NULL) {
+		ctx->next = ctx;
+		ctx->prev = ctx;
+	} else {
+		struct ctxop *head = t->t_ctx, *tail = t->t_ctx->prev;
+
+		ctx->next = head;
+		ctx->prev = tail;
+		head->prev = ctx;
+		tail->next = ctx;
+	}
 	t->t_ctx = ctx;
+	kpreempt_enable();
 }
 
 /*
@@ -1073,7 +1102,7 @@ removectx(
 	void (*exit)(void *),
 	void (*free)(void *, int))
 {
-	struct ctxop *ctx, *prev_ctx;
+	struct ctxop *ctx, *head;
 
 	/*
 	 * The incoming kthread_t (which is the thread for which the
@@ -1098,17 +1127,31 @@ removectx(
 	 * and the target thread from racing with each other during lwp exit.
 	 */
 	mutex_enter(&t->t_ctx_lock);
-	prev_ctx = NULL;
 	kpreempt_disable();
-	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
+
+	if (t->t_ctx == NULL) {
+		mutex_exit(&t->t_ctx_lock);
+		kpreempt_enable();
+		return (0);
+	}
+
+	ctx = head = t->t_ctx;
+	do {
 		if (ctx->save_op == save && ctx->restore_op == restore &&
 		    ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
 		    ctx->exit_op == exit && ctx->free_op == free &&
 		    ctx->arg == arg) {
-			if (prev_ctx)
-				prev_ctx->next = ctx->next;
-			else
+			ctx->prev->next = ctx->next;
+			ctx->next->prev = ctx->prev;
+			if (ctx->next == ctx) {
+				/* last remaining item */
+				t->t_ctx = NULL;
+			} else if (ctx == t->t_ctx) {
+				/* fix up head of list */
 				t->t_ctx = ctx->next;
+			}
+			ctx->next = ctx->prev = NULL;
+
 			mutex_exit(&t->t_ctx_lock);
 			if (ctx->free_op != NULL)
 				(ctx->free_op)(ctx->arg, 0);
@@ -1116,44 +1159,70 @@ removectx(
 			kpreempt_enable();
 			return (1);
 		}
-		prev_ctx = ctx;
-	}
+
+		ctx = ctx->next;
+	} while (ctx != head);
+
 	mutex_exit(&t->t_ctx_lock);
 	kpreempt_enable();
-
 	return (0);
 }
 
 void
 savectx(kthread_t *t)
 {
-	struct ctxop *ctx;
-
 	ASSERT(t == curthread);
-	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
-		if (ctx->save_op != NULL)
-			(ctx->save_op)(ctx->arg);
+
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		/* Forward traversal */
+		ctx = head = t->t_ctx;
+		do {
+			if (ctx->save_op != NULL) {
+				ctx->save_ts = gethrtime_unscaled();
+				(ctx->save_op)(ctx->arg);
+			}
+			ctx = ctx->next;
+		} while (ctx != head);
+	}
}
 
 void
 restorectx(kthread_t *t)
 {
-	struct ctxop *ctx;
-
 	ASSERT(t == curthread);
-	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
-		if (ctx->restore_op != NULL)
-			(ctx->restore_op)(ctx->arg);
+
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *tail;
+
+		/* Backward traversal (starting at the tail) */
+		ctx = tail = t->t_ctx->prev;
+		do {
+			if (ctx->restore_op != NULL) {
+				ctx->restore_ts = gethrtime_unscaled();
+				(ctx->restore_op)(ctx->arg);
+			}
+			ctx = ctx->prev;
+		} while (ctx != tail);
+	}
 }
 
 void
 forkctx(kthread_t *t, kthread_t *ct)
 {
-	struct ctxop *ctx;
-
-	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
-		if (ctx->fork_op != NULL)
-			(ctx->fork_op)(t, ct);
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		/* Forward traversal */
+		ctx = head = t->t_ctx;
+		do {
+			if (ctx->fork_op != NULL) {
+				(ctx->fork_op)(t, ct);
+			}
+			ctx = ctx->next;
+		} while (ctx != head);
+	}
 }
 
 /*
@@ -1164,11 +1233,18 @@ forkctx(kthread_t *t, kthread_t *ct)
 void
 lwp_createctx(kthread_t *t, kthread_t *ct)
 {
-	struct ctxop *ctx;
-
-	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
-		if (ctx->lwp_create_op != NULL)
-			(ctx->lwp_create_op)(t, ct);
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		/* Forward traversal */
+		ctx = head = t->t_ctx;
+		do {
+			if (ctx->lwp_create_op != NULL) {
+				(ctx->lwp_create_op)(t, ct);
+			}
+			ctx = ctx->next;
+		} while (ctx != head);
+	}
 }
 
 /*
@@ -1181,11 +1257,18 @@ lwp_createctx(kthread_t *t, kthread_t *ct)
 void
 exitctx(kthread_t *t)
 {
-	struct ctxop *ctx;
-
-	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
-		if (ctx->exit_op != NULL)
-			(ctx->exit_op)(t);
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		/* Forward traversal */
+		ctx = head = t->t_ctx;
+		do {
+			if (ctx->exit_op != NULL) {
+				(ctx->exit_op)(t);
+			}
+			ctx = ctx->next;
+		} while (ctx != head);
+	}
 }
 
 /*
@@ -1195,14 +1278,21 @@ exitctx(kthread_t *t)
 void
 freectx(kthread_t *t, int isexec)
 {
-	struct ctxop *ctx;
-
 	kpreempt_disable();
-	while ((ctx = t->t_ctx) != NULL) {
-		t->t_ctx = ctx->next;
-		if (ctx->free_op != NULL)
-			(ctx->free_op)(ctx->arg, isexec);
-		kmem_free(ctx, sizeof (struct ctxop));
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		ctx = head = t->t_ctx;
+		t->t_ctx = NULL;
+		do {
+			struct ctxop *next = ctx->next;
+
+			if (ctx->free_op != NULL) {
+				(ctx->free_op)(ctx->arg, isexec);
+			}
+			kmem_free(ctx, sizeof (struct ctxop));
+			ctx = next;
+		} while (ctx != head);
 	}
 	kpreempt_enable();
 }
@@ -1217,17 +1307,22 @@ freectx(kthread_t *t, int isexec)
 void
 freectx_ctx(struct ctxop *ctx)
 {
-	struct ctxop *nctx;
+	struct ctxop *head = ctx;
 
 	ASSERT(ctx != NULL);
 
 	kpreempt_disable();
+
+	head = ctx;
 	do {
-		nctx = ctx->next;
-		if (ctx->free_op != NULL)
+		struct ctxop *next = ctx->next;
+
+		if (ctx->free_op != NULL) {
 			(ctx->free_op)(ctx->arg, 0);
+		}
 		kmem_free(ctx, sizeof (struct ctxop));
-	} while ((ctx = nctx) != NULL);
+		ctx = next;
+	} while (ctx != head);
 	kpreempt_enable();
 }
 
@@ -1326,6 +1421,8 @@ thread_unpin()
 	itp = t->t_intr;		/* interrupted thread */
 	t->t_intr = NULL;		/* clear interrupt ptr */
 
+	ht_end_intr();
+
 	/*
 	 * Get state from interrupt thread for the one
 	 * it interrupted.
@@ -1422,7 +1519,7 @@ thread_create_intr(struct cpu *cp)
 static kmutex_t		tsd_mutex;	 /* linked list spin lock */
 static uint_t		tsd_nkeys;	 /* size of destructor array */
 /* per-key destructor funcs */
-static void 		(**tsd_destructor)(void *);
+static void		(**tsd_destructor)(void *);
 /* list of tsd_thread's */
 static struct tsd_thread	*tsd_list;
 
@@ -1889,6 +1986,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
 
 	return (on_rq);
 }
+
+/*
+ * There are occasions in the kernel when we need much more stack than we
+ * allocate by default, but we do not wish to have that work done
+ * asynchronously by another thread.  To accommodate these scenarios, we allow
+ * for a split stack (also known as a "segmented stack") whereby a new stack
+ * is dynamically allocated and the current thread jumps onto it for purposes
+ * of executing the specified function.  After the specified function returns,
+ * the stack is deallocated and control is returned to the caller.  This
+ * functionality is implemented by thread_splitstack(), below; there are a few
+ * constraints on its use:
+ *
+ * - The caller must be in a context where it is safe to block for memory.
+ * - The caller cannot be in a t_onfault context
+ * - The called function must not call thread_exit() while on the split stack
+ *
+ * The code will explicitly panic if these constraints are violated.  Notably,
+ * however, thread_splitstack() _can_ be called on a split stack -- there
+ * is no limit to the level that split stacks can nest.
+ *
+ * When the stack is split, it is constructed such that stack backtraces
+ * from kernel debuggers continue to function -- though note that DTrace's
+ * stack() action and stackdepth function will only show the stack up to and
+ * including thread_splitstack_run(); DTrace explicitly bounds itself to
+ * pointers that exist within the current declared stack as a safety
+ * mechanism.
+ */
+void
+thread_splitstack(void (*func)(void *), void *arg, size_t stksize)
+{
+	kthread_t *t = curthread;
+	caddr_t ostk, ostkbase, stk;
+	ushort_t otflag;
+
+	if (t->t_onfault != NULL)
+		panic("thread_splitstack: called with non-NULL t_onfault");
+
+	ostk = t->t_stk;
+	ostkbase = t->t_stkbase;
+	otflag = t->t_flag;
+
+	stksize = roundup(stksize, PAGESIZE);
+
+	if (stksize < default_stksize)
+		stksize = default_stksize;
+
+	if (stksize == default_stksize) {
+		stk = (caddr_t)segkp_cache_get(segkp_thread);
+	} else {
+		stksize = roundup(stksize, PAGESIZE);
+		stk = (caddr_t)segkp_get(segkp, stksize,
+		    (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
+	}
+
+	/*
+	 * We're going to lock ourselves before we set T_SPLITSTK to assure
+	 * that we're not swapped out in the meantime.  (Note that we don't
+	 * bother to set t_swap, as we're not going to be swapped out.)
+	 */
+	thread_lock(t);
+
+	if (!(otflag & T_SPLITSTK))
+		t->t_flag |= T_SPLITSTK;
+
+	t->t_stk = stk + stksize;
+	t->t_stkbase = stk;
+
+	thread_unlock(t);
+
+	/*
+	 * Now actually run on the new (split) stack...
+	 */
+	thread_splitstack_run(t->t_stk, func, arg);
+
+	/*
+	 * We're back onto our own stack; lock ourselves and restore our
+	 * pre-split state.
+	 */
+	thread_lock(t);
+
+	t->t_stk = ostk;
+	t->t_stkbase = ostkbase;
+
+	if (!(otflag & T_SPLITSTK))
+		t->t_flag &= ~T_SPLITSTK;
+
+	thread_unlock(t);
+
+	/*
+	 * Now that we are entirely back on our own stack, call back into
+	 * the platform layer to perform any platform-specific cleanup.
+	 */
+	thread_splitstack_cleanup();
+
+	segkp_release(segkp, stk);
+}
+
 /*
  * Tunable kmem_stackinfo is set, fill the kernel thread stack with a
  * specific pattern.
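A note on the new ctxop linkage: installctx() now links operations into a circular, doubly-linked list headed by t_ctx (newest entry first), savectx() walks it forward (newest to oldest), and restorectx() walks it backward starting at t_ctx->prev (oldest to newest), so a later-installed ctxop is restored after the ones it depends on. The standalone userland sketch below models only that linkage and ordering; the names (model_ctxop, model_install, the "host-fpu"/"guest-fpu" labels) are invented for illustration, and the real list of course hangs off kthread_t under kpreempt_disable()/t_ctx_lock rather than off a global.

/* Userland model of the circular ctxop list -- illustration only. */
#include <stdio.h>
#include <stdlib.h>

typedef struct model_ctxop {
	const char *name;
	struct model_ctxop *next;	/* loop in newest-to-oldest order */
	struct model_ctxop *prev;	/* loop in the reverse direction */
} model_ctxop_t;

static model_ctxop_t *t_ctx = NULL;	/* stands in for curthread->t_ctx */

/* Link a new entry at the head, the way installctx() does. */
static void
model_install(const char *name)
{
	model_ctxop_t *ctx = calloc(1, sizeof (*ctx));

	ctx->name = name;
	if (t_ctx == NULL) {
		ctx->next = ctx;
		ctx->prev = ctx;
	} else {
		model_ctxop_t *head = t_ctx, *tail = t_ctx->prev;

		ctx->next = head;
		ctx->prev = tail;
		head->prev = ctx;
		tail->next = ctx;
	}
	t_ctx = ctx;
}

/* Forward walk (newest first): the savectx() order for save_op. */
static void
model_save(void)
{
	model_ctxop_t *ctx = t_ctx;

	if (ctx == NULL)
		return;
	do {
		printf("save    %s\n", ctx->name);
		ctx = ctx->next;
	} while (ctx != t_ctx);
}

/* Walk from the tail (oldest first): the restorectx() order for restore_op. */
static void
model_restore(void)
{
	model_ctxop_t *tail, *ctx;

	if (t_ctx == NULL)
		return;
	ctx = tail = t_ctx->prev;
	do {
		printf("restore %s\n", ctx->name);
		ctx = ctx->prev;
	} while (ctx != tail);
}

int
main(void)
{
	model_install("host-fpu");	/* installed first (oldest) */
	model_install("guest-fpu");	/* installed later (newest) */

	model_save();			/* guest-fpu, then host-fpu */
	model_restore();		/* host-fpu, then guest-fpu */
	return (0);
}

Running it prints the saves newest-first and the restores oldest-first, which is exactly the property the installctx() comment calls out for the guest/host FPU case.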
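The split-stack machinery itself lives in platform code (thread_splitstack_run() and thread_splitstack_cleanup(), plus the T_SPLITSTK and t_stk/t_stkbase bookkeeping above), but the basic idea of jumping onto a temporarily allocated stack, running a function, and then returning and freeing it can be sketched in userland with ucontext(3C). The sketch below models only that control flow; the 64K size and function names are made up, and none of the kernel-side concerns (debugger-visible backtraces, t_onfault, thread_exit(), nesting) are represented.

/* Userland sketch of the split-stack control flow -- illustration only. */
#include <ucontext.h>
#include <stdio.h>
#include <stdlib.h>

static ucontext_t main_ctx, split_ctx;

static void
on_split_stack(void)
{
	char probe;

	/* Local variables now live on the temporary stack. */
	printf("running near %p on the temporary stack\n", (void *)&probe);
	/* Returning resumes main_ctx via uc_link. */
}

static void
splitstack_run(size_t stksize)
{
	void *stk = malloc(stksize);

	if (stk == NULL)
		abort();
	getcontext(&split_ctx);
	split_ctx.uc_stack.ss_sp = stk;
	split_ctx.uc_stack.ss_size = stksize;
	split_ctx.uc_link = &main_ctx;		/* where to resume on return */
	makecontext(&split_ctx, on_split_stack, 0);

	swapcontext(&main_ctx, &split_ctx);	/* jump onto the new stack */
	free(stk);				/* back on the original stack */
}

int
main(void)
{
	char probe;

	printf("running near %p on the original stack\n", (void *)&probe);
	splitstack_run(64 * 1024);
	printf("back near %p on the original stack\n", (void *)&probe);
	return (0);
}

In the kernel version the interesting part is precisely what this sketch omits: updating t_stk and t_stkbase under thread_lock() so that stack walkers and the rest of the system keep a consistent view while the thread is off its declared stack.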