Diffstat (limited to 'usr/src/uts/common/disp/thread.c')
-rw-r--r-- | usr/src/uts/common/disp/thread.c | 372
1 file changed, 310 insertions, 62 deletions
diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c
index f2685af534..c923ba5d1a 100644
--- a/usr/src/uts/common/disp/thread.c
+++ b/usr/src/uts/common/disp/thread.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018 Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -74,6 +74,11 @@
 #include <sys/waitq.h>
 #include <sys/cpucaps.h>
 #include <sys/kiconv.h>
+#include <sys/ht.h>
+
+#ifndef STACK_GROWTH_DOWN
+#error Stacks do not grow downward; 3b2 zombie attack detected!
+#endif
 
 struct kmem_cache *thread_cache;	/* cache of free threads */
 struct kmem_cache *lwp_cache;		/* cache of free lwps */
@@ -372,7 +377,7 @@ thread_create(
 		if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
 			cmn_err(CE_PANIC, "thread_create: proposed stack size"
 			    " too small to hold thread.");
-#ifdef STACK_GROWTH_DOWN
+
 		stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
 		stksize &= -PTR24_ALIGN;	/* make thread aligned */
 		t = (kthread_t *)(stk + stksize);
@@ -381,13 +386,6 @@ thread_create(
 			audit_thread_create(t);
 		t->t_stk = stk + stksize;
 		t->t_stkbase = stk;
-#else	/* stack grows to larger addresses */
-		stksize -= SA(sizeof (kthread_t));
-		t = (kthread_t *)(stk);
-		bzero(t, sizeof (kthread_t));
-		t->t_stk = stk + sizeof (kthread_t);
-		t->t_stkbase = stk + stksize + sizeof (kthread_t);
-#endif	/* STACK_GROWTH_DOWN */
 		t->t_flag |= T_TALLOCSTK;
 		t->t_swap = stk;
 	} else {
@@ -400,13 +398,8 @@ thread_create(
 		 * Initialize t_stk to the kernel stack pointer to use
 		 * upon entry to the kernel
 		 */
-#ifdef STACK_GROWTH_DOWN
 		t->t_stk = stk + stksize;
 		t->t_stkbase = stk;
-#else
-		t->t_stk = stk;	/* 3b2-like */
-		t->t_stkbase = stk + stksize;
-#endif	/* STACK_GROWTH_DOWN */
 	}
 
 	if (kmem_stackinfo != 0) {
@@ -515,8 +508,8 @@ thread_create(
 	if (CPU->cpu_part == &cp_default)
 		t->t_cpu = CPU;
 	else
-		t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
-		    t->t_pri, NULL);
+		t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t,
+		    t->t_pri);
 	t->t_disp_queue = t->t_cpu->cpu_disp;
 	kpreempt_enable();
 
@@ -589,6 +582,9 @@ thread_exit(void)
 	if ((t->t_proc_flag & TP_ZTHREAD) != 0)
 		cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
 
+	if ((t->t_flag & T_SPLITSTK) != 0)
+		cmn_err(CE_PANIC, "thread_exit: called when stack is split");
+
 	tsd_exit();		/* Clean up this thread's TSD */
 
 	kcpc_passivate();	/* clean up performance counter state */
@@ -791,6 +787,11 @@ thread_free(kthread_t *t)
 	nthread--;
 	mutex_exit(&pidlock);
 
+	if (t->t_name != NULL) {
+		kmem_free(t->t_name, THREAD_NAME_MAX);
+		t->t_name = NULL;
+	}
+
 	/*
 	 * Free thread, lwp and stack. This needs to be done carefully, since
 	 * if T_TALLOCSTK is set, the thread is part of the stack.
@@ -1049,8 +1050,44 @@ installctx(
 	ctx->exit_op = exit;
 	ctx->free_op = free;
 	ctx->arg = arg;
-	ctx->next = t->t_ctx;
+	ctx->save_ts = 0;
+	ctx->restore_ts = 0;
+
+	/*
+	 * Keep ctxops in a doubly-linked list to allow traversal in both
+	 * directions. Using only the newest-to-oldest ordering was adequate
+	 * previously, but reversing the order for restore_op actions is
+	 * necessary if later-added ctxops depends on earlier ones.
+	 *
+	 * One example of such a dependency: Hypervisor software handling the
+	 * guest FPU expects that it save FPU state prior to host FPU handling
+	 * and consequently handle the guest logic _after_ the host FPU has
+	 * been restored.
+	 *
+	 * The t_ctx member points to the most recently added ctxop or is NULL
+	 * if no ctxops are associated with the thread. The 'next' pointers
+	 * form a loop of the ctxops in newest-to-oldest order. The 'prev'
+	 * pointers form a loop in the reverse direction, where t_ctx->prev is
+	 * the oldest entry associated with the thread.
+	 *
+	 * The protection of kpreempt_disable is required to safely perform the
+	 * list insertion, since there are inconsistent states between some of
+	 * the pointer assignments.
+	 */
+	kpreempt_disable();
+	if (t->t_ctx == NULL) {
+		ctx->next = ctx;
+		ctx->prev = ctx;
+	} else {
+		struct ctxop *head = t->t_ctx, *tail = t->t_ctx->prev;
+
+		ctx->next = head;
+		ctx->prev = tail;
+		head->prev = ctx;
+		tail->next = ctx;
+	}
 	t->t_ctx = ctx;
+	kpreempt_enable();
 }
 
 /*
@@ -1067,7 +1104,7 @@ removectx(
 	void (*exit)(void *),
 	void (*free)(void *, int))
 {
-	struct ctxop *ctx, *prev_ctx;
+	struct ctxop *ctx, *head;
 
 	/*
 	 * The incoming kthread_t (which is the thread for which the
 	 * context ops will be installed) should be one of the following:
@@ -1092,17 +1129,31 @@ removectx(
 	 * and the target thread from racing with each other during lwp exit.
 	 */
 	mutex_enter(&t->t_ctx_lock);
-	prev_ctx = NULL;
 	kpreempt_disable();
-	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
+
+	if (t->t_ctx == NULL) {
+		mutex_exit(&t->t_ctx_lock);
+		kpreempt_enable();
+		return (0);
+	}
+
+	ctx = head = t->t_ctx;
+	do {
 		if (ctx->save_op == save && ctx->restore_op == restore &&
 		    ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
 		    ctx->exit_op == exit && ctx->free_op == free &&
 		    ctx->arg == arg) {
-			if (prev_ctx)
-				prev_ctx->next = ctx->next;
-			else
+			ctx->prev->next = ctx->next;
+			ctx->next->prev = ctx->prev;
+			if (ctx->next == ctx) {
+				/* last remaining item */
+				t->t_ctx = NULL;
+			} else if (ctx == t->t_ctx) {
+				/* fix up head of list */
 				t->t_ctx = ctx->next;
+			}
+			ctx->next = ctx->prev = NULL;
+
 			mutex_exit(&t->t_ctx_lock);
 			if (ctx->free_op != NULL)
 				(ctx->free_op)(ctx->arg, 0);
@@ -1110,44 +1161,70 @@ removectx(
 			kpreempt_enable();
 			return (1);
 		}
-		prev_ctx = ctx;
-	}
+
+		ctx = ctx->next;
+	} while (ctx != head);
+
 	mutex_exit(&t->t_ctx_lock);
 	kpreempt_enable();
-
 	return (0);
 }
 
 void
 savectx(kthread_t *t)
 {
-	struct ctxop *ctx;
-
 	ASSERT(t == curthread);
-	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
-		if (ctx->save_op != NULL)
-			(ctx->save_op)(ctx->arg);
+
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		/* Forward traversal */
+		ctx = head = t->t_ctx;
+		do {
+			if (ctx->save_op != NULL) {
+				ctx->save_ts = gethrtime_unscaled();
+				(ctx->save_op)(ctx->arg);
+			}
+			ctx = ctx->next;
+		} while (ctx != head);
+	}
 }
 
 void
 restorectx(kthread_t *t)
 {
-	struct ctxop *ctx;
-
 	ASSERT(t == curthread);
-	for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
-		if (ctx->restore_op != NULL)
-			(ctx->restore_op)(ctx->arg);
+
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *tail;
+
+		/* Backward traversal (starting at the tail) */
+		ctx = tail = t->t_ctx->prev;
+		do {
+			if (ctx->restore_op != NULL) {
+				ctx->restore_ts = gethrtime_unscaled();
+				(ctx->restore_op)(ctx->arg);
+			}
+			ctx = ctx->prev;
+		} while (ctx != tail);
+	}
 }
 
 void
 forkctx(kthread_t *t, kthread_t *ct)
 {
-	struct ctxop *ctx;
-
-	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
-		if (ctx->fork_op != NULL)
-			(ctx->fork_op)(t, ct);
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		/* Forward traversal */
+		ctx = head = t->t_ctx;
+		do {
+			if (ctx->fork_op != NULL) {
+				(ctx->fork_op)(t, ct);
+			}
+			ctx = ctx->next;
+		} while (ctx != head);
+	}
 }
 
 /*
@@ -1158,11 +1235,18 @@ forkctx(kthread_t *t, kthread_t *ct)
 void
 lwp_createctx(kthread_t *t, kthread_t *ct)
 {
-	struct ctxop *ctx;
-
-	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
-		if (ctx->lwp_create_op != NULL)
-			(ctx->lwp_create_op)(t, ct);
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		/* Forward traversal */
+		ctx = head = t->t_ctx;
+		do {
+			if (ctx->lwp_create_op != NULL) {
+				(ctx->lwp_create_op)(t, ct);
+			}
+			ctx = ctx->next;
+		} while (ctx != head);
+	}
 }
 
 /*
@@ -1175,11 +1259,18 @@ lwp_createctx(kthread_t *t, kthread_t *ct)
 void
 exitctx(kthread_t *t)
 {
-	struct ctxop *ctx;
-
-	for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
-		if (ctx->exit_op != NULL)
-			(ctx->exit_op)(t);
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		/* Forward traversal */
+		ctx = head = t->t_ctx;
+		do {
+			if (ctx->exit_op != NULL) {
+				(ctx->exit_op)(t);
+			}
+			ctx = ctx->next;
+		} while (ctx != head);
+	}
 }
 
 /*
@@ -1189,14 +1280,21 @@ exitctx(kthread_t *t)
 void
 freectx(kthread_t *t, int isexec)
 {
-	struct ctxop *ctx;
-
 	kpreempt_disable();
-	while ((ctx = t->t_ctx) != NULL) {
-		t->t_ctx = ctx->next;
-		if (ctx->free_op != NULL)
-			(ctx->free_op)(ctx->arg, isexec);
-		kmem_free(ctx, sizeof (struct ctxop));
+	if (t->t_ctx != NULL) {
+		struct ctxop *ctx, *head;
+
+		ctx = head = t->t_ctx;
+		t->t_ctx = NULL;
+		do {
+			struct ctxop *next = ctx->next;
+
+			if (ctx->free_op != NULL) {
+				(ctx->free_op)(ctx->arg, isexec);
+			}
+			kmem_free(ctx, sizeof (struct ctxop));
+			ctx = next;
+		} while (ctx != head);
 	}
 	kpreempt_enable();
 }
@@ -1211,17 +1309,22 @@ freectx(kthread_t *t, int isexec)
 void
 freectx_ctx(struct ctxop *ctx)
 {
-	struct ctxop *nctx;
+	struct ctxop *head = ctx;
 
 	ASSERT(ctx != NULL);
 
 	kpreempt_disable();
+
+	head = ctx;
 	do {
-		nctx = ctx->next;
-		if (ctx->free_op != NULL)
+		struct ctxop *next = ctx->next;
+
+		if (ctx->free_op != NULL) {
 			(ctx->free_op)(ctx->arg, 0);
+		}
 		kmem_free(ctx, sizeof (struct ctxop));
-	} while ((ctx = nctx) != NULL);
+		ctx = next;
+	} while (ctx != head);
 	kpreempt_enable();
 }
 
@@ -1320,6 +1423,8 @@ thread_unpin()
 	itp = t->t_intr;		/* interrupted thread */
 	t->t_intr = NULL;		/* clear interrupt ptr */
 
+	ht_end_intr();
+
 	/*
 	 * Get state from interrupt thread for the one
 	 * it interrupted.
@@ -1883,6 +1988,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
 
 	return (on_rq);
 }
+
+/*
+ * There are occasions in the kernel when we need much more stack than we
+ * allocate by default, but we do not wish to have that work done
+ * asynchronously by another thread. To accommodate these scenarios, we allow
+ * for a split stack (also known as a "segmented stack") whereby a new stack
+ * is dynamically allocated and the current thread jumps onto it for purposes
+ * of executing the specified function. After the specified function returns,
+ * the stack is deallocated and control is returned to the caller. This
+ * functionality is implemented by thread_splitstack(), below; there are a few
+ * constraints on its use:
+ *
+ * - The caller must be in a context where it is safe to block for memory.
+ * - The caller cannot be in a t_onfault context
+ * - The called function must not call thread_exit() while on the split stack
+ *
+ * The code will explicitly panic if these constraints are violated. Notably,
+ * however, thread_splitstack() _can_ be called on a split stack -- there
+ * is no limit to the level that split stacks can nest.
+ *
+ * When the stack is split, it is constructed such that stack backtraces
+ * from kernel debuggers continue to function -- though note that DTrace's
+ * stack() action and stackdepth function will only show the stack up to and
+ * including thread_splitstack_run(); DTrace explicitly bounds itself to
+ * pointers that exist within the current declared stack as a safety
+ * mechanism.
+ */
+void
+thread_splitstack(void (*func)(void *), void *arg, size_t stksize)
+{
+	kthread_t *t = curthread;
+	caddr_t ostk, ostkbase, stk;
+	ushort_t otflag;
+
+	if (t->t_onfault != NULL)
+		panic("thread_splitstack: called with non-NULL t_onfault");
+
+	ostk = t->t_stk;
+	ostkbase = t->t_stkbase;
+	otflag = t->t_flag;
+
+	stksize = roundup(stksize, PAGESIZE);
+
+	if (stksize < default_stksize)
+		stksize = default_stksize;
+
+	if (stksize == default_stksize) {
+		stk = (caddr_t)segkp_cache_get(segkp_thread);
+	} else {
+		stksize = roundup(stksize, PAGESIZE);
+		stk = (caddr_t)segkp_get(segkp, stksize,
+		    (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
+	}
+
+	/*
+	 * We're going to lock ourselves before we set T_SPLITSTK to assure
+	 * that we're not swapped out in the meantime. (Note that we don't
+	 * bother to set t_swap, as we're not going to be swapped out.)
+	 */
+	thread_lock(t);
+
+	if (!(otflag & T_SPLITSTK))
+		t->t_flag |= T_SPLITSTK;
+
+	t->t_stk = stk + stksize;
+	t->t_stkbase = stk;
+
+	thread_unlock(t);
+
+	/*
+	 * Now actually run on the new (split) stack...
+	 */
+	thread_splitstack_run(t->t_stk, func, arg);
+
+	/*
+	 * We're back onto our own stack; lock ourselves and restore our
+	 * pre-split state.
+	 */
+	thread_lock(t);
+
+	t->t_stk = ostk;
+	t->t_stkbase = ostkbase;
+
+	if (!(otflag & T_SPLITSTK))
+		t->t_flag &= ~T_SPLITSTK;
+
+	thread_unlock(t);
+
+	/*
+	 * Now that we are entirely back on our own stack, call back into
+	 * the platform layer to perform any platform-specific cleanup.
+	 */
+	thread_splitstack_cleanup();
+
+	segkp_release(segkp, stk);
+}
+
 /*
  * Tunable kmem_stackinfo is set, fill the kernel thread stack with a
  * specific pattern.
@@ -2127,3 +2329,49 @@ stkinfo_percent(caddr_t t_stk, caddr_t t_stkbase, caddr_t sp)
 	}
 	return (percent);
 }
+
+/*
+ * NOTE: This will silently truncate a name > THREAD_NAME_MAX - 1 characters
+ * long. It is expected that callers (acting on behalf of userland clients)
+ * will perform any required checks to return the correct error semantics.
+ * It is also expected callers on behalf of userland clients have done
+ * any necessary permission checks.
+ */
+void
+thread_setname(kthread_t *t, const char *name)
+{
+	char *buf = NULL;
+
+	/*
+	 * We optimistically assume that a thread's name will only be set
+	 * once and so allocate memory in preparation of setting t_name.
+	 * If it turns out a name has already been set, we just discard (free)
+	 * the buffer we just allocated and reuse the current buffer
+	 * (as all should be THREAD_NAME_MAX large).
+	 *
+	 * Such an arrangement means over the lifetime of a kthread_t, t_name
+	 * is either NULL or has one value (the address of the buffer holding
+	 * the current thread name). The assumption is that most kthread_t
+	 * instances will not have a name assigned, so dynamically allocating
+	 * the memory should minimize the footprint of this feature, but by
+	 * having the buffer persist for the life of the thread, it simplifies
+	 * usage in highly constrained situations (e.g. dtrace).
+	 */
+	if (name != NULL && name[0] != '\0') {
+		buf = kmem_zalloc(THREAD_NAME_MAX, KM_SLEEP);
+		(void) strlcpy(buf, name, THREAD_NAME_MAX);
+	}
+
+	mutex_enter(&ttoproc(t)->p_lock);
+	if (t->t_name == NULL) {
+		t->t_name = buf;
+	} else {
+		if (buf != NULL) {
+			(void) strlcpy(t->t_name, name, THREAD_NAME_MAX);
+			kmem_free(buf, THREAD_NAME_MAX);
+		} else {
+			bzero(t->t_name, THREAD_NAME_MAX);
+		}
+	}
+	mutex_exit(&ttoproc(t)->p_lock);
+}
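
The installctx()/savectx()/restorectx() changes above replace the old singly-linked ctxop chain with a circular, doubly-linked list: t_ctx points at the newest entry, t_ctx->prev at the oldest, save_op callbacks run newest-to-oldest, and restore_op callbacks run oldest-to-newest. The standalone userland C sketch below models only that linkage and the two traversal orders; it is illustrative, and node_t, list_insert_head(), walk_forward(), walk_backward(), and the demo main() are names invented for this note rather than anything in thread.c.

#include <stdio.h>

/* Minimal stand-in for a ctxop: an id plus circular next/prev links. */
typedef struct node {
	int id;
	struct node *next;	/* newest-to-oldest direction */
	struct node *prev;	/* oldest-to-newest direction */
} node_t;

/* Insert at the head, mirroring the linkage installctx() performs above. */
static void
list_insert_head(node_t **headp, node_t *n)
{
	if (*headp == NULL) {
		n->next = n;
		n->prev = n;
	} else {
		node_t *head = *headp, *tail = (*headp)->prev;

		n->next = head;
		n->prev = tail;
		head->prev = n;
		tail->next = n;
	}
	*headp = n;
}

/* Forward walk (newest first), the order savectx()/forkctx() use. */
static void
walk_forward(node_t *head)
{
	node_t *n = head;

	if (n == NULL)
		return;
	do {
		printf("save %d\n", n->id);
		n = n->next;
	} while (n != head);
}

/* Backward walk starting at the tail (oldest first), as restorectx() does. */
static void
walk_backward(node_t *head)
{
	node_t *n, *tail;

	if (head == NULL)
		return;
	n = tail = head->prev;
	do {
		printf("restore %d\n", n->id);
		n = n->prev;
	} while (n != tail);
}

int
main(void)
{
	node_t *head = NULL;
	node_t nodes[3] = { { 1 }, { 2 }, { 3 } };

	for (int i = 0; i < 3; i++)
		list_insert_head(&head, &nodes[i]);

	walk_forward(head);	/* prints 3, 2, 1 */
	walk_backward(head);	/* prints 1, 2, 3 */
	return (0);
}

Inserting 1, 2, 3 yields a forward (save) walk of 3 2 1 and a backward (restore) walk of 1 2 3, which is the ordering dependency the hypervisor FPU example in the comment relies on.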
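thread_splitstack() above is the entry point for running a function on a temporarily allocated kernel stack. The fragment below is a hypothetical in-kernel caller sketched under the constraints listed in the block comment (a context where blocking for memory is safe, no t_onfault, no thread_exit() while on the split stack); deep_walk(), deep_walk_arg_t, deep_walk_start(), and the 64K size are invented for illustration, and only thread_splitstack() itself comes from thread.c.

/* Hypothetical argument block handed to the split-stack function. */
typedef struct deep_walk_arg {
	void	*dw_root;	/* data structure to walk */
	int	dw_error;	/* result passed back to the caller */
} deep_walk_arg_t;

/* Runs on the freshly allocated stack; free to recurse well past the
 * default kernel stack depth, but must not call thread_exit(). */
static void
deep_walk(void *arg)
{
	deep_walk_arg_t *dw = arg;

	/* ... deep recursive traversal of dw->dw_root would go here ... */
	dw->dw_error = 0;
}

static int
deep_walk_start(void *root)
{
	deep_walk_arg_t dw = { root, 0 };

	/* Must be called where blocking for memory is safe. */
	thread_splitstack(deep_walk, &dw, 64 * 1024);

	/* The split stack has already been released by the time we return. */
	return (dw.dw_error);
}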
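thread_setname() above installs (or replaces the contents of) a persistent THREAD_NAME_MAX-sized buffer under the owning process's p_lock. A hypothetical caller might look like the fragment below; the worker-thread label is invented for illustration, and per the NOTE above an over-long name is silently truncated, while a NULL or empty name zeroes an existing buffer in place (the buffer itself is kept for the life of the thread).

	/* Hypothetical: label the current worker thread for observability. */
	thread_setname(curthread, "task-flush-worker");

	/* Later: clear the label; the THREAD_NAME_MAX buffer is retained. */
	thread_setname(curthread, NULL);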