Diffstat (limited to 'usr/src/uts/common/os')
68 files changed, 3670 insertions, 1438 deletions
diff --git a/usr/src/uts/common/os/autoconf.c b/usr/src/uts/common/os/autoconf.c index 71af31ba2b..44ec3353fc 100644 --- a/usr/src/uts/common/os/autoconf.c +++ b/usr/src/uts/common/os/autoconf.c @@ -53,6 +53,7 @@ #include <sys/fm/util.h> #include <sys/ddifm_impl.h> #include <sys/ddi_ufm_impl.h> +#include <sys/ksensor_impl.h> extern dev_info_t *top_devinfo; extern dev_info_t *scsi_vhci_dip; @@ -96,6 +97,7 @@ setup_ddi(void) ndi_fm_init(); irm_init(); ufm_init(); + ksensor_init(); (void) i_ddi_load_drvconf(DDI_MAJOR_T_NONE); diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c index abaaef1b4a..daf3b638a6 100644 --- a/usr/src/uts/common/os/bio.c +++ b/usr/src/uts/common/os/bio.c @@ -1488,7 +1488,6 @@ bio_getfreeblk(long bsize) */ bio_mem_get(bsize); /* Account for our memory request */ -again: bp = bio_bhdr_alloc(); /* Get a buf hdr */ sema_p(&bp->b_sem); /* Should never fail */ diff --git a/usr/src/uts/common/os/bitmap.c b/usr/src/uts/common/os/bitmap.c index 46fae44adb..06dd326f4a 100644 --- a/usr/src/uts/common/os/bitmap.c +++ b/usr/src/uts/common/os/bitmap.c @@ -19,17 +19,16 @@ * * CDDL HEADER END */ -/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2022 Oxide Computer Company */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Operations on bitmaps of arbitrary size * A bitmap is a vector of 1 or more ulongs. @@ -39,7 +38,7 @@ #include <sys/types.h> #include <sys/bitmap.h> -#include <sys/debug.h> /* ASSERT */ +#include <sys/debug.h> /* * Return index of first available bit in denoted bitmap, or -1 for @@ -49,7 +48,7 @@ * Caller is responsible for range checks. */ index_t -bt_availbit(ulong_t *bitmap, size_t nbits) +bt_availbit(const ulong_t *bitmap, size_t nbits) { index_t maxword; /* index of last in map */ index_t wx; /* word index in map */ @@ -92,7 +91,7 @@ bt_availbit(ulong_t *bitmap, size_t nbits) * the word specified by wx. */ int -bt_gethighbit(ulong_t *mapp, int wx) +bt_gethighbit(const ulong_t *mapp, int wx) { ulong_t word; @@ -115,7 +114,7 @@ bt_gethighbit(ulong_t *mapp, int wx) * and one past the last bit (pos2) in the pattern. */ int -bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2, size_t end_pos) +bt_range(const ulong_t *bitmap, size_t *pos1, size_t *pos2, size_t end_pos) { size_t pos; @@ -169,7 +168,7 @@ odd_parity(ulong_t i) * a -1 is returned. */ int -bt_getlowbit(ulong_t *map, size_t start, size_t stop) +bt_getlowbit(const ulong_t *map, size_t start, size_t stop) { ulong_t word; int counter = start >> BT_ULSHIFT; @@ -236,7 +235,7 @@ bt_getlowbit(ulong_t *map, size_t start, size_t stop) * Copy the bitmap. 
*/ void -bt_copy(ulong_t *from, ulong_t *to, ulong_t size) +bt_copy(const ulong_t *from, ulong_t *to, ulong_t size) { ulong_t i; for (i = 0; i < size; i++) diff --git a/usr/src/uts/common/os/cap_util.c b/usr/src/uts/common/os/cap_util.c index 4f9b9f5985..7647302cfe 100644 --- a/usr/src/uts/common/os/cap_util.c +++ b/usr/src/uts/common/os/cap_util.c @@ -693,7 +693,7 @@ cu_cpc_program(cpu_t *cp, int *err) * * Context is marked with KCPC_CTX_INVALID_STOPPED when context is * unprogrammed and may be marked with KCPC_CTX_INVALID when - * kcpc_invalidate_all() is called by cpustat(1M) and dtrace CPC to + * kcpc_invalidate_all() is called by cpustat(8) and dtrace CPC to * invalidate all CPC contexts before they take over all the counters. * * This isn't necessary since these flags are only used for thread bound @@ -1258,7 +1258,7 @@ cu_cpu_fini(cpu_t *cp) ctx = cpu_ctx->ctx_ptr_array[i]; if (ctx == NULL) continue; - kcpc_free(ctx, 0); + kcpc_free_cpu(ctx); } /* diff --git a/usr/src/uts/common/os/clock.c b/usr/src/uts/common/os/clock.c index 75c3b000db..93f12d7b96 100644 --- a/usr/src/uts/common/os/clock.c +++ b/usr/src/uts/common/os/clock.c @@ -318,7 +318,9 @@ time_t boot_time = 0; /* Boot time in seconds since 1970 */ cyclic_id_t clock_cyclic; /* clock()'s cyclic_id */ cyclic_id_t deadman_cyclic; /* deadman()'s cyclic_id */ -extern void clock_tick_schedule(int); +extern void clock_tick_schedule(int); +extern void set_freemem(void); +extern void pageout_deadman(void); static int lgrp_ticks; /* counter to schedule lgrp load calcs */ @@ -400,7 +402,6 @@ clock(void) uint_t w_io; cpu_t *cp; cpupart_t *cpupart; - extern void set_freemem(); void (*funcp)(); int32_t ltemp; int64_t lltemp; @@ -477,6 +478,7 @@ clock(void) if (one_sec) { loadavg_update(); deadman_counter++; + pageout_deadman(); } /* diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index 1280c8a1b6..27bc319ee6 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -93,7 +93,7 @@ clock_highres_fire(void *arg) static int clock_highres_timer_settime(itimer_t *it, int flags, - const struct itimerspec *when) + const struct itimerspec *when) { cyclic_id_t cyc, *cycp = it->it_arg; proc_t *p = curproc; diff --git a/usr/src/uts/common/os/clock_process.c b/usr/src/uts/common/os/clock_process.c new file mode 100644 index 0000000000..a3c1641c9c --- /dev/null +++ b/usr/src/uts/common/os/clock_process.c @@ -0,0 +1,130 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +/* + * This clock backend implements basic support for the CLOCK_PROCESS_CPUTIME_ID + * clock. This clock is weakly defined by POSIX as "The identifier of the + * CPU-time clock associated with the process making a clock() or timer*() + * function call". We interpret that as including LMS_USER, LMS_SYSTEM, and + * LMS_TRAP microstates. This is similar to what we do in proc(5) for the + * lwpstatus_t and the prstatus_t. + * + * At this time, we only provide the ability to read the current time (e.g. + * through a call to clock_gettime(3C)). 
There is never a case where being able + * to set the time makes sense today and even if so, the privileges required for + * that are circumspect. Today, we do not support the ability to create interval + * timers based on this backend (e.g. timer_create(3C) and timer_settime(3C)). + * However, there is no reason that couldn't be added. + * + * To implement this, we leverage the existing microstate aggregation time that + * is done in /proc. + */ + +#include <sys/timer.h> +#include <sys/cyclic.h> +#include <sys/msacct.h> + +static clock_backend_t clock_process; + +static int +clock_process_settime(timespec_t *ts) +{ + return (EINVAL); +} + +static int +clock_process_gettime(timespec_t *ts) +{ + hrtime_t hrt; + proc_t *p = curproc; + + /* + * mstate_aggr_state() automatically includes LMS_TRAP when we ask for + * LMS_SYSTEM below. + */ + mutex_enter(&p->p_lock); + hrt = mstate_aggr_state(p, LMS_USER); + hrt += mstate_aggr_state(p, LMS_SYSTEM); + mutex_exit(&p->p_lock); + + hrt2ts(hrt, ts); + + return (0); +} + +/* + * See the discussion in clock_thread_getres() for the why of using + * cyclic_getres() here. + */ +static int +clock_process_getres(timespec_t *ts) +{ + hrt2ts(cyclic_getres(), (timestruc_t *)ts); + + return (0); +} + +static int +clock_process_timer_create(itimer_t *it, void (*fire)(itimer_t *)) +{ + return (EINVAL); +} + +static int +clock_process_timer_settime(itimer_t *it, int flags, + const struct itimerspec *when) +{ + return (EINVAL); +} + +static int +clock_process_timer_gettime(itimer_t *it, struct itimerspec *when) +{ + return (EINVAL); +} + +static int +clock_process_timer_delete(itimer_t *it) +{ + return (EINVAL); +} + +static void +clock_process_timer_lwpbind(itimer_t *it) +{ +} + +void +clock_process_init(void) +{ + /* + * While this clock backend doesn't support notifications right now, we + * still fill out the default for what it would be. + */ + clock_process.clk_default.sigev_signo = SIGALRM; + clock_process.clk_default.sigev_notify = SIGEV_SIGNAL; + clock_process.clk_default.sigev_value.sival_ptr = NULL; + + clock_process.clk_clock_settime = clock_process_settime; + clock_process.clk_clock_gettime = clock_process_gettime; + clock_process.clk_clock_getres = clock_process_getres; + clock_process.clk_timer_create = clock_process_timer_create; + clock_process.clk_timer_settime = clock_process_timer_settime; + clock_process.clk_timer_gettime = clock_process_timer_gettime; + clock_process.clk_timer_delete = clock_process_timer_delete; + clock_process.clk_timer_lwpbind = clock_process_timer_lwpbind; + + clock_add_backend(CLOCK_PROCESS_CPUTIME_ID, &clock_process); +} diff --git a/usr/src/uts/common/os/clock_thread.c b/usr/src/uts/common/os/clock_thread.c new file mode 100644 index 0000000000..96dd36fa08 --- /dev/null +++ b/usr/src/uts/common/os/clock_thread.c @@ -0,0 +1,191 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +/* + * This clock backend implements basic support for the following two clocks: + * + * o CLOCK_VIRTUAL This provides the ability to read the amount of + * user CPU time that the calling thread has spent + * on CPU. 
This is the LMS_USER cpu microstate. + * + * o CLOCK_THREAD_CPUTIME_ID This clock is similar to the above; however, it + * also includes system time. This is the LMS_USER, + * LMS_SYSTEM, and LMS_TRAP microstates combined + * together. We include LMS_TRAP here because that + * is what you see in a thread's lwpstatus file. + * + * At this time, we only provide the ability to read the current time (e.g. + * through a call to clock_gettime(3C)). There is never a case where being able + * to set the time makes sense today and truthfully, lying about a process's + * runtime should be left to mdb -kw. Today, we do not support the ability to + * create interval timers based on this backend (e.g. timer_create(3C) and + * timer_settime(3C)). However, there is no reason that couldn't be added. + * + * A nice simplification here is that this clock is always about reading from + * the current thread. This means that one can always access it. Because the + * calling thread exists and is in this code, it means that we know it is here. + * Any other privilege information is left to the broader kernel. + * + * Because the only difference between these is the question of whether or not + * we include LMS_SYSTEM time in the value, we generally use the same actual + * clock backend functions except for the one that implements + * clk_clock_gettime(). + */ + +#include <sys/timer.h> +#include <sys/cyclic.h> +#include <sys/msacct.h> + +static clock_backend_t clock_thread_usr; +static clock_backend_t clock_thread_usrsys; + +static int +clock_thread_settime(timespec_t *ts) +{ + return (EINVAL); +} + +static int +clock_thread_usr_gettime(timespec_t *ts) +{ + hrtime_t hrt; + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + + hrt = lwp->lwp_mstate.ms_acct[LMS_USER]; + scalehrtime(&hrt); + hrt2ts(hrt, ts); + + return (0); +} + +static int +clock_thread_usrsys_gettime(timespec_t *ts) +{ + hrtime_t hrt; + kthread_t *t = curthread; + + /* + * mstate_thread_onproc_time() takes care of doing the following: + * + * o Combining LMS_USER, LMS_SYSTEM, and LMS_TRAP. + * o Ensuring that the result is scaled + * o Ensuring that the time that's elapsed to the point of our asking + * is included. By definition the kernel is executing in LMS_SYSTEM + * so this ensures that we add that time which isn't currently in the + * microstate to this. + */ + thread_lock(t); + hrt = mstate_thread_onproc_time(t); + thread_unlock(t); + + hrt2ts(hrt, ts); + return (0); +} + +/* + * The question of the resolution here is a thorny one. Technically this would + * really be based upon the resolution of gethrtime_unscaled(), as we can + * actually tell that much due to our use of CPU microstate accounting. However, + * from a timer resolution perspective it's actually quite different and would + * in theory be based on the system tick rate. + * + * This basically leaves us with two options: + * + * 1) Use 'nsec_per_tick' to go down the Hz path. + * 2) Use the cyclic resolution, which basically is kind of the resolution of + * that timer. + * + * POSIX is unclear as to the effect of the resolution in the case of timer_*() + * functions and only really says it is used to impact the implementation of + * clock_settime() which of course isn't actually supported here. As a result, + * we opt to prefer the cyclic resolution, which is closer to the actual + * resolution of this subsystem. Strictly speaking, this might not be completely + * accurate, but should be on current platforms. 
+ */ +static int +clock_thread_getres(timespec_t *ts) +{ + hrt2ts(cyclic_getres(), (timestruc_t *)ts); + + return (0); +} + +static int +clock_thread_timer_create(itimer_t *it, void (*fire)(itimer_t *)) +{ + return (EINVAL); +} + +static int +clock_thread_timer_settime(itimer_t *it, int flags, + const struct itimerspec *when) +{ + return (EINVAL); +} + +static int +clock_thread_timer_gettime(itimer_t *it, struct itimerspec *when) +{ + return (EINVAL); +} + +static int +clock_thread_timer_delete(itimer_t *it) +{ + return (EINVAL); +} + +static void +clock_thread_timer_lwpbind(itimer_t *it) +{ +} + +void +clock_thread_init(void) +{ + /* + * While this clock backends don't support notifications right now, we + * still fill out the default for what it would be. + */ + clock_thread_usr.clk_default.sigev_signo = SIGALRM; + clock_thread_usr.clk_default.sigev_notify = SIGEV_SIGNAL; + clock_thread_usr.clk_default.sigev_value.sival_ptr = NULL; + + clock_thread_usr.clk_clock_settime = clock_thread_settime; + clock_thread_usr.clk_clock_gettime = clock_thread_usr_gettime; + clock_thread_usr.clk_clock_getres = clock_thread_getres; + clock_thread_usr.clk_timer_create = clock_thread_timer_create; + clock_thread_usr.clk_timer_settime = clock_thread_timer_settime; + clock_thread_usr.clk_timer_gettime = clock_thread_timer_gettime; + clock_thread_usr.clk_timer_delete = clock_thread_timer_delete; + clock_thread_usr.clk_timer_lwpbind = clock_thread_timer_lwpbind; + + clock_thread_usrsys.clk_default.sigev_signo = SIGALRM; + clock_thread_usrsys.clk_default.sigev_notify = SIGEV_SIGNAL; + clock_thread_usrsys.clk_default.sigev_value.sival_ptr = NULL; + + clock_thread_usrsys.clk_clock_settime = clock_thread_settime; + clock_thread_usrsys.clk_clock_gettime = clock_thread_usrsys_gettime; + clock_thread_usrsys.clk_clock_getres = clock_thread_getres; + clock_thread_usrsys.clk_timer_create = clock_thread_timer_create; + clock_thread_usrsys.clk_timer_settime = clock_thread_timer_settime; + clock_thread_usrsys.clk_timer_gettime = clock_thread_timer_gettime; + clock_thread_usrsys.clk_timer_delete = clock_thread_timer_delete; + clock_thread_usrsys.clk_timer_lwpbind = clock_thread_timer_lwpbind; + + clock_add_backend(CLOCK_VIRTUAL, &clock_thread_usr); + clock_add_backend(CLOCK_THREAD_CPUTIME_ID, &clock_thread_usrsys); +} diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index e53c75b64e..6a86dbb8cb 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -22,6 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2019 Joyent, Inc. + * Copyright 2021 Oxide Computer Company */ /* @@ -60,7 +61,7 @@ #include <sys/archsystm.h> #include <sys/sdt.h> #include <sys/smt.h> -#if defined(__x86) || defined(__amd64) +#if defined(__x86) #include <sys/x86_archext.h> #endif #include <sys/callo.h> @@ -613,7 +614,7 @@ again: * requests will continue to be satisfied in the same way, * even if weak bindings have recommenced. 
*/ - if (t->t_nomigrate < 0 || weakbindingbarrier && t->t_nomigrate == 0) { + if (t->t_nomigrate < 0 || (weakbindingbarrier && t->t_nomigrate == 0)) { --t->t_nomigrate; thread_unlock(curthread); return; /* with kpreempt_disable still active */ @@ -2909,7 +2910,7 @@ cpuset_atomic_xdel(cpuset_t *s, const uint_t cpu) } void -cpuset_or(cpuset_t *dst, cpuset_t *src) +cpuset_or(cpuset_t *dst, const cpuset_t *src) { for (int i = 0; i < CPUSET_WORDS; i++) { dst->cpub[i] |= src->cpub[i]; @@ -2917,7 +2918,7 @@ cpuset_or(cpuset_t *dst, cpuset_t *src) } void -cpuset_xor(cpuset_t *dst, cpuset_t *src) +cpuset_xor(cpuset_t *dst, const cpuset_t *src) { for (int i = 0; i < CPUSET_WORDS; i++) { dst->cpub[i] ^= src->cpub[i]; @@ -2925,7 +2926,7 @@ cpuset_xor(cpuset_t *dst, cpuset_t *src) } void -cpuset_and(cpuset_t *dst, cpuset_t *src) +cpuset_and(cpuset_t *dst, const cpuset_t *src) { for (int i = 0; i < CPUSET_WORDS; i++) { dst->cpub[i] &= src->cpub[i]; diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 0bd6cfd44f..5e909667de 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -20,13 +20,14 @@ */ /* * Copyright (c) 2013, Ira Cooper. All rights reserved. + * Copyright 2020 Nexenta by DDN, Inc. All rights reserved. */ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 @@ -288,7 +289,7 @@ crget(void) { cred_t *cr = kmem_cache_alloc(cred_cache, KM_SLEEP); - bcopy(kcred, cr, crsize); + bcopy(zone_kcred(), cr, crsize); cr->cr_ref = 1; zone_cred_hold(cr->cr_zone); if (cr->cr_label) @@ -377,7 +378,7 @@ crfree(cred_t *cr) /* * Copy a cred structure to a new one and free the old one. * The new cred will have two references. One for the calling process, - * and one for the thread. + * and one for the thread. */ cred_t * crcopy(cred_t *cr) @@ -404,7 +405,7 @@ crcopy(cred_t *cr) /* * Copy a cred structure to a new one and free the old one. * The new cred will have two references. One for the calling process, - * and one for the thread. + * and one for the thread. * This variation on crcopy uses a pre-allocated structure for the * "new" cred. */ diff --git a/usr/src/uts/common/os/dacf.c b/usr/src/uts/common/os/dacf.c index 8d4cd486d8..592b1cd570 100644 --- a/usr/src/uts/common/os/dacf.c +++ b/usr/src/uts/common/os/dacf.c @@ -642,7 +642,7 @@ dacf_arglist_delete(dacf_arg_t **list) * Match a device-spec to a rule. */ dacf_rule_t * -dacf_match(dacf_opid_t op, dacf_devspec_t ds, void *match_info) +dacf_match(dacf_opid_t op, dacf_devspec_t ds, const void *match_info) { dacf_rule_t *rule; diff --git a/usr/src/uts/common/os/dacf_clnt.c b/usr/src/uts/common/os/dacf_clnt.c index e40509d33b..fdb1696fb2 100644 --- a/usr/src/uts/common/os/dacf_clnt.c +++ b/usr/src/uts/common/os/dacf_clnt.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DACF (Device Autoconfiguration Framework) client code. * @@ -67,8 +65,8 @@ * for the operation to be invoked at post-attach and/or pre-detach time. 
*/ void -dacfc_match_create_minor(char *name, char *node_type, dev_info_t *dip, - struct ddi_minor_data *dmdp, int flag) +dacfc_match_create_minor(const char *name, const char *node_type, + dev_info_t *dip, struct ddi_minor_data *dmdp, int flag) { dacf_rule_t *r; char *dev_path, *dev_pathp, *drv_mname = NULL; diff --git a/usr/src/uts/common/os/ddi.c b/usr/src/uts/common/os/ddi.c index a37d91e92a..c348ee474c 100644 --- a/usr/src/uts/common/os/ddi.c +++ b/usr/src/uts/common/os/ddi.c @@ -1136,8 +1136,8 @@ qunbufcall(queue_t *q, bufcall_id_t id) * Associate the stream with an instance of the bottom driver. This * function is called by APIs that establish or modify the hardware * association (ppa) of an open stream. Two examples of such - * post-open(9E) APIs are the dlpi(7p) DL_ATTACH_REQ message, and the - * ndd(1M) "instance=" ioctl(2). This interface may be called from a + * post-open(9E) APIs are the dlpi(4P) DL_ATTACH_REQ message, and the + * ndd(8) "instance=" ioctl(2). This interface may be called from a * stream driver's wput procedure and from within syncq perimeters, * so it can't block. * diff --git a/usr/src/uts/common/os/ddi_hp_impl.c b/usr/src/uts/common/os/ddi_hp_impl.c index 38e575dbfd..8f0890fc2b 100644 --- a/usr/src/uts/common/os/ddi_hp_impl.c +++ b/usr/src/uts/common/os/ddi_hp_impl.c @@ -92,8 +92,8 @@ * - Through the nexus driver interface, ndi_hp_state_change_req. PCIe * nexus drivers that pass a hotplug interrupt through to pciehpc will kick * off state changes in this way. - * - Through coordinated removal, ddihp_modctl. Both cfgadm(1M) and - * hotplug(1M) pass state change requests through hotplugd, which uses + * - Through coordinated removal, ddihp_modctl. Both cfgadm(8) and + * hotplug(8) pass state change requests through hotplugd, which uses * modctl to request state changes to the DDI hotplug framework. That * interface is ultimately implemented by ddihp_modctl. * @@ -131,7 +131,7 @@ * of some key components are below. * * +------------+ - * | cfgadm(1M) | + * | cfgadm(8) | * +------------+ * | * +-------------------+ @@ -139,7 +139,7 @@ * +-------------------+ * | * +-------------+ +------------+ - * | hotplug(1M) |----------| libhotplug | + * | hotplug(8) |----------| libhotplug | * +-------------+ +------------+ * | * +----------+ @@ -193,14 +193,14 @@ * * KEY HOTPLUG SOFTWARE COMPONENTS * - * CFGADM(1M) + * cfgadm(8) * * cfgadm is the canonical tool for hotplug operations. It can be used to * list connections on the system and change their state in a coordinated * fashion. For more information, see its manual page. * * - * HOTPLUG(1M) + * hotplug(8) * * hotplug is a command line tool for managing hotplug connections for * connectors. For more information, see its manual page. diff --git a/usr/src/uts/common/os/ddi_intr_impl.c b/usr/src/uts/common/os/ddi_intr_impl.c index 215be73722..22f4548607 100644 --- a/usr/src/uts/common/os/ddi_intr_impl.c +++ b/usr/src/uts/common/os/ddi_intr_impl.c @@ -35,7 +35,7 @@ #include <sys/sunndi.h> #include <sys/ndi_impldefs.h> /* include prototypes */ -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* * MSI-X allocation limit. 
*/ @@ -294,7 +294,7 @@ i_ddi_intr_get_limit(dev_info_t *dip, int type, ddi_irm_pool_t *pool_p) limit = MIN(limit, nintrs); /* Impose a global MSI-X limit on x86 */ -#if defined(__i386) || defined(__amd64) +#if defined(__x86) if (type == DDI_INTR_TYPE_MSIX) limit = MIN(limit, ddi_msix_alloc_limit); #endif @@ -539,7 +539,7 @@ set_intr_affinity(ddi_intr_handle_t h, processorid_t tgt) return (ret); } -#if defined(__i386) || defined(__amd64) +#if defined(__x86) ddi_acc_handle_t i_ddi_get_pci_config_handle(dev_info_t *dip) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index a4b35dcb5b..2433c504fc 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -34,7 +34,7 @@ #include <sys/sunndi.h> #include <sys/ndi_impldefs.h> /* include prototypes */ -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* * MSI-X allocation limit. */ @@ -767,7 +767,7 @@ i_ddi_irm_set_cb(dev_info_t *dip, boolean_t has_cb_flag) /* Determine new request size */ nreq = MIN(req_p->ireq_nreq, pool_p->ipool_defsz); -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* Use the default static limit for non-IRM drivers */ if (req_p->ireq_type == DDI_INTR_TYPE_MSIX) nreq = MIN(nreq, ddi_msix_alloc_limit); diff --git a/usr/src/uts/common/os/ddi_ufm.c b/usr/src/uts/common/os/ddi_ufm.c index ffb04eddec..940ebf82bf 100644 --- a/usr/src/uts/common/os/ddi_ufm.c +++ b/usr/src/uts/common/os/ddi_ufm.c @@ -11,6 +11,7 @@ /* * Copyright 2019 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #include <sys/avl.h> @@ -20,13 +21,16 @@ #include <sys/kmem.h> #include <sys/sunddi.h> #include <sys/stddef.h> +#include <sys/sunndi.h> +#include <sys/file.h> +#include <sys/sysmacros.h> /* * The UFM subsystem tracks its internal state with respect to device * drivers that participate in the DDI UFM subsystem on a per-instance basis * via ddi_ufm_handle_t structures (see ddi_ufm_impl.h). This is known as the * UFM handle. The UFM handle contains a pointer to the driver's UFM ops, - * which the ufm(7D) pseudo driver uses to invoke the UFM entry points in + * which the ufm(4D) pseudo driver uses to invoke the UFM entry points in * response to DDI UFM ioctls. Additionally, the DDI UFM subsystem uses the * handle to maintain cached UFM image and slot data. * @@ -65,6 +69,12 @@ * These tests should be run whenever changes are made to the DDI UFM * subsystem or the ufm driver. */ + +/* + * Amount of data to read in one go (1 MiB). 
+ */ +#define UFM_READ_STRIDE (1024 * 1024) + static avl_tree_t ufm_handles; static kmutex_t ufm_lock; @@ -171,7 +181,7 @@ ufm_cache_fill(ddi_ufm_handle_t *ufmh) */ ufmh->ufmh_images = kmem_zalloc((sizeof (ddi_ufm_image_t) * ufmh->ufmh_nimages), - KM_NOSLEEP | KM_NORMALPRI); + KM_NOSLEEP_LAZY); if (ufmh->ufmh_images == NULL) return (ENOMEM); @@ -191,7 +201,7 @@ ufm_cache_fill(ddi_ufm_handle_t *ufmh) img->ufmi_slots = kmem_zalloc((sizeof (ddi_ufm_slot_t) * img->ufmi_nslots), - KM_NOSLEEP | KM_NORMALPRI); + KM_NOSLEEP_LAZY); if (img->ufmi_slots == NULL) { ret = ENOMEM; goto cache_fail; @@ -234,6 +244,12 @@ ufm_cache_fill(ddi_ufm_handle_t *ufmh) if (slot->ufms_attrs & DDI_UFM_ATTR_EMPTY) continue; + if (slot->ufms_imgsize != 0) { + fnvlist_add_uint64(slots[s], + DDI_UFM_NV_SLOT_IMGSIZE, + slot->ufms_imgsize); + } + fnvlist_add_string(slots[s], DDI_UFM_NV_SLOT_VERSION, slot->ufms_version); if (slot->ufms_misc != NULL) { @@ -257,6 +273,56 @@ cache_fail: return (ret); } +int +ufm_read_img(ddi_ufm_handle_t *ufmh, uint_t img, uint_t slot, uint64_t len, + uint64_t off, uintptr_t uaddr, uint64_t *nreadp, int copyflags) +{ + int ret = 0; + ddi_ufm_cap_t caps; + void *buf; + uint64_t nread; + + ret = ufmh->ufmh_ops->ddi_ufm_op_getcaps(ufmh, ufmh->ufmh_arg, &caps); + if (ret != 0) { + return (ret); + } + + if ((caps & DDI_UFM_CAP_READIMG) == 0 || + ufmh->ufmh_ops->ddi_ufm_op_readimg == NULL) { + return (ENOTSUP); + } + + if (off + len < MAX(off, len)) { + return (EOVERFLOW); + } + + buf = kmem_zalloc(UFM_READ_STRIDE, KM_SLEEP); + nread = 0; + while (len > 0) { + uint64_t toread = MIN(len, UFM_READ_STRIDE); + uint64_t iter; + + ret = ufmh->ufmh_ops->ddi_ufm_op_readimg(ufmh, ufmh->ufmh_arg, + img, slot, toread, off + nread, buf, &iter); + if (ret != 0) { + break; + } + + if (ddi_copyout(buf, (void *)(uintptr_t)(uaddr + nread), iter, + copyflags & FKIOCTL) != 0) { + ret = EFAULT; + break; + } + + nread += iter; + len -= iter; + } + + *nreadp = nread; + kmem_free(buf, UFM_READ_STRIDE); + return (ret); +} + /* * This gets called early in boot by setup_ddi(). */ @@ -375,6 +441,12 @@ ddi_ufm_init(dev_info_t *dip, uint_t version, ddi_ufm_ops_t *ufmops, mutex_exit(&old_ufmh->ufmh_lock); } + /* + * Give a hint in the devinfo tree that this device supports UFM + * capabilities. + */ + (void) ndi_prop_create_boolean(DDI_DEV_T_NONE, dip, "ddi-ufm-capable"); + return (DDI_SUCCESS); } @@ -453,3 +525,10 @@ ddi_ufm_slot_set_misc(ddi_ufm_slot_t *usp, nvlist_t *misc) nvlist_free(usp->ufms_misc); usp->ufms_misc = misc; } + +void +ddi_ufm_slot_set_imgsize(ddi_ufm_slot_t *usp, uint64_t size) +{ + VERIFY3P(usp, !=, NULL); + usp->ufms_imgsize = size; +} diff --git a/usr/src/uts/common/os/ddifm.c b/usr/src/uts/common/os/ddifm.c index 533fa15aed..dc39ba49ab 100644 --- a/usr/src/uts/common/os/ddifm.c +++ b/usr/src/uts/common/os/ddifm.c @@ -56,7 +56,7 @@ * * Error reports resulting from hardware component specific and common IO * fault and driver defects must be accompanied by an Eversholt fault - * tree (.eft) by the Solaris fault manager (fmd(1M)) for + * tree (.eft) by the Solaris fault manager (fmd(8)) for * diagnosis. * * DDI_FM_ERRCB_CAPABLE @@ -466,7 +466,7 @@ out: if (ereport && (nva == NULL)) /* * Generate an error report for consumption by the Solaris Fault Manager, - * fmd(1M). Valid ereport classes are defined in /usr/include/sys/fm/io. + * fmd(8). Valid ereport classes are defined in /usr/include/sys/fm/io. 
* * The ENA should be set if this error is a result of an error status * returned from ddi_dma_err_check() or ddi_acc_err_check(). Otherwise, diff --git a/usr/src/uts/common/os/devcfg.c b/usr/src/uts/common/os/devcfg.c index cbcc4db3d8..d61525be9c 100644 --- a/usr/src/uts/common/os/devcfg.c +++ b/usr/src/uts/common/os/devcfg.c @@ -24,6 +24,7 @@ * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> */ #include <sys/note.h> @@ -62,6 +63,7 @@ #include <sys/varargs.h> #include <sys/modhash.h> #include <sys/instance.h> +#include <sys/sysevent/eventdefs.h> #if defined(__amd64) && !defined(__xpv) #include <sys/iommulib.h> @@ -253,7 +255,7 @@ i_ddi_node_cache_init() * The allocated node has a reference count of 0. */ dev_info_t * -i_ddi_alloc_node(dev_info_t *pdip, char *node_name, pnode_t nodeid, +i_ddi_alloc_node(dev_info_t *pdip, const char *node_name, pnode_t nodeid, int instance, ddi_prop_t *sys_prop, int flag) { struct dev_info *devi; @@ -395,6 +397,9 @@ sid: devi->devi_node_attributes |= DDI_PERSISTENT; devi->devi_ct_count = -1; /* counter not in use if -1 */ list_create(&(devi->devi_ct), sizeof (cont_device_t), offsetof(cont_device_t, cond_next)); + list_create(&devi->devi_unbind_cbs, sizeof (ddi_unbind_callback_t), + offsetof(ddi_unbind_callback_t, ddiub_next)); + mutex_init(&devi->devi_unbind_lock, NULL, MUTEX_DEFAULT, NULL); i_ddi_set_node_state((dev_info_t *)devi, DS_PROTO); da_log_enter((dev_info_t *)devi); @@ -491,6 +496,9 @@ i_ddi_free_node(dev_info_t *dip) if (devi->devi_ev_path) kmem_free(devi->devi_ev_path, MAXPATHLEN); + mutex_destroy(&devi->devi_unbind_lock); + list_destroy(&devi->devi_unbind_cbs); + kmem_cache_free(ddi_node_cache, devi); } @@ -828,6 +836,7 @@ bind_node(dev_info_t *dip) static int unbind_node(dev_info_t *dip) { + ddi_unbind_callback_t *cb; ASSERT(DEVI(dip)->devi_node_state == DS_BOUND); ASSERT(DEVI(dip)->devi_major != DDI_MAJOR_T_NONE); @@ -842,6 +851,11 @@ unbind_node(dev_info_t *dip) DEVI(dip)->devi_major = DDI_MAJOR_T_NONE; DEVI(dip)->devi_binding_name = DEVI(dip)->devi_node_name; + + while ((cb = list_remove_head(&DEVI(dip)->devi_unbind_cbs)) != NULL) { + cb->ddiub_cb(cb->ddiub_arg, dip); + } + return (DDI_SUCCESS); } @@ -1486,12 +1500,12 @@ postattach_node(dev_info_t *dip) /* * Plumbing during postattach may fail because of the * underlying device is not ready. This will fail ndi_devi_config() - * in dv_filldir() and a warning message is issued. The message - * from here will explain what happened + * in dv_filldir(). */ if (rval != DACF_SUCCESS) { - cmn_err(CE_WARN, "Postattach failed for %s%d\n", - ddi_driver_name(dip), ddi_get_instance(dip)); + NDI_CONFIG_DEBUG((CE_CONT, "postattach_node: %s%d (%p) " + "postattach failed\n", ddi_driver_name(dip), + ddi_get_instance(dip), (void *)dip)); return (DDI_FAILURE); } @@ -2044,7 +2058,7 @@ ndi_devi_tryenter(dev_info_t *dip, int *circular) * not allowed to sleep. 
*/ int -ndi_devi_alloc(dev_info_t *parent, char *node_name, pnode_t nodeid, +ndi_devi_alloc(dev_info_t *parent, const char *node_name, pnode_t nodeid, dev_info_t **ret_dip) { ASSERT(node_name != NULL); @@ -2064,7 +2078,7 @@ ndi_devi_alloc(dev_info_t *parent, char *node_name, pnode_t nodeid, * This routine may sleep and should not be called at interrupt time */ void -ndi_devi_alloc_sleep(dev_info_t *parent, char *node_name, pnode_t nodeid, +ndi_devi_alloc_sleep(dev_info_t *parent, const char *node_name, pnode_t nodeid, dev_info_t **ret_dip) { ASSERT(node_name != NULL); @@ -2534,7 +2548,7 @@ i_ddi_get_exported_classes(dev_info_t *dip, char ***classes) * Helper functions, returns NULL if no memory. */ char * -i_ddi_strdup(char *str, uint_t flag) +i_ddi_strdup(const char *str, uint_t flag) { char *copy; @@ -3560,7 +3574,6 @@ walk_devs(dev_info_t *dip, int (*f)(dev_info_t *, void *), void *arg, * They include, but not limited to, _init(9e), _fini(9e), probe(9e), * attach(9e), and detach(9e). */ - void ddi_walk_devs(dev_info_t *dip, int (*f)(dev_info_t *, void *), void *arg) { @@ -3580,7 +3593,6 @@ ddi_walk_devs(dev_info_t *dip, int (*f)(dev_info_t *, void *), void *arg) * * N.B. The same restrictions from ddi_walk_devs() apply. */ - void e_ddi_walk_driver(char *drv, int (*f)(dev_info_t *, void *), void *arg) { @@ -3609,6 +3621,91 @@ e_ddi_walk_driver(char *drv, int (*f)(dev_info_t *, void *), void *arg) UNLOCK_DEV_OPS(&dnp->dn_lock); } +struct preroot_walk_block_devices_arg { + int (*prwb_func)(const char *, void *); + void *prwb_arg; +}; + +static int +preroot_walk_block_devices_walker(dev_info_t *dip, void *arg) +{ + struct preroot_walk_block_devices_arg *prwb = arg; + + if (i_ddi_devi_class(dip) == NULL || + strcmp(i_ddi_devi_class(dip), ESC_DISK) != 0) { + /* + * We do not think that this is a disk. + */ + return (DDI_WALK_CONTINUE); + } + + for (struct ddi_minor_data *md = DEVI(dip)->devi_minor; md != NULL; + md = md->next) { + if (md->ddm_spec_type != S_IFBLK) { + /* + * We don't want the raw version of any block device. + */ + continue; + } + + /* + * The node type taxonomy is hierarchical, with each level + * separated by colons. Nodes of interest are either of the + * BLOCK type, or are prefixed with that type. + */ + if (strcmp(md->ddm_node_type, DDI_NT_BLOCK) != 0 && + strncmp(md->ddm_node_type, DDI_NT_BLOCK ":", + strlen(DDI_NT_BLOCK ":")) != 0) { + /* + * This minor node does not represent a block device. + */ + continue; + } + + char buf[MAXPATHLEN]; + int r; + if ((r = prwb->prwb_func(ddi_pathname_minor(md, buf), + prwb->prwb_arg)) == PREROOT_WALK_BLOCK_DEVICES_CANCEL) { + /* + * The consumer does not need any more minor nodes. + */ + return (DDI_WALK_TERMINATE); + } + VERIFY3S(r, ==, PREROOT_WALK_BLOCK_DEVICES_NEXT); + } + + return (DDI_WALK_CONTINUE); +} + +/* + * Private routine for ZFS when it needs to attach and scan all of the block + * device minors in the system while looking for vdev labels. + * + * The callback function accepts a physical device path and the context + * argument (arg) passed to this function; it should return + * PREROOT_WALK_BLOCK_DEVICES_NEXT when more devices are required and + * PREROOT_WALK_BLOCK_DEVICES_CANCEL to stop the walk. + */ +void +preroot_walk_block_devices(int (*callback)(const char *, void *), void *arg) +{ + /* + * First, force everything which can attach to do so. 
The device class + * is not derived until at least one minor mode is created, so we + * cannot walk the device tree looking for a device class of ESC_DISK + * until everything is attached. + */ + (void) ndi_devi_config(ddi_root_node(), NDI_CONFIG | NDI_DEVI_PERSIST | + NDI_NO_EVENT | NDI_DRV_CONF_REPROBE); + + struct preroot_walk_block_devices_arg prwb; + prwb.prwb_func = callback; + prwb.prwb_arg = arg; + + ddi_walk_devs(ddi_root_node(), preroot_walk_block_devices_walker, + &prwb); +} + /* * argument to i_find_devi, a devinfo node search callback function. */ @@ -3823,8 +3920,8 @@ ddi_is_pci_dip(dev_info_t *dip) * to ioc's bus_config entry point. */ int -resolve_pathname(char *pathname, - dev_info_t **dipp, dev_t *devtp, int *spectypep) +resolve_pathname(char *pathname, dev_info_t **dipp, dev_t *devtp, + int *spectypep) { int error; dev_info_t *parent, *child; @@ -9055,7 +9152,7 @@ out: char * ddi_curr_redirect(char *curr) { - char *alias; + char *alias; int i; if (ddi_aliases_present == B_FALSE) @@ -9196,3 +9293,13 @@ ddi_mem_update(uint64_t addr, uint64_t size) ; #endif } + +void +e_ddi_register_unbind_callback(dev_info_t *dip, ddi_unbind_callback_t *cb) +{ + struct dev_info *devi = DEVI(dip); + + mutex_enter(&devi->devi_unbind_lock); + list_insert_tail(&devi->devi_unbind_cbs, cb); + mutex_exit(&devi->devi_unbind_lock); +} diff --git a/usr/src/uts/common/os/devid_cache.c b/usr/src/uts/common/os/devid_cache.c index 3e1a06a844..2a780eebe2 100644 --- a/usr/src/uts/common/os/devid_cache.c +++ b/usr/src/uts/common/os/devid_cache.c @@ -47,7 +47,7 @@ * involves walking the entire device tree attaching all possible disk * instances, to search for the device referenced by a devid. Obviously, * full device discovery is something to be avoided where possible. - * Note that simply invoking devfsadm(1M) is equivalent to running full + * Note that simply invoking devfsadm(8) is equivalent to running full * discovery at the devid cache level. * * Reasons why a disk may not be accessible: @@ -61,7 +61,7 @@ * When discovery may succeed: * Discovery will result in success when a device has been moved * to a different address. Note that it's recommended that - * devfsadm(1M) be invoked (no arguments required) whenever a system's + * devfsadm(8) be invoked (no arguments required) whenever a system's * h/w configuration has been updated. Alternatively, a * reconfiguration boot can be used to accomplish the same result. * @@ -69,7 +69,7 @@ * failure for a device which was powered off. Assuming the cache has an * entry for such a device, simply powering it on should permit the system * to access it. If problems persist after powering it on, invoke - * devfsadm(1M). + * devfsadm(8). * * Discovery prior to mounting root is only of interest when booting * from a filesystem which accesses devices by device id, which of diff --git a/usr/src/uts/common/os/dkioc_free_util.c b/usr/src/uts/common/os/dkioc_free_util.c index 85470f7e28..4bf1f54ca4 100644 --- a/usr/src/uts/common/os/dkioc_free_util.c +++ b/usr/src/uts/common/os/dkioc_free_util.c @@ -10,7 +10,8 @@ */ /* - * Copyright 2017 Nexenta Inc. All rights reserved. + * Copyright 2021 Tintri by DDN, Inc. All rights reserved. + * Copyright 2020 Joyent, Inc. 
*/ /* needed when building libzpool */ @@ -25,6 +26,13 @@ #include <sys/file.h> #include <sys/sdt.h> +static int adjust_exts(dkioc_free_list_t *, const dkioc_free_info_t *, + uint64_t len_blk); +static int split_extent(dkioc_free_list_t *, const dkioc_free_info_t *, + uint64_t, dfl_iter_fn_t, void *, int); +static int process_range(dkioc_free_list_t *, uint64_t, uint64_t, + dfl_iter_fn_t, void *, int); + /* * Copy-in convenience function for variable-length dkioc_free_list_t * structures. The pointer to be copied from is in `arg' (may be a pointer @@ -78,3 +86,435 @@ dfl_free(dkioc_free_list_t *dfl) { kmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); } + +/* + * Convenience function to resize and segment the array of extents in + * a DKIOCFREE request as required by a driver. + * + * Some devices that implement DKIOCFREE (e.g. vioblk) have limits + * on either the number of extents that can be submitted in a single request, + * or the total number of blocks that can be submitted in a single request. + * In addition, devices may have alignment requirements on the starting + * address stricter than the device block size. + * + * Since there is currently no mechanism for callers of DKIOCFREE to discover + * such restrictions, instead of rejecting any requests that do not conform to + * some undiscoverable (to the caller) set of requirements, a driver can use + * dfl_iter() to adjust and resegment the extents from a DKIOCFREE call as + * required to conform to its requirements. + * + * The original request is passed as 'dfl' and the alignment requirements + * are passed in 'dfi'. Additionally the maximum offset of the device allowed + * in bytes) is passed as max_off -- this allows a driver with + * multiple instances of different sizes but similar requirements (e.g. + * a partitioned blkdev device) to not construct a separate dkioc_free_info_t + * struct for each device. + * + * dfl_iter() will call 'func' with a dkioc_free_list_t and the value of + * arg passed to it as needed. If the extents in the dkioc_free_list_t passed + * to dfl_iter() meet all the requirements in 'dfi', the dkioc_free_list_t will + * be passed on to 'func' unmodified. If any of the extents passed to dfl_iter() + * do not meet the requirements, dfl_iter() will allocate new dkioc_free_list_t + * instances and populate them with the adjusted extents that do conform to the + * requirements in 'dfi'. dfl_iter() will also free the dkioc_free_list_t + * passed to it when this occurs. The net result is that 'func' can always + * assume it will be called with a dkioc_free_list_t with extents that + * comply with the requirements in 'dfi'. 'func' is also responsible for + * freeing the dkioc_free_list_t passed to it (likely via a completion + * callback). + * + * Combined with the behavior described above, dfl_iter() can be viewed as + * consuming the dkioc_free_list_t passed to it. Either it will pass it along + * to 'func' (and let 'func' handle freeing it), or it will free it and + * allocate one or more new dkioc_free_list_ts to pass to 'func' (while still + * letting 'func' handle freeing the new instances). This way neither the + * dfl_iter() caller nor nor the driver need to worry about treating + * conforming and non-conforming requests differently. + * + * Unfortunately, the DKIOCFREE ioctl provides no method for communicating + * any notion of partial completion -- either it returns success (0) or + * an error. 
It's not clear if such a notion would even be possible while + * supporting multiple types of devices (NVMe, SCSI, etc.) with the same + * interface. As such, there's little benefit to providing more detailed error + * semantics beyond what DKIOCFREE can handle. + * + * Due to this, a somewhat simplistic approach is taken to error handling. The + * original list of extents is first checked to make sure they all appear + * valid -- that is they do not start or extend beyond the end of the device. + * Any request that contains such extents is always rejected in it's entirety. + * It is possible after applying any needed adjustments to the original list + * of extents that the result is not acceptable to the driver. For example, + * a device with a 512 byte block size that tries to free the range 513-1023 + * (bytes) would not be able to be processed. Such extents will be silently + * ignored. If the original request consists of nothing but such requests, + * dfl_iter() will never call 'func' and will merely return 0. + */ +int +dfl_iter(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t max_off, + dfl_iter_fn_t func, void *arg, int kmflag) +{ + dkioc_free_list_ext_t *ext; + uint64_t n_bytes, n_segs, start_idx, i; + uint_t bsize = 1U << dfi->dfi_bshift; + int r = 0; + boolean_t need_copy = B_FALSE; + + /* + * Make sure the block size derived from dfi_bshift is at least 512 + * (1U << DEV_BSHIFT) bytes and less than 2^30. The lower bound is + * to prevent any problems with other parts of the system that might + * assume a minimum block size of 512, and the upper bound is just + * to prevent overflow when creating the block size from dfi_bshift + * (though it seems unlikely we'll have _block_ sizes near a GiB + * any time soon). + */ + if (dfi->dfi_bshift < DEV_BSHIFT || dfi->dfi_bshift > 30) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* Max bytes must be a multiple of the block size */ + if (!IS_P2ALIGNED(dfi->dfi_max_bytes, bsize)) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* Start offset alignment must also be a multiple of the block size */ + if (dfi->dfi_align == 0 || !IS_P2ALIGNED(dfi->dfi_align, bsize)) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* Max bytes in an extent must be a multiple of the block size */ + if (!IS_P2ALIGNED(dfi->dfi_max_ext_bytes, bsize)) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* + * It makes no sense to allow a single extent to be larger than the + * total allowed for an entire request. + */ + if (dfi->dfi_max_ext_bytes > 0 && + dfi->dfi_max_ext_bytes > dfi->dfi_max_bytes) { + r = SET_ERROR(EINVAL); + goto done; + } + + /* + * The first pass, align everything as needed and make sure all the + * extents look valid. + */ + if ((r = adjust_exts(dfl, dfi, max_off)) != 0) { + goto done; + } + + /* + * Go through and split things up as needed. The general idea is to + * split along the original extent boundaries when needed. We only + * split an extent from the original request into multiple extents + * if the original extent is by itself too big for the device to + * process in a single request. 
+ */ + start_idx = 0; + n_bytes = n_segs = 0; + ext = dfl->dfl_exts; + for (i = 0; i < dfl->dfl_num_exts; i++, ext++) { + uint64_t start = dfl->dfl_offset + ext->dfle_start; + uint64_t len = ext->dfle_length; + + if (len == 0) { + /* + * If we encounter a zero length extent, we're going + * to create a new copy of dfl no matter what -- + * the size of dfl is determined by dfl_num_exts so + * we cannot do things like shift the contents and + * reduce dfl_num_exts to get a contiguous array + * of non-zero length extents. + */ + need_copy = B_TRUE; + continue; + } + + if (dfi->dfi_max_ext_bytes > 0 && + len > dfi->dfi_max_ext_bytes) { + /* + * An extent that's too large. Dispatch what we've + * accumulated, and then split this extent into + * smaller ones the device can accept. + */ + if ((r = process_range(dfl, start_idx, i - start_idx, + func, arg, kmflag)) != 0) { + goto done; + } + + if ((r = split_extent(dfl, dfi, i, func, arg, + kmflag)) != 0) { + goto done; + } + start_idx = i + 1; + n_segs = 0; + n_bytes = 0; + continue; + } + + if (dfi->dfi_max_bytes > 0 && + n_bytes + len > dfi->dfi_max_bytes) { + /* + * This extent would put us over the limit for total + * bytes that can be trimmed in one request. + * Dispatch what we've accumulated. Then deal + * with this extent. + */ + if ((r = process_range(dfl, start_idx, i - start_idx, + func, arg, kmflag)) != 0) { + goto done; + } + + if (len < dfi->dfi_max_bytes) { + /* + * After dispatching what we've accumulated, + * this extent can fit in a new request + * Just add it to the accumulated list of + * extents and move on. + */ + start_idx = i; + n_segs = 1; + n_bytes = len; + continue; + } + + /* + * Even after starting a new request, this extent + * is too big. Split it until it fits. + */ + if ((r = split_extent(dfl, dfi, i, func, arg, + kmflag)) != 0) { + goto done; + } + + start_idx = i + 1; + n_segs = 0; + n_bytes = 0; + continue; + } + + if (dfi->dfi_max_ext > 0 && n_segs + 1 > dfi->dfi_max_ext) { + /* + * This extent will put us over the limit on the number + * of extents the device can accept. Dispatch what + * we've accumulated so far. + */ + if ((r = process_range(dfl, start_idx, i - start_idx, + func, arg, kmflag)) != 0) { + goto done; + } + + start_idx = i; + n_segs = 1; + n_bytes = len; + continue; + } + + n_segs++; + n_bytes += len; + } + + /* + * If a copy wasn't required, and we haven't processed a subset of + * the extents already, we can just use the original request. + */ + if (!need_copy && start_idx == 0) { + return (func(dfl, arg, kmflag)); + } + + r = process_range(dfl, start_idx, i - start_idx, func, arg, kmflag); + +done: + dfl_free(dfl); + return (r); +} + +/* + * Adjust the start and length of each extent in dfl so that it conforms to + * the requirements in dfi. It also verifies that no extent extends beyond + * the end of the device (given by len_blk). + * + * Returns 0 on success, or an error value. + */ +static int +adjust_exts(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, + uint64_t max_off) +{ + dkioc_free_list_ext_t *exts = dfl->dfl_exts; + /* + * These must be uint64_t to prevent the P2 macros from truncating + * the result. 
+ */ + const uint64_t align = dfi->dfi_align; + const uint64_t bsize = (uint64_t)1 << dfi->dfi_bshift; + + for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++) { + /* + * Since there are no known requirements on the value of + * dfl_offset, it's possible (though odd) to have a scenario + * where dfl_offset == 1, and dfle_start == 511 (resulting + * in an actual start offset of 512). As such, we always + * apply the offset and find the resulting starting offset + * and length (in bytes) first, then apply any rounding + * and alignment. + */ + uint64_t start = exts->dfle_start + dfl->dfl_offset; + uint64_t end = start + exts->dfle_length; + + /* + * Make sure after applying dfl->dfl_offset and any alignment + * adjustments that the results don't overflow. + */ + if (start < dfl->dfl_offset || start > (UINT64_MAX - bsize)) { + return (SET_ERROR(EOVERFLOW)); + } + + if (end < start) { + return (SET_ERROR(EOVERFLOW)); + } + + /* + * Make sure we don't extend past the end of the device + */ + if (end > max_off) { + return (SET_ERROR(EINVAL)); + } + + start = P2ROUNDUP(start, align); + end = P2ALIGN(end, bsize); + + /* + * Remove the offset so that when it's later applied again, + * the correct start value is obtained. + */ + exts->dfle_start = start - dfl->dfl_offset; + + /* + * If the original length was less than the block size + * of the device, we can end up with end < start. If that + * happens we just set the length to zero. + */ + exts->dfle_length = (end < start) ? 0 : end - start; + } + + return (0); +} + +/* + * Take a subset of extents from dfl (starting at start_idx, with n entries) + * and create a new dkioc_free_list_t, passing that to func. + */ +static int +process_range(dkioc_free_list_t *dfl, uint64_t start_idx, uint64_t n, + dfl_iter_fn_t func, void *arg, int kmflag) +{ + dkioc_free_list_t *new_dfl = NULL; + dkioc_free_list_ext_t *new_exts = NULL; + dkioc_free_list_ext_t *exts = dfl->dfl_exts + start_idx; + size_t actual_n = n; + int r = 0; + + if (n == 0) { + return (0); + } + + /* + * Ignore any zero length extents. No known devices attach any + * semantic meaning to such extents, and are likely just a result of + * narrowing the range of the extent to fit the device alignment + * requirements. It is possible the original caller submitted a + * zero length extent, but we ignore those as well. Since we can't + * communicate partial results back to the caller anyway, it's + * unclear whether reporting that one of potentially many exents was + * too small (without being able to identify which one) to the caller + * of the DKIOCFREE request would be useful. + */ + for (uint64_t i = 0; i < n; i++) { + if (exts[i].dfle_length == 0 && --actual_n == 0) { + return (0); + } + } + + new_dfl = kmem_zalloc(DFL_SZ(actual_n), kmflag); + if (new_dfl == NULL) { + return (SET_ERROR(ENOMEM)); + } + + new_dfl->dfl_flags = dfl->dfl_flags; + new_dfl->dfl_num_exts = actual_n; + new_dfl->dfl_offset = dfl->dfl_offset; + new_exts = new_dfl->dfl_exts; + + for (uint64_t i = 0; i < n; i++) { + if (exts[i].dfle_length == 0) { + continue; + } + + *new_exts++ = exts[i]; + } + + return (func(new_dfl, arg, kmflag)); +} + +/* + * If dfi_max_ext_bytes is set, use as the max segment length, + * otherwise use dfi_max_bytes if set, otherwise fallback to UINT64_MAX + */ +#define MAX_SEGLEN(dfi) \ + (((dfi)->dfi_max_ext_bytes > 0) ? (dfi)->dfi_max_ext_bytes : \ + ((dfi)->dfi_max_bytes > 0) ? (dfi)->dfi_max_bytes : UINT64_MAX) + +/* + * Split the extent at idx into multiple lists (calling func for each one). 
+ */ +static int +split_extent(dkioc_free_list_t *dfl, const dkioc_free_info_t *dfi, uint64_t idx, + dfl_iter_fn_t func, void *arg, int kmflag) +{ + ASSERT3U(idx, <, dfl->dfl_num_exts); + + const uint64_t maxlen = MAX_SEGLEN(dfi); + dkioc_free_list_ext_t *ext = dfl->dfl_exts + idx; + uint64_t remain = ext->dfle_length; + int r; + + /* + * Break the extent into as many single requests as needed. While it + * would be possible in some circumstances to combine the final chunk + * of the extent (after splitting) with the remaining extents in the + * original request, it's not clear there's much benefit from the + * added complexity. Such behavior could be added in the future if + * it's determined to be worthwhile. + */ + while (remain > 0) { + uint64_t start = dfl->dfl_offset + ext->dfle_start; + uint64_t len = remain; + + /* + * If we know we have at least one more segment left after + * the current iteration of this loop, split it so that + * the next segment starts on an aligned boundary. + */ + if (len > maxlen) { + uint64_t end = P2ALIGN(start + maxlen, dfi->dfi_align); + len = end - start; + } + + ext->dfle_length = len; + + if ((r = process_range(dfl, idx, 1, func, arg, kmflag)) != 0) { + return (r); + } + + ext->dfle_start += len; + remain -= len; + } + + return (0); +} diff --git a/usr/src/uts/common/os/driver_lyr.c b/usr/src/uts/common/os/driver_lyr.c index 9e5eb33dd6..d64342738b 100644 --- a/usr/src/uts/common/os/driver_lyr.c +++ b/usr/src/uts/common/os/driver_lyr.c @@ -1131,7 +1131,7 @@ ldi_usage_walker_helper(struct ldi_ident *lip, vnode_t *vp, else major = lip->li_major; - ASSERT((major >= 0) && (major < devcnt)); + ASSERT3U(major, <, devcnt); dnp = &devnamesp[major]; LOCK_DEV_OPS(&dnp->dn_lock); @@ -1258,7 +1258,7 @@ ldi_mlink_lh(vnode_t *vp, int cmd, intptr_t arg, cred_t *crp, int *rvalp) * in its internal state so that the devinfo snapshot code has some * observability into streams device linkage information. */ -void +int ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) { vnode_t *vp = fpdown->f_vnode; @@ -1267,9 +1267,13 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) major_t major; int ret; - /* if the lower stream is not a device then return */ + /* + * If the lower stream is not a device then return but claim to have + * succeeded, which matches our historical behaviour of just not + * setting up LDI in this case. + */ if (!vn_matchops(vp, spec_getvnodeops())) - return; + return (0); ASSERT(!servicing_interrupt()); @@ -1280,6 +1284,41 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) sp = VTOS(vp); csp = VTOS(sp->s_commonvp); + /* get a layered ident for the upper stream */ + if (type == LINKNORMAL) { + /* + * if the link is not persistant then we can associate + * the upper stream with a dev_t. this is because the + * upper stream is associated with a vnode, which is + * associated with a dev_t and this binding can't change + * during the life of the stream. since the link isn't + * persistant once the stream is destroyed the link is + * destroyed. so the dev_t will be valid for the life + * of the link. + */ + ret = ldi_ident_from_stream(getendq(stp->sd_wrq), &li); + } else { + /* + * if the link is persistant we can only associate the + * link with a driver (and not a dev_t.) this is + * because subsequent opens of the upper device may result + * in a different stream (and dev_t) having access to + * the lower stream. 
+ * + * for example, if the upper stream is closed after the + * persistant link operation is completed, a subsequent + * open of the upper device will create a new stream which + * may have a different dev_t and an unlink operation + * can be performed using this new upper stream. + */ + VERIFY3S(type, ==, LINKPERSIST); + major = getmajor(stp->sd_vnode->v_rdev); + ret = ldi_ident_from_major(major, &li); + } + + if (ret != 0) + return (ret); + /* check if this was a plink via a layered handle */ if (lhlink) { /* @@ -1303,8 +1342,10 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) * while there may still be valid layered handles * pointing to it. */ + VERIFY3S(type, ==, LINKPERSIST); + mutex_enter(&csp->s_lock); - ASSERT(csp->s_count >= 1); + VERIFY(csp->s_count >= 1); csp->s_count++; mutex_exit(&csp->s_lock); @@ -1330,48 +1371,17 @@ ldi_mlink_fp(struct stdata *stp, file_t *fpdown, int lhlink, int type) * mark the snode/stream as multiplexed */ mutex_enter(&sp->s_lock); - ASSERT(!(sp->s_flag & SMUXED)); + VERIFY(!(sp->s_flag & SMUXED)); sp->s_flag |= SMUXED; mutex_exit(&sp->s_lock); - /* get a layered ident for the upper stream */ - if (type == LINKNORMAL) { - /* - * if the link is not persistant then we can associate - * the upper stream with a dev_t. this is because the - * upper stream is associated with a vnode, which is - * associated with a dev_t and this binding can't change - * during the life of the stream. since the link isn't - * persistant once the stream is destroyed the link is - * destroyed. so the dev_t will be valid for the life - * of the link. - */ - ret = ldi_ident_from_stream(getendq(stp->sd_wrq), &li); - } else { - /* - * if the link is persistant we can only associate the - * link with a driver (and not a dev_t.) this is - * because subsequent opens of the upper device may result - * in a different stream (and dev_t) having access to - * the lower stream. - * - * for example, if the upper stream is closed after the - * persistant link operation is compleated, a subsequent - * open of the upper device will create a new stream which - * may have a different dev_t and an unlink operation - * can be performed using this new upper stream. - */ - ASSERT(type == LINKPERSIST); - major = getmajor(stp->sd_vnode->v_rdev); - ret = ldi_ident_from_major(major, &li); - } - - ASSERT(ret == 0); (void) handle_alloc(vp, (struct ldi_ident *)li); ldi_ident_release(li); + + return (0); } -void +int ldi_munlink_fp(struct stdata *stp, file_t *fpdown, int type) { struct ldi_handle *lhp; @@ -1381,31 +1391,21 @@ ldi_munlink_fp(struct stdata *stp, file_t *fpdown, int type) major_t major; int ret; - /* if the lower stream is not a device then return */ + /* + * If the lower stream is not a device then return but claim to have + * succeeded, which matches our historical behaviour of just not + * setting up LDI in this case. + */ if (!vn_matchops(vp, spec_getvnodeops())) - return; + return (0); ASSERT(!servicing_interrupt()); - ASSERT((type == LINKNORMAL) || (type == LINKPERSIST)); LDI_STREAMS_LNK((CE_NOTE, "%s: unlinking streams " "stp=0x%p, fpdown=0x%p", "ldi_munlink_fp", (void *)stp, (void *)fpdown)); /* - * NOTE: here we rely on the streams subsystem not allowing - * a stream to be multiplexed more than once. if this - * changes, we break. 
- * - * mark the snode/stream as not multiplexed - */ - sp = VTOS(vp); - mutex_enter(&sp->s_lock); - ASSERT(sp->s_flag & SMUXED); - sp->s_flag &= ~SMUXED; - mutex_exit(&sp->s_lock); - - /* * clear the owner for this snode * see the comment in ldi_mlink_fp() for information about how * the ident is allocated @@ -1413,15 +1413,32 @@ ldi_munlink_fp(struct stdata *stp, file_t *fpdown, int type) if (type == LINKNORMAL) { ret = ldi_ident_from_stream(getendq(stp->sd_wrq), &li); } else { - ASSERT(type == LINKPERSIST); + VERIFY3S(type, ==, LINKPERSIST); major = getmajor(stp->sd_vnode->v_rdev); ret = ldi_ident_from_major(major, &li); } - ASSERT(ret == 0); + if (ret != 0) + return (ret); + + /* + * NOTE: here we rely on the streams subsystem not allowing + * a stream to be multiplexed more than once. if this + * changes, we break. + * + * mark the snode/stream as not multiplexed + */ + sp = VTOS(vp); + mutex_enter(&sp->s_lock); + VERIFY(sp->s_flag & SMUXED); + sp->s_flag &= ~SMUXED; + mutex_exit(&sp->s_lock); + lhp = handle_find(vp, (struct ldi_ident *)li); handle_release(lhp); ldi_ident_release(li); + + return (0); } /* diff --git a/usr/src/uts/common/os/errorq.c b/usr/src/uts/common/os/errorq.c index 8b41e7e8c1..cd71b9be08 100644 --- a/usr/src/uts/common/os/errorq.c +++ b/usr/src/uts/common/os/errorq.c @@ -946,7 +946,7 @@ errorq_cancel(errorq_t *eqp, errorq_elem_t *eqep) /* * Write elements on the dump list of each nvlist errorq to the dump device. - * Upon reboot, fmd(1M) will extract and replay them for diagnosis. + * Upon reboot, fmd(8) will extract and replay them for diagnosis. */ void errorq_dump(void) diff --git a/usr/src/uts/common/os/exacct.c b/usr/src/uts/common/os/exacct.c index c9214cec84..1051c037fa 100644 --- a/usr/src/uts/common/os/exacct.c +++ b/usr/src/uts/common/os/exacct.c @@ -1508,10 +1508,8 @@ exacct_attach_flow_item(flow_usage_t *fu, ea_object_t *record, int res) } break; case AC_FLOW_UID: - if (fu->fu_userid >= 0) { - (void) ea_attach_item(record, &fu->fu_userid, - sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID); - } + (void) ea_attach_item(record, &fu->fu_userid, + sizeof (uint32_t), EXT_UINT32 | EXD_FLOW_UID); break; case AC_FLOW_ANAME: (void) ea_attach_item(record, fu->fu_aname, diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 06e0117cd6..7ccf9b3221 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -22,6 +22,8 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -71,6 +73,7 @@ #include <sys/pool.h> #include <sys/sdt.h> #include <sys/corectl.h> +#include <sys/core.h> #include <sys/brand.h> #include <sys/libc_kernel.h> @@ -163,7 +166,7 @@ restart_init_notify(zone_t *zone) * it failed. As long as the given zone is still in the "running" * state, we will re-exec() init, but first we need to reset things * which are usually inherited across exec() but will break init's - * assumption that it is being exec()'d from a virgin process. Most + * assumption that it is being exec()'d from a virgin process. 
Most * importantly this includes closing all file descriptors (exec only * closes those marked close-on-exec) and resetting signals (exec only * resets handled signals, and we need to clear any signals which @@ -176,6 +179,7 @@ restart_init(int what, int why) kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); proc_t *p = ttoproc(t); + proc_t *pp = p->p_zone->zone_zsched; user_t *up = PTOU(p); vnode_t *oldcd, *oldrd; @@ -187,11 +191,11 @@ restart_init(int what, int why) * zone) know that init has failed and will be restarted. */ zcmn_err(p->p_zone->zone_id, CE_WARN, - "init(1M) %s: restarting automatically", + "init(8) %s: restarting automatically", exit_reason(reason_buf, sizeof (reason_buf), what, why)); if (!INGLOBALZONE(p)) { - cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: " + cmn_err(CE_WARN, "init(8) for zone %s (pid %d) %s: " "restarting automatically", p->p_zone->zone_name, p->p_pid, reason_buf); } @@ -206,7 +210,7 @@ restart_init(int what, int why) /* * Grab p_lock and begin clearing miscellaneous global process - * state that needs to be reset before we exec the new init(1M). + * state that needs to be reset before we exec the new init(8). */ mutex_enter(&p->p_lock); @@ -270,6 +274,11 @@ restart_init(int what, int why) up->u_cwd = NULL; } + /* Reset security flags */ + mutex_enter(&pp->p_lock); + p->p_secflags = pp->p_secflags; + mutex_exit(&pp->p_lock); + mutex_exit(&p->p_lock); if (oldrd != NULL) @@ -277,6 +286,23 @@ restart_init(int what, int why) if (oldcd != NULL) VN_RELE(oldcd); + /* + * It's possible that a zone's init will have become privilege aware + * and modified privilege sets; reset them. + */ + cred_t *oldcr, *newcr; + + mutex_enter(&p->p_crlock); + oldcr = p->p_cred; + mutex_enter(&pp->p_crlock); + crhold(newcr = p->p_cred = pp->p_cred); + mutex_exit(&pp->p_crlock); + mutex_exit(&p->p_crlock); + crfree(oldcr); + /* Additional hold for the current thread - expected by crset() */ + crhold(newcr); + crset(p, newcr); + /* Free the controlling tty. (freectty() always assumes curproc.) */ ASSERT(p == curproc); (void) freectty(B_TRUE); @@ -284,7 +310,7 @@ restart_init(int what, int why) restart_init_notify(p->p_zone); /* - * Now exec() the new init(1M) on top of the current process. If we + * Now exec() the new init(8) on top of the current process. If we * succeed, the caller will treat this like a successful system call. * If we fail, we issue messages and the caller will proceed with exit. */ @@ -294,11 +320,11 @@ restart_init(int what, int why) return (0); zcmn_err(p->p_zone->zone_id, CE_WARN, - "failed to restart init(1M) (err=%d): system reboot required", err); + "failed to restart init(8) (err=%d): system reboot required", err); if (!INGLOBALZONE(p)) { - cmn_err(CE_WARN, "failed to restart init(1M) for zone %s " - "(pid %d, err=%d): zoneadm(1M) boot required", + cmn_err(CE_WARN, "failed to restart init(8) for zone %s " + "(pid %d, err=%d): zoneadm(8) boot required", p->p_zone->zone_name, p->p_pid, err); } @@ -317,7 +343,7 @@ exit(int why, int what) /* * If proc_exit() fails, then some other lwp in the process * got there first. We just have to call lwp_exit() to allow - * the other lwp to finish exiting the process. Otherwise we're + * the other lwp to finish exiting the process. Otherwise we're * restarting init, and should return. */ if (proc_exit(why, what) != 0) { @@ -330,7 +356,7 @@ exit(int why, int what) /* * Set the SEXITING flag on the process, after making sure /proc does - * not have it locked. 
This is done in more places than proc_exit(), + * not have it locked. This is done in more places than proc_exit(), * so it is a separate function. */ void @@ -445,9 +471,9 @@ zone_init_exit(zone_t *z, int why, int what) } } - /* - * The restart failed, the zone will shut down. + * The restart failed, or the criteria for a restart are not met; + * the zone will shut down. */ z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); @@ -484,7 +510,7 @@ proc_exit(int why, int what) /* * Stop and discard the process's lwps except for the current one, - * unless some other lwp beat us to it. If exitlwps() fails then + * unless some other lwp beat us to it. If exitlwps() fails then * return and the calling lwp will call (or continue in) lwp_exit(). */ proc_is_exiting(p); @@ -502,6 +528,13 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); + /* + * Don't let init exit unless zone_start_init() failed its exec, or + * we are shutting down the zone or the machine. + * + * Since we are single threaded, we don't need to lock the + * following accesses to zone_proc_initpid. + */ if (p->p_pid == z->zone_proc_initpid) { /* If zone's init restarts, we're done here. */ if (zone_init_exit(z, why, what)) @@ -600,6 +633,14 @@ proc_exit(int why, int what) } /* + * If we had generated any upanic(2) state, free that now. + */ + if (p->p_upanic != NULL) { + kmem_free(p->p_upanic, PRUPANIC_BUFLEN); + p->p_upanic = NULL; + } + + /* * Remove any fpollinfo_t's for this (last) thread from our file * descriptors so closeall() can ASSERT() that they're all gone. */ @@ -971,7 +1012,7 @@ proc_exit(int why, int what) * curthread's proc pointer is changed to point to the 'sched' * process for the corresponding zone, except in the case when * the exiting process is in fact a zsched instance, in which - * case the proc pointer is set to p0. We do so, so that the + * case the proc pointer is set to p0. We do so, so that the * process still points at the right zone when we call the VN_RELE() * below. * @@ -1055,7 +1096,7 @@ proc_exit(int why, int what) /* * task_rele() may ultimately cause the zone to go away (or * may cause the last user process in a zone to go away, which - * signals zsched to go away). So prior to this call, we must + * signals zsched to go away). So prior to this call, we must * no longer point at zsched. */ t->t_procp = &p0; diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index ec89cb0657..f6179cf301 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -22,6 +22,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2017, Joyent Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -958,7 +959,22 @@ closef(file_t *fp) vp = fp->f_vnode; - error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); + /* + * The __FLXPATH flag is a private interface for use by the lx + * brand in order to emulate open(O_NOFOLLOW|O_PATH) which, + * when a symbolic link is encountered, returns a file + * descriptor which references it. 
+ * See uts/common/brand/lx/syscall/lx_open.c + * + * When this flag is set, VOP_OPEN() will not have been called when + * this file descriptor was opened, and VOP_CLOSE() should not be + * called here (for a symlink, most filesystems would return ENOSYS + * anyway) + */ + if (fp->f_flag2 & (__FLXPATH >> 16)) + error = 0; + else + error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); if (count > 1) { mutex_exit(&fp->f_tlock); @@ -1118,7 +1134,7 @@ falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp) mutex_enter(&fp->f_tlock); fp->f_count = 1; fp->f_flag = (ushort_t)flag; - fp->f_flag2 = (flag & (FSEARCH|FEXEC)) >> 16; + fp->f_flag2 = (flag & (FSEARCH|FEXEC|__FLXPATH)) >> 16; fp->f_vnode = vp; fp->f_offset = 0; fp->f_audit_data = 0; @@ -1585,7 +1601,9 @@ fsetattrat(int fd, char *path, int flags, struct vattr *vap) VN_HOLD(vp); } - if (vn_is_readonly(vp)) { + if (vp->v_type == VLNK && (vap->va_mask & AT_MODE) != 0) { + error = EOPNOTSUPP; + } else if (vn_is_readonly(vp)) { error = EROFS; } else { error = VOP_SETATTR(vp, vap, 0, CRED(), NULL); diff --git a/usr/src/uts/common/os/flock.c b/usr/src/uts/common/os/flock.c index 78907db25c..2d7849e30d 100644 --- a/usr/src/uts/common/os/flock.c +++ b/usr/src/uts/common/os/flock.c @@ -28,7 +28,7 @@ /* All Rights Reserved */ /* - * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. * Copyright 2015 Joyent, Inc. */ @@ -1122,8 +1122,8 @@ flk_process_request(lock_descriptor_t *request) } if (!request_blocked_by_active) { - lock_descriptor_t *lk[1]; - lock_descriptor_t *first_glock = NULL; + lock_descriptor_t *lk[1]; + lock_descriptor_t *first_glock = NULL; /* * Shall we grant this?! NO!! * What about those locks that were just granted and still @@ -2093,12 +2093,12 @@ flk_graph_uncolor(graph_t *gp) if (gp->mark == UINT_MAX) { gp->mark = 1; - for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp); - lock = lock->l_next) + for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp); + lock = lock->l_next) lock->l_color = 0; - for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp); - lock = lock->l_next) + for (lock = SLEEPING_HEAD(gp)->l_next; + lock != SLEEPING_HEAD(gp); lock = lock->l_next) lock->l_color = 0; } else { gp->mark++; @@ -4318,6 +4318,8 @@ nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset, lock->l_flock.l_pid != pid) && lock_blocks_io(op, offset, length, lock->l_type, lock->l_start, lock->l_end)) { + DTRACE_PROBE1(conflict_lock, + lock_descriptor_t *, lock); conflict = 1; break; } @@ -4467,34 +4469,34 @@ check_sleeping_locks(graph_t *gp) edge_t *ep; for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp); lock1 = lock1->l_next) { - ASSERT(!IS_BARRIER(lock1)); - for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp); - lock2 = lock2->l_next) { - if (lock1->l_vnode == lock2->l_vnode) { - if (BLOCKS(lock2, lock1)) { - ASSERT(!IS_GRANTED(lock1)); - ASSERT(!NOT_BLOCKED(lock1)); - path(lock1, lock2); + ASSERT(!IS_BARRIER(lock1)); + for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp); + lock2 = lock2->l_next) { + if (lock1->l_vnode == lock2->l_vnode) { + if (BLOCKS(lock2, lock1)) { + ASSERT(!IS_GRANTED(lock1)); + ASSERT(!NOT_BLOCKED(lock1)); + path(lock1, lock2); + } } } - } - for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp); - lock2 = lock2->l_next) { - ASSERT(!IS_BARRIER(lock1)); - if (lock1->l_vnode == lock2->l_vnode) { - if (BLOCKS(lock2, lock1)) { - ASSERT(!IS_GRANTED(lock1)); - ASSERT(!NOT_BLOCKED(lock1)); - 
path(lock1, lock2); + for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp); + lock2 = lock2->l_next) { + ASSERT(!IS_BARRIER(lock1)); + if (lock1->l_vnode == lock2->l_vnode) { + if (BLOCKS(lock2, lock1)) { + ASSERT(!IS_GRANTED(lock1)); + ASSERT(!NOT_BLOCKED(lock1)); + path(lock1, lock2); + } } } - } - ep = FIRST_ADJ(lock1); - while (ep != HEAD(lock1)) { - ASSERT(BLOCKS(ep->to_vertex, lock1)); - ep = NEXT_ADJ(ep); - } + ep = FIRST_ADJ(lock1); + while (ep != HEAD(lock1)) { + ASSERT(BLOCKS(ep->to_vertex, lock1)); + ep = NEXT_ADJ(ep); + } } } diff --git a/usr/src/uts/common/os/fm.c b/usr/src/uts/common/os/fm.c index 66fe699366..bd3e5dceac 100644 --- a/usr/src/uts/common/os/fm.c +++ b/usr/src/uts/common/os/fm.c @@ -336,6 +336,7 @@ fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) c = fm_printf(d + 1, c, cols, "[...]"); break; case DATA_TYPE_UNKNOWN: + case DATA_TYPE_DONTCARE: c = fm_printf(d + 1, c, cols, "<unknown>"); break; } @@ -363,7 +364,7 @@ fm_nvprint(nvlist_t *nvl) /* * Wrapper for panic() that first produces an FMA-style message for admins. - * Normally such messages are generated by fmd(1M)'s syslog-msgs agent: this + * Normally such messages are generated by fmd(8)'s syslog-msgs agent: this * is the one exception to that rule and the only error that gets messaged. * This function is intended for use by subsystems that have detected a fatal * error and enqueued appropriate ereports and wish to then force a panic. @@ -375,9 +376,9 @@ fm_panic(const char *format, ...) va_list ap; (void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format); -#if defined(__i386) || defined(__amd64) +#if defined(__x86) fastreboot_disable_highpil(); -#endif /* __i386 || __amd64 */ +#endif /* __x86 */ va_start(ap, format); vpanic(format, ap); va_end(ap); diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index 07fd623a95..6e2d3c403c 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -30,7 +30,7 @@ */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ #include <sys/types.h> #include <sys/inttypes.h> @@ -770,20 +770,11 @@ smmap_common(caddr_t *addrp, size_t len, } /* - * XXX - Do we also adjust maxprot based on protections - * of the vnode? E.g. if no execute permission is given - * on the vnode for the current user, maxprot probably - * should disallow PROT_EXEC also? This is different - * from the write access as this would be a per vnode - * test as opposed to a per fd test for writability. - */ - - /* - * Verify that the specified protections are not greater than - * the maximum allowable protections. Also test to make sure - * that the file descriptor does allows for read access since - * "write only" mappings are hard to do since normally we do - * the read from the file before the page can be written. + * Verify that the specified protections are not greater than the + * maximum allowable protections. Also test to make sure that the + * file descriptor allows for read access since "write only" mappings + * are hard to do since normally we do the read from the file before + * the page can be written. */ if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0) return (EACCES); diff --git a/usr/src/uts/common/os/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c index 0a237e86ec..51a93dfa24 100644 --- a/usr/src/uts/common/os/ip_cksum.c +++ b/usr/src/uts/common/os/ip_cksum.c @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. - * Copyright 2019 Joyent, Inc. + * Copyright 2021 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -40,8 +40,7 @@ #include <sys/multidata.h> #include <sys/multidata_impl.h> -extern unsigned int ip_ocsum(ushort_t *address, int halfword_count, - unsigned int sum); +extern unsigned int ip_ocsum(ushort_t *, int, unsigned int); /* * Checksum routine for Internet Protocol family headers. @@ -587,7 +586,8 @@ ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr, ip6_rthdr_t *rthdr; ip6_frag_t *fraghdr; - ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + if (IPH_HDR_VERSION(ip6h) != IPV6_VERSION) + return (B_FALSE); length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ endptr = mp->b_wptr; diff --git a/usr/src/uts/common/os/kcpc.c b/usr/src/uts/common/os/kcpc.c index 977d243400..27e30a5725 100644 --- a/usr/src/uts/common/os/kcpc.c +++ b/usr/src/uts/common/os/kcpc.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2021 Joyent, Inc. + * Copyright 2021 Oxide Computer Company */ #include <sys/param.h> @@ -74,9 +76,10 @@ static uint32_t kcpc_nullctx_count; /* # overflows in a thread with no ctx */ */ static int kcpc_nullctx_panic = 0; -static void kcpc_lwp_create(kthread_t *t, kthread_t *ct); -static void kcpc_restore(kcpc_ctx_t *ctx); -static void kcpc_save(kcpc_ctx_t *ctx); +static void kcpc_save(void *); +static void kcpc_restore(void *); +static void kcpc_lwp_create(void *, void *); +static void kcpc_free(void *, int); static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx); static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch); static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set); @@ -111,6 +114,14 @@ extern int kcpc_hw_load_pcbe(void); */ static int kcpc_pcbe_error = 0; +static const struct ctxop_template kcpc_ctxop_tpl = { + .ct_rev = CTXOP_TPL_REV, + .ct_save = kcpc_save, + .ct_restore = kcpc_restore, + .ct_lwp_create = kcpc_lwp_create, + .ct_free = kcpc_free, +}; + /* * Perform one-time initialization of kcpc framework. * This function performs the initialization only the first time it is called. @@ -317,8 +328,7 @@ kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode) /* * Add a device context to the subject thread. */ - installctx(t, ctx, kcpc_save, kcpc_restore, NULL, - kcpc_lwp_create, NULL, kcpc_free); + ctxop_install(t, &kcpc_ctxop_tpl, ctx); /* * Ask the backend to program the hardware. @@ -546,7 +556,7 @@ kcpc_unbind(kcpc_set_t *set) t = ctx->kc_thread; /* * The context is thread-bound and therefore has a device - * context. It will be freed via removectx() calling + * context. It will be freed via ctxop_remove() calling * freectx() calling kcpc_free(). */ if (t == curthread) { @@ -559,15 +569,7 @@ kcpc_unbind(kcpc_set_t *set) splx(save_spl); kpreempt_enable(); } -#ifdef DEBUG - if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL, - kcpc_lwp_create, NULL, kcpc_free) == 0) - panic("kcpc_unbind: context %p not preset on thread %p", - (void *)ctx, (void *)t); -#else - (void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL, - kcpc_lwp_create, NULL, kcpc_free); -#endif /* DEBUG */ + VERIFY3U(ctxop_remove(t, &kcpc_ctxop_tpl, ctx), !=, 0); t->t_cpc_set = NULL; t->t_cpc_ctx = NULL; } else { @@ -1214,8 +1216,9 @@ kcpc_overflow_ast() * Called when switching away from current thread. 
*/ static void -kcpc_save(kcpc_ctx_t *ctx) +kcpc_save(void *arg) { + kcpc_ctx_t *ctx = arg; int err; int save_spl; @@ -1263,8 +1266,9 @@ kcpc_save(kcpc_ctx_t *ctx) } static void -kcpc_restore(kcpc_ctx_t *ctx) +kcpc_restore(void *arg) { + kcpc_ctx_t *ctx = arg; int save_spl; mutex_enter(&ctx->kc_lock); @@ -1323,9 +1327,11 @@ kcpc_restore(kcpc_ctx_t *ctx) * it is switched off. */ /*ARGSUSED*/ -void -kcpc_idle_save(struct cpu *cp) +static void +kcpc_idle_save(void *arg) { + struct cpu *cp = arg; + /* * The idle thread shouldn't be run anywhere else. */ @@ -1347,9 +1353,11 @@ kcpc_idle_save(struct cpu *cp) mutex_exit(&cp->cpu_cpc_ctxlock); } -void -kcpc_idle_restore(struct cpu *cp) +static void +kcpc_idle_restore(void *arg) { + struct cpu *cp = arg; + /* * The idle thread shouldn't be run anywhere else. */ @@ -1371,10 +1379,23 @@ kcpc_idle_restore(struct cpu *cp) mutex_exit(&cp->cpu_cpc_ctxlock); } +static const struct ctxop_template kcpc_idle_ctxop_tpl = { + .ct_rev = CTXOP_TPL_REV, + .ct_save = kcpc_idle_save, + .ct_restore = kcpc_idle_restore, +}; + +void +kcpc_idle_ctxop_install(kthread_t *t, struct cpu *cp) +{ + ctxop_install(t, &kcpc_idle_ctxop_tpl, cp); +} + /*ARGSUSED*/ static void -kcpc_lwp_create(kthread_t *t, kthread_t *ct) +kcpc_lwp_create(void *parent, void *child) { + kthread_t *t = parent, *ct = child; kcpc_ctx_t *ctx = t->t_cpc_ctx, *cctx; int i; @@ -1423,8 +1444,7 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) aston(ct); } - installctx(ct, cctx, kcpc_save, kcpc_restore, - NULL, kcpc_lwp_create, NULL, kcpc_free); + ctxop_install(ct, &kcpc_ctxop_tpl, cctx); } /* @@ -1461,8 +1481,9 @@ kcpc_lwp_create(kthread_t *t, kthread_t *ct) /*ARGSUSED*/ void -kcpc_free(kcpc_ctx_t *ctx, int isexec) +kcpc_free(void *arg, int isexec) { + kcpc_ctx_t *ctx = arg; int i; kcpc_set_t *set = ctx->kc_set; @@ -1543,6 +1564,12 @@ kcpc_free(kcpc_ctx_t *ctx, int isexec) kcpc_free_set(set); } +void +kcpc_free_cpu(kcpc_ctx_t *ctx) +{ + kcpc_free(ctx, 0); +} + /* * Free the memory associated with a request set. */ diff --git a/usr/src/uts/common/os/klpd.c b/usr/src/uts/common/os/klpd.c index 8592b47021..0879f791b5 100644 --- a/usr/src/uts/common/os/klpd.c +++ b/usr/src/uts/common/os/klpd.c @@ -1150,7 +1150,7 @@ check_user_privs(const cred_t *cr, const priv_set_t *set) out: if (da.rbuf != (char *)&res) kmem_free(da.rbuf, da.rsize); -out1: + kmem_free(pap, pasize); klpd_rele(pfd); return (err); diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index d12928acc3..4d2c1e6c10 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2018, Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ /* @@ -2250,7 +2251,7 @@ kmem_dumppr(char **pp, char *e, const char *format, ...) } /* - * Called when dumpadm(1M) configures dump parameters. + * Called when dumpadm(8) configures dump parameters. */ void kmem_dump_init(size_t size) @@ -4462,8 +4463,7 @@ kmem_init(void) if (((kmem_flags & ~(KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS | KMF_LITE)) != 0) || ((kmem_flags & KMF_LITE) && kmem_flags != KMF_LITE)) - cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x. 
" - "See the Solaris Tunable Parameters Reference Manual.", + cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x.", kmem_flags); #ifdef DEBUG @@ -4481,8 +4481,7 @@ kmem_init(void) (kmem_flags & (KMF_AUDIT | KMF_DEADBEEF)) != 0) cmn_err(CE_WARN, "High-overhead kmem debugging features " "enabled (kmem_flags = 0x%x). Performance degradation " - "and large memory overhead possible. See the Solaris " - "Tunable Parameters Reference Manual.", kmem_flags); + "and large memory overhead possible.", kmem_flags); #endif /* not DEBUG */ kmem_cache_applyall(kmem_cache_magazine_enable, NULL, TQ_SLEEP); @@ -4530,8 +4529,21 @@ void kmem_thread_init(void) { kmem_move_init(); + + /* + * This taskq is used for various kmem maintenance functions, including + * kmem_reap(). When maintenance is required on every cache, + * kmem_cache_applyall() dispatches one task per cache onto this queue. + * + * In the case of kmem_reap(), the system may be under increasingly + * dire memory pressure and may not be able to allocate a new task + * entry. The count of entries to prepopulate (below) should cover at + * least as many caches as we generally expect to exist on the system + * so that they may all be scheduled for reaping under those + * conditions. + */ kmem_taskq = taskq_create_instance("kmem_taskq", 0, 1, minclsyspri, - 300, INT_MAX, TASKQ_PREPOPULATE); + 600, INT_MAX, TASKQ_PREPOPULATE); } void @@ -5351,7 +5363,7 @@ kmem_cache_scan(kmem_cache_t *cp) } if (kmem_cache_is_fragmented(cp, &reap)) { - size_t slabs_found; + int slabs_found; /* * Consolidate reclaimable slabs from the end of the partial diff --git a/usr/src/uts/common/os/ksensor.c b/usr/src/uts/common/os/ksensor.c new file mode 100644 index 0000000000..7dd4a22c8a --- /dev/null +++ b/usr/src/uts/common/os/ksensor.c @@ -0,0 +1,871 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Oxide Computer Company + */ + +/* + * Kernel Sensor Framework + * + * The kernel sensor framework exists to provide a simple and straightforward + * means for various parts of the system to declare and instantiate sensor + * information. Between this and the ksensor character device + * (uts/common/io/ksensor/ksensor_drv.c) this exposes per-device sensors and + * character devices. + * + * -------------------------- + * Driver and User Interfaces + * -------------------------- + * + * Each sensor that is registered with the framework is exposed as a character + * device under /dev/sensors. The device class and node name are often ':' + * delineated and must begin with 'ddi_sensor'. Everything after 'ddi_sensor' + * will be created in a directory under /dev/sensors. So for example the Intel + * PCH driver uses a class "ddi_sensor:temperature:pch" and a node name of + * 'ts.%d'. This creates the node /dev/sensors/temperature/pch/ts.0. The + * devfsadm plugin automatically handles the creation of directories which makes + * the addition of additional sensor types easy to create. + * + * Strictly speaking, any device can manage their own sensors and minor nodes by + * using the appropriate class and implementing the corresponding ioctls. 
That
+ * was how the first kernel sensors were written; however, there are a number
+ * of issues with that approach which led to this design:
+ *
+ * 1. Every driver had to actually implement character devices.
+ *
+ * 2. Every driver had to duplicate a lot of the logic around open(9E),
+ *    close(9E), and ioctl(9E).
+ *
+ * 3. Drivers that tied into frameworks like mac(9E) or SCSAv3 needed a lot
+ *    more work to fit into this model. For example, because the minor state
+ *    is shared between all the instances and the frameworks, they would have
+ *    required shared, global state that they don't have today.
+ *
+ * Ultimately, having an operations vector and a callback argument makes work a
+ * lot simpler for the producers of sensor data, and that simplicity makes it
+ * worthwhile to take on the additional effort here.
+ *
+ * ----------
+ * Components
+ * ----------
+ *
+ * The ksensor framework is made up of a couple of different pieces:
+ *
+ * 1. This glue that is a part of genunix.
+ * 2. The ksensor character device driver.
+ * 3. Sensor providers, which are generally drivers that register with the
+ *    ksensor framework.
+ *
+ * The implementation of (1) is all in this file. The implementation of (2) is
+ * in uts/common/io/ksensor/ksensor_drv.c. The implementation of (3) is found in
+ * all of the different leaf devices. Examples of (3) include pchtemp(4D) and
+ * igb(4D).
+ *
+ * We separate numbers one and two into two different components for a few
+ * reasons. The most important one is that drivers that provide sensors should
+ * not depend on some other part of the system having been loaded. This makes a
+ * compelling argument for it being a part of the core kernel. However, like
+ * other subsystems (e.g. kstats, smbios, etc.), it's useful to separate the
+ * thing that provides the interface to users from the thing that is used to
+ * glue together providers in the kernel. There's the added benefit that it's
+ * practically simpler to spin up a pseudo-device through a module.
+ *
+ * The ksensor character device driver (2) registers with the main genunix
+ * ksensor code (1) when it attaches and when it detaches. The kernel only
+ * allows a single driver to be attached to it. When that character device
+ * driver attaches, the ksensor framework will walk through all of the currently
+ * registered sensors and inform the character device driver of the nodes that
+ * it needs to create. While the character device driver is attached, the
+ * ksensor framework will also call back into it when a sensor needs to be
+ * removed.
+ *
+ * Generally speaking, this division of responsibilities allows the kernel
+ * sensor character device driver to attach and detach without impacting the
+ * sensor providers or even notifying them; it's all transparent to them.
+ *
+ * ------------------------------
+ * Sensor Lifetime and detach(9E)
+ * ------------------------------
+ *
+ * Traditionally, a device driver may be detached by the broader kernel whenever
+ * the kernel desires it. On debug builds this is done by a dedicated thread. On
+ * a non-debug build this may happen due to memory pressure or as an attempt to
+ * reclaim idle resources (though this is much less common). However, when the
+ * module is detached, the system remembers that minor nodes previously existed
+ * and that entries in /devices had been created. When something accesses an
+ * entry in /devices again, the system will use that to bring the driver
+ * back to life.
It doesn't matter whether it's a pseudo-device driver or + * something else, this can happen. + * + * One downside to the sensor framework, is that we need to emulate this + * behavior which leads to some amount of complexity here. But this is a + * worthwhile tradeoff as it makes things much simpler for providers and it's + * not too hard for us to emulate this behavior. + * + * When a sensor provider registers the sensor, the sensor becomes available to + * the system. When the sensor provider unregisters with the system, which + * happens during its detach routine, then we note that it has been detached; + * however, we don't delete its minor node and if something accesses it, we + * attempt to load the driver again, the same way that devfs (the file system + * behind /devices) does. + * + * For each dev_info_t that registers a sensor we register a callback such that + * when the device is removed, e.g. someone called rem_drv or physically pulls + * the device, then we'll be able to finally clean up the device. This lifetime + * can be represented in the following image: + * + * | + * | + * +-----<-------------------------------------+ + * | | + * | . . call ksensor_create() | + * v | + * +-------+ | + * | Valid | | + * +-------+ | + * | ^ + * | . . call ksensor_remove() | + * v | + * +---------+ | + * | Invalid | | + * +---------+ | + * | | | + * | | . . user uses sensor again | + * | | | + * | +-------------------+ | + * | | | + * | v | + * | +---------------+ | + * | | Attatching... |-->---------+ + * | +---------------+ + * | . . ddi unbind cb | + * | | + * v | . . attatch fails or + * +---------+ | no call to ksensor_create() + * | Deleted |--<---------------+ again + * +---------+ + * + * When the DDI unbind callback is called, we know that the device is going to + * be removed. However, this happens within a subtle context with a majority of + * the device tree held (at least the dip's parent). In particular, another + * thread may be trying to obtain a hold on it and be blocked in + * ndi_devi_enter(). As the callback thread holds that, that could lead to a + * deadlock. As a result, we clean things up in two phases. One during the + * synchronous callback and the other via a taskq. In the first phase we + * logically do the following: + * + * o Remove the dip from the list of ksensor dips and set the flag that + * indicates that it's been removed. + * o Remove all of the sensors from the global avl to make sure that new + * threads cannot look it up. + * + * Then, after the taskq is dispatched, we do the following in taskq context: + * + * o Tell the ksensor driver that it should remove the minor node. + * o Block on each sensor until it is no-longer busy and then clean it up. + * o Clean up the ksensor_dip_t. + * + * ------------------ + * Accessing a Sensor + * ------------------ + * + * Access to a particular sensor is serialized in the system. In addition to + * that, a number of steps are required to access one that is not unlike + * accessing a character device. When a given sensor is held the KSENSOR_F_BUSY + * flag is set in the ksensor_flags member. In addition, as part of taking a + * hold a number of side effects occur that ensure that the sensor provider's + * dev_info_t is considered busy and can't be detached. + * + * To obtain a hold on a sensor the following logical steps are required (see + * ksensor_hold_by_id() for the implementation): + * + * 1. Map the minor to the ksensor_t via the avl tree + * 2. Check that the ksensor's dip is valid + * 3. 
If the sensor is busy, wait until it is no longer so, and restart from
+ *    the top. Otherwise, mark the sensor as busy.
+ * 4. Enter the parent and place a hold on the sensor provider's dip.
+ * 5. Once again check if the dip is removed or not because we have to drop
+ *    locks during that operation.
+ * 6. Check if the ksensor has the valid flag set. If not, attempt to configure
+ *    the dip.
+ * 7. Assuming the sensor is now valid, we can return it.
+ *
+ * After this point, the sensor is considered valid for use. Once the consumer
+ * is finished with the sensor, it should be released by calling
+ * ksensor_release().
+ *
+ * An important aspect of the above scheme is that the KSENSOR_F_BUSY flag is
+ * required to progress through the validation and holding of the device. This
+ * makes sure that only one thread is attempting to attach it at a given time. A
+ * reasonable future optimization would be to amortize this cost in open(9E)
+ * and close(9E) of the minor and to bump a count while it is being referenced,
+ * for as long as it is open.
+ *
+ * -----------------------------
+ * Character Device Registration
+ * -----------------------------
+ *
+ * The 'ksensor' character device driver can come and go. To support this, the
+ * ksensor framework communicates with the ksensor character device through a
+ * well-defined set of callbacks, used to indicate sensor addition and removal.
+ * The ksensor character device is found in uts/common/io/ksensor/ksensor_drv.c
+ * and is responsible for creating and destroying minor nodes.
+ *
+ * Each ksensor_t has a flag, KSENSOR_F_NOTIFIED, that is used to indicate
+ * whether or not the registered driver has been notified of the sensor. When a
+ * callback is first registered, we'll walk through the entire list of nodes to
+ * make sure that its minor has been created. When unregistering, the minor node
+ * remove callback will not be called; however, this can generally be dealt with
+ * by calling something like ddi_remove_minor_node(dip, NULL).
+ *
+ * -------
+ * Locking
+ * -------
+ *
+ * The following rules apply to dealing with lock ordering:
+ *
+ * 1. The global ksensor_g_mutex protects all global data and must be taken
+ *    before a ksensor_t's individual mutex.
+ *
+ * 2. A thread should not hold any two ksensor_t mutexes at any time.
+ *
+ * 3. No locks should be held when attempting to grab or manipulate a
+ *    dev_info_t, e.g. ndi_devi_enter().
+ *
+ * 4. Unless the ksensor is actively being held, whenever a ksensor is found,
+ *    one must check whether the ksensor_dip_t flag KSENSOR_DIP_F_REMOVED is
+ *    set or not and whether the ksensor_t's KSENSOR_F_VALID flag is set.
+ */
+
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/ddi.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/esunddi.h>
+#include <sys/ksensor_impl.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/pci.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/stddef.h>
+#include <sys/sysmacros.h>
+#include <sys/fs/dv_node.h>
+
+typedef enum {
+	/*
+	 * This flag indicates that the subscribing ksensor character device has
+	 * been notified about this sensor.
+	 */
+	KSENSOR_F_NOTIFIED = 1 << 0,
+	/*
+	 * This indicates that the sensor is currently valid, meaning that the
+	 * ops vector and argument are safe to use. This is removed when a
+	 * driver with a sensor is detached.
+ */ + KSENSOR_F_VALID = 1 << 1, + /* + * Indicates that a client has a hold on the sensor for some purpose. + * This must be set before trying to get an NDI hold. Once this is set + * and a NDI hold is in place, it is safe to use the operations vector + * and argument. + */ + KSENSOR_F_BUSY = 1 << 2, +} ksensor_flags_t; + +typedef enum { + KSENSOR_DIP_F_REMOVED = 1 << 0 +} ksensor_dip_flags_t; + +typedef struct { + list_node_t ksdip_link; + ksensor_dip_flags_t ksdip_flags; + dev_info_t *ksdip_dip; + ddi_unbind_callback_t ksdip_cb; + list_t ksdip_sensors; +} ksensor_dip_t; + +typedef struct { + kmutex_t ksensor_mutex; + kcondvar_t ksensor_cv; + ksensor_flags_t ksensor_flags; + list_node_t ksensor_dip_list; + avl_node_t ksensor_id_avl; + uint_t ksensor_nwaiters; + ksensor_dip_t *ksensor_ksdip; + char *ksensor_name; + char *ksensor_class; + id_t ksensor_id; + const ksensor_ops_t *ksensor_ops; + void *ksensor_arg; +} ksensor_t; + +static kmutex_t ksensor_g_mutex; +static id_space_t *ksensor_ids; +static list_t ksensor_dips; +static avl_tree_t ksensor_avl; +static dev_info_t *ksensor_cb_dip; +static ksensor_create_f ksensor_cb_create; +static ksensor_remove_f ksensor_cb_remove; + +static int +ksensor_avl_compare(const void *l, const void *r) +{ + const ksensor_t *kl = l; + const ksensor_t *kr = r; + + if (kl->ksensor_id > kr->ksensor_id) { + return (1); + } else if (kl->ksensor_id < kr->ksensor_id) { + return (-1); + } else { + return (0); + } +} + +static ksensor_t * +ksensor_find_by_id(id_t id) +{ + ksensor_t k, *ret; + + ASSERT(MUTEX_HELD(&ksensor_g_mutex)); + + k.ksensor_id = id; + return (avl_find(&ksensor_avl, &k, NULL)); + +} + +static ksensor_t * +ksensor_search_ksdip(ksensor_dip_t *ksdip, const char *name, const char *class) +{ + ksensor_t *s; + + ASSERT(MUTEX_HELD(&ksensor_g_mutex)); + + for (s = list_head(&ksdip->ksdip_sensors); s != NULL; + s = list_next(&ksdip->ksdip_sensors, s)) { + if (strcmp(s->ksensor_name, name) == 0 && + strcmp(s->ksensor_class, class) == 0) { + return (s); + } + } + + return (NULL); +} + +static void +ksensor_free_sensor(ksensor_t *sensor) +{ + strfree(sensor->ksensor_name); + strfree(sensor->ksensor_class); + id_free(ksensor_ids, sensor->ksensor_id); + mutex_destroy(&sensor->ksensor_mutex); + kmem_free(sensor, sizeof (ksensor_t)); +} + +static void +ksensor_free_dip(ksensor_dip_t *ksdip) +{ + list_destroy(&ksdip->ksdip_sensors); + kmem_free(ksdip, sizeof (ksensor_dip_t)); +} + +static void +ksensor_dip_unbind_taskq(void *arg) +{ + ksensor_dip_t *k = arg; + ksensor_t *sensor; + + /* + * First notify an attached driver that the nodes are going away + * before we block and wait on them. + */ + mutex_enter(&ksensor_g_mutex); + for (sensor = list_head(&k->ksdip_sensors); sensor != NULL; + sensor = list_next(&k->ksdip_sensors, sensor)) { + mutex_enter(&sensor->ksensor_mutex); + if (sensor->ksensor_flags & KSENSOR_F_NOTIFIED) { + ksensor_cb_remove(sensor->ksensor_id, + sensor->ksensor_name); + sensor->ksensor_flags &= ~KSENSOR_F_NOTIFIED; + } + mutex_exit(&sensor->ksensor_mutex); + } + mutex_exit(&ksensor_g_mutex); + + /* + * Now that the driver has destroyed its minor, wait for anything that's + * still there. 
+ */ + while ((sensor = list_remove_head(&k->ksdip_sensors)) != NULL) { + mutex_enter(&sensor->ksensor_mutex); + while ((sensor->ksensor_flags & KSENSOR_F_BUSY) != 0 || + sensor->ksensor_nwaiters > 0) { + cv_wait(&sensor->ksensor_cv, &sensor->ksensor_mutex); + } + mutex_exit(&sensor->ksensor_mutex); + ksensor_free_sensor(sensor); + } + ksensor_free_dip(k); +} + +static void +ksensor_dip_unbind_cb(void *arg, dev_info_t *dip) +{ + ksensor_dip_t *k = arg; + ksensor_t *sensor; + + /* + * Remove the dip and the associated sensors from global visibility. + * This will ensure that no new clients can find this; however, others + * may have extent attempts to grab it (but lost the race in an NDI + * hold). + */ + mutex_enter(&ksensor_g_mutex); + list_remove(&ksensor_dips, k); + k->ksdip_flags |= KSENSOR_DIP_F_REMOVED; + for (sensor = list_head(&k->ksdip_sensors); sensor != NULL; + sensor = list_next(&k->ksdip_sensors, sensor)) { + avl_remove(&ksensor_avl, sensor); + } + mutex_exit(&ksensor_g_mutex); + + (void) taskq_dispatch(system_taskq, ksensor_dip_unbind_taskq, k, + TQ_SLEEP); +} + +static ksensor_dip_t * +ksensor_dip_create(dev_info_t *dip) +{ + ksensor_dip_t *k; + + k = kmem_zalloc(sizeof (ksensor_dip_t), KM_SLEEP); + k->ksdip_dip = dip; + k->ksdip_cb.ddiub_cb = ksensor_dip_unbind_cb; + k->ksdip_cb.ddiub_arg = k; + list_create(&k->ksdip_sensors, sizeof (ksensor_t), + offsetof(ksensor_t, ksensor_dip_list)); + e_ddi_register_unbind_callback(dip, &k->ksdip_cb); + + return (k); +} + +static ksensor_dip_t * +ksensor_dip_find(dev_info_t *dip) +{ + ksensor_dip_t *k; + + ASSERT(MUTEX_HELD(&ksensor_g_mutex)); + for (k = list_head(&ksensor_dips); k != NULL; + k = list_next(&ksensor_dips, k)) { + if (dip == k->ksdip_dip) { + return (k); + } + } + + return (NULL); +} + +int +ksensor_create(dev_info_t *dip, const ksensor_ops_t *ops, void *arg, + const char *name, const char *class, id_t *idp) +{ + ksensor_dip_t *ksdip; + ksensor_t *sensor; + + if (dip == NULL || ops == NULL || name == NULL || class == NULL || + idp == NULL) { + return (EINVAL); + } + + if (!DEVI_IS_ATTACHING(dip)) { + return (EAGAIN); + } + + mutex_enter(&ksensor_g_mutex); + ksdip = ksensor_dip_find(dip); + if (ksdip == NULL) { + ksdip = ksensor_dip_create(dip); + list_insert_tail(&ksensor_dips, ksdip); + } + + sensor = ksensor_search_ksdip(ksdip, name, class); + if (sensor != NULL) { + ASSERT3P(sensor->ksensor_ksdip, ==, ksdip); + if ((sensor->ksensor_flags & KSENSOR_F_VALID) != 0) { + mutex_exit(&ksensor_g_mutex); + dev_err(dip, CE_WARN, "tried to create sensor %s:%s " + "which is currently active", class, name); + return (EEXIST); + } + + sensor->ksensor_ops = ops; + sensor->ksensor_arg = arg; + } else { + sensor = kmem_zalloc(sizeof (ksensor_t), KM_SLEEP); + sensor->ksensor_ksdip = ksdip; + sensor->ksensor_name = ddi_strdup(name, KM_SLEEP); + sensor->ksensor_class = ddi_strdup(class, KM_SLEEP); + sensor->ksensor_id = id_alloc(ksensor_ids); + sensor->ksensor_ops = ops; + sensor->ksensor_arg = arg; + list_insert_tail(&ksdip->ksdip_sensors, sensor); + avl_add(&ksensor_avl, sensor); + } + + sensor->ksensor_flags |= KSENSOR_F_VALID; + + if (ksensor_cb_create != NULL) { + + if (ksensor_cb_create(sensor->ksensor_id, sensor->ksensor_class, + sensor->ksensor_name) == 0) { + sensor->ksensor_flags |= KSENSOR_F_NOTIFIED; + } + } + + *idp = sensor->ksensor_id; + mutex_exit(&ksensor_g_mutex); + + return (0); +} + +int +ksensor_create_scalar_pcidev(dev_info_t *dip, uint_t kind, + const ksensor_ops_t *ops, void *arg, const char *name, id_t *idp) +{ + 
char *pci_name, *type; + const char *class; + int *regs, ret; + uint_t nregs; + uint16_t bus, dev; + + switch (kind) { + case SENSOR_KIND_TEMPERATURE: + class = "ddi_sensor:temperature:pci"; + break; + case SENSOR_KIND_VOLTAGE: + class = "ddi_sensor:voltage:pci"; + break; + case SENSOR_KIND_CURRENT: + class = "ddi_sensor:current:pci"; + break; + default: + return (ENOTSUP); + } + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0, "device_type", + &type) != DDI_PROP_SUCCESS) { + return (EINVAL); + } + + if (strcmp(type, "pciex") != 0 && strcmp(type, "pci") != 0) { + ddi_prop_free(type); + return (EINVAL); + } + ddi_prop_free(type); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, 0, "reg", + ®s, &nregs) != DDI_PROP_SUCCESS) { + return (EINVAL); + } + + if (nregs < 1) { + ddi_prop_free(regs); + return (EIO); + } + + bus = PCI_REG_BUS_G(regs[0]); + dev = PCI_REG_DEV_G(regs[0]); + ddi_prop_free(regs); + + pci_name = kmem_asprintf("%x.%x:%s", bus, dev, name); + + ret = ksensor_create(dip, ops, arg, pci_name, class, idp); + strfree(pci_name); + return (ret); +} + +/* + * When a driver removes a sensor, we basically mark it as invalid. This happens + * because drivers can detach and we will need to reattach them when the sensor + * is used again. + */ +int +ksensor_remove(dev_info_t *dip, id_t id) +{ + ksensor_dip_t *kdip; + ksensor_t *sensor; + + if (!DEVI_IS_ATTACHING(dip) && !DEVI_IS_DETACHING(dip)) { + return (EAGAIN); + } + + mutex_enter(&ksensor_g_mutex); + kdip = ksensor_dip_find(dip); + if (kdip == NULL) { + mutex_exit(&ksensor_g_mutex); + return (ENOENT); + } + + for (sensor = list_head(&kdip->ksdip_sensors); sensor != NULL; + sensor = list_next(&kdip->ksdip_sensors, sensor)) { + if (sensor->ksensor_id == id || id == KSENSOR_ALL_IDS) { + mutex_enter(&sensor->ksensor_mutex); + sensor->ksensor_flags &= ~KSENSOR_F_VALID; + sensor->ksensor_ops = NULL; + sensor->ksensor_arg = NULL; + mutex_exit(&sensor->ksensor_mutex); + } + } + mutex_exit(&ksensor_g_mutex); + return (0); +} + +static void +ksensor_release(ksensor_t *sensor) +{ + int circ; + dev_info_t *pdip; + + ddi_release_devi(sensor->ksensor_ksdip->ksdip_dip); + + mutex_enter(&sensor->ksensor_mutex); + sensor->ksensor_flags &= ~KSENSOR_F_BUSY; + cv_broadcast(&sensor->ksensor_cv); + mutex_exit(&sensor->ksensor_mutex); +} + +static int +ksensor_hold_by_id(id_t id, ksensor_t **outp) +{ + int circ; + ksensor_t *sensor; + dev_info_t *pdip; + +restart: + mutex_enter(&ksensor_g_mutex); + sensor = ksensor_find_by_id(id); + if (sensor == NULL) { + mutex_exit(&ksensor_g_mutex); + *outp = NULL; + return (ESTALE); + } + + if ((sensor->ksensor_ksdip->ksdip_flags & KSENSOR_DIP_F_REMOVED) != 0) { + mutex_exit(&ksensor_g_mutex); + *outp = NULL; + return (ESTALE); + } + + mutex_enter(&sensor->ksensor_mutex); + if ((sensor->ksensor_flags & KSENSOR_F_BUSY) != 0) { + mutex_exit(&ksensor_g_mutex); + sensor->ksensor_nwaiters++; + while ((sensor->ksensor_flags & KSENSOR_F_BUSY) != 0) { + int cv = cv_wait_sig(&sensor->ksensor_cv, + &sensor->ksensor_mutex); + if (cv == 0) { + sensor->ksensor_nwaiters--; + cv_broadcast(&sensor->ksensor_cv); + mutex_exit(&sensor->ksensor_mutex); + *outp = NULL; + return (EINTR); + } + } + sensor->ksensor_nwaiters--; + cv_broadcast(&sensor->ksensor_cv); + mutex_exit(&sensor->ksensor_mutex); + goto restart; + } + + /* + * We have obtained ownership of the sensor. At this point, we should + * check to see if it's valid or not. 
+ */ + sensor->ksensor_flags |= KSENSOR_F_BUSY; + pdip = ddi_get_parent(sensor->ksensor_ksdip->ksdip_dip); + mutex_exit(&sensor->ksensor_mutex); + mutex_exit(&ksensor_g_mutex); + + /* + * Grab a reference on the device node to ensure that it won't go away. + */ + ndi_devi_enter(pdip, &circ); + e_ddi_hold_devi(sensor->ksensor_ksdip->ksdip_dip); + ndi_devi_exit(pdip, circ); + + /* + * Now that we have an NDI hold, check if it's valid or not. It may have + * become invalid while we were waiting due to a race. + */ + mutex_enter(&ksensor_g_mutex); + if ((sensor->ksensor_ksdip->ksdip_flags & KSENSOR_DIP_F_REMOVED) != 0) { + mutex_exit(&ksensor_g_mutex); + ksensor_release(sensor); + return (ESTALE); + } + + mutex_enter(&sensor->ksensor_mutex); + if ((sensor->ksensor_flags & KSENSOR_F_VALID) == 0) { + mutex_exit(&sensor->ksensor_mutex); + mutex_exit(&ksensor_g_mutex); + (void) ndi_devi_config(pdip, NDI_NO_EVENT); + mutex_enter(&ksensor_g_mutex); + mutex_enter(&sensor->ksensor_mutex); + + /* + * If we attempted to reattach it and it isn't now valid, fail + * this request. + */ + if ((sensor->ksensor_ksdip->ksdip_flags & + KSENSOR_DIP_F_REMOVED) != 0 || + (sensor->ksensor_flags & KSENSOR_F_VALID) == 0) { + mutex_exit(&sensor->ksensor_mutex); + mutex_exit(&ksensor_g_mutex); + ksensor_release(sensor); + return (ESTALE); + } + } + mutex_exit(&sensor->ksensor_mutex); + mutex_exit(&ksensor_g_mutex); + *outp = sensor; + + return (0); +} + +int +ksensor_op_kind(id_t id, sensor_ioctl_kind_t *kind) +{ + int ret; + ksensor_t *sensor; + + if ((ret = ksensor_hold_by_id(id, &sensor)) != 0) { + return (ret); + } + + ret = sensor->ksensor_ops->kso_kind(sensor->ksensor_arg, kind); + ksensor_release(sensor); + + return (ret); +} + +int +ksensor_op_scalar(id_t id, sensor_ioctl_scalar_t *scalar) +{ + int ret; + ksensor_t *sensor; + + if ((ret = ksensor_hold_by_id(id, &sensor)) != 0) { + return (ret); + } + + ret = sensor->ksensor_ops->kso_scalar(sensor->ksensor_arg, scalar); + ksensor_release(sensor); + + return (ret); +} + +void +ksensor_unregister(dev_info_t *reg_dip) +{ + ksensor_t *sensor; + + mutex_enter(&ksensor_g_mutex); + if (ksensor_cb_dip != reg_dip) { + dev_err(reg_dip, CE_PANIC, "asked to unregister illegal dip"); + } + + for (sensor = avl_first(&ksensor_avl); sensor != NULL; sensor = + AVL_NEXT(&ksensor_avl, sensor)) { + mutex_enter(&sensor->ksensor_mutex); + sensor->ksensor_flags &= ~KSENSOR_F_NOTIFIED; + mutex_exit(&sensor->ksensor_mutex); + } + + ksensor_cb_dip = NULL; + ksensor_cb_create = NULL; + ksensor_cb_remove = NULL; + mutex_exit(&ksensor_g_mutex); +} + +int +ksensor_register(dev_info_t *reg_dip, ksensor_create_f create, + ksensor_remove_f remove) +{ + ksensor_t *sensor; + + mutex_enter(&ksensor_g_mutex); + if (ksensor_cb_dip != NULL) { + dev_err(reg_dip, CE_WARN, "kernel sensors are already " + "registered"); + mutex_exit(&ksensor_g_mutex); + return (EEXIST); + } + + ksensor_cb_dip = reg_dip; + ksensor_cb_create = create; + ksensor_cb_remove = remove; + + for (sensor = avl_first(&ksensor_avl); sensor != NULL; sensor = + AVL_NEXT(&ksensor_avl, sensor)) { + mutex_enter(&sensor->ksensor_mutex); + ASSERT0(sensor->ksensor_flags & KSENSOR_F_NOTIFIED); + + if (ksensor_cb_create(sensor->ksensor_id, sensor->ksensor_class, + sensor->ksensor_name) == 0) { + sensor->ksensor_flags |= KSENSOR_F_NOTIFIED; + } + + mutex_exit(&sensor->ksensor_mutex); + } + + mutex_exit(&ksensor_g_mutex); + + return (0); +} + +int +ksensor_kind_temperature(void *unused, sensor_ioctl_kind_t *k) +{ + k->sik_kind = 
SENSOR_KIND_TEMPERATURE; + return (0); +} + +int +ksensor_kind_current(void *unused, sensor_ioctl_kind_t *k) +{ + k->sik_kind = SENSOR_KIND_CURRENT; + return (0); +} + +int +ksensor_kind_voltage(void *unused, sensor_ioctl_kind_t *k) +{ + k->sik_kind = SENSOR_KIND_VOLTAGE; + return (0); +} + +void +ksensor_init(void) +{ + mutex_init(&ksensor_g_mutex, NULL, MUTEX_DRIVER, NULL); + list_create(&ksensor_dips, sizeof (ksensor_dip_t), + offsetof(ksensor_dip_t, ksdip_link)); + ksensor_ids = id_space_create("ksensor", 1, L_MAXMIN32); + avl_create(&ksensor_avl, ksensor_avl_compare, sizeof (ksensor_t), + offsetof(ksensor_t, ksensor_id_avl)); +} diff --git a/usr/src/uts/common/os/lgrp.c b/usr/src/uts/common/os/lgrp.c index f3404a1cdf..31b0cf7e0d 100644 --- a/usr/src/uts/common/os/lgrp.c +++ b/usr/src/uts/common/os/lgrp.c @@ -1449,8 +1449,8 @@ lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) * Remove memory node from lgroup. */ lgrp->lgrp_mnodes &= ~mnodes_mask; + ASSERT(lgrp->lgrp_nmnodes > 0); lgrp->lgrp_nmnodes--; - ASSERT(lgrp->lgrp_nmnodes >= 0); } ASSERT(lgrp_root->lgrp_nmnodes > 0); @@ -2160,8 +2160,8 @@ lpl_topo_verify(cpupart_t *cpupart) /* do the parent lgroups exist and do they match? */ if (lgrp->lgrp_parent) { - ASSERT(lpl->lpl_parent); - ASSERT(lgrp->lgrp_parent->lgrp_id == + ASSERT(lpl->lpl_parent != NULL && + lgrp->lgrp_parent->lgrp_id == lpl->lpl_parent->lpl_lgrpid); if (!lpl->lpl_parent) { @@ -4100,12 +4100,13 @@ lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, lgrp_shm_policy_seg_t *newseg; avl_index_t where; - ASSERT(seg != NULL); - ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); + ASSERT(seg != NULL && (off >= seg->shm_off && + off <= seg->shm_off + seg->shm_size)); - if (!seg || off < seg->shm_off || off > seg->shm_off + - seg->shm_size) + if (!seg || off < seg->shm_off || + off > seg->shm_off + seg->shm_size) { return (NULL); + } if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) return (seg); diff --git a/usr/src/uts/common/os/log_sysevent.c b/usr/src/uts/common/os/log_sysevent.c index 35e0048ee7..50dc5dfd82 100644 --- a/usr/src/uts/common/os/log_sysevent.c +++ b/usr/src/uts/common/os/log_sysevent.c @@ -1277,7 +1277,7 @@ get_registration(sysevent_channel_descriptor_t *chan, char *databuf, class_lst_t *clist; subclass_lst_t *sc_list; - if (class_index < 0 || class_index > CLASS_HASH_SZ) + if (class_index > CLASS_HASH_SZ) return (EINVAL); if ((clist = chan->scd_class_list_tbl[class_index]) == NULL) { @@ -1395,10 +1395,15 @@ log_sysevent_register(char *channel_name, char *udatabuf, se_pubsub_t *udata) case SE_CLOSE_REGISTRATION: close_channel(kchannel); break; - case SE_BIND_REGISTRATION: - if ((kdata.ps_id = bind_common(chan, kdata.ps_type)) <= 0) + case SE_BIND_REGISTRATION: { + id_t id; + + id = bind_common(chan, kdata.ps_type); + kdata.ps_id = (uint32_t)id; + if (id <= 0) error = EBUSY; break; + } case SE_UNBIND_REGISTRATION: (void) unbind_common(chan, kdata.ps_type, (id_t)kdata.ps_id); break; diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 9e58a7bb56..6a922343e7 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -20,9 +20,11 @@ */ /* + * Copyright 2020 Oxide Computer Company * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2020 Joyent, Inc. + * Copyright 2022 Joyent, Inc. + * Copyright 2022 MNX Cloud, Inc. 
*/ #include <sys/types.h> @@ -43,6 +45,7 @@ #include <sys/utsname.h> #include <sys/id_space.h> #include <sys/zone.h> +#include <sys/bootbanner.h> log_zone_t log_global; queue_t *log_consq; @@ -182,6 +185,14 @@ log_zonefree(zoneid_t zoneid, void *arg) kmem_free(lzp, sizeof (log_zone_t)); } +static void +log_bootbanner_print(const char *line, uint_t num) +{ + const char *pfx = (num == 0) ? "\r" : ""; + + printf("%s%s\n", pfx, line); +} + void log_init(void) { @@ -189,7 +200,7 @@ log_init(void) /* * Create a backlog queue to consume console messages during periods - * when there is no console reader (e.g. before syslogd(1M) starts). + * when there is no console reader (e.g. before syslogd(8) starts). */ log_backlogq = log_consq = log_makeq(0, LOG_HIWAT, NULL); @@ -207,7 +218,7 @@ log_init(void) log_intrq = log_makeq(0, LOG_HIWAT, (void *)ipltospl(SPL8)); /* - * Create a queue to hold the most recent 8K of console messages. + * Create a queue to hold the most recent 64K of console messages. * Useful for debugging. Required by the "$<msgbuf" adb macro. */ log_recentq = log_makeq(0, LOG_RECENTSIZE, NULL); @@ -246,11 +257,19 @@ log_init(void) log_update(&log_backlog, log_backlogq, SL_CONSOLE, log_console); /* - * Now that logging is enabled, emit the SunOS banner. + * Now that logging is enabled, emit the boot banner. */ +#ifdef LEGACY_BANNER printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright 2010-2020 Joyent, Inc.\n"); + /* + * Note: In the future this should be 2022-20XX, and delete this + * comment when we don't need it anymore + */ + printf("Copyright 2022 MNX Cloud, Inc.\n"); +#else + bootbanner_print(log_bootbanner_print, KM_SLEEP); +#endif #ifdef DEBUG printf("DEBUG enabled\n"); #endif @@ -667,7 +686,7 @@ log_sendmsg(mblk_t *mp, zoneid_t zoneid) if (lp->log_q == log_consq) { console_printf(log_overflow_msg, lp->log_minor, - " -- is syslogd(1M) running?"); + " -- is syslogd(8) running?"); } else { printf(log_overflow_msg, lp->log_minor, ""); diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 6961a2ff4f..c57f8a7d2c 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -565,7 +565,7 @@ main(void) /* * Set the scan rate and other parameters of the paging subsystem. */ - setupclock(0); + setupclock(); /* * Initialize process 0's lwp directory and lwpid hash table. diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 285b76347b..fd74dd3092 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -509,7 +509,7 @@ mapalloc: * Recalculate the paging parameters now total_pages has changed. * This will also cause the clock hands to be reset before next use. */ - setupclock(1); + setupclock(); memsegs_unlock(1); @@ -2700,7 +2700,7 @@ kphysm_del_cleanup(struct mem_handle *mhp) * Recalculate the paging parameters now total_pages has changed. * This will also cause the clock hands to be reset before next use. 
*/ - setupclock(1); + setupclock(); memsegs_unlock(1); diff --git a/usr/src/uts/common/os/memlist_new.c b/usr/src/uts/common/os/memlist_new.c index adef7cb015..eaa23ed24e 100644 --- a/usr/src/uts/common/os/memlist_new.c +++ b/usr/src/uts/common/os/memlist_new.c @@ -143,13 +143,17 @@ memlist_insert( } new->ml_next = NULL; new->ml_prev = last; - if (last != NULL) + if (last != NULL) { last->ml_next = new; + } else { + ASSERT3P(*curmemlistp, ==, NULL); + *curmemlistp = new; + } } void memlist_del(struct memlist *memlistp, - struct memlist **curmemlistp) + struct memlist **curmemlistp) { #ifdef DEBUG /* diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index 0410e6f47b..d14a4ef005 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -213,8 +213,6 @@ struct mobj_stats { #define OVERLAPS_STACK(addr, p) \ ((p->p_model == DATAMODEL_LP64) && \ (addr >= (p->p_usrstack - ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK)))) -#elif defined(__i386) -#define OVERLAPS_STACK(addr, p) 0 #endif /* lv_flags values - bitmap */ @@ -1010,8 +1008,8 @@ mmapobj_map_flat(vnode_t *vp, mmapobj_result_t *mrp, size_t padding, * fcred - credentials for the file associated with vp at open time. */ static int -mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen, - off_t offset, int prot, cred_t *fcred) +mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, + volatile size_t zfodlen, off_t offset, int prot, cred_t *fcred) { int error = 0; caddr_t zfodbase, oldaddr; @@ -1060,8 +1058,8 @@ mmapobj_map_ptload(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen, * maxprot is passed as PROT_ALL so that mdb can * write to this segment. */ - if (error = VOP_MAP(vp, (offset_t)offset, as, &addr, - len, prot, PROT_ALL, mflag, fcred, NULL)) { + if ((error = VOP_MAP(vp, (offset_t)offset, as, &addr, + len, prot, PROT_ALL, mflag, fcred, NULL)) != 0) { return (error); } diff --git a/usr/src/uts/common/os/modctl.c b/usr/src/uts/common/os/modctl.c index d8782b320e..f141fb4bf0 100644 --- a/usr/src/uts/common/os/modctl.c +++ b/usr/src/uts/common/os/modctl.c @@ -2694,7 +2694,7 @@ modrload(const char *subdir, const char *filename, struct modctl **rmodp) CPU_STATS_ADDQ(CPU, sys, modload, 1); } -done: if (subdir != NULL) + if (subdir != NULL) kmem_free(fullname, size); return (rmodp ? retval : id); } diff --git a/usr/src/uts/common/os/modsubr.c b/usr/src/uts/common/os/modsubr.c index e980516b10..53c4195e48 100644 --- a/usr/src/uts/common/os/modsubr.c +++ b/usr/src/uts/common/os/modsubr.c @@ -74,8 +74,7 @@ static void hwc_unhash(struct hwc_spec *); int major_valid(major_t major) { - return (major != DDI_MAJOR_T_NONE && - (major >= 0 && major < devcnt)); + return (major != DDI_MAJOR_T_NONE && major < devcnt); } int diff --git a/usr/src/uts/common/os/ndifm.c b/usr/src/uts/common/os/ndifm.c index 16613a9203..54640971fd 100644 --- a/usr/src/uts/common/os/ndifm.c +++ b/usr/src/uts/common/os/ndifm.c @@ -669,7 +669,7 @@ ndi_fm_dma_err_set(ddi_dma_handle_t handle, ddi_fm_error_t *dfe) /* * Call parent busop fm initialization routine. * - * Called during driver attach(1M) + * Called during driver attach(9E) */ int i_ndi_busop_fm_init(dev_info_t *dip, int tcap, ddi_iblock_cookie_t *ibc) @@ -696,7 +696,7 @@ i_ndi_busop_fm_init(dev_info_t *dip, int tcap, ddi_iblock_cookie_t *ibc) /* * Call parent busop fm clean-up routine. 
* - * Called during driver detach(1M) + * Called during driver detach(9E) */ void i_ndi_busop_fm_fini(dev_info_t *dip) diff --git a/usr/src/uts/common/os/panic.c b/usr/src/uts/common/os/panic.c index 62be47e843..addb8b79cb 100644 --- a/usr/src/uts/common/os/panic.c +++ b/usr/src/uts/common/os/panic.c @@ -213,7 +213,7 @@ panicsys(const char *format, va_list alist, struct regs *rp, int on_panic_stack) cpu_t *cp = CPU; caddr_t intr_stack = NULL; - uint_t intr_actv; + volatile uint_t intr_actv; ushort_t schedflag = t->t_schedflag; cpu_t *bound_cpu = t->t_bound_cpu; diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 861c748cff..b3f01cfab2 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -22,6 +22,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2022 Oxide Computer Company */ #include <sys/types.h> @@ -69,6 +70,19 @@ int priv_debug = 0; int priv_basic_test = -1; /* + * Unlinking or creating new hard links to directories was historically allowed + * in some file systems; e.g., UFS allows root users to do it, at the cost of + * almost certain file system corruption that will require fsck to fix. + * + * Most modern operating systems and file systems (e.g., ZFS) do not allow this + * behaviour anymore, and we have elected to stamp it out entirely for + * compatibility and safety reasons. An attempt to unlink a directory will + * fail with EPERM, as described in the standard. During this transition, one + * can turn the behaviour back on, at their own risk, with this tuneable: + */ +int priv_allow_linkdir = 0; + +/* * This file contains the majority of the policy routines. * Since the policy routines are defined by function and not * by privilege, there is quite a bit of duplication of @@ -896,6 +910,23 @@ secpolicy_fs_config(const cred_t *cr, const vfs_t *vfsp) int secpolicy_fs_linkdir(const cred_t *cr, const vfs_t *vfsp) { + if (priv_allow_linkdir == 0) { + /* + * By default, this policy check will now always return EPERM + * unless overridden. + * + * We do so without triggering auditing or allowing privilege + * debugging for two reasons: first, we intend eventually to + * deprecate the PRIV_SYS_LINKDIR privilege entirely and remove + * the use of this policy check from the file systems; second, + * for privilege debugging in particular, because it would be + * confusing to report an unlink() failure as the result of a + * missing privilege when in fact we are simply no longer + * allowing the operation at all. + */ + return (EPERM); + } + return (PRIV_POLICY(cr, PRIV_SYS_LINKDIR, B_FALSE, EPERM, NULL)); } @@ -1381,7 +1412,7 @@ secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype) * this is required because vop_access function should lock the * node for reading. 
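
On the priv_allow_linkdir tuneable introduced in policy.c above: it is an ordinary kernel global, so an operator who still depends on the historical link/unlink-of-directories behaviour during the transition would typically flip it with an /etc/system entry along these lines (illustrative only; as the block comment warns, re-enabling it risks file system corruption, and a reboot is needed for /etc/system changes to take effect):

        set priv_allow_linkdir = 1
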
A three argument function should be defined * which accepts the following argument: - * A pointer to the internal "node" type (inode *) + * A pointer to the internal "node" type (inode *) * vnode access bits (VREAD|VWRITE|VEXEC) * a pointer to the credential * @@ -1453,8 +1484,8 @@ secpolicy_vnode_setattr(cred_t *cr, struct vnode *vp, struct vattr *vap, * * If you are the file owner: * chown to other uid FILE_CHOWN_SELF - * chown to gid (non-member) FILE_CHOWN_SELF - * chown to gid (member) <none> + * chown to gid (non-member) FILE_CHOWN_SELF + * chown to gid (member) <none> * * Instead of PRIV_FILE_CHOWN_SELF, FILE_CHOWN is also * acceptable but the first one is reported when debugging. @@ -2433,13 +2464,14 @@ secpolicy_gart_map(const cred_t *cr) } /* - * secpolicy_xhci + * secpolicy_hwmanip * - * Determine if the subject can observe and manipulate the xhci driver with a - * dangerous blunt hammer. Requires all privileges. + * Determine if the subject can observe and manipulate a hardware device with a + * dangerous blunt hammer, often suggests they can do something destructive. + * Requires all privileges. */ int -secpolicy_xhci(const cred_t *cr) +secpolicy_hwmanip(const cred_t *cr) { return (secpolicy_require_set(cr, PRIV_FULLSET, NULL, KLPDARG_NONE)); } diff --git a/usr/src/uts/common/os/pool.c b/usr/src/uts/common/os/pool.c index f9fe8649c0..57bd2241fd 100644 --- a/usr/src/uts/common/os/pool.c +++ b/usr/src/uts/common/os/pool.c @@ -1441,9 +1441,13 @@ pool_do_bind(pool_t *pool, idtype_t idtype, id_t id, int flags) switch (idtype) { case P_PID: case P_TASKID: + default: + /* - * Can't bind processes or tasks - * in local zones to pools. + * Can't bind processes or tasks in local zones + * to pools. Also catch all remaining types of + * idtype_t that should already have been + * filtered out. */ mutex_exit(&p->p_lock); mutex_exit(&pidlock); @@ -1715,6 +1719,8 @@ out: switch (idtype) { zone->zone_pool_mod = gethrtime(); zone_rele(zone); break; + default: + break; } kmem_free(procs, procs_size * sizeof (proc_t *)); diff --git a/usr/src/uts/common/os/priv.c b/usr/src/uts/common/os/priv.c index ccde6e5af5..388ccd8918 100644 --- a/usr/src/uts/common/os/priv.c +++ b/usr/src/uts/common/os/priv.c @@ -182,8 +182,7 @@ priv_pr_spriv(proc_t *p, prpriv_t *prpriv, const cred_t *cr) if (prpriv->pr_nsets != PRIV_NSET || prpriv->pr_setsize != PRIV_SETSIZE || (prpriv->pr_infosize & (sizeof (uint32_t) - 1)) != 0 || - prpriv->pr_infosize > priv_info->priv_infosize || - prpriv->pr_infosize < 0) + prpriv->pr_infosize > priv_info->priv_infosize) return (EINVAL); mutex_exit(&p->p_lock); diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index 854fb602da..05979dd236 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -217,7 +217,7 @@ privilege PRIV_NET_BINDMLP Allow a process to bind to a port that is configured as a multi-level port(MLP) for the process's zone. This privilege applies to both shared address and zone-specific address MLPs. - See tnzonecfg(4) from the Trusted Extensions manual pages for + See tnzonecfg(5) from the Trusted Extensions manual pages for information on configuring MLP ports. This privilege is interpreted only if the system is configured with Trusted Extensions. @@ -507,7 +507,7 @@ privilege PRIV_SYS_TRANS_LABEL privilege PRIV_VIRT_MANAGE Allows a process to manage virtualized environments such as - xVM(5). + xVM(7). 
privilege PRIV_WIN_COLORMAP @@ -613,7 +613,7 @@ privilege PRIV_WIN_UPGRADE_SL privilege PRIV_XVM_CONTROL - Allows a process access to the xVM(5) control devices for + Allows a process access to the xVM(7) control devices for managing guest domains and the hypervisor. This privilege is used only if booted into xVM on x86 platforms. diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index e0a1126567..8f52f4ef3a 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -149,7 +149,7 @@ * The locking subsequence of interest is: p_lock, rctl_dict_lock, * rctl_lists_lock, entity->rcs_lock. * - * The projects(4) database and project entity resource controls + * The project(5) database and project entity resource controls * A special case is made for RCENTITY_PROJECT values set through the * setproject(3PROJECT) interface. setproject() makes use of a private * interface, setprojrctl(), which passes through an array of resource control @@ -170,7 +170,7 @@ * * rctl->rc_values - a linked list of rctl_val_t. These are the active * resource values associated with this rctl, and may have been set by - * setrctl() - via prctl(1M), or by setprojrctl() - via + * setrctl() - via prctl(1), or by setprojrctl() - via * setproject(3PROJECT). * * rctl->rc_projdb - a linked list of rctl_val_t. These reflect the @@ -1570,8 +1570,6 @@ rctl_local_op(rctl_hndl_t hndl, rctl_val_t *oval, rctl_val_t *nval, int ret = 0; rctl_dict_entry_t *rde = rctl_dict_lookup_hndl(hndl); -local_op_retry: - ASSERT(MUTEX_HELD(&p->p_lock)); rset = rctl_entity_obtain_rset(rde, p); diff --git a/usr/src/uts/common/os/schedctl.c b/usr/src/uts/common/os/schedctl.c index 18b396a765..d500bf7468 100644 --- a/usr/src/uts/common/os/schedctl.c +++ b/usr/src/uts/common/os/schedctl.c @@ -22,7 +22,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. + * Copyright 2021 Joyent, Inc. + * Copyright 2021 Oxide Computer Company */ #include <sys/types.h> @@ -81,9 +82,9 @@ static size_t sc_bitmap_len; /* # of bits in allocation bitmap */ static size_t sc_bitmap_words; /* # of words in allocation bitmap */ /* Context ops */ -static void schedctl_save(sc_shared_t *); -static void schedctl_restore(sc_shared_t *); -static void schedctl_fork(kthread_t *, kthread_t *); +static void schedctl_save(void *); +static void schedctl_restore(void *); +static void schedctl_fork(void *, void *); /* Functions for handling shared pages */ static int schedctl_shared_alloc(sc_shared_t **, uintptr_t *); @@ -92,6 +93,13 @@ static int schedctl_map(struct anon_map *, caddr_t *, caddr_t); static int schedctl_getpage(struct anon_map **, caddr_t *); static void schedctl_freepage(struct anon_map *, caddr_t); +static const struct ctxop_template schedctl_ctxop_tpl = { + .ct_rev = CTXOP_TPL_REV, + .ct_save = schedctl_save, + .ct_restore = schedctl_restore, + .ct_fork = schedctl_fork, +}; + /* * System call interface to scheduler activations. * This always operates on the current lwp. @@ -112,8 +120,7 @@ schedctl(void) return ((caddr_t)(uintptr_t)set_errno(error)); bzero(ssp, sizeof (*ssp)); - installctx(t, ssp, schedctl_save, schedctl_restore, - schedctl_fork, NULL, NULL, NULL); + ctxop_install(t, &schedctl_ctxop_tpl, ssp); thread_lock(t); /* protect against ts_tick and ts_update */ t->t_schedctl = ssp; @@ -151,8 +158,7 @@ schedctl_lwp_cleanup(kthread_t *t) * Remove the context op to avoid the final call to * schedctl_save when switching away from this lwp. 
*/ - (void) removectx(t, ssp, schedctl_save, schedctl_restore, - schedctl_fork, NULL, NULL, NULL); + (void) ctxop_remove(t, &schedctl_ctxop_tpl, ssp); /* * Do not unmap the shared page until the process exits. @@ -207,8 +213,10 @@ schedctl_proc_cleanup(void) * Save new thread state. */ static void -schedctl_save(sc_shared_t *ssp) +schedctl_save(void *arg) { + sc_shared_t *ssp = arg; + ssp->sc_state = curthread->t_state; } @@ -218,8 +226,10 @@ schedctl_save(sc_shared_t *ssp) * Save new thread state and CPU. */ static void -schedctl_restore(sc_shared_t *ssp) +schedctl_restore(void *arg) { + sc_shared_t *ssp = arg; + ssp->sc_state = SC_ONPROC; ssp->sc_cpu = CPU->cpu_id; } @@ -230,8 +240,9 @@ schedctl_restore(sc_shared_t *ssp) * The child's threads must call schedctl() to get new shared mappings. */ static void -schedctl_fork(kthread_t *pt, kthread_t *ct) +schedctl_fork(void *parent, void *child) { + kthread_t *pt = parent, *ct = child; proc_t *pp = ttoproc(pt); proc_t *cp = ttoproc(ct); sc_page_ctl_t *pagep; diff --git a/usr/src/uts/common/os/share.c b/usr/src/uts/common/os/share.c index 55a7422868..6a06be2d9c 100644 --- a/usr/src/uts/common/os/share.c +++ b/usr/src/uts/common/os/share.c @@ -24,7 +24,7 @@ */ /* - * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. */ #include <sys/types.h> @@ -125,6 +125,8 @@ add_share(struct vnode *vp, struct shrlock *shr) (shr->s_deny & F_RDDNY) || (shrl->shr->s_access & F_WRACC)) { mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } /* @@ -135,6 +137,8 @@ add_share(struct vnode *vp, struct shrlock *shr) if (isreadonly(vp)) break; mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } /* @@ -147,6 +151,8 @@ add_share(struct vnode *vp, struct shrlock *shr) (shrl->shr->s_access == F_RDACC)) break; mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } @@ -171,6 +177,8 @@ add_share(struct vnode *vp, struct shrlock *shr) (shrl->shr->s_deny & F_RDDNY) || (shrl->shr->s_access & F_WRACC)) { mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } /* @@ -183,6 +191,8 @@ add_share(struct vnode *vp, struct shrlock *shr) break; } mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } /* @@ -199,6 +209,8 @@ add_share(struct vnode *vp, struct shrlock *shr) if ((shr->s_access & shrl->shr->s_deny) || (shr->s_deny & shrl->shr->s_access)) { mutex_exit(&vp->v_lock); + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); return (EAGAIN); } } @@ -609,8 +621,11 @@ nbl_share_conflict(vnode_t *vp, nbl_op_t op, caller_context_t *ct) break; #endif } - if (conflict) + if (conflict) { + DTRACE_PROBE1(conflict_shrlock, + struct shrlock *, shrl->shr); break; + } } mutex_exit(&vp->v_lock); diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index 74f1649a07..d0611eb9bb 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -348,7 +348,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) size = P2ROUNDUP(size, share_size); align_hint = share_size; -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* * For x86, we want to share as much of the page table tree * as possible. 
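
The DTRACE_PROBE1() calls added through add_share() and nbl_share_conflict() above surface share-reservation conflicts as statically defined tracing (SDT) probes, so they should be observable on a live system without any rebuild; something along the lines of

        dtrace -n 'sdt:::conflict_shrlock { trace(pid); }'

ought to fire once per rejected request. The module and function components of the probe name depend on where the call sites land, so the wildcarded form above is the safe one; arg0 carries the conflicting struct shrlock pointer.
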
We use a large align_hint at first, but @@ -366,7 +366,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) while (size >= ptes_per_table * (uint64_t)align_hint) align_hint *= ptes_per_table; } -#endif /* __i386 || __amd64 */ +#endif /* __x86 */ #if defined(__sparcv9) if (addr == 0 && diff --git a/usr/src/uts/common/os/softint.c b/usr/src/uts/common/os/softint.c index ecdb038c79..8801340cf9 100644 --- a/usr/src/uts/common/os/softint.c +++ b/usr/src/uts/common/os/softint.c @@ -58,29 +58,29 @@ * * Starting state is IDLE. * - * softint() + * softint() * * * (c) - * ____________________________________________________ - * | ^ ^ - * v (a) | (b) | - * IDLE--------------------->PEND--------------------->DRAIN - * ^ | | - * | | | - * | | | - * | | | - * | | | - * | d d - * | | | - * | v v - * | PEND DRAIN - * | (e) & & - * |<-----------------------STEAL STEAL - * ^ | - * | | - * | (e) v - * |_________________________<__________________________| + * ____________________________________________________ + * | ^ ^ + * v (a) | (b) | + * IDLE--------------------->PEND--------------------->DRAIN + * ^ | | + * | | | + * | | | + * | | | + * | | | + * | d d + * | | | + * | v v + * | PEND DRAIN + * | (e) & & + * |<-----------------------STEAL STEAL + * ^ | + * | | + * | (e) v + * |_________________________<__________________________| * * * @@ -146,9 +146,9 @@ uint_t softcall_pokemax = 10; /* * This ensures that softcall entries don't get stuck for long. It's expressed - * in 10 milliseconds as 1 unit. When hires_tick is set or other clock frequency - * is used, softcall_init() ensures that it's still expressed as 1 = 10 milli - * seconds. + * in 10 milliseconds as 1 unit. Regardless of the value of hires_tick or + * clock frequency, softcall_init() ensures that it's still expressed as 1 = + * 10 milliseconds. */ unsigned int softcall_delay = 1; diff --git a/usr/src/uts/common/os/space.c b/usr/src/uts/common/os/space.c index 3fd8275df0..37792b7254 100644 --- a/usr/src/uts/common/os/space.c +++ b/usr/src/uts/common/os/space.c @@ -23,6 +23,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2020 Joyent, Inc. 
*/ /* @@ -93,8 +94,6 @@ int __lintzero; /* Alway zero for shutting up lint */ pfn_t physmax; pgcnt_t physinstalled; -struct var v; - #include <sys/systm.h> #include <sys/conf.h> #include <sys/kmem.h> @@ -142,53 +141,6 @@ char dhcifname[IFNAMSIZ]; ether_addr_t etherbroadcastaddr = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - -/* - * Data from timod that must be resident - */ - -/* - * state transition table for TI interface - */ -#include <sys/tihdr.h> - -#define nr 127 /* not reachable */ - -char ti_statetbl[TE_NOEVENTS][TS_NOSTATES] = { - /* STATES */ - /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */ - - { 1, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 2, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 4, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, 0, 3, nr, 3, 3, nr, nr, 7, nr, nr, nr, 6, 7, 9, 10, 11}, - {nr, nr, 0, nr, nr, 6, nr, nr, nr, nr, nr, nr, 3, nr, 3, 3, 3}, - {nr, nr, nr, nr, nr, nr, nr, nr, 9, nr, nr, nr, nr, 3, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, 3, nr, nr, nr, nr, 3, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, 7, nr, nr, nr, nr, 7, nr, nr, nr}, - {nr, nr, nr, 5, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, 8, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, 12, 13, nr, 14, 15, 16, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, nr, 11, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, nr, 11, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 10, nr, 3, nr, nr, nr, nr, nr}, - {nr, nr, nr, 7, nr, nr, nr, 7, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, 9, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, 10, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 9, 10, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, nr, nr, 11, 3, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, 3, nr, nr, 3, 3, 3, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, nr, nr, nr, nr, 7, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 9, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, - {nr, nr, nr, 3, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr, nr}, -}; - - #include <sys/tty.h> #include <sys/ptyvar.h> diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 975b2f3d2e..90a9ea6f0f 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -3629,7 +3629,7 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, /* * The I_STR facility provides a trap door for malicious - * code to send down bogus streamio(7I) ioctl commands to + * code to send down bogus streamio(4I) ioctl commands to * unsuspecting STREAMS modules and drivers which expect to * only get these messages from the stream head. 
* Explicitly prohibit any streamio ioctls which can be diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index ac1ee2d1ce..796f89dca2 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* @@ -28,6 +28,7 @@ * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2018 Joyent, Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -1901,36 +1902,9 @@ mlink_file(vnode_t *vp, int cmd, struct file *fpdown, cred_t *crp, int *rvalp, */ error = strdoioctl(stp, &strioc, FNATIVE, K_TO_K | STR_NOERROR | STR_NOSIG, crp, rvalp); - if (error != 0) { - lbfree(linkp); - - if (!(passyncq->sq_flags & SQ_BLOCKED)) - blocksq(passyncq, SQ_BLOCKED, 0); - /* - * Restore the stream head queue and then remove - * the passq. Turn off STPLEX before we turn on - * the stream by removing the passq. - */ - rq->q_ptr = _WR(rq)->q_ptr = stpdown; - setq(rq, &strdata, &stwdata, NULL, QMTSAFE, SQ_CI|SQ_CO, - B_TRUE); - - mutex_enter(&stpdown->sd_lock); - stpdown->sd_flag &= ~STPLEX; - mutex_exit(&stpdown->sd_lock); - - link_rempassthru(passq); - - mutex_enter(&stpdown->sd_lock); - stpdown->sd_flag &= ~STRPLUMB; - /* Wakeup anyone waiting for STRPLUMB to clear. */ - cv_broadcast(&stpdown->sd_monitor); - mutex_exit(&stpdown->sd_lock); + if (error != 0) + goto cleanup; - mutex_exit(&muxifier); - netstack_rele(ss->ss_netstack); - return (error); - } mutex_enter(&fpdown->f_tlock); fpdown->f_count++; mutex_exit(&fpdown->f_tlock); @@ -1942,9 +1916,16 @@ mlink_file(vnode_t *vp, int cmd, struct file *fpdown, cred_t *crp, int *rvalp, ASSERT((cmd == I_LINK) || (cmd == I_PLINK)); if (cmd == I_LINK) { - ldi_mlink_fp(stp, fpdown, lhlink, LINKNORMAL); + error = ldi_mlink_fp(stp, fpdown, lhlink, LINKNORMAL); } else { - ldi_mlink_fp(stp, fpdown, lhlink, LINKPERSIST); + error = ldi_mlink_fp(stp, fpdown, lhlink, LINKPERSIST); + } + + if (error != 0) { + mutex_enter(&fpdown->f_tlock); + fpdown->f_count--; + mutex_exit(&fpdown->f_tlock); + goto cleanup; } link_rempassthru(passq); @@ -1976,6 +1957,36 @@ mlink_file(vnode_t *vp, int cmd, struct file *fpdown, cred_t *crp, int *rvalp, *rvalp = linkp->li_lblk.l_index; netstack_rele(ss->ss_netstack); return (0); + +cleanup: + lbfree(linkp); + + if (!(passyncq->sq_flags & SQ_BLOCKED)) + blocksq(passyncq, SQ_BLOCKED, 0); + /* + * Restore the stream head queue and then remove + * the passq. Turn off STPLEX before we turn on + * the stream by removing the passq. + */ + rq->q_ptr = _WR(rq)->q_ptr = stpdown; + setq(rq, &strdata, &stwdata, NULL, QMTSAFE, SQ_CI|SQ_CO, + B_TRUE); + + mutex_enter(&stpdown->sd_lock); + stpdown->sd_flag &= ~STPLEX; + mutex_exit(&stpdown->sd_lock); + + link_rempassthru(passq); + + mutex_enter(&stpdown->sd_lock); + stpdown->sd_flag &= ~STRPLUMB; + /* Wakeup anyone waiting for STRPLUMB to clear. 
*/ + cv_broadcast(&stpdown->sd_monitor); + mutex_exit(&stpdown->sd_lock); + + mutex_exit(&muxifier); + netstack_rele(ss->ss_netstack); + return (error); } int @@ -2232,9 +2243,9 @@ munlink(stdata_t *stp, linkinfo_t *linkp, int flag, cred_t *crp, int *rvalp, /* clean up the layered driver linkages */ if ((flag & LINKTYPEMASK) == LINKNORMAL) { - ldi_munlink_fp(stp, fpdown, LINKNORMAL); + VERIFY0(ldi_munlink_fp(stp, fpdown, LINKNORMAL)); } else { - ldi_munlink_fp(stp, fpdown, LINKPERSIST); + VERIFY0(ldi_munlink_fp(stp, fpdown, LINKPERSIST)); } link_rempassthru(passq); @@ -3006,7 +3017,7 @@ strwaitbuf(size_t size, int pri) * GETWAIT Check for read side errors, no M_READ * WRITEWAIT Check for write side errors. * NOINTR Do not return error if nonblocking or timeout. - * STR_NOERROR Ignore all errors except STPLEX. + * STR_NOERROR Ignore all errors except STPLEX. * STR_NOSIG Ignore/hold signals during the duration of the call. * STR_PEEK Pass through the strgeterr(). */ @@ -6630,9 +6641,9 @@ drain_syncq(syncq_t *sq) * * qdrain_syncq can be called (currently) from only one of two places: * drain_syncq - * putnext (or some variation of it). + * putnext (or some variation of it). * and eventually - * qwait(_sig) + * qwait(_sig) * * If called from drain_syncq, we found it in the list of queues needing * service, so there is work to be done (or it wouldn't be in the list). @@ -6652,8 +6663,8 @@ drain_syncq(syncq_t *sq) * * ASSUMES: * One claim - * QLOCK held - * SQLOCK not held + * QLOCK held + * SQLOCK not held * Will release QLOCK before returning */ void @@ -7107,11 +7118,11 @@ static int propagate_syncq(queue_t *qp) { mblk_t *bp, *head, *tail, *prev, *next; - syncq_t *sq; + syncq_t *sq; queue_t *nqp; syncq_t *nsq; boolean_t isdriver; - int moved = 0; + int moved = 0; uint16_t flags; pri_t priority = curthread->t_pri; #ifdef DEBUG @@ -7144,7 +7155,7 @@ propagate_syncq(queue_t *qp) /* debug macro */ SQ_PUTLOCKS_HELD(nsq); #ifdef DEBUG - func = (void (*)())nqp->q_qinfo->qi_putp; + func = (void (*)())(uintptr_t)nqp->q_qinfo->qi_putp; #endif } diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index 0dde96307b..ac48bf31b7 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -250,7 +250,7 @@ ddi_unmap_regs(dev_info_t *dip, uint_t rnumber, caddr_t *kaddrp, off_t offset, int ddi_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, - off_t offset, off_t len, caddr_t *vaddrp) + off_t offset, off_t len, caddr_t *vaddrp) { return (i_ddi_bus_map(dip, rdip, mp, offset, len, vaddrp)); } @@ -265,7 +265,7 @@ ddi_bus_map(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, */ int nullbusmap(dev_info_t *dip, dev_info_t *rdip, ddi_map_req_t *mp, - off_t offset, off_t len, caddr_t *vaddrp) + off_t offset, off_t len, caddr_t *vaddrp) { _NOTE(ARGUNUSED(rdip)) if (mp->map_type == DDI_MT_RNUMBER) @@ -433,45 +433,6 @@ ddi_peek64(dev_info_t *dip, int64_t *addr, int64_t *val_p) val_p)); } - -/* - * We need to separate the old interfaces from the new ones and leave them - * in here for a while. Previous versions of the OS defined the new interfaces - * to the old interfaces. This way we can fix things up so that we can - * eventually remove these interfaces. - * e.g. A 3rd party module/driver using ddi_peek8 and built against S10 - * or earlier will actually have a reference to ddi_peekc in the binary. 
- */ -#ifdef _ILP32 -int -ddi_peekc(dev_info_t *dip, int8_t *addr, int8_t *val_p) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr, - val_p)); -} - -int -ddi_peeks(dev_info_t *dip, int16_t *addr, int16_t *val_p) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr, - val_p)); -} - -int -ddi_peekl(dev_info_t *dip, int32_t *addr, int32_t *val_p) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr, - val_p)); -} - -int -ddi_peekd(dev_info_t *dip, int64_t *addr, int64_t *val_p) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_PEEK, sizeof (*val_p), addr, - val_p)); -} -#endif /* _ILP32 */ - int ddi_poke8(dev_info_t *dip, int8_t *addr, int8_t val) { @@ -497,40 +458,6 @@ ddi_poke64(dev_info_t *dip, int64_t *addr, int64_t val) } /* - * We need to separate the old interfaces from the new ones and leave them - * in here for a while. Previous versions of the OS defined the new interfaces - * to the old interfaces. This way we can fix things up so that we can - * eventually remove these interfaces. - * e.g. A 3rd party module/driver using ddi_poke8 and built against S10 - * or earlier will actually have a reference to ddi_pokec in the binary. - */ -#ifdef _ILP32 -int -ddi_pokec(dev_info_t *dip, int8_t *addr, int8_t val) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val)); -} - -int -ddi_pokes(dev_info_t *dip, int16_t *addr, int16_t val) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val)); -} - -int -ddi_pokel(dev_info_t *dip, int32_t *addr, int32_t val) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val)); -} - -int -ddi_poked(dev_info_t *dip, int64_t *addr, int64_t val) -{ - return (i_ddi_peekpoke(dip, DDI_CTLOPS_POKE, sizeof (val), addr, &val)); -} -#endif /* _ILP32 */ - -/* * ddi_peekpokeio() is used primarily by the mem drivers for moving * data to and from uio structures via peek and poke. Note that we * use "internal" routines ddi_peek and ddi_poke to make this go @@ -2886,7 +2813,7 @@ ddi_prop_int64_op(prop_handle_t *ph, uint_t cmd, int64_t *data) */ ph->ph_cur_pos = (uchar_t *)ph->ph_cur_pos + sizeof (int64_t); - return (DDI_PROP_RESULT_OK); + return (DDI_PROP_RESULT_OK); case DDI_PROP_CMD_ENCODE: /* @@ -2934,7 +2861,7 @@ ddi_prop_int64_op(prop_handle_t *ph, uint_t cmd, int64_t *data) */ ph->ph_cur_pos = (uchar_t *)ph->ph_cur_pos + sizeof (int64_t); - return (DDI_PROP_RESULT_OK); + return (DDI_PROP_RESULT_OK); case DDI_PROP_CMD_GET_ESIZE: /* @@ -3115,7 +3042,7 @@ ddi_prop_1275_string(prop_handle_t *ph, uint_t cmd, char *data) */ int ddi_prop_1275_bytes(prop_handle_t *ph, uint_t cmd, uchar_t *data, - uint_t nelements) + uint_t nelements) { switch (cmd) { case DDI_PROP_CMD_DECODE: @@ -4922,7 +4849,7 @@ impl_ddi_callback_init(void) static void callback_insert(int (*funcp)(caddr_t), caddr_t arg, uintptr_t *listid, - int count) + int count) { struct ddi_callback *list, *marker, *new; size_t size = sizeof (struct ddi_callback); @@ -5614,7 +5541,7 @@ fail: * devfs event subclass names as device class names. */ static int -derive_devi_class(dev_info_t *dip, char *node_type, int flag) +derive_devi_class(dev_info_t *dip, const char *node_type, int flag) { int rv = DDI_SUCCESS; @@ -5659,10 +5586,10 @@ derive_devi_class(dev_info_t *dip, char *node_type, int flag) * exceed IFNAMSIZ (16) characters in length. 
*/ static boolean_t -verify_name(char *name) +verify_name(const char *name) { - size_t len = strlen(name); - char *cp; + size_t len = strlen(name); + const char *cp; if (len == 0 || len > IFNAMSIZ) return (B_FALSE); @@ -5680,9 +5607,9 @@ verify_name(char *name) * attach it to the given devinfo node. */ -int -ddi_create_minor_common(dev_info_t *dip, char *name, int spec_type, - minor_t minor_num, char *node_type, int flag, ddi_minor_type mtype, +static int +ddi_create_minor_common(dev_info_t *dip, const char *name, int spec_type, + minor_t minor_num, const char *node_type, int flag, ddi_minor_type mtype, const char *read_priv, const char *write_priv, mode_t priv_mode) { struct ddi_minor_data *dmdp; @@ -5793,7 +5720,7 @@ ddi_create_minor_common(dev_info_t *dip, char *name, int spec_type, */ if (!(DEVI_IS_ATTACHING(dip) || DEVI_IS_DETACHING(dip)) && mtype != DDM_INTERNAL_PATH) { - (void) i_log_devfs_minor_create(dip, name); + (void) i_log_devfs_minor_create(dip, dmdp->ddm_name); } /* @@ -5804,16 +5731,16 @@ ddi_create_minor_common(dev_info_t *dip, char *name, int spec_type, } int -ddi_create_minor_node(dev_info_t *dip, char *name, int spec_type, - minor_t minor_num, char *node_type, int flag) +ddi_create_minor_node(dev_info_t *dip, const char *name, int spec_type, + minor_t minor_num, const char *node_type, int flag) { return (ddi_create_minor_common(dip, name, spec_type, minor_num, node_type, flag, DDM_MINOR, NULL, NULL, 0)); } int -ddi_create_priv_minor_node(dev_info_t *dip, char *name, int spec_type, - minor_t minor_num, char *node_type, int flag, +ddi_create_priv_minor_node(dev_info_t *dip, const char *name, int spec_type, + minor_t minor_num, const char *node_type, int flag, const char *rdpriv, const char *wrpriv, mode_t priv_mode) { return (ddi_create_minor_common(dip, name, spec_type, minor_num, @@ -5821,8 +5748,8 @@ ddi_create_priv_minor_node(dev_info_t *dip, char *name, int spec_type, } int -ddi_create_default_minor_node(dev_info_t *dip, char *name, int spec_type, - minor_t minor_num, char *node_type, int flag) +ddi_create_default_minor_node(dev_info_t *dip, const char *name, int spec_type, + minor_t minor_num, const char *node_type, int flag) { return (ddi_create_minor_common(dip, name, spec_type, minor_num, node_type, flag, DDM_DEFAULT, NULL, NULL, 0)); @@ -5842,7 +5769,7 @@ ddi_create_internal_pathname(dev_info_t *dip, char *name, int spec_type, } void -ddi_remove_minor_node(dev_info_t *dip, char *name) +ddi_remove_minor_node(dev_info_t *dip, const char *name) { int circ; struct ddi_minor_data *dmdp, *dmdp1; @@ -6956,7 +6883,7 @@ ddi_set_console_bell(void (*bellfunc)(clock_t duration)) int ddi_dma_alloc_handle(dev_info_t *dip, ddi_dma_attr_t *attr, - int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *handlep) + int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *handlep) { int (*funcp)() = ddi_dma_allochdl; ddi_dma_attr_t dma_attr; @@ -6986,9 +6913,9 @@ static uintptr_t dma_mem_list_id = 0; int ddi_dma_mem_alloc(ddi_dma_handle_t handle, size_t length, - ddi_device_acc_attr_t *accattrp, uint_t flags, - int (*waitfp)(caddr_t), caddr_t arg, caddr_t *kaddrp, - size_t *real_length, ddi_acc_handle_t *handlep) + ddi_device_acc_attr_t *accattrp, uint_t flags, + int (*waitfp)(caddr_t), caddr_t arg, caddr_t *kaddrp, + size_t *real_length, ddi_acc_handle_t *handlep) { ddi_dma_impl_t *hp = (ddi_dma_impl_t *)handle; dev_info_t *dip = hp->dmai_rdip; @@ -7079,8 +7006,8 @@ ddi_dma_mem_free(ddi_acc_handle_t *handlep) int ddi_dma_buf_bind_handle(ddi_dma_handle_t handle, struct buf *bp, - 
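
With ddi_create_minor_node() and its relatives now taking const char * for both the minor name and the node type, callers that keep these strings in const storage no longer need to cast away const. A minimal attach(9E) sketch of the common usage (hypothetical driver and minor name, shown only to illustrate the constified prototypes):

        #include <sys/types.h>
        #include <sys/stat.h>
        #include <sys/ddi.h>
        #include <sys/sunddi.h>

        static int
        example_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
        {
                if (cmd != DDI_ATTACH)
                        return (DDI_FAILURE);

                /*
                 * Both the minor name and the DDI_PSEUDO node type are
                 * passed from const storage; no cast is required with the
                 * constified signatures.
                 */
                if (ddi_create_minor_node(dip, "example", S_IFCHR,
                    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS) {
                        ddi_remove_minor_node(dip, NULL);
                        return (DDI_FAILURE);
                }

                return (DDI_SUCCESS);
        }
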
uint_t flags, int (*waitfp)(caddr_t), caddr_t arg, - ddi_dma_cookie_t *cookiep, uint_t *ccountp) + uint_t flags, int (*waitfp)(caddr_t), caddr_t arg, + ddi_dma_cookie_t *cookiep, uint_t *ccountp) { ddi_dma_impl_t *hp = (ddi_dma_impl_t *)handle; dev_info_t *dip, *rdip; @@ -7143,8 +7070,8 @@ ddi_dma_buf_bind_handle(ddi_dma_handle_t handle, struct buf *bp, int ddi_dma_addr_bind_handle(ddi_dma_handle_t handle, struct as *as, - caddr_t addr, size_t len, uint_t flags, int (*waitfp)(caddr_t), - caddr_t arg, ddi_dma_cookie_t *cookiep, uint_t *ccountp) + caddr_t addr, size_t len, uint_t flags, int (*waitfp)(caddr_t), + caddr_t arg, ddi_dma_cookie_t *cookiep, uint_t *ccountp) { ddi_dma_impl_t *hp = (ddi_dma_impl_t *)handle; dev_info_t *dip, *rdip; @@ -7282,7 +7209,7 @@ ddi_dma_numwin(ddi_dma_handle_t handle, uint_t *nwinp) int ddi_dma_getwin(ddi_dma_handle_t h, uint_t win, off_t *offp, - size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp) + size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp) { int (*funcp)() = ddi_dma_win; struct bus_ops *bop; @@ -7358,8 +7285,8 @@ i_ddi_dma_clr_fault(ddi_dma_handle_t handle) */ int ddi_regs_map_setup(dev_info_t *dip, uint_t rnumber, caddr_t *addrp, - offset_t offset, offset_t len, ddi_device_acc_attr_t *accattrp, - ddi_acc_handle_t *handle) + offset_t offset, offset_t len, ddi_device_acc_attr_t *accattrp, + ddi_acc_handle_t *handle) { ddi_map_req_t mr; ddi_acc_hdl_t *hp; @@ -7433,7 +7360,7 @@ ddi_regs_map_free(ddi_acc_handle_t *handlep) int ddi_device_zero(ddi_acc_handle_t handle, caddr_t dev_addr, size_t bytecount, - ssize_t dev_advcnt, uint_t dev_datasz) + ssize_t dev_advcnt, uint_t dev_datasz) { uint8_t *b; uint16_t *w; @@ -7627,7 +7554,7 @@ i_ddi_devtspectype_to_minorname(dev_info_t *dip, dev_t dev, int spec_type) */ int i_ddi_minorname_to_devtspectype(dev_info_t *dip, char *minor_name, - dev_t *devtp, int *spectypep) + dev_t *devtp, int *spectypep) { int circ; struct ddi_minor_data *dmdp; @@ -8366,8 +8293,8 @@ umem_decr_devlockmem(struct ddi_umem_cookie *cookie) */ int umem_lockmemory(caddr_t addr, size_t len, int flags, ddi_umem_cookie_t *cookie, - struct umem_callback_ops *ops_vector, - proc_t *procp) + struct umem_callback_ops *ops_vector, + proc_t *procp) { int error; struct ddi_umem_cookie *p; @@ -8838,8 +8765,8 @@ ddi_umem_unlock(ddi_umem_cookie_t cookie) */ struct buf * ddi_umem_iosetup(ddi_umem_cookie_t cookie, off_t off, size_t len, - int direction, dev_t dev, daddr_t blkno, - int (*iodone)(struct buf *), int sleepflag) + int direction, dev_t dev, daddr_t blkno, + int (*iodone)(struct buf *), int sleepflag) { struct ddi_umem_cookie *p = (struct ddi_umem_cookie *)cookie; struct buf *bp; @@ -8919,7 +8846,7 @@ ddi_get_devstate(dev_info_t *dip) void ddi_dev_report_fault(dev_info_t *dip, ddi_fault_impact_t impact, - ddi_fault_location_t location, const char *message) + ddi_fault_location_t location, const char *message) { struct ddi_fault_event_data fd; ddi_eventcookie_t ec; @@ -8950,7 +8877,7 @@ i_ddi_devi_class(dev_info_t *dip) } int -i_ddi_set_devi_class(dev_info_t *dip, char *devi_class, int flag) +i_ddi_set_devi_class(dev_info_t *dip, const char *devi_class, int flag) { struct dev_info *devi = DEVI(dip); @@ -9912,7 +9839,7 @@ e_ddi_branch_unconfigure( /* The dip still exists, so do a hold */ e_ddi_branch_hold(rdip); } -out: + kmem_free(devnm, MAXNAMELEN + 1); ndi_devi_exit(pdip, circ); return (ndi2errno(rv)); diff --git a/usr/src/uts/common/os/sunmdi.c b/usr/src/uts/common/os/sunmdi.c index 0cdfd30392..6d1e10e0a4 100644 --- 
a/usr/src/uts/common/os/sunmdi.c +++ b/usr/src/uts/common/os/sunmdi.c @@ -3597,6 +3597,16 @@ i_mdi_pi_state_change(mdi_pathinfo_t *pip, mdi_pathinfo_state_t state, int flag) MDI_PI_LOCK(pip); MDI_PI_SET_OFFLINING(pip); break; + + case MDI_PATHINFO_STATE_INIT: + /* + * Callers are not allowed to ask us to change the state to the + * initial state. + */ + rv = MDI_FAILURE; + MDI_PI_UNLOCK(pip); + goto state_change_exit; + } MDI_PI_UNLOCK(pip); MDI_CLIENT_UNSTABLE(ct); @@ -5722,6 +5732,7 @@ mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error) break; case DDI_RESUME: + case DDI_PM_RESUME: MDI_DEBUG(2, (MDI_NOTE, dip, "pHCI post_resume: called %p", (void *)ph)); if (error == DDI_SUCCESS) { @@ -5769,6 +5780,7 @@ mdi_post_attach(dev_info_t *dip, ddi_attach_cmd_t cmd, int error) break; case DDI_RESUME: + case DDI_PM_RESUME: MDI_DEBUG(2, (MDI_NOTE, dip, "client post_attach: called %p", (void *)ct)); if (error == DDI_SUCCESS) { @@ -6011,12 +6023,15 @@ i_mdi_phci_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error) break; case DDI_SUSPEND: + case DDI_PM_SUSPEND: MDI_DEBUG(2, (MDI_NOTE, dip, "pHCI post_suspend: called %p", (void *)ph)); if (error != DDI_SUCCESS) MDI_PHCI_SET_RESUME(ph); break; + case DDI_HOTPLUG_DETACH: + break; } MDI_PHCI_UNLOCK(ph); } @@ -6054,11 +6069,14 @@ i_mdi_client_post_detach(dev_info_t *dip, ddi_detach_cmd_t cmd, int error) break; case DDI_SUSPEND: + case DDI_PM_SUSPEND: MDI_DEBUG(2, (MDI_NOTE, dip, "called %p", (void *)ct)); if (error != DDI_SUCCESS) MDI_CLIENT_SET_RESUME(ct); break; + case DDI_HOTPLUG_DETACH: + break; } MDI_CLIENT_UNLOCK(ct); } @@ -6820,6 +6838,10 @@ mdi_bus_power(dev_info_t *parent, void *impl_arg, pm_bus_power_op_t op, i_mdi_pm_rele_client(ct, ct->ct_path_count); } break; + default: + dev_err(parent, CE_WARN, "!unhandled bus power operation: 0x%x", + op); + break; } MDI_CLIENT_UNLOCK(ct); diff --git a/usr/src/uts/common/os/sunpci.c b/usr/src/uts/common/os/sunpci.c index 209b269838..b1098b4fcc 100644 --- a/usr/src/uts/common/os/sunpci.c +++ b/usr/src/uts/common/os/sunpci.c @@ -145,104 +145,6 @@ pci_config_put64(ddi_acc_handle_t handle, off_t offset, uint64_t value) ddi_put64(handle, (uint64_t *)cfgaddr, value); } -/* - * We need to separate the old interfaces from the new ones and leave them - * in here for a while. Previous versions of the OS defined the new interfaces - * to the old interfaces. This way we can fix things up so that we can - * eventually remove these interfaces. - * e.g. A 3rd party module/driver using pci_config_get8 and built against S10 - * or earlier will actually have a reference to pci_config_getb in the binary. 
- */ -#ifdef _ILP32 -uint8_t -pci_config_getb(ddi_acc_handle_t handle, off_t offset) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - return (ddi_get8(handle, (uint8_t *)cfgaddr)); -} - -uint16_t -pci_config_getw(ddi_acc_handle_t handle, off_t offset) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - return (ddi_get16(handle, (uint16_t *)cfgaddr)); -} - -uint32_t -pci_config_getl(ddi_acc_handle_t handle, off_t offset) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - return (ddi_get32(handle, (uint32_t *)cfgaddr)); -} - -uint64_t -pci_config_getll(ddi_acc_handle_t handle, off_t offset) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - return (ddi_get64(handle, (uint64_t *)cfgaddr)); -} - -void -pci_config_putb(ddi_acc_handle_t handle, off_t offset, uint8_t value) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - ddi_put8(handle, (uint8_t *)cfgaddr, value); -} - -void -pci_config_putw(ddi_acc_handle_t handle, off_t offset, uint16_t value) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - ddi_put16(handle, (uint16_t *)cfgaddr, value); -} - -void -pci_config_putl(ddi_acc_handle_t handle, off_t offset, uint32_t value) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - ddi_put32(handle, (uint32_t *)cfgaddr, value); -} - -void -pci_config_putll(ddi_acc_handle_t handle, off_t offset, uint64_t value) -{ - caddr_t cfgaddr; - ddi_acc_hdl_t *hp; - - hp = impl_acc_hdl_get(handle); - cfgaddr = hp->ah_addr + offset; - ddi_put64(handle, (uint64_t *)cfgaddr, value); -} -#endif /* _ILP32 */ - /*ARGSUSED*/ int pci_report_pmcap(dev_info_t *dip, int cap, void *arg) @@ -926,7 +828,7 @@ restoreconfig_err: /*ARGSUSED*/ static int pci_lookup_pmcap(dev_info_t *dip, ddi_acc_handle_t conf_hdl, - uint16_t *pmcap_offsetp) + uint16_t *pmcap_offsetp) { uint8_t cap_ptr; uint8_t cap_id; diff --git a/usr/src/uts/common/os/sunpm.c b/usr/src/uts/common/os/sunpm.c index 3ce7cc530d..7518c45cea 100644 --- a/usr/src/uts/common/os/sunpm.c +++ b/usr/src/uts/common/os/sunpm.c @@ -61,8 +61,8 @@ * tells what each component's power state values are, and provides human * readable strings (currently unused) for each component name and power state. * Devices which export pm-components(9P) are automatically power managed - * whenever autopm is enabled (via PM_START_PM ioctl issued by pmconfig(1M) - * after parsing power.conf(4)). The exception to this rule is that power + * whenever autopm is enabled (via PM_START_PM ioctl issued by pmconfig(8) + * after parsing power.conf(5)). The exception to this rule is that power * manageable CPU devices may be automatically managed independently of autopm * by either enabling or disabling (via PM_START_CPUPM and PM_STOP_CPUPM * ioctls) cpupm. If the CPU devices are not managed independently, then they @@ -72,13 +72,13 @@ * hardware state. * * Each device component also has a threshold time associated with each power - * transition (see power.conf(4)), and a busy/idle state maintained by the + * transition (see power.conf(5)), and a busy/idle state maintained by the * driver calling pm_idle_component(9F) and pm_busy_component(9F). * Components are created idle. 
* * The PM framework provides several functions: - * -implement PM policy as described in power.conf(4) - * Policy is set by pmconfig(1M) issuing pm ioctls based on power.conf(4). + * -implement PM policy as described in power.conf(5) + * Policy is set by pmconfig(8) issuing pm ioctls based on power.conf(5). * Policies consist of: * -set threshold values (defaults if none provided by pmconfig) * -set dependencies among devices @@ -122,7 +122,7 @@ * cdrom is always up whenever the console framebuffer is up, so that the user * can insert a cdrom and see a popup as a result. * - * The dependency terminology used in power.conf(4) is not easy to understand, + * The dependency terminology used in power.conf(5) is not easy to understand, * so we've adopted a different terminology in the implementation. We write * of a "keeps up" and a "kept up" device. A relationship can be established * where one device keeps up another. That means that if the keepsup device @@ -384,7 +384,7 @@ int cpr_platform_enable = 0; /* * pm_S3_enabled indicates that we believe the platform can support S3, - * which we get from pmconfig(1M) + * which we get from pmconfig(8) */ int pm_S3_enabled; @@ -1616,7 +1616,7 @@ power_dev(dev_info_t *dip, int comp, int level, int old_level, (PM_CP(dip, comp)->pmc_flags & PM_PHC_WHILE_SET_POWER)); - resume_needed = suspended; + resume_needed = suspended; } } else { if (POWERING_OFF(old_level, level)) { @@ -1629,7 +1629,7 @@ power_dev(dev_info_t *dip, int comp, int level, int old_level, (PM_CP(dip, comp)->pmc_flags & PM_PHC_WHILE_SET_POWER)); - resume_needed = suspended; + resume_needed = suspended; } } } @@ -2076,13 +2076,12 @@ e_pm_hold_rele_power(dev_info_t *dip, int cnt) return; PM_LOCK_POWER(dip, &circ); - ASSERT(cnt >= 0 && PM_KUC(dip) >= 0 || cnt < 0 && PM_KUC(dip) > 0); + ASSERT(cnt >= 0 || (cnt < 0 && PM_KUC(dip) > 0)); PMD(PMD_KIDSUP, ("%s: kidsupcnt for %s@%s(%s#%d) %d->%d\n", pmf, PM_DEVICE(dip), PM_KUC(dip), (PM_KUC(dip) + cnt))) PM_KUC(dip) += cnt; - ASSERT(PM_KUC(dip) >= 0); PM_UNLOCK_POWER(dip, circ); if (cnt < 0 && PM_KUC(dip) == 0) @@ -7647,7 +7646,7 @@ pm_cfb_setup(const char *stdout_path) */ } else { cmn_err(CE_WARN, "Kernel debugger present: see " - "kmdb(1M) for interaction with power management."); + "kmdb(1) for interaction with power management."); } } #ifdef DEBUG diff --git a/usr/src/uts/common/os/swapgeneric.c b/usr/src/uts/common/os/swapgeneric.c index 77167149fe..ce64aff89a 100644 --- a/usr/src/uts/common/os/swapgeneric.c +++ b/usr/src/uts/common/os/swapgeneric.c @@ -878,7 +878,7 @@ load_bootpath_drivers(char *bootpath) #endif dip = path_to_devinfo(pathcopy); -#if defined(__i386) || defined(__amd64) +#if defined(__x86) /* * i386 does not provide stub nodes for all boot devices, * but we should be able to find the node for the parent, @@ -910,7 +910,7 @@ load_bootpath_drivers(char *bootpath) rval = load_boot_driver(leaf, NULL); if (rval == -1) { kmem_free(pathcopy, pathcopy_len); - return (NULL); + return (0); } } } @@ -920,7 +920,7 @@ load_bootpath_drivers(char *bootpath) cmn_err(CE_WARN, "can't bind driver for boot path <%s>", bootpath); kmem_free(pathcopy, pathcopy_len); - return (NULL); + return (0); } /* @@ -936,7 +936,7 @@ load_bootpath_drivers(char *bootpath) modloadonly("drv", "ibp") == -1) { cmn_err(CE_CONT, "ibp: cannot load platform driver\n"); kmem_free(pathcopy, pathcopy_len); - return (NULL); + return (0); } /* diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index fb64000e4d..dca168b642 100644 --- 
a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -25,6 +25,7 @@ * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2016 Joyent, Inc. * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -88,9 +89,9 @@ int getloadavg(int *, int); int rusagesys(int, void *, void *, void *, void *); int getpagesizes(int, size_t *, int); int gtty(int, intptr_t); -#if defined(__i386) || defined(__amd64) +#if defined(__x86) int hrtsys(struct hrtsysa *, rval_t *); -#endif /* __i386 || __amd64 */ +#endif /* __x86 */ int ioctl(int, int, intptr_t); int kill(); int labelsys(int, void *, void *, void *, void *, void *); @@ -184,7 +185,7 @@ int statvfs(char *, struct statvfs *); int fstatvfs(int, struct statvfs *); offset_t llseek32(int32_t, uint32_t, uint32_t, int); -#if (defined(__i386) && !defined(__amd64)) || defined(__i386_COMPAT) +#if defined(__i386_COMPAT) int sysi86(short, uintptr_t, uintptr_t, uintptr_t); #endif @@ -330,6 +331,7 @@ int setsockopt(int, int, int, void *, socklen_t *, int); int sockconfig(int, void *, void *, void *, void *); ssize_t sendfilev(int, int, const struct sendfilevec *, int, size_t *); ssize_t getrandom(void *, size_t, unsigned int); +void upanic(void *, size_t); typedef int64_t (*llfcn_t)(); /* for casting one-word returns */ @@ -390,19 +392,15 @@ typedef int64_t (*llfcn_t)(); /* for casting one-word returns */ #define IF_sparc(true, false) false #endif -#if defined(__i386) && !defined(__amd64) -#define IF_i386(true, false) true -#else #define IF_i386(true, false) false -#endif -#if defined(__i386) || defined(__amd64) +#if defined(__x86) #define IF_x86(true, false) true #else #define IF_x86(true, false) false #endif -#if (defined(__i386) && !defined(__amd64)) || defined(__i386_COMPAT) +#if defined(__i386_COMPAT) #define IF_386_ABI(true, false) true #else #define IF_386_ABI(true, false) false @@ -583,7 +581,7 @@ struct sysent sysent[NSYSCALL] = /* 122 */ SYSENT_CL("writev", writev, 3), /* 123 */ SYSENT_CL("preadv", preadv, 5), /* 124 */ SYSENT_CL("pwritev", pwritev, 5), - /* 125 */ SYSENT_LOADABLE(), /* (was fxstat) */ + /* 125 */ SYSENT_CI("upanic", upanic, 2), /* 126 */ SYSENT_CL("getrandom", getrandom, 3), /* 127 */ SYSENT_CI("mmapobj", mmapobjsys, 5), /* 128 */ IF_LP64( @@ -948,7 +946,7 @@ struct sysent sysent32[NSYSCALL] = /* 122 */ SYSENT_CI("writev", writev32, 3), /* 123 */ SYSENT_CI("preadv", preadv, 5), /* 124 */ SYSENT_CI("pwritev", pwritev, 5), - /* 125 */ SYSENT_LOADABLE32(), /* was fxstat32 */ + /* 125 */ SYSENT_CI("upanic", upanic, 2), /* 126 */ SYSENT_CI("getrandom", getrandom, 3), /* 127 */ SYSENT_CI("mmapobj", mmapobjsys, 5), /* 128 */ SYSENT_CI("setrlimit", setrlimit32, 2), diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index c965db6737..f587430625 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -25,7 +25,7 @@ */ /* - * Copyright 2017 Joyent, Inc. + * Copyright 2020 Joyent, Inc. */ #include <sys/timer.h> @@ -179,7 +179,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) /* * timer_grab() and its companion routine, timer_release(), are wrappers - * around timer_lock()/_unlock() which allow the timer_*(3R) routines to + * around timer_lock()/_unlock() which allow the timer_*(3C) routines to * (a) share error handling code and (b) not grab p_lock themselves. Routines * which are called with p_lock held (e.g. 
timer_lwpbind(), timer_lwpexit()) * must call timer_lock()/_unlock() explictly. @@ -194,7 +194,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) * (a) The specified timer ID is out of range. * * (b) The specified timer ID does not correspond to a timer ID returned - * from timer_create(3R). + * from timer_create(3C). * * (c) The specified timer ID is currently being removed. * @@ -482,105 +482,106 @@ timer_fire(itimer_t *it) } /* - * Allocate an itimer_t and find and appropriate slot for it in p_itimer. - * Acquires p_lock and holds it on return, regardless of success. + * Find an unused (i.e. NULL) entry in p->p_itimer and set *id to the + * index of the unused entry, growing p->p_itimer as necessary (up to timer_max + * entries). Returns B_TRUE (with *id set) on success, B_FALSE on failure + * (e.g. the process already has the maximum number of allowed timers + * allocated). */ -static itimer_t * -timer_alloc(proc_t *p, timer_t *id) +static boolean_t +timer_get_id(proc_t *p, timer_t *id) { - itimer_t *it, **itp = NULL; + itimer_t **itp = NULL, **itp_new; + uint_t target_sz; uint_t i; - ASSERT(MUTEX_NOT_HELD(&p->p_lock)); - - it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); - bzero(it, sizeof (itimer_t)); - mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); - - mutex_enter(&p->p_lock); -retry: - if (p->p_itimer != NULL) { - for (i = 0; i < p->p_itimer_sz; i++) { - if (p->p_itimer[i] == NULL) { - itp = &(p->p_itimer[i]); - break; - } - } - } - - /* - * A suitable slot was not found. If possible, allocate (or resize) - * the p_itimer array and try again. - */ - if (itp == NULL) { - uint_t target_sz = _TIMER_ALLOC_INIT; - itimer_t **itp_new; - - if (p->p_itimer != NULL) { - ASSERT(p->p_itimer_sz != 0); + ASSERT(MUTEX_HELD(&p->p_lock)); - target_sz = p->p_itimer_sz * 2; - } + if (p->p_itimer == NULL) { /* - * Protect against exceeding the max or overflow + * No timers have been allocated for this process, allocate + * the initial array. */ - if (target_sz > timer_max || target_sz > INT_MAX || - target_sz < p->p_itimer_sz) { - kmem_cache_free(clock_timer_cache, it); - return (NULL); - } + ASSERT0(p->p_itimer_sz); + target_sz = _TIMER_ALLOC_INIT; + mutex_exit(&p->p_lock); itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), KM_SLEEP); mutex_enter(&p->p_lock); - if (target_sz <= p->p_itimer_sz) { - /* - * A racing thread performed the resize while we were - * waiting outside p_lock. Discard our now-useless - * allocation and retry. - */ - kmem_free(itp_new, target_sz * sizeof (itimer_t *)); - goto retry; - } else { + + if (p->p_itimer == NULL) { /* - * Instantiate the larger allocation and select the - * first fresh entry for use. + * As long as no other thread beat us to allocating + * the initial p_itimer array, use what we allocated. + * Since we just allocated it, we know slot 0 is + * free. */ - if (p->p_itimer != NULL) { - uint_t old_sz; - - old_sz = p->p_itimer_sz; - bcopy(p->p_itimer, itp_new, - old_sz * sizeof (itimer_t *)); - kmem_free(p->p_itimer, - old_sz * sizeof (itimer_t *)); - - /* - * Short circuit to use the first free entry in - * the new allocation. It's possible that - * other lower-indexed timers were freed while - * p_lock was dropped, but skipping over them - * is not harmful at all. In the common case, - * we skip the need to walk over an array - * filled with timers before arriving at the - * slot we know is fresh from the allocation. 
- */ - i = old_sz; - } else { - /* - * For processes lacking any existing timers, - * we can simply select the first entry. - */ - i = 0; - } p->p_itimer = itp_new; p->p_itimer_sz = target_sz; + i = 0; + goto done; + } + + /* + * Another thread beat us to allocating the initial array. + * Proceed to searching for an empty slot and growing the + * array if needed. + */ + kmem_free(itp_new, target_sz * sizeof (itimer_t *)); + } + +retry: + /* Use the first empty slot (if any exist) */ + for (i = 0; i < p->p_itimer_sz; i++) { + if (p->p_itimer[i] == NULL) { + goto done; } } - ASSERT(i <= INT_MAX); + /* No empty slots, try to grow p->p_itimer and retry */ + target_sz = p->p_itimer_sz * 2; + if (target_sz > timer_max || target_sz > INT_MAX || + target_sz < p->p_itimer_sz) { + /* Protect against exceeding the max or overflow */ + return (B_FALSE); + } + + mutex_exit(&p->p_lock); + itp_new = kmem_zalloc(target_sz * sizeof (itimer_t *), KM_SLEEP); + mutex_enter(&p->p_lock); + + if (target_sz <= p->p_itimer_sz) { + /* + * A racing thread performed the resize while we were + * waiting outside p_lock. Discard our now-useless + * allocation and retry. + */ + kmem_free(itp_new, target_sz * sizeof (itimer_t *)); + goto retry; + } + + ASSERT3P(p->p_itimer, !=, NULL); + bcopy(p->p_itimer, itp_new, p->p_itimer_sz * sizeof (itimer_t *)); + kmem_free(p->p_itimer, p->p_itimer_sz * sizeof (itimer_t *)); + + /* + * Short circuit to use the first free entry in the new allocation. + * It's possible that other lower-indexed timers were freed while + * p_lock was dropped, but skipping over them is not harmful at all. + * In the common case, we skip the need to walk over an array filled + * with timers before arriving at the slot we know is fresh from the + * allocation. + */ + i = p->p_itimer_sz; + + p->p_itimer = itp_new; + p->p_itimer_sz = target_sz; + +done: + ASSERT3U(i, <=, INT_MAX); *id = (timer_t)i; - return (it); + return (B_TRUE); } /* @@ -612,19 +613,20 @@ timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, sigq = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); /* - * Allocate a timer and choose a slot for it. This acquires p_lock. + * Allocate a timer and choose a slot for it. */ - it = timer_alloc(p, &tid); - ASSERT(MUTEX_HELD(&p->p_lock)); + it = kmem_cache_alloc(clock_timer_cache, KM_SLEEP); + bzero(it, sizeof (*it)); + mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); - if (it == NULL) { + mutex_enter(&p->p_lock); + if (!timer_get_id(p, &tid)) { mutex_exit(&p->p_lock); kmem_free(sigq, sizeof (sigqueue_t)); - return (EAGAIN); + return (set_errno(EAGAIN)); } ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); - ASSERT(evp != NULL); /* * If we develop other notification mechanisms, this will need diff --git a/usr/src/uts/common/os/timers.c b/usr/src/uts/common/os/timers.c index 53be806026..cb57b60758 100644 --- a/usr/src/uts/common/os/timers.c +++ b/usr/src/uts/common/os/timers.c @@ -1211,7 +1211,7 @@ hrt2ts(hrtime_t hrt, timestruc_t *tsp) hrtime_t ts2hrt(const timestruc_t *tsp) { -#if defined(__amd64) || defined(__i386) +#if defined(__x86) /* * On modern x86 CPUs, the simple version is faster. 
*/ @@ -1232,7 +1232,7 @@ ts2hrt(const timestruc_t *tsp) hrt = (hrt << 7) - hrt - hrt - hrt; hrt = (hrt << 9) + tsp->tv_nsec; return (hrt); -#endif /* defined(__amd64) || defined(__i386) */ +#endif /* defined(__x86) */ } /* diff --git a/usr/src/uts/common/os/upanic.c b/usr/src/uts/common/os/upanic.c new file mode 100644 index 0000000000..b4d23eeaff --- /dev/null +++ b/usr/src/uts/common/os/upanic.c @@ -0,0 +1,98 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Oxide Computer Company + */ + +#include <sys/proc.h> +#include <c2/audit.h> +#include <sys/procfs.h> +#include <sys/core.h> + +/* + * This function is meant to be a guaranteed abort that generates a core file + * that allows up to 1k of data to enter into an elfnote in the process. This is + * meant to insure that even in the face of other problems, this can get out. + */ + +void +upanic(void *addr, size_t len) +{ + kthread_t *t = curthread; + proc_t *p = curproc; + klwp_t *lwp = ttolwp(t); + uint32_t auditing = AU_AUDITING(); + uint32_t upflag = P_UPF_PANICKED; + void *buf; + int code; + + /* + * Before we worry about the data that the user has as a message, go + * ahead and make sure we try and get all the other threads stopped. + * That'll help us make sure that nothing else is going on and we don't + * lose a race. + */ + mutex_enter(&p->p_lock); + lwp->lwp_cursig = SIGABRT; + mutex_exit(&p->p_lock); + + proc_is_exiting(p); + if (exitlwps(1) != 0) { + mutex_enter(&p->p_lock); + lwp_exit(); + } + + /* + * Copy in the user data. We truncate it to PRUPANIC_BUFLEN no matter + * what and ensure that the last data was set to zero. + */ + if (addr != NULL && len > 0) { + size_t copylen; + + upflag |= P_UPF_HAVEMSG; + + if (len >= PRUPANIC_BUFLEN) { + copylen = PRUPANIC_BUFLEN; + upflag |= P_UPF_TRUNCMSG; + } else { + copylen = len; + } + + buf = kmem_zalloc(PRUPANIC_BUFLEN, KM_SLEEP); + if (copyin(addr, buf, copylen) != 0) { + upflag |= P_UPF_INVALMSG; + upflag &= ~P_UPF_HAVEMSG; + } else { + mutex_enter(&p->p_lock); + ASSERT3P(p->p_upanic, ==, NULL); + p->p_upanic = buf; + mutex_exit(&p->p_lock); + } + } + + mutex_enter(&p->p_lock); + p->p_upanicflag = upflag; + mutex_exit(&p->p_lock); + + /* + * If we're auditing we need to finish the system call itself and then + * begin the core dump. + */ + if (auditing) { + audit_finish(0, SYS_upanic, 0, NULL); + audit_core_start(SIGABRT); + } + code = core(SIGABRT, B_FALSE); + if (auditing) /* audit core dump */ + audit_core_finish(code ? CLD_KILLED : CLD_DUMPED); + exit(code ? CLD_KILLED : CLD_DUMPED, SIGABRT); +} diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index f5ee76a2cb..1df2f479a5 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -18,14 +18,20 @@ * * CDDL HEADER END */ + +/* + * Copyright 2021 Oxide Computer Company + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. + */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2018 Joyent, Inc. 
*/ -/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ -/* All Rights Reserved */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 @@ -60,6 +66,7 @@ #include <sys/mem_cage.h> #include <sys/time.h> #include <sys/zone.h> +#include <sys/stdbool.h> #include <vm/hat.h> #include <vm/as.h> @@ -68,149 +75,275 @@ #include <vm/pvn.h> #include <vm/seg_kmem.h> -static int checkpage(page_t *, int); +/* + * FREE MEMORY MANAGEMENT + * + * Management of the pool of free pages is a tricky business. There are + * several critical threshold values which constrain our allocation of new + * pages and inform the rate of paging out of memory to swap. These threshold + * values, and the behaviour they induce, are described below in descending + * order of size -- and thus increasing order of severity! + * + * +---------------------------------------------------- physmem (all memory) + * | + * | Ordinarily there are no particular constraints placed on page + * v allocation. The page scanner is not running and page_create_va() + * | will effectively grant all page requests (whether from the kernel + * | or from user processes) without artificial delay. + * | + * +------------------------ lotsfree (1.56% of physmem, min. 16MB, max. 2GB) + * | + * | When we have less than "lotsfree" pages, pageout_scanner() is + * v signalled by schedpaging() to begin looking for pages that can + * | be evicted to disk to bring us back above lotsfree. At this + * | stage there is still no constraint on allocation of free pages. + * | + * | For small systems, we set a lower bound of 16MB for lotsfree; + * v this is the natural value for a system with 1GB memory. This is + * | to ensure that the pageout reserve pool contains at least 4MB + * | for use by ZFS. + * | + * | For systems with a large amount of memory, we constrain lotsfree + * | to be at most 2GB (with a pageout reserve of around 0.5GB), as + * v at some point the required slack relates more closely to the + * | rate at which paging can occur than to the total amount of memory. + * | + * +------------------- desfree (1/2 of lotsfree, 0.78% of physmem, min. 8MB) + * | + * | When we drop below desfree, a number of kernel facilities will + * v wait before allocating more memory, under the assumption that + * | pageout or reaping will make progress and free up some memory. + * | This behaviour is not especially coordinated; look for comparisons + * | of desfree and freemem. + * | + * | In addition to various attempts at advisory caution, clock() + * | will wake up the thread that is ordinarily parked in sched(). + * | This routine is responsible for the heavy-handed swapping out + * v of entire processes in an attempt to arrest the slide of free + * | memory. See comments in sched.c for more details. + * | + * +----- minfree & throttlefree (3/4 of desfree, 0.59% of physmem, min. 6MB) + * | + * | These two separate tunables have, by default, the same value. + * v Various parts of the kernel use minfree to signal the need for + * | more aggressive reclamation of memory, and sched() is more + * | aggressive at swapping processes out. + * | + * | If free memory falls below throttlefree, page_create_va() will + * | use page_create_throttle() to begin holding most requests for + * | new pages while pageout and reaping free up memory. Sleeping + * v allocations (e.g., KM_SLEEP) are held here while we wait for + * | more memory. 
Non-sleeping allocations are generally allowed to + * | proceed, unless their priority is explicitly lowered with + * | KM_NORMALPRI (Note: KM_NOSLEEP_LAZY == (KM_NOSLEEP | KM_NORMALPRI).). + * | + * +------- pageout_reserve (3/4 of throttlefree, 0.44% of physmem, min. 4MB) + * | + * | When we hit throttlefree, the situation is already dire. The + * v system is generally paging out memory and swapping out entire + * | processes in order to free up memory for continued operation. + * | + * | Unfortunately, evicting memory to disk generally requires short + * | term use of additional memory; e.g., allocation of buffers for + * | storage drivers, updating maps of free and used blocks, etc. + * | As such, pageout_reserve is the number of pages that we keep in + * | special reserve for use by pageout() and sched() and by any + * v other parts of the kernel that need to be working for those to + * | make forward progress such as the ZFS I/O pipeline. + * | + * | When we are below pageout_reserve, we fail or hold any allocation + * | that has not explicitly requested access to the reserve pool. + * | Access to the reserve is generally granted via the KM_PUSHPAGE + * | flag, or by marking a thread T_PUSHPAGE such that all allocations + * | can implicitly tap the reserve. For more details, see the + * v NOMEMWAIT() macro, the T_PUSHPAGE thread flag, the KM_PUSHPAGE + * | and VM_PUSHPAGE allocation flags, and page_create_throttle(). + * | + * +---------------------------------------------------------- no free memory + * | + * | If we have arrived here, things are very bad indeed. It is + * v surprisingly difficult to tell if this condition is even fatal, + * | as enough memory may have been granted to pageout() and to the + * | ZFS I/O pipeline that requests for eviction that have already been + * | made will complete and free up memory some time soon. + * | + * | If free memory does not materialise, the system generally remains + * | deadlocked. The pageout_deadman() below is run once per second + * | from clock(), seeking to limit the amount of time a single request + * v to page out can be blocked before the system panics to get a crash + * | dump and return to service. + * | + * +------------------------------------------------------------------------- + */ /* * The following parameters control operation of the page replacement - * algorithm. They are initialized to 0, and then computed at boot time - * based on the size of the system. If they are patched non-zero in - * a loaded vmunix they are left alone and may thus be changed per system - * using mdb on the loaded system. + * algorithm. They are initialized to 0, and then computed at boot time based + * on the size of the system; see setupclock(). If they are patched non-zero + * in a loaded vmunix they are left alone and may thus be changed per system + * using "mdb -kw" on the loaded system. */ pgcnt_t slowscan = 0; pgcnt_t fastscan = 0; static pgcnt_t handspreadpages = 0; -static int loopfraction = 2; + +/* + * looppages: + * Cached copy of the total number of pages in the system (total_pages). + * + * loopfraction: + * Divisor used to relate fastscan to looppages in setupclock(). 
+ */ +static uint_t loopfraction = 2; static pgcnt_t looppages; -/* See comment below describing 4% and 80% */ -static int min_percent_cpu = 4; -static int max_percent_cpu = 80; + +static uint_t min_percent_cpu = 4; +static uint_t max_percent_cpu = 80; static pgcnt_t maxfastscan = 0; static pgcnt_t maxslowscan = 100; -pgcnt_t maxpgio = 0; -pgcnt_t minfree = 0; -pgcnt_t desfree = 0; -pgcnt_t lotsfree = 0; -pgcnt_t needfree = 0; -pgcnt_t throttlefree = 0; -pgcnt_t pageout_reserve = 0; +#define MEGABYTES (1024ULL * 1024ULL) + +/* + * pageout_threshold_style: + * set to 1 to use the previous default threshold size calculation; + * i.e., each threshold is half of the next largest value. + */ +uint_t pageout_threshold_style = 0; + +/* + * The operator may override these tunables to request a different minimum or + * maximum lotsfree value, or to change the divisor we use for automatic + * sizing. + * + * By default, we make lotsfree 1/64th of the total memory in the machine. The + * minimum and maximum are specified in bytes, rather than pages; a zero value + * means the default values (below) are used. + */ +uint_t lotsfree_fraction = 64; +pgcnt_t lotsfree_min = 0; +pgcnt_t lotsfree_max = 0; -pgcnt_t deficit; -pgcnt_t nscan; -pgcnt_t desscan; +#define LOTSFREE_MIN_DEFAULT (16 * MEGABYTES) +#define LOTSFREE_MAX_DEFAULT (2048 * MEGABYTES) + +/* + * If these tunables are set to non-zero values in /etc/system, and provided + * the value is not larger than the threshold above, the specified value will + * be used directly without any additional calculation or adjustment. The boot + * time value of these overrides is preserved in the "clockinit" struct. More + * detail is available in the comment at the top of the file. + */ +pgcnt_t maxpgio = 0; +pgcnt_t minfree = 0; +pgcnt_t desfree = 0; +pgcnt_t lotsfree = 0; +pgcnt_t needfree = 0; +pgcnt_t throttlefree = 0; +pgcnt_t pageout_reserve = 0; +pri_t pageout_pri; + +pgcnt_t deficit; +pgcnt_t nscan; +pgcnt_t desscan; /* kstats */ uint64_t low_mem_scan; uint64_t zone_cap_scan; -uint64_t n_throttle; -clock_t zone_pageout_ticks; /* tunable to change zone pagescan ticks */ +#define MAX_PSCAN_THREADS 16 /* - * Values for min_pageout_ticks, max_pageout_ticks and pageout_ticks - * are the number of ticks in each wakeup cycle that gives the - * equivalent of some underlying %CPU duty cycle. - * - * For example, when RATETOSCHEDPAGING is 4 (the default), then schedpaging() - * will run 4 times/sec to update pageout scanning parameters and kickoff - * the pageout_scanner() thread if necessary. + * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and + * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle + * that gives the equivalent of some underlying %CPU duty cycle. * - * Given hz is 100, min_pageout_ticks will be set to 1 (1% of a CPU). When - * pageout_ticks is set to min_pageout_ticks, then the total CPU time consumed - * by the scanner in a 1 second interval is 4% of a CPU (RATETOSCHEDPAGING * 1). + * min_pageout_nsec: + * nanoseconds/wakeup equivalent of min_percent_cpu. * - * Given hz is 100, max_pageout_ticks will be set to 20 (20% of a CPU). When - * pageout_ticks is set to max_pageout_ticks, then the total CPU time consumed - * by the scanner in a 1 second interval is 80% of a CPU - * (RATETOSCHEDPAGING * 20). There is no point making max_pageout_ticks >25 - * since schedpaging() runs RATETOSCHEDPAGING (4) times/sec. + * max_pageout_nsec: + * nanoseconds/wakeup equivalent of max_percent_cpu. 
* - * If hz is 1000, then min_pageout_ticks will be 10 and max_pageout_ticks - * will be 200, so the CPU percentages are the same as when hz is 100. - * - * min_pageout_ticks: - * ticks/wakeup equivalent of min_percent_cpu. - * - * max_pageout_ticks: - * ticks/wakeup equivalent of max_percent_cpu. - * - * pageout_ticks: - * Number of clock ticks budgeted for each wakeup cycle. + * pageout_nsec: + * Number of nanoseconds budgeted for each wakeup cycle. * Computed each time around by schedpaging(). - * Varies between min_pageout_ticks .. max_pageout_ticks, + * Varies between min_pageout_nsec and max_pageout_nsec, * depending on memory pressure or zones over their cap. + * + * zone_pageout_nsec: + * Number of nanoseconds budget for each cycle when a zone + * is over its memory cap. If this is zero, then the value + * of max_pageout_nsec is used instead. */ +static hrtime_t min_pageout_nsec; +static hrtime_t max_pageout_nsec; +static hrtime_t pageout_nsec; +static hrtime_t zone_pageout_nsec; -static clock_t min_pageout_ticks; -static clock_t max_pageout_ticks; -static clock_t pageout_ticks; +static boolean_t reset_hands[MAX_PSCAN_THREADS]; -#define MAX_PSCAN_THREADS 16 -static boolean_t reset_hands[MAX_PSCAN_THREADS]; +#define PAGES_POLL_MASK 1023 +#define SCHEDPAGING_HZ 4 /* - * These can be tuned in /etc/system or set with mdb. - * 'des_page_scanners' is the desired number of page scanner threads. The - * system will bring the actual number of threads into line with the desired - * number. If des_page_scanners is set to an invalid value, the system will - * correct the setting. + * despagescanners: + * The desired number of page scanner threads. The value can be set in + * /etc/system or tuned directly with 'mdb -kw'. The system will bring + * the actual number of threads into line with the desired number. If set + * to an invalid value, the system will correct the setting. */ -uint_t des_page_scanners; -uint_t pageout_reset_cnt = 64; /* num. cycles for pageout_scanner hand reset */ - -uint_t n_page_scanners; -static pgcnt_t pscan_region_sz; /* informational only */ - - -#define PAGES_POLL_MASK 1023 +uint_t despagescanners = 0; /* * pageout_sample_lim: - * The limit on the number of samples needed to establish a value - * for new pageout parameters, fastscan, slowscan, and handspreadpages. + * The limit on the number of samples needed to establish a value for new + * pageout parameters: fastscan, slowscan, pageout_new_spread, and + * handspreadpages. * * pageout_sample_cnt: - * Current sample number. Once the sample gets large enough, - * set new values for handspreadpages, fastscan and slowscan. + * Current sample number. Once the sample gets large enough, set new + * values for handspreadpages, pageout_new_spread, fastscan and slowscan. * * pageout_sample_pages: * The accumulated number of pages scanned during sampling. * * pageout_sample_etime: - * The accumulated number of nanoseconds for the sample. + * The accumulated nanoseconds for the sample. * * pageout_rate: - * Rate in pages/second, computed at the end of sampling. + * Rate in pages/nanosecond, computed at the end of sampling. * * pageout_new_spread: - * The new value to use for maxfastscan and (perhaps) handspreadpages. - * Intended to be the number pages that can be scanned per sec using ~10% - * of a CPU. Calculated after enough samples have been taken. 
- * pageout_rate / 10 + * Initially zero while the system scan rate is measured by + * pageout_scanner(), which then sets this value once per system boot after + * enough samples have been recorded (pageout_sample_cnt). Once set, this + * new value is used for fastscan and handspreadpages. */ - typedef hrtime_t hrrate_t; -static uint_t pageout_sample_lim = 4; -static uint_t pageout_sample_cnt = 0; +static uint64_t pageout_sample_lim = 4; +static uint64_t pageout_sample_cnt = 0; static pgcnt_t pageout_sample_pages = 0; +static hrtime_t pageout_sample_etime = 0; static hrrate_t pageout_rate = 0; static pgcnt_t pageout_new_spread = 0; -static hrtime_t pageout_sample_etime = 0; - -/* True if page scanner is first starting up */ +/* True if the page scanner is first starting up */ #define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim) +/* The current number of page scanner threads */ +static uint_t n_page_scanners = 1; +/* The number of page scanner threads that are actively scanning. */ +static uint_t pageouts_running; + /* - * Record number of times a pageout_scanner wakeup cycle finished because it + * Record number of times a pageout_scanner() wakeup cycle finished because it * timed out (exceeded its CPU budget), rather than because it visited * its budgeted number of pages. This is only done when scanning under low * free memory conditions, not when scanning for zones over their cap. */ -uint64_t pageout_timeouts = 0; +uint64_t pageout_timeouts = 0; #ifdef VM_STATS static struct pageoutvmstats_str { @@ -225,10 +358,57 @@ static struct pageoutvmstats_str { kmutex_t memavail_lock; kcondvar_t memavail_cv; -/* - * The size of the clock loop. - */ -#define LOOPPAGES total_pages +typedef enum pageout_hand { + POH_FRONT = 1, + POH_BACK, +} pageout_hand_t; + +typedef enum { + CKP_INELIGIBLE, + CKP_NOT_FREED, + CKP_FREED, +} checkpage_result_t; + +static checkpage_result_t checkpage(page_t *, pageout_hand_t); + +static struct clockinit { + bool ci_init; + pgcnt_t ci_lotsfree_min; + pgcnt_t ci_lotsfree_max; + pgcnt_t ci_lotsfree; + pgcnt_t ci_desfree; + pgcnt_t ci_minfree; + pgcnt_t ci_throttlefree; + pgcnt_t ci_pageout_reserve; + pgcnt_t ci_maxpgio; + pgcnt_t ci_maxfastscan; + pgcnt_t ci_fastscan; + pgcnt_t ci_slowscan; + pgcnt_t ci_handspreadpages; + uint_t ci_despagescanners; +} clockinit = { .ci_init = false }; + +static inline pgcnt_t +clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum) +{ + if (value < minimum) { + return (minimum); + } else if (value > maximum) { + return (maximum); + } else { + return (value); + } +} + +static pgcnt_t +tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval) +{ + if (initval == 0 || initval >= initval_ceiling) { + return (defval); + } else { + return (initval); + } +} /* * Local boolean to control scanning when zones are over their cap. Avoids @@ -242,108 +422,145 @@ kcondvar_t memavail_cv; static boolean_t zones_over = B_FALSE; /* - * Set up the paging constants for the page scanner clock-hand algorithm. - * Called at startup after the system is initialized and the amount of memory - * and number of paging devices is known (recalc will be 0). Called again once - * PAGE_SCAN_STARTUP is true after the scanner has collected enough samples - * (recalc will be 1). - * - * Will also be called after a memory dynamic reconfiguration operation and - * recalc will be 1 in those cases too. + * On large memory systems, multiple instances of the page scanner are run, + * each responsible for a separate region of memory. 
This speeds up page + * invalidation under low memory conditions. * - * lotsfree is 1/64 of memory, but at least 512K (ha!). - * desfree is 1/2 of lotsfree. - * minfree is 1/2 of desfree. + * despagescanners can be set in /etc/system or via mdb and it will + * be used as a guide for how many page scanners to create; the value + * will be adjusted if it is not sensible. Otherwise, the number of + * page scanners is determined dynamically based on handspreadpages. */ -void -setupclock(int recalc) +static void +recalc_pagescanners(void) { - uint_t i; - pgcnt_t sz, tmp; + pgcnt_t sz; + uint_t des; - static spgcnt_t init_lfree, init_dfree, init_mfree; - static spgcnt_t init_tfree, init_preserve, init_mpgio; - static spgcnt_t init_mfscan, init_fscan, init_sscan, init_hspages; - - looppages = LOOPPAGES; + /* If the initial calibration has not been done, take no action. */ + if (pageout_new_spread == 0) + return; /* - * setupclock can be called to recalculate the paging - * parameters in the case of dynamic reconfiguration of memory. - * So to make sure we make the proper calculations, if such a - * situation should arise, we save away the initial values - * of each parameter so we can recall them when needed. This - * way we don't lose the settings an admin might have made - * through the /etc/system file. + * If the desired number of scanners is set in /etc/system + * then try to use it. */ + if (despagescanners == 0 && clockinit.ci_despagescanners != 0) + despagescanners = clockinit.ci_despagescanners; - if (!recalc) { - init_lfree = lotsfree; - init_dfree = desfree; - init_mfree = minfree; - init_tfree = throttlefree; - init_preserve = pageout_reserve; - init_mpgio = maxpgio; - init_mfscan = maxfastscan; - init_fscan = fastscan; - init_sscan = slowscan; - init_hspages = handspreadpages; + if (despagescanners != 0) { + /* + * We have a desired number of page scanners, either from + * /etc/system or set via mdb. Try and use it (it will be + * clamped below). + */ + des = despagescanners; + } else { + /* + * Calculate the number of desired scanners based on the + * system's memory size. + * + * A 64GiB region size is used as the basis for calculating how + * many scanner threads should be created. For systems with up + * to 64GiB of RAM, a single thread is used; for very large + * memory systems the threads are limited to MAX_PSCAN_THREADS. + */ + sz = btop(64ULL << 30); + + if (sz > looppages) { + des = 1; + } else { + pgcnt_t tmp = sz; + + for (des = 1; tmp < looppages; des++) + tmp += sz; + } } /* - * Set up thresholds for paging: + * clamp the number of scanners so that we are under MAX_PSCAN_THREADS + * and so that each scanner covers at least 10% more than + * handspreadpages. */ + des = clamp(des, 1, + looppages / (handspreadpages + handspreadpages / 10)); + despagescanners = clamp(des, 1, MAX_PSCAN_THREADS); +} - /* - * Lotsfree is threshold where paging daemon turns on. - */ - if (init_lfree == 0 || init_lfree >= looppages) - lotsfree = MAX(looppages / 64, btop(512 * 1024)); - else - lotsfree = init_lfree; +/* + * Set up the paging constants for the clock algorithm used by + * pageout_scanner(), and by the virtual memory system overall. See the + * comments at the top of this file for more information about the threshold + * values and system responses to memory pressure. + * + * This routine is called once by main() at startup, after the initial size of + * physical memory is determined. 
It may be called again later if memory is + * added to or removed from the system, or if new measurements of the page scan + * rate become available. + */ +void +setupclock(void) +{ + bool half = (pageout_threshold_style == 1); + bool recalc = true; - /* - * Desfree is amount of memory desired free. - * If less than this for extended period, start swapping. - */ - if (init_dfree == 0 || init_dfree >= lotsfree) - desfree = lotsfree / 2; - else - desfree = init_dfree; + looppages = total_pages; /* - * Minfree is minimal amount of free memory which is tolerable. + * The operator may have provided specific values for some of the + * tunables via /etc/system. On our first call, we preserve those + * values so that they can be used for subsequent recalculations. + * + * A value of zero for any tunable means we will use the default + * sizing. */ - if (init_mfree == 0 || init_mfree >= desfree) - minfree = desfree / 2; - else - minfree = init_mfree; + if (!clockinit.ci_init) { + clockinit.ci_init = true; + + clockinit.ci_lotsfree_min = lotsfree_min; + clockinit.ci_lotsfree_max = lotsfree_max; + clockinit.ci_lotsfree = lotsfree; + clockinit.ci_desfree = desfree; + clockinit.ci_minfree = minfree; + clockinit.ci_throttlefree = throttlefree; + clockinit.ci_pageout_reserve = pageout_reserve; + clockinit.ci_maxpgio = maxpgio; + clockinit.ci_maxfastscan = maxfastscan; + clockinit.ci_fastscan = fastscan; + clockinit.ci_slowscan = slowscan; + clockinit.ci_handspreadpages = handspreadpages; + clockinit.ci_despagescanners = despagescanners; - /* - * Throttlefree is the point at which we start throttling - * PG_WAIT requests until enough memory becomes available. - */ - if (init_tfree == 0 || init_tfree >= desfree) - throttlefree = minfree; - else - throttlefree = init_tfree; + /* + * The first call does not trigger a recalculation, only + * subsequent calls. + */ + recalc = false; + } /* - * Pageout_reserve is the number of pages that we keep in - * stock for pageout's own use. Having a few such pages - * provides insurance against system deadlock due to - * pageout needing pages. When freemem < pageout_reserve, - * non-blocking allocations are denied to any threads - * other than pageout and sched. (At some point we might - * want to consider a per-thread flag like T_PUSHING_PAGES - * to indicate that a thread is part of the page-pushing - * dance (e.g. an interrupt thread) and thus is entitled - * to the same special dispensation we accord pageout.) + * Configure paging threshold values. For more details on what each + * threshold signifies, see the comments at the top of this file. */ - if (init_preserve == 0 || init_preserve >= throttlefree) - pageout_reserve = throttlefree / 2; - else - pageout_reserve = init_preserve; + lotsfree_max = tune(clockinit.ci_lotsfree_max, looppages, + btop(LOTSFREE_MAX_DEFAULT)); + lotsfree_min = tune(clockinit.ci_lotsfree_min, lotsfree_max, + btop(LOTSFREE_MIN_DEFAULT)); + + lotsfree = tune(clockinit.ci_lotsfree, looppages, + clamp(looppages / lotsfree_fraction, lotsfree_min, lotsfree_max)); + + desfree = tune(clockinit.ci_desfree, lotsfree, + lotsfree / 2); + + minfree = tune(clockinit.ci_minfree, desfree, + half ? desfree / 2 : 3 * desfree / 4); + + throttlefree = tune(clockinit.ci_throttlefree, desfree, + minfree); + + pageout_reserve = tune(clockinit.ci_pageout_reserve, throttlefree, + half ? throttlefree / 2 : 3 * throttlefree / 4); /* * Maxpgio thresholds how much paging is acceptable. 
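As a rough illustration of the default threshold sizing in the hunk above, the following standalone userland sketch (not part of the patch) mirrors the clamp()/tune() arithmetic. The 16 GiB memory size, the 4 KiB page size, and the little harness itself (MB, EX_PAGESIZE, BTOP, main) are assumptions chosen only for the example; the 1/64 fraction, the 16 MB/2 GB bounds, and the 1/2 and 3/4 ratios come from the code above (with pageout_threshold_style left at its default of 0).

#include <stdio.h>
#include <stdint.h>

#define	MB		(1024ULL * 1024ULL)
#define	EX_PAGESIZE	4096ULL			/* assumed page size */
#define	BTOP(b)		((b) / EX_PAGESIZE)	/* bytes to pages */

static uint64_t
clamp(uint64_t value, uint64_t minimum, uint64_t maximum)
{
	if (value < minimum)
		return (minimum);
	if (value > maximum)
		return (maximum);
	return (value);
}

int
main(void)
{
	uint64_t physmem = BTOP(16ULL * 1024 * MB);	/* pages in 16 GiB */
	uint64_t lotsfree_min = BTOP(16 * MB);		/* LOTSFREE_MIN_DEFAULT */
	uint64_t lotsfree_max = BTOP(2048 * MB);	/* LOTSFREE_MAX_DEFAULT */
	uint64_t lotsfree, desfree, minfree, throttlefree, pageout_reserve;

	/* lotsfree is 1/64 of memory, clamped to [16 MB, 2 GB]. */
	lotsfree = clamp(physmem / 64, lotsfree_min, lotsfree_max);
	desfree = lotsfree / 2;			/* 1/2 of lotsfree */
	minfree = 3 * desfree / 4;		/* 3/4 of desfree */
	throttlefree = minfree;			/* defaults to minfree */
	pageout_reserve = 3 * throttlefree / 4;	/* 3/4 of throttlefree */

	printf("lotsfree=%llu desfree=%llu minfree=%llu "
	    "throttlefree=%llu pageout_reserve=%llu (pages)\n",
	    (unsigned long long)lotsfree, (unsigned long long)desfree,
	    (unsigned long long)minfree, (unsigned long long)throttlefree,
	    (unsigned long long)pageout_reserve);
	return (0);
}

For the assumed 16 GiB, 4 KiB-page configuration this prints lotsfree=65536 (256 MB), desfree=32768, minfree=24576, throttlefree=24576 and pageout_reserve=18432 pages, which lines up with the approximate percentages (1.56%, 0.78%, 0.59%, 0.44% of physmem) quoted in the block comment at the top of the file.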
@@ -352,143 +569,160 @@ setupclock(int recalc) * * XXX - Does not account for multiple swap devices. */ - if (init_mpgio == 0) + if (clockinit.ci_maxpgio == 0) { maxpgio = (DISKRPM * 2) / 3; - else - maxpgio = init_mpgio; + } else { + maxpgio = clockinit.ci_maxpgio; + } /* - * When the system is in a low memory state, the page scan rate varies - * between fastscan and slowscan based on the amount of free memory - * available. When only zones are over their memory cap, the scan rate - * is always fastscan. - * - * The fastscan rate should be set based on the number pages that can - * be scanned per sec using ~10% of a CPU. Since this value depends on - * the processor, MMU, Ghz etc., it must be determined dynamically. - * - * When the scanner first starts up, fastscan will be set to 0 and - * maxfastscan will be set to MAXHANDSPREADPAGES (64MB, in pages). - * However, once the scanner has collected enough samples, then fastscan - * is set to be the smaller of 1/2 of memory (looppages / loopfraction) - * or maxfastscan (which is set from pageout_new_spread). Thus, - * MAXHANDSPREADPAGES is irrelevant after the scanner is fully - * initialized. - * - * pageout_new_spread is calculated when the scanner first starts - * running. During this initial sampling period the nscan_limit - * is set to the total_pages of system memory. Thus, the scanner could - * theoretically scan all of memory in one pass. However, each sample - * is also limited by the %CPU budget. This is controlled by - * pageout_ticks which is set in schedpaging(). During the sampling - * period, pageout_ticks is set to max_pageout_ticks. This tick value - * is derived from the max_percent_cpu (80%) described above. On a - * system with more than a small amount of memory (~8GB), the scanner's - * %CPU will be the limiting factor in calculating pageout_new_spread. - * - * At the end of the sampling period, the pageout_rate indicates how - * many pages could be scanned per second. The pageout_new_spread is - * then set to be 1/10th of that (i.e. approximating 10% of a CPU). - * Of course, this value could still be more than the physical memory - * on the system. If so, fastscan is set to 1/2 of memory, as - * mentioned above. + * The clock scan rate varies between fastscan and slowscan + * based on the amount of free memory available. Fastscan + * rate should be set based on the number pages that can be + * scanned per sec using ~10% of processor time. Since this + * value depends on the processor, MMU, Mhz etc., it is + * difficult to determine it in a generic manner for all + * architectures. * - * All of this leads up to the setting of handspreadpages, which is - * set to fastscan. This is the distance, in pages, between the front - * and back hands during scanning. It will dictate which pages will - * be considered "hot" on the backhand and which pages will be "cold" - * and reclaimed + * Instead of trying to determine the number of pages scanned + * per sec for every processor, fastscan is set to be the smaller + * of 1/2 of memory or MAXHANDSPREADPAGES and the sampling + * time is limited to ~4% of processor time. * - * If the scanner is limited by desscan, then at the highest rate it - * will scan up to fastscan/RATETOSCHEDPAGING pages per cycle. If the - * scanner is limited by the %CPU, then at the highest rate (20% of a - * CPU per cycle) the number of pages scanned could be much less. + * Setting fastscan to be 1/2 of memory allows pageout to scan + * all of memory in ~2 secs. 
This implies that user pages not + * accessed within 1 sec (assuming, handspreadpages == fastscan) + * can be reclaimed when free memory is very low. Stealing pages + * not accessed within 1 sec seems reasonable and ensures that + * active user processes don't thrash. * - * Thus, if the scanner is limited by desscan, then the handspreadpages - * setting means 1sec between the front and back hands, but if the - * scanner is limited by %CPU, it could be several seconds between the - * two hands. + * Smaller values of fastscan result in scanning fewer pages + * every second and consequently pageout may not be able to free + * sufficient memory to maintain the minimum threshold. Larger + * values of fastscan result in scanning a lot more pages which + * could lead to thrashing and higher CPU usage. * - * The basic assumption is that at the worst case, stealing pages - * not accessed within 1 sec seems reasonable and ensures that active - * user processes don't thrash. This is especially true when the system - * is in a low memory state. + * Fastscan needs to be limited to a maximum value and should not + * scale with memory to prevent pageout from consuming too much + * time for scanning on slow CPU's and avoid thrashing, as a + * result of scanning too many pages, on faster CPU's. + * The value of 64 Meg was chosen for MAXHANDSPREADPAGES + * (the upper bound for fastscan) based on the average number + * of pages that can potentially be scanned in ~1 sec (using ~4% + * of the CPU) on some of the following machines that currently + * run Solaris 2.x: * - * There are some additional factors to consider for the case of - * scanning when zones are over their cap. In this situation it is - * also likely that the machine will have a large physical memory which - * will take many seconds to fully scan (due to the %CPU and desscan - * limits per cycle). It is probable that there will be few (or 0) - * pages attributed to these zones in any single scanning cycle. The - * result is that reclaiming enough pages for these zones might take - * several additional seconds (this is generally not a problem since - * the zone physical cap is just a soft cap). + * average memory scanned in ~1 sec * - * This is similar to the typical multi-processor situation in which - * pageout is often unable to maintain the minimum paging thresholds - * under heavy load due to the fact that user processes running on - * other CPU's can be dirtying memory at a much faster pace than - * pageout can find pages to free. + * 25 Mhz SS1+: 23 Meg + * LX: 37 Meg + * 50 Mhz SC2000: 68 Meg * - * One potential approach to address both of these cases is to enable - * more than one CPU to run the page scanner, in such a manner that the - * various clock hands don't overlap. However, this also makes it more - * difficult to determine the values for fastscan, slowscan and - * handspreadpages. This is left as a future enhancement, if necessary. + * 40 Mhz 486: 26 Meg + * 66 Mhz 486: 42 Meg * - * When free memory falls just below lotsfree, the scan rate goes from - * 0 to slowscan (i.e., the page scanner starts running). This + * When free memory falls just below lotsfree, the scan rate + * goes from 0 to slowscan (i.e., pageout starts running). This * transition needs to be smooth and is achieved by ensuring that * pageout scans a small number of pages to satisfy the transient * memory demand. This is set to not exceed 100 pages/sec (25 per * wakeup) since scanning that many pages has no noticible impact * on system performance. 
* - * The swapper is currently used to free up memory when pageout is - * unable to meet memory demands. It does this by swapping out entire - * processes. In addition to freeing up memory, swapping also reduces - * the demand for memory because the swapped out processes cannot - * run, and thereby consume memory. However, this is a pathological - * state and performance will generally be considered unacceptable. + * In addition to setting fastscan and slowscan, pageout is + * limited to using ~4% of the CPU. This results in increasing + * the time taken to scan all of memory, which in turn means that + * user processes have a better opportunity of preventing their + * pages from being stolen. This has a positive effect on + * interactive and overall system performance when memory demand + * is high. + * + * Thus, the rate at which pages are scanned for replacement will + * vary linearly between slowscan and the number of pages that + * can be scanned using ~4% of processor time instead of varying + * linearly between slowscan and fastscan. + * + * Also, the processor time used by pageout will vary from ~1% + * at slowscan to ~4% at fastscan instead of varying between + * ~1% at slowscan and ~10% at fastscan. + * + * The values chosen for the various VM parameters (fastscan, + * handspreadpages, etc) are not universally true for all machines, + * but appear to be a good rule of thumb for the machines we've + * tested. They have the following ranges: + * + * cpu speed: 20 to 70 Mhz + * page size: 4K to 8K + * memory size: 16M to 5G + * page scan rate: 4000 - 17400 4K pages per sec + * + * The values need to be re-examined for machines which don't + * fall into the various ranges (e.g., slower or faster CPUs, + * smaller or larger pagesizes etc) shown above. + * + * On an MP machine, pageout is often unable to maintain the + * minimum paging thresholds under heavy load. This is due to + * the fact that user processes running on other CPU's can be + * dirtying memory at a much faster pace than pageout can find + * pages to free. The memory demands could be met by enabling + * more than one CPU to run the clock algorithm in such a manner + * that the various clock hands don't overlap. This also makes + * it more difficult to determine the values for fastscan, slowscan + * and handspreadpages. + * + * The swapper is currently used to free up memory when pageout + * is unable to meet memory demands by swapping out processes. + * In addition to freeing up memory, swapping also reduces the + * demand for memory by preventing user processes from running + * and thereby consuming memory. */ - if (init_mfscan == 0) { - if (pageout_new_spread != 0) + if (clockinit.ci_maxfastscan == 0) { + if (pageout_new_spread != 0) { maxfastscan = pageout_new_spread; - else + } else { maxfastscan = MAXHANDSPREADPAGES; + } } else { - maxfastscan = init_mfscan; + maxfastscan = clockinit.ci_maxfastscan; } - if (init_fscan == 0) { + + if (clockinit.ci_fastscan == 0) { fastscan = MIN(looppages / loopfraction, maxfastscan); } else { - fastscan = init_fscan; - if (fastscan > looppages / loopfraction) - fastscan = looppages / loopfraction; + fastscan = clockinit.ci_fastscan; + } + + if (fastscan > looppages / loopfraction) { + fastscan = looppages / loopfraction; } /* * Set slow scan time to 1/10 the fast scan time, but * not to exceed maxslowscan. 
*/ - if (init_sscan == 0) + if (clockinit.ci_slowscan == 0) { slowscan = MIN(fastscan / 10, maxslowscan); - else - slowscan = init_sscan; - if (slowscan > fastscan / 2) + } else { + slowscan = clockinit.ci_slowscan; + } + + if (slowscan > fastscan / 2) { slowscan = fastscan / 2; + } /* - * Handspreadpages is distance (in pages) between front and back + * Handspreadpages is the distance (in pages) between front and back * pageout daemon hands. The amount of time to reclaim a page * once pageout examines it increases with this distance and * decreases as the scan rate rises. It must be < the amount * of pageable memory. * - * Since pageout is limited to the %CPU per cycle, setting - * handspreadpages to be "fastscan" results in the front hand being - * a few secs (varies based on the processor speed) ahead of the back - * hand at fastscan rates. + * Since pageout is limited to ~4% of the CPU, setting handspreadpages + * to be "fastscan" results in the front hand being a few secs + * (varies based on the processor speed) ahead of the back hand + * at fastscan rates. This distance can be further reduced, if + * necessary, by increasing the processor time used by pageout + * to be more than ~4% and preferably not more than ~10%. * * As a result, user processes have a much better chance of * referencing their pages before the back hand examines them. @@ -496,91 +730,62 @@ setupclock(int recalc) * the freelist since pageout does not end up freeing pages which * may be referenced a sec later. */ - if (init_hspages == 0) + if (clockinit.ci_handspreadpages == 0) { handspreadpages = fastscan; - else - handspreadpages = init_hspages; + } else { + handspreadpages = clockinit.ci_handspreadpages; + } /* * Make sure that back hand follows front hand by at least - * 1/RATETOSCHEDPAGING seconds. Without this test, it is possible - * for the back hand to look at a page during the same wakeup of - * the pageout daemon in which the front hand cleared its ref bit. + * 1/SCHEDPAGING_HZ seconds. Without this test, it is possible for the + * back hand to look at a page during the same wakeup of the pageout + * daemon in which the front hand cleared its ref bit. */ - if (handspreadpages >= looppages) + if (handspreadpages >= looppages) { handspreadpages = looppages - 1; - - if (recalc == 0) { - /* - * Setup basic values at initialization. - */ - pscan_region_sz = total_pages; - des_page_scanners = n_page_scanners = 1; - reset_hands[0] = B_TRUE; - return; } /* - * Recalculating - * - * We originally set the number of page scanners to 1. Now that we - * know what the handspreadpages is for a scanner, figure out how many - * scanners we should run. We want to ensure that the regions don't - * overlap and that they are not touching. - * - * A default 64GB region size is used as the initial value to calculate - * how many scanner threads we should create on lower memory systems. - * The idea is to limit the number of threads to a practical value - * (e.g. a 64GB machine really only needs one scanner thread). For very - * large memory systems, we limit ourselves to MAX_PSCAN_THREADS - * threads. - * - * The scanner threads themselves are evenly spread out around the - * memory "clock" in pageout_scanner when we reset the hands, and each - * thread will scan all of memory. + * Establish the minimum and maximum length of time to be spent + * scanning pages per wakeup, limiting the scanner duty cycle. The
The + * input percentage values (0-100) must be converted to a fraction of + * the number of nanoseconds in a second of wall time, then further + * scaled down by the number of scanner wakeups in a second. */ - sz = (btop(64ULL * 0x40000000ULL)); - if (sz < handspreadpages) { - /* - * 64GB is smaller than the separation between the front - * and back hands; use double handspreadpages. - */ - sz = handspreadpages << 1; - } - if (sz > total_pages) { - sz = total_pages; - } - /* Record region size for inspection with mdb, otherwise unused */ - pscan_region_sz = sz; + min_pageout_nsec = MAX(1, + NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ); + max_pageout_nsec = MAX(min_pageout_nsec, + NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ); - tmp = sz; - for (i = 1; tmp < total_pages; i++) { - tmp += sz; - } + /* + * If not called for recalculation, return and skip the remaining + * steps. + */ + if (!recalc) + return; - if (i > MAX_PSCAN_THREADS) - i = MAX_PSCAN_THREADS; + /* + * Set a flag to re-evaluate the clock hand positions. + */ + for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; - des_page_scanners = i; + recalc_pagescanners(); } /* * Pageout scheduling. * * Schedpaging controls the rate at which the page out daemon runs by - * setting the global variables pageout_ticks and desscan RATETOSCHEDPAGING - * times a second. The pageout_ticks variable controls the percent of one - * CPU that each page scanner thread should consume (see min_percent_cpu - * and max_percent_cpu descriptions). The desscan variable records the number - * of pages pageout should examine in its next pass; schedpaging sets this - * value based on the amount of currently available memory. In addtition, the - * nscan variable records the number of pages pageout has examined in its - * current pass; schedpaging resets this value to zero each time it runs. + * setting the global variables nscan and desscan SCHEDPAGING_HZ + * times a second. Nscan records the number of pages pageout has examined + * in its current pass; schedpaging() resets this value to zero each time + * it runs. Desscan records the number of pages pageout should examine + * in its next pass; schedpaging() sets this value based on the amount of + * currently available memory. */ -#define RATETOSCHEDPAGING 4 /* times/second */ - -/* held while pageout_scanner or schedpaging are modifying shared data */ static kmutex_t pageout_mutex; /* @@ -592,7 +797,24 @@ static struct async_reqs *push_list; /* pending reqs */ static kmutex_t push_lock; /* protects req pool */ static kcondvar_t push_cv; -static int async_list_size = 256; /* number of async request structs */ +/* + * If pageout() is stuck on a single push for this many seconds, + * pageout_deadman() will assume the system has hit a memory deadlock. If set + * to 0, the deadman will have no effect. + * + * Note that we are only looking for stalls in the calls that pageout() makes + * to VOP_PUTPAGE(). These calls are merely asynchronous requests for paging + * I/O, which should not take long unless the underlying strategy call blocks + * indefinitely for memory. The actual I/O request happens (or fails) later. 
+ */ +uint_t pageout_deadman_seconds = 90; + +static uint_t pageout_stucktime = 0; +static bool pageout_pushing = false; +static uint64_t pageout_pushcount = 0; +static uint64_t pageout_pushcount_seen = 0; + +static int async_list_size = 8192; /* number of async request structs */ static void pageout_scanner(void *); @@ -623,153 +845,142 @@ schedpaging(void *arg) if (kcage_on && (kcage_freemem < kcage_desfree || kcage_needfree)) kcage_cageout_wakeup(); - (void) atomic_swap_ulong(&nscan, 0); - vavail = freemem - deficit; - if (pageout_new_spread != 0) - vavail -= needfree; - if (vavail < 0) - vavail = 0; - if (vavail > lotsfree) - vavail = lotsfree; + if (mutex_tryenter(&pageout_mutex)) { - /* - * Fix for 1161438 (CRS SPR# 73922). All variables - * in the original calculation for desscan were 32 bit signed - * ints. As freemem approaches 0x0 on a system with 1 Gig or - * more of memory, the calculation can overflow. When this - * happens, desscan becomes negative and pageout_scanner() - * stops paging out. - */ - if ((needfree) && (pageout_new_spread == 0)) { - /* - * If we've not yet collected enough samples to - * calculate a spread, kick into high gear anytime - * needfree is non-zero. Note that desscan will not be - * the limiting factor for systems with larger memory; - * the %CPU will limit the scan. That will also be - * maxed out below. - */ - desscan = fastscan / RATETOSCHEDPAGING; - } else { - /* - * Once we've calculated a spread based on system - * memory and usage, just treat needfree as another - * form of deficit. - */ - spgcnt_t faststmp, slowstmp, result; + if (pageouts_running != 0) + goto out; - slowstmp = slowscan * vavail; - faststmp = fastscan * (lotsfree - vavail); - result = (slowstmp + faststmp) / - nz(lotsfree) / RATETOSCHEDPAGING; - desscan = (pgcnt_t)result; - } + /* No pageout scanner threads running. */ + nscan = 0; + vavail = freemem - deficit; + if (pageout_new_spread != 0) + vavail -= needfree; + vavail = clamp(vavail, 0, lotsfree); - /* - * If we've not yet collected enough samples to calculate a - * spread, also kick %CPU to the max. - */ - if (pageout_new_spread == 0) { - pageout_ticks = max_pageout_ticks; - } else { - pageout_ticks = min_pageout_ticks + - (lotsfree - vavail) * - (max_pageout_ticks - min_pageout_ticks) / - nz(lotsfree); - } + if (needfree > 0 && pageout_new_spread == 0) { + /* + * If we've not yet collected enough samples to + * calculate a spread, use the old logic of kicking + * into high gear anytime needfree is non-zero. + */ + desscan = fastscan / SCHEDPAGING_HZ; + } else { + /* + * Once we've calculated a spread based on system + * memory and usage, just treat needfree as another + * form of deficit. + */ + spgcnt_t faststmp, slowstmp, result; - if (pageout_new_spread != 0 && des_page_scanners != n_page_scanners) { - /* - * We have finished the pagescan initialization and the desired - * number of page scanners has changed, either because - * initialization just finished, because of a memory DR, or - * because des_page_scanners has been modified on the fly (i.e. - * by mdb). If we need more scanners, start them now, otherwise - * the excess scanners will terminate on their own when they - * reset their hands. 
- */ - uint_t i; - uint_t curr_nscan = n_page_scanners; - pgcnt_t max = total_pages / handspreadpages; + slowstmp = slowscan * vavail; + faststmp = fastscan * (lotsfree - vavail); + result = (slowstmp + faststmp) / + nz(lotsfree) / SCHEDPAGING_HZ; + desscan = (pgcnt_t)result; + } - if (des_page_scanners > max) - des_page_scanners = max; + pageout_nsec = min_pageout_nsec + (lotsfree - vavail) * + (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree); - if (des_page_scanners > MAX_PSCAN_THREADS) { - des_page_scanners = MAX_PSCAN_THREADS; - } else if (des_page_scanners == 0) { - des_page_scanners = 1; - } + DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t, + pageout_nsec); - /* - * Each thread has its own entry in the reset_hands array, so - * we don't need any locking in pageout_scanner to check the - * thread's reset_hands entry. Thus, we use a pre-allocated - * fixed size reset_hands array and upper limit on the number - * of pagescan threads. - * - * The reset_hands entries need to be true before we start new - * scanners, but if we're reducing, we don't want a race on the - * recalculation for the existing threads, so we set - * n_page_scanners first. - */ - n_page_scanners = des_page_scanners; - for (i = 0; i < MAX_PSCAN_THREADS; i++) { - reset_hands[i] = B_TRUE; - } + if (pageout_new_spread != 0 && despagescanners != 0 && + despagescanners != n_page_scanners) { + /* + * We have finished the pagescan initialisation and the + * desired number of page scanners has changed, either + * because initialisation just finished, because of a + * memory DR, or because despagescanners has been + * modified on the fly (i.e. by mdb). + */ + uint_t i, curr_nscan = n_page_scanners; + + /* Re-validate despagescanners */ + recalc_pagescanners(); + + n_page_scanners = despagescanners; + + for (i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; + + /* If we need more scanners, start them now. */ + if (n_page_scanners > curr_nscan) { + for (i = curr_nscan; i < n_page_scanners; i++) { + (void) lwp_kernel_create(proc_pageout, + pageout_scanner, + (void *)(uintptr_t)i, TS_RUN, + pageout_pri); + } + } - if (des_page_scanners > curr_nscan) { - /* Create additional pageout scanner threads. */ - for (i = curr_nscan; i < des_page_scanners; i++) { - (void) lwp_kernel_create(proc_pageout, - pageout_scanner, (void *)(uintptr_t)i, - TS_RUN, curthread->t_pri); + /* + * If the number of scanners has decreased, trigger a + * wakeup so that the excess threads will terminate. + */ + if (n_page_scanners < curr_nscan) { + WAKE_PAGEOUT_SCANNER(); } } - } - - zones_over = B_FALSE; - - if (freemem < lotsfree + needfree || PAGE_SCAN_STARTUP) { - if (!PAGE_SCAN_STARTUP) - low_mem_scan++; - DTRACE_PROBE(schedpage__wake__low); - WAKE_PAGEOUT_SCANNER(); - - } else if (zone_num_over_cap > 0) { - /* One or more zones are over their cap. */ - /* No page limit */ - desscan = total_pages; + zones_over = B_FALSE; - /* - * Increase the scanning CPU% to the max. This implies - * 80% of one CPU/sec if the scanner can run each - * opportunity. Can also be tuned via setting - * zone_pageout_ticks in /etc/system or with mdb. - */ - pageout_ticks = (zone_pageout_ticks != 0) ? - zone_pageout_ticks : max_pageout_ticks; + if (PAGE_SCAN_STARTUP) { + /* + * We still need to measure the rate at which the + * system is able to scan pages of memory. Each of + * these initial samples is a scan of as much system + * memory as practical, regardless of whether or not we + * are experiencing memory pressure. 
*/ + desscan = total_pages; + pageout_nsec = max_pageout_nsec; - zones_over = B_TRUE; - zone_cap_scan++; + DTRACE_PROBE(schedpage__wake__sample); + WAKE_PAGEOUT_SCANNER(); + } else if (freemem < lotsfree + needfree) { + /* + * We need more memory. + */ + low_mem_scan++; - DTRACE_PROBE(schedpage__wake__zone); - WAKE_PAGEOUT_SCANNER(); + DTRACE_PROBE(schedpage__wake__low); + WAKE_PAGEOUT_SCANNER(); + } else if (zone_num_over_cap > 0) { + /* + * One or more zones are over their cap. + */ - } else { - /* - * There are enough free pages, no need to - * kick the scanner thread. And next time - * around, keep more of the `highly shared' - * pages. - */ - cv_signal_pageout(); + /* No page limit */ + desscan = total_pages; - mutex_enter(&pageout_mutex); - if (po_share > MIN_PO_SHARE) { - po_share >>= 1; + /* + * Increase the scanning CPU% to the max. This implies + * 80% of one CPU/sec if the scanner can run each + * opportunity. Can also be tuned via setting + * zone_pageout_nsec in /etc/system or with mdb. + */ + pageout_nsec = (zone_pageout_nsec != 0) ? + zone_pageout_nsec : max_pageout_nsec; + + zones_over = B_TRUE; + zone_cap_scan++; + + DTRACE_PROBE(schedpage__wake__zone); + WAKE_PAGEOUT_SCANNER(); + } else { + /* + * There are enough free pages, no need to + * kick the scanner thread. And next time + * around, keep more of the `highly shared' + * pages. + */ + cv_signal_pageout(); + if (po_share > MIN_PO_SHARE) { + po_share >>= 1; + } } +out: mutex_exit(&pageout_mutex); } @@ -782,61 +993,55 @@ schedpaging(void *arg) if (kmem_avail() > 0) cv_broadcast(&memavail_cv); - (void) timeout(schedpaging, arg, hz / RATETOSCHEDPAGING); + (void) timeout(schedpaging, arg, hz / SCHEDPAGING_HZ); } pgcnt_t pushes; ulong_t push_list_size; /* # of requests on pageout queue */ -#define FRONT 1 -#define BACK 2 - -int dopageout = 1; /* /etc/system tunable to disable page reclamation */ +/* + * Paging out should always be enabled. This tunable exists to hold pageout + * for debugging purposes. If set to 0, pageout_scanner() will go back to + * sleep each time it is woken by schedpaging(). + */ +uint_t dopageout = 1; /* * The page out daemon, which runs as process 2. * - * Page out occurs when either: - * a) there is less than lotsfree pages, - * b) there are one or more zones over their physical memory cap. - * - * The daemon treats physical memory as a circular array of pages and scans the - * pages using a 'two-handed clock' algorithm. The front hand moves through - * the pages, clearing the reference bit. The back hand travels a distance - * (handspreadpages) behind the front hand, freeing the pages that have not - * been referenced in the time since the front hand passed. If modified, they - * are first written to their backing store before being freed. - * - * In order to make page invalidation more responsive on machines with larger - * memory, multiple pageout_scanner threads may be created. In this case, the - * threads are evenly distributed around the the memory "clock face" so that - * memory can be reclaimed more quickly (that is, there can be large regions in - * which no pages can be reclaimed by a single thread, leading to lag which - * causes undesirable behavior such as htable stealing). + * The daemon treats physical memory as a circular array of pages and scans + * the pages using a 'two-handed clock' algorithm. The front hand moves + * through the pages, clearing the reference bit.
The back hand travels a + * distance (handspreadpages) behind the front hand, freeing the pages that + * have not been referenced in the time since the front hand passed. If + * modified, they are first written to their backing store before being + * freed. * - * As long as there are at least lotsfree pages, or no zones over their cap, - * then pageout_scanner threads are not run. When pageout_scanner threads are - * running for case (a), all pages are considered for pageout. For case (b), - * only pages belonging to a zone over its cap will be considered for pageout. + * In order to make page invalidation more responsive on machines with + * larger memory, multiple pageout_scanner threads may be created. In this + * case, each thread is given a segment of the memory "clock face" so that + * memory can be reclaimed more quickly. * - * There are multiple threads that act on behalf of the pageout process. - * A set of threads scan pages (pageout_scanner) and frees them up if - * they don't require any VOP_PUTPAGE operation. If a page must be - * written back to its backing store, the request is put on a list - * and the other (pageout) thread is signaled. The pageout thread - * grabs VOP_PUTPAGE requests from the list, and processes them. - * Some filesystems may require resources for the VOP_PUTPAGE - * operations (like memory) and hence can block the pageout - * thread, but the pageout_scanner threads can still operate. There is still - * no guarantee that memory deadlocks cannot occur. + * As long as there are at least lotsfree pages, or no zones over their + * cap, then pageout_scanner threads are not run. When pageout_scanner + * threads are running for case (a), all pages are considered for pageout. + * For case (b), only pages belonging to a zone over its cap will be + * considered for pageout. * - * The pageout_scanner parameters are determined in schedpaging(). + * There are multiple threads that act on behalf of the pageout process. A + * set of threads scan pages (pageout_scanner) and frees them up if they + * don't require any VOP_PUTPAGE operation. If a page must be written back + * to its backing store, the request is put on a list and the other + * (pageout) thread is signaled. The pageout thread grabs VOP_PUTPAGE + * requests from the list, and processes them. Some filesystems may require + * resources for the VOP_PUTPAGE operations (like memory) and hence can + * block the pageout thread, but the scanner thread can still operate. + * There is still no guarantee that memory deadlocks cannot occur. */ void pageout() { struct async_reqs *arg; - pri_t pageout_pri; int i; pgcnt_t max_pushes; callb_cpr_t cprinfo; @@ -863,14 +1068,16 @@ pageout() kmem_zalloc(async_list_size * sizeof (struct async_reqs), KM_SLEEP); req_freelist = push_req; - for (i = 0; i < async_list_size - 1; i++) + for (i = 0; i < async_list_size - 1; i++) { push_req[i].a_next = &push_req[i + 1]; + } - pageout_pri = curthread->t_pri; + pageout_pri = curthread->t_pri - 1; - /* Create the (first) pageout scanner thread. */ - (void) lwp_kernel_create(proc_pageout, pageout_scanner, (void *) 0, - TS_RUN, pageout_pri - 1); + /* Create the first pageout scanner thread. */ + (void) lwp_kernel_create(proc_pageout, pageout_scanner, + (void *)0, /* this is instance 0, not NULL */ + TS_RUN, pageout_pri); /* * kick off pageout scheduler. @@ -888,7 +1095,7 @@ pageout() /* * Limit pushes to avoid saturating pageout devices. 
*/ - max_pushes = maxpgio / RATETOSCHEDPAGING; + max_pushes = maxpgio / SCHEDPAGING_HZ; CALLB_CPR_INIT(&cprinfo, &push_lock, callb_generic_cpr, "pageout"); for (;;) { @@ -902,9 +1109,11 @@ pageout() } push_list = arg->a_next; arg->a_next = NULL; + pageout_pushing = true; mutex_exit(&push_lock); DTRACE_PROBE(pageout__push); + if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { pushes++; @@ -914,6 +1123,8 @@ pageout() VN_RELE(arg->a_vp); mutex_enter(&push_lock); + pageout_pushing = false; + pageout_pushcount++; arg->a_next = req_freelist; /* back on freelist */ req_freelist = arg; push_list_size--; @@ -927,134 +1138,172 @@ pageout() static void pageout_scanner(void *a) { - struct page *fronthand, *backhand; - uint_t count, iter = 0; + struct page *fronthand, *backhand, *fronthandstart; + struct page *regionstart, *regionend; + uint_t laps; callb_cpr_t cprinfo; - pgcnt_t nscan_cnt, nscan_limit; + pgcnt_t nscan_cnt, tick; pgcnt_t pcount; - uint_t inst = (uint_t)(uintptr_t)a; + bool bhwrapping, fhwrapping; hrtime_t sample_start, sample_end; - clock_t pageout_lbolt; - kmutex_t pscan_mutex; + uint_t inst = (uint_t)(uintptr_t)a; VERIFY3U(inst, <, MAX_PSCAN_THREADS); - mutex_init(&pscan_mutex, NULL, MUTEX_DEFAULT, NULL); + CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); + mutex_enter(&pageout_mutex); - CALLB_CPR_INIT(&cprinfo, &pscan_mutex, callb_generic_cpr, "poscan"); - mutex_enter(&pscan_mutex); + /* + * The restart case does not attempt to point the hands at roughly + * the right point on the assumption that after one circuit things + * will have settled down, and restarts shouldn't be that often. + */ + reset_hands[inst] = B_TRUE; - min_pageout_ticks = MAX(1, - ((hz * min_percent_cpu) / 100) / RATETOSCHEDPAGING); - max_pageout_ticks = MAX(min_pageout_ticks, - ((hz * max_percent_cpu) / 100) / RATETOSCHEDPAGING); + pageouts_running++; + mutex_exit(&pageout_mutex); loop: cv_signal_pageout(); + mutex_enter(&pageout_mutex); + pageouts_running--; CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(&proc_pageout->p_cv, &pscan_mutex); - CALLB_CPR_SAFE_END(&cprinfo, &pscan_mutex); + cv_wait(&proc_pageout->p_cv, &pageout_mutex); + CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); + pageouts_running++; + mutex_exit(&pageout_mutex); - if (!dopageout) + /* + * Check if pageout has been disabled for debugging purposes. + */ + if (!dopageout) { goto loop; + } + /* + * One may reset the clock hands and scanned region for debugging + * purposes. Hands will also be reset on first thread startup, if + * the number of scanning threads (n_page_scanners) changes, or if + * memory is added to, or removed from, the system. + */ if (reset_hands[inst]) { struct page *first; - pgcnt_t offset = total_pages / n_page_scanners; reset_hands[inst] = B_FALSE; + if (inst >= n_page_scanners) { /* - * The desired number of page scanners has been - * reduced and this instance is no longer wanted. - * Exit the lwp. - */ + * The desired number of page scanners has been + * reduced and this instance is no longer wanted. + * Exit the lwp. 
+ */ VERIFY3U(inst, !=, 0); - mutex_exit(&pscan_mutex); + DTRACE_PROBE1(pageout__exit, uint_t, inst); + mutex_enter(&pageout_mutex); + pageouts_running--; + mutex_exit(&pageout_mutex); mutex_enter(&curproc->p_lock); lwp_exit(); + /* NOTREACHED */ } + first = page_first(); + /* - * The reset case repositions the hands at the proper place - * on the memory clock face to prevent creep into another - * thread's active region or when the number of threads has - * changed. - * - * Set the two clock hands to be separated by a reasonable - * amount, but no more than 360 degrees apart. - * - * If inst == 0, backhand starts at first page, otherwise - * it is (inst * offset) around the memory "clock face" so that - * we spread out each scanner instance evenly. + * Each scanner thread gets its own sector of the memory + * clock face. */ - first = page_first(); - backhand = page_nextn(first, offset * inst); - if (handspreadpages >= total_pages) { - fronthand = page_nextn(backhand, total_pages - 1); + pgcnt_t span, offset; + + span = looppages / n_page_scanners; + VERIFY3U(span, >, handspreadpages); + + offset = inst * span; + regionstart = page_nextn(first, offset); + if (inst == n_page_scanners - 1) { + /* The last instance goes up to the last page */ + regionend = page_nextn(first, looppages - 1); } else { - fronthand = page_nextn(backhand, handspreadpages); + regionend = page_nextn(regionstart, span - 1); } + + backhand = regionstart; + fronthand = page_nextn(backhand, handspreadpages); + tick = 1; + + bhwrapping = fhwrapping = B_FALSE; + + DTRACE_PROBE4(pageout__reset, uint_t, inst, + pgcnt_t, regionstart, pgcnt_t, regionend, + pgcnt_t, fronthand); } /* - * This CPU kstat is only incremented here and we're obviously on this - * CPU, so no lock. + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); - count = 0; - /* Kernel probe */ - TNF_PROBE_2(pageout_scan_start, "vm pagedaemon", /* CSTYLED */, - tnf_ulong, pages_free, freemem, tnf_ulong, pages_needed, needfree); + /* + * Keep track of the number of times we have scanned all the way around + * the loop on this wakeup. + */ + laps = 0; - pcount = 0; + /* + * Track the number of pages visited during this scan so that we can + * periodically measure our duty cycle. + */ nscan_cnt = 0; - if (PAGE_SCAN_STARTUP) { - nscan_limit = total_pages; - } else { - nscan_limit = desscan; - } + pcount = 0; + + DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan, + hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand); - DTRACE_PROBE4(pageout__start, pgcnt_t, nscan_limit, uint_t, inst, - page_t *, backhand, page_t *, fronthand); + /* + * Record the initial position of the front hand for this cycle so + * that we can detect when the hand wraps around. + */ + fronthandstart = fronthand; - pageout_lbolt = ddi_get_lbolt(); sample_start = gethrtime(); /* * Scan the appropriate number of pages for a single duty cycle. 
- * Only scan while at least one of these is true: - * 1) one or more zones is over its cap - * 2) there is not enough free memory - * 3) during page scan startup when determining sample data */ - while (nscan_cnt < nscan_limit && - (zones_over || - freemem < lotsfree + needfree || - PAGE_SCAN_STARTUP)) { - int rvfront, rvback; + while (nscan_cnt < desscan) { + checkpage_result_t rvfront, rvback; + + /* + * Only scan while at least one of these is true: + * 1) one or more zones is over its cap + * 2) there is not enough free memory + * 3) during page scan startup when determining sample data + */ + if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree && + !zones_over) { + /* + * We are not sampling and enough memory has become + * available that scanning is no longer required. + */ + DTRACE_PROBE1(pageout__memfree, uint_t, inst); + break; + } - DTRACE_PROBE2(pageout__loop, pgcnt_t, pcount, uint_t, inst); + DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount); /* - * Check to see if we have exceeded our %CPU budget - * for this wakeup, but not on every single page visited, - * just every once in a while. + * Periodically check to see if we have exceeded the CPU duty + * cycle for a single wakeup. */ if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { - clock_t pageout_cycle_ticks; + hrtime_t pageout_cycle_nsec; - pageout_cycle_ticks = ddi_get_lbolt() - pageout_lbolt; - if (pageout_cycle_ticks >= pageout_ticks) { - /* - * This is where we normally break out of the - * loop when scanning zones or sampling. - */ - if (!zones_over) { + pageout_cycle_nsec = gethrtime() - sample_start; + if (pageout_cycle_nsec >= pageout_nsec) { + if (!zones_over) atomic_inc_64(&pageout_timeouts); - } DTRACE_PROBE1(pageout__timeout, uint_t, inst); break; } @@ -1062,12 +1311,14 @@ loop: /* * If checkpage manages to add a page to the free list, - * we give ourselves another couple of trips around memory. + * we give ourselves another couple of trips around the loop. */ - if ((rvfront = checkpage(fronthand, FRONT)) == 1) - count = 0; - if ((rvback = checkpage(backhand, BACK)) == 1) - count = 0; + if ((rvfront = checkpage(fronthand, POH_FRONT)) == CKP_FREED) { + laps = 0; + } + if ((rvback = checkpage(backhand, POH_BACK)) == CKP_FREED) { + laps = 0; + } ++pcount; @@ -1080,25 +1331,35 @@ loop: /* * Don't include ineligible pages in the number scanned. */ - if (rvfront != -1 || rvback != -1) + if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) { nscan_cnt++; + } + + if (bhwrapping) { + backhand = regionstart; + bhwrapping = B_FALSE; + } else { + backhand = page_nextn(backhand, tick); + if (backhand == regionend) + bhwrapping = B_TRUE; + } - backhand = page_next(backhand); + if (fhwrapping) { + fronthand = regionstart; + fhwrapping = B_FALSE; + } else { + fronthand = page_nextn(fronthand, tick); + if (fronthand == regionend) + fhwrapping = B_TRUE; + } /* - * backhand update and wraparound check are done separately - * because lint barks when it finds an empty "if" body + * The front hand has wrapped around during this wakeup. */ - - if ((fronthand = page_next(fronthand)) == page_first()) { - DTRACE_PROBE1(pageout__wrap__front, uint_t, inst); - - /* - * Every 64 wraps we reposition our hands within our - * region to prevent creep into another thread. 
- */ - if ((++iter % pageout_reset_cnt) == 0) - reset_hands[inst] = B_TRUE; + if (fronthand == fronthandstart) { + laps++; + DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst, + uint_t, laps); /* * This CPU kstat is only incremented here and we're @@ -1107,96 +1368,134 @@ loop: CPU_STATS_ADDQ(CPU, vm, rev, 1); /* - * If scanning because the system is low on memory, * then when we wraparound memory we want to try to * reclaim more pages. * If scanning only because zones are over their cap, * then wrapping is common and we simply keep going. - */ - if (freemem < lotsfree + needfree && ++count > 1) { + */ + if (laps > 1 && freemem < lotsfree + needfree) { /* - * The system is low on memory. * Extremely unlikely, but it happens. - * We went around memory at least once - * and didn't reclaim enough. + * We went around the loop at least once + * and didn't get far enough. * If we are still skipping `highly shared' * pages, skip fewer of them. Otherwise, * give up till the next clock tick. */ - mutex_enter(&pageout_mutex); if (po_share < MAX_PO_SHARE) { po_share <<= 1; - mutex_exit(&pageout_mutex); } else { - /* - * Really a "goto loop", but if someone - * is tracing or TNF_PROBE_ing, hit - * those probes first. - */ - mutex_exit(&pageout_mutex); break; } } } } - atomic_add_long(&nscan, nscan_cnt); - sample_end = gethrtime(); + atomic_add_long(&nscan, nscan_cnt); - DTRACE_PROBE3(pageout__loop__end, pgcnt_t, nscan_cnt, pgcnt_t, pcount, - uint_t, inst); - - /* Kernel probe */ - TNF_PROBE_2(pageout_scan_end, "vm pagedaemon", /* CSTYLED */, - tnf_ulong, pages_scanned, nscan_cnt, tnf_ulong, pages_free, - freemem); + DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps, + pgcnt_t, nscan_cnt, pgcnt_t, pcount) /* - * The following two blocks are only relevant when the scanner is - * first started up. After the scanner runs for a while, neither of - * the conditions will ever be true again. - * * The global variables used below are only modified by this thread and * only during initial scanning when there is a single page scanner - * thread running. Thus, we don't use any locking. + * thread running. */ - if (PAGE_SCAN_STARTUP) { + if (pageout_new_spread == 0) { VERIFY3U(inst, ==, 0); - pageout_sample_pages += pcount; - pageout_sample_etime += sample_end - sample_start; - ++pageout_sample_cnt; - } else if (pageout_new_spread == 0) { - uint_t i; + if (PAGE_SCAN_STARTUP) { + /* + * Continue accumulating samples until we have enough + * to get a reasonable value for average scan rate. + */ + pageout_sample_pages += pcount; + pageout_sample_etime += sample_end - sample_start; + ++pageout_sample_cnt; + } + if (!PAGE_SCAN_STARTUP) { + /* + * We have enough samples, set the spread. + */ + pageout_rate = (hrrate_t)pageout_sample_pages * + (hrrate_t)(NANOSEC) / pageout_sample_etime; + pageout_new_spread = pageout_rate / 10; + setupclock(); + } + } + + goto loop; +} + +/* + * The pageout deadman is run once per second by clock(). + */ +void +pageout_deadman(void) +{ + if (panicstr != NULL) { /* - * We have run enough samples, set the spread. + * There is no pageout after panic. */ - VERIFY3U(inst, ==, 0); - pageout_rate = (hrrate_t)pageout_sample_pages * - (hrrate_t)(NANOSEC) / pageout_sample_etime; - pageout_new_spread = pageout_rate / 10; - setupclock(1); + return; } - goto loop; + if (pageout_deadman_seconds == 0) { + /* + * The deadman is not enabled. + */ + return; + } + + if (!pageout_pushing) { + goto reset; + } + + /* + * We are pushing a page. 
Check to see if it is the same call we saw + * last time we looked: + */ + if (pageout_pushcount != pageout_pushcount_seen) { + /* + * It is a different call from the last check, so we are not + * stuck. + */ + goto reset; + } + + if (++pageout_stucktime >= pageout_deadman_seconds) { + panic("pageout_deadman: stuck pushing the same page for %d " + "seconds (freemem is %lu)", pageout_deadman_seconds, + freemem); + } + + return; + +reset: + /* + * Reset our tracking state to reflect that we are not stuck: + */ + pageout_stucktime = 0; + pageout_pushcount_seen = pageout_pushcount; } /* * Look at the page at hand. If it is locked (e.g., for physical i/o), * system (u., page table) or free, then leave it alone. Otherwise, * if we are running the front hand, turn off the page's reference bit. - * If running the back hand, check whether the page has been reclaimed. - * If not, free the page, pushing it to disk first if necessary. + * If the proc is over maxrss, we take it. If running the back hand, + * check whether the page has been reclaimed. If not, free the page, + * pushing it to disk first if necessary. * * Return values: - * -1 if the page is not a candidate at all, - * 0 if not freed, or - * 1 if we freed it. + * CKP_INELIGIBLE if the page is not a candidate at all, + * CKP_NOT_FREED if the page was not freed, or + * CKP_FREED if we freed it. */ -static int -checkpage(struct page *pp, int whichhand) +static checkpage_result_t +checkpage(struct page *pp, pageout_hand_t whichhand) { int ppattr; int isfs = 0; @@ -1206,7 +1505,7 @@ checkpage(struct page *pp, int whichhand) /* * Skip pages: - * - associated with the kernel vnode since + * - associated with the kernel vnode since * they are always "exclusively" locked. * - that are free * - that are shared more than po_share'd times @@ -1218,21 +1517,21 @@ checkpage(struct page *pp, int whichhand) if (PP_ISKAS(pp) || PAGE_LOCKED(pp) || PP_ISFREE(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || hat_page_checkshare(pp, po_share)) { - return (-1); + return (CKP_INELIGIBLE); } if (!page_trylock(pp, SE_EXCL)) { /* * Skip the page if we can't acquire the "exclusive" lock. */ - return (-1); + return (CKP_INELIGIBLE); } else if (PP_ISFREE(pp)) { /* * It became free between the above check and our actually - * locking the page. Oh, well there will be other pages. + * locking the page. Oh well, there will be other pages. */ page_unlock(pp); - return (-1); + return (CKP_INELIGIBLE); } /* @@ -1242,7 +1541,7 @@ checkpage(struct page *pp, int whichhand) */ if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); - return (-1); + return (CKP_INELIGIBLE); } if (zones_over) { @@ -1251,11 +1550,11 @@ checkpage(struct page *pp, int whichhand) if (pp->p_zoneid == ALL_ZONES || zone_pdata[pp->p_zoneid].zpers_over == 0) { /* - * Cross-zone shared page, or zone not over it's cap. - * Leave the page alone. - */ + * Cross-zone shared page, or zone not over it's cap. + * Leave the page alone. + */ page_unlock(pp); - return (-1); + return (CKP_INELIGIBLE); } zid = pp->p_zoneid; } @@ -1263,7 +1562,6 @@ checkpage(struct page *pp, int whichhand) /* * Maintain statistics for what we are freeing */ - if (pp->p_vnode != NULL) { if (pp->p_vnode->v_flag & VVMEXEC) isexec = 1; @@ -1277,34 +1575,44 @@ checkpage(struct page *pp, int whichhand) * The back hand examines the REF bit and always considers * SHARED pages as referenced. 
*/ - if (whichhand == FRONT) + if (whichhand == POH_FRONT) { pagesync_flag = HAT_SYNC_ZERORM; - else + } else { pagesync_flag = HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_REF | HAT_SYNC_STOPON_SHARED; + } ppattr = hat_pagesync(pp, pagesync_flag); recheck: /* - * If page is referenced; fronthand makes unreferenced and reclaimable. - * For the backhand, a process referenced the page since the front hand - * went by, so it's not a candidate for freeing up. + * If page is referenced; make unreferenced but reclaimable. + * If this page is not referenced, then it must be reclaimable + * and we can add it to the free list. */ if (ppattr & P_REF) { - DTRACE_PROBE2(pageout__isref, page_t *, pp, int, whichhand); - if (whichhand == FRONT) { + DTRACE_PROBE2(pageout__isref, page_t *, pp, + pageout_hand_t, whichhand); + + if (whichhand == POH_FRONT) { + /* + * Checking of rss or madvise flags needed here... + * + * If not "well-behaved", fall through into the code + * for not referenced. + */ hat_clrref(pp); } + + /* + * Somebody referenced the page since the front + * hand went by, so it's not a candidate for + * freeing up. + */ page_unlock(pp); - return (0); + return (CKP_NOT_FREED); } - /* - * This page is not referenced, so it must be reclaimable and we can - * add it to the free list. This can be done by either hand. - */ - VM_STAT_ADD(pageoutvmstats.checkpage[0]); /* @@ -1315,31 +1623,32 @@ recheck: if (!page_try_demote_pages(pp)) { VM_STAT_ADD(pageoutvmstats.checkpage[1]); page_unlock(pp); - return (-1); + return (CKP_INELIGIBLE); } + ASSERT(pp->p_szc == 0); VM_STAT_ADD(pageoutvmstats.checkpage[2]); + /* - * since page_try_demote_pages() could have unloaded some + * Since page_try_demote_pages() could have unloaded some * mappings it makes sense to reload ppattr. */ ppattr = hat_page_getattr(pp, P_MOD | P_REF); } /* - * If the page is currently dirty, we have to arrange - * to have it cleaned before it can be freed. + * If the page is currently dirty, we have to arrange to have it + * cleaned before it can be freed. * * XXX - ASSERT(pp->p_vnode != NULL); */ - if ((ppattr & P_MOD) && pp->p_vnode) { + if ((ppattr & P_MOD) && pp->p_vnode != NULL) { struct vnode *vp = pp->p_vnode; u_offset_t offset = pp->p_offset; /* - * Note: There is no possibility to test for process being - * swapped out or about to exit since we can't get back to - * process(es) from the page. + * XXX - Test for process being swapped out or about to exit? + * [Can't get back to process(es) using the page.] */ /* @@ -1351,34 +1660,33 @@ recheck: page_unlock(pp); /* - * Queue i/o request for the pageout thread. + * Queue I/O request for the pageout thread. */ if (!queue_io_request(vp, offset)) { VN_RELE(vp); - return (0); + return (CKP_NOT_FREED); } if (isfs) { zone_pageout_stat(zid, ZPO_DIRTY); } else { zone_pageout_stat(zid, ZPO_ANONDIRTY); } - return (1); + return (CKP_FREED); } /* - * Now we unload all the translations, - * and put the page back on to the free list. - * If the page was used (referenced or modified) after - * the pagesync but before it was unloaded we catch it - * and handle the page properly. + * Now we unload all the translations and put the page back on to the + * free list. If the page was used (referenced or modified) after the + * pagesync but before it was unloaded we catch it and handle the page + * properly. 
*/ - DTRACE_PROBE2(pageout__free, page_t *, pp, int, whichhand); + DTRACE_PROBE2(pageout__free, page_t *, pp, pageout_hand_t, whichhand); (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); ppattr = hat_page_getattr(pp, P_MOD | P_REF); - if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode)) + if ((ppattr & P_REF) || ((ppattr & P_MOD) && pp->p_vnode != NULL)) { goto recheck; + } - /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_FREE, 0, kcred); CPU_STATS_ADD_K(vm, dfree, 1); @@ -1395,7 +1703,7 @@ recheck: zone_pageout_stat(zid, ZPO_ANON); } - return (1); /* freed a page! */ + return (CKP_FREED); } /* diff --git a/usr/src/uts/common/os/watchpoint.c b/usr/src/uts/common/os/watchpoint.c index eee612ef93..24db9637d4 100644 --- a/usr/src/uts/common/os/watchpoint.c +++ b/usr/src/uts/common/os/watchpoint.c @@ -821,7 +821,6 @@ watch_xcopyin(const void *uaddr, void *kaddr, size_t count) count -= part; } -error: /* if we hit a watched address, do the watchpoint logic */ if (watchcode && (!sys_watchpoint(vaddr, watchcode, ta) || |