author | Richard Lowe <richlowe@richlowe.net> | 2021-06-04 15:15:12 -0500
---|---|---
committer | Richard Lowe <richlowe@richlowe.net> | 2021-08-16 12:46:39 -0500
commit | f0089e391b2bc4be2755f1a1b51fb4cd9b8f3988 (patch)
tree | c4ac2f5e703ed459d50bcee7ddb38a993d961520 /usr/src/uts/intel/ml
parent | d083fed0c91296a88878f7a468910ad5b5c888ea (diff)
download | illumos-gate-f0089e391b2bc4be2755f1a1b51fb4cd9b8f3988.tar.gz
13941 intel code and headers should not look ia32 specific
Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Patrick Mooney <pmooney@pfmooney.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src/uts/intel/ml')
-rw-r--r-- | usr/src/uts/intel/ml/copy.s | 1908
-rw-r--r-- | usr/src/uts/intel/ml/ddi_i86_asm.s | 522
-rw-r--r-- | usr/src/uts/intel/ml/desctbls_asm.s | 118
-rw-r--r-- | usr/src/uts/intel/ml/exception.s | 917
-rw-r--r-- | usr/src/uts/intel/ml/float.s | 347
-rw-r--r-- | usr/src/uts/intel/ml/hypersubr.s | 164
-rw-r--r-- | usr/src/uts/intel/ml/i86_subr.s | 1629
-rw-r--r-- | usr/src/uts/intel/ml/lock_prim.s | 714
-rw-r--r-- | usr/src/uts/intel/ml/modstubs.s | 1320
-rw-r--r-- | usr/src/uts/intel/ml/ovbcopy.s | 92
-rw-r--r-- | usr/src/uts/intel/ml/retpoline.s | 211
-rw-r--r-- | usr/src/uts/intel/ml/sseblk.s | 280
-rw-r--r-- | usr/src/uts/intel/ml/swtch.s | 509
13 files changed, 8731 insertions, 0 deletions
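
Before the diff itself, one note on the mechanism described in the copy.s header comment (first file below): smap_enable() and smap_disable() are not real functions — calls to them generate runtime relocations that krtld's hotinlines pass (hotinline_smap(), per that comment) turns into the needed clac/stac — and inline users instead drop SMAP_ENABLE_INSTR()/SMAP_DISABLE_INSTR() sites. Each site is a named three-byte window of nops, the same size as the stac (0x0f 0x01 0xcb) and clac (0x0f 0x01 0xca) instructions that replace it. The sketch below shows what one such pair expands to and what the boot-time patcher presumably writes over it; this is editorial illustration only, not part of the commit.

```asm
	/*
	 * Sketch only: the expansion of SMAP_DISABLE_INSTR(0) and
	 * SMAP_ENABLE_INSTR(0) from copy.s, and what the hotinlines
	 * patcher is expected to turn each window into at boot.
	 */
	.globl	_smap_disable_patch_0
_smap_disable_patch_0:
	nop; nop; nop	/* patched to: stac -- set RFLAGS.AC, allow kernel access to user pages */

	.globl	_smap_enable_patch_0
_smap_enable_patch_0:
	nop; nop; nop	/* patched to: clac -- clear RFLAGS.AC, SMAP protection back in force */
```

Because the replacement instruction fills the window exactly, no surrounding code moves when the patch is applied; the _smap_enable_patch_count and _smap_disable_patch_count objects emitted at the end of copy.s record how many such sites exist, which is why the file insists that SMAP_ENABLE_COUNT/SMAP_DISABLE_COUNT be updated whenever a site is added or removed.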
diff --git a/usr/src/uts/intel/ml/copy.s b/usr/src/uts/intel/ml/copy.s new file mode 100644 index 0000000000..5e5f822518 --- /dev/null +++ b/usr/src/uts/intel/ml/copy.s @@ -0,0 +1,1908 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2009, Intel Corporation + * All rights reserved. + */ + +/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ +/* All Rights Reserved */ + +/* Copyright (c) 1987, 1988 Microsoft Corporation */ +/* All Rights Reserved */ + +/* + * Copyright 2020 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/asm_linkage.h> + +#include "assym.h" + +#define KCOPY_MIN_SIZE 128 /* Must be >= 16 bytes */ +#define XCOPY_MIN_SIZE 128 /* Must be >= 16 bytes */ +/* + * Non-temopral access (NTA) alignment requirement + */ +#define NTA_ALIGN_SIZE 4 /* Must be at least 4-byte aligned */ +#define NTA_ALIGN_MASK _CONST(NTA_ALIGN_SIZE-1) +#define COUNT_ALIGN_SIZE 16 /* Must be at least 16-byte aligned */ +#define COUNT_ALIGN_MASK _CONST(COUNT_ALIGN_SIZE-1) + +/* + * With the introduction of Broadwell, Intel has introduced supervisor mode + * access protection -- SMAP. SMAP forces the kernel to set certain bits to + * enable access of user pages (AC in rflags, defines as PS_ACHK in + * <sys/psw.h>). One of the challenges is that the implementation of many of the + * userland copy routines directly use the kernel ones. For example, copyin and + * copyout simply go and jump to the do_copy_fault label and traditionally let + * those deal with the return for them. In fact, changing that is a can of frame + * pointers. + * + * Rules and Constraints: + * + * 1. For anything that's not in copy.s, we have it do explicit smap_disable() + * or smap_enable() calls. This is restricted to the following three places: + * DTrace, resume() in swtch.s and on_fault/no_fault. If you want to add it + * somewhere else, we should be thinking twice. + * + * 2. We try to toggle this at the smallest window possible. This means that if + * we take a fault, need to try to use a copyop in copyin() or copyout(), or any + * other function, we will always leave with SMAP enabled (the kernel cannot + * access user pages). + * + * 3. None of the *_noerr() or ucopy/uzero routines should toggle SMAP. They are + * explicitly only allowed to be called while in an on_fault()/no_fault() handler, + * which already takes care of ensuring that SMAP is enabled and disabled. Note + * this means that when under an on_fault()/no_fault() handler, one must not + * call the non-*_noerr() routines. + * + * 4. 
The first thing we should do after coming out of an lofault handler is to + * make sure that we call smap_enable() again to ensure that we are safely + * protected, as more often than not, we will have disabled smap to get there. + * + * 5. smap_enable() and smap_disable() don't exist: calls to these functions + * generate runtime relocations, that are then processed into the necessary + * clac/stac, via the krtld hotinlines mechanism and hotinline_smap(). + * + * 6. For any inline user of SMAP, the appropriate SMAP_ENABLE_INSTR and + * SMAP_DISABLE_INSTR macro should be used. If the number of these is changed, + * you must update the constants SMAP_ENABLE_COUNT and SMAP_DISABLE_COUNT below. + * + * 7. Generally this .s file is processed by a K&R style cpp. This means that it + * really has a lot of feelings about whitespace. In particular, if you have a + * macro FOO with the arguments FOO(1, 3), the second argument is in fact ' 3'. + * + * 8. In general, the kernel has its own value for rflags that gets used. This + * is maintained in a few different places which vary based on how the thread + * comes into existence and whether it's a user thread. In general, when the + * kernel takes a trap, it always will set ourselves to a known set of flags, + * mainly as part of ENABLE_INTR_FLAGS and F_OFF and F_ON. These ensure that + * PS_ACHK is cleared for us. In addition, when using the sysenter instruction, + * we mask off PS_ACHK off via the AMD_SFMASK MSR. See init_cpu_syscall() for + * where that gets masked off. + */ + +/* + * The optimal 64-bit bcopy and kcopy for modern x86 processors uses + * "rep smovq" for large sizes. Performance data shows that many calls to + * bcopy/kcopy/bzero/kzero operate on small buffers. For best performance for + * these small sizes unrolled code is used. For medium sizes loops writing + * 64-bytes per loop are used. Transition points were determined experimentally. + */ +#define BZERO_USE_REP (1024) +#define BCOPY_DFLT_REP (128) +#define BCOPY_NHM_REP (768) + +/* + * Copy a block of storage, returning an error code if `from' or + * `to' takes a kernel pagefault which cannot be resolved. + * Returns errno value on pagefault error, 0 if all ok + */ + +/* + * I'm sorry about these macros, but copy.s is unsurprisingly sensitive to + * additional call instructions. + */ +#define SMAP_DISABLE_COUNT 16 +#define SMAP_ENABLE_COUNT 26 + +#define SMAP_DISABLE_INSTR(ITER) \ + .globl _smap_disable_patch_/**/ITER; \ + _smap_disable_patch_/**/ITER/**/:; \ + nop; nop; nop; + +#define SMAP_ENABLE_INSTR(ITER) \ + .globl _smap_enable_patch_/**/ITER; \ + _smap_enable_patch_/**/ITER/**/:; \ + nop; nop; nop; + + .globl kernelbase + .globl postbootkernelbase + + ENTRY(kcopy) + pushq %rbp + movq %rsp, %rbp +#ifdef DEBUG + cmpq postbootkernelbase(%rip), %rdi /* %rdi = from */ + jb 0f + cmpq postbootkernelbase(%rip), %rsi /* %rsi = to */ + jnb 1f +0: leaq .kcopy_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +1: +#endif + /* + * pass lofault value as 4th argument to do_copy_fault + */ + leaq _kcopy_copyerr(%rip), %rcx + movq %gs:CPU_THREAD, %r9 /* %r9 = thread addr */ + +do_copy_fault: + movq T_LOFAULT(%r9), %r11 /* save the current lofault */ + movq %rcx, T_LOFAULT(%r9) /* new lofault */ + call bcopy_altentry + xorl %eax, %eax /* return 0 (success) */ + SMAP_ENABLE_INSTR(0) + + /* + * A fault during do_copy_fault is indicated through an errno value + * in %rax and we iretq from the trap handler to here. 
+ */ +_kcopy_copyerr: + movq %r11, T_LOFAULT(%r9) /* restore original lofault */ + leave + ret + SET_SIZE(kcopy) + +#undef ARG_FROM +#undef ARG_TO +#undef ARG_COUNT + +#define COPY_LOOP_INIT(src, dst, cnt) \ + addq cnt, src; \ + addq cnt, dst; \ + shrq $3, cnt; \ + neg cnt + + /* Copy 16 bytes per loop. Uses %rax and %r8 */ +#define COPY_LOOP_BODY(src, dst, cnt) \ + prefetchnta 0x100(src, cnt, 8); \ + movq (src, cnt, 8), %rax; \ + movq 0x8(src, cnt, 8), %r8; \ + movnti %rax, (dst, cnt, 8); \ + movnti %r8, 0x8(dst, cnt, 8); \ + addq $2, cnt + + ENTRY(kcopy_nta) + pushq %rbp + movq %rsp, %rbp +#ifdef DEBUG + cmpq postbootkernelbase(%rip), %rdi /* %rdi = from */ + jb 0f + cmpq postbootkernelbase(%rip), %rsi /* %rsi = to */ + jnb 1f +0: leaq .kcopy_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +1: +#endif + + movq %gs:CPU_THREAD, %r9 + cmpq $0, %rcx /* No non-temporal access? */ + /* + * pass lofault value as 4th argument to do_copy_fault + */ + leaq _kcopy_nta_copyerr(%rip), %rcx /* doesn't set rflags */ + jnz do_copy_fault /* use regular access */ + /* + * Make sure cnt is >= KCOPY_MIN_SIZE + */ + cmpq $KCOPY_MIN_SIZE, %rdx + jb do_copy_fault + + /* + * Make sure src and dst are NTA_ALIGN_SIZE aligned, + * count is COUNT_ALIGN_SIZE aligned. + */ + movq %rdi, %r10 + orq %rsi, %r10 + andq $NTA_ALIGN_MASK, %r10 + orq %rdx, %r10 + andq $COUNT_ALIGN_MASK, %r10 + jnz do_copy_fault + + ALTENTRY(do_copy_fault_nta) + movq %gs:CPU_THREAD, %r9 /* %r9 = thread addr */ + movq T_LOFAULT(%r9), %r11 /* save the current lofault */ + movq %rcx, T_LOFAULT(%r9) /* new lofault */ + + /* + * COPY_LOOP_BODY uses %rax and %r8 + */ + COPY_LOOP_INIT(%rdi, %rsi, %rdx) +2: COPY_LOOP_BODY(%rdi, %rsi, %rdx) + jnz 2b + + mfence + xorl %eax, %eax /* return 0 (success) */ + SMAP_ENABLE_INSTR(1) + +_kcopy_nta_copyerr: + movq %r11, T_LOFAULT(%r9) /* restore original lofault */ + leave + ret + SET_SIZE(do_copy_fault_nta) + SET_SIZE(kcopy_nta) + + ENTRY(bcopy) +#ifdef DEBUG + orq %rdx, %rdx /* %rdx = count */ + jz 1f + cmpq postbootkernelbase(%rip), %rdi /* %rdi = from */ + jb 0f + cmpq postbootkernelbase(%rip), %rsi /* %rsi = to */ + jnb 1f +0: leaq .bcopy_panic_msg(%rip), %rdi + jmp call_panic /* setup stack and call panic */ +1: +#endif + /* + * bcopy_altentry() is called from kcopy, i.e., do_copy_fault. + * kcopy assumes that bcopy doesn't touch %r9 and %r11. If bcopy + * uses these registers in future they must be saved and restored. + */ + ALTENTRY(bcopy_altentry) +do_copy: +#define L(s) .bcopy/**/s + cmpq $0x50, %rdx /* 80 */ + jae bcopy_ck_size + + /* + * Performance data shows many caller's copy small buffers. So for + * best perf for these sizes unrolled code is used. Store data without + * worrying about alignment. 
+ */ + leaq L(fwdPxQx)(%rip), %r10 + addq %rdx, %rdi + addq %rdx, %rsi + movslq (%r10,%rdx,4), %rcx + leaq (%rcx,%r10,1), %r10 + INDIRECT_JMP_REG(r10) + + .p2align 4 +L(fwdPxQx): + .int L(P0Q0)-L(fwdPxQx) /* 0 */ + .int L(P1Q0)-L(fwdPxQx) + .int L(P2Q0)-L(fwdPxQx) + .int L(P3Q0)-L(fwdPxQx) + .int L(P4Q0)-L(fwdPxQx) + .int L(P5Q0)-L(fwdPxQx) + .int L(P6Q0)-L(fwdPxQx) + .int L(P7Q0)-L(fwdPxQx) + + .int L(P0Q1)-L(fwdPxQx) /* 8 */ + .int L(P1Q1)-L(fwdPxQx) + .int L(P2Q1)-L(fwdPxQx) + .int L(P3Q1)-L(fwdPxQx) + .int L(P4Q1)-L(fwdPxQx) + .int L(P5Q1)-L(fwdPxQx) + .int L(P6Q1)-L(fwdPxQx) + .int L(P7Q1)-L(fwdPxQx) + + .int L(P0Q2)-L(fwdPxQx) /* 16 */ + .int L(P1Q2)-L(fwdPxQx) + .int L(P2Q2)-L(fwdPxQx) + .int L(P3Q2)-L(fwdPxQx) + .int L(P4Q2)-L(fwdPxQx) + .int L(P5Q2)-L(fwdPxQx) + .int L(P6Q2)-L(fwdPxQx) + .int L(P7Q2)-L(fwdPxQx) + + .int L(P0Q3)-L(fwdPxQx) /* 24 */ + .int L(P1Q3)-L(fwdPxQx) + .int L(P2Q3)-L(fwdPxQx) + .int L(P3Q3)-L(fwdPxQx) + .int L(P4Q3)-L(fwdPxQx) + .int L(P5Q3)-L(fwdPxQx) + .int L(P6Q3)-L(fwdPxQx) + .int L(P7Q3)-L(fwdPxQx) + + .int L(P0Q4)-L(fwdPxQx) /* 32 */ + .int L(P1Q4)-L(fwdPxQx) + .int L(P2Q4)-L(fwdPxQx) + .int L(P3Q4)-L(fwdPxQx) + .int L(P4Q4)-L(fwdPxQx) + .int L(P5Q4)-L(fwdPxQx) + .int L(P6Q4)-L(fwdPxQx) + .int L(P7Q4)-L(fwdPxQx) + + .int L(P0Q5)-L(fwdPxQx) /* 40 */ + .int L(P1Q5)-L(fwdPxQx) + .int L(P2Q5)-L(fwdPxQx) + .int L(P3Q5)-L(fwdPxQx) + .int L(P4Q5)-L(fwdPxQx) + .int L(P5Q5)-L(fwdPxQx) + .int L(P6Q5)-L(fwdPxQx) + .int L(P7Q5)-L(fwdPxQx) + + .int L(P0Q6)-L(fwdPxQx) /* 48 */ + .int L(P1Q6)-L(fwdPxQx) + .int L(P2Q6)-L(fwdPxQx) + .int L(P3Q6)-L(fwdPxQx) + .int L(P4Q6)-L(fwdPxQx) + .int L(P5Q6)-L(fwdPxQx) + .int L(P6Q6)-L(fwdPxQx) + .int L(P7Q6)-L(fwdPxQx) + + .int L(P0Q7)-L(fwdPxQx) /* 56 */ + .int L(P1Q7)-L(fwdPxQx) + .int L(P2Q7)-L(fwdPxQx) + .int L(P3Q7)-L(fwdPxQx) + .int L(P4Q7)-L(fwdPxQx) + .int L(P5Q7)-L(fwdPxQx) + .int L(P6Q7)-L(fwdPxQx) + .int L(P7Q7)-L(fwdPxQx) + + .int L(P0Q8)-L(fwdPxQx) /* 64 */ + .int L(P1Q8)-L(fwdPxQx) + .int L(P2Q8)-L(fwdPxQx) + .int L(P3Q8)-L(fwdPxQx) + .int L(P4Q8)-L(fwdPxQx) + .int L(P5Q8)-L(fwdPxQx) + .int L(P6Q8)-L(fwdPxQx) + .int L(P7Q8)-L(fwdPxQx) + + .int L(P0Q9)-L(fwdPxQx) /* 72 */ + .int L(P1Q9)-L(fwdPxQx) + .int L(P2Q9)-L(fwdPxQx) + .int L(P3Q9)-L(fwdPxQx) + .int L(P4Q9)-L(fwdPxQx) + .int L(P5Q9)-L(fwdPxQx) + .int L(P6Q9)-L(fwdPxQx) + .int L(P7Q9)-L(fwdPxQx) /* 79 */ + + .p2align 4 +L(P0Q9): + mov -0x48(%rdi), %rcx + mov %rcx, -0x48(%rsi) +L(P0Q8): + mov -0x40(%rdi), %r10 + mov %r10, -0x40(%rsi) +L(P0Q7): + mov -0x38(%rdi), %r8 + mov %r8, -0x38(%rsi) +L(P0Q6): + mov -0x30(%rdi), %rcx + mov %rcx, -0x30(%rsi) +L(P0Q5): + mov -0x28(%rdi), %r10 + mov %r10, -0x28(%rsi) +L(P0Q4): + mov -0x20(%rdi), %r8 + mov %r8, -0x20(%rsi) +L(P0Q3): + mov -0x18(%rdi), %rcx + mov %rcx, -0x18(%rsi) +L(P0Q2): + mov -0x10(%rdi), %r10 + mov %r10, -0x10(%rsi) +L(P0Q1): + mov -0x8(%rdi), %r8 + mov %r8, -0x8(%rsi) +L(P0Q0): + ret + + .p2align 4 +L(P1Q9): + mov -0x49(%rdi), %r8 + mov %r8, -0x49(%rsi) +L(P1Q8): + mov -0x41(%rdi), %rcx + mov %rcx, -0x41(%rsi) +L(P1Q7): + mov -0x39(%rdi), %r10 + mov %r10, -0x39(%rsi) +L(P1Q6): + mov -0x31(%rdi), %r8 + mov %r8, -0x31(%rsi) +L(P1Q5): + mov -0x29(%rdi), %rcx + mov %rcx, -0x29(%rsi) +L(P1Q4): + mov -0x21(%rdi), %r10 + mov %r10, -0x21(%rsi) +L(P1Q3): + mov -0x19(%rdi), %r8 + mov %r8, -0x19(%rsi) +L(P1Q2): + mov -0x11(%rdi), %rcx + mov %rcx, -0x11(%rsi) +L(P1Q1): + mov -0x9(%rdi), %r10 + mov %r10, -0x9(%rsi) +L(P1Q0): + movzbq -0x1(%rdi), %r8 + mov %r8b, -0x1(%rsi) + ret + + .p2align 4 +L(P2Q9): + mov 
-0x4a(%rdi), %r8 + mov %r8, -0x4a(%rsi) +L(P2Q8): + mov -0x42(%rdi), %rcx + mov %rcx, -0x42(%rsi) +L(P2Q7): + mov -0x3a(%rdi), %r10 + mov %r10, -0x3a(%rsi) +L(P2Q6): + mov -0x32(%rdi), %r8 + mov %r8, -0x32(%rsi) +L(P2Q5): + mov -0x2a(%rdi), %rcx + mov %rcx, -0x2a(%rsi) +L(P2Q4): + mov -0x22(%rdi), %r10 + mov %r10, -0x22(%rsi) +L(P2Q3): + mov -0x1a(%rdi), %r8 + mov %r8, -0x1a(%rsi) +L(P2Q2): + mov -0x12(%rdi), %rcx + mov %rcx, -0x12(%rsi) +L(P2Q1): + mov -0xa(%rdi), %r10 + mov %r10, -0xa(%rsi) +L(P2Q0): + movzwq -0x2(%rdi), %r8 + mov %r8w, -0x2(%rsi) + ret + + .p2align 4 +L(P3Q9): + mov -0x4b(%rdi), %r8 + mov %r8, -0x4b(%rsi) +L(P3Q8): + mov -0x43(%rdi), %rcx + mov %rcx, -0x43(%rsi) +L(P3Q7): + mov -0x3b(%rdi), %r10 + mov %r10, -0x3b(%rsi) +L(P3Q6): + mov -0x33(%rdi), %r8 + mov %r8, -0x33(%rsi) +L(P3Q5): + mov -0x2b(%rdi), %rcx + mov %rcx, -0x2b(%rsi) +L(P3Q4): + mov -0x23(%rdi), %r10 + mov %r10, -0x23(%rsi) +L(P3Q3): + mov -0x1b(%rdi), %r8 + mov %r8, -0x1b(%rsi) +L(P3Q2): + mov -0x13(%rdi), %rcx + mov %rcx, -0x13(%rsi) +L(P3Q1): + mov -0xb(%rdi), %r10 + mov %r10, -0xb(%rsi) + /* + * These trailing loads/stores have to do all their loads 1st, + * then do the stores. + */ +L(P3Q0): + movzwq -0x3(%rdi), %r8 + movzbq -0x1(%rdi), %r10 + mov %r8w, -0x3(%rsi) + mov %r10b, -0x1(%rsi) + ret + + .p2align 4 +L(P4Q9): + mov -0x4c(%rdi), %r8 + mov %r8, -0x4c(%rsi) +L(P4Q8): + mov -0x44(%rdi), %rcx + mov %rcx, -0x44(%rsi) +L(P4Q7): + mov -0x3c(%rdi), %r10 + mov %r10, -0x3c(%rsi) +L(P4Q6): + mov -0x34(%rdi), %r8 + mov %r8, -0x34(%rsi) +L(P4Q5): + mov -0x2c(%rdi), %rcx + mov %rcx, -0x2c(%rsi) +L(P4Q4): + mov -0x24(%rdi), %r10 + mov %r10, -0x24(%rsi) +L(P4Q3): + mov -0x1c(%rdi), %r8 + mov %r8, -0x1c(%rsi) +L(P4Q2): + mov -0x14(%rdi), %rcx + mov %rcx, -0x14(%rsi) +L(P4Q1): + mov -0xc(%rdi), %r10 + mov %r10, -0xc(%rsi) +L(P4Q0): + mov -0x4(%rdi), %r8d + mov %r8d, -0x4(%rsi) + ret + + .p2align 4 +L(P5Q9): + mov -0x4d(%rdi), %r8 + mov %r8, -0x4d(%rsi) +L(P5Q8): + mov -0x45(%rdi), %rcx + mov %rcx, -0x45(%rsi) +L(P5Q7): + mov -0x3d(%rdi), %r10 + mov %r10, -0x3d(%rsi) +L(P5Q6): + mov -0x35(%rdi), %r8 + mov %r8, -0x35(%rsi) +L(P5Q5): + mov -0x2d(%rdi), %rcx + mov %rcx, -0x2d(%rsi) +L(P5Q4): + mov -0x25(%rdi), %r10 + mov %r10, -0x25(%rsi) +L(P5Q3): + mov -0x1d(%rdi), %r8 + mov %r8, -0x1d(%rsi) +L(P5Q2): + mov -0x15(%rdi), %rcx + mov %rcx, -0x15(%rsi) +L(P5Q1): + mov -0xd(%rdi), %r10 + mov %r10, -0xd(%rsi) +L(P5Q0): + mov -0x5(%rdi), %r8d + movzbq -0x1(%rdi), %r10 + mov %r8d, -0x5(%rsi) + mov %r10b, -0x1(%rsi) + ret + + .p2align 4 +L(P6Q9): + mov -0x4e(%rdi), %r8 + mov %r8, -0x4e(%rsi) +L(P6Q8): + mov -0x46(%rdi), %rcx + mov %rcx, -0x46(%rsi) +L(P6Q7): + mov -0x3e(%rdi), %r10 + mov %r10, -0x3e(%rsi) +L(P6Q6): + mov -0x36(%rdi), %r8 + mov %r8, -0x36(%rsi) +L(P6Q5): + mov -0x2e(%rdi), %rcx + mov %rcx, -0x2e(%rsi) +L(P6Q4): + mov -0x26(%rdi), %r10 + mov %r10, -0x26(%rsi) +L(P6Q3): + mov -0x1e(%rdi), %r8 + mov %r8, -0x1e(%rsi) +L(P6Q2): + mov -0x16(%rdi), %rcx + mov %rcx, -0x16(%rsi) +L(P6Q1): + mov -0xe(%rdi), %r10 + mov %r10, -0xe(%rsi) +L(P6Q0): + mov -0x6(%rdi), %r8d + movzwq -0x2(%rdi), %r10 + mov %r8d, -0x6(%rsi) + mov %r10w, -0x2(%rsi) + ret + + .p2align 4 +L(P7Q9): + mov -0x4f(%rdi), %r8 + mov %r8, -0x4f(%rsi) +L(P7Q8): + mov -0x47(%rdi), %rcx + mov %rcx, -0x47(%rsi) +L(P7Q7): + mov -0x3f(%rdi), %r10 + mov %r10, -0x3f(%rsi) +L(P7Q6): + mov -0x37(%rdi), %r8 + mov %r8, -0x37(%rsi) +L(P7Q5): + mov -0x2f(%rdi), %rcx + mov %rcx, -0x2f(%rsi) +L(P7Q4): + mov -0x27(%rdi), %r10 + mov %r10, -0x27(%rsi) +L(P7Q3): + mov 
-0x1f(%rdi), %r8 + mov %r8, -0x1f(%rsi) +L(P7Q2): + mov -0x17(%rdi), %rcx + mov %rcx, -0x17(%rsi) +L(P7Q1): + mov -0xf(%rdi), %r10 + mov %r10, -0xf(%rsi) +L(P7Q0): + mov -0x7(%rdi), %r8d + movzwq -0x3(%rdi), %r10 + movzbq -0x1(%rdi), %rcx + mov %r8d, -0x7(%rsi) + mov %r10w, -0x3(%rsi) + mov %cl, -0x1(%rsi) + ret + + /* + * For large sizes rep smovq is fastest. + * Transition point determined experimentally as measured on + * Intel Xeon processors (incl. Nehalem and previous generations) and + * AMD Opteron. The transition value is patched at boot time to avoid + * memory reference hit. + */ + .globl bcopy_patch_start +bcopy_patch_start: + cmpq $BCOPY_NHM_REP, %rdx + .globl bcopy_patch_end +bcopy_patch_end: + + .p2align 4 + ALTENTRY(bcopy_ck_size) + + cmpq $BCOPY_DFLT_REP, %rdx + jae L(use_rep) + + /* + * Align to a 8-byte boundary. Avoids penalties from unaligned stores + * as well as from stores spanning cachelines. + */ + test $0x7, %rsi + jz L(aligned_loop) + test $0x1, %rsi + jz 2f + movzbq (%rdi), %r8 + dec %rdx + inc %rdi + mov %r8b, (%rsi) + inc %rsi +2: + test $0x2, %rsi + jz 4f + movzwq (%rdi), %r8 + sub $0x2, %rdx + add $0x2, %rdi + mov %r8w, (%rsi) + add $0x2, %rsi +4: + test $0x4, %rsi + jz L(aligned_loop) + mov (%rdi), %r8d + sub $0x4, %rdx + add $0x4, %rdi + mov %r8d, (%rsi) + add $0x4, %rsi + + /* + * Copy 64-bytes per loop + */ + .p2align 4 +L(aligned_loop): + mov (%rdi), %r8 + mov 0x8(%rdi), %r10 + lea -0x40(%rdx), %rdx + mov %r8, (%rsi) + mov %r10, 0x8(%rsi) + mov 0x10(%rdi), %rcx + mov 0x18(%rdi), %r8 + mov %rcx, 0x10(%rsi) + mov %r8, 0x18(%rsi) + + cmp $0x40, %rdx + mov 0x20(%rdi), %r10 + mov 0x28(%rdi), %rcx + mov %r10, 0x20(%rsi) + mov %rcx, 0x28(%rsi) + mov 0x30(%rdi), %r8 + mov 0x38(%rdi), %r10 + lea 0x40(%rdi), %rdi + mov %r8, 0x30(%rsi) + mov %r10, 0x38(%rsi) + lea 0x40(%rsi), %rsi + jae L(aligned_loop) + + /* + * Copy remaining bytes (0-63) + */ +L(do_remainder): + leaq L(fwdPxQx)(%rip), %r10 + addq %rdx, %rdi + addq %rdx, %rsi + movslq (%r10,%rdx,4), %rcx + leaq (%rcx,%r10,1), %r10 + INDIRECT_JMP_REG(r10) + + /* + * Use rep smovq. Clear remainder via unrolled code + */ + .p2align 4 +L(use_rep): + xchgq %rdi, %rsi /* %rsi = source, %rdi = destination */ + movq %rdx, %rcx /* %rcx = count */ + shrq $3, %rcx /* 8-byte word count */ + rep + smovq + + xchgq %rsi, %rdi /* %rdi = src, %rsi = destination */ + andq $7, %rdx /* remainder */ + jnz L(do_remainder) + ret +#undef L + SET_SIZE(bcopy_ck_size) + +#ifdef DEBUG + /* + * Setup frame on the run-time stack. The end of the input argument + * area must be aligned on a 16 byte boundary. The stack pointer %rsp, + * always points to the end of the latest allocated stack frame. + * panic(const char *format, ...) is a varargs function. When a + * function taking variable arguments is called, %rax must be set + * to eight times the number of floating point parameters passed + * to the function in SSE registers. + */ +call_panic: + pushq %rbp /* align stack properly */ + movq %rsp, %rbp + xorl %eax, %eax /* no variable arguments */ + call panic /* %rdi = format string */ +#endif + SET_SIZE(bcopy_altentry) + SET_SIZE(bcopy) + + +/* + * Zero a block of storage, returning an error code if we + * take a kernel pagefault which cannot be resolved. 
+ * Returns errno value on pagefault error, 0 if all ok + */ + + ENTRY(kzero) +#ifdef DEBUG + cmpq postbootkernelbase(%rip), %rdi /* %rdi = addr */ + jnb 0f + leaq .kzero_panic_msg(%rip), %rdi + jmp call_panic /* setup stack and call panic */ +0: +#endif + /* + * pass lofault value as 3rd argument for fault return + */ + leaq _kzeroerr(%rip), %rdx + + movq %gs:CPU_THREAD, %r9 /* %r9 = thread addr */ + movq T_LOFAULT(%r9), %r11 /* save the current lofault */ + movq %rdx, T_LOFAULT(%r9) /* new lofault */ + call bzero_altentry + xorl %eax, %eax + movq %r11, T_LOFAULT(%r9) /* restore the original lofault */ + ret + /* + * A fault during bzero is indicated through an errno value + * in %rax when we iretq to here. + */ +_kzeroerr: + addq $8, %rsp /* pop bzero_altentry call ret addr */ + movq %r11, T_LOFAULT(%r9) /* restore the original lofault */ + ret + SET_SIZE(kzero) + +/* + * Zero a block of storage. + */ + + ENTRY(bzero) +#ifdef DEBUG + cmpq postbootkernelbase(%rip), %rdi /* %rdi = addr */ + jnb 0f + leaq .bzero_panic_msg(%rip), %rdi + jmp call_panic /* setup stack and call panic */ +0: +#endif + ALTENTRY(bzero_altentry) +do_zero: +#define L(s) .bzero/**/s + xorl %eax, %eax + + cmpq $0x50, %rsi /* 80 */ + jae L(ck_align) + + /* + * Performance data shows many caller's are zeroing small buffers. So + * for best perf for these sizes unrolled code is used. Store zeros + * without worrying about alignment. + */ + leaq L(setPxQx)(%rip), %r10 + addq %rsi, %rdi + movslq (%r10,%rsi,4), %rcx + leaq (%rcx,%r10,1), %r10 + INDIRECT_JMP_REG(r10) + + .p2align 4 +L(setPxQx): + .int L(P0Q0)-L(setPxQx) /* 0 */ + .int L(P1Q0)-L(setPxQx) + .int L(P2Q0)-L(setPxQx) + .int L(P3Q0)-L(setPxQx) + .int L(P4Q0)-L(setPxQx) + .int L(P5Q0)-L(setPxQx) + .int L(P6Q0)-L(setPxQx) + .int L(P7Q0)-L(setPxQx) + + .int L(P0Q1)-L(setPxQx) /* 8 */ + .int L(P1Q1)-L(setPxQx) + .int L(P2Q1)-L(setPxQx) + .int L(P3Q1)-L(setPxQx) + .int L(P4Q1)-L(setPxQx) + .int L(P5Q1)-L(setPxQx) + .int L(P6Q1)-L(setPxQx) + .int L(P7Q1)-L(setPxQx) + + .int L(P0Q2)-L(setPxQx) /* 16 */ + .int L(P1Q2)-L(setPxQx) + .int L(P2Q2)-L(setPxQx) + .int L(P3Q2)-L(setPxQx) + .int L(P4Q2)-L(setPxQx) + .int L(P5Q2)-L(setPxQx) + .int L(P6Q2)-L(setPxQx) + .int L(P7Q2)-L(setPxQx) + + .int L(P0Q3)-L(setPxQx) /* 24 */ + .int L(P1Q3)-L(setPxQx) + .int L(P2Q3)-L(setPxQx) + .int L(P3Q3)-L(setPxQx) + .int L(P4Q3)-L(setPxQx) + .int L(P5Q3)-L(setPxQx) + .int L(P6Q3)-L(setPxQx) + .int L(P7Q3)-L(setPxQx) + + .int L(P0Q4)-L(setPxQx) /* 32 */ + .int L(P1Q4)-L(setPxQx) + .int L(P2Q4)-L(setPxQx) + .int L(P3Q4)-L(setPxQx) + .int L(P4Q4)-L(setPxQx) + .int L(P5Q4)-L(setPxQx) + .int L(P6Q4)-L(setPxQx) + .int L(P7Q4)-L(setPxQx) + + .int L(P0Q5)-L(setPxQx) /* 40 */ + .int L(P1Q5)-L(setPxQx) + .int L(P2Q5)-L(setPxQx) + .int L(P3Q5)-L(setPxQx) + .int L(P4Q5)-L(setPxQx) + .int L(P5Q5)-L(setPxQx) + .int L(P6Q5)-L(setPxQx) + .int L(P7Q5)-L(setPxQx) + + .int L(P0Q6)-L(setPxQx) /* 48 */ + .int L(P1Q6)-L(setPxQx) + .int L(P2Q6)-L(setPxQx) + .int L(P3Q6)-L(setPxQx) + .int L(P4Q6)-L(setPxQx) + .int L(P5Q6)-L(setPxQx) + .int L(P6Q6)-L(setPxQx) + .int L(P7Q6)-L(setPxQx) + + .int L(P0Q7)-L(setPxQx) /* 56 */ + .int L(P1Q7)-L(setPxQx) + .int L(P2Q7)-L(setPxQx) + .int L(P3Q7)-L(setPxQx) + .int L(P4Q7)-L(setPxQx) + .int L(P5Q7)-L(setPxQx) + .int L(P6Q7)-L(setPxQx) + .int L(P7Q7)-L(setPxQx) + + .int L(P0Q8)-L(setPxQx) /* 64 */ + .int L(P1Q8)-L(setPxQx) + .int L(P2Q8)-L(setPxQx) + .int L(P3Q8)-L(setPxQx) + .int L(P4Q8)-L(setPxQx) + .int L(P5Q8)-L(setPxQx) + .int L(P6Q8)-L(setPxQx) + .int L(P7Q8)-L(setPxQx) + 
+ .int L(P0Q9)-L(setPxQx) /* 72 */ + .int L(P1Q9)-L(setPxQx) + .int L(P2Q9)-L(setPxQx) + .int L(P3Q9)-L(setPxQx) + .int L(P4Q9)-L(setPxQx) + .int L(P5Q9)-L(setPxQx) + .int L(P6Q9)-L(setPxQx) + .int L(P7Q9)-L(setPxQx) /* 79 */ + + .p2align 4 +L(P0Q9): mov %rax, -0x48(%rdi) +L(P0Q8): mov %rax, -0x40(%rdi) +L(P0Q7): mov %rax, -0x38(%rdi) +L(P0Q6): mov %rax, -0x30(%rdi) +L(P0Q5): mov %rax, -0x28(%rdi) +L(P0Q4): mov %rax, -0x20(%rdi) +L(P0Q3): mov %rax, -0x18(%rdi) +L(P0Q2): mov %rax, -0x10(%rdi) +L(P0Q1): mov %rax, -0x8(%rdi) +L(P0Q0): + ret + + .p2align 4 +L(P1Q9): mov %rax, -0x49(%rdi) +L(P1Q8): mov %rax, -0x41(%rdi) +L(P1Q7): mov %rax, -0x39(%rdi) +L(P1Q6): mov %rax, -0x31(%rdi) +L(P1Q5): mov %rax, -0x29(%rdi) +L(P1Q4): mov %rax, -0x21(%rdi) +L(P1Q3): mov %rax, -0x19(%rdi) +L(P1Q2): mov %rax, -0x11(%rdi) +L(P1Q1): mov %rax, -0x9(%rdi) +L(P1Q0): mov %al, -0x1(%rdi) + ret + + .p2align 4 +L(P2Q9): mov %rax, -0x4a(%rdi) +L(P2Q8): mov %rax, -0x42(%rdi) +L(P2Q7): mov %rax, -0x3a(%rdi) +L(P2Q6): mov %rax, -0x32(%rdi) +L(P2Q5): mov %rax, -0x2a(%rdi) +L(P2Q4): mov %rax, -0x22(%rdi) +L(P2Q3): mov %rax, -0x1a(%rdi) +L(P2Q2): mov %rax, -0x12(%rdi) +L(P2Q1): mov %rax, -0xa(%rdi) +L(P2Q0): mov %ax, -0x2(%rdi) + ret + + .p2align 4 +L(P3Q9): mov %rax, -0x4b(%rdi) +L(P3Q8): mov %rax, -0x43(%rdi) +L(P3Q7): mov %rax, -0x3b(%rdi) +L(P3Q6): mov %rax, -0x33(%rdi) +L(P3Q5): mov %rax, -0x2b(%rdi) +L(P3Q4): mov %rax, -0x23(%rdi) +L(P3Q3): mov %rax, -0x1b(%rdi) +L(P3Q2): mov %rax, -0x13(%rdi) +L(P3Q1): mov %rax, -0xb(%rdi) +L(P3Q0): mov %ax, -0x3(%rdi) + mov %al, -0x1(%rdi) + ret + + .p2align 4 +L(P4Q9): mov %rax, -0x4c(%rdi) +L(P4Q8): mov %rax, -0x44(%rdi) +L(P4Q7): mov %rax, -0x3c(%rdi) +L(P4Q6): mov %rax, -0x34(%rdi) +L(P4Q5): mov %rax, -0x2c(%rdi) +L(P4Q4): mov %rax, -0x24(%rdi) +L(P4Q3): mov %rax, -0x1c(%rdi) +L(P4Q2): mov %rax, -0x14(%rdi) +L(P4Q1): mov %rax, -0xc(%rdi) +L(P4Q0): mov %eax, -0x4(%rdi) + ret + + .p2align 4 +L(P5Q9): mov %rax, -0x4d(%rdi) +L(P5Q8): mov %rax, -0x45(%rdi) +L(P5Q7): mov %rax, -0x3d(%rdi) +L(P5Q6): mov %rax, -0x35(%rdi) +L(P5Q5): mov %rax, -0x2d(%rdi) +L(P5Q4): mov %rax, -0x25(%rdi) +L(P5Q3): mov %rax, -0x1d(%rdi) +L(P5Q2): mov %rax, -0x15(%rdi) +L(P5Q1): mov %rax, -0xd(%rdi) +L(P5Q0): mov %eax, -0x5(%rdi) + mov %al, -0x1(%rdi) + ret + + .p2align 4 +L(P6Q9): mov %rax, -0x4e(%rdi) +L(P6Q8): mov %rax, -0x46(%rdi) +L(P6Q7): mov %rax, -0x3e(%rdi) +L(P6Q6): mov %rax, -0x36(%rdi) +L(P6Q5): mov %rax, -0x2e(%rdi) +L(P6Q4): mov %rax, -0x26(%rdi) +L(P6Q3): mov %rax, -0x1e(%rdi) +L(P6Q2): mov %rax, -0x16(%rdi) +L(P6Q1): mov %rax, -0xe(%rdi) +L(P6Q0): mov %eax, -0x6(%rdi) + mov %ax, -0x2(%rdi) + ret + + .p2align 4 +L(P7Q9): mov %rax, -0x4f(%rdi) +L(P7Q8): mov %rax, -0x47(%rdi) +L(P7Q7): mov %rax, -0x3f(%rdi) +L(P7Q6): mov %rax, -0x37(%rdi) +L(P7Q5): mov %rax, -0x2f(%rdi) +L(P7Q4): mov %rax, -0x27(%rdi) +L(P7Q3): mov %rax, -0x1f(%rdi) +L(P7Q2): mov %rax, -0x17(%rdi) +L(P7Q1): mov %rax, -0xf(%rdi) +L(P7Q0): mov %eax, -0x7(%rdi) + mov %ax, -0x3(%rdi) + mov %al, -0x1(%rdi) + ret + + /* + * Align to a 16-byte boundary. Avoids penalties from unaligned stores + * as well as from stores spanning cachelines. Note 16-byte alignment + * is better in case where rep sstosq is used. 
+ */ + .p2align 4 +L(ck_align): + test $0xf, %rdi + jz L(aligned_now) + test $1, %rdi + jz 2f + mov %al, (%rdi) + dec %rsi + lea 1(%rdi),%rdi +2: + test $2, %rdi + jz 4f + mov %ax, (%rdi) + sub $2, %rsi + lea 2(%rdi),%rdi +4: + test $4, %rdi + jz 8f + mov %eax, (%rdi) + sub $4, %rsi + lea 4(%rdi),%rdi +8: + test $8, %rdi + jz L(aligned_now) + mov %rax, (%rdi) + sub $8, %rsi + lea 8(%rdi),%rdi + + /* + * For large sizes rep sstoq is fastest. + * Transition point determined experimentally as measured on + * Intel Xeon processors (incl. Nehalem) and AMD Opteron. + */ +L(aligned_now): + cmp $BZERO_USE_REP, %rsi + ja L(use_rep) + + /* + * zero 64-bytes per loop + */ + .p2align 4 +L(bzero_loop): + leaq -0x40(%rsi), %rsi + cmpq $0x40, %rsi + movq %rax, (%rdi) + movq %rax, 0x8(%rdi) + movq %rax, 0x10(%rdi) + movq %rax, 0x18(%rdi) + movq %rax, 0x20(%rdi) + movq %rax, 0x28(%rdi) + movq %rax, 0x30(%rdi) + movq %rax, 0x38(%rdi) + leaq 0x40(%rdi), %rdi + jae L(bzero_loop) + + /* + * Clear any remaining bytes.. + */ +9: + leaq L(setPxQx)(%rip), %r10 + addq %rsi, %rdi + movslq (%r10,%rsi,4), %rcx + leaq (%rcx,%r10,1), %r10 + INDIRECT_JMP_REG(r10) + + /* + * Use rep sstoq. Clear any remainder via unrolled code + */ + .p2align 4 +L(use_rep): + movq %rsi, %rcx /* get size in bytes */ + shrq $3, %rcx /* count of 8-byte words to zero */ + rep + sstoq /* %rcx = words to clear (%rax=0) */ + andq $7, %rsi /* remaining bytes */ + jnz 9b + ret +#undef L + SET_SIZE(bzero_altentry) + SET_SIZE(bzero) + +/* + * Transfer data to and from user space - + * Note that these routines can cause faults + * It is assumed that the kernel has nothing at + * less than KERNELBASE in the virtual address space. + * + * Note that copyin(9F) and copyout(9F) are part of the + * DDI/DKI which specifies that they return '-1' on "errors." + * + * Sigh. + * + * So there's two extremely similar routines - xcopyin_nta() and + * xcopyout_nta() which return the errno that we've faithfully computed. + * This allows other callers (e.g. uiomove(9F)) to work correctly. + * Given that these are used pretty heavily, we expand the calling + * sequences inline for all flavours (rather than making wrappers). + */ + +/* + * Copy user data to kernel space. + */ + + ENTRY(copyin) + pushq %rbp + movq %rsp, %rbp + subq $24, %rsp + + /* + * save args in case we trap and need to rerun as a copyop + */ + movq %rdi, (%rsp) + movq %rsi, 0x8(%rsp) + movq %rdx, 0x10(%rsp) + + movq kernelbase(%rip), %rax +#ifdef DEBUG + cmpq %rax, %rsi /* %rsi = kaddr */ + jnb 1f + leaq .copyin_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +1: +#endif + /* + * pass lofault value as 4th argument to do_copy_fault + */ + leaq _copyin_err(%rip), %rcx + + movq %gs:CPU_THREAD, %r9 + cmpq %rax, %rdi /* test uaddr < kernelbase */ + jae 3f /* take copyop if uaddr > kernelbase */ + SMAP_DISABLE_INSTR(0) + jmp do_copy_fault /* Takes care of leave for us */ + +_copyin_err: + SMAP_ENABLE_INSTR(2) + movq %r11, T_LOFAULT(%r9) /* restore original lofault */ + addq $8, %rsp /* pop bcopy_altentry call ret addr */ +3: + movq T_COPYOPS(%r9), %rax + cmpq $0, %rax + jz 2f + /* + * reload args for the copyop + */ + movq (%rsp), %rdi + movq 0x8(%rsp), %rsi + movq 0x10(%rsp), %rdx + leave + movq CP_COPYIN(%rax), %rax + INDIRECT_JMP_REG(rax) + +2: movl $-1, %eax + leave + ret + SET_SIZE(copyin) + + ENTRY(xcopyin_nta) + pushq %rbp + movq %rsp, %rbp + subq $24, %rsp + + /* + * save args in case we trap and need to rerun as a copyop + * %rcx is consumed in this routine so we don't need to save + * it. 
+ */ + movq %rdi, (%rsp) + movq %rsi, 0x8(%rsp) + movq %rdx, 0x10(%rsp) + + movq kernelbase(%rip), %rax +#ifdef DEBUG + cmpq %rax, %rsi /* %rsi = kaddr */ + jnb 1f + leaq .xcopyin_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +1: +#endif + movq %gs:CPU_THREAD, %r9 + cmpq %rax, %rdi /* test uaddr < kernelbase */ + jae 4f + cmpq $0, %rcx /* No non-temporal access? */ + /* + * pass lofault value as 4th argument to do_copy_fault + */ + leaq _xcopyin_err(%rip), %rcx /* doesn't set rflags */ + jnz 6f /* use regular access */ + /* + * Make sure cnt is >= XCOPY_MIN_SIZE bytes + */ + cmpq $XCOPY_MIN_SIZE, %rdx + jae 5f +6: + SMAP_DISABLE_INSTR(1) + jmp do_copy_fault + + /* + * Make sure src and dst are NTA_ALIGN_SIZE aligned, + * count is COUNT_ALIGN_SIZE aligned. + */ +5: + movq %rdi, %r10 + orq %rsi, %r10 + andq $NTA_ALIGN_MASK, %r10 + orq %rdx, %r10 + andq $COUNT_ALIGN_MASK, %r10 + jnz 6b + leaq _xcopyin_nta_err(%rip), %rcx /* doesn't set rflags */ + SMAP_DISABLE_INSTR(2) + jmp do_copy_fault_nta /* use non-temporal access */ + +4: + movl $EFAULT, %eax + jmp 3f + + /* + * A fault during do_copy_fault or do_copy_fault_nta is + * indicated through an errno value in %rax and we iret from the + * trap handler to here. + */ +_xcopyin_err: + addq $8, %rsp /* pop bcopy_altentry call ret addr */ +_xcopyin_nta_err: + SMAP_ENABLE_INSTR(3) + movq %r11, T_LOFAULT(%r9) /* restore original lofault */ +3: + movq T_COPYOPS(%r9), %r8 + cmpq $0, %r8 + jz 2f + + /* + * reload args for the copyop + */ + movq (%rsp), %rdi + movq 0x8(%rsp), %rsi + movq 0x10(%rsp), %rdx + leave + movq CP_XCOPYIN(%r8), %r8 + INDIRECT_JMP_REG(r8) + +2: leave + ret + SET_SIZE(xcopyin_nta) + +/* + * Copy kernel data to user space. + */ + + ENTRY(copyout) + pushq %rbp + movq %rsp, %rbp + subq $24, %rsp + + /* + * save args in case we trap and need to rerun as a copyop + */ + movq %rdi, (%rsp) + movq %rsi, 0x8(%rsp) + movq %rdx, 0x10(%rsp) + + movq kernelbase(%rip), %rax +#ifdef DEBUG + cmpq %rax, %rdi /* %rdi = kaddr */ + jnb 1f + leaq .copyout_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +1: +#endif + /* + * pass lofault value as 4th argument to do_copy_fault + */ + leaq _copyout_err(%rip), %rcx + + movq %gs:CPU_THREAD, %r9 + cmpq %rax, %rsi /* test uaddr < kernelbase */ + jae 3f /* take copyop if uaddr > kernelbase */ + SMAP_DISABLE_INSTR(3) + jmp do_copy_fault /* Calls leave for us */ + +_copyout_err: + SMAP_ENABLE_INSTR(4) + movq %r11, T_LOFAULT(%r9) /* restore original lofault */ + addq $8, %rsp /* pop bcopy_altentry call ret addr */ +3: + movq T_COPYOPS(%r9), %rax + cmpq $0, %rax + jz 2f + + /* + * reload args for the copyop + */ + movq (%rsp), %rdi + movq 0x8(%rsp), %rsi + movq 0x10(%rsp), %rdx + leave + movq CP_COPYOUT(%rax), %rax + INDIRECT_JMP_REG(rax) + +2: movl $-1, %eax + leave + ret + SET_SIZE(copyout) + + ENTRY(xcopyout_nta) + pushq %rbp + movq %rsp, %rbp + subq $24, %rsp + + /* + * save args in case we trap and need to rerun as a copyop + */ + movq %rdi, (%rsp) + movq %rsi, 0x8(%rsp) + movq %rdx, 0x10(%rsp) + + movq kernelbase(%rip), %rax +#ifdef DEBUG + cmpq %rax, %rdi /* %rdi = kaddr */ + jnb 1f + leaq .xcopyout_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +1: +#endif + movq %gs:CPU_THREAD, %r9 + cmpq %rax, %rsi /* test uaddr < kernelbase */ + jae 4f + + cmpq $0, %rcx /* No non-temporal access? 
*/ + /* + * pass lofault value as 4th argument to do_copy_fault + */ + leaq _xcopyout_err(%rip), %rcx + jnz 6f + /* + * Make sure cnt is >= XCOPY_MIN_SIZE bytes + */ + cmpq $XCOPY_MIN_SIZE, %rdx + jae 5f +6: + SMAP_DISABLE_INSTR(4) + jmp do_copy_fault + + /* + * Make sure src and dst are NTA_ALIGN_SIZE aligned, + * count is COUNT_ALIGN_SIZE aligned. + */ +5: + movq %rdi, %r10 + orq %rsi, %r10 + andq $NTA_ALIGN_MASK, %r10 + orq %rdx, %r10 + andq $COUNT_ALIGN_MASK, %r10 + jnz 6b + leaq _xcopyout_nta_err(%rip), %rcx + SMAP_DISABLE_INSTR(5) + call do_copy_fault_nta + SMAP_ENABLE_INSTR(5) + ret + +4: + movl $EFAULT, %eax + jmp 3f + + /* + * A fault during do_copy_fault or do_copy_fault_nta is + * indicated through an errno value in %rax and we iret from the + * trap handler to here. + */ +_xcopyout_err: + addq $8, %rsp /* pop bcopy_altentry call ret addr */ +_xcopyout_nta_err: + SMAP_ENABLE_INSTR(6) + movq %r11, T_LOFAULT(%r9) /* restore original lofault */ +3: + movq T_COPYOPS(%r9), %r8 + cmpq $0, %r8 + jz 2f + + /* + * reload args for the copyop + */ + movq (%rsp), %rdi + movq 0x8(%rsp), %rsi + movq 0x10(%rsp), %rdx + leave + movq CP_XCOPYOUT(%r8), %r8 + INDIRECT_JMP_REG(r8) + +2: leave + ret + SET_SIZE(xcopyout_nta) + +/* + * Copy a null terminated string from one point to another in + * the kernel address space. + */ + + ENTRY(copystr) + pushq %rbp + movq %rsp, %rbp +#ifdef DEBUG + movq kernelbase(%rip), %rax + cmpq %rax, %rdi /* %rdi = from */ + jb 0f + cmpq %rax, %rsi /* %rsi = to */ + jnb 1f +0: leaq .copystr_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +1: +#endif + movq %gs:CPU_THREAD, %r9 + movq T_LOFAULT(%r9), %r8 /* pass current lofault value as */ + /* 5th argument to do_copystr */ + xorl %r10d,%r10d /* pass smap restore need in %r10d */ + /* as a non-ABI 6th arg */ +do_copystr: + movq %gs:CPU_THREAD, %r9 /* %r9 = thread addr */ + movq T_LOFAULT(%r9), %r11 /* save the current lofault */ + movq %r8, T_LOFAULT(%r9) /* new lofault */ + + movq %rdx, %r8 /* save maxlength */ + + cmpq $0, %rdx /* %rdx = maxlength */ + je copystr_enametoolong /* maxlength == 0 */ + +copystr_loop: + decq %r8 + movb (%rdi), %al + incq %rdi + movb %al, (%rsi) + incq %rsi + cmpb $0, %al + je copystr_null /* null char */ + cmpq $0, %r8 + jne copystr_loop + +copystr_enametoolong: + movl $ENAMETOOLONG, %eax + jmp copystr_out + +copystr_null: + xorl %eax, %eax /* no error */ + +copystr_out: + cmpq $0, %rcx /* want length? */ + je copystr_smap /* no */ + subq %r8, %rdx /* compute length and store it */ + movq %rdx, (%rcx) + +copystr_smap: + cmpl $0, %r10d + jz copystr_done + SMAP_ENABLE_INSTR(7) + +copystr_done: + movq %r11, T_LOFAULT(%r9) /* restore the original lofault */ + leave + ret + SET_SIZE(copystr) + +/* + * Copy a null terminated string from the user address space into + * the kernel address space. 
+ */ + + ENTRY(copyinstr) + pushq %rbp + movq %rsp, %rbp + subq $32, %rsp + + /* + * save args in case we trap and need to rerun as a copyop + */ + movq %rdi, (%rsp) + movq %rsi, 0x8(%rsp) + movq %rdx, 0x10(%rsp) + movq %rcx, 0x18(%rsp) + + movq kernelbase(%rip), %rax +#ifdef DEBUG + cmpq %rax, %rsi /* %rsi = kaddr */ + jnb 1f + leaq .copyinstr_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +1: +#endif + /* + * pass lofault value as 5th argument to do_copystr + * do_copystr expects whether or not we need smap in %r10d + */ + leaq _copyinstr_error(%rip), %r8 + movl $1, %r10d + + cmpq %rax, %rdi /* test uaddr < kernelbase */ + jae 4f + SMAP_DISABLE_INSTR(6) + jmp do_copystr +4: + movq %gs:CPU_THREAD, %r9 + jmp 3f + +_copyinstr_error: + SMAP_ENABLE_INSTR(8) + movq %r11, T_LOFAULT(%r9) /* restore original lofault */ +3: + movq T_COPYOPS(%r9), %rax + cmpq $0, %rax + jz 2f + + /* + * reload args for the copyop + */ + movq (%rsp), %rdi + movq 0x8(%rsp), %rsi + movq 0x10(%rsp), %rdx + movq 0x18(%rsp), %rcx + leave + movq CP_COPYINSTR(%rax), %rax + INDIRECT_JMP_REG(rax) + +2: movl $EFAULT, %eax /* return EFAULT */ + leave + ret + SET_SIZE(copyinstr) + +/* + * Copy a null terminated string from the kernel + * address space to the user address space. + */ + + ENTRY(copyoutstr) + pushq %rbp + movq %rsp, %rbp + subq $32, %rsp + + /* + * save args in case we trap and need to rerun as a copyop + */ + movq %rdi, (%rsp) + movq %rsi, 0x8(%rsp) + movq %rdx, 0x10(%rsp) + movq %rcx, 0x18(%rsp) + + movq kernelbase(%rip), %rax +#ifdef DEBUG + cmpq %rax, %rdi /* %rdi = kaddr */ + jnb 1f + leaq .copyoutstr_panic_msg(%rip), %rdi + jmp call_panic /* setup stack and call panic */ +1: +#endif + /* + * pass lofault value as 5th argument to do_copystr + * pass one as 6th argument to do_copystr in %r10d + */ + leaq _copyoutstr_error(%rip), %r8 + movl $1, %r10d + + cmpq %rax, %rsi /* test uaddr < kernelbase */ + jae 4f + SMAP_DISABLE_INSTR(7) + jmp do_copystr +4: + movq %gs:CPU_THREAD, %r9 + jmp 3f + +_copyoutstr_error: + SMAP_ENABLE_INSTR(9) + movq %r11, T_LOFAULT(%r9) /* restore the original lofault */ +3: + movq T_COPYOPS(%r9), %rax + cmpq $0, %rax + jz 2f + + /* + * reload args for the copyop + */ + movq (%rsp), %rdi + movq 0x8(%rsp), %rsi + movq 0x10(%rsp), %rdx + movq 0x18(%rsp), %rcx + leave + movq CP_COPYOUTSTR(%rax), %rax + INDIRECT_JMP_REG(rax) + +2: movl $EFAULT, %eax /* return EFAULT */ + leave + ret + SET_SIZE(copyoutstr) + +/* + * Since all of the fuword() variants are so similar, we have a macro to spit + * them out. This allows us to create DTrace-unobservable functions easily. + */ + +/* + * Note that we don't save and reload the arguments here + * because their values are not altered in the copy path. + * Additionally, when successful, the smap_enable jmp will + * actually return us to our original caller. 
+ */ + +#define FUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2) \ + ENTRY(NAME) \ + movq %gs:CPU_THREAD, %r9; \ + cmpq kernelbase(%rip), %rdi; \ + jae 1f; \ + leaq _flt_/**/NAME, %rdx; \ + movq %rdx, T_LOFAULT(%r9); \ + SMAP_DISABLE_INSTR(DISNUM) \ + INSTR (%rdi), REG; \ + movq $0, T_LOFAULT(%r9); \ + INSTR REG, (%rsi); \ + xorl %eax, %eax; \ + SMAP_ENABLE_INSTR(EN1) \ + ret; \ +_flt_/**/NAME: \ + SMAP_ENABLE_INSTR(EN2) \ + movq $0, T_LOFAULT(%r9); \ +1: \ + movq T_COPYOPS(%r9), %rax; \ + cmpq $0, %rax; \ + jz 2f; \ + movq COPYOP(%rax), %rax; \ + INDIRECT_JMP_REG(rax); \ +2: \ + movl $-1, %eax; \ + ret; \ + SET_SIZE(NAME) + + FUWORD(fuword64, movq, %rax, CP_FUWORD64,8,10,11) + FUWORD(fuword32, movl, %eax, CP_FUWORD32,9,12,13) + FUWORD(fuword16, movw, %ax, CP_FUWORD16,10,14,15) + FUWORD(fuword8, movb, %al, CP_FUWORD8,11,16,17) + +#undef FUWORD + +/* + * Set user word. + */ + +/* + * Note that we don't save and reload the arguments here + * because their values are not altered in the copy path. + */ + +#define SUWORD(NAME, INSTR, REG, COPYOP, DISNUM, EN1, EN2) \ + ENTRY(NAME) \ + movq %gs:CPU_THREAD, %r9; \ + cmpq kernelbase(%rip), %rdi; \ + jae 1f; \ + leaq _flt_/**/NAME, %rdx; \ + SMAP_DISABLE_INSTR(DISNUM) \ + movq %rdx, T_LOFAULT(%r9); \ + INSTR REG, (%rdi); \ + movq $0, T_LOFAULT(%r9); \ + xorl %eax, %eax; \ + SMAP_ENABLE_INSTR(EN1) \ + ret; \ +_flt_/**/NAME: \ + SMAP_ENABLE_INSTR(EN2) \ + movq $0, T_LOFAULT(%r9); \ +1: \ + movq T_COPYOPS(%r9), %rax; \ + cmpq $0, %rax; \ + jz 3f; \ + movq COPYOP(%rax), %rax; \ + INDIRECT_JMP_REG(rax); \ +3: \ + movl $-1, %eax; \ + ret; \ + SET_SIZE(NAME) + + SUWORD(suword64, movq, %rsi, CP_SUWORD64,12,18,19) + SUWORD(suword32, movl, %esi, CP_SUWORD32,13,20,21) + SUWORD(suword16, movw, %si, CP_SUWORD16,14,22,23) + SUWORD(suword8, movb, %sil, CP_SUWORD8,15,24,25) + +#undef SUWORD + +#define FUWORD_NOERR(NAME, INSTR, REG) \ + ENTRY(NAME) \ + cmpq kernelbase(%rip), %rdi; \ + cmovnbq kernelbase(%rip), %rdi; \ + INSTR (%rdi), REG; \ + INSTR REG, (%rsi); \ + ret; \ + SET_SIZE(NAME) + + FUWORD_NOERR(fuword64_noerr, movq, %rax) + FUWORD_NOERR(fuword32_noerr, movl, %eax) + FUWORD_NOERR(fuword16_noerr, movw, %ax) + FUWORD_NOERR(fuword8_noerr, movb, %al) + +#undef FUWORD_NOERR + +#define SUWORD_NOERR(NAME, INSTR, REG) \ + ENTRY(NAME) \ + cmpq kernelbase(%rip), %rdi; \ + cmovnbq kernelbase(%rip), %rdi; \ + INSTR REG, (%rdi); \ + ret; \ + SET_SIZE(NAME) + + SUWORD_NOERR(suword64_noerr, movq, %rsi) + SUWORD_NOERR(suword32_noerr, movl, %esi) + SUWORD_NOERR(suword16_noerr, movw, %si) + SUWORD_NOERR(suword8_noerr, movb, %sil) + +#undef SUWORD_NOERR + + + .weak subyte + subyte=suword8 + .weak subyte_noerr + subyte_noerr=suword8_noerr + + .weak fulword + fulword=fuword64 + .weak fulword_noerr + fulword_noerr=fuword64_noerr + .weak sulword + sulword=suword64 + .weak sulword_noerr + sulword_noerr=suword64_noerr + + ENTRY(copyin_noerr) + movq kernelbase(%rip), %rax +#ifdef DEBUG + cmpq %rax, %rsi /* %rsi = kto */ + jae 1f + leaq .cpyin_ne_pmsg(%rip), %rdi + jmp call_panic /* setup stack and call panic */ +1: +#endif + cmpq %rax, %rdi /* ufrom < kernelbase */ + jb do_copy + movq %rax, %rdi /* force fault at kernelbase */ + jmp do_copy + SET_SIZE(copyin_noerr) + + ENTRY(copyout_noerr) + movq kernelbase(%rip), %rax +#ifdef DEBUG + cmpq %rax, %rdi /* %rdi = kfrom */ + jae 1f + leaq .cpyout_ne_pmsg(%rip), %rdi + jmp call_panic /* setup stack and call panic */ +1: +#endif + cmpq %rax, %rsi /* uto < kernelbase */ + jb do_copy + movq %rax, %rsi /* force fault at kernelbase */ + 
jmp do_copy + SET_SIZE(copyout_noerr) + + ENTRY(uzero) + movq kernelbase(%rip), %rax + cmpq %rax, %rdi + jb do_zero + movq %rax, %rdi /* force fault at kernelbase */ + jmp do_zero + SET_SIZE(uzero) + + ENTRY(ucopy) + movq kernelbase(%rip), %rax + cmpq %rax, %rdi + cmovaeq %rax, %rdi /* force fault at kernelbase */ + cmpq %rax, %rsi + cmovaeq %rax, %rsi /* force fault at kernelbase */ + jmp do_copy + SET_SIZE(ucopy) + + /* + * Note, the frame pointer is required here becuase do_copystr expects + * to be able to pop it off! + */ + ENTRY(ucopystr) + pushq %rbp + movq %rsp, %rbp + movq kernelbase(%rip), %rax + cmpq %rax, %rdi + cmovaeq %rax, %rdi /* force fault at kernelbase */ + cmpq %rax, %rsi + cmovaeq %rax, %rsi /* force fault at kernelbase */ + /* do_copystr expects lofault address in %r8 */ + /* do_copystr expects whether or not we need smap in %r10 */ + xorl %r10d, %r10d + movq %gs:CPU_THREAD, %r8 + movq T_LOFAULT(%r8), %r8 + jmp do_copystr + SET_SIZE(ucopystr) + +#ifdef DEBUG + .data +.kcopy_panic_msg: + .string "kcopy: arguments below kernelbase" +.bcopy_panic_msg: + .string "bcopy: arguments below kernelbase" +.kzero_panic_msg: + .string "kzero: arguments below kernelbase" +.bzero_panic_msg: + .string "bzero: arguments below kernelbase" +.copyin_panic_msg: + .string "copyin: kaddr argument below kernelbase" +.xcopyin_panic_msg: + .string "xcopyin: kaddr argument below kernelbase" +.copyout_panic_msg: + .string "copyout: kaddr argument below kernelbase" +.xcopyout_panic_msg: + .string "xcopyout: kaddr argument below kernelbase" +.copystr_panic_msg: + .string "copystr: arguments in user space" +.copyinstr_panic_msg: + .string "copyinstr: kaddr argument not in kernel address space" +.copyoutstr_panic_msg: + .string "copyoutstr: kaddr argument not in kernel address space" +.cpyin_ne_pmsg: + .string "copyin_noerr: argument not in kernel address space" +.cpyout_ne_pmsg: + .string "copyout_noerr: argument not in kernel address space" +#endif + +.data +.align 4 +.globl _smap_enable_patch_count +.type _smap_enable_patch_count,@object +.size _smap_enable_patch_count, 4 +_smap_enable_patch_count: + .long SMAP_ENABLE_COUNT + +.globl _smap_disable_patch_count +.type _smap_disable_patch_count,@object +.size _smap_disable_patch_count, 4 +_smap_disable_patch_count: + .long SMAP_DISABLE_COUNT diff --git a/usr/src/uts/intel/ml/ddi_i86_asm.s b/usr/src/uts/intel/ml/ddi_i86_asm.s new file mode 100644 index 0000000000..2fa9bd75e9 --- /dev/null +++ b/usr/src/uts/intel/ml/ddi_i86_asm.s @@ -0,0 +1,522 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include "assym.h" + + ENTRY(ddi_get8) + ALTENTRY(ddi_mem_get8) + ALTENTRY(ddi_io_get8) + movl ACC_ATTR(%rdi), %edx + cmpl $_CONST(DDI_ACCATTR_IO_SPACE|DDI_ACCATTR_DIRECT), %edx + jne 1f + movq %rsi, %rdx + xorq %rax, %rax + inb (%dx) + ret +1: + cmpl $_CONST(DDI_ACCATTR_CPU_VADDR|DDI_ACCATTR_DIRECT), %edx + jne 2f + movzbq (%rsi), %rax + ret +2: + movq ACC_GETB(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_get8) + SET_SIZE(ddi_mem_get8) + SET_SIZE(ddi_io_get8) + + + ENTRY(ddi_get16) + ALTENTRY(ddi_mem_get16) + ALTENTRY(ddi_io_get16) + movl ACC_ATTR(%rdi), %edx + cmpl $_CONST(DDI_ACCATTR_IO_SPACE|DDI_ACCATTR_DIRECT), %edx + jne 3f + movq %rsi, %rdx + xorq %rax, %rax + inw (%dx) + ret +3: + cmpl $_CONST(DDI_ACCATTR_CPU_VADDR|DDI_ACCATTR_DIRECT), %edx + jne 4f + movzwq (%rsi), %rax + ret +4: + movq ACC_GETW(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_get16) + SET_SIZE(ddi_mem_get16) + SET_SIZE(ddi_io_get16) + + + ENTRY(ddi_get32) + ALTENTRY(ddi_mem_get32) + ALTENTRY(ddi_io_get32) + movl ACC_ATTR(%rdi), %edx + cmpl $_CONST(DDI_ACCATTR_IO_SPACE|DDI_ACCATTR_DIRECT), %edx + jne 5f + movq %rsi, %rdx + inl (%dx) + ret +5: + cmpl $_CONST(DDI_ACCATTR_CPU_VADDR|DDI_ACCATTR_DIRECT), %edx + jne 6f + movl (%rsi), %eax + ret +6: + movq ACC_GETL(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_get32) + SET_SIZE(ddi_mem_get32) + SET_SIZE(ddi_io_get32) + + + ENTRY(ddi_get64) + ALTENTRY(ddi_mem_get64) + movq ACC_GETLL(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_get64) + SET_SIZE(ddi_mem_get64) + + + ENTRY(ddi_put8) + ALTENTRY(ddi_mem_put8) + ALTENTRY(ddi_io_put8) + movl ACC_ATTR(%rdi), %ecx + cmpl $_CONST(DDI_ACCATTR_IO_SPACE|DDI_ACCATTR_DIRECT), %ecx + jne 7f + movq %rdx, %rax + movq %rsi, %rdx + outb (%dx) + ret +7: + cmpl $_CONST(DDI_ACCATTR_CPU_VADDR|DDI_ACCATTR_DIRECT), %ecx + jne 8f + movb %dl, (%rsi) + ret +8: + movq ACC_PUTB(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_put8) + SET_SIZE(ddi_mem_put8) + SET_SIZE(ddi_io_put8) + + + ENTRY(ddi_put16) + ALTENTRY(ddi_mem_put16) + ALTENTRY(ddi_io_put16) + movl ACC_ATTR(%rdi), %ecx + cmpl $_CONST(DDI_ACCATTR_IO_SPACE|DDI_ACCATTR_DIRECT), %ecx + jne 8f + movq %rdx, %rax + movq %rsi, %rdx + outw (%dx) + ret +8: + cmpl $_CONST(DDI_ACCATTR_CPU_VADDR|DDI_ACCATTR_DIRECT), %ecx + jne 9f + movw %dx, (%rsi) + ret +9: + movq ACC_PUTW(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_put16) + SET_SIZE(ddi_mem_put16) + SET_SIZE(ddi_io_put16) + + + ENTRY(ddi_put32) + ALTENTRY(ddi_mem_put32) + ALTENTRY(ddi_io_put32) + movl ACC_ATTR(%rdi), %ecx + cmpl $_CONST(DDI_ACCATTR_IO_SPACE|DDI_ACCATTR_DIRECT), %ecx + jne 8f + movq %rdx, %rax + movq %rsi, %rdx + outl (%dx) + ret +8: + cmpl $_CONST(DDI_ACCATTR_CPU_VADDR|DDI_ACCATTR_DIRECT), %ecx + jne 9f + movl %edx, (%rsi) + ret +9: + movq ACC_PUTL(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_put32) + SET_SIZE(ddi_mem_put32) + SET_SIZE(ddi_io_put32) + + + ENTRY(ddi_put64) + ALTENTRY(ddi_mem_put64) + movq ACC_PUTLL(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_put64) + SET_SIZE(ddi_mem_put64) + + + ENTRY(ddi_rep_get8) + ALTENTRY(ddi_mem_rep_get8) + movq ACC_REP_GETB(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_rep_get8) + SET_SIZE(ddi_mem_rep_get8) + + + ENTRY(ddi_rep_get16) + ALTENTRY(ddi_mem_rep_get16) + movq ACC_REP_GETW(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_rep_get16) + SET_SIZE(ddi_mem_rep_get16) + + + ENTRY(ddi_rep_get32) + ALTENTRY(ddi_mem_rep_get32) + movq ACC_REP_GETL(%rdi), %rax + INDIRECT_JMP_REG(rax) + 
SET_SIZE(ddi_rep_get32) + SET_SIZE(ddi_mem_rep_get32) + + + ENTRY(ddi_rep_get64) + ALTENTRY(ddi_mem_rep_get64) + movq ACC_REP_GETLL(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_rep_get64) + SET_SIZE(ddi_mem_rep_get64) + + + ENTRY(ddi_rep_put8) + ALTENTRY(ddi_mem_rep_put8) + movq ACC_REP_PUTB(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_rep_put8) + SET_SIZE(ddi_mem_rep_put8) + + + ENTRY(ddi_rep_put16) + ALTENTRY(ddi_mem_rep_put16) + movq ACC_REP_PUTW(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_rep_put16) + SET_SIZE(ddi_mem_rep_put16) + + + ENTRY(ddi_rep_put32) + ALTENTRY(ddi_mem_rep_put32) + movq ACC_REP_PUTL(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_rep_put32) + SET_SIZE(ddi_mem_rep_put32) + + + ENTRY(ddi_rep_put64) + ALTENTRY(ddi_mem_rep_put64) + movq ACC_REP_PUTLL(%rdi), %rax + INDIRECT_JMP_REG(rax) + SET_SIZE(ddi_rep_put64) + SET_SIZE(ddi_mem_rep_put64) + + ENTRY(i_ddi_vaddr_get8) + movzbq (%rsi), %rax + ret + SET_SIZE(i_ddi_vaddr_get8) + + ENTRY(i_ddi_vaddr_get16) + movzwq (%rsi), %rax + ret + SET_SIZE(i_ddi_vaddr_get16) + + + ENTRY(i_ddi_vaddr_get32) + movl (%rsi), %eax + ret + SET_SIZE(i_ddi_vaddr_get32) + + + ENTRY(i_ddi_vaddr_get64) + movq (%rsi), %rax + ret + SET_SIZE(i_ddi_vaddr_get64) + + + ENTRY(i_ddi_io_get8) + movq %rsi, %rdx + inb (%dx) + movzbq %al, %rax + ret + SET_SIZE(i_ddi_io_get8) + + + ENTRY(i_ddi_io_get16) + movq %rsi, %rdx + inw (%dx) + movzwq %ax, %rax + ret + SET_SIZE(i_ddi_io_get16) + + + ENTRY(i_ddi_io_get32) + movq %rsi, %rdx + inl (%dx) + ret + SET_SIZE(i_ddi_io_get32) + + ENTRY(i_ddi_vaddr_put8) + movb %dl, (%rsi) + ret + SET_SIZE(i_ddi_vaddr_put8) + + + ENTRY(i_ddi_vaddr_put16) + movw %dx, (%rsi) + ret + SET_SIZE(i_ddi_vaddr_put16) + + + ENTRY(i_ddi_vaddr_put32) + movl %edx, (%rsi) + ret + SET_SIZE(i_ddi_vaddr_put32) + + + ENTRY(i_ddi_vaddr_put64) + movq %rdx, (%rsi) + ret + SET_SIZE(i_ddi_vaddr_put64) + + ENTRY(i_ddi_io_put8) + movq %rdx, %rax + movq %rsi, %rdx + outb (%dx) + ret + SET_SIZE(i_ddi_io_put8) + + + ENTRY(i_ddi_io_put16) + movq %rdx, %rax + movq %rsi, %rdx + outw (%dx) + ret + SET_SIZE(i_ddi_io_put16) + + + ENTRY(i_ddi_io_put32) + movq %rdx, %rax + movq %rsi, %rdx + outl (%dx) + ret + SET_SIZE(i_ddi_io_put32) + + /* + * Incoming arguments + * + * %rdi : hdlp + * %rsi : host_addr + * %rdx : dev_addr + * %rcx : repcount + * %r8 : flags + * + * This routine will destroy values in %rdx, %rsi, %rcx. 
+ */ + ENTRY(i_ddi_io_rep_get8) + + cmpq $DDI_DEV_AUTOINCR, %r8 + je gb_ioadv + movq %rsi, %rdi + rep + insb + ret + +gb_ioadv: + andq %rcx, %rcx + jz gb_ioadv_done +gb_ioadv2: + inb (%dx) + movb %al, (%rsi) + incq %rdx + incq %rsi + decq %rcx + jg gb_ioadv2 + +gb_ioadv_done: + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + + SET_SIZE(i_ddi_io_rep_get8) + + + ENTRY(i_ddi_io_rep_get16) + + cmpq $DDI_DEV_AUTOINCR, %r8 + je gw_ioadv + + movq %rsi, %rdi + rep + insw + ret + +gw_ioadv: + andq %rcx, %rcx + jz gw_ioadv_done +gw_ioadv2: + inw (%dx) + movw %ax,(%rsi) + addq $2, %rsi + addq $2, %rdx + decq %rcx + jg gw_ioadv2 + +gw_ioadv_done: + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(i_ddi_io_rep_get16) + + + ENTRY(i_ddi_io_rep_get32) + + cmpq $DDI_DEV_AUTOINCR, %r8 + je gl_ioadv + + movq %rsi, %rdi + rep + insl + ret + +gl_ioadv: + andq %rcx, %rcx + jz gl_ioadv_done +gl_ioadv2: + inl (%dx) + movl %eax,(%rsi) + addq $4, %rsi + addq $4, %rdx + decq %rcx + jg gl_ioadv2 + +gl_ioadv_done: + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + + SET_SIZE(i_ddi_io_rep_get32) + + /* + * Incoming arguments + * + * %rdi : hdlp + * %rsi : host_addr + * %rdx : dev_addr + * %rcx : repcount + * %r8 : flags + * + * This routine will destroy values in %rdx, %rsi, %rcx. + */ + ENTRY(i_ddi_io_rep_put8) + + cmpq $DDI_DEV_AUTOINCR, %r8 + je pb_ioadv + + movq %rsi, %rdi + rep + outsb + ret + +pb_ioadv: + andq %rcx, %rcx + jz pb_ioadv_done +pb_ioadv2: + movb (%rsi), %al + outb (%dx) + incq %rsi + incq %rdx + decq %rcx + jg pb_ioadv2 + +pb_ioadv_done: + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(i_ddi_io_rep_put8) + + ENTRY(i_ddi_io_rep_put16) + + cmpq $DDI_DEV_AUTOINCR, %r8 + je pw_ioadv + + movq %rsi, %rdi + rep + outsw + ret + +pw_ioadv: + andq %rcx, %rcx + jz pw_ioadv_done +pw_ioadv2: + movw (%rsi), %ax + outw (%dx) + addq $2, %rsi + addq $2, %rdx + decq %rcx + jg pw_ioadv2 + +pw_ioadv_done: + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(i_ddi_io_rep_put16) + + + ENTRY(i_ddi_io_rep_put32) + + cmpq $DDI_DEV_AUTOINCR, %r8 + je pl_ioadv + + movq %rsi, %rdi + rep + outsl + ret + +pl_ioadv: + andq %rcx, %rcx + jz pl_ioadv_done +pl_ioadv2: + movl (%rsi), %eax + outl (%dx) + addq $4, %rsi + addq $4, %rdx + decq %rcx + jg pl_ioadv2 + +pl_ioadv_done: + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(i_ddi_io_rep_put32) diff --git a/usr/src/uts/intel/ml/desctbls_asm.s b/usr/src/uts/intel/ml/desctbls_asm.s new file mode 100644 index 0000000000..4528bc07ad --- /dev/null +++ b/usr/src/uts/intel/ml/desctbls_asm.s @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/regset.h> +#include <sys/panic.h> +#include <sys/ontrap.h> +#include <sys/privregs.h> +#include <sys/segments.h> +#include <sys/trap.h> + +#include "assym.h" + + ENTRY_NP(rd_idtr) + sidt (%rdi) + ret + SET_SIZE(rd_idtr) + + ENTRY_NP(wr_idtr) + lidt (%rdi) + ret + SET_SIZE(wr_idtr) + + ENTRY_NP(rd_gdtr) + pushq %rbp + movq %rsp, %rbp + sgdt (%rdi) + leave + ret + SET_SIZE(rd_gdtr) + + ENTRY_NP(wr_gdtr) + pushq %rbp + movq %rsp, %rbp + lgdt (%rdi) + jmp 1f + nop +1: + leave + ret + SET_SIZE(wr_gdtr) + + /* + * loads zero selector for ds and es. + */ + ENTRY_NP(load_segment_registers) + pushq %rbp + movq %rsp, %rbp + pushq %rdi + pushq $.newcs + lretq +.newcs: + /* + * zero %ds and %es - they're ignored anyway + */ + xorl %eax, %eax + movw %ax, %ds + movw %ax, %es + movl %esi, %eax + movw %ax, %fs + movl %edx, %eax + movw %ax, %gs + movl %ecx, %eax + movw %ax, %ss + leave + ret + SET_SIZE(load_segment_registers) + + ENTRY_NP(get_cs_register) + movq %cs, %rax + ret + SET_SIZE(get_cs_register) + + ENTRY_NP(wr_ldtr) + movq %rdi, %rax + lldt %ax + ret + SET_SIZE(wr_ldtr) + + ENTRY_NP(rd_ldtr) + xorl %eax, %eax + sldt %ax + ret + SET_SIZE(rd_ldtr) + + ENTRY_NP(wr_tsr) + movq %rdi, %rax + ltr %ax + ret + SET_SIZE(wr_tsr) + diff --git a/usr/src/uts/intel/ml/exception.s b/usr/src/uts/intel/ml/exception.s new file mode 100644 index 0000000000..92c410adc0 --- /dev/null +++ b/usr/src/uts/intel/ml/exception.s @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. + */ + +/* + * Copyright (c) 1989, 1990 William F. Jolitz. + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/amd64/amd64/exception.S,v 1.113 2003/10/15 02:04:52 peter Exp $ + */ + +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/trap.h> +#include <sys/psw.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/dtrace.h> +#include <sys/x86_archext.h> +#include <sys/traptrace.h> +#include <sys/machparam.h> + +#include "assym.h" + +/* + * push $0 on stack for traps that do not + * generate an error code. This is so the rest + * of the kernel can expect a consistent stack + * from from any exception. + * + * Note that for all exceptions for amd64 + * %r11 and %rcx are on the stack. Just pop + * them back into their appropriate registers and let + * it get saved as is running native. + */ + +#if defined(__xpv) + +#define NPTRAP_NOERR(trapno) \ + pushq $0; \ + pushq $trapno + +#define TRAP_NOERR(trapno) \ + XPV_TRAP_POP; \ + NPTRAP_NOERR(trapno) + +/* + * error code already pushed by hw + * onto stack. + */ +#define TRAP_ERR(trapno) \ + XPV_TRAP_POP; \ + pushq $trapno + +#else /* __xpv */ + +#define TRAP_NOERR(trapno) \ + push $0; \ + push $trapno + +#define NPTRAP_NOERR(trapno) TRAP_NOERR(trapno) + +/* + * error code already pushed by hw + * onto stack. + */ +#define TRAP_ERR(trapno) \ + push $trapno + +#endif /* __xpv */ + + /* + * These are the stacks used on cpu0 for taking double faults, + * NMIs and MCEs. + * + * We define them here instead of in a C file so that we can page-align + * them (gcc won't do that in a .c file). + */ + .data + DGDEF3(dblfault_stack0, DEFAULTSTKSZ, MMU_PAGESIZE) + .fill DEFAULTSTKSZ, 1, 0 + DGDEF3(nmi_stack0, DEFAULTSTKSZ, MMU_PAGESIZE) + .fill DEFAULTSTKSZ, 1, 0 + DGDEF3(mce_stack0, DEFAULTSTKSZ, MMU_PAGESIZE) + .fill DEFAULTSTKSZ, 1, 0 + + /* + * #DE + */ + ENTRY_NP(div0trap) + TRAP_NOERR(T_ZERODIV) /* $0 */ + jmp cmntrap + SET_SIZE(div0trap) + + /* + * #DB + * + * Fetch %dr6 and clear it, handing off the value to the + * cmntrap code in %r15/%esi + */ + ENTRY_NP(dbgtrap) + TRAP_NOERR(T_SGLSTP) /* $1 */ + +#if !defined(__xpv) /* no sysenter support yet */ + /* + * If we get here as a result of single-stepping a sysenter + * instruction, we suddenly find ourselves taking a #db + * in kernel mode -before- we've swapgs'ed. So before we can + * take the trap, we do the swapgs here, and fix the return + * %rip in trap() so that we return immediately after the + * swapgs in the sysenter handler to avoid doing the swapgs again. + * + * Nobody said that the design of sysenter was particularly + * elegant, did they? 
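+ *
+ * In outline, the check below is simply (pseudocode, for reference):
+ *
+ *	if (saved_rip == sys_sysenter || saved_rip == brand_sys_sysenter ||
+ *	    saved_rip == tr_sys_sysenter || saved_rip == tr_brand_sys_sysenter)
+ *		do the swapgs now;
+ *
+ * followed unconditionally by an lfence as the swapgs mitigation.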
+ */ + + pushq %r11 + + /* + * At this point the stack looks like this: + * + * (high address) r_ss + * r_rsp + * r_rfl + * r_cs + * r_rip <-- %rsp + 24 + * r_err <-- %rsp + 16 + * r_trapno <-- %rsp + 8 + * (low address) %r11 <-- %rsp + */ + leaq sys_sysenter(%rip), %r11 + cmpq %r11, 24(%rsp) /* Compare to saved r_rip on the stack */ + je 1f + leaq brand_sys_sysenter(%rip), %r11 + cmpq %r11, 24(%rsp) /* Compare to saved r_rip on the stack */ + je 1f + leaq tr_sys_sysenter(%rip), %r11 + cmpq %r11, 24(%rsp) + je 1f + leaq tr_brand_sys_sysenter(%rip), %r11 + cmpq %r11, 24(%rsp) + jne 2f +1: swapgs +2: lfence /* swapgs mitigation */ + popq %r11 +#endif /* !__xpv */ + + INTR_PUSH +#if defined(__xpv) + movl $6, %edi + call kdi_dreg_get + movq %rax, %r15 /* %db6 -> %r15 */ + movl $6, %edi + movl $0, %esi + call kdi_dreg_set /* 0 -> %db6 */ +#else + movq %db6, %r15 + xorl %eax, %eax + movq %rax, %db6 +#endif + + jmp cmntrap_pushed + SET_SIZE(dbgtrap) + +#if !defined(__xpv) + +/* + * Macro to set the gsbase or kgsbase to the address of the struct cpu + * for this processor. If we came from userland, set kgsbase else + * set gsbase. We find the proper cpu struct by looping through + * the cpu structs for all processors till we find a match for the gdt + * of the trapping processor. The stack is expected to be pointing at + * the standard regs pushed by hardware on a trap (plus error code and trapno). + * + * It's ok for us to clobber gsbase here (and possibly end up with both gsbase + * and kgsbase set to the same value) because we're not going back the normal + * way out of here (via IRET). Where we're going, we don't need no user %gs. + */ +#define SET_CPU_GSBASE \ + subq $REGOFF_TRAPNO, %rsp; /* save regs */ \ + movq %rax, REGOFF_RAX(%rsp); \ + movq %rbx, REGOFF_RBX(%rsp); \ + movq %rcx, REGOFF_RCX(%rsp); \ + movq %rdx, REGOFF_RDX(%rsp); \ + movq %rbp, REGOFF_RBP(%rsp); \ + movq %rsp, %rbp; \ + subq $16, %rsp; /* space for gdt */ \ + sgdt 6(%rsp); \ + movq 8(%rsp), %rcx; /* %rcx has gdt to match */ \ + xorl %ebx, %ebx; /* loop index */ \ + leaq cpu(%rip), %rdx; /* cpu pointer array */ \ +1: \ + movq (%rdx, %rbx, CLONGSIZE), %rax; /* get cpu[i] */ \ + cmpq $0x0, %rax; /* cpu[i] == NULL ? */ \ + je 2f; /* yes, continue */ \ + cmpq %rcx, CPU_GDT(%rax); /* gdt == cpu[i]->cpu_gdt ? */ \ + je 3f; /* yes, go set gsbase */ \ +2: \ + incl %ebx; /* i++ */ \ + cmpl $NCPU, %ebx; /* i < NCPU ? */ \ + jb 1b; /* yes, loop */ \ +/* XXX BIG trouble if we fall thru here. We didn't find a gdt match */ \ +3: \ + movl $MSR_AMD_KGSBASE, %ecx; \ + cmpw $KCS_SEL, REGOFF_CS(%rbp); /* trap from kernel? */ \ + jne 4f; /* no, go set KGSBASE */ \ + movl $MSR_AMD_GSBASE, %ecx; /* yes, set GSBASE */ \ + mfence; /* OPTERON_ERRATUM_88 */ \ +4: \ + movq %rax, %rdx; /* write base register */ \ + shrq $32, %rdx; \ + wrmsr; \ + movq REGOFF_RDX(%rbp), %rdx; /* restore regs */ \ + movq REGOFF_RCX(%rbp), %rcx; \ + movq REGOFF_RBX(%rbp), %rbx; \ + movq REGOFF_RAX(%rbp), %rax; \ + movq %rbp, %rsp; \ + movq REGOFF_RBP(%rsp), %rbp; \ + addq $REGOFF_TRAPNO, %rsp /* pop stack */ + +#else /* __xpv */ + +#define SET_CPU_GSBASE /* noop on the hypervisor */ + +#endif /* __xpv */ + + + /* + * #NMI + * + * XXPV: See 6532669. + */ + ENTRY_NP(nmiint) + TRAP_NOERR(T_NMIFLT) /* $2 */ + + SET_CPU_GSBASE + + /* + * Save all registers and setup segment registers + * with kernel selectors. 
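+ * We then hand a pointer to the saved register set to
+ * av_dispatch_nmivect(), and return via tr_iret_auto after
+ * x86_md_clear has scrubbed microarchitectural state.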
+ */ + INTR_PUSH + INTGATE_INIT_KERNEL_FLAGS + + TRACE_PTR(%r12, %rax, %eax, %rdx, $TT_TRAP) + TRACE_REGS(%r12, %rsp, %rax, %rbx) + TRACE_STAMP(%r12) + + movq %rsp, %rbp + + movq %rbp, %rdi + call av_dispatch_nmivect + + INTR_POP + call x86_md_clear + jmp tr_iret_auto + /*NOTREACHED*/ + SET_SIZE(nmiint) + + /* + * #BP + */ + ENTRY_NP(brktrap) + XPV_TRAP_POP + cmpw $KCS_SEL, 8(%rsp) + jne bp_user + + /* + * This is a breakpoint in the kernel -- it is very likely that this + * is DTrace-induced. To unify DTrace handling, we spoof this as an + * invalid opcode (#UD) fault. Note that #BP is a trap, not a fault -- + * we must decrement the trapping %rip to make it appear as a fault. + * We then push a non-zero error code to indicate that this is coming + * from #BP. + */ + decq (%rsp) + push $1 /* error code -- non-zero for #BP */ + jmp ud_kernel + +bp_user: + + NPTRAP_NOERR(T_BPTFLT) /* $3 */ + jmp dtrace_trap + + SET_SIZE(brktrap) + + /* + * #OF + */ + ENTRY_NP(ovflotrap) + TRAP_NOERR(T_OVFLW) /* $4 */ + jmp cmntrap + SET_SIZE(ovflotrap) + + /* + * #BR + */ + ENTRY_NP(boundstrap) + TRAP_NOERR(T_BOUNDFLT) /* $5 */ + jmp cmntrap + SET_SIZE(boundstrap) + + ENTRY_NP(invoptrap) + + XPV_TRAP_POP + + cmpw $KCS_SEL, 8(%rsp) + jne ud_user + +#if defined(__xpv) + movb $0, 12(%rsp) /* clear saved upcall_mask from %cs */ +#endif + push $0 /* error code -- zero for #UD */ +ud_kernel: + push $0xdddd /* a dummy trap number */ + INTR_PUSH + movq REGOFF_RIP(%rsp), %rdi + movq REGOFF_RSP(%rsp), %rsi + movq REGOFF_RAX(%rsp), %rdx + pushq (%rsi) + movq %rsp, %rsi + subq $8, %rsp + call dtrace_invop + ALTENTRY(dtrace_invop_callsite) + addq $16, %rsp + cmpl $DTRACE_INVOP_PUSHL_EBP, %eax + je ud_push + cmpl $DTRACE_INVOP_LEAVE, %eax + je ud_leave + cmpl $DTRACE_INVOP_NOP, %eax + je ud_nop + cmpl $DTRACE_INVOP_RET, %eax + je ud_ret + jmp ud_trap + +ud_push: + /* + * We must emulate a "pushq %rbp". To do this, we pull the stack + * down 8 bytes, and then store the base pointer. + */ + INTR_POP + subq $16, %rsp /* make room for %rbp */ + pushq %rax /* push temp */ + movq 24(%rsp), %rax /* load calling RIP */ + addq $1, %rax /* increment over trapping instr */ + movq %rax, 8(%rsp) /* store calling RIP */ + movq 32(%rsp), %rax /* load calling CS */ + movq %rax, 16(%rsp) /* store calling CS */ + movq 40(%rsp), %rax /* load calling RFLAGS */ + movq %rax, 24(%rsp) /* store calling RFLAGS */ + movq 48(%rsp), %rax /* load calling RSP */ + subq $8, %rax /* make room for %rbp */ + movq %rax, 32(%rsp) /* store calling RSP */ + movq 56(%rsp), %rax /* load calling SS */ + movq %rax, 40(%rsp) /* store calling SS */ + movq 32(%rsp), %rax /* reload calling RSP */ + movq %rbp, (%rax) /* store %rbp there */ + popq %rax /* pop off temp */ + jmp tr_iret_kernel /* return from interrupt */ + /*NOTREACHED*/ + +ud_leave: + /* + * We must emulate a "leave", which is the same as a "movq %rbp, + * %rsp" followed by a "popq %rbp". We can exploit the fact + * that the %rsp is explicitly saved to effect the pop without + * having to reshuffle the other data pushed for the trap. 
+ */ + + INTR_POP + pushq %rax /* push temp */ + movq 8(%rsp), %rax /* load calling RIP */ + addq $1, %rax /* increment over trapping instr */ + movq %rax, 8(%rsp) /* store calling RIP */ + movq (%rbp), %rax /* get new %rbp */ + addq $8, %rbp /* adjust new %rsp */ + movq %rbp, 32(%rsp) /* store new %rsp */ + movq %rax, %rbp /* set new %rbp */ + popq %rax /* pop off temp */ + jmp tr_iret_kernel /* return from interrupt */ + /*NOTREACHED*/ + +ud_nop: + /* + * We must emulate a "nop". This is obviously not hard: we need only + * advance the %rip by one. + */ + INTR_POP + incq (%rsp) + jmp tr_iret_kernel + /*NOTREACHED*/ + +ud_ret: + INTR_POP + pushq %rax /* push temp */ + movq 32(%rsp), %rax /* load %rsp */ + movq (%rax), %rax /* load calling RIP */ + movq %rax, 8(%rsp) /* store calling RIP */ + addq $8, 32(%rsp) /* adjust new %rsp */ + popq %rax /* pop off temp */ + jmp tr_iret_kernel /* return from interrupt */ + /*NOTREACHED*/ + +ud_trap: + /* + * We're going to let the kernel handle this as a normal #UD. If, + * however, we came through #BP and are spoofing #UD (in this case, + * the stored error value will be non-zero), we need to de-spoof + * the trap by incrementing %rip and pushing T_BPTFLT. + */ + cmpq $0, REGOFF_ERR(%rsp) + je ud_ud + incq REGOFF_RIP(%rsp) + addq $REGOFF_RIP, %rsp + NPTRAP_NOERR(T_BPTFLT) /* $3 */ + jmp cmntrap + +ud_ud: + addq $REGOFF_RIP, %rsp +ud_user: + NPTRAP_NOERR(T_ILLINST) + jmp cmntrap + SET_SIZE(invoptrap) + + /* + * #NM + */ + + ENTRY_NP(ndptrap) + TRAP_NOERR(T_NOEXTFLT) /* $0 */ + SET_CPU_GSBASE + jmp cmntrap + SET_SIZE(ndptrap) + +#if !defined(__xpv) + + /* + * #DF + */ + ENTRY_NP(syserrtrap) + pushq $T_DBLFLT + SET_CPU_GSBASE + + /* + * We share this handler with kmdb (if kmdb is loaded). As such, we + * may have reached this point after encountering a #df in kmdb. If + * that happens, we'll still be on kmdb's IDT. We need to switch back + * to this CPU's IDT before proceeding. Furthermore, if we did arrive + * here from kmdb, kmdb is probably in a very sickly state, and + * shouldn't be entered from the panic flow. We'll suppress that + * entry by setting nopanicdebug. + */ + pushq %rax + subq $DESCTBR_SIZE, %rsp + sidt (%rsp) + movq %gs:CPU_IDT, %rax + cmpq %rax, DTR_BASE(%rsp) + je 1f + + movq %rax, DTR_BASE(%rsp) + movw $_MUL(NIDT, GATE_DESC_SIZE), DTR_LIMIT(%rsp) + lidt (%rsp) + + movl $1, nopanicdebug + +1: addq $DESCTBR_SIZE, %rsp + popq %rax + + DFTRAP_PUSH + + /* + * freeze trap trace. 
+ */ +#ifdef TRAPTRACE + leaq trap_trace_freeze(%rip), %r11 + incl (%r11) +#endif + + ENABLE_INTR_FLAGS + + movq %rsp, %rdi /* ®s */ + xorl %esi, %esi /* clear address */ + xorl %edx, %edx /* cpuid = 0 */ + call trap + + SET_SIZE(syserrtrap) + +#endif /* !__xpv */ + + /* + * #TS + */ + ENTRY_NP(invtsstrap) + TRAP_ERR(T_TSSFLT) /* $10 already have error code on stack */ + jmp cmntrap + SET_SIZE(invtsstrap) + + /* + * #NP + */ + ENTRY_NP(segnptrap) + TRAP_ERR(T_SEGFLT) /* $11 already have error code on stack */ + SET_CPU_GSBASE + jmp cmntrap + SET_SIZE(segnptrap) + + /* + * #SS + */ + ENTRY_NP(stktrap) + TRAP_ERR(T_STKFLT) /* $12 already have error code on stack */ + SET_CPU_GSBASE + jmp cmntrap + SET_SIZE(stktrap) + + /* + * #GP + */ + ENTRY_NP(gptrap) + TRAP_ERR(T_GPFLT) /* $13 already have error code on stack */ + SET_CPU_GSBASE + jmp cmntrap + SET_SIZE(gptrap) + + /* + * #PF + */ + ENTRY_NP(pftrap) + TRAP_ERR(T_PGFLT) /* $14 already have error code on stack */ + INTR_PUSH +#if defined(__xpv) + + movq %gs:CPU_VCPU_INFO, %r15 + movq VCPU_INFO_ARCH_CR2(%r15), %r15 /* vcpu[].arch.cr2 */ + +#else /* __xpv */ + + movq %cr2, %r15 + +#endif /* __xpv */ + jmp cmntrap_pushed + SET_SIZE(pftrap) + + ENTRY_NP(resvtrap) + TRAP_NOERR(T_RESVTRAP) /* (reserved) */ + jmp cmntrap + SET_SIZE(resvtrap) + + /* + * #MF + */ + ENTRY_NP(ndperr) + TRAP_NOERR(T_EXTERRFLT) /* $16 */ + jmp cmninttrap + SET_SIZE(ndperr) + + /* + * #AC + */ + ENTRY_NP(achktrap) + TRAP_ERR(T_ALIGNMENT) /* $17 */ + jmp cmntrap + SET_SIZE(achktrap) + + /* + * #MC + */ + .globl cmi_mca_trap /* see uts/i86pc/os/cmi.c */ + + ENTRY_NP(mcetrap) + TRAP_NOERR(T_MCE) /* $18 */ + + SET_CPU_GSBASE + + INTR_PUSH + INTGATE_INIT_KERNEL_FLAGS + + TRACE_PTR(%rdi, %rbx, %ebx, %rcx, $TT_TRAP) + TRACE_REGS(%rdi, %rsp, %rbx, %rcx) + TRACE_STAMP(%rdi) + + movq %rsp, %rbp + + movq %rsp, %rdi /* arg0 = struct regs *rp */ + call cmi_mca_trap /* cmi_mca_trap(rp); */ + + jmp _sys_rtt + SET_SIZE(mcetrap) + + /* + * #XF + */ + ENTRY_NP(xmtrap) + TRAP_NOERR(T_SIMDFPE) /* $19 */ + jmp cmninttrap + SET_SIZE(xmtrap) + + ENTRY_NP(invaltrap) + TRAP_NOERR(T_INVALTRAP) /* very invalid */ + jmp cmntrap + SET_SIZE(invaltrap) + + .globl fasttable + + ENTRY_NP(fasttrap) + cmpl $T_LASTFAST, %eax + ja 1f + orl %eax, %eax /* (zero extend top 32-bits) */ + leaq fasttable(%rip), %r11 + leaq (%r11, %rax, CLONGSIZE), %r11 + movq (%r11), %r11 + INDIRECT_JMP_REG(r11) +1: + /* + * Fast syscall number was illegal. Make it look + * as if the INT failed. Modify %rip to point before the + * INT, push the expected error code and fake a GP fault. + * + * XXX Why make the error code be offset into idt + 1? + * Instead we should push a real (soft?) error code + * on the stack and #gp handler could know about fasttraps? + */ + XPV_TRAP_POP + + subq $2, (%rsp) /* XXX int insn 2-bytes */ + pushq $_CONST(_MUL(T_FASTTRAP, GATE_DESC_SIZE) + 2) + +#if defined(__xpv) + pushq %r11 + pushq %rcx +#endif + jmp gptrap + SET_SIZE(fasttrap) + + ENTRY_NP(dtrace_ret) + TRAP_NOERR(T_DTRACE_RET) + jmp dtrace_trap + SET_SIZE(dtrace_ret) + + /* + * RFLAGS 24 bytes up the stack from %rsp. + * XXX a constant would be nicer. 
+ */ + ENTRY_NP(fast_null) + XPV_TRAP_POP + orq $PS_C, 24(%rsp) /* set carry bit in user flags */ + call x86_md_clear + jmp tr_iret_auto + /*NOTREACHED*/ + SET_SIZE(fast_null) + + /* + * Interrupts start at 32 + */ +#define MKIVCT(n) \ + ENTRY_NP(ivct/**/n) \ + push $0; \ + push $n - 0x20; \ + jmp cmnint; \ + SET_SIZE(ivct/**/n) + + MKIVCT(32) + MKIVCT(33) + MKIVCT(34) + MKIVCT(35) + MKIVCT(36) + MKIVCT(37) + MKIVCT(38) + MKIVCT(39) + MKIVCT(40) + MKIVCT(41) + MKIVCT(42) + MKIVCT(43) + MKIVCT(44) + MKIVCT(45) + MKIVCT(46) + MKIVCT(47) + MKIVCT(48) + MKIVCT(49) + MKIVCT(50) + MKIVCT(51) + MKIVCT(52) + MKIVCT(53) + MKIVCT(54) + MKIVCT(55) + MKIVCT(56) + MKIVCT(57) + MKIVCT(58) + MKIVCT(59) + MKIVCT(60) + MKIVCT(61) + MKIVCT(62) + MKIVCT(63) + MKIVCT(64) + MKIVCT(65) + MKIVCT(66) + MKIVCT(67) + MKIVCT(68) + MKIVCT(69) + MKIVCT(70) + MKIVCT(71) + MKIVCT(72) + MKIVCT(73) + MKIVCT(74) + MKIVCT(75) + MKIVCT(76) + MKIVCT(77) + MKIVCT(78) + MKIVCT(79) + MKIVCT(80) + MKIVCT(81) + MKIVCT(82) + MKIVCT(83) + MKIVCT(84) + MKIVCT(85) + MKIVCT(86) + MKIVCT(87) + MKIVCT(88) + MKIVCT(89) + MKIVCT(90) + MKIVCT(91) + MKIVCT(92) + MKIVCT(93) + MKIVCT(94) + MKIVCT(95) + MKIVCT(96) + MKIVCT(97) + MKIVCT(98) + MKIVCT(99) + MKIVCT(100) + MKIVCT(101) + MKIVCT(102) + MKIVCT(103) + MKIVCT(104) + MKIVCT(105) + MKIVCT(106) + MKIVCT(107) + MKIVCT(108) + MKIVCT(109) + MKIVCT(110) + MKIVCT(111) + MKIVCT(112) + MKIVCT(113) + MKIVCT(114) + MKIVCT(115) + MKIVCT(116) + MKIVCT(117) + MKIVCT(118) + MKIVCT(119) + MKIVCT(120) + MKIVCT(121) + MKIVCT(122) + MKIVCT(123) + MKIVCT(124) + MKIVCT(125) + MKIVCT(126) + MKIVCT(127) + MKIVCT(128) + MKIVCT(129) + MKIVCT(130) + MKIVCT(131) + MKIVCT(132) + MKIVCT(133) + MKIVCT(134) + MKIVCT(135) + MKIVCT(136) + MKIVCT(137) + MKIVCT(138) + MKIVCT(139) + MKIVCT(140) + MKIVCT(141) + MKIVCT(142) + MKIVCT(143) + MKIVCT(144) + MKIVCT(145) + MKIVCT(146) + MKIVCT(147) + MKIVCT(148) + MKIVCT(149) + MKIVCT(150) + MKIVCT(151) + MKIVCT(152) + MKIVCT(153) + MKIVCT(154) + MKIVCT(155) + MKIVCT(156) + MKIVCT(157) + MKIVCT(158) + MKIVCT(159) + MKIVCT(160) + MKIVCT(161) + MKIVCT(162) + MKIVCT(163) + MKIVCT(164) + MKIVCT(165) + MKIVCT(166) + MKIVCT(167) + MKIVCT(168) + MKIVCT(169) + MKIVCT(170) + MKIVCT(171) + MKIVCT(172) + MKIVCT(173) + MKIVCT(174) + MKIVCT(175) + MKIVCT(176) + MKIVCT(177) + MKIVCT(178) + MKIVCT(179) + MKIVCT(180) + MKIVCT(181) + MKIVCT(182) + MKIVCT(183) + MKIVCT(184) + MKIVCT(185) + MKIVCT(186) + MKIVCT(187) + MKIVCT(188) + MKIVCT(189) + MKIVCT(190) + MKIVCT(191) + MKIVCT(192) + MKIVCT(193) + MKIVCT(194) + MKIVCT(195) + MKIVCT(196) + MKIVCT(197) + MKIVCT(198) + MKIVCT(199) + MKIVCT(200) + MKIVCT(201) + MKIVCT(202) + MKIVCT(203) + MKIVCT(204) + MKIVCT(205) + MKIVCT(206) + MKIVCT(207) + MKIVCT(208) + MKIVCT(209) + MKIVCT(210) + MKIVCT(211) + MKIVCT(212) + MKIVCT(213) + MKIVCT(214) + MKIVCT(215) + MKIVCT(216) + MKIVCT(217) + MKIVCT(218) + MKIVCT(219) + MKIVCT(220) + MKIVCT(221) + MKIVCT(222) + MKIVCT(223) + MKIVCT(224) + MKIVCT(225) + MKIVCT(226) + MKIVCT(227) + MKIVCT(228) + MKIVCT(229) + MKIVCT(230) + MKIVCT(231) + MKIVCT(232) + MKIVCT(233) + MKIVCT(234) + MKIVCT(235) + MKIVCT(236) + MKIVCT(237) + MKIVCT(238) + MKIVCT(239) + MKIVCT(240) + MKIVCT(241) + MKIVCT(242) + MKIVCT(243) + MKIVCT(244) + MKIVCT(245) + MKIVCT(246) + MKIVCT(247) + MKIVCT(248) + MKIVCT(249) + MKIVCT(250) + MKIVCT(251) + MKIVCT(252) + MKIVCT(253) + MKIVCT(254) + MKIVCT(255) + diff --git a/usr/src/uts/intel/ml/float.s b/usr/src/uts/intel/ml/float.s new file mode 100644 index 0000000000..807647f553 --- /dev/null +++ 
b/usr/src/uts/intel/ml/float.s @@ -0,0 +1,347 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + */ + +/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ +/* All Rights Reserved */ + +/* Copyright (c) 1987, 1988 Microsoft Corporation */ +/* All Rights Reserved */ + +/* + * Copyright (c) 2009, Intel Corporation. + * All rights reserved. + */ + +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/x86_archext.h> + +#include "assym.h" + + /* + * Returns zero if x87 "chip" is present(!) + */ + ENTRY_NP(fpu_initial_probe) + CLTS + fninit + fnstsw %ax + movzbl %al, %eax + ret + SET_SIZE(fpu_initial_probe) + + ENTRY_NP(fxsave_insn) + fxsaveq (%rdi) + ret + SET_SIZE(fxsave_insn) + +/* + * One of these routines is called from any lwp with floating + * point context as part of the prolog of a context switch. + */ + +/* + * These three functions define the Intel "xsave" handling for CPUs with + * different features. Newer AMD CPUs can also use these functions. See the + * 'exception pointers' comment below. + */ + ENTRY_NP(fpxsave_ctxt) /* %rdi is a struct fpu_ctx */ + cmpl $FPU_EN, FPU_CTX_FPU_FLAGS(%rdi) + jne 1f + movl $_CONST(FPU_VALID|FPU_EN), FPU_CTX_FPU_FLAGS(%rdi) + movq FPU_CTX_FPU_REGS(%rdi), %rdi /* fpu_regs.kfpu_u.kfpu_fx ptr */ + fxsaveq (%rdi) + STTS(%rsi) /* trap on next fpu touch */ +1: rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(fpxsave_ctxt) + + ENTRY_NP(xsave_ctxt) + cmpl $FPU_EN, FPU_CTX_FPU_FLAGS(%rdi) + jne 1f + movl $_CONST(FPU_VALID|FPU_EN), FPU_CTX_FPU_FLAGS(%rdi) + movl FPU_CTX_FPU_XSAVE_MASK(%rdi), %eax /* xsave flags in EDX:EAX */ + movl FPU_CTX_FPU_XSAVE_MASK+4(%rdi), %edx + movq FPU_CTX_FPU_REGS(%rdi), %rsi /* fpu_regs.kfpu_u.kfpu_xs ptr */ + xsave (%rsi) + STTS(%rsi) /* trap on next fpu touch */ +1: ret + SET_SIZE(xsave_ctxt) + + ENTRY_NP(xsaveopt_ctxt) + cmpl $FPU_EN, FPU_CTX_FPU_FLAGS(%rdi) + jne 1f + movl $_CONST(FPU_VALID|FPU_EN), FPU_CTX_FPU_FLAGS(%rdi) + movl FPU_CTX_FPU_XSAVE_MASK(%rdi), %eax /* xsave flags in EDX:EAX */ + movl FPU_CTX_FPU_XSAVE_MASK+4(%rdi), %edx + movq FPU_CTX_FPU_REGS(%rdi), %rsi /* fpu_regs.kfpu_u.kfpu_xs ptr */ + xsaveopt (%rsi) + STTS(%rsi) /* trap on next fpu touch */ +1: ret + SET_SIZE(xsaveopt_ctxt) + +/* + * On certain AMD processors, the "exception pointers" (i.e. 
the last + * instruction pointer, last data pointer, and last opcode) are saved by the + * fxsave, xsave or xsaveopt instruction ONLY if the exception summary bit is + * set. + * + * On newer CPUs, AMD has changed their behavior to mirror the Intel behavior. + * We can detect this via an AMD specific cpuid feature bit + * (CPUID_AMD_EBX_ERR_PTR_ZERO) and use the simpler Intel-oriented functions. + * Otherwise we use these more complex functions on AMD CPUs. All three follow + * the same logic after the xsave* instruction. + */ + ENTRY_NP(fpxsave_excp_clr_ctxt) /* %rdi is a struct fpu_ctx */ + cmpl $FPU_EN, FPU_CTX_FPU_FLAGS(%rdi) + jne 1f + movl $_CONST(FPU_VALID|FPU_EN), FPU_CTX_FPU_FLAGS(%rdi) + movq FPU_CTX_FPU_REGS(%rdi), %rdi /* fpu_regs.kfpu_u.kfpu_fx ptr */ + fxsaveq (%rdi) + /* + * To ensure that we don't leak these values into the next context + * on the cpu, we could just issue an fninit here, but that's + * rather slow and so we issue an instruction sequence that + * clears them more quickly, if a little obscurely. + */ + btw $7, FXSAVE_STATE_FSW(%rdi) /* Test saved ES bit */ + jnc 0f /* jump if ES = 0 */ + fnclex /* clear pending x87 exceptions */ +0: ffree %st(7) /* clear tag bit to remove possible stack overflow */ + fildl .fpzero_const(%rip) + /* dummy load changes all exception pointers */ + STTS(%rsi) /* trap on next fpu touch */ +1: rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(fpxsave_excp_clr_ctxt) + + ENTRY_NP(xsave_excp_clr_ctxt) + cmpl $FPU_EN, FPU_CTX_FPU_FLAGS(%rdi) + jne 1f + movl $_CONST(FPU_VALID|FPU_EN), FPU_CTX_FPU_FLAGS(%rdi) + movl FPU_CTX_FPU_XSAVE_MASK(%rdi), %eax + movl FPU_CTX_FPU_XSAVE_MASK+4(%rdi), %edx + movq FPU_CTX_FPU_REGS(%rdi), %rsi /* fpu_regs.kfpu_u.kfpu_xs ptr */ + xsave (%rsi) + btw $7, FXSAVE_STATE_FSW(%rsi) /* Test saved ES bit */ + jnc 0f /* jump if ES = 0 */ + fnclex /* clear pending x87 exceptions */ +0: ffree %st(7) /* clear tag bit to remove possible stack overflow */ + fildl .fpzero_const(%rip) /* dummy load changes all excp. pointers */ + STTS(%rsi) /* trap on next fpu touch */ +1: ret + SET_SIZE(xsave_excp_clr_ctxt) + + ENTRY_NP(xsaveopt_excp_clr_ctxt) + cmpl $FPU_EN, FPU_CTX_FPU_FLAGS(%rdi) + jne 1f + movl $_CONST(FPU_VALID|FPU_EN), FPU_CTX_FPU_FLAGS(%rdi) + movl FPU_CTX_FPU_XSAVE_MASK(%rdi), %eax + movl FPU_CTX_FPU_XSAVE_MASK+4(%rdi), %edx + movq FPU_CTX_FPU_REGS(%rdi), %rsi /* fpu_regs.kfpu_u.kfpu_xs ptr */ + xsaveopt (%rsi) + btw $7, FXSAVE_STATE_FSW(%rsi) /* Test saved ES bit */ + jnc 0f /* jump if ES = 0 */ + fnclex /* clear pending x87 exceptions */ +0: ffree %st(7) /* clear tag bit to remove possible stack overflow */ + fildl .fpzero_const(%rip) /* dummy load changes all excp. 
pointers */ + STTS(%rsi) /* trap on next fpu touch */ +1: ret + SET_SIZE(xsaveopt_excp_clr_ctxt) + + .align 8 +.fpzero_const: + .4byte 0x0 + .4byte 0x0 + + + ENTRY_NP(fpxsave) + CLTS + fxsaveq (%rdi) + fninit /* clear exceptions, init x87 tags */ + STTS(%rdi) /* set TS bit in %cr0 (disable FPU) */ + ret + SET_SIZE(fpxsave) + + ENTRY_NP(xsave) + CLTS + movl %esi, %eax /* bv mask */ + movq %rsi, %rdx + shrq $32, %rdx + xsave (%rdi) + + fninit /* clear exceptions, init x87 tags */ + STTS(%rdi) /* set TS bit in %cr0 (disable FPU) */ + ret + SET_SIZE(xsave) + + ENTRY_NP(xsaveopt) + CLTS + movl %esi, %eax /* bv mask */ + movq %rsi, %rdx + shrq $32, %rdx + xsaveopt (%rdi) + + fninit /* clear exceptions, init x87 tags */ + STTS(%rdi) /* set TS bit in %cr0 (disable FPU) */ + ret + SET_SIZE(xsaveopt) + +/* + * These functions are used when restoring the FPU as part of the epilogue of a + * context switch. + */ + + ENTRY(fpxrestore_ctxt) + cmpl $_CONST(FPU_EN|FPU_VALID), FPU_CTX_FPU_FLAGS(%rdi) + jne 1f + movl $_CONST(FPU_EN), FPU_CTX_FPU_FLAGS(%rdi) + movq FPU_CTX_FPU_REGS(%rdi), %rdi /* fpu_regs.kfpu_u.kfpu_fx ptr */ + CLTS + fxrstorq (%rdi) +1: + ret + SET_SIZE(fpxrestore_ctxt) + + ENTRY(xrestore_ctxt) + cmpl $_CONST(FPU_EN|FPU_VALID), FPU_CTX_FPU_FLAGS(%rdi) + jne 1f + movl $_CONST(FPU_EN), FPU_CTX_FPU_FLAGS(%rdi) + movl FPU_CTX_FPU_XSAVE_MASK(%rdi), %eax /* xsave flags in EDX:EAX */ + movl FPU_CTX_FPU_XSAVE_MASK+4(%rdi), %edx + movq FPU_CTX_FPU_REGS(%rdi), %rdi /* fpu_regs.kfpu_u.kfpu_xs ptr */ + CLTS + xrstor (%rdi) +1: + ret + SET_SIZE(xrestore_ctxt) + + + ENTRY_NP(fpxrestore) + CLTS + fxrstorq (%rdi) + ret + SET_SIZE(fpxrestore) + + ENTRY_NP(xrestore) + CLTS + movl %esi, %eax /* bv mask */ + movq %rsi, %rdx + shrq $32, %rdx + xrstor (%rdi) + ret + SET_SIZE(xrestore) + +/* + * Disable the floating point unit. + */ + + ENTRY_NP(fpdisable) + STTS(%rdi) /* set TS bit in %cr0 (disable FPU) */ + ret + SET_SIZE(fpdisable) + +/* + * Initialize the fpu hardware. + */ + + ENTRY_NP(fpinit) + CLTS + cmpl $FP_XSAVE, fp_save_mech + je 1f + + /* fxsave */ + leaq sse_initial(%rip), %rax + fxrstorq (%rax) /* load clean initial state */ + ret + +1: /* xsave */ + leaq avx_initial(%rip), %rcx + xorl %edx, %edx + movl $XFEATURE_AVX, %eax + btl $X86FSET_AVX, x86_featureset + cmovael %edx, %eax + orl $(XFEATURE_LEGACY_FP | XFEATURE_SSE), %eax + xrstor (%rcx) + ret + SET_SIZE(fpinit) + +/* + * Clears FPU exception state. + * Returns the FP status word. + */ + + ENTRY_NP(fperr_reset) + CLTS + xorl %eax, %eax + fnstsw %ax + fnclex + ret + SET_SIZE(fperr_reset) + + ENTRY_NP(fpxerr_reset) + pushq %rbp + movq %rsp, %rbp + subq $0x10, %rsp /* make some temporary space */ + CLTS + stmxcsr (%rsp) + movl (%rsp), %eax + andl $_BITNOT(SSE_MXCSR_EFLAGS), (%rsp) + ldmxcsr (%rsp) /* clear processor exceptions */ + leave + ret + SET_SIZE(fpxerr_reset) + + ENTRY_NP(fpgetcwsw) + pushq %rbp + movq %rsp, %rbp + subq $0x10, %rsp /* make some temporary space */ + CLTS + fnstsw (%rsp) /* store the status word */ + fnstcw 2(%rsp) /* store the control word */ + movl (%rsp), %eax /* put both in %eax */ + leave + ret + SET_SIZE(fpgetcwsw) + +/* + * Returns the MXCSR register. 
+ */ + + ENTRY_NP(fpgetmxcsr) + pushq %rbp + movq %rsp, %rbp + subq $0x10, %rsp /* make some temporary space */ + CLTS + stmxcsr (%rsp) + movl (%rsp), %eax + leave + ret + SET_SIZE(fpgetmxcsr) + diff --git a/usr/src/uts/intel/ml/hypersubr.s b/usr/src/uts/intel/ml/hypersubr.s new file mode 100644 index 0000000000..e6378d8518 --- /dev/null +++ b/usr/src/uts/intel/ml/hypersubr.s @@ -0,0 +1,164 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#ifndef __xpv +#include <sys/xpv_support.h> +#endif +#include <sys/hypervisor.h> + +/* + * Hypervisor "system calls" + * + * amd64 + * %rax == call number + * args in registers (%rdi, %rsi, %rdx, %r10, %r8, %r9) + * + * Note that we use %r10 instead of %rcx for passing 4th argument as in + * C calling convention since the "syscall" instruction clobbers %rcx. + * + * (These calls can be done more efficiently as gcc-style inlines, but + * for simplicity and help with initial debugging, we use these primitives + * to build the hypervisor calls up from C wrappers.) + */ + +/* + * XXPV grr - assembler can't deal with an instruction in a quoted string + */ +#undef TRAP_INSTR /* cause it's currently "int $0x82" */ + +/* + * The method for issuing a hypercall (i.e. a system call to the + * hypervisor) varies from platform to platform. In 32-bit PV domains, an + * 'int 82' triggers the call. In 64-bit PV domains, a 'syscall' does the + * trick. + * + * HVM domains are more complicated. In all cases, we want to issue a + * VMEXIT instruction, but AMD and Intel use different opcodes to represent + * that instruction. Rather than build CPU-specific modules with the + * different opcodes, we use the 'hypercall page' provided by Xen. This + * page contains a collection of code stubs that do nothing except issue + * hypercalls using the proper instructions for this machine. To keep the + * wrapper code as simple and efficient as possible, we preallocate that + * page below. When the module is loaded, we ask Xen to remap the + * underlying PFN to that of the hypercall page. + * + * Note: this same mechanism could be used in PV domains, but using + * hypercall page requires a call and several more instructions than simply + * issuing the proper trap. 
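+ *
+ * For illustration only (a sketch, not part of this change): the C-level
+ * hypercall wrappers built on top of these primitives look roughly like
+ *
+ *	long
+ *	HYPERVISOR_xen_version(int cmd, void *arg)
+ *	{
+ *		return (__hypercall2(__HYPERVISOR_xen_version,
+ *		    (ulong_t)cmd, (ulong_t)arg));
+ *	}
+ *
+ * i.e. each wrapper just marshals its arguments into the register
+ * convention described above and lets __hypercallN issue TRAP_INSTR.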
+ */ +#if !defined(__xpv) + +#define HYPERCALL_PAGESIZE 0x1000 +#define HYPERCALL_SHINFO_PAGESIZE 0x1000 + + .data + .align HYPERCALL_SHINFO_PAGESIZE + .globl hypercall_shared_info_page + .type hypercall_shared_info_page, @object + .size hypercall_shared_info_page, HYPERCALL_SHINFO_PAGESIZE +hypercall_shared_info_page: + .skip HYPERCALL_SHINFO_PAGESIZE + + .text + .align HYPERCALL_PAGESIZE + .globl hypercall_page + .type hypercall_page, @function +hypercall_page: + .skip HYPERCALL_PAGESIZE + .size hypercall_page, HYPERCALL_PAGESIZE +#define TRAP_INSTR \ + shll $5, %eax; \ + addq $hypercall_page, %rax; \ + INDIRECT_JMP_REG(rax); + +#else /* !_xpv */ + +#define TRAP_INSTR syscall +#endif /* !__xpv */ + + + ENTRY_NP(__hypercall0) + ALTENTRY(__hypercall0_int) + movl %edi, %eax + TRAP_INSTR + ret + SET_SIZE(__hypercall0) + + ENTRY_NP(__hypercall1) + ALTENTRY(__hypercall1_int) + movl %edi, %eax + movq %rsi, %rdi /* arg 1 */ + TRAP_INSTR + ret + SET_SIZE(__hypercall1) + + ENTRY_NP(__hypercall2) + ALTENTRY(__hypercall2_int) + movl %edi, %eax + movq %rsi, %rdi /* arg 1 */ + movq %rdx, %rsi /* arg 2 */ + TRAP_INSTR + ret + SET_SIZE(__hypercall2) + + ENTRY_NP(__hypercall3) + ALTENTRY(__hypercall3_int) + movl %edi, %eax + movq %rsi, %rdi /* arg 1 */ + movq %rdx, %rsi /* arg 2 */ + movq %rcx, %rdx /* arg 3 */ + TRAP_INSTR + ret + SET_SIZE(__hypercall3) + + ENTRY_NP(__hypercall4) + ALTENTRY(__hypercall4_int) + movl %edi, %eax + movq %rsi, %rdi /* arg 1 */ + movq %rdx, %rsi /* arg 2 */ + movq %rcx, %rdx /* arg 3 */ + movq %r8, %r10 /* r10 = 4th arg */ + TRAP_INSTR + ret + SET_SIZE(__hypercall4) + + ENTRY_NP(__hypercall5) + ALTENTRY(__hypercall5_int) + movl %edi, %eax + movq %rsi, %rdi /* arg 1 */ + movq %rdx, %rsi /* arg 2 */ + movq %rcx, %rdx /* arg 3 */ + movq %r8, %r10 /* r10 = 4th arg */ + movq %r9, %r8 /* arg 5 */ + TRAP_INSTR + ret + SET_SIZE(__hypercall5) + diff --git a/usr/src/uts/intel/ml/i86_subr.s b/usr/src/uts/intel/ml/i86_subr.s new file mode 100644 index 0000000000..2a1a183026 --- /dev/null +++ b/usr/src/uts/intel/ml/i86_subr.s @@ -0,0 +1,1629 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. + */ + +/* + * Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. + * Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T + * All Rights Reserved + */ + +/* + * Copyright (c) 2009, Intel Corporation. + * All rights reserved. + */ + +/* + * General assembly language routines. 
+ * It is the intent of this file to contain routines that are + * independent of the specific kernel architecture, and those that are + * common across kernel architectures. + * As architectures diverge, and implementations of specific + * architecture-dependent routines change, the routines should be moved + * from this file into the respective ../`arch -k`/subr.s file. + */ + +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/panic.h> +#include <sys/ontrap.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/reboot.h> +#include <sys/psw.h> +#include <sys/x86_archext.h> + +#include "assym.h" +#include <sys/dditypes.h> + +/* + * on_fault() + * + * Catch lofault faults. Like setjmp except it returns one + * if code following causes uncorrectable fault. Turned off + * by calling no_fault(). Note that while under on_fault(), + * SMAP is disabled. For more information see + * uts/intel/ml/copy.s. + */ + + ENTRY(on_fault) + movq %gs:CPU_THREAD, %rsi + leaq catch_fault(%rip), %rdx + movq %rdi, T_ONFAULT(%rsi) /* jumpbuf in t_onfault */ + movq %rdx, T_LOFAULT(%rsi) /* catch_fault in t_lofault */ + call smap_disable /* allow user accesses */ + jmp setjmp /* let setjmp do the rest */ + +catch_fault: + movq %gs:CPU_THREAD, %rsi + movq T_ONFAULT(%rsi), %rdi /* address of save area */ + xorl %eax, %eax + movq %rax, T_ONFAULT(%rsi) /* turn off onfault */ + movq %rax, T_LOFAULT(%rsi) /* turn off lofault */ + call smap_enable /* disallow user accesses */ + jmp longjmp /* let longjmp do the rest */ + SET_SIZE(on_fault) + + ENTRY(no_fault) + movq %gs:CPU_THREAD, %rsi + xorl %eax, %eax + movq %rax, T_ONFAULT(%rsi) /* turn off onfault */ + movq %rax, T_LOFAULT(%rsi) /* turn off lofault */ + call smap_enable /* disallow user accesses */ + ret + SET_SIZE(no_fault) + +/* + * Default trampoline code for on_trap() (see <sys/ontrap.h>). We just + * do a longjmp(&curthread->t_ontrap->ot_jmpbuf) if this is ever called. + */ + + ENTRY(on_trap_trampoline) + movq %gs:CPU_THREAD, %rsi + movq T_ONTRAP(%rsi), %rdi + addq $OT_JMPBUF, %rdi + jmp longjmp + SET_SIZE(on_trap_trampoline) + +/* + * Push a new element on to the t_ontrap stack. Refer to <sys/ontrap.h> for + * more information about the on_trap() mechanism. If the on_trap_data is the + * same as the topmost stack element, we just modify that element. + */ + + ENTRY(on_trap) + movw %si, OT_PROT(%rdi) /* ot_prot = prot */ + movw $0, OT_TRAP(%rdi) /* ot_trap = 0 */ + leaq on_trap_trampoline(%rip), %rdx /* rdx = &on_trap_trampoline */ + movq %rdx, OT_TRAMPOLINE(%rdi) /* ot_trampoline = rdx */ + xorl %ecx, %ecx + movq %rcx, OT_HANDLE(%rdi) /* ot_handle = NULL */ + movq %rcx, OT_PAD1(%rdi) /* ot_pad1 = NULL */ + movq %gs:CPU_THREAD, %rdx /* rdx = curthread */ + movq T_ONTRAP(%rdx), %rcx /* rcx = curthread->t_ontrap */ + cmpq %rdi, %rcx /* if (otp == %rcx) */ + je 0f /* don't modify t_ontrap */ + + movq %rcx, OT_PREV(%rdi) /* ot_prev = t_ontrap */ + movq %rdi, T_ONTRAP(%rdx) /* curthread->t_ontrap = otp */ + +0: addq $OT_JMPBUF, %rdi /* &ot_jmpbuf */ + jmp setjmp + SET_SIZE(on_trap) + +/* + * Setjmp and longjmp implement non-local gotos using state vectors + * type label_t. 
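+ *
+ * A minimal usage sketch (the kernel's label_t-based setjmp/longjmp,
+ * not the libc ones):
+ *
+ *	label_t jb;
+ *
+ *	if (setjmp(&jb) == 0) {
+ *		... normal path; may eventually call longjmp(&jb) ...
+ *	} else {
+ *		... control resumes here, setjmp appearing to return 1 ...
+ *	}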
+ */ + +#if LABEL_PC != 0 +#error LABEL_PC MUST be defined as 0 for setjmp/longjmp to work as coded +#endif /* LABEL_PC != 0 */ + + ENTRY(setjmp) + movq %rsp, LABEL_SP(%rdi) + movq %rbp, LABEL_RBP(%rdi) + movq %rbx, LABEL_RBX(%rdi) + movq %r12, LABEL_R12(%rdi) + movq %r13, LABEL_R13(%rdi) + movq %r14, LABEL_R14(%rdi) + movq %r15, LABEL_R15(%rdi) + movq (%rsp), %rdx /* return address */ + movq %rdx, (%rdi) /* LABEL_PC is 0 */ + xorl %eax, %eax /* return 0 */ + ret + SET_SIZE(setjmp) + + ENTRY(longjmp) + movq LABEL_SP(%rdi), %rsp + movq LABEL_RBP(%rdi), %rbp + movq LABEL_RBX(%rdi), %rbx + movq LABEL_R12(%rdi), %r12 + movq LABEL_R13(%rdi), %r13 + movq LABEL_R14(%rdi), %r14 + movq LABEL_R15(%rdi), %r15 + movq (%rdi), %rdx /* return address; LABEL_PC is 0 */ + movq %rdx, (%rsp) + xorl %eax, %eax + incl %eax /* return 1 */ + ret + SET_SIZE(longjmp) + +/* + * if a() calls b() calls caller(), + * caller() returns return address in a(). + * (Note: We assume a() and b() are C routines which do the normal entry/exit + * sequence.) + */ + + ENTRY(caller) + movq 8(%rbp), %rax /* b()'s return pc, in a() */ + ret + SET_SIZE(caller) + +/* + * if a() calls callee(), callee() returns the + * return address in a(); + */ + + ENTRY(callee) + movq (%rsp), %rax /* callee()'s return pc, in a() */ + ret + SET_SIZE(callee) + +/* + * return the current frame pointer + */ + + ENTRY(getfp) + movq %rbp, %rax + ret + SET_SIZE(getfp) + +/* + * Invalidate a single page table entry in the TLB + */ + + ENTRY(mmu_invlpg) + invlpg (%rdi) + ret + SET_SIZE(mmu_invlpg) + + +/* + * Get/Set the value of various control registers + */ + + ENTRY(getcr0) + movq %cr0, %rax + ret + SET_SIZE(getcr0) + + ENTRY(setcr0) + movq %rdi, %cr0 + ret + SET_SIZE(setcr0) + + ENTRY(getcr2) +#if defined(__xpv) + movq %gs:CPU_VCPU_INFO, %rax + movq VCPU_INFO_ARCH_CR2(%rax), %rax +#else + movq %cr2, %rax +#endif + ret + SET_SIZE(getcr2) + + ENTRY(getcr3) + movq %cr3, %rax + ret + SET_SIZE(getcr3) + +#if !defined(__xpv) + + ENTRY(setcr3) + movq %rdi, %cr3 + ret + SET_SIZE(setcr3) + + ENTRY(reload_cr3) + movq %cr3, %rdi + movq %rdi, %cr3 + ret + SET_SIZE(reload_cr3) + +#endif /* __xpv */ + + ENTRY(getcr4) + movq %cr4, %rax + ret + SET_SIZE(getcr4) + + ENTRY(setcr4) + movq %rdi, %cr4 + ret + SET_SIZE(setcr4) + + ENTRY(getcr8) + movq %cr8, %rax + ret + SET_SIZE(getcr8) + + ENTRY(setcr8) + movq %rdi, %cr8 + ret + SET_SIZE(setcr8) + + ENTRY(__cpuid_insn) + movq %rbx, %r8 + movq %rcx, %r9 + movq %rdx, %r11 + movl (%rdi), %eax /* %eax = regs->cp_eax */ + movl 0x4(%rdi), %ebx /* %ebx = regs->cp_ebx */ + movl 0x8(%rdi), %ecx /* %ecx = regs->cp_ecx */ + movl 0xc(%rdi), %edx /* %edx = regs->cp_edx */ + cpuid + movl %eax, (%rdi) /* regs->cp_eax = %eax */ + movl %ebx, 0x4(%rdi) /* regs->cp_ebx = %ebx */ + movl %ecx, 0x8(%rdi) /* regs->cp_ecx = %ecx */ + movl %edx, 0xc(%rdi) /* regs->cp_edx = %edx */ + movq %r8, %rbx + movq %r9, %rcx + movq %r11, %rdx + ret + SET_SIZE(__cpuid_insn) + + ENTRY_NP(i86_monitor) + pushq %rbp + movq %rsp, %rbp + movq %rdi, %rax /* addr */ + movq %rsi, %rcx /* extensions */ + /* rdx contains input arg3: hints */ + clflush (%rax) + .byte 0x0f, 0x01, 0xc8 /* monitor */ + leave + ret + SET_SIZE(i86_monitor) + + ENTRY_NP(i86_mwait) + pushq %rbp + call x86_md_clear + movq %rsp, %rbp + movq %rdi, %rax /* data */ + movq %rsi, %rcx /* extensions */ + .byte 0x0f, 0x01, 0xc9 /* mwait */ + leave + ret + SET_SIZE(i86_mwait) + +#if defined(__xpv) + /* + * Defined in C + */ +#else + + ENTRY_NP(tsc_read) + movq %rbx, %r11 + movl $0, %eax + cpuid + rdtsc + 
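+	/*
+	 * The cpuid (leaf 0) above serves only as a serializing
+	 * instruction so that rdtsc is not executed ahead of prior
+	 * work; the shlq/orq below combine %edx:%eax into %rax.
+	 */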
movq %r11, %rbx + shlq $32, %rdx + orq %rdx, %rax + ret + .globl _tsc_mfence_start +_tsc_mfence_start: + mfence + rdtsc + shlq $32, %rdx + orq %rdx, %rax + ret + .globl _tsc_mfence_end +_tsc_mfence_end: + .globl _tscp_start +_tscp_start: + .byte 0x0f, 0x01, 0xf9 /* rdtscp instruction */ + shlq $32, %rdx + orq %rdx, %rax + ret + .globl _tscp_end +_tscp_end: + .globl _no_rdtsc_start +_no_rdtsc_start: + xorl %edx, %edx + xorl %eax, %eax + ret + .globl _no_rdtsc_end +_no_rdtsc_end: + .globl _tsc_lfence_start +_tsc_lfence_start: + lfence + rdtsc + shlq $32, %rdx + orq %rdx, %rax + ret + .globl _tsc_lfence_end +_tsc_lfence_end: + SET_SIZE(tsc_read) + + +#endif /* __xpv */ + + ENTRY_NP(randtick) + rdtsc + shlq $32, %rdx + orq %rdx, %rax + ret + SET_SIZE(randtick) +/* + * Insert entryp after predp in a doubly linked list. + */ + + ENTRY(_insque) + movq (%rsi), %rax /* predp->forw */ + movq %rsi, CPTRSIZE(%rdi) /* entryp->back = predp */ + movq %rax, (%rdi) /* entryp->forw = predp->forw */ + movq %rdi, (%rsi) /* predp->forw = entryp */ + movq %rdi, CPTRSIZE(%rax) /* predp->forw->back = entryp */ + ret + SET_SIZE(_insque) + +/* + * Remove entryp from a doubly linked list + */ + + ENTRY(_remque) + movq (%rdi), %rax /* entry->forw */ + movq CPTRSIZE(%rdi), %rdx /* entry->back */ + movq %rax, (%rdx) /* entry->back->forw = entry->forw */ + movq %rdx, CPTRSIZE(%rax) /* entry->forw->back = entry->back */ + ret + SET_SIZE(_remque) + +/* + * Returns the number of + * non-NULL bytes in string argument. + */ + +/* + * This is close to a simple transliteration of a C version of this + * routine. We should either just -make- this be a C version, or + * justify having it in assembler by making it significantly faster. + * + * size_t + * strlen(const char *s) + * { + * const char *s0; + * #if defined(DEBUG) + * if ((uintptr_t)s < KERNELBASE) + * panic(.str_panic_msg); + * #endif + * for (s0 = s; *s; s++) + * ; + * return (s - s0); + * } + */ + + ENTRY(strlen) +#ifdef DEBUG + movq postbootkernelbase(%rip), %rax + cmpq %rax, %rdi + jae str_valid + pushq %rbp + movq %rsp, %rbp + leaq .str_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +#endif /* DEBUG */ +str_valid: + cmpb $0, (%rdi) + movq %rdi, %rax + je .null_found + .align 4 +.strlen_loop: + incq %rdi + cmpb $0, (%rdi) + jne .strlen_loop +.null_found: + subq %rax, %rdi + movq %rdi, %rax + ret + SET_SIZE(strlen) + +#ifdef DEBUG + .text +.str_panic_msg: + .string "strlen: argument below kernelbase" +#endif /* DEBUG */ + + /* + * Berkeley 4.3 introduced symbolically named interrupt levels + * as a way deal with priority in a machine independent fashion. + * Numbered priorities are machine specific, and should be + * discouraged where possible. + * + * Note, for the machine specific priorities there are + * examples listed for devices that use a particular priority. + * It should not be construed that all devices of that + * type should be at that priority. It is currently were + * the current devices fit into the priority scheme based + * upon time criticalness. + * + * The underlying assumption of these assignments is that + * IPL 10 is the highest level from which a device + * routine can call wakeup. Devices that interrupt from higher + * levels are restricted in what they can do. If they need + * kernels services they should schedule a routine at a lower + * level (via software interrupt) to do the required + * processing. 
+ * + * Examples of this higher usage: + * Level Usage + * 14 Profiling clock (and PROM uart polling clock) + * 12 Serial ports + * + * The serial ports request lower level processing on level 6. + * + * Also, almost all splN routines (where N is a number or a + * mnemonic) will do a RAISE(), on the assumption that they are + * never used to lower our priority. + * The exceptions are: + * spl8() Because you can't be above 15 to begin with! + * splzs() Because this is used at boot time to lower our + * priority, to allow the PROM to poll the uart. + * spl0() Used to lower priority to 0. + */ + +#define SETPRI(level) \ + movl $/**/level, %edi; /* new priority */ \ + jmp do_splx /* redirect to do_splx */ + +#define RAISE(level) \ + movl $/**/level, %edi; /* new priority */ \ + jmp splr /* redirect to splr */ + + /* locks out all interrupts, including memory errors */ + ENTRY(spl8) + SETPRI(15) + SET_SIZE(spl8) + + /* just below the level that profiling runs */ + ENTRY(spl7) + RAISE(13) + SET_SIZE(spl7) + + /* sun specific - highest priority onboard serial i/o asy ports */ + ENTRY(splzs) + SETPRI(12) /* Can't be a RAISE, as it's used to lower us */ + SET_SIZE(splzs) + + ENTRY(splhi) + ALTENTRY(splhigh) + ALTENTRY(spl6) + ALTENTRY(i_ddi_splhigh) + + RAISE(DISP_LEVEL) + + SET_SIZE(i_ddi_splhigh) + SET_SIZE(spl6) + SET_SIZE(splhigh) + SET_SIZE(splhi) + + /* allow all interrupts */ + ENTRY(spl0) + SETPRI(0) + SET_SIZE(spl0) + + + /* splx implementation */ + ENTRY(splx) + jmp do_splx /* redirect to common splx code */ + SET_SIZE(splx) + + ENTRY(wait_500ms) + pushq %rbx + movl $50000, %ebx +1: + call tenmicrosec + decl %ebx + jnz 1b + popq %rbx + ret + SET_SIZE(wait_500ms) + +#define RESET_METHOD_KBC 1 +#define RESET_METHOD_PORT92 2 +#define RESET_METHOD_PCI 4 + + DGDEF3(pc_reset_methods, 4, 8) + .long RESET_METHOD_KBC|RESET_METHOD_PORT92|RESET_METHOD_PCI; + + ENTRY(pc_reset) + + testl $RESET_METHOD_KBC, pc_reset_methods(%rip) + jz 1f + + / + / Try the classic keyboard controller-triggered reset. + / + movw $0x64, %dx + movb $0xfe, %al + outb (%dx) + + / Wait up to 500 milliseconds here for the keyboard controller + / to pull the reset line. On some systems where the keyboard + / controller is slow to pull the reset line, the next reset method + / may be executed (which may be bad if those systems hang when the + / next reset method is used, e.g. Ferrari 3400 (doesn't like port 92), + / and Ferrari 4000 (doesn't like the cf9 reset method)) + + call wait_500ms + +1: + testl $RESET_METHOD_PORT92, pc_reset_methods(%rip) + jz 3f + + / + / Try port 0x92 fast reset + / + movw $0x92, %dx + inb (%dx) + cmpb $0xff, %al / If port's not there, we should get back 0xFF + je 1f + testb $1, %al / If bit 0 + jz 2f / is clear, jump to perform the reset + andb $0xfe, %al / otherwise, + outb (%dx) / clear bit 0 first, then +2: + orb $1, %al / Set bit 0 + outb (%dx) / and reset the system +1: + + call wait_500ms + +3: + testl $RESET_METHOD_PCI, pc_reset_methods(%rip) + jz 4f + + / Try the PCI (soft) reset vector (should work on all modern systems, + / but has been shown to cause problems on 450NX systems, and some newer + / systems (e.g. ATI IXP400-equipped systems)) + / When resetting via this method, 2 writes are required. The first + / targets bit 1 (0=hard reset without power cycle, 1=hard reset with + / power cycle). + / The reset occurs on the second write, during bit 2's transition from + / 0->1. 
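+	/ In C terms the two writes below are (illustrative only):
+	/	outb(0xcf9, 0x2);	- reset mode: hard, no power cycle
+	/	outb(0xcf9, 0x6);	- bit 2 goes 0->1, triggering reset
+	/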
+ movw $0xcf9, %dx + movb $0x2, %al / Reset mode = hard, no power cycle + outb (%dx) + movb $0x6, %al + outb (%dx) + + call wait_500ms + +4: + / + / port 0xcf9 failed also. Last-ditch effort is to + / triple-fault the CPU. + / Also, use triple fault for EFI firmware + / + ENTRY(efi_reset) + pushq $0x0 + pushq $0x0 / IDT base of 0, limit of 0 + 2 unused bytes + lidt (%rsp) + int $0x0 / Trigger interrupt, generate triple-fault + + cli + hlt / Wait forever + /*NOTREACHED*/ + SET_SIZE(efi_reset) + SET_SIZE(pc_reset) + +/* + * C callable in and out routines + */ + + ENTRY(outl) + movw %di, %dx + movl %esi, %eax + outl (%dx) + ret + SET_SIZE(outl) + + ENTRY(outw) + movw %di, %dx + movw %si, %ax + D16 outl (%dx) /* XX64 why not outw? */ + ret + SET_SIZE(outw) + + ENTRY(outb) + movw %di, %dx + movb %sil, %al + outb (%dx) + ret + SET_SIZE(outb) + + ENTRY(inl) + xorl %eax, %eax + movw %di, %dx + inl (%dx) + ret + SET_SIZE(inl) + + ENTRY(inw) + xorl %eax, %eax + movw %di, %dx + D16 inl (%dx) + ret + SET_SIZE(inw) + + + ENTRY(inb) + xorl %eax, %eax + movw %di, %dx + inb (%dx) + ret + SET_SIZE(inb) + +/* + * void int3(void) + * void int18(void) + * void int20(void) + * void int_cmci(void) + */ + + ENTRY(int3) + int $T_BPTFLT + ret + SET_SIZE(int3) + + ENTRY(int18) + int $T_MCE + ret + SET_SIZE(int18) + + ENTRY(int20) + movl boothowto, %eax + andl $RB_DEBUG, %eax + jz 1f + + int $T_DBGENTR +1: + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(int20) + + ENTRY(int_cmci) + int $T_ENOEXTFLT + ret + SET_SIZE(int_cmci) + + ENTRY(scanc) + /* rdi == size */ + /* rsi == cp */ + /* rdx == table */ + /* rcx == mask */ + addq %rsi, %rdi /* end = &cp[size] */ +.scanloop: + cmpq %rdi, %rsi /* while (cp < end */ + jnb .scandone + movzbq (%rsi), %r8 /* %r8 = *cp */ + incq %rsi /* cp++ */ + testb %cl, (%r8, %rdx) + jz .scanloop /* && (table[*cp] & mask) == 0) */ + decq %rsi /* (fix post-increment) */ +.scandone: + movl %edi, %eax + subl %esi, %eax /* return (end - cp) */ + ret + SET_SIZE(scanc) + +/* + * Replacement functions for ones that are normally inlined. + * In addition to the copy in i86.il, they are defined here just in case. + */ + + ENTRY(intr_clear) + ENTRY(clear_int_flag) + pushfq + popq %rax +#if defined(__xpv) + leaq xpv_panicking, %rdi + movl (%rdi), %edi + cmpl $0, %edi + jne 2f + CLIRET(%rdi, %dl) /* returns event mask in %dl */ + /* + * Synthesize the PS_IE bit from the event mask bit + */ + andq $_BITNOT(PS_IE), %rax + testb $1, %dl + jnz 1f + orq $PS_IE, %rax +1: + ret +2: +#endif + CLI(%rdi) + ret + SET_SIZE(clear_int_flag) + SET_SIZE(intr_clear) + + ENTRY(curcpup) + movq %gs:CPU_SELF, %rax + ret + SET_SIZE(curcpup) + +/* htonll(), ntohll(), htonl(), ntohl(), htons(), ntohs() + * These functions reverse the byte order of the input parameter and returns + * the result. This is to convert the byte order from host byte order + * (little endian) to network byte order (big endian), or vice versa. 
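+ *
+ * For example, on x86 (little endian) htonl(0x12345678) returns
+ * 0x78563412, and htons(0x1234) returns 0x3412; applying ntohl()/ntohs()
+ * to those results gives back the original values.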
+ */ + + ENTRY(htonll) + ALTENTRY(ntohll) + movq %rdi, %rax + bswapq %rax + ret + SET_SIZE(ntohll) + SET_SIZE(htonll) + + /* XX64 there must be shorter sequences for this */ + ENTRY(htonl) + ALTENTRY(ntohl) + movl %edi, %eax + bswap %eax + ret + SET_SIZE(ntohl) + SET_SIZE(htonl) + + /* XX64 there must be better sequences for this */ + ENTRY(htons) + ALTENTRY(ntohs) + movl %edi, %eax + bswap %eax + shrl $16, %eax + ret + SET_SIZE(ntohs) + SET_SIZE(htons) + + + ENTRY(intr_restore) + ENTRY(restore_int_flag) + testq $PS_IE, %rdi + jz 1f +#if defined(__xpv) + leaq xpv_panicking, %rsi + movl (%rsi), %esi + cmpl $0, %esi + jne 1f + /* + * Since we're -really- running unprivileged, our attempt + * to change the state of the IF bit will be ignored. + * The virtual IF bit is tweaked by CLI and STI. + */ + IE_TO_EVENT_MASK(%rsi, %rdi) +#else + sti +#endif +1: + ret + SET_SIZE(restore_int_flag) + SET_SIZE(intr_restore) + + ENTRY(sti) + STI + ret + SET_SIZE(sti) + + ENTRY(cli) + CLI(%rax) + ret + SET_SIZE(cli) + + ENTRY(dtrace_interrupt_disable) + pushfq + popq %rax +#if defined(__xpv) + leaq xpv_panicking, %rdi + movl (%rdi), %edi + cmpl $0, %edi + jne .dtrace_interrupt_disable_done + CLIRET(%rdi, %dl) /* returns event mask in %dl */ + /* + * Synthesize the PS_IE bit from the event mask bit + */ + andq $_BITNOT(PS_IE), %rax + testb $1, %dl + jnz .dtrace_interrupt_disable_done + orq $PS_IE, %rax +#else + CLI(%rdx) +#endif +.dtrace_interrupt_disable_done: + ret + SET_SIZE(dtrace_interrupt_disable) + + ENTRY(dtrace_interrupt_enable) + pushq %rdi + popfq +#if defined(__xpv) + leaq xpv_panicking, %rdx + movl (%rdx), %edx + cmpl $0, %edx + jne .dtrace_interrupt_enable_done + /* + * Since we're -really- running unprivileged, our attempt + * to change the state of the IF bit will be ignored. The + * virtual IF bit is tweaked by CLI and STI. + */ + IE_TO_EVENT_MASK(%rdx, %rdi) +#endif +.dtrace_interrupt_enable_done: + ret + SET_SIZE(dtrace_interrupt_enable) + + + ENTRY(dtrace_membar_producer) + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(dtrace_membar_producer) + + ENTRY(dtrace_membar_consumer) + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(dtrace_membar_consumer) + + ENTRY(threadp) + movq %gs:CPU_THREAD, %rax + ret + SET_SIZE(threadp) + +/* + * Checksum routine for Internet Protocol Headers + */ + + ENTRY(ip_ocsum) + pushq %rbp + movq %rsp, %rbp +#ifdef DEBUG + movq postbootkernelbase(%rip), %rax + cmpq %rax, %rdi + jnb 1f + xorl %eax, %eax + movq %rdi, %rsi + leaq .ip_ocsum_panic_msg(%rip), %rdi + call panic + /*NOTREACHED*/ +.ip_ocsum_panic_msg: + .string "ip_ocsum: address 0x%p below kernelbase\n" +1: +#endif + movl %esi, %ecx /* halfword_count */ + movq %rdi, %rsi /* address */ + /* partial sum in %edx */ + xorl %eax, %eax + testl %ecx, %ecx + jz .ip_ocsum_done + testq $3, %rsi + jnz .ip_csum_notaligned +.ip_csum_aligned: /* XX64 opportunities for 8-byte operations? */ +.next_iter: + /* XX64 opportunities for prefetch? */ + /* XX64 compute csum with 64 bit quantities? 
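+	 *
+	 * One way (a sketch only, not what the code below does) would be
+	 * to add the 16 dwords of each 64-byte block into a uint64_t and
+	 * fold the carries back in afterwards:
+	 *
+	 *	uint64_t sum = partial_sum;
+	 *	for (i = 0; i < 16; i++)
+	 *		sum += ((const uint32_t *)addr)[i];
+	 *	while (sum >> 16)
+	 *		sum = (sum & 0xffff) + (sum >> 16);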
*/ + subl $32, %ecx + jl .less_than_32 + + addl 0(%rsi), %edx +.only60: + adcl 4(%rsi), %eax +.only56: + adcl 8(%rsi), %edx +.only52: + adcl 12(%rsi), %eax +.only48: + adcl 16(%rsi), %edx +.only44: + adcl 20(%rsi), %eax +.only40: + adcl 24(%rsi), %edx +.only36: + adcl 28(%rsi), %eax +.only32: + adcl 32(%rsi), %edx +.only28: + adcl 36(%rsi), %eax +.only24: + adcl 40(%rsi), %edx +.only20: + adcl 44(%rsi), %eax +.only16: + adcl 48(%rsi), %edx +.only12: + adcl 52(%rsi), %eax +.only8: + adcl 56(%rsi), %edx +.only4: + adcl 60(%rsi), %eax /* could be adding -1 and -1 with a carry */ +.only0: + adcl $0, %eax /* could be adding -1 in eax with a carry */ + adcl $0, %eax + + addq $64, %rsi + testl %ecx, %ecx + jnz .next_iter + +.ip_ocsum_done: + addl %eax, %edx + adcl $0, %edx + movl %edx, %eax /* form a 16 bit checksum by */ + shrl $16, %eax /* adding two halves of 32 bit checksum */ + addw %dx, %ax + adcw $0, %ax + andl $0xffff, %eax + leave + ret + +.ip_csum_notaligned: + xorl %edi, %edi + movw (%rsi), %di + addl %edi, %edx + adcl $0, %edx + addq $2, %rsi + decl %ecx + jmp .ip_csum_aligned + +.less_than_32: + addl $32, %ecx + testl $1, %ecx + jz .size_aligned + andl $0xfe, %ecx + movzwl (%rsi, %rcx, 2), %edi + addl %edi, %edx + adcl $0, %edx +.size_aligned: + movl %ecx, %edi + shrl $1, %ecx + shl $1, %edi + subq $64, %rdi + addq %rdi, %rsi + leaq .ip_ocsum_jmptbl(%rip), %rdi + leaq (%rdi, %rcx, 8), %rdi + xorl %ecx, %ecx + clc + movq (%rdi), %rdi + INDIRECT_JMP_REG(rdi) + + .align 8 +.ip_ocsum_jmptbl: + .quad .only0, .only4, .only8, .only12, .only16, .only20 + .quad .only24, .only28, .only32, .only36, .only40, .only44 + .quad .only48, .only52, .only56, .only60 + SET_SIZE(ip_ocsum) + +/* + * multiply two long numbers and yield a u_longlong_t result, callable from C. + * Provided to manipulate hrtime_t values. + */ + + ENTRY(mul32) + xorl %edx, %edx /* XX64 joe, paranoia? */ + movl %edi, %eax + mull %esi + shlq $32, %rdx + orq %rdx, %rax + ret + SET_SIZE(mul32) + + ENTRY(scan_memory) + shrq $3, %rsi /* convert %rsi from byte to quadword count */ + jz .scanm_done + movq %rsi, %rcx /* move count into rep control register */ + movq %rdi, %rsi /* move addr into lodsq control reg. 
*/ + rep lodsq /* scan the memory range */ +.scanm_done: + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(scan_memory) + + + ENTRY(lowbit) + movl $-1, %eax + bsfq %rdi, %rdi + cmovnz %edi, %eax + incl %eax + ret + SET_SIZE(lowbit) + + ENTRY(highbit) + ALTENTRY(highbit64) + movl $-1, %eax + bsrq %rdi, %rdi + cmovnz %edi, %eax + incl %eax + ret + SET_SIZE(highbit64) + SET_SIZE(highbit) + +#define XMSR_ACCESS_VAL $0x9c5a203a + + ENTRY(rdmsr) + movl %edi, %ecx + rdmsr + shlq $32, %rdx + orq %rdx, %rax + ret + SET_SIZE(rdmsr) + + ENTRY(wrmsr) + movq %rsi, %rdx + shrq $32, %rdx + movl %esi, %eax + movl %edi, %ecx + wrmsr + ret + SET_SIZE(wrmsr) + + ENTRY(xrdmsr) + pushq %rbp + movq %rsp, %rbp + movl %edi, %ecx + movl XMSR_ACCESS_VAL, %edi /* this value is needed to access MSR */ + rdmsr + shlq $32, %rdx + orq %rdx, %rax + leave + ret + SET_SIZE(xrdmsr) + + ENTRY(xwrmsr) + pushq %rbp + movq %rsp, %rbp + movl %edi, %ecx + movl XMSR_ACCESS_VAL, %edi /* this value is needed to access MSR */ + movq %rsi, %rdx + shrq $32, %rdx + movl %esi, %eax + wrmsr + leave + ret + SET_SIZE(xwrmsr) + + ENTRY(get_xcr) + movl %edi, %ecx + #xgetbv + .byte 0x0f,0x01,0xd0 + shlq $32, %rdx + orq %rdx, %rax + ret + SET_SIZE(get_xcr) + + ENTRY(set_xcr) + movq %rsi, %rdx + shrq $32, %rdx + movl %esi, %eax + movl %edi, %ecx + #xsetbv + .byte 0x0f,0x01,0xd1 + ret + SET_SIZE(set_xcr) + + ENTRY(invalidate_cache) + wbinvd + ret + SET_SIZE(invalidate_cache) + + ENTRY_NP(getcregs) +#if defined(__xpv) + /* + * Only a few of the hardware control registers or descriptor tables + * are directly accessible to us, so just zero the structure. + * + * XXPV Perhaps it would be helpful for the hypervisor to return + * virtualized versions of these for post-mortem use. + * (Need to reevaluate - perhaps it already does!) + */ + pushq %rdi /* save *crp */ + movq $CREGSZ, %rsi + call bzero + popq %rdi + + /* + * Dump what limited information we can + */ + movq %cr0, %rax + movq %rax, CREG_CR0(%rdi) /* cr0 */ + movq %cr2, %rax + movq %rax, CREG_CR2(%rdi) /* cr2 */ + movq %cr3, %rax + movq %rax, CREG_CR3(%rdi) /* cr3 */ + movq %cr4, %rax + movq %rax, CREG_CR4(%rdi) /* cr4 */ + +#else /* __xpv */ + +#define GETMSR(r, off, d) \ + movl $r, %ecx; \ + rdmsr; \ + movl %eax, off(d); \ + movl %edx, off+4(d) + + xorl %eax, %eax + movq %rax, CREG_GDT+8(%rdi) + sgdt CREG_GDT(%rdi) /* 10 bytes */ + movq %rax, CREG_IDT+8(%rdi) + sidt CREG_IDT(%rdi) /* 10 bytes */ + movq %rax, CREG_LDT(%rdi) + sldt CREG_LDT(%rdi) /* 2 bytes */ + movq %rax, CREG_TASKR(%rdi) + str CREG_TASKR(%rdi) /* 2 bytes */ + movq %cr0, %rax + movq %rax, CREG_CR0(%rdi) /* cr0 */ + movq %cr2, %rax + movq %rax, CREG_CR2(%rdi) /* cr2 */ + movq %cr3, %rax + movq %rax, CREG_CR3(%rdi) /* cr3 */ + movq %cr4, %rax + movq %rax, CREG_CR4(%rdi) /* cr4 */ + movq %cr8, %rax + movq %rax, CREG_CR8(%rdi) /* cr8 */ + GETMSR(MSR_AMD_KGSBASE, CREG_KGSBASE, %rdi) + GETMSR(MSR_AMD_EFER, CREG_EFER, %rdi) +#endif /* __xpv */ + ret + SET_SIZE(getcregs) + +#undef GETMSR + + +/* + * A panic trigger is a word which is updated atomically and can only be set + * once. We atomically store 0xDEFACEDD and load the old value. If the + * previous value was 0, we succeed and return 1; otherwise return 0. + * This allows a partially corrupt trigger to still trigger correctly. DTrace + * has its own version of this function to allow it to panic correctly from + * probe context. 
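+ *
+ * In C terms the fast path below is roughly the following sketch (the
+ * function name is illustrative; the real code uses a locked xchgl
+ * directly):
+ *
+ *	static int
+ *	sketch_panic_trigger(volatile uint32_t *tp)
+ *	{
+ *		uint32_t old = __atomic_exchange_n(tp, 0xdefacedd,
+ *		    __ATOMIC_SEQ_CST);	// atomic swap, like lock xchgl
+ *		return (old == 0);	// only the first caller sees 0
+ *	}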
+ */ + + ENTRY_NP(panic_trigger) + xorl %eax, %eax + movl $0xdefacedd, %edx + lock + xchgl %edx, (%rdi) + cmpl $0, %edx + je 0f + movl $0, %eax + ret +0: movl $1, %eax + ret + SET_SIZE(panic_trigger) + + ENTRY_NP(dtrace_panic_trigger) + xorl %eax, %eax + movl $0xdefacedd, %edx + lock + xchgl %edx, (%rdi) + cmpl $0, %edx + je 0f + movl $0, %eax + ret +0: movl $1, %eax + ret + SET_SIZE(dtrace_panic_trigger) + +/* + * The panic() and cmn_err() functions invoke vpanic() as a common entry point + * into the panic code implemented in panicsys(). vpanic() is responsible + * for passing through the format string and arguments, and constructing a + * regs structure on the stack into which it saves the current register + * values. If we are not dying due to a fatal trap, these registers will + * then be preserved in panicbuf as the current processor state. Before + * invoking panicsys(), vpanic() activates the first panic trigger (see + * common/os/panic.c) and switches to the panic_stack if successful. Note that + * DTrace takes a slightly different panic path if it must panic from probe + * context. Instead of calling panic, it calls into dtrace_vpanic(), which + * sets up the initial stack as vpanic does, calls dtrace_panic_trigger(), and + * branches back into vpanic(). + */ + + ENTRY_NP(vpanic) /* Initial stack layout: */ + + pushq %rbp /* | %rip | 0x60 */ + movq %rsp, %rbp /* | %rbp | 0x58 */ + pushfq /* | rfl | 0x50 */ + pushq %r11 /* | %r11 | 0x48 */ + pushq %r10 /* | %r10 | 0x40 */ + pushq %rbx /* | %rbx | 0x38 */ + pushq %rax /* | %rax | 0x30 */ + pushq %r9 /* | %r9 | 0x28 */ + pushq %r8 /* | %r8 | 0x20 */ + pushq %rcx /* | %rcx | 0x18 */ + pushq %rdx /* | %rdx | 0x10 */ + pushq %rsi /* | %rsi | 0x8 alist */ + pushq %rdi /* | %rdi | 0x0 format */ + + movq %rsp, %rbx /* %rbx = current %rsp */ + + leaq panic_quiesce(%rip), %rdi /* %rdi = &panic_quiesce */ + call panic_trigger /* %eax = panic_trigger() */ + +vpanic_common: + /* + * The panic_trigger result is in %eax from the call above, and + * dtrace_panic places it in %eax before branching here. + * The rdmsr instructions that follow below will clobber %eax so + * we stash the panic_trigger result in %r11d. + */ + movl %eax, %r11d + cmpl $0, %r11d + je 0f + + /* + * If panic_trigger() was successful, we are the first to initiate a + * panic: we now switch to the reserved panic_stack before continuing. + */ + leaq panic_stack(%rip), %rsp + addq $PANICSTKSIZE, %rsp +0: subq $REGSIZE, %rsp + /* + * Now that we've got everything set up, store the register values as + * they were when we entered vpanic() to the designated location in + * the regs structure we allocated on the stack. 
+ */ + movq 0x0(%rbx), %rcx + movq %rcx, REGOFF_RDI(%rsp) + movq 0x8(%rbx), %rcx + movq %rcx, REGOFF_RSI(%rsp) + movq 0x10(%rbx), %rcx + movq %rcx, REGOFF_RDX(%rsp) + movq 0x18(%rbx), %rcx + movq %rcx, REGOFF_RCX(%rsp) + movq 0x20(%rbx), %rcx + + movq %rcx, REGOFF_R8(%rsp) + movq 0x28(%rbx), %rcx + movq %rcx, REGOFF_R9(%rsp) + movq 0x30(%rbx), %rcx + movq %rcx, REGOFF_RAX(%rsp) + movq 0x38(%rbx), %rcx + movq %rcx, REGOFF_RBX(%rsp) + movq 0x58(%rbx), %rcx + + movq %rcx, REGOFF_RBP(%rsp) + movq 0x40(%rbx), %rcx + movq %rcx, REGOFF_R10(%rsp) + movq 0x48(%rbx), %rcx + movq %rcx, REGOFF_R11(%rsp) + movq %r12, REGOFF_R12(%rsp) + + movq %r13, REGOFF_R13(%rsp) + movq %r14, REGOFF_R14(%rsp) + movq %r15, REGOFF_R15(%rsp) + + xorl %ecx, %ecx + movw %ds, %cx + movq %rcx, REGOFF_DS(%rsp) + movw %es, %cx + movq %rcx, REGOFF_ES(%rsp) + movw %fs, %cx + movq %rcx, REGOFF_FS(%rsp) + movw %gs, %cx + movq %rcx, REGOFF_GS(%rsp) + + movq $0, REGOFF_TRAPNO(%rsp) + + movq $0, REGOFF_ERR(%rsp) + leaq vpanic(%rip), %rcx + movq %rcx, REGOFF_RIP(%rsp) + movw %cs, %cx + movzwq %cx, %rcx + movq %rcx, REGOFF_CS(%rsp) + movq 0x50(%rbx), %rcx + movq %rcx, REGOFF_RFL(%rsp) + movq %rbx, %rcx + addq $0x60, %rcx + movq %rcx, REGOFF_RSP(%rsp) + movw %ss, %cx + movzwq %cx, %rcx + movq %rcx, REGOFF_SS(%rsp) + + /* + * panicsys(format, alist, rp, on_panic_stack) + */ + movq REGOFF_RDI(%rsp), %rdi /* format */ + movq REGOFF_RSI(%rsp), %rsi /* alist */ + movq %rsp, %rdx /* struct regs */ + movl %r11d, %ecx /* on_panic_stack */ + call panicsys + addq $REGSIZE, %rsp + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %r8 + popq %r9 + popq %rax + popq %rbx + popq %r10 + popq %r11 + popfq + leave + ret + SET_SIZE(vpanic) + + ENTRY_NP(dtrace_vpanic) /* Initial stack layout: */ + + pushq %rbp /* | %rip | 0x60 */ + movq %rsp, %rbp /* | %rbp | 0x58 */ + pushfq /* | rfl | 0x50 */ + pushq %r11 /* | %r11 | 0x48 */ + pushq %r10 /* | %r10 | 0x40 */ + pushq %rbx /* | %rbx | 0x38 */ + pushq %rax /* | %rax | 0x30 */ + pushq %r9 /* | %r9 | 0x28 */ + pushq %r8 /* | %r8 | 0x20 */ + pushq %rcx /* | %rcx | 0x18 */ + pushq %rdx /* | %rdx | 0x10 */ + pushq %rsi /* | %rsi | 0x8 alist */ + pushq %rdi /* | %rdi | 0x0 format */ + + movq %rsp, %rbx /* %rbx = current %rsp */ + + leaq panic_quiesce(%rip), %rdi /* %rdi = &panic_quiesce */ + call dtrace_panic_trigger /* %eax = dtrace_panic_trigger() */ + jmp vpanic_common + + SET_SIZE(dtrace_vpanic) + + DGDEF3(timedelta, 8, 8) + .long 0, 0 + + /* + * initialized to a non zero value to make pc_gethrtime() + * work correctly even before clock is initialized + */ + DGDEF3(hrtime_base, 8, 8) + .long _MUL(NSEC_PER_CLOCK_TICK, 6), 0 + + DGDEF3(adj_shift, 4, 4) + .long ADJ_SHIFT + + ENTRY_NP(hres_tick) + pushq %rbp + movq %rsp, %rbp + + /* + * We need to call *gethrtimef before picking up CLOCK_LOCK (obviously, + * hres_last_tick can only be modified while holding CLOCK_LOCK). + * At worst, performing this now instead of under CLOCK_LOCK may + * introduce some jitter in pc_gethrestime(). + */ + movq gethrtimef(%rip), %rsi + INDIRECT_CALL_REG(rsi) + movq %rax, %r8 + + leaq hres_lock(%rip), %rax + movb $-1, %dl +.CL1: + xchgb %dl, (%rax) + testb %dl, %dl + jz .CL3 /* got it */ +.CL2: + cmpb $0, (%rax) /* possible to get lock? 
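+ *
+ * (This .CL1/.CL2 loop is a test-and-test-and-set spin: grab the byte
+ * lock with xchgb, and on failure spin on plain loads -- with pause --
+ * until it looks free before retrying the atomic.  A hedged C sketch of
+ * the same pattern, using GCC/clang builtins purely for illustration:
+ *
+ *	while (__atomic_exchange_n(lp, 0xff, __ATOMIC_ACQUIRE) != 0) {
+ *		while (*lp != 0)
+ *			__builtin_ia32_pause();	// like the pause insn below
+ *	}
+ *
+ * where lp is a volatile uint8_t pointer to hres_lock.)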
*/ + pause + jne .CL2 + jmp .CL1 /* yes, try again */ +.CL3: + /* + * compute the interval since last time hres_tick was called + * and adjust hrtime_base and hrestime accordingly + * hrtime_base is an 8 byte value (in nsec), hrestime is + * a timestruc_t (sec, nsec) + */ + leaq hres_last_tick(%rip), %rax + movq %r8, %r11 + subq (%rax), %r8 + addq %r8, hrtime_base(%rip) /* add interval to hrtime_base */ + addq %r8, hrestime+8(%rip) /* add interval to hrestime.tv_nsec */ + /* + * Now that we have CLOCK_LOCK, we can update hres_last_tick + */ + movq %r11, (%rax) + + call __adj_hrestime + + /* + * release the hres_lock + */ + incl hres_lock(%rip) + leave + ret + SET_SIZE(hres_tick) + +/* + * void prefetch_smap_w(void *) + * + * Prefetch ahead within a linear list of smap structures. + * Not implemented for ia32. Stub for compatibility. + */ + + ENTRY(prefetch_smap_w) + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(prefetch_smap_w) + +/* + * prefetch_page_r(page_t *) + * issue prefetch instructions for a page_t + */ + + ENTRY(prefetch_page_r) + rep; ret /* use 2 byte return instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(prefetch_page_r) + + ENTRY(bcmp) + pushq %rbp + movq %rsp, %rbp +#ifdef DEBUG + testq %rdx,%rdx + je 1f + movq postbootkernelbase(%rip), %r11 + cmpq %r11, %rdi + jb 0f + cmpq %r11, %rsi + jnb 1f +0: leaq .bcmp_panic_msg(%rip), %rdi + xorl %eax, %eax + call panic +1: +#endif /* DEBUG */ + call memcmp + testl %eax, %eax + setne %dl + leave + movzbl %dl, %eax + ret + SET_SIZE(bcmp) + +#ifdef DEBUG + .text +.bcmp_panic_msg: + .string "bcmp: arguments below kernelbase" +#endif /* DEBUG */ + + ENTRY_NP(bsrw_insn) + xorl %eax, %eax + bsrw %di, %ax + ret + SET_SIZE(bsrw_insn) + + ENTRY_NP(switch_sp_and_call) + pushq %rbp + movq %rsp, %rbp /* set up stack frame */ + movq %rdi, %rsp /* switch stack pointer */ + movq %rdx, %rdi /* pass func arg 1 */ + movq %rsi, %r11 /* save function to call */ + movq %rcx, %rsi /* pass func arg 2 */ + INDIRECT_CALL_REG(r11) /* call function */ + leave /* restore stack */ + ret + SET_SIZE(switch_sp_and_call) + + ENTRY_NP(kmdb_enter) + pushq %rbp + movq %rsp, %rbp + + /* + * Save flags, do a 'cli' then return the saved flags + */ + call intr_clear + + int $T_DBGENTR + + /* + * Restore the saved flags + */ + movq %rax, %rdi + call intr_restore + + leave + ret + SET_SIZE(kmdb_enter) + + ENTRY_NP(return_instr) + rep; ret /* use 2 byte instruction when branch target */ + /* AMD Software Optimization Guide - Section 6.2 */ + SET_SIZE(return_instr) + + ENTRY(getflags) + pushfq + popq %rax +#if defined(__xpv) + CURTHREAD(%rdi) + KPREEMPT_DISABLE(%rdi) + /* + * Synthesize the PS_IE bit from the event mask bit + */ + CURVCPU(%r11) + andq $_BITNOT(PS_IE), %rax + XEN_TEST_UPCALL_MASK(%r11) + jnz 1f + orq $PS_IE, %rax +1: + KPREEMPT_ENABLE_NOKP(%rdi) +#endif + ret + SET_SIZE(getflags) + + ENTRY(ftrace_interrupt_disable) + pushfq + popq %rax + CLI(%rdx) + ret + SET_SIZE(ftrace_interrupt_disable) + + ENTRY(ftrace_interrupt_enable) + pushq %rdi + popfq + ret + SET_SIZE(ftrace_interrupt_enable) + + ENTRY(clflush_insn) + clflush (%rdi) + ret + SET_SIZE(clflush_insn) + + ENTRY(mfence_insn) + mfence + ret + SET_SIZE(mfence_insn) + +/* + * VMware implements an I/O port that programs can query to detect if software + * is running in a VMware hypervisor. 
This hypervisor port behaves differently + * depending on magic values in certain registers and modifies some registers + * as a side effect. + * + * References: http://kb.vmware.com/kb/1009458 + */ + + ENTRY(vmware_port) + pushq %rbx + movl $VMWARE_HVMAGIC, %eax + movl $0xffffffff, %ebx + movl %edi, %ecx + movl $VMWARE_HVPORT, %edx + inl (%dx) + movl %eax, (%rsi) + movl %ebx, 4(%rsi) + movl %ecx, 8(%rsi) + movl %edx, 12(%rsi) + popq %rbx + ret + SET_SIZE(vmware_port) diff --git a/usr/src/uts/intel/ml/lock_prim.s b/usr/src/uts/intel/ml/lock_prim.s new file mode 100644 index 0000000000..4267561bf7 --- /dev/null +++ b/usr/src/uts/intel/ml/lock_prim.s @@ -0,0 +1,714 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include "assym.h" + +#include <sys/mutex_impl.h> +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/regset.h> +#include <sys/rwlock_impl.h> +#include <sys/lockstat.h> + +/* + * lock_try(lp), ulock_try(lp) + * - returns non-zero on success. + * - doesn't block interrupts so don't use this to spin on a lock. + * + * ulock_try() is for a lock in the user address space. + */ + + .globl kernelbase + + ENTRY(lock_try) + movb $-1, %dl + movzbq %dl, %rax + xchgb %dl, (%rdi) + xorb %dl, %al +.lock_try_lockstat_patch_point: + ret + testb %al, %al + jnz 0f + ret +0: + movq %gs:CPU_THREAD, %rdx /* rdx = thread addr */ + movq %rdi, %rsi /* rsi = lock addr */ + movl $LS_LOCK_TRY_ACQUIRE, %edi /* edi = event */ + jmp lockstat_wrapper + SET_SIZE(lock_try) + + ENTRY(lock_spin_try) + movb $-1, %dl + movzbq %dl, %rax + xchgb %dl, (%rdi) + xorb %dl, %al + ret + SET_SIZE(lock_spin_try) + + ENTRY(ulock_try) +#ifdef DEBUG + movq kernelbase(%rip), %rax + cmpq %rax, %rdi /* test uaddr < kernelbase */ + jb ulock_pass /* uaddr < kernelbase, proceed */ + + movq %rdi, %r12 /* preserve lock ptr for debugging */ + leaq .ulock_panic_msg(%rip), %rdi + pushq %rbp /* align stack properly */ + movq %rsp, %rbp + xorl %eax, %eax /* clear for varargs */ + call panic + +#endif /* DEBUG */ + +ulock_pass: + movl $1, %eax + xchgb %al, (%rdi) + xorb $1, %al + ret + SET_SIZE(ulock_try) + +#ifdef DEBUG + .data +.ulock_panic_msg: + .string "ulock_try: Argument is above kernelbase" + .text +#endif /* DEBUG */ + +/* + * lock_clear(lp) + * - unlock lock without changing interrupt priority level. 
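+ *
+ * Taken together with lock_try() above, the uncontended fast paths are
+ * roughly this C sketch (names are illustrative; the real routines also
+ * feed the lockstat patch points):
+ *
+ *	static int
+ *	sketch_lock_try(volatile uint8_t *lp)
+ *	{
+ *		// atomic test-and-set, like xchgb; nonzero means we got it
+ *		return (__atomic_exchange_n(lp, 0xff, __ATOMIC_ACQUIRE) == 0);
+ *	}
+ *
+ *	static void
+ *	sketch_lock_clear(volatile uint8_t *lp)
+ *	{
+ *		// an ordinary byte store releases the lock on x86
+ *		__atomic_store_n(lp, 0, __ATOMIC_RELEASE);
+ *	}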
+ */ + + ENTRY(lock_clear) + movb $0, (%rdi) +.lock_clear_lockstat_patch_point: + ret + movq %rdi, %rsi /* rsi = lock addr */ + movq %gs:CPU_THREAD, %rdx /* rdx = thread addr */ + movl $LS_LOCK_CLEAR_RELEASE, %edi /* edi = event */ + jmp lockstat_wrapper + SET_SIZE(lock_clear) + + ENTRY(ulock_clear) +#ifdef DEBUG + movq kernelbase(%rip), %rcx + cmpq %rcx, %rdi /* test uaddr < kernelbase */ + jb ulock_clr /* uaddr < kernelbase, proceed */ + + leaq .ulock_clear_msg(%rip), %rdi + pushq %rbp /* align stack properly */ + movq %rsp, %rbp + xorl %eax, %eax /* clear for varargs */ + call panic +#endif + +ulock_clr: + movb $0, (%rdi) + ret + SET_SIZE(ulock_clear) + +#ifdef DEBUG + .data +.ulock_clear_msg: + .string "ulock_clear: Argument is above kernelbase" + .text +#endif /* DEBUG */ + + +/* + * lock_set_spl(lock_t *lp, int new_pil, u_short *old_pil) + * Drops lp, sets pil to new_pil, stores old pil in *old_pil. + */ + + ENTRY(lock_set_spl) + pushq %rbp + movq %rsp, %rbp + subq $32, %rsp + movl %esi, 8(%rsp) /* save priority level */ + movq %rdx, 16(%rsp) /* save old pil ptr */ + movq %rdi, 24(%rsp) /* save lock pointer */ + movl %esi, %edi /* pass priority level */ + call splr /* raise priority level */ + movq 24(%rsp), %rdi /* rdi = lock addr */ + movb $-1, %dl + xchgb %dl, (%rdi) /* try to set lock */ + testb %dl, %dl /* did we get the lock? ... */ + jnz .lss_miss /* ... no, go to C for the hard case */ + movq 16(%rsp), %rdx /* rdx = old pil addr */ + movw %ax, (%rdx) /* store old pil */ + leave +.lock_set_spl_lockstat_patch_point: + ret + movq %rdi, %rsi /* rsi = lock addr */ + movq %gs:CPU_THREAD, %rdx /* rdx = thread addr */ + movl $LS_LOCK_SET_SPL_ACQUIRE, %edi + jmp lockstat_wrapper +.lss_miss: + movl 8(%rsp), %esi /* new_pil */ + movq 16(%rsp), %rdx /* old_pil_addr */ + movl %eax, %ecx /* original pil */ + leave /* unwind stack */ + jmp lock_set_spl_spin + SET_SIZE(lock_set_spl) + +/* + * void + * lock_init(lp) + */ + + ENTRY(lock_init) + movb $0, (%rdi) + ret + SET_SIZE(lock_init) + +/* + * void + * lock_set(lp) + */ + + ENTRY(lock_set) + movb $-1, %dl + xchgb %dl, (%rdi) /* try to set lock */ + testb %dl, %dl /* did we get it? 
*/ + jnz lock_set_spin /* no, go to C for the hard case */ +.lock_set_lockstat_patch_point: + ret + movq %rdi, %rsi /* rsi = lock addr */ + movq %gs:CPU_THREAD, %rdx /* rdx = thread addr */ + movl $LS_LOCK_SET_ACQUIRE, %edi + jmp lockstat_wrapper + SET_SIZE(lock_set) + +/* + * lock_clear_splx(lp, s) + */ + + ENTRY(lock_clear_splx) + movb $0, (%rdi) /* clear lock */ +.lock_clear_splx_lockstat_patch_point: + jmp 0f +0: + movl %esi, %edi /* arg for splx */ + jmp splx /* let splx do its thing */ +.lock_clear_splx_lockstat: + pushq %rbp /* align stack properly */ + movq %rsp, %rbp + subq $16, %rsp /* space to save args across splx */ + movq %rdi, 8(%rsp) /* save lock ptr across splx call */ + movl %esi, %edi /* arg for splx */ + call splx /* lower the priority */ + movq 8(%rsp), %rsi /* rsi = lock ptr */ + leave /* unwind stack */ + movq %gs:CPU_THREAD, %rdx /* rdx = thread addr */ + movl $LS_LOCK_CLEAR_SPLX_RELEASE, %edi + jmp lockstat_wrapper + SET_SIZE(lock_clear_splx) + +#if defined(__GNUC_AS__) +#define LOCK_CLEAR_SPLX_LOCKSTAT_PATCH_VAL \ + (.lock_clear_splx_lockstat - .lock_clear_splx_lockstat_patch_point - 2) + +#define LOCK_CLEAR_SPLX_LOCKSTAT_PATCH_POINT \ + (.lock_clear_splx_lockstat_patch_point + 1) +#else +#define LOCK_CLEAR_SPLX_LOCKSTAT_PATCH_VAL \ + [.lock_clear_splx_lockstat - .lock_clear_splx_lockstat_patch_point - 2] + +#define LOCK_CLEAR_SPLX_LOCKSTAT_PATCH_POINT \ + [.lock_clear_splx_lockstat_patch_point + 1] +#endif + +/* + * mutex_enter() and mutex_exit(). + * + * These routines handle the simple cases of mutex_enter() (adaptive + * lock, not held) and mutex_exit() (adaptive lock, held, no waiters). + * If anything complicated is going on we punt to mutex_vector_enter(). + * + * mutex_tryenter() is similar to mutex_enter() but returns zero if + * the lock cannot be acquired, nonzero on success. + * + * If mutex_exit() gets preempted in the window between checking waiters + * and clearing the lock, we can miss wakeups. Disabling preemption + * in the mutex code is prohibitively expensive, so instead we detect + * mutex preemption by examining the trapped PC in the interrupt path. + * If we interrupt a thread in mutex_exit() that has not yet cleared + * the lock, cmnint() resets its PC back to the beginning of + * mutex_exit() so it will check again for waiters when it resumes. + * + * The lockstat code below is activated when the lockstat driver + * calls lockstat_hot_patch() to hot-patch the kernel mutex code. + * Note that we don't need to test lockstat_event_mask here -- we won't + * patch this code in unless we're gathering ADAPTIVE_HOLD lockstats. 
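+ *
+ * For orientation, the uncontended fast paths implemented below amount to
+ * this C sketch (names are illustrative; anything complicated goes to
+ * mutex_vector_enter() / mutex_vector_exit() as described above):
+ *
+ *	static int
+ *	sketch_mutex_enter(volatile uintptr_t *lp, uintptr_t curthread)
+ *	{
+ *		uintptr_t exp = 0;	// 0 == unheld adaptive mutex
+ *		// lock cmpxchgq: install curthread as owner iff unheld
+ *		return (__atomic_compare_exchange_n(lp, &exp, curthread,
+ *		    0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED));
+ *	}
+ *
+ *	static void
+ *	sketch_mutex_exit(volatile uintptr_t *lp, uintptr_t curthread)
+ *	{
+ *		if (*lp == curthread)	// held by us, no waiters recorded
+ *			*lp = 0;	// clear owner and lock together
+ *		// else: punt to the slow path
+ *	}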
+ */ + + ENTRY_NP(mutex_enter) + movq %gs:CPU_THREAD, %rdx /* rdx = thread ptr */ + xorl %eax, %eax /* rax = 0 (unheld adaptive) */ + lock + cmpxchgq %rdx, (%rdi) + jnz mutex_vector_enter +.mutex_enter_lockstat_patch_point: +#if defined(OPTERON_WORKAROUND_6323525) +.mutex_enter_6323525_patch_point: + ret /* nop space for lfence */ + nop + nop +.mutex_enter_lockstat_6323525_patch_point: /* new patch point if lfence */ + nop +#else /* OPTERON_WORKAROUND_6323525 */ + ret +#endif /* OPTERON_WORKAROUND_6323525 */ + movq %rdi, %rsi + movl $LS_MUTEX_ENTER_ACQUIRE, %edi +/* + * expects %rdx=thread, %rsi=lock, %edi=lockstat event + */ + ALTENTRY(lockstat_wrapper) + incb T_LOCKSTAT(%rdx) /* curthread->t_lockstat++ */ + leaq lockstat_probemap(%rip), %rax + movl (%rax, %rdi, DTRACE_IDSIZE), %eax + testl %eax, %eax /* check for non-zero probe */ + jz 1f + pushq %rbp /* align stack properly */ + movq %rsp, %rbp + movl %eax, %edi + movq lockstat_probe, %rax + INDIRECT_CALL_REG(rax) + leave /* unwind stack */ +1: + movq %gs:CPU_THREAD, %rdx /* reload thread ptr */ + decb T_LOCKSTAT(%rdx) /* curthread->t_lockstat-- */ + movl $1, %eax /* return success if tryenter */ + ret + SET_SIZE(lockstat_wrapper) + SET_SIZE(mutex_enter) + +/* + * expects %rcx=thread, %rdx=arg, %rsi=lock, %edi=lockstat event + */ + ENTRY(lockstat_wrapper_arg) + incb T_LOCKSTAT(%rcx) /* curthread->t_lockstat++ */ + leaq lockstat_probemap(%rip), %rax + movl (%rax, %rdi, DTRACE_IDSIZE), %eax + testl %eax, %eax /* check for non-zero probe */ + jz 1f + pushq %rbp /* align stack properly */ + movq %rsp, %rbp + movl %eax, %edi + movq lockstat_probe, %rax + INDIRECT_CALL_REG(rax) + leave /* unwind stack */ +1: + movq %gs:CPU_THREAD, %rdx /* reload thread ptr */ + decb T_LOCKSTAT(%rdx) /* curthread->t_lockstat-- */ + movl $1, %eax /* return success if tryenter */ + ret + SET_SIZE(lockstat_wrapper_arg) + + + ENTRY(mutex_tryenter) + movq %gs:CPU_THREAD, %rdx /* rdx = thread ptr */ + xorl %eax, %eax /* rax = 0 (unheld adaptive) */ + lock + cmpxchgq %rdx, (%rdi) + jnz mutex_vector_tryenter + not %eax /* return success (nonzero) */ +#if defined(OPTERON_WORKAROUND_6323525) +.mutex_tryenter_lockstat_patch_point: +.mutex_tryenter_6323525_patch_point: + ret /* nop space for lfence */ + nop + nop +.mutex_tryenter_lockstat_6323525_patch_point: /* new patch point if lfence */ + nop +#else /* OPTERON_WORKAROUND_6323525 */ +.mutex_tryenter_lockstat_patch_point: + ret +#endif /* OPTERON_WORKAROUND_6323525 */ + movq %rdi, %rsi + movl $LS_MUTEX_ENTER_ACQUIRE, %edi + jmp lockstat_wrapper + SET_SIZE(mutex_tryenter) + + ENTRY(mutex_adaptive_tryenter) + movq %gs:CPU_THREAD, %rdx /* rdx = thread ptr */ + xorl %eax, %eax /* rax = 0 (unheld adaptive) */ + lock + cmpxchgq %rdx, (%rdi) + jnz 0f + not %eax /* return success (nonzero) */ +#if defined(OPTERON_WORKAROUND_6323525) +.mutex_atryenter_6323525_patch_point: + ret /* nop space for lfence */ + nop + nop + nop +#else /* OPTERON_WORKAROUND_6323525 */ + ret +#endif /* OPTERON_WORKAROUND_6323525 */ +0: + xorl %eax, %eax /* return failure */ + ret + SET_SIZE(mutex_adaptive_tryenter) + + .globl mutex_owner_running_critical_start + + ENTRY(mutex_owner_running) +mutex_owner_running_critical_start: + movq (%rdi), %r11 /* get owner field */ + andq $MUTEX_THREAD, %r11 /* remove waiters bit */ + cmpq $0, %r11 /* if free, skip */ + je 1f /* go return 0 */ + movq T_CPU(%r11), %r8 /* get owner->t_cpu */ + movq CPU_THREAD(%r8), %r9 /* get t_cpu->cpu_thread */ +.mutex_owner_running_critical_end: + cmpq %r11, %r9 /* owner == running 
thread? */ + je 2f /* yes, go return cpu */ +1: + xorq %rax, %rax /* return 0 */ + ret +2: + movq %r8, %rax /* return cpu */ + ret + SET_SIZE(mutex_owner_running) + + .globl mutex_owner_running_critical_size + .type mutex_owner_running_critical_size, @object + .align CPTRSIZE +mutex_owner_running_critical_size: + .quad .mutex_owner_running_critical_end - mutex_owner_running_critical_start + SET_SIZE(mutex_owner_running_critical_size) + + .globl mutex_exit_critical_start + + ENTRY(mutex_exit) +mutex_exit_critical_start: /* If interrupted, restart here */ + movq %gs:CPU_THREAD, %rdx + cmpq %rdx, (%rdi) + jne mutex_vector_exit /* wrong type or wrong owner */ + movq $0, (%rdi) /* clear owner AND lock */ +.mutex_exit_critical_end: +.mutex_exit_lockstat_patch_point: + ret + movq %rdi, %rsi + movl $LS_MUTEX_EXIT_RELEASE, %edi + jmp lockstat_wrapper + SET_SIZE(mutex_exit) + + .globl mutex_exit_critical_size + .type mutex_exit_critical_size, @object + .align CPTRSIZE +mutex_exit_critical_size: + .quad .mutex_exit_critical_end - mutex_exit_critical_start + SET_SIZE(mutex_exit_critical_size) + +/* + * rw_enter() and rw_exit(). + * + * These routines handle the simple cases of rw_enter (write-locking an unheld + * lock or read-locking a lock that's neither write-locked nor write-wanted) + * and rw_exit (no waiters or not the last reader). If anything complicated + * is going on we punt to rw_enter_sleep() and rw_exit_wakeup(), respectively. + */ + + ENTRY(rw_enter) + cmpl $RW_WRITER, %esi + je .rw_write_enter + movq (%rdi), %rax /* rax = old rw_wwwh value */ + testl $RW_WRITE_LOCKED|RW_WRITE_WANTED, %eax + jnz rw_enter_sleep + leaq RW_READ_LOCK(%rax), %rdx /* rdx = new rw_wwwh value */ + lock + cmpxchgq %rdx, (%rdi) /* try to grab read lock */ + jnz rw_enter_sleep +.rw_read_enter_lockstat_patch_point: + ret + movq %gs:CPU_THREAD, %rcx /* rcx = thread ptr */ + movq %rdi, %rsi /* rsi = lock ptr */ + movl $LS_RW_ENTER_ACQUIRE, %edi + movl $RW_READER, %edx + jmp lockstat_wrapper_arg +.rw_write_enter: + movq %gs:CPU_THREAD, %rdx + orq $RW_WRITE_LOCKED, %rdx /* rdx = write-locked value */ + xorl %eax, %eax /* rax = unheld value */ + lock + cmpxchgq %rdx, (%rdi) /* try to grab write lock */ + jnz rw_enter_sleep + +#if defined(OPTERON_WORKAROUND_6323525) +.rw_write_enter_lockstat_patch_point: +.rw_write_enter_6323525_patch_point: + ret + nop + nop +.rw_write_enter_lockstat_6323525_patch_point: + nop +#else /* OPTERON_WORKAROUND_6323525 */ +.rw_write_enter_lockstat_patch_point: + ret +#endif /* OPTERON_WORKAROUND_6323525 */ + + movq %gs:CPU_THREAD, %rcx /* rcx = thread ptr */ + movq %rdi, %rsi /* rsi = lock ptr */ + movl $LS_RW_ENTER_ACQUIRE, %edi + movl $RW_WRITER, %edx + jmp lockstat_wrapper_arg + SET_SIZE(rw_enter) + + ENTRY(rw_exit) + movq (%rdi), %rax /* rax = old rw_wwwh value */ + cmpl $RW_READ_LOCK, %eax /* single-reader, no waiters? */ + jne .rw_not_single_reader + xorl %edx, %edx /* rdx = new value (unheld) */ +.rw_read_exit: + lock + cmpxchgq %rdx, (%rdi) /* try to drop read lock */ + jnz rw_exit_wakeup +.rw_read_exit_lockstat_patch_point: + ret + movq %gs:CPU_THREAD, %rcx /* rcx = thread ptr */ + movq %rdi, %rsi /* rsi = lock ptr */ + movl $LS_RW_EXIT_RELEASE, %edi + movl $RW_READER, %edx + jmp lockstat_wrapper_arg +.rw_not_single_reader: + testl $RW_WRITE_LOCKED, %eax /* write-locked or write-wanted? 
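+ *
+ * (Recall the rw_wwwh encoding: writer and waiter state live in the low
+ * flag bits -- RW_WRITE_LOCKED, RW_WRITE_WANTED, RW_HAS_WAITERS -- and
+ * readers are counted above them in units of RW_READ_LOCK, which is why
+ * the single-reader test above compares against RW_READ_LOCK and a read
+ * release subtracts it.  The read-enter fast path earlier is, in rough C,
+ * purely for illustration:
+ *
+ *	static int
+ *	sketch_rw_read_enter(volatile uintptr_t *lp)
+ *	{
+ *		uintptr_t old = *lp;
+ *		if (old & (RW_WRITE_LOCKED | RW_WRITE_WANTED))
+ *			return (0);	// caller falls back to rw_enter_sleep
+ *		return (__atomic_compare_exchange_n(lp, &old,
+ *		    old + RW_READ_LOCK, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED));
+ *	}
+ * )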
*/ + jnz .rw_write_exit + leaq -RW_READ_LOCK(%rax), %rdx /* rdx = new value */ + cmpl $RW_READ_LOCK, %edx + jge .rw_read_exit /* not last reader, safe to drop */ + jmp rw_exit_wakeup /* last reader with waiters */ +.rw_write_exit: + movq %gs:CPU_THREAD, %rax /* rax = thread ptr */ + xorl %edx, %edx /* rdx = new value (unheld) */ + orq $RW_WRITE_LOCKED, %rax /* eax = write-locked value */ + lock + cmpxchgq %rdx, (%rdi) /* try to drop read lock */ + jnz rw_exit_wakeup +.rw_write_exit_lockstat_patch_point: + ret + movq %gs:CPU_THREAD, %rcx /* rcx = thread ptr */ + movq %rdi, %rsi /* rsi - lock ptr */ + movl $LS_RW_EXIT_RELEASE, %edi + movl $RW_WRITER, %edx + jmp lockstat_wrapper_arg + SET_SIZE(rw_exit) + +#if defined(OPTERON_WORKAROUND_6323525) + +/* + * If it is necessary to patch the lock enter routines with the lfence + * workaround, workaround_6323525_patched is set to a non-zero value so that + * the lockstat_hat_patch routine can patch to the new location of the 'ret' + * instruction. + */ + DGDEF3(workaround_6323525_patched, 4, 4) + .long 0 + +#define HOT_MUTEX_PATCH(srcaddr, dstaddr, size) \ + movq $size, %rbx; \ + movq $dstaddr, %r13; \ + addq %rbx, %r13; \ + movq $srcaddr, %r12; \ + addq %rbx, %r12; \ +0: \ + decq %r13; \ + decq %r12; \ + movzbl (%r12), %esi; \ + movq $1, %rdx; \ + movq %r13, %rdi; \ + call hot_patch_kernel_text; \ + decq %rbx; \ + testq %rbx, %rbx; \ + jg 0b; + +/* + * patch_workaround_6323525: provide workaround for 6323525 + * + * The workaround is to place a fencing instruction (lfence) between the + * mutex operation and the subsequent read-modify-write instruction. + * + * This routine hot patches the lfence instruction on top of the space + * reserved by nops in the lock enter routines. + */ + ENTRY_NP(patch_workaround_6323525) + pushq %rbp + movq %rsp, %rbp + pushq %r12 + pushq %r13 + pushq %rbx + + /* + * lockstat_hot_patch() to use the alternate lockstat workaround + * 6323525 patch points (points past the lfence instruction to the + * new ret) when workaround_6323525_patched is set. + */ + movl $1, workaround_6323525_patched + + /* + * patch ret/nop/nop/nop to lfence/ret at the end of the lock enter + * routines. The 4 bytes are patched in reverse order so that the + * the existing ret is overwritten last. This provides lock enter + * sanity during the intermediate patching stages. 
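+	 *
+	 * In C terms, each HOT_MUTEX_PATCH() invocation below behaves
+	 * roughly like this sketch (hot_patch_kernel_text() is the real
+	 * primitive; the wrapper name and types here are illustrative):
+	 *
+	 *	static void
+	 *	sketch_hot_mutex_patch(caddr_t dst, const uchar_t *src,
+	 *	    size_t size)
+	 *	{
+	 *		// copy src over dst one byte at a time, last byte
+	 *		// first, so the original ret is overwritten last
+	 *		for (ssize_t i = (ssize_t)size - 1; i >= 0; i--)
+	 *			hot_patch_kernel_text(dst + i, src[i], 1);
+	 *	}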
+ */ + HOT_MUTEX_PATCH(_lfence_insn, .mutex_enter_6323525_patch_point, 4) + HOT_MUTEX_PATCH(_lfence_insn, .mutex_tryenter_6323525_patch_point, 4) + HOT_MUTEX_PATCH(_lfence_insn, .mutex_atryenter_6323525_patch_point, 4) + HOT_MUTEX_PATCH(_lfence_insn, .rw_write_enter_6323525_patch_point, 4) + + popq %rbx + popq %r13 + popq %r12 + movq %rbp, %rsp + popq %rbp + ret +_lfence_insn: + lfence + ret + SET_SIZE(patch_workaround_6323525) + + +#endif /* OPTERON_WORKAROUND_6323525 */ + + +#define HOT_PATCH(addr, event, active_instr, normal_instr, len) \ + movq $normal_instr, %rsi; \ + movq $active_instr, %rdi; \ + leaq lockstat_probemap(%rip), %rax; \ + movl _MUL(event, DTRACE_IDSIZE)(%rax), %eax; \ + testl %eax, %eax; \ + jz 9f; \ + movq %rdi, %rsi; \ +9: \ + movq $len, %rdx; \ + movq $addr, %rdi; \ + call hot_patch_kernel_text + + ENTRY(lockstat_hot_patch) + pushq %rbp /* align stack properly */ + movq %rsp, %rbp + +#if defined(OPTERON_WORKAROUND_6323525) + cmpl $0, workaround_6323525_patched + je 1f + HOT_PATCH(.mutex_enter_lockstat_6323525_patch_point, + LS_MUTEX_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.mutex_tryenter_lockstat_6323525_patch_point, + LS_MUTEX_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.rw_write_enter_lockstat_6323525_patch_point, + LS_RW_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + jmp 2f +1: + HOT_PATCH(.mutex_enter_lockstat_patch_point, + LS_MUTEX_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.mutex_tryenter_lockstat_patch_point, + LS_MUTEX_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.rw_write_enter_lockstat_patch_point, + LS_RW_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) +2: +#else /* OPTERON_WORKAROUND_6323525 */ + HOT_PATCH(.mutex_enter_lockstat_patch_point, + LS_MUTEX_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.mutex_tryenter_lockstat_patch_point, + LS_MUTEX_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.rw_write_enter_lockstat_patch_point, + LS_RW_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) +#endif /* !OPTERON_WORKAROUND_6323525 */ + HOT_PATCH(.mutex_exit_lockstat_patch_point, + LS_MUTEX_EXIT_RELEASE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.rw_read_enter_lockstat_patch_point, + LS_RW_ENTER_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.rw_write_exit_lockstat_patch_point, + LS_RW_EXIT_RELEASE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.rw_read_exit_lockstat_patch_point, + LS_RW_EXIT_RELEASE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.lock_set_lockstat_patch_point, + LS_LOCK_SET_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.lock_try_lockstat_patch_point, + LS_LOCK_TRY_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.lock_clear_lockstat_patch_point, + LS_LOCK_CLEAR_RELEASE, NOP_INSTR, RET_INSTR, 1) + HOT_PATCH(.lock_set_spl_lockstat_patch_point, + LS_LOCK_SET_SPL_ACQUIRE, NOP_INSTR, RET_INSTR, 1) + + HOT_PATCH(LOCK_CLEAR_SPLX_LOCKSTAT_PATCH_POINT, + LS_LOCK_CLEAR_SPLX_RELEASE, + LOCK_CLEAR_SPLX_LOCKSTAT_PATCH_VAL, 0, 1); + leave /* unwind stack */ + ret + SET_SIZE(lockstat_hot_patch) + + ENTRY(membar_enter) + ALTENTRY(membar_exit) + ALTENTRY(membar_sync) + mfence /* lighter weight than lock; xorq $0,(%rsp) */ + ret + SET_SIZE(membar_sync) + SET_SIZE(membar_exit) + SET_SIZE(membar_enter) + + ENTRY(membar_producer) + sfence + ret + SET_SIZE(membar_producer) + + ENTRY(membar_consumer) + lfence + ret + SET_SIZE(membar_consumer) + +/* + * thread_onproc() + * Set thread in onproc state for the specified CPU. + * Also set the thread lock pointer to the CPU's onproc lock. + * Since the new lock isn't held, the store ordering is important. 
+ * If not done in assembler, the compiler could reorder the stores. + */ + + ENTRY(thread_onproc) + addq $CPU_THREAD_LOCK, %rsi /* pointer to disp_lock while running */ + movl $ONPROC_THREAD, T_STATE(%rdi) /* set state to TS_ONPROC */ + movq %rsi, T_LOCKP(%rdi) /* store new lock pointer */ + ret + SET_SIZE(thread_onproc) + +/* + * mutex_delay_default(void) + * Spins for approx a few hundred processor cycles and returns to caller. + */ + + ENTRY(mutex_delay_default) + movq $92,%r11 +0: decq %r11 + jg 0b + ret + SET_SIZE(mutex_delay_default) + diff --git a/usr/src/uts/intel/ml/modstubs.s b/usr/src/uts/intel/ml/modstubs.s new file mode 100644 index 0000000000..4143c181a3 --- /dev/null +++ b/usr/src/uts/intel/ml/modstubs.s @@ -0,0 +1,1320 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/asm_linkage.h> + +#include "assym.h" + +/* + * !!!!!!!! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! !!!!!!!! + * + * For functions which are either STUBs or WSTUBs the actual function + * need to be called using 'call' instruction because of preamble and + * postamble (i.e mod_hold_stub and mod_release_stub) around the + * function call. Due to this we need to copy arguments for the + * real function. On Intel we can't tell how many arguments are there + * on the stack so we have to either copy everything between esp and + * ebp or copy only a fixed number (MAXNARG - defined here) for + * all the stub functions. Currently we are using MAXNARG (it is a kludge + * but worth it?!). + * + * NOTE: Use NO_UNLOAD_STUBs if the module is NOT unloadable once it is + * loaded. + */ +#define MAXNARG 10 + +/* + * WARNING: there is no check for forgetting to write END_MODULE, + * and if you do, the kernel will most likely crash. Be careful + * + * This file assumes that all of the contributions to the data segment + * will be contiguous in the output file, even though they are separated + * by pieces of text. This is safe for all assemblers I know of now... + */ + +/* + * This file uses ansi preprocessor features: + * + * 1. #define mac(a) extra_ ## a --> mac(x) expands to extra_a + * The old version of this is + * #define mac(a) extra_/.*.*./a + * but this fails if the argument has spaces "mac ( x )" + * (Ignore the dots above, I had to put them in to keep this a comment.) + * + * 2. #define mac(a) #a --> mac(x) expands to "x" + * The old version is + * #define mac(a) "a" + * + * For some reason, the 5.0 preprocessor isn't happy with the above usage. + * For now, we're not using these ansi features. 
+ * + * The reason is that "the 5.0 ANSI preprocessor" is built into the compiler + * and is a tokenizing preprocessor. This means, when confronted by something + * other than C token generation rules, strange things occur. In this case, + * when confronted by an assembly file, it would turn the token ".globl" into + * two tokens "." and "globl". For this reason, the traditional, non-ANSI + * preprocessor is used on assembly files. + * + * It would be desirable to have a non-tokenizing cpp (accp?) to use for this. + */ + +/* + * This file contains the stubs routines for modules which can be autoloaded. + */ + +/* + * See the 'struct mod_modinfo' definition to see what this declaration + * is trying to achieve here. + */ +#define MODULE(module,namespace) \ + .data; \ +module/**/_modname: \ + .string "namespace/module"; \ + SET_SIZE(module/**/_modname); \ + .align CPTRSIZE; \ + .globl module/**/_modinfo; \ + .type module/**/_modinfo, @object; \ +module/**/_modinfo: \ + .quad module/**/_modname; \ + .quad 0 /* storage for modctl pointer */ + + /* then mod_stub_info structures follow until a mods_func_adr is 0 */ + +/* this puts a 0 where the next mods_func_adr would be */ +#define END_MODULE(module) \ + .data; \ + .align CPTRSIZE; \ + .quad 0; \ + SET_SIZE(module/**/_modinfo) + +/* + * The data section in the stub_common macro is the + * mod_stub_info structure for the stub function + */ + +#define STUB_COMMON(module, fcnname, install_fcn, retfcn, weak) \ + ENTRY(fcnname); \ + leaq fcnname/**/_info(%rip), %rax; \ + cmpl $0, MODS_FLAG(%rax); /* weak? */ \ + je stubs_common_code; /* not weak */ \ + testb $MODS_INSTALLED, MODS_FLAG(%rax); /* installed? */ \ + jne stubs_common_code; /* yes, do the mod_hold */ \ + movq MODS_RETFCN(%rax), %rax; /* no, load retfcn */ \ + INDIRECT_JMP_REG(rax); /* no, jump to retfcn */ \ + SET_SIZE(fcnname); \ + .data; \ + .align CPTRSIZE; \ + .type fcnname/**/_info, @object; \ +fcnname/**/_info: \ + .quad install_fcn; /* 0 */ \ + .quad module/**/_modinfo; /* 0x8 */ \ + .quad fcnname; /* 0x10 */ \ + .quad retfcn; /* 0x18 */ \ + .long weak; /* 0x20 */ \ + SET_SIZE(fcnname/**/_info) + +#define STUB_NO_UNLOADABLE(module, fcnname, install_fcn, retfcn, weak) \ + ENTRY(fcnname); \ + leaq fcnname/**/_info(%rip), %rax; \ + testb $MODS_INSTALLED, MODS_FLAG(%rax); /* installed? */ \ + je 5f; /* no */ \ + movq MODS_INSTFCN(%rax), %rax; /* yes, load install_fcn */ \ + INDIRECT_JMP_REG(rax); /* yes, jump to install_fcn */ \ +5: testb $MODS_WEAK, MODS_FLAG(%rax); /* weak? 
*/ \ + je stubs_common_code; /* no, do mod load */ \ + movq MODS_RETFCN(%rax), %rax; /* yes, load retfcn */ \ + INDIRECT_JMP_REG(rax); /* yes, jump to retfcn */ \ + SET_SIZE(fcnname); \ + .data; \ + .align CPTRSIZE; \ + .type fcnname/**/_info, @object; \ +fcnname/**/_info: \ + .quad install_fcn; /* 0 */ \ + .quad module/**/_modinfo; /* 0x8 */ \ + .quad fcnname; /* 0x10 */ \ + .quad retfcn; /* 0x18 */ \ + .long weak; /* 0x20 */ \ + SET_SIZE(fcnname/**/_info) + +/* + * We branch here with the fcnname_info pointer in %rax + */ + ENTRY_NP(stubs_common_code) + .globl mod_hold_stub + .globl mod_release_stub + pushq %rbp + movq %rsp, %rbp + subq $0x10, %rsp + movq %r15, (%rsp) /* (caller saved) */ + movq %rax, %r15 /* stash the fcnname_info pointer */ + /* + * save incoming register arguments + */ + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %r8 + pushq %r9 + /* (next 4 args, if any, are already on the stack above %rbp) */ + movq %r15, %rdi + call mod_hold_stub /* mod_hold_stub(mod_stub_info *) */ + cmpl $-1, %eax /* error? */ + jne .L1 + movq 0x18(%r15), %rax + INDIRECT_CALL_REG(rax) + addq $0x30, %rsp + jmp .L2 +.L1: + /* + * copy MAXNARG == 10 incoming arguments + */ + popq %r9 + popq %r8 + popq %rcx + popq %rdx + popq %rsi + popq %rdi + /* + * stack: + * arg9 0x38(%rsp) + * arg8 0x30(%rsp) + * arg7 0x28(%rsp) + * arg6 0x20(%rsp) + * saved %rip 0x18(%rsp) + * saved %rbp 0x10(%rsp) + * <pad> 0x8(%rsp) + * saved %r15 0x0(%rsp) + */ + movl $MAXNARG - 6 + 3, %r11d + pushq (%rsp, %r11, 8) + pushq (%rsp, %r11, 8) + pushq (%rsp, %r11, 8) + pushq (%rsp, %r11, 8) + movq (%r15), %rax + INDIRECT_CALL_REG(rax) /* call the stub fn(arg, ..) */ + addq $0x20, %rsp /* pop off last 4 args */ + pushq %rax /* save any return values */ + pushq %rdx + movq %r15, %rdi + call mod_release_stub /* release hold on module */ + popq %rdx /* restore return values */ + popq %rax +.L2: + popq %r15 + leave + ret + SET_SIZE(stubs_common_code) + +#define STUB(module, fcnname, retfcn) \ + STUB_COMMON(module, fcnname, mod_hold_stub, retfcn, 0) + +/* + * "weak stub", don't load on account of this call + */ +#define WSTUB(module, fcnname, retfcn) \ + STUB_COMMON(module, fcnname, retfcn, retfcn, MODS_WEAK) + +/* + * "non-unloadable stub", don't bother 'holding' module if it's already loaded + * since the module cannot be unloaded. + * + * User *MUST* guarantee the module is not unloadable (no _fini routine). + */ +#define NO_UNLOAD_STUB(module, fcnname, retfcn) \ + STUB_NO_UNLOADABLE(module, fcnname, retfcn, retfcn, MODS_NOUNLOAD) + +/* + * "weak stub" for non-unloadable module, don't load on account of this call + */ +#define NO_UNLOAD_WSTUB(module, fcnname, retfcn) \ + STUB_NO_UNLOADABLE(module, fcnname, retfcn, retfcn, MODS_NOUNLOAD|MODS_WEAK) + +/* + * this is just a marker for the beginning area of text that contains stubs + */ + ENTRY_NP(stubs_base) + nop + +/* + * WARNING WARNING WARNING!!!!!! + * + * On the MODULE macro you MUST NOT use any spaces!!! They are + * significant to the preprocessor. With ansi c there is a way around this + * but for some reason (yet to be investigated) ansi didn't work for other + * reasons! + * + * When zero is used as the return function, the system will call + * panic if the stub can't be resolved. + */ + +/* + * Stubs for devfs. A non-unloadable module. 
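+ *
+ * (As a reminder of the machinery above: every stub generated by these
+ * macros funnels through stubs_common_code, whose control flow is roughly
+ * this C sketch.  Argument copying is glossed over, and call_retfcn() /
+ * call_instfcn() are illustrative stand-ins for indirect calls through
+ * the stub's MODS_RETFCN and MODS_INSTFCN slots:
+ *
+ *	uintptr_t
+ *	sketch_stub_dispatch(struct mod_stub_info *stub)
+ *	{
+ *		uintptr_t rv;
+ *
+ *		if (mod_hold_stub(stub) == -1)	// module could not be loaded
+ *			return (call_retfcn(stub));
+ *		rv = call_instfcn(stub);	// real function, resolved now
+ *		mod_release_stub(stub);
+ *		return (rv);
+ *	}
+ * )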
+ */ + +#ifndef DEVFS_MODULE + MODULE(devfs,fs); + NO_UNLOAD_STUB(devfs, devfs_clean, nomod_minus_one); + NO_UNLOAD_STUB(devfs, devfs_lookupname, nomod_minus_one); + NO_UNLOAD_STUB(devfs, devfs_walk, nomod_minus_one); + NO_UNLOAD_STUB(devfs, devfs_devpolicy, nomod_minus_one); + NO_UNLOAD_STUB(devfs, devfs_reset_perm, nomod_minus_one); + NO_UNLOAD_STUB(devfs, devfs_remdrv_cleanup, nomod_minus_one); + END_MODULE(devfs); +#endif + +#ifndef DEV_MODULE + MODULE(dev,fs); + NO_UNLOAD_STUB(dev, sdev_modctl_readdir, nomod_minus_one); + NO_UNLOAD_STUB(dev, sdev_modctl_readdir_free, nomod_minus_one); + NO_UNLOAD_STUB(dev, devname_filename_register, nomod_minus_one); + NO_UNLOAD_STUB(dev, sdev_modctl_devexists, nomod_minus_one); + NO_UNLOAD_STUB(dev, devname_profile_update, nomod_minus_one); + NO_UNLOAD_STUB(dev, sdev_devstate_change, nomod_minus_one); + NO_UNLOAD_STUB(dev, devvt_getvnodeops, nomod_minus_one); + NO_UNLOAD_STUB(dev, devpts_getvnodeops, nomod_zero); + END_MODULE(dev); +#endif + +/* + * Stubs for specfs. A non-unloadable module. + */ + +#ifndef SPEC_MODULE + MODULE(specfs,fs); + NO_UNLOAD_STUB(specfs, common_specvp, nomod_zero); + NO_UNLOAD_STUB(specfs, makectty, nomod_zero); + NO_UNLOAD_STUB(specfs, makespecvp, nomod_zero); + NO_UNLOAD_STUB(specfs, smark, nomod_zero); + NO_UNLOAD_STUB(specfs, spec_segmap, nomod_einval); + NO_UNLOAD_STUB(specfs, specfind, nomod_zero); + NO_UNLOAD_STUB(specfs, specvp, nomod_zero); + NO_UNLOAD_STUB(specfs, devi_stillreferenced, nomod_zero); + NO_UNLOAD_STUB(specfs, spec_getvnodeops, nomod_zero); + NO_UNLOAD_STUB(specfs, spec_char_map, nomod_zero); + NO_UNLOAD_STUB(specfs, specvp_devfs, nomod_zero); + NO_UNLOAD_STUB(specfs, spec_assoc_vp_with_devi, nomod_void); + NO_UNLOAD_STUB(specfs, spec_hold_devi_by_vp, nomod_zero); + NO_UNLOAD_STUB(specfs, spec_snode_walk, nomod_void); + NO_UNLOAD_STUB(specfs, spec_devi_open_count, nomod_minus_one); + NO_UNLOAD_STUB(specfs, spec_is_clone, nomod_zero); + NO_UNLOAD_STUB(specfs, spec_is_selfclone, nomod_zero); + NO_UNLOAD_STUB(specfs, spec_fence_snode, nomod_minus_one); + NO_UNLOAD_STUB(specfs, spec_unfence_snode, nomod_minus_one); + END_MODULE(specfs); +#endif + + +/* + * Stubs for sockfs. A non-unloadable module. 
+ */ +#ifndef SOCK_MODULE + MODULE(sockfs,fs); + NO_UNLOAD_STUB(sockfs, so_socket, nomod_zero); + NO_UNLOAD_STUB(sockfs, so_socketpair, nomod_zero); + NO_UNLOAD_STUB(sockfs, bind, nomod_zero); + NO_UNLOAD_STUB(sockfs, listen, nomod_zero); + NO_UNLOAD_STUB(sockfs, accept, nomod_zero); + NO_UNLOAD_STUB(sockfs, connect, nomod_zero); + NO_UNLOAD_STUB(sockfs, shutdown, nomod_zero); + NO_UNLOAD_STUB(sockfs, recv, nomod_zero); + NO_UNLOAD_STUB(sockfs, recvfrom, nomod_zero); + NO_UNLOAD_STUB(sockfs, recvmsg, nomod_zero); + NO_UNLOAD_STUB(sockfs, send, nomod_zero); + NO_UNLOAD_STUB(sockfs, sendmsg, nomod_zero); + NO_UNLOAD_STUB(sockfs, sendto, nomod_zero); +#ifdef _SYSCALL32_IMPL + NO_UNLOAD_STUB(sockfs, recv32, nomod_zero); + NO_UNLOAD_STUB(sockfs, recvfrom32, nomod_zero); + NO_UNLOAD_STUB(sockfs, send32, nomod_zero); + NO_UNLOAD_STUB(sockfs, sendto32, nomod_zero); +#endif /* _SYSCALL32_IMPL */ + NO_UNLOAD_STUB(sockfs, getpeername, nomod_zero); + NO_UNLOAD_STUB(sockfs, getsockname, nomod_zero); + NO_UNLOAD_STUB(sockfs, getsockopt, nomod_zero); + NO_UNLOAD_STUB(sockfs, setsockopt, nomod_zero); + NO_UNLOAD_STUB(sockfs, sockconfig, nomod_zero); + NO_UNLOAD_STUB(sockfs, sock_getmsg, nomod_zero); + NO_UNLOAD_STUB(sockfs, sock_putmsg, nomod_zero); + NO_UNLOAD_STUB(sockfs, sosendfile64, nomod_zero); + NO_UNLOAD_STUB(sockfs, snf_segmap, nomod_einval); + NO_UNLOAD_STUB(sockfs, sock_getfasync, nomod_zero); + NO_UNLOAD_STUB(sockfs, nl7c_sendfilev, nomod_zero); + NO_UNLOAD_STUB(sockfs, sotpi_sototpi, nomod_zero); + NO_UNLOAD_STUB(sockfs, socket_sendmblk, nomod_zero); + NO_UNLOAD_STUB(sockfs, socket_setsockopt, nomod_zero); + END_MODULE(sockfs); +#endif + +/* + * IPsec stubs. + */ + +#ifndef IPSECAH_MODULE + MODULE(ipsecah,drv); + WSTUB(ipsecah, ipsec_construct_inverse_acquire, nomod_zero); + WSTUB(ipsecah, sadb_acquire, nomod_zero); + WSTUB(ipsecah, ipsecah_algs_changed, nomod_zero); + WSTUB(ipsecah, sadb_alg_update, nomod_zero); + WSTUB(ipsecah, sadb_unlinkassoc, nomod_zero); + WSTUB(ipsecah, sadb_insertassoc, nomod_zero); + WSTUB(ipsecah, ipsecah_in_assocfailure, nomod_zero); + WSTUB(ipsecah, sadb_set_lpkt, nomod_zero); + WSTUB(ipsecah, ipsecah_icmp_error, nomod_zero); + END_MODULE(ipsecah); +#endif + +#ifndef IPSECESP_MODULE + MODULE(ipsecesp,drv); + WSTUB(ipsecesp, ipsecesp_fill_defs, nomod_zero); + WSTUB(ipsecesp, ipsecesp_algs_changed, nomod_zero); + WSTUB(ipsecesp, ipsecesp_in_assocfailure, nomod_zero); + WSTUB(ipsecesp, ipsecesp_init_funcs, nomod_zero); + WSTUB(ipsecesp, ipsecesp_icmp_error, nomod_zero); + WSTUB(ipsecesp, ipsecesp_send_keepalive, nomod_zero); + END_MODULE(ipsecesp); +#endif + +#ifndef KEYSOCK_MODULE + MODULE(keysock, drv); + WSTUB(keysock, keysock_spdsock_wput_iocdata, nomod_void); + WSTUB(keysock, keysock_plumb_ipsec, nomod_zero); + WSTUB(keysock, keysock_extended_reg, nomod_zero); + WSTUB(keysock, keysock_next_seq, nomod_zero); + END_MODULE(keysock); +#endif + +#ifndef SPDSOCK_MODULE + MODULE(spdsock,drv); + WSTUB(spdsock, spdsock_update_pending_algs, nomod_zero); + END_MODULE(spdsock); +#endif + +/* + * Stubs for nfs common code. + * XXX nfs_getvnodeops should go away with removal of kludge in vnode.c + */ +#ifndef NFS_MODULE + MODULE(nfs,fs); + WSTUB(nfs, nfs_getvnodeops, nomod_zero); + WSTUB(nfs, nfs_perror, nomod_zero); + WSTUB(nfs, nfs_cmn_err, nomod_zero); + WSTUB(nfs, clcleanup_zone, nomod_zero); + WSTUB(nfs, clcleanup4_zone, nomod_zero); + END_MODULE(nfs); +#endif + + +/* + * Stubs for nfs_dlboot (diskless booting). 
+ */ +#ifndef NFS_DLBOOT_MODULE + MODULE(nfs_dlboot,misc); + STUB(nfs_dlboot, mount_root, nomod_minus_one); + STUB(nfs_dlboot, dhcpinit, nomod_minus_one); + END_MODULE(nfs_dlboot); +#endif + +/* + * Stubs for nfs server-only code. + */ +#ifndef NFSSRV_MODULE + MODULE(nfssrv,misc); + STUB(nfssrv, exportfs, nomod_minus_one); + STUB(nfssrv, nfs_getfh, nomod_minus_one); + STUB(nfssrv, nfsl_flush, nomod_minus_one); + STUB(nfssrv, rfs4_check_delegated, nomod_zero); + STUB(nfssrv, mountd_args, nomod_minus_one); + NO_UNLOAD_STUB(nfssrv, rdma_start, nomod_zero); + NO_UNLOAD_STUB(nfssrv, nfs_svc, nomod_zero); + END_MODULE(nfssrv); +#endif + +/* + * Stubs for kernel lock manager. + */ +#ifndef KLM_MODULE + MODULE(klmmod,misc); + NO_UNLOAD_STUB(klmmod, lm_svc, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_shutdown, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_unexport, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_cprresume, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_cprsuspend, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_safelock, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_safemap, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_has_sleep, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_free_config, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_vp_active, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_get_sysid, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_rel_sysid, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_alloc_sysidt, nomod_minus_one); + NO_UNLOAD_STUB(klmmod, lm_free_sysidt, nomod_zero); + NO_UNLOAD_STUB(klmmod, lm_sysidt, nomod_minus_one); + END_MODULE(klmmod); +#endif + +#ifndef KLMOPS_MODULE + MODULE(klmops,misc); + NO_UNLOAD_STUB(klmops, lm_frlock, nomod_zero); + NO_UNLOAD_STUB(klmops, lm4_frlock, nomod_zero); + NO_UNLOAD_STUB(klmops, lm_shrlock, nomod_zero); + NO_UNLOAD_STUB(klmops, lm4_shrlock, nomod_zero); + NO_UNLOAD_STUB(klmops, lm_nlm_dispatch, nomod_zero); + NO_UNLOAD_STUB(klmops, lm_nlm4_dispatch, nomod_zero); + NO_UNLOAD_STUB(klmops, lm_nlm_reclaim, nomod_zero); + NO_UNLOAD_STUB(klmops, lm_nlm4_reclaim, nomod_zero); + NO_UNLOAD_STUB(klmops, lm_register_lock_locally, nomod_zero); + END_MODULE(klmops); +#endif + +/* + * Stubs for kernel TLI module + * XXX currently we never allow this to unload + */ +#ifndef TLI_MODULE + MODULE(tlimod,misc); + NO_UNLOAD_STUB(tlimod, t_kopen, nomod_minus_one); + NO_UNLOAD_STUB(tlimod, t_kunbind, nomod_zero); + NO_UNLOAD_STUB(tlimod, t_kadvise, nomod_zero); + NO_UNLOAD_STUB(tlimod, t_krcvudata, nomod_zero); + NO_UNLOAD_STUB(tlimod, t_ksndudata, nomod_zero); + NO_UNLOAD_STUB(tlimod, t_kalloc, nomod_zero); + NO_UNLOAD_STUB(tlimod, t_kbind, nomod_zero); + NO_UNLOAD_STUB(tlimod, t_kclose, nomod_zero); + NO_UNLOAD_STUB(tlimod, t_kspoll, nomod_zero); + NO_UNLOAD_STUB(tlimod, t_kfree, nomod_zero); + NO_UNLOAD_STUB(tlimod, t_koptmgmt, nomod_zero); + END_MODULE(tlimod); +#endif + +/* + * Stubs for kernel RPC module + * XXX currently we never allow this to unload + */ +#ifndef RPC_MODULE + MODULE(rpcmod,strmod); + NO_UNLOAD_STUB(rpcmod, clnt_tli_kcreate, nomod_minus_one); + NO_UNLOAD_STUB(rpcmod, svc_tli_kcreate, nomod_minus_one); + NO_UNLOAD_STUB(rpcmod, bindresvport, nomod_minus_one); + NO_UNLOAD_STUB(rpcmod, rdma_register_mod, nomod_minus_one); + NO_UNLOAD_STUB(rpcmod, rdma_unregister_mod, nomod_minus_one); + NO_UNLOAD_STUB(rpcmod, svc_queuereq, nomod_minus_one); + NO_UNLOAD_STUB(rpcmod, clist_add, nomod_minus_one); + END_MODULE(rpcmod); +#endif + +/* + * Stubs for des + */ +#ifndef DES_MODULE + MODULE(des,misc); + STUB(des, cbc_crypt, nomod_zero); + STUB(des, ecb_crypt, nomod_zero); + STUB(des, _des_crypt, 
nomod_zero); + END_MODULE(des); +#endif + +/* + * Stubs for procfs. A non-unloadable module. + */ +#ifndef PROC_MODULE + MODULE(procfs,fs); + NO_UNLOAD_STUB(procfs, prfree, nomod_zero); + NO_UNLOAD_STUB(procfs, prexit, nomod_zero); + NO_UNLOAD_STUB(procfs, prlwpfree, nomod_zero); + NO_UNLOAD_STUB(procfs, prlwpexit, nomod_zero); + NO_UNLOAD_STUB(procfs, prinvalidate, nomod_zero); + NO_UNLOAD_STUB(procfs, prnsegs, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetcred, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetpriv, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetprivsize, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetsecflags, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetstatus, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetlwpstatus, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetpsinfo, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetlwpsinfo, nomod_zero); + NO_UNLOAD_STUB(procfs, oprgetstatus, nomod_zero); + NO_UNLOAD_STUB(procfs, oprgetpsinfo, nomod_zero); +#ifdef _SYSCALL32_IMPL + NO_UNLOAD_STUB(procfs, prgetstatus32, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetlwpstatus32, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetpsinfo32, nomod_zero); + NO_UNLOAD_STUB(procfs, prgetlwpsinfo32, nomod_zero); + NO_UNLOAD_STUB(procfs, oprgetstatus32, nomod_zero); + NO_UNLOAD_STUB(procfs, oprgetpsinfo32, nomod_zero); + NO_UNLOAD_STUB(procfs, psinfo_kto32, nomod_zero); + NO_UNLOAD_STUB(procfs, lwpsinfo_kto32, nomod_zero); +#endif /* _SYSCALL32_IMPL */ + NO_UNLOAD_STUB(procfs, prnotify, nomod_zero); + NO_UNLOAD_STUB(procfs, prexecstart, nomod_zero); + NO_UNLOAD_STUB(procfs, prexecend, nomod_zero); + NO_UNLOAD_STUB(procfs, prrelvm, nomod_zero); + NO_UNLOAD_STUB(procfs, prbarrier, nomod_zero); + NO_UNLOAD_STUB(procfs, estimate_msacct, nomod_zero); + NO_UNLOAD_STUB(procfs, pr_getprot, nomod_zero); + NO_UNLOAD_STUB(procfs, pr_getprot_done, nomod_zero); + NO_UNLOAD_STUB(procfs, pr_getsegsize, nomod_zero); + NO_UNLOAD_STUB(procfs, pr_isobject, nomod_zero); + NO_UNLOAD_STUB(procfs, pr_isself, nomod_zero); + NO_UNLOAD_STUB(procfs, pr_allstopped, nomod_zero); + NO_UNLOAD_STUB(procfs, pr_free_watched_pages, nomod_zero); + END_MODULE(procfs); +#endif + +/* + * Stubs for fifofs + */ +#ifndef FIFO_MODULE + MODULE(fifofs,fs); + NO_UNLOAD_STUB(fifofs, fifovp, nomod_zero); + NO_UNLOAD_STUB(fifofs, fifo_getinfo, nomod_zero); + NO_UNLOAD_STUB(fifofs, fifo_vfastoff, nomod_zero); + END_MODULE(fifofs); +#endif + +/* + * Stubs for ufs + * + * This is needed to support the old quotactl system call. + * When the old sysent stuff goes away, this will need to be revisited. 
+ */ +#ifndef UFS_MODULE + MODULE(ufs,fs); + STUB(ufs, quotactl, nomod_minus_one); + END_MODULE(ufs); +#endif + +/* + * Stubs for zfs + */ +#ifndef ZFS_MODULE + MODULE(zfs,fs); + STUB(zfs, dsl_prop_get, nomod_minus_one); + STUB(zfs, spa_boot_init, nomod_minus_one); + STUB(zfs, zfs_prop_to_name, nomod_zero); + END_MODULE(zfs); +#endif + +/* + * Stubs for dcfs + */ +#ifndef DCFS_MODULE + MODULE(dcfs,fs); + STUB(dcfs, decompvp, 0); + END_MODULE(dcfs); +#endif + +/* + * Stubs for namefs + */ +#ifndef NAMEFS_MODULE + MODULE(namefs,fs); + STUB(namefs, nm_unmountall, 0); + END_MODULE(namefs); +#endif + +/* + * Stubs for sysdc + */ +#ifndef SDC_MODULE + MODULE(SDC,sched); + NO_UNLOAD_STUB(SDC, sysdc_thread_enter, nomod_zero); + END_MODULE(SDC); +#endif + +/* + * Stubs for ts_dptbl + */ +#ifndef TS_DPTBL_MODULE + MODULE(TS_DPTBL,sched); + STUB(TS_DPTBL, ts_getdptbl, 0); + STUB(TS_DPTBL, ts_getkmdpris, 0); + STUB(TS_DPTBL, ts_getmaxumdpri, 0); + END_MODULE(TS_DPTBL); +#endif + +/* + * Stubs for rt_dptbl + */ +#ifndef RT_DPTBL_MODULE + MODULE(RT_DPTBL,sched); + STUB(RT_DPTBL, rt_getdptbl, 0); + END_MODULE(RT_DPTBL); +#endif + +/* + * Stubs for ia_dptbl + */ +#ifndef IA_DPTBL_MODULE + MODULE(IA_DPTBL,sched); + STUB(IA_DPTBL, ia_getdptbl, nomod_zero); + STUB(IA_DPTBL, ia_getkmdpris, nomod_zero); + STUB(IA_DPTBL, ia_getmaxumdpri, nomod_zero); + END_MODULE(IA_DPTBL); +#endif + +/* + * Stubs for FSS scheduler + */ +#ifndef FSS_MODULE + MODULE(FSS,sched); + WSTUB(FSS, fss_allocbuf, nomod_zero); + WSTUB(FSS, fss_freebuf, nomod_zero); + WSTUB(FSS, fss_changeproj, nomod_zero); + WSTUB(FSS, fss_changepset, nomod_zero); + END_MODULE(FSS); +#endif + +/* + * Stubs for fx_dptbl + */ +#ifndef FX_DPTBL_MODULE + MODULE(FX_DPTBL,sched); + STUB(FX_DPTBL, fx_getdptbl, 0); + STUB(FX_DPTBL, fx_getmaxumdpri, 0); + END_MODULE(FX_DPTBL); +#endif + +/* + * Stubs for bootdev + */ +#ifndef BOOTDEV_MODULE + MODULE(bootdev,misc); + STUB(bootdev, i_promname_to_devname, 0); + STUB(bootdev, i_convert_boot_device_name, 0); + END_MODULE(bootdev); +#endif + +/* + * stubs for strplumb... + */ +#ifndef STRPLUMB_MODULE + MODULE(strplumb,misc); + STUB(strplumb, strplumb, 0); + STUB(strplumb, strplumb_load, 0); + STUB(strplumb, strplumb_get_netdev_path, 0); + END_MODULE(strplumb); +#endif + +/* + * Stubs for console configuration module + */ +#ifndef CONSCONFIG_MODULE + MODULE(consconfig,misc); + STUB(consconfig, consconfig, 0); + STUB(consconfig, consconfig_get_usb_kb_path, 0); + STUB(consconfig, consconfig_get_usb_ms_path, 0); + STUB(consconfig, consconfig_get_plat_fbpath, 0); + STUB(consconfig, consconfig_console_is_ready, 0); + END_MODULE(consconfig); +#endif + +/* + * Stubs for accounting. + */ +#ifndef SYSACCT_MODULE + MODULE(sysacct,sys); + NO_UNLOAD_WSTUB(sysacct, acct, nomod_zero); + NO_UNLOAD_WSTUB(sysacct, acct_fs_in_use, nomod_zero); + END_MODULE(sysacct); +#endif + +/* + * Stubs for semaphore routines. sem.c + */ +#ifndef SEMSYS_MODULE + MODULE(semsys,sys); + NO_UNLOAD_WSTUB(semsys, semexit, nomod_zero); + END_MODULE(semsys); +#endif + +/* + * Stubs for shmem routines. 
shm.c + */ +#ifndef SHMSYS_MODULE + MODULE(shmsys,sys); + NO_UNLOAD_WSTUB(shmsys, shmexit, nomod_zero); + NO_UNLOAD_WSTUB(shmsys, shmfork, nomod_zero); + NO_UNLOAD_WSTUB(shmsys, shmgetid, nomod_minus_one); + END_MODULE(shmsys); +#endif + +/* + * Stubs for doors + */ +#ifndef DOOR_MODULE + MODULE(doorfs,sys); + NO_UNLOAD_WSTUB(doorfs, door_slam, nomod_zero); + NO_UNLOAD_WSTUB(doorfs, door_exit, nomod_zero); + NO_UNLOAD_WSTUB(doorfs, door_revoke_all, nomod_zero); + NO_UNLOAD_WSTUB(doorfs, door_fork, nomod_zero); + NO_UNLOAD_STUB(doorfs, door_upcall, nomod_einval); + NO_UNLOAD_STUB(doorfs, door_ki_create, nomod_einval); + NO_UNLOAD_STUB(doorfs, door_ki_open, nomod_einval); + NO_UNLOAD_STUB(doorfs, door_ki_lookup, nomod_zero); + NO_UNLOAD_WSTUB(doorfs, door_ki_upcall, nomod_einval); + NO_UNLOAD_WSTUB(doorfs, door_ki_upcall_limited, nomod_einval); + NO_UNLOAD_WSTUB(doorfs, door_ki_hold, nomod_zero); + NO_UNLOAD_WSTUB(doorfs, door_ki_rele, nomod_zero); + NO_UNLOAD_WSTUB(doorfs, door_ki_info, nomod_einval); + END_MODULE(doorfs); +#endif + +/* + * Stubs for MD5 + */ +#ifndef MD5_MODULE + MODULE(md5,misc); + WSTUB(md5, MD5Init, nomod_zero); + WSTUB(md5, MD5Update, nomod_zero); + WSTUB(md5, MD5Final, nomod_zero); + END_MODULE(md5); +#endif + +/* + * Stubs for idmap + */ +#ifndef IDMAP_MODULE + MODULE(idmap,misc); + STUB(idmap, kidmap_batch_getgidbysid, nomod_zero); + STUB(idmap, kidmap_batch_getpidbysid, nomod_zero); + STUB(idmap, kidmap_batch_getsidbygid, nomod_zero); + STUB(idmap, kidmap_batch_getsidbyuid, nomod_zero); + STUB(idmap, kidmap_batch_getuidbysid, nomod_zero); + STUB(idmap, kidmap_get_create, nomod_zero); + STUB(idmap, kidmap_get_destroy, nomod_zero); + STUB(idmap, kidmap_get_mappings, nomod_zero); + STUB(idmap, kidmap_getgidbysid, nomod_zero); + STUB(idmap, kidmap_getpidbysid, nomod_zero); + STUB(idmap, kidmap_getsidbygid, nomod_zero); + STUB(idmap, kidmap_getsidbyuid, nomod_zero); + STUB(idmap, kidmap_getuidbysid, nomod_zero); + STUB(idmap, idmap_get_door, nomod_einval); + STUB(idmap, idmap_unreg_dh, nomod_einval); + STUB(idmap, idmap_reg_dh, nomod_einval); + STUB(idmap, idmap_purge_cache, nomod_einval); + END_MODULE(idmap); +#endif + +/* + * Stubs for auditing. 
+ */ +#ifndef C2AUDIT_MODULE + MODULE(c2audit,sys); + NO_UNLOAD_STUB(c2audit, audit_init_module, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_start, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_finish, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit, nomod_zero); + NO_UNLOAD_STUB(c2audit, auditdoor, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_closef, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_core_start, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_core_finish, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_strputmsg, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_savepath, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_anchorpath, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_exit, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_exec, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_symlink, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_symlink_create, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_vncreate_start, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_vncreate_finish, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_enterprom, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_exitprom, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_chdirec, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_setf, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_sock, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_strgetmsg, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_ipc, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_ipcget, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_fdsend, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_fdrecv, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_priv, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_setppriv, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_psecflags, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_devpolicy, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_setfsat_path, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_cryptoadm, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_kssl, nomod_zero); + NO_UNLOAD_STUB(c2audit, audit_pf_policy, nomod_zero); + NO_UNLOAD_STUB(c2audit, au_doormsg, nomod_zero); + NO_UNLOAD_STUB(c2audit, au_uwrite, nomod_zero); + NO_UNLOAD_STUB(c2audit, au_to_arg32, nomod_zero); + NO_UNLOAD_STUB(c2audit, au_free_rec, nomod_zero); + END_MODULE(c2audit); +#endif + +/* + * Stubs for kernel rpc security service module + */ +#ifndef RPCSEC_MODULE + MODULE(rpcsec,misc); + NO_UNLOAD_STUB(rpcsec, sec_clnt_revoke, nomod_zero); + NO_UNLOAD_STUB(rpcsec, authkern_create, nomod_zero); + NO_UNLOAD_STUB(rpcsec, sec_svc_msg, nomod_zero); + NO_UNLOAD_STUB(rpcsec, sec_svc_control, nomod_zero); + END_MODULE(rpcsec); +#endif + +/* + * Stubs for rpc RPCSEC_GSS security service module + */ +#ifndef RPCSEC_GSS_MODULE + MODULE(rpcsec_gss,misc); + NO_UNLOAD_STUB(rpcsec_gss, __svcrpcsec_gss, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_getcred, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_set_callback, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_secget, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_secfree, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_seccreate, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_set_defaults, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_revauth, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_secpurge, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_cleanup, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_get_versions, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_max_data_length, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_svc_max_data_length, nomod_zero); + NO_UNLOAD_STUB(rpcsec_gss, rpc_gss_get_service_type, nomod_zero); + 
END_MODULE(rpcsec_gss); +#endif + +/* + * Stubs for PCI configurator module (misc/pcicfg). + */ +#ifndef PCICFG_MODULE + MODULE(pcicfg,misc); + STUB(pcicfg, pcicfg_configure, 0); + STUB(pcicfg, pcicfg_unconfigure, 0); + END_MODULE(pcicfg); +#endif + +/* + * Stubs for pcieb nexus driver. + */ +#ifndef PCIEB_MODULE + MODULE(pcieb,drv); + STUB(pcieb, pcieb_intel_error_workaround, 0); + END_MODULE(pcieb); +#endif + +#ifndef IWSCN_MODULE + MODULE(iwscn,drv); + STUB(iwscn, srpop, 0); + END_MODULE(iwscn); +#endif + +/* + * Stubs for checkpoint-resume module + */ +#ifndef CPR_MODULE + MODULE(cpr,misc); + STUB(cpr, cpr, 0); + END_MODULE(cpr); +#endif + +/* + * Stubs for kernel probes (tnf module). Not unloadable. + */ +#ifndef TNF_MODULE + MODULE(tnf,drv); + NO_UNLOAD_STUB(tnf, tnf_ref32_1, nomod_zero); + NO_UNLOAD_STUB(tnf, tnf_string_1, nomod_zero); + NO_UNLOAD_STUB(tnf, tnf_opaque_array_1, nomod_zero); + NO_UNLOAD_STUB(tnf, tnf_struct_tag_1, nomod_zero); + NO_UNLOAD_STUB(tnf, tnf_allocate, nomod_zero); + END_MODULE(tnf); +#endif + +/* + * Stubs for i86hvm bootstraping + */ +#ifndef HVM_BOOTSTRAP + MODULE(hvm_bootstrap,misc); + NO_UNLOAD_STUB(hvm_bootstrap, hvmboot_rootconf, nomod_zero); + END_MODULE(hvm_bootstrap); +#endif + +/* + * Clustering: stubs for bootstrapping. + */ +#ifndef CL_BOOTSTRAP + MODULE(cl_bootstrap,misc); + NO_UNLOAD_WSTUB(cl_bootstrap, clboot_modload, nomod_minus_one); + NO_UNLOAD_WSTUB(cl_bootstrap, clboot_loadrootmodules, nomod_zero); + NO_UNLOAD_WSTUB(cl_bootstrap, clboot_rootconf, nomod_zero); + NO_UNLOAD_WSTUB(cl_bootstrap, clboot_mountroot, nomod_zero); + NO_UNLOAD_WSTUB(cl_bootstrap, clconf_init, nomod_zero); + NO_UNLOAD_WSTUB(cl_bootstrap, clconf_get_nodeid, nomod_zero); + NO_UNLOAD_WSTUB(cl_bootstrap, clconf_maximum_nodeid, nomod_zero); + NO_UNLOAD_WSTUB(cl_bootstrap, cluster, nomod_zero); + END_MODULE(cl_bootstrap); +#endif + +/* + * Clustering: stubs for cluster infrastructure. + */ +#ifndef CL_COMM_MODULE + MODULE(cl_comm,misc); + NO_UNLOAD_STUB(cl_comm, cladmin, nomod_minus_one); + END_MODULE(cl_comm); +#endif + +/* + * Clustering: stubs for global file system operations. + */ +#ifndef PXFS_MODULE + MODULE(pxfs,fs); + NO_UNLOAD_WSTUB(pxfs, clpxfs_aio_read, nomod_zero); + NO_UNLOAD_WSTUB(pxfs, clpxfs_aio_write, nomod_zero); + NO_UNLOAD_WSTUB(pxfs, cl_flk_state_transition_notify, nomod_zero); + END_MODULE(pxfs); +#endif + +/* + * Stubs for kernel cryptographic framework module (misc/kcf). 
+ */ +#ifndef KCF_MODULE + MODULE(kcf,misc); + NO_UNLOAD_STUB(kcf, crypto_mech2id, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_register_provider, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_unregister_provider, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_provider_notification, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_op_notification, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_kmflag, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_digest, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_digest_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_digest_init, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_digest_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_digest_update, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_digest_final, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_digest_key_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_init, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_update, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_final, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_decrypt, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_decrypt_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_decrypt_init, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_decrypt_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_decrypt_update, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_decrypt_final, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_get_all_mech_info, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_key_check, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_key_check_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_key_derive, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_key_generate, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_key_generate_pair, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_key_unwrap, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_key_wrap, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_verify, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_verify_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_init, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_update, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_final, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_decrypt, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_decrypt_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_verify_decrypt, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_verify_decrypt_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_decrypt_init, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_decrypt_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_decrypt_update, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_mac_decrypt_final, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_object_copy, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_object_create, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_object_destroy, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_object_find_final, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_object_find_init, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_object_find, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_object_get_attribute_value, nomod_minus_one); + NO_UNLOAD_STUB(kcf, 
crypto_object_get_size, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_object_set_attribute_value, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_session_close, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_session_login, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_session_logout, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_session_open, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_mac, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_mac_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_mac_init, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_mac_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_mac_update, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_encrypt_mac_final, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_create_ctx_template, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_destroy_ctx_template, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_get_mech_list, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_free_mech_list, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_cancel_req, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_cancel_ctx, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_bufcall_alloc, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_bufcall_free, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_bufcall, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_unbufcall, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_notify_events, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_unnotify_events, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_get_provider, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_get_provinfo, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_release_provider, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_sign, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_sign_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_sign_init, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_sign_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_sign_update, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_sign_final, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_sign_recover, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_sign_recover_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_sign_recover_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_verify, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_verify_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_verify_init, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_verify_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_verify_update, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_verify_final, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_verify_recover, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_verify_recover_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, crypto_verify_recover_init_prov, nomod_minus_one); + NO_UNLOAD_STUB(kcf, random_add_entropy, nomod_minus_one); + NO_UNLOAD_STUB(kcf, random_add_pseudo_entropy, nomod_minus_one); + NO_UNLOAD_STUB(kcf, random_get_blocking_bytes, nomod_minus_one); + NO_UNLOAD_STUB(kcf, random_get_bytes, nomod_minus_one); + NO_UNLOAD_STUB(kcf, random_get_pseudo_bytes, nomod_minus_one); + END_MODULE(kcf); +#endif + +/* + * Stubs for sha1. A non-unloadable module. + */ +#ifndef SHA1_MODULE + MODULE(sha1,crypto); + NO_UNLOAD_STUB(sha1, SHA1Init, nomod_void); + NO_UNLOAD_STUB(sha1, SHA1Update, nomod_void); + NO_UNLOAD_STUB(sha1, SHA1Final, nomod_void); + END_MODULE(sha1); +#endif + +/* + * The following stubs are used by the mac module. 
+ * Since dld already depends on mac, these + * stubs are needed to avoid circular dependencies. + */ +#ifndef DLD_MODULE + MODULE(dld,drv); + STUB(dld, dld_init_ops, nomod_void); + STUB(dld, dld_fini_ops, nomod_void); + STUB(dld, dld_devt_to_instance, nomod_minus_one); + STUB(dld, dld_autopush, nomod_minus_one); + STUB(dld, dld_ioc_register, nomod_einval); + STUB(dld, dld_ioc_unregister, nomod_void); + END_MODULE(dld); +#endif + +/* + * The following stubs are used by the mac module. + * Since dls already depends on mac, these + * stubs are needed to avoid circular dependencies. + */ +#ifndef DLS_MODULE + MODULE(dls,misc); + STUB(dls, dls_devnet_mac, nomod_zero); + STUB(dls, dls_devnet_hold_tmp, nomod_einval); + STUB(dls, dls_devnet_rele_tmp, nomod_void); + STUB(dls, dls_devnet_hold_link, nomod_einval); + STUB(dls, dls_devnet_rele_link, nomod_void); + STUB(dls, dls_devnet_prop_task_wait, nomod_void); + STUB(dls, dls_mgmt_get_linkid, nomod_einval); + STUB(dls, dls_devnet_macname2linkid, nomod_einval); + STUB(dls, dls_mgmt_get_linkinfo, nomod_einval); + END_MODULE(dls); +#endif + +#ifndef SOFTMAC_MODULE + MODULE(softmac,drv); + STUB(softmac, softmac_hold_device, nomod_einval); + STUB(softmac, softmac_rele_device, nomod_void); + STUB(softmac, softmac_recreate, nomod_void); + END_MODULE(softmac); +#endif + +#ifndef IPTUN_MODULE + MODULE(iptun,drv); + STUB(iptun, iptun_create, nomod_einval); + STUB(iptun, iptun_delete, nomod_einval); + STUB(iptun, iptun_set_policy, nomod_void) ; + END_MODULE(iptun); +#endif + +/* + * Stubs for dcopy, for Intel IOAT KAPIs + */ +#ifndef DCOPY_MODULE + MODULE(dcopy,misc); + NO_UNLOAD_STUB(dcopy, dcopy_query, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_query_channel, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_alloc, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_free, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_alloc, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_free, nomod_void); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_post, nomod_minus_one); + NO_UNLOAD_STUB(dcopy, dcopy_cmd_poll, nomod_minus_one); + END_MODULE(dcopy); +#endif + +/* + * Stubs for acpica + */ +#ifndef ACPICA_MODULE + MODULE(acpica,misc); + NO_UNLOAD_STUB(acpica, AcpiOsReadPort, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiOsWritePort, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiInstallNotifyHandler, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiRemoveNotifyHandler, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiEvaluateObject, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiEvaluateObjectTyped, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiWriteBitRegister, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiReadBitRegister, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, AcpiOsFree, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, acpica_get_handle_cpu, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, acpica_get_global_FADT, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, acpica_write_cpupm_capabilities, + nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, __acpi_wbinvd, nomod_minus_one) ; + NO_UNLOAD_STUB(acpica, acpi_reset_system, nomod_minus_one) ; + END_MODULE(acpica); +#endif + +/* + * Stubs for acpidev + */ +#ifndef ACPIDEV_MODULE + MODULE(acpidev,misc); + NO_UNLOAD_STUB(acpidev, acpidev_dr_get_cpu_numa_info, nomod_minus_one) ; + NO_UNLOAD_STUB(acpidev, acpidev_dr_free_cpu_numa_info, + nomod_minus_one) ; + END_MODULE(acpidev); +#endif + +#ifndef IPNET_MODULE + MODULE(ipnet,drv); + STUB(ipnet, ipnet_if_getdev, nomod_zero); + STUB(ipnet, ipnet_walk_if, nomod_zero); + END_MODULE(ipnet); 
+#endif + +#ifndef IOMMULIB_MODULE + MODULE(iommulib,misc); + STUB(iommulib, iommulib_nex_close, nomod_void); + END_MODULE(iommulib); +#endif + +/* + * Stubs for rootnex nexus driver. + */ +#ifndef ROOTNEX_MODULE + MODULE(rootnex,drv); + STUB(rootnex, immu_init, 0); + STUB(rootnex, immu_startup, 0); + STUB(rootnex, immu_physmem_update, 0); + END_MODULE(rootnex); +#endif + +/* + * Stubs for kernel socket, for iscsi + */ +#ifndef KSOCKET_MODULE + MODULE(ksocket, misc); + NO_UNLOAD_STUB(ksocket, ksocket_setsockopt, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_getsockopt, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_getpeername, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_getsockname, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_socket, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_bind, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_listen, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_accept, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_connect, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_recv, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_recvfrom, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_recvmsg, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_send, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_sendto, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_sendmsg, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_ioctl, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_setcallbacks, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_hold, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_rele, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_shutdown, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_close, nomod_minus_one); + END_MODULE(ksocket); +#endif + +/* + * Stubs for elfexec + */ +#ifndef ELFEXEC_MODULE + MODULE(elfexec,exec); + STUB(elfexec, elfexec, nomod_einval); + STUB(elfexec, mapexec_brand, nomod_einval); + STUB(elfexec, elf32exec, nomod_einval); + STUB(elfexec, mapexec32_brand, nomod_einval); + END_MODULE(elfexec); +#endif + +/* + * Stub(s) for APIX module. + */ +#ifndef APIX_MODULE + MODULE(apix,mach); + WSTUB(apix, apix_loaded, nomod_zero); + END_MODULE(apix); +#endif + +/* + * Stubs for ppt module (bhyve PCI passthrough driver) + */ +#ifndef PPT_MODULE + MODULE(ppt,drv); + WSTUB(ppt, ppt_unassign_all, nomod_zero); + WSTUB(ppt, ppt_map_mmio, nomod_einval); + WSTUB(ppt, ppt_unmap_mmio, nomod_einval); + WSTUB(ppt, ppt_setup_msi, nomod_einval); + WSTUB(ppt, ppt_setup_msix, nomod_einval); + WSTUB(ppt, ppt_disable_msix, nomod_einval); + WSTUB(ppt, ppt_assigned_devices, nomod_zero); + WSTUB(ppt, ppt_is_mmio, nomod_zero); + WSTUB(ppt, ppt_assign_device, nomod_einval); + WSTUB(ppt, ppt_unassign_device, nomod_einval); + WSTUB(ppt, ppt_get_limits, nomod_einval); + END_MODULE(ppt); +#endif + +/* + * this is just a marker for the area of text that contains stubs + */ + ENTRY_NP(stubs_end) + nop + diff --git a/usr/src/uts/intel/ml/ovbcopy.s b/usr/src/uts/intel/ml/ovbcopy.s new file mode 100644 index 0000000000..0687e67e4b --- /dev/null +++ b/usr/src/uts/intel/ml/ovbcopy.s @@ -0,0 +1,92 @@ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/*- + * Copyright (c) 1993 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/amd64/amd64/support.S,v 1.102 2003/10/02 05:08:13 alc Exp $ + */ + +#include <sys/asm_linkage.h> + +/* + * Adapted from fbsd bcopy(). + * + * bcopy(src, dst, cnt) + * rdi, rsi, rdx + * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 + */ + + ENTRY(ovbcopy) + xchgq %rsi,%rdi + movq %rdx,%rcx + + movq %rdi,%rax + subq %rsi,%rax + cmpq %rcx,%rax /* overlapping && src < dst? */ + jb reverse + + shrq $3,%rcx /* copy by 64-bit words */ + cld /* nope, copy forwards */ + rep + movsq + movq %rdx,%rcx + andq $7,%rcx /* any bytes left? */ + rep + movsb + ret + +reverse: + addq %rcx,%rdi /* copy backwards */ + addq %rcx,%rsi + decq %rdi + decq %rsi + andq $7,%rcx /* any fractional bytes? */ + std + rep + movsb + movq %rdx,%rcx /* copy remainder by 32-bit words */ + shrq $3,%rcx + subq $7,%rsi + subq $7,%rdi + rep + movsq + cld + ret + SET_SIZE(ovbcopy) + diff --git a/usr/src/uts/intel/ml/retpoline.s b/usr/src/uts/intel/ml/retpoline.s new file mode 100644 index 0000000000..a68d9504c1 --- /dev/null +++ b/usr/src/uts/intel/ml/retpoline.s @@ -0,0 +1,211 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + + .file "retpoline.s" + +/* + * This file implements the various hooks that are needed for retpolines and + * return stack buffer (RSB) stuffing. 
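+ * As a brief sketch of the idea (the macros below are the actual
+ * definitions): an indirect transfer through a register, e.g.
+ *
+ *	call	*%rax
+ *
+ * is redirected by the compiler to __x86_indirect_thunk_rax.  The thunk
+ * performs an internal call that pushes a return address pointing at a
+ * pause/lfence loop, overwrites that address on the stack with %rax, and
+ * then executes ret.  Architecturally this transfers control to the
+ * intended target; speculatively, the ret can only land in the harmless
+ * loop rather than an attacker-trained target.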
For more information, please see the + * 'Speculative Execution CPU Side Channel Security' section of the + * uts/i86pc/os/cpuid.c big theory statement. + */ + +#include <sys/asm_linkage.h> +#include <sys/x86_archext.h> + +#if defined(__amd64) + +/* + * This macro generates the default retpoline entry point that the compiler + * expects. It implements the expected retpoline form. + */ +#define RETPOLINE_MKTHUNK(reg) \ + ENTRY(__x86_indirect_thunk_/**/reg) \ + call 2f; \ +1: \ + pause; \ + lfence; \ + jmp 1b; \ +2: \ + movq %/**/reg, (%rsp); \ + ret; \ + SET_SIZE(__x86_indirect_thunk_/**/reg) + +/* + * This macro generates the default retpoline form. It exists in addition to the + * thunk so if we need to restore the default retpoline behavior to the thunk + * we can. + */ +#define RETPOLINE_MKGENERIC(reg) \ + ENTRY(__x86_indirect_thunk_gen_/**/reg) \ + call 2f; \ +1: \ + pause; \ + lfence; \ + jmp 1b; \ +2: \ + movq %/**/reg, (%rsp); \ + ret; \ + SET_SIZE(__x86_indirect_thunk_gen_/**/reg) + +/* + * This macro generates the AMD optimized form of a retpoline which will be used + * on systems where the lfence dispatch serializing behavior has been changed. + */ +#define RETPOLINE_MKLFENCE(reg) \ + ENTRY(__x86_indirect_thunk_amd_/**/reg) \ + lfence; \ + jmp *%/**/reg; \ + SET_SIZE(__x86_indirect_thunk_amd_/**/reg) + + +/* + * This macro generates the no-op form of the retpoline which will be used if we + * either need to disable retpolines because we have enhanced IBRS or because we + * have been asked to disable mitigations. + */ +#define RETPOLINE_MKJUMP(reg) \ + ENTRY(__x86_indirect_thunk_jmp_/**/reg) \ + jmp *%/**/reg; \ + SET_SIZE(__x86_indirect_thunk_jmp_/**/reg) + + RETPOLINE_MKTHUNK(rax) + RETPOLINE_MKTHUNK(rbx) + RETPOLINE_MKTHUNK(rcx) + RETPOLINE_MKTHUNK(rdx) + RETPOLINE_MKTHUNK(rdi) + RETPOLINE_MKTHUNK(rsi) + RETPOLINE_MKTHUNK(rbp) + RETPOLINE_MKTHUNK(r8) + RETPOLINE_MKTHUNK(r9) + RETPOLINE_MKTHUNK(r10) + RETPOLINE_MKTHUNK(r11) + RETPOLINE_MKTHUNK(r12) + RETPOLINE_MKTHUNK(r13) + RETPOLINE_MKTHUNK(r14) + RETPOLINE_MKTHUNK(r15) + + RETPOLINE_MKGENERIC(rax) + RETPOLINE_MKGENERIC(rbx) + RETPOLINE_MKGENERIC(rcx) + RETPOLINE_MKGENERIC(rdx) + RETPOLINE_MKGENERIC(rdi) + RETPOLINE_MKGENERIC(rsi) + RETPOLINE_MKGENERIC(rbp) + RETPOLINE_MKGENERIC(r8) + RETPOLINE_MKGENERIC(r9) + RETPOLINE_MKGENERIC(r10) + RETPOLINE_MKGENERIC(r11) + RETPOLINE_MKGENERIC(r12) + RETPOLINE_MKGENERIC(r13) + RETPOLINE_MKGENERIC(r14) + RETPOLINE_MKGENERIC(r15) + + RETPOLINE_MKLFENCE(rax) + RETPOLINE_MKLFENCE(rbx) + RETPOLINE_MKLFENCE(rcx) + RETPOLINE_MKLFENCE(rdx) + RETPOLINE_MKLFENCE(rdi) + RETPOLINE_MKLFENCE(rsi) + RETPOLINE_MKLFENCE(rbp) + RETPOLINE_MKLFENCE(r8) + RETPOLINE_MKLFENCE(r9) + RETPOLINE_MKLFENCE(r10) + RETPOLINE_MKLFENCE(r11) + RETPOLINE_MKLFENCE(r12) + RETPOLINE_MKLFENCE(r13) + RETPOLINE_MKLFENCE(r14) + RETPOLINE_MKLFENCE(r15) + + RETPOLINE_MKJUMP(rax) + RETPOLINE_MKJUMP(rbx) + RETPOLINE_MKJUMP(rcx) + RETPOLINE_MKJUMP(rdx) + RETPOLINE_MKJUMP(rdi) + RETPOLINE_MKJUMP(rsi) + RETPOLINE_MKJUMP(rbp) + RETPOLINE_MKJUMP(r8) + RETPOLINE_MKJUMP(r9) + RETPOLINE_MKJUMP(r10) + RETPOLINE_MKJUMP(r11) + RETPOLINE_MKJUMP(r12) + RETPOLINE_MKJUMP(r13) + RETPOLINE_MKJUMP(r14) + RETPOLINE_MKJUMP(r15) + + /* + * The x86_rsb_stuff function is called from pretty arbitrary + * contexts. It's much easier for us to save and restore all the + * registers we touch rather than clobber them for callers. You must + * preserve this property or the system will panic at best. 
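+ *
+ * As a rough sketch of what the body does: after saving %rdi, %rax and
+ * the current stack pointer, it runs 16 iterations of a loop containing
+ * two forward calls each, pushing 32 return addresses in total.  Every
+ * one of those return addresses points at a benign pause loop that is
+ * never architecturally executed, so a later speculative ret that
+ * consumes a stale RSB entry can only land there.  Restoring the saved
+ * %rsp at the end discards the bogus return addresses in a single step.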
+ */ + ENTRY(x86_rsb_stuff) + /* + * These nops are present so we can patch a ret instruction if we need + * to disable RSB stuffing because enhanced IBRS is present or we're + * disabling mitigations. + */ + nop + nop + pushq %rdi + pushq %rax + movl $16, %edi + movq %rsp, %rax +rsb_loop: + call 2f +1: + pause + call 1b +2: + call 2f +1: + pause + call 1b +2: + subl $1, %edi + jnz rsb_loop + movq %rax, %rsp + popq %rax + popq %rdi + ret + SET_SIZE(x86_rsb_stuff) + +#elif defined(__i386) + +/* + * While the kernel is 64-bit only, dboot is still 32-bit, so there are a + * limited number of variants that are used for 32-bit. However as dboot is + * short lived and uses them sparingly, we only do the full variant and do not + * have an AMD specific version. + */ + +#define RETPOLINE_MKTHUNK(reg) \ + ENTRY(__x86_indirect_thunk_/**/reg) \ + call 2f; \ +1: \ + pause; \ + lfence; \ + jmp 1b; \ +2: \ + movl %/**/reg, (%esp); \ + ret; \ + SET_SIZE(__x86_indirect_thunk_/**/reg) + + RETPOLINE_MKTHUNK(edi) + RETPOLINE_MKTHUNK(eax) + +#else +#error "Your architecture is in another castle." +#endif diff --git a/usr/src/uts/intel/ml/sseblk.s b/usr/src/uts/intel/ml/sseblk.s new file mode 100644 index 0000000000..836b6b6c97 --- /dev/null +++ b/usr/src/uts/intel/ml/sseblk.s @@ -0,0 +1,280 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +#include <sys/asm_linkage.h> +#include <sys/regset.h> +#include <sys/privregs.h> + +#include "assym.h" + +/* + * Do block operations using Streaming SIMD extensions + */ + +#if defined(DEBUG) +#define ASSERT_KPREEMPT_DISABLED(t, r32, msg) \ + movq %gs:CPU_THREAD, t; \ + movsbl T_PREEMPT(t), r32; \ + testl r32, r32; \ + jne 5f; \ + pushq %rbp; \ + movq %rsp, %rbp; \ + leaq msg(%rip), %rdi; \ + xorl %eax, %eax; \ + call panic; \ +5: +#else /* DEBUG */ +#define ASSERT_KPREEMPT_DISABLED(t, r32, msg) +#endif /* DEBUG */ + +#define BLOCKSHIFT 6 +#define BLOCKSIZE 64 /* (1 << BLOCKSHIFT) */ +#define BLOCKMASK 63 /* (BLOCKSIZE - 1) */ + +#if (1 << BLOCKSHIFT) != BLOCKSIZE || BLOCKMASK != (BLOCKSIZE - 1) +#error "mucked up constants" +#endif + +#define SAVE_XMM0(r) \ + SAVE_XMM_PROLOG(r, 1); \ + movdqa %xmm0, (r) + +#define ZERO_LOOP_INIT_XMM(dst) \ + pxor %xmm0, %xmm0 + +#define ZERO_LOOP_BODY_XMM(dst, cnt) \ + movntdq %xmm0, (dst); \ + movntdq %xmm0, 0x10(dst); \ + movntdq %xmm0, 0x20(dst); \ + movntdq %xmm0, 0x30(dst); \ + addq $BLOCKSIZE, dst; \ + subq $1, cnt + +#define ZERO_LOOP_FINI_XMM(dst) \ + mfence + +#define RSTOR_XMM0(r) \ + movdqa 0x0(r), %xmm0; \ + RSTOR_XMM_EPILOG(r, 1) + + /* + * %rdi dst + * %rsi size + * %rax saved %cr0 (#if DEBUG then %eax is t->t_preempt) + * %r8 pointer to %xmm register save area + */ + ENTRY(hwblkclr) + pushq %rbp + movq %rsp, %rbp + testl $BLOCKMASK, %edi /* address must be BLOCKSIZE aligned */ + jne .dobzero + cmpq $BLOCKSIZE, %rsi /* size must be at least BLOCKSIZE */ + jl .dobzero + testq $BLOCKMASK, %rsi /* .. and be a multiple of BLOCKSIZE */ + jne .dobzero + shrq $BLOCKSHIFT, %rsi + + ASSERT_KPREEMPT_DISABLED(%r11, %eax, .not_disabled) + movq %cr0, %rax + clts + testl $CR0_TS, %eax + jnz 1f + + SAVE_XMM0(%r8) +1: ZERO_LOOP_INIT_XMM(%rdi) +9: ZERO_LOOP_BODY_XMM(%rdi, %rsi) + jnz 9b + ZERO_LOOP_FINI_XMM(%rdi) + + testl $CR0_TS, %eax + jnz 2f + RSTOR_XMM0(%r8) +2: movq %rax, %cr0 + leave + ret +.dobzero: + leave + jmp bzero + SET_SIZE(hwblkclr) + + +#define PREFETCH_START(src) \ + prefetchnta 0x0(src); \ + prefetchnta 0x40(src) + +#define SAVE_XMMS(r) \ + SAVE_XMM_PROLOG(r, 8); \ + movdqa %xmm0, (r); \ + movdqa %xmm1, 0x10(r); \ + movdqa %xmm2, 0x20(r); \ + movdqa %xmm3, 0x30(r); \ + movdqa %xmm4, 0x40(r); \ + movdqa %xmm5, 0x50(r); \ + movdqa %xmm6, 0x60(r); \ + movdqa %xmm7, 0x70(r) + +#define COPY_LOOP_INIT_XMM(src) \ + prefetchnta 0x80(src); \ + prefetchnta 0xc0(src); \ + movdqa 0x0(src), %xmm0; \ + movdqa 0x10(src), %xmm1; \ + movdqa 0x20(src), %xmm2; \ + movdqa 0x30(src), %xmm3; \ + movdqa 0x40(src), %xmm4; \ + movdqa 0x50(src), %xmm5; \ + movdqa 0x60(src), %xmm6; \ + movdqa 0x70(src), %xmm7; \ + addq $0x80, src + +#define COPY_LOOP_BODY_XMM(src, dst, cnt) \ + prefetchnta 0x80(src); \ + prefetchnta 0xc0(src); \ + prefetchnta 0x100(src); \ + prefetchnta 0x140(src); \ + movntdq %xmm0, (dst); \ + movntdq %xmm1, 0x10(dst); \ + movntdq %xmm2, 0x20(dst); \ + movntdq %xmm3, 0x30(dst); \ + movdqa 0x0(src), %xmm0; \ + movdqa 0x10(src), %xmm1; \ + movntdq %xmm4, 0x40(dst); \ + movntdq %xmm5, 0x50(dst); \ + movdqa 0x20(src), %xmm2; \ + movdqa 0x30(src), %xmm3; \ + movntdq %xmm6, 0x60(dst); \ + movntdq %xmm7, 0x70(dst); \ + movdqa 0x40(src), %xmm4; \ + movdqa 0x50(src), %xmm5; \ + addq $0x80, dst; \ + movdqa 0x60(src), %xmm6; \ + movdqa 0x70(src), %xmm7; \ + addq $0x80, src; \ + subl $1, cnt + +#define COPY_LOOP_FINI_XMM(dst) \ + movntdq %xmm0, 0x0(dst); \ + movntdq %xmm1, 0x10(dst); \ + movntdq %xmm2, 0x20(dst); \ + movntdq %xmm3, 0x30(dst); \ + 
movntdq %xmm4, 0x40(dst); \ + movntdq %xmm5, 0x50(dst); \ + movntdq %xmm6, 0x60(dst); \ + movntdq %xmm7, 0x70(dst) + +#define RSTOR_XMMS(r) \ + movdqa 0x0(r), %xmm0; \ + movdqa 0x10(r), %xmm1; \ + movdqa 0x20(r), %xmm2; \ + movdqa 0x30(r), %xmm3; \ + movdqa 0x40(r), %xmm4; \ + movdqa 0x50(r), %xmm5; \ + movdqa 0x60(r), %xmm6; \ + movdqa 0x70(r), %xmm7; \ + RSTOR_XMM_EPILOG(r, 8) + + /* + * %rdi src + * %rsi dst + * %rdx #if DEBUG then curthread + * %ecx loop count + * %rax saved %cr0 (#if DEBUG then %eax is t->t_prempt) + * %r8 pointer to %xmm register save area + */ + ENTRY(hwblkpagecopy) + pushq %rbp + movq %rsp, %rbp + PREFETCH_START(%rdi) + /* + * PAGESIZE is 4096, each loop moves 128 bytes, but the initial + * load and final store save us on loop count + */ + movl $_CONST(32 - 1), %ecx + ASSERT_KPREEMPT_DISABLED(%rdx, %eax, .not_disabled) + movq %cr0, %rax + clts + testl $CR0_TS, %eax + jnz 3f + SAVE_XMMS(%r8) +3: COPY_LOOP_INIT_XMM(%rdi) +4: COPY_LOOP_BODY_XMM(%rdi, %rsi, %ecx) + jnz 4b + COPY_LOOP_FINI_XMM(%rsi) + testl $CR0_TS, %eax + jnz 5f + RSTOR_XMMS(%r8) +5: movq %rax, %cr0 + mfence + leave + ret + SET_SIZE(hwblkpagecopy) + + ENTRY(block_zero_no_xmm) + pushq %rbp + movq %rsp, %rbp + xorl %eax, %eax + addq %rsi, %rdi + negq %rsi +1: + movnti %rax, (%rdi, %rsi) + movnti %rax, 8(%rdi, %rsi) + movnti %rax, 16(%rdi, %rsi) + movnti %rax, 24(%rdi, %rsi) + addq $32, %rsi + jnz 1b + mfence + leave + ret + SET_SIZE(block_zero_no_xmm) + + + ENTRY(page_copy_no_xmm) + movq $MMU_STD_PAGESIZE, %rcx + addq %rcx, %rdi + addq %rcx, %rsi + negq %rcx +1: + movq (%rsi, %rcx), %rax + movnti %rax, (%rdi, %rcx) + movq 8(%rsi, %rcx), %rax + movnti %rax, 8(%rdi, %rcx) + movq 16(%rsi, %rcx), %rax + movnti %rax, 16(%rdi, %rcx) + movq 24(%rsi, %rcx), %rax + movnti %rax, 24(%rdi, %rcx) + addq $32, %rcx + jnz 1b + mfence + ret + SET_SIZE(page_copy_no_xmm) + +#if defined(DEBUG) + .text +.not_disabled: + .string "sseblk: preemption not disabled!" +#endif diff --git a/usr/src/uts/intel/ml/swtch.s b/usr/src/uts/intel/ml/swtch.s new file mode 100644 index 0000000000..c6c606b11e --- /dev/null +++ b/usr/src/uts/intel/ml/swtch.s @@ -0,0 +1,509 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2020 Joyent, Inc. + */ + +/* + * Process switching routines. + */ + +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/stack.h> +#include <sys/segments.h> +#include <sys/psw.h> + +#include "assym.h" + +/* + * resume(thread_id_t t); + * + * a thread can only run on one processor at a time. 
there + * exists a window on MPs where the current thread on one + * processor is capable of being dispatched by another processor. + * some overlap between outgoing and incoming threads can happen + * when they are the same thread. in this case where the threads + * are the same, resume() on one processor will spin on the incoming + * thread until resume() on the other processor has finished with + * the outgoing thread. + * + * The MMU context changes when the resuming thread resides in a different + * process. Kernel threads are known by resume to reside in process 0. + * The MMU context, therefore, only changes when resuming a thread in + * a process different from curproc. + * + * resume_from_intr() is called when the thread being resumed was not + * passivated by resume (e.g. was interrupted). This means that the + * resume lock is already held and that a restore context is not needed. + * Also, the MMU context is not changed on the resume in this case. + * + * resume_from_zombie() is the same as resume except the calling thread + * is a zombie and must be put on the deathrow list after the CPU is + * off the stack. + */ + +#if LWP_PCB_FPU != 0 +#error LWP_PCB_FPU MUST be defined as 0 for code in swtch.s to work +#endif /* LWP_PCB_FPU != 0 */ + +/* + * Save non-volatile regs other than %rsp (%rbx, %rbp, and %r12 - %r15) + * + * The stack frame must be created before the save of %rsp so that tracebacks + * of swtch()ed-out processes show the process as having last called swtch(). + */ +#define SAVE_REGS(thread_t, retaddr) \ + movq %rbp, T_RBP(thread_t); \ + movq %rbx, T_RBX(thread_t); \ + movq %r12, T_R12(thread_t); \ + movq %r13, T_R13(thread_t); \ + movq %r14, T_R14(thread_t); \ + movq %r15, T_R15(thread_t); \ + pushq %rbp; \ + movq %rsp, %rbp; \ + movq %rsp, T_SP(thread_t); \ + movq retaddr, T_PC(thread_t); \ + movq %rdi, %r12; \ + call __dtrace_probe___sched_off__cpu + +/* + * Restore non-volatile regs other than %rsp (%rbx, %rbp, and %r12 - %r15) + * + * We load up %rsp from the label_t as part of the context switch, so + * we don't repeat that here. + * + * We don't do a 'leave,' because reloading %rsp/%rbp from the label_t + * already has the effect of putting the stack back the way it was when + * we came in. + */ +#define RESTORE_REGS(scratch_reg) \ + movq %gs:CPU_THREAD, scratch_reg; \ + movq T_RBP(scratch_reg), %rbp; \ + movq T_RBX(scratch_reg), %rbx; \ + movq T_R12(scratch_reg), %r12; \ + movq T_R13(scratch_reg), %r13; \ + movq T_R14(scratch_reg), %r14; \ + movq T_R15(scratch_reg), %r15 + +/* + * Get pointer to a thread's hat structure + */ +#define GET_THREAD_HATP(hatp, thread_t, scratch_reg) \ + movq T_PROCP(thread_t), hatp; \ + movq P_AS(hatp), scratch_reg; \ + movq A_HAT(scratch_reg), hatp + +#define TSC_READ() \ + call tsc_read; \ + movq %rax, %r14; + +/* + * If we are resuming an interrupt thread, store a timestamp in the thread + * structure. If an interrupt occurs between tsc_read() and its subsequent + * store, the timestamp will be stale by the time it is stored. We can detect + * this by doing a compare-and-swap on the thread's timestamp, since any + * interrupt occurring in this window will put a new timestamp in the thread's + * t_intr_start field. 
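+ *
+ * In C-like terms the macro below does roughly the following (a sketch
+ * only; names mirror the assym offsets used in the assembly):
+ *
+ *	if (t->t_flag & T_INTR_THREAD) {
+ *		do {
+ *			new = tsc_read();
+ *			old = t->t_intr_start;
+ *		} while (atomic_cas_64(&t->t_intr_start, old, new) != old);
+ *	}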
+ */ +#define STORE_INTR_START(thread_t) \ + testw $T_INTR_THREAD, T_FLAGS(thread_t); \ + jz 1f; \ +0: \ + TSC_READ(); \ + movq T_INTR_START(thread_t), %rax; \ + cmpxchgq %r14, T_INTR_START(thread_t); \ + jnz 0b; \ +1: + + .global kpti_enable + + ENTRY(resume) + movq %gs:CPU_THREAD, %rax + leaq resume_return(%rip), %r11 + + /* + * Deal with SMAP here. A thread may be switched out at any point while + * it is executing. The thread could be under on_fault() or it could be + * pre-empted while performing a copy interruption. If this happens and + * we're not in the context of an interrupt which happens to handle + * saving and restoring rflags correctly, we may lose our SMAP related + * state. + * + * To handle this, as part of being switched out, we first save whether + * or not userland access is allowed ($PS_ACHK in rflags) and store that + * in t_useracc on the kthread_t and unconditionally enable SMAP to + * protect the system. + * + * Later, when the thread finishes resuming, we potentially disable smap + * if PS_ACHK was present in rflags. See uts/intel/ml/copy.s for + * more information on rflags and SMAP. + */ + pushfq + popq %rsi + andq $PS_ACHK, %rsi + movq %rsi, T_USERACC(%rax) + call smap_enable + + /* + * Take a moment to potentially clear the RSB buffer. This is done to + * prevent various Spectre variant 2 and SpectreRSB attacks. This may + * not be sufficient. Please see uts/intel/ml/retpoline.s for more + * information about this. + */ + call x86_rsb_stuff + + /* + * Save non-volatile registers, and set return address for current + * thread to resume_return. + * + * %r12 = t (new thread) when done + */ + SAVE_REGS(%rax, %r11) + + + LOADCPU(%r15) /* %r15 = CPU */ + movq CPU_THREAD(%r15), %r13 /* %r13 = curthread */ + + /* + * Call savectx if thread has installed context ops. + * + * Note that if we have floating point context, the save op + * (either fpsave_begin or fpxsave_begin) will issue the + * async save instruction (fnsave or fxsave respectively) + * that we fwait for below. + */ + cmpq $0, T_CTX(%r13) /* should current thread savectx? */ + je .nosavectx /* skip call when zero */ + + movq %r13, %rdi /* arg = thread pointer */ + call savectx /* call ctx ops */ +.nosavectx: + + /* + * Check that the curthread is not using the FPU while in the kernel. + */ + call kernel_fpu_no_swtch + + /* + * Call savepctx if process has installed context ops. + */ + movq T_PROCP(%r13), %r14 /* %r14 = proc */ + cmpq $0, P_PCTX(%r14) /* should current thread savepctx? */ + je .nosavepctx /* skip call when zero */ + + movq %r14, %rdi /* arg = proc pointer */ + call savepctx /* call ctx ops */ +.nosavepctx: + + /* + * Temporarily switch to the idle thread's stack + */ + movq CPU_IDLE_THREAD(%r15), %rax /* idle thread pointer */ + + /* + * Set the idle thread as the current thread + */ + movq T_SP(%rax), %rsp /* It is safe to set rsp */ + movq %rax, CPU_THREAD(%r15) + + /* + * Switch in the hat context for the new thread + * + */ + GET_THREAD_HATP(%rdi, %r12, %r11) + call hat_switch + + /* + * Clear and unlock previous thread's t_lock + * to allow it to be dispatched by another processor. + */ + movb $0, T_LOCK(%r13) + + /* + * IMPORTANT: Registers at this point must be: + * %r12 = new thread + * + * Here we are in the idle thread, have dropped the old thread. + */ + ALTENTRY(_resume_from_idle) + /* + * spin until dispatched thread's mutex has + * been unlocked. this mutex is unlocked when + * it becomes safe for the thread to run. 
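+ *
+ * the code below is the usual test-and-test-and-set pattern: a locked
+ * btsl atomically tries to set bit 0 of the new thread's t_lock, and if
+ * the bit was already set we spin on plain reads (with pause) until the
+ * byte looks clear before retrying the atomic operation.  roughly, in
+ * pseudo-code:
+ *
+ *	while (test-and-set of t_lock finds it already held)
+ *		while (t->t_lock != 0)
+ *			pause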
+ */ +.lock_thread_mutex: + lock + btsl $0, T_LOCK(%r12) /* attempt to lock new thread's mutex */ + jnc .thread_mutex_locked /* got it */ + +.spin_thread_mutex: + pause + cmpb $0, T_LOCK(%r12) /* check mutex status */ + jz .lock_thread_mutex /* clear, retry lock */ + jmp .spin_thread_mutex /* still locked, spin... */ + +.thread_mutex_locked: + /* + * Fix CPU structure to indicate new running thread. + * Set pointer in new thread to the CPU structure. + */ + LOADCPU(%r13) /* load current CPU pointer */ + cmpq %r13, T_CPU(%r12) + je .setup_cpu + + /* cp->cpu_stats.sys.cpumigrate++ */ + incq CPU_STATS_SYS_CPUMIGRATE(%r13) + movq %r13, T_CPU(%r12) /* set new thread's CPU pointer */ + +.setup_cpu: + /* + * Setup rsp0 (kernel stack) in TSS to curthread's saved regs + * structure. If this thread doesn't have a regs structure above + * the stack -- that is, if lwp_stk_init() was never called for the + * thread -- this will set rsp0 to the wrong value, but it's harmless + * as it's a kernel thread, and it won't actually attempt to implicitly + * use the rsp0 via a privilege change. + * + * Note that when we have KPTI enabled on amd64, we never use this + * value at all (since all the interrupts have an IST set). + */ + movq CPU_TSS(%r13), %r14 +#if !defined(__xpv) + cmpq $1, kpti_enable + jne 1f + leaq CPU_KPTI_TR_RSP(%r13), %rax + jmp 2f +1: + movq T_STACK(%r12), %rax + addq $REGSIZE+MINFRAME, %rax /* to the bottom of thread stack */ +2: + movq %rax, TSS_RSP0(%r14) +#else + movq T_STACK(%r12), %rax + addq $REGSIZE+MINFRAME, %rax /* to the bottom of thread stack */ + movl $KDS_SEL, %edi + movq %rax, %rsi + call HYPERVISOR_stack_switch +#endif /* __xpv */ + + movq %r12, CPU_THREAD(%r13) /* set CPU's thread pointer */ + mfence /* synchronize with mutex_exit() */ + xorl %ebp, %ebp /* make $<threadlist behave better */ + movq T_LWP(%r12), %rax /* set associated lwp to */ + movq %rax, CPU_LWP(%r13) /* CPU's lwp ptr */ + + movq T_SP(%r12), %rsp /* switch to outgoing thread's stack */ + movq T_PC(%r12), %r13 /* saved return addr */ + + /* + * Call restorectx if context ops have been installed. + */ + cmpq $0, T_CTX(%r12) /* should resumed thread restorectx? */ + jz .norestorectx /* skip call when zero */ + movq %r12, %rdi /* arg = thread pointer */ + call restorectx /* call ctx ops */ +.norestorectx: + + /* + * Call restorepctx if context ops have been installed for the proc. + */ + movq T_PROCP(%r12), %rcx + cmpq $0, P_PCTX(%rcx) + jz .norestorepctx + movq %rcx, %rdi + call restorepctx +.norestorepctx: + + STORE_INTR_START(%r12) + + /* + * If we came into swtch with the ability to access userland pages, go + * ahead and restore that fact by disabling SMAP. Clear the indicator + * flag out of paranoia. + */ + movq T_USERACC(%r12), %rax /* should we disable smap? */ + cmpq $0, %rax /* skip call when zero */ + jz .nosmap + xorq %rax, %rax + movq %rax, T_USERACC(%r12) + call smap_disable +.nosmap: + + call smt_mark + + /* + * Restore non-volatile registers, then have spl0 return to the + * resuming thread's PC after first setting the priority as low as + * possible and blocking all interrupt threads that may be active. 
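+ *
+ * Mechanically, the saved PC is pushed onto the resuming thread's stack
+ * and control jumps (rather than calls) to spl0, so when spl0 eventually
+ * executes ret it lands directly at that PC in the resumed thread, with
+ * the non-volatile registers already restored.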
+ */ + movq %r13, %rax /* save return address */ + RESTORE_REGS(%r11) + pushq %rax /* push return address for spl0() */ + call __dtrace_probe___sched_on__cpu + jmp spl0 + +resume_return: + /* + * Remove stack frame created in SAVE_REGS() + */ + addq $CLONGSIZE, %rsp + ret + SET_SIZE(_resume_from_idle) + SET_SIZE(resume) + + ENTRY(resume_from_zombie) + movq %gs:CPU_THREAD, %rax + leaq resume_from_zombie_return(%rip), %r11 + + /* + * Save non-volatile registers, and set return address for current + * thread to resume_from_zombie_return. + * + * %r12 = t (new thread) when done + */ + SAVE_REGS(%rax, %r11) + + movq %gs:CPU_THREAD, %r13 /* %r13 = curthread */ + + /* clean up the fp unit. It might be left enabled */ + +#if defined(__xpv) /* XXPV XXtclayton */ + /* + * Remove this after bringup. + * (Too many #gp's for an instrumented hypervisor.) + */ + STTS(%rax) +#else + movq %cr0, %rax + testq $CR0_TS, %rax + jnz .zfpu_disabled /* if TS already set, nothing to do */ + fninit /* init fpu & discard pending error */ + orq $CR0_TS, %rax + movq %rax, %cr0 +.zfpu_disabled: + +#endif /* __xpv */ + + /* + * Temporarily switch to the idle thread's stack so that the zombie + * thread's stack can be reclaimed by the reaper. + */ + movq %gs:CPU_IDLE_THREAD, %rax /* idle thread pointer */ + movq T_SP(%rax), %rsp /* get onto idle thread stack */ + + /* + * Sigh. If the idle thread has never run thread_start() + * then t_sp is mis-aligned by thread_load(). + */ + andq $_BITNOT(STACK_ALIGN-1), %rsp + + /* + * Set the idle thread as the current thread. + */ + movq %rax, %gs:CPU_THREAD + + /* switch in the hat context for the new thread */ + GET_THREAD_HATP(%rdi, %r12, %r11) + call hat_switch + + /* + * Put the zombie on death-row. + */ + movq %r13, %rdi + call reapq_add + + jmp _resume_from_idle /* finish job of resume */ + +resume_from_zombie_return: + RESTORE_REGS(%r11) /* restore non-volatile registers */ + call __dtrace_probe___sched_on__cpu + + /* + * Remove stack frame created in SAVE_REGS() + */ + addq $CLONGSIZE, %rsp + ret + SET_SIZE(resume_from_zombie) + + ENTRY(resume_from_intr) + movq %gs:CPU_THREAD, %rax + leaq resume_from_intr_return(%rip), %r11 + + /* + * Save non-volatile registers, and set return address for current + * thread to resume_from_intr_return. + * + * %r12 = t (new thread) when done + */ + SAVE_REGS(%rax, %r11) + + movq %gs:CPU_THREAD, %r13 /* %r13 = curthread */ + movq %r12, %gs:CPU_THREAD /* set CPU's thread pointer */ + mfence /* synchronize with mutex_exit() */ + movq T_SP(%r12), %rsp /* restore resuming thread's sp */ + xorl %ebp, %ebp /* make $<threadlist behave better */ + + /* + * Unlock outgoing thread's mutex dispatched by another processor. + */ + xorl %eax, %eax + xchgb %al, T_LOCK(%r13) + + STORE_INTR_START(%r12) + + call smt_mark + + /* + * Restore non-volatile registers, then have spl0 return to the + * resuming thread's PC after first setting the priority as low as + * possible and blocking all interrupt threads that may be active. + */ + movq T_PC(%r12), %rax /* saved return addr */ + RESTORE_REGS(%r11); + pushq %rax /* push return address for spl0() */ + call __dtrace_probe___sched_on__cpu + jmp spl0 + +resume_from_intr_return: + /* + * Remove stack frame created in SAVE_REGS() + */ + addq $CLONGSIZE, %rsp + ret + SET_SIZE(resume_from_intr) + + ENTRY(thread_start) + popq %rax /* start() */ + popq %rdi /* arg */ + popq %rsi /* len */ + movq %rsp, %rbp + INDIRECT_CALL_REG(rax) + call thread_exit /* destroy thread if it returns. 
*/ + /*NOTREACHED*/ + SET_SIZE(thread_start)