author    Richard Lowe <richlowe@richlowe.net>  2021-06-04 15:15:12 -0500
committer Richard Lowe <richlowe@richlowe.net>  2021-08-16 12:46:39 -0500
commit    f0089e391b2bc4be2755f1a1b51fb4cd9b8f3988 (patch)
tree      c4ac2f5e703ed459d50bcee7ddb38a993d961520 /usr/src/uts/intel/os
parent    d083fed0c91296a88878f7a468910ad5b5c888ea (diff)
download  illumos-gate-f0089e391b2bc4be2755f1a1b51fb4cd9b8f3988.tar.gz
13941 intel code and headers should not look ia32 specific
Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Patrick Mooney <pmooney@pfmooney.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src/uts/intel/os')
-rw-r--r--  usr/src/uts/intel/os/archdep.c          1240
-rw-r--r--  usr/src/uts/intel/os/bootdev.c           100
-rw-r--r--  usr/src/uts/intel/os/comm_page_util.c     62
-rw-r--r--  usr/src/uts/intel/os/copy_subr.c         102
-rw-r--r--  usr/src/uts/intel/os/cpc_subr.c          274
-rw-r--r--  usr/src/uts/intel/os/ddi_i86.c          1903
-rw-r--r--  usr/src/uts/intel/os/desctbls.c         1218
-rw-r--r--  usr/src/uts/intel/os/fpu.c              1506
-rw-r--r--  usr/src/uts/intel/os/sendsig.c           589
-rw-r--r--  usr/src/uts/intel/os/sundep.c           1012
-rw-r--r--  usr/src/uts/intel/os/syscall.c          1397
-rw-r--r--  usr/src/uts/intel/os/sysi86.c            850
12 files changed, 10253 insertions, 0 deletions
diff --git a/usr/src/uts/intel/os/archdep.c b/usr/src/uts/intel/os/archdep.c
new file mode 100644
index 0000000000..14d20bb487
--- /dev/null
+++ b/usr/src/uts/intel/os/archdep.c
@@ -0,0 +1,1240 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/vmparam.h>
+#include <sys/systm.h>
+#include <sys/signal.h>
+#include <sys/stack.h>
+#include <sys/regset.h>
+#include <sys/privregs.h>
+#include <sys/frame.h>
+#include <sys/proc.h>
+#include <sys/psw.h>
+#include <sys/siginfo.h>
+#include <sys/cpuvar.h>
+#include <sys/asm_linkage.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/bootconf.h>
+#include <sys/archsystm.h>
+#include <sys/debug.h>
+#include <sys/elf.h>
+#include <sys/spl.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/modctl.h>
+#include <sys/kobj.h>
+#include <sys/panic.h>
+#include <sys/reboot.h>
+#include <sys/time.h>
+#include <sys/fp.h>
+#include <sys/x86_archext.h>
+#include <sys/auxv.h>
+#include <sys/auxv_386.h>
+#include <sys/dtrace.h>
+#include <sys/brand.h>
+#include <sys/machbrand.h>
+#include <sys/cmn_err.h>
+
+/*
+ * Map an fnsave-formatted save area into an fxsave-formatted save area.
+ *
+ * Most fields are the same width, content and semantics. However
+ * the tag word is compressed.
+ */
+static void
+fnsave_to_fxsave(const struct fnsave_state *fn, struct fxsave_state *fx)
+{
+ uint_t i, tagbits;
+
+ fx->fx_fcw = fn->f_fcw;
+ fx->fx_fsw = fn->f_fsw;
+
+ /*
+ * copy element by element (because of holes)
+ */
+ for (i = 0; i < 8; i++)
+ bcopy(&fn->f_st[i].fpr_16[0], &fx->fx_st[i].fpr_16[0],
+ sizeof (fn->f_st[0].fpr_16)); /* 80-bit x87-style floats */
+
+ /*
+ * synthesize compressed tag bits
+ */
+ fx->fx_fctw = 0;
+ for (tagbits = fn->f_ftw, i = 0; i < 8; i++, tagbits >>= 2)
+ if ((tagbits & 3) != 3)
+ fx->fx_fctw |= (1 << i);
+
+ fx->fx_fop = fn->f_fop;
+
+ fx->fx_rip = (uint64_t)fn->f_eip;
+ fx->fx_rdp = (uint64_t)fn->f_dp;
+}
+
+/*
+ * Map from an fxsave-format save area to an fnsave-format save area.
+ */
+static void
+fxsave_to_fnsave(const struct fxsave_state *fx, struct fnsave_state *fn)
+{
+ uint_t i, top, tagbits;
+
+ fn->f_fcw = fx->fx_fcw;
+ fn->__f_ign0 = 0;
+ fn->f_fsw = fx->fx_fsw;
+ fn->__f_ign1 = 0;
+
+ top = (fx->fx_fsw & FPS_TOP) >> 11;
+
+ /*
+ * copy element by element (because of holes)
+ */
+ for (i = 0; i < 8; i++)
+ bcopy(&fx->fx_st[i].fpr_16[0], &fn->f_st[i].fpr_16[0],
+ sizeof (fn->f_st[0].fpr_16)); /* 80-bit x87-style floats */
+
+ /*
+ * synthesize uncompressed tag bits
+ */
+ fn->f_ftw = 0;
+ for (tagbits = fx->fx_fctw, i = 0; i < 8; i++, tagbits >>= 1) {
+ uint_t ibit, expo;
+ const uint16_t *fpp;
+ static const uint16_t zero[5] = { 0, 0, 0, 0, 0 };
+
+ if ((tagbits & 1) == 0) {
+ fn->f_ftw |= 3 << (i << 1); /* empty */
+ continue;
+ }
+
+ /*
+ * (tags refer to *physical* registers)
+ */
+ fpp = &fx->fx_st[(i - top + 8) & 7].fpr_16[0];
+ ibit = fpp[3] >> 15;
+ expo = fpp[4] & 0x7fff;
+
+ if (ibit && expo != 0 && expo != 0x7fff)
+ continue; /* valid fp number */
+
+ if (bcmp(fpp, &zero, sizeof (zero)))
+ fn->f_ftw |= 2 << (i << 1); /* NaN */
+ else
+ fn->f_ftw |= 1 << (i << 1); /* fp zero */
+ }
+
+ fn->f_fop = fx->fx_fop;
+
+ fn->__f_ign2 = 0;
+ fn->f_eip = (uint32_t)fx->fx_rip;
+ fn->f_cs = U32CS_SEL;
+ fn->f_dp = (uint32_t)fx->fx_rdp;
+ fn->f_ds = UDS_SEL;
+ fn->__f_ign3 = 0;
+}
+
+/*
+ * Map from an fpregset_t into an fxsave-format save area
+ */
+static void
+fpregset_to_fxsave(const fpregset_t *fp, struct fxsave_state *fx)
+{
+ bcopy(fp, fx, sizeof (*fx));
+ /*
+ * avoid useless #gp exceptions - mask reserved bits
+ */
+ fx->fx_mxcsr &= sse_mxcsr_mask;
+}
+
+/*
+ * Map from an fxsave-format save area into a fpregset_t
+ */
+static void
+fxsave_to_fpregset(const struct fxsave_state *fx, fpregset_t *fp)
+{
+ bcopy(fx, fp, sizeof (*fx));
+}
+
+#if defined(_SYSCALL32_IMPL)
+static void
+fpregset32_to_fxsave(const fpregset32_t *fp, struct fxsave_state *fx)
+{
+ const struct fpchip32_state *fc = &fp->fp_reg_set.fpchip_state;
+
+ fnsave_to_fxsave((const struct fnsave_state *)fc, fx);
+ /*
+ * avoid useless #gp exceptions - mask reserved bits
+ */
+ fx->fx_mxcsr = sse_mxcsr_mask & fc->mxcsr;
+ bcopy(&fc->xmm[0], &fx->fx_xmm[0], sizeof (fc->xmm));
+}
+
+static void
+fxsave_to_fpregset32(const struct fxsave_state *fx, fpregset32_t *fp)
+{
+ struct fpchip32_state *fc = &fp->fp_reg_set.fpchip_state;
+
+ fxsave_to_fnsave(fx, (struct fnsave_state *)fc);
+ fc->mxcsr = fx->fx_mxcsr;
+ bcopy(&fx->fx_xmm[0], &fc->xmm[0], sizeof (fc->xmm));
+}
+
+static void
+fpregset_nto32(const fpregset_t *src, fpregset32_t *dst)
+{
+ fxsave_to_fpregset32((struct fxsave_state *)src, dst);
+ dst->fp_reg_set.fpchip_state.status =
+ src->fp_reg_set.fpchip_state.status;
+ dst->fp_reg_set.fpchip_state.xstatus =
+ src->fp_reg_set.fpchip_state.xstatus;
+}
+
+static void
+fpregset_32ton(const fpregset32_t *src, fpregset_t *dst)
+{
+ fpregset32_to_fxsave(src, (struct fxsave_state *)dst);
+ dst->fp_reg_set.fpchip_state.status =
+ src->fp_reg_set.fpchip_state.status;
+ dst->fp_reg_set.fpchip_state.xstatus =
+ src->fp_reg_set.fpchip_state.xstatus;
+}
+#endif
+
+/*
+ * Set floating-point registers from a native fpregset_t.
+ */
+void
+setfpregs(klwp_t *lwp, fpregset_t *fp)
+{
+ struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
+
+ if (fpu->fpu_flags & FPU_EN) {
+ if (!(fpu->fpu_flags & FPU_VALID)) {
+ /*
+ * FPU context is still active, release the
+ * ownership.
+ */
+ fp_free(fpu, 0);
+ }
+ }
+ /*
+ * Else: if we are trying to change the FPU state of a thread which
+ * hasn't yet initialized floating point, store the state in
+ * the pcb and indicate that the state is valid. When the
+ * thread enables floating point, it will use this state instead
+ * of the default state.
+ */
+
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ fpregset_to_fxsave(fp, fpu->fpu_regs.kfpu_u.kfpu_fx);
+ fpu->fpu_regs.kfpu_xstatus =
+ fp->fp_reg_set.fpchip_state.xstatus;
+ break;
+
+ case FP_XSAVE:
+ fpregset_to_fxsave(fp,
+ &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave);
+ fpu->fpu_regs.kfpu_xstatus =
+ fp->fp_reg_set.fpchip_state.xstatus;
+ fpu->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |=
+ (XFEATURE_LEGACY_FP | XFEATURE_SSE);
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+
+ fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status;
+ fpu->fpu_flags |= FPU_VALID;
+ PCB_SET_UPDATE_FPU(&lwp->lwp_pcb);
+}
+
+/*
+ * Get floating-point registers into a native fpregset_t.
+ */
+void
+getfpregs(klwp_t *lwp, fpregset_t *fp)
+{
+ struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu;
+
+ kpreempt_disable();
+ if (fpu->fpu_flags & FPU_EN) {
+ /*
+ * If we have FPU hw and the thread's pcb doesn't have
+ * a valid FPU state then get the state from the hw.
+ */
+ if (fpu_exists && ttolwp(curthread) == lwp &&
+ !(fpu->fpu_flags & FPU_VALID))
+ fp_save(fpu); /* get the current FPU state */
+ }
+
+ /*
+ * There are 3 possible cases we have to be aware of here:
+ *
+ * 1. FPU is enabled. FPU state is stored in the current LWP.
+ *
+ * 2. FPU is not enabled, and there have been no intervening /proc
+ * modifications. Return initial FPU state.
+ *
+ * 3. FPU is not enabled, but a /proc consumer has modified FPU state.
+ * FPU state is stored in the current LWP.
+ */
+ if ((fpu->fpu_flags & FPU_EN) || (fpu->fpu_flags & FPU_VALID)) {
+ /*
+ * Cases 1 and 3.
+ */
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ fxsave_to_fpregset(fpu->fpu_regs.kfpu_u.kfpu_fx, fp);
+ fp->fp_reg_set.fpchip_state.xstatus =
+ fpu->fpu_regs.kfpu_xstatus;
+ break;
+ case FP_XSAVE:
+ fxsave_to_fpregset(
+ &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave, fp);
+ fp->fp_reg_set.fpchip_state.xstatus =
+ fpu->fpu_regs.kfpu_xstatus;
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+ fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
+ } else {
+ /*
+ * Case 2.
+ */
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ case FP_XSAVE:
+ /*
+			 * For now, we don't have any AVX-specific fields
+			 * in the ABI. If we add any in the future, we need
+			 * to initialize them as well.
+ */
+ fxsave_to_fpregset(&sse_initial, fp);
+ fp->fp_reg_set.fpchip_state.xstatus =
+ fpu->fpu_regs.kfpu_xstatus;
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+ fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status;
+ }
+ kpreempt_enable();
+}
+
+#if defined(_SYSCALL32_IMPL)
+
+/*
+ * Set floating-point registers from an fpregset32_t.
+ */
+void
+setfpregs32(klwp_t *lwp, fpregset32_t *fp)
+{
+ fpregset_t fpregs;
+
+ fpregset_32ton(fp, &fpregs);
+ setfpregs(lwp, &fpregs);
+}
+
+/*
+ * Get floating-point registers into an fpregset32_t.
+ */
+void
+getfpregs32(klwp_t *lwp, fpregset32_t *fp)
+{
+ fpregset_t fpregs;
+
+ getfpregs(lwp, &fpregs);
+ fpregset_nto32(&fpregs, fp);
+}
+
+#endif /* _SYSCALL32_IMPL */
+
+/*
+ * Return the general registers
+ */
+void
+getgregs(klwp_t *lwp, gregset_t grp)
+{
+ struct regs *rp = lwptoregs(lwp);
+ struct pcb *pcb = &lwp->lwp_pcb;
+ int thisthread = lwptot(lwp) == curthread;
+
+ grp[REG_RDI] = rp->r_rdi;
+ grp[REG_RSI] = rp->r_rsi;
+ grp[REG_RDX] = rp->r_rdx;
+ grp[REG_RCX] = rp->r_rcx;
+ grp[REG_R8] = rp->r_r8;
+ grp[REG_R9] = rp->r_r9;
+ grp[REG_RAX] = rp->r_rax;
+ grp[REG_RBX] = rp->r_rbx;
+ grp[REG_RBP] = rp->r_rbp;
+ grp[REG_R10] = rp->r_r10;
+ grp[REG_R11] = rp->r_r11;
+ grp[REG_R12] = rp->r_r12;
+ grp[REG_R13] = rp->r_r13;
+ grp[REG_R14] = rp->r_r14;
+ grp[REG_R15] = rp->r_r15;
+ grp[REG_FSBASE] = pcb->pcb_fsbase;
+ grp[REG_GSBASE] = pcb->pcb_gsbase;
+ if (thisthread)
+ kpreempt_disable();
+ if (PCB_NEED_UPDATE_SEGS(pcb)) {
+ grp[REG_DS] = pcb->pcb_ds;
+ grp[REG_ES] = pcb->pcb_es;
+ grp[REG_FS] = pcb->pcb_fs;
+ grp[REG_GS] = pcb->pcb_gs;
+ } else {
+ grp[REG_DS] = rp->r_ds;
+ grp[REG_ES] = rp->r_es;
+ grp[REG_FS] = rp->r_fs;
+ grp[REG_GS] = rp->r_gs;
+ }
+ if (thisthread)
+ kpreempt_enable();
+ grp[REG_TRAPNO] = rp->r_trapno;
+ grp[REG_ERR] = rp->r_err;
+ grp[REG_RIP] = rp->r_rip;
+ grp[REG_CS] = rp->r_cs;
+ grp[REG_SS] = rp->r_ss;
+ grp[REG_RFL] = rp->r_rfl;
+ grp[REG_RSP] = rp->r_rsp;
+}
+
+#if defined(_SYSCALL32_IMPL)
+
+void
+getgregs32(klwp_t *lwp, gregset32_t grp)
+{
+ struct regs *rp = lwptoregs(lwp);
+ struct pcb *pcb = &lwp->lwp_pcb;
+ int thisthread = lwptot(lwp) == curthread;
+
+ if (thisthread)
+ kpreempt_disable();
+ if (PCB_NEED_UPDATE_SEGS(pcb)) {
+ grp[GS] = (uint16_t)pcb->pcb_gs;
+ grp[FS] = (uint16_t)pcb->pcb_fs;
+ grp[DS] = (uint16_t)pcb->pcb_ds;
+ grp[ES] = (uint16_t)pcb->pcb_es;
+ } else {
+ grp[GS] = (uint16_t)rp->r_gs;
+ grp[FS] = (uint16_t)rp->r_fs;
+ grp[DS] = (uint16_t)rp->r_ds;
+ grp[ES] = (uint16_t)rp->r_es;
+ }
+ if (thisthread)
+ kpreempt_enable();
+ grp[EDI] = (greg32_t)rp->r_rdi;
+ grp[ESI] = (greg32_t)rp->r_rsi;
+ grp[EBP] = (greg32_t)rp->r_rbp;
+ grp[ESP] = 0;
+ grp[EBX] = (greg32_t)rp->r_rbx;
+ grp[EDX] = (greg32_t)rp->r_rdx;
+ grp[ECX] = (greg32_t)rp->r_rcx;
+ grp[EAX] = (greg32_t)rp->r_rax;
+ grp[TRAPNO] = (greg32_t)rp->r_trapno;
+ grp[ERR] = (greg32_t)rp->r_err;
+ grp[EIP] = (greg32_t)rp->r_rip;
+ grp[CS] = (uint16_t)rp->r_cs;
+ grp[EFL] = (greg32_t)rp->r_rfl;
+ grp[UESP] = (greg32_t)rp->r_rsp;
+ grp[SS] = (uint16_t)rp->r_ss;
+}
+
+void
+ucontext_32ton(const ucontext32_t *src, ucontext_t *dst)
+{
+ mcontext_t *dmc = &dst->uc_mcontext;
+ const mcontext32_t *smc = &src->uc_mcontext;
+
+ bzero(dst, sizeof (*dst));
+ dst->uc_flags = src->uc_flags;
+ dst->uc_link = (ucontext_t *)(uintptr_t)src->uc_link;
+
+ bcopy(&src->uc_sigmask, &dst->uc_sigmask, sizeof (dst->uc_sigmask));
+
+ dst->uc_stack.ss_sp = (void *)(uintptr_t)src->uc_stack.ss_sp;
+ dst->uc_stack.ss_size = (size_t)src->uc_stack.ss_size;
+ dst->uc_stack.ss_flags = src->uc_stack.ss_flags;
+
+ dmc->gregs[REG_GS] = (greg_t)(uint32_t)smc->gregs[GS];
+ dmc->gregs[REG_FS] = (greg_t)(uint32_t)smc->gregs[FS];
+ dmc->gregs[REG_ES] = (greg_t)(uint32_t)smc->gregs[ES];
+ dmc->gregs[REG_DS] = (greg_t)(uint32_t)smc->gregs[DS];
+ dmc->gregs[REG_RDI] = (greg_t)(uint32_t)smc->gregs[EDI];
+ dmc->gregs[REG_RSI] = (greg_t)(uint32_t)smc->gregs[ESI];
+ dmc->gregs[REG_RBP] = (greg_t)(uint32_t)smc->gregs[EBP];
+ dmc->gregs[REG_RBX] = (greg_t)(uint32_t)smc->gregs[EBX];
+ dmc->gregs[REG_RDX] = (greg_t)(uint32_t)smc->gregs[EDX];
+ dmc->gregs[REG_RCX] = (greg_t)(uint32_t)smc->gregs[ECX];
+ dmc->gregs[REG_RAX] = (greg_t)(uint32_t)smc->gregs[EAX];
+ dmc->gregs[REG_TRAPNO] = (greg_t)(uint32_t)smc->gregs[TRAPNO];
+ dmc->gregs[REG_ERR] = (greg_t)(uint32_t)smc->gregs[ERR];
+ dmc->gregs[REG_RIP] = (greg_t)(uint32_t)smc->gregs[EIP];
+ dmc->gregs[REG_CS] = (greg_t)(uint32_t)smc->gregs[CS];
+ dmc->gregs[REG_RFL] = (greg_t)(uint32_t)smc->gregs[EFL];
+ dmc->gregs[REG_RSP] = (greg_t)(uint32_t)smc->gregs[UESP];
+ dmc->gregs[REG_SS] = (greg_t)(uint32_t)smc->gregs[SS];
+
+ /*
+ * A valid fpregs is only copied in if uc.uc_flags has UC_FPU set
+ * otherwise there is no guarantee that anything in fpregs is valid.
+ */
+ if (src->uc_flags & UC_FPU)
+ fpregset_32ton(&src->uc_mcontext.fpregs,
+ &dst->uc_mcontext.fpregs);
+}
+
+#endif /* _SYSCALL32_IMPL */
+
+/*
+ * Return the user-level PC.
+ * If in a system call, return the address of the syscall trap.
+ */
+greg_t
+getuserpc()
+{
+ greg_t upc = lwptoregs(ttolwp(curthread))->r_pc;
+ uint32_t insn;
+
+ if (curthread->t_sysnum == 0)
+ return (upc);
+
+ /*
+ * We might've gotten here from sysenter (0xf 0x34),
+ * syscall (0xf 0x5) or lcall (0x9a 0 0 0 0 0x27 0).
+ *
+ * Go peek at the binary to figure it out..
+ */
+ if (fuword32((void *)(upc - 2), &insn) != -1 &&
+ (insn & 0xffff) == 0x340f || (insn & 0xffff) == 0x050f)
+ return (upc - 2);
+ return (upc - 7);
+}
+
+/*
+ * Protect segment registers from non-user privilege levels and GDT selectors
+ * other than USER_CS, USER_DS and lwp FS and GS values. If the segment
+ * selector is non-null and not USER_CS/USER_DS, we make sure that the
+ * TI bit is set to point into the LDT and that the RPL is set to 3.
+ *
+ * Since struct regs stores each 16-bit segment register as a 32-bit greg_t, we
+ * also explicitly zero the top 16 bits since they may be coming from the
+ * user's address space via setcontext(2) or /proc.
+ *
+ * Note about the null selector: when running on the hypervisor, if we allow a
+ * process to set its %cs to the null selector with an RPL of 0, the hypervisor
+ * will crash the domain. If running on bare metal we would get a #gp fault and
+ * be able to kill the process and continue on. Therefore we make sure to
+ * force RPL to SEL_UPL even for the null selector when setting %cs.
+ */
+
+#if defined(IS_CS) || defined(IS_NOT_CS)
+#error "IS_CS and IS_NOT_CS already defined"
+#endif
+
+#define IS_CS 1
+#define IS_NOT_CS 0
+
+/*ARGSUSED*/
+static greg_t
+fix_segreg(greg_t sr, int iscs, model_t datamodel)
+{
+ switch (sr &= 0xffff) {
+
+ case 0:
+ if (iscs == IS_CS)
+ return (0 | SEL_UPL);
+ else
+ return (0);
+
+ /*
+ * If lwp attempts to switch data model then force their
+ * code selector to be null selector.
+ */
+ case U32CS_SEL:
+ if (datamodel == DATAMODEL_NATIVE)
+ return (0 | SEL_UPL);
+ else
+ return (sr);
+
+ case UCS_SEL:
+ if (datamodel == DATAMODEL_ILP32)
+ return (0 | SEL_UPL);
+ /*FALLTHROUGH*/
+ case UDS_SEL:
+ case LWPFS_SEL:
+ case LWPGS_SEL:
+ case SEL_UPL:
+ return (sr);
+ default:
+ break;
+ }
+
+ /*
+ * Force it into the LDT in ring 3 for 32-bit processes, which by
+ * default do not have an LDT, so that any attempt to use an invalid
+ * selector will reference the (non-existent) LDT, and cause a #gp
+ * fault for the process.
+ *
+ * 64-bit processes get the null gdt selector since they
+ * are not allowed to have a private LDT.
+ */
+ if (datamodel == DATAMODEL_ILP32) {
+ return (sr | SEL_TI_LDT | SEL_UPL);
+ } else {
+ if (iscs == IS_CS)
+ return (0 | SEL_UPL);
+ else
+ return (0);
+ }
+
+}
+
+/*
+ * Set general registers.
+ */
+void
+setgregs(klwp_t *lwp, gregset_t grp)
+{
+ struct regs *rp = lwptoregs(lwp);
+ model_t datamodel = lwp_getdatamodel(lwp);
+
+ struct pcb *pcb = &lwp->lwp_pcb;
+ int thisthread = lwptot(lwp) == curthread;
+
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (thisthread)
+ (void) save_syscall_args(); /* copy the args */
+
+ rp->r_rdi = grp[REG_RDI];
+ rp->r_rsi = grp[REG_RSI];
+ rp->r_rdx = grp[REG_RDX];
+ rp->r_rcx = grp[REG_RCX];
+ rp->r_r8 = grp[REG_R8];
+ rp->r_r9 = grp[REG_R9];
+ rp->r_rax = grp[REG_RAX];
+ rp->r_rbx = grp[REG_RBX];
+ rp->r_rbp = grp[REG_RBP];
+ rp->r_r10 = grp[REG_R10];
+ rp->r_r11 = grp[REG_R11];
+ rp->r_r12 = grp[REG_R12];
+ rp->r_r13 = grp[REG_R13];
+ rp->r_r14 = grp[REG_R14];
+ rp->r_r15 = grp[REG_R15];
+ rp->r_trapno = grp[REG_TRAPNO];
+ rp->r_err = grp[REG_ERR];
+ rp->r_rip = grp[REG_RIP];
+ /*
+ * Setting %cs or %ss to anything else is quietly but
+ * quite definitely forbidden!
+ */
+ rp->r_cs = UCS_SEL;
+ rp->r_ss = UDS_SEL;
+ rp->r_rsp = grp[REG_RSP];
+
+ if (thisthread)
+ kpreempt_disable();
+
+ pcb->pcb_ds = UDS_SEL;
+ pcb->pcb_es = UDS_SEL;
+
+ /*
+ * 64-bit processes -are- allowed to set their fsbase/gsbase
+ * values directly, but only if they're using the segment
+ * selectors that allow that semantic.
+ *
+ * (32-bit processes must use lwp_set_private().)
+ */
+ pcb->pcb_fsbase = grp[REG_FSBASE];
+ pcb->pcb_gsbase = grp[REG_GSBASE];
+ pcb->pcb_fs = fix_segreg(grp[REG_FS], IS_NOT_CS, datamodel);
+ pcb->pcb_gs = fix_segreg(grp[REG_GS], IS_NOT_CS, datamodel);
+
+ /*
+ * Ensure that we go out via update_sregs
+ */
+ PCB_SET_UPDATE_SEGS(pcb);
+ lwptot(lwp)->t_post_sys = 1;
+ if (thisthread)
+ kpreempt_enable();
+#if defined(_SYSCALL32_IMPL)
+ } else {
+ rp->r_rdi = (uint32_t)grp[REG_RDI];
+ rp->r_rsi = (uint32_t)grp[REG_RSI];
+ rp->r_rdx = (uint32_t)grp[REG_RDX];
+ rp->r_rcx = (uint32_t)grp[REG_RCX];
+ rp->r_rax = (uint32_t)grp[REG_RAX];
+ rp->r_rbx = (uint32_t)grp[REG_RBX];
+ rp->r_rbp = (uint32_t)grp[REG_RBP];
+ rp->r_trapno = (uint32_t)grp[REG_TRAPNO];
+ rp->r_err = (uint32_t)grp[REG_ERR];
+ rp->r_rip = (uint32_t)grp[REG_RIP];
+
+ rp->r_cs = fix_segreg(grp[REG_CS], IS_CS, datamodel);
+ rp->r_ss = fix_segreg(grp[REG_DS], IS_NOT_CS, datamodel);
+
+ rp->r_rsp = (uint32_t)grp[REG_RSP];
+
+ if (thisthread)
+ kpreempt_disable();
+
+ pcb->pcb_ds = fix_segreg(grp[REG_DS], IS_NOT_CS, datamodel);
+ pcb->pcb_es = fix_segreg(grp[REG_ES], IS_NOT_CS, datamodel);
+
+ /*
+ * (See fsbase/gsbase commentary above)
+ */
+ pcb->pcb_fs = fix_segreg(grp[REG_FS], IS_NOT_CS, datamodel);
+ pcb->pcb_gs = fix_segreg(grp[REG_GS], IS_NOT_CS, datamodel);
+
+ /*
+ * Ensure that we go out via update_sregs
+ */
+ PCB_SET_UPDATE_SEGS(pcb);
+ lwptot(lwp)->t_post_sys = 1;
+ if (thisthread)
+ kpreempt_enable();
+#endif
+ }
+
+ /*
+ * Only certain bits of the flags register can be modified.
+ */
+ rp->r_rfl = (rp->r_rfl & ~PSL_USERMASK) |
+ (grp[REG_RFL] & PSL_USERMASK);
+}
+
+/*
+ * Determine whether eip is likely to have an interrupt frame
+ * on the stack. We do this by comparing the address to the
+ * range of addresses spanned by several well-known routines.
+ */
+extern void _interrupt();
+extern void _allsyscalls();
+extern void _cmntrap();
+extern void fakesoftint();
+
+extern size_t _interrupt_size;
+extern size_t _allsyscalls_size;
+extern size_t _cmntrap_size;
+extern size_t _fakesoftint_size;
+
+/*
+ * Get a pc-only stacktrace. Used for kmem_alloc() buffer ownership tracking.
+ * Returns MIN(current stack depth, pcstack_limit).
+ */
+int
+getpcstack(pc_t *pcstack, int pcstack_limit)
+{
+ struct frame *fp = (struct frame *)getfp();
+ struct frame *nextfp, *minfp, *stacktop;
+ int depth = 0;
+ int on_intr;
+ uintptr_t pc;
+
+ if ((on_intr = CPU_ON_INTR(CPU)) != 0)
+ stacktop = (struct frame *)(CPU->cpu_intr_stack + SA(MINFRAME));
+ else
+ stacktop = (struct frame *)curthread->t_stk;
+ minfp = fp;
+
+ pc = ((struct regs *)fp)->r_pc;
+
+ while (depth < pcstack_limit) {
+ nextfp = (struct frame *)fp->fr_savfp;
+ pc = fp->fr_savpc;
+ if (nextfp <= minfp || nextfp >= stacktop) {
+ if (on_intr) {
+ /*
+ * Hop from interrupt stack to thread stack.
+ */
+ stacktop = (struct frame *)curthread->t_stk;
+ minfp = (struct frame *)curthread->t_stkbase;
+ on_intr = 0;
+ continue;
+ }
+ break;
+ }
+ pcstack[depth++] = (pc_t)pc;
+ fp = nextfp;
+ minfp = fp;
+ }
+ return (depth);
+}
+
+/*
+ * The following ELF header fields are defined as processor-specific
+ * in the System V ABI:
+ *
+ * e_ident[EI_DATA] encoding of the processor-specific
+ * data in the object file
+ * e_machine processor identification
+ * e_flags processor-specific flags associated
+ * with the file
+ */
+
+/*
+ * The value of at_flags reflects a platform's cpu module support.
+ * at_flags is used to check for allowing a binary to execute and
+ * is passed as the value of the AT_FLAGS auxiliary vector.
+ */
+int at_flags = 0;
+
+/*
+ * Check the processor-specific fields of an ELF header.
+ *
+ * returns 1 if the fields are valid, 0 otherwise
+ */
+/*ARGSUSED2*/
+int
+elfheadcheck(
+ unsigned char e_data,
+ Elf32_Half e_machine,
+ Elf32_Word e_flags)
+{
+ if (e_data != ELFDATA2LSB)
+ return (0);
+ if (e_machine == EM_AMD64)
+ return (1);
+ return (e_machine == EM_386);
+}
+
+uint_t auxv_hwcap_include = 0; /* patch to enable unrecognized features */
+uint_t auxv_hwcap_include_2 = 0; /* second word */
+uint_t auxv_hwcap_exclude = 0; /* patch for broken cpus, debugging */
+uint_t auxv_hwcap_exclude_2 = 0; /* second word */
+#if defined(_SYSCALL32_IMPL)
+uint_t auxv_hwcap32_include = 0; /* ditto for 32-bit apps */
+uint_t auxv_hwcap32_include_2 = 0; /* ditto for 32-bit apps */
+uint_t auxv_hwcap32_exclude = 0; /* ditto for 32-bit apps */
+uint_t auxv_hwcap32_exclude_2 = 0; /* ditto for 32-bit apps */
+#endif
+
+/*
+ * Gather information about the processor and place it into auxv_hwcap
+ * so that it can be exported to the linker via the aux vector.
+ *
+ * We use this seemingly complicated mechanism so that we can ensure
+ * that /etc/system can be used to override what the system can or
+ * cannot discover for itself.
+ */
+void
+bind_hwcap(void)
+{
+ uint_t cpu_hwcap_flags[2];
+ cpuid_pass4(NULL, cpu_hwcap_flags);
+
+ auxv_hwcap = (auxv_hwcap_include | cpu_hwcap_flags[0]) &
+ ~auxv_hwcap_exclude;
+ auxv_hwcap_2 = (auxv_hwcap_include_2 | cpu_hwcap_flags[1]) &
+ ~auxv_hwcap_exclude_2;
+
+ /*
+ * On AMD processors, sysenter just doesn't work at all
+ * when the kernel is in long mode. On IA-32e processors
+ * it does, but there's no real point in all the alternate
+ * mechanism when syscall works on both.
+ *
+ * Besides, the kernel's sysenter handler is expecting a
+ * 32-bit lwp ...
+ */
+ auxv_hwcap &= ~AV_386_SEP;
+
+ if (auxv_hwcap_include || auxv_hwcap_exclude || auxv_hwcap_include_2 ||
+ auxv_hwcap_exclude_2) {
+ /*
+ * The below assignment is regrettably required to get lint
+ * to accept the validity of our format string. The format
+ * string is in fact valid, but whatever intelligence in lint
+ * understands the cmn_err()-specific %b appears to have an
+ * off-by-one error: it (mistakenly) complains about bit
+ * number 32 (even though this is explicitly permitted).
+ * Normally, one would will away such warnings with a "LINTED"
+ * directive, but for reasons unclear and unknown, lint
+ * refuses to be assuaged in this case. Fortunately, lint
+ * doesn't pretend to have solved the Halting Problem --
+ * and as soon as the format string is programmatic, it
+ * knows enough to shut up.
+ */
+ char *fmt = "?user ABI extensions: %b\n";
+ cmn_err(CE_CONT, fmt, auxv_hwcap, FMT_AV_386);
+ fmt = "?user ABI extensions (word 2): %b\n";
+ cmn_err(CE_CONT, fmt, auxv_hwcap_2, FMT_AV_386_2);
+ }
+
+#if defined(_SYSCALL32_IMPL)
+ auxv_hwcap32 = (auxv_hwcap32_include | cpu_hwcap_flags[0]) &
+ ~auxv_hwcap32_exclude;
+ auxv_hwcap32_2 = (auxv_hwcap32_include_2 | cpu_hwcap_flags[1]) &
+ ~auxv_hwcap32_exclude_2;
+
+ /*
+ * If this is an amd64 architecture machine from Intel, then
+ * syscall -doesn't- work in compatibility mode, only sysenter does.
+ *
+ * Sigh.
+ */
+ if (!cpuid_syscall32_insn(NULL))
+ auxv_hwcap32 &= ~AV_386_AMD_SYSC;
+
+ /*
+ * 32-bit processes can -always- use the lahf/sahf instructions
+ */
+ auxv_hwcap32 |= AV_386_AHF;
+
+ /*
+ * 32-bit processes can -never- use fsgsbase instructions.
+ */
+ auxv_hwcap32_2 &= ~AV_386_2_FSGSBASE;
+
+ if (auxv_hwcap32_include || auxv_hwcap32_exclude ||
+ auxv_hwcap32_include_2 || auxv_hwcap32_exclude_2) {
+ /*
+ * See the block comment in the cmn_err() of auxv_hwcap, above.
+ */
+ char *fmt = "?32-bit user ABI extensions: %b\n";
+ cmn_err(CE_CONT, fmt, auxv_hwcap32, FMT_AV_386);
+ fmt = "?32-bit user ABI extensions (word 2): %b\n";
+ cmn_err(CE_CONT, fmt, auxv_hwcap32_2, FMT_AV_386_2);
+ }
+#endif
+}
+
+/*
+ * sync_icache() - this is called in proc/fs/prusrio.c. x86 has a
+ * unified cache and therefore this is a nop.
+ */
+/* ARGSUSED */
+void
+sync_icache(caddr_t addr, uint_t len)
+{
+ /* Do nothing for now */
+}
+
+/*ARGSUSED*/
+void
+sync_data_memory(caddr_t va, size_t len)
+{
+ /* Not implemented for this platform */
+}
+
+int
+__ipltospl(int ipl)
+{
+ return (ipltospl(ipl));
+}
+
+/*
+ * The panic code invokes panic_saveregs() to record the contents of a
+ * regs structure into the specified panic_data structure for debuggers.
+ */
+void
+panic_saveregs(panic_data_t *pdp, struct regs *rp)
+{
+ panic_nv_t *pnv = PANICNVGET(pdp);
+
+ struct cregs creg;
+
+ getcregs(&creg);
+
+ PANICNVADD(pnv, "rdi", rp->r_rdi);
+ PANICNVADD(pnv, "rsi", rp->r_rsi);
+ PANICNVADD(pnv, "rdx", rp->r_rdx);
+ PANICNVADD(pnv, "rcx", rp->r_rcx);
+ PANICNVADD(pnv, "r8", rp->r_r8);
+ PANICNVADD(pnv, "r9", rp->r_r9);
+ PANICNVADD(pnv, "rax", rp->r_rax);
+ PANICNVADD(pnv, "rbx", rp->r_rbx);
+ PANICNVADD(pnv, "rbp", rp->r_rbp);
+ PANICNVADD(pnv, "r10", rp->r_r10);
+ PANICNVADD(pnv, "r11", rp->r_r11);
+ PANICNVADD(pnv, "r12", rp->r_r12);
+ PANICNVADD(pnv, "r13", rp->r_r13);
+ PANICNVADD(pnv, "r14", rp->r_r14);
+ PANICNVADD(pnv, "r15", rp->r_r15);
+ PANICNVADD(pnv, "fsbase", rdmsr(MSR_AMD_FSBASE));
+ PANICNVADD(pnv, "gsbase", rdmsr(MSR_AMD_GSBASE));
+ PANICNVADD(pnv, "ds", rp->r_ds);
+ PANICNVADD(pnv, "es", rp->r_es);
+ PANICNVADD(pnv, "fs", rp->r_fs);
+ PANICNVADD(pnv, "gs", rp->r_gs);
+ PANICNVADD(pnv, "trapno", rp->r_trapno);
+ PANICNVADD(pnv, "err", rp->r_err);
+ PANICNVADD(pnv, "rip", rp->r_rip);
+ PANICNVADD(pnv, "cs", rp->r_cs);
+ PANICNVADD(pnv, "rflags", rp->r_rfl);
+ PANICNVADD(pnv, "rsp", rp->r_rsp);
+ PANICNVADD(pnv, "ss", rp->r_ss);
+ PANICNVADD(pnv, "gdt_hi", (uint64_t)(creg.cr_gdt._l[3]));
+ PANICNVADD(pnv, "gdt_lo", (uint64_t)(creg.cr_gdt._l[0]));
+ PANICNVADD(pnv, "idt_hi", (uint64_t)(creg.cr_idt._l[3]));
+ PANICNVADD(pnv, "idt_lo", (uint64_t)(creg.cr_idt._l[0]));
+
+ PANICNVADD(pnv, "ldt", creg.cr_ldt);
+ PANICNVADD(pnv, "task", creg.cr_task);
+ PANICNVADD(pnv, "cr0", creg.cr_cr0);
+ PANICNVADD(pnv, "cr2", creg.cr_cr2);
+ PANICNVADD(pnv, "cr3", creg.cr_cr3);
+ if (creg.cr_cr4)
+ PANICNVADD(pnv, "cr4", creg.cr_cr4);
+
+ PANICNVSET(pdp, pnv);
+}
+
+#define TR_ARG_MAX 6 /* Max args to print, same as SPARC */
+
+
+/*
+ * Print a stack backtrace using the specified frame pointer. We delay two
+ * seconds before continuing, unless this is the panic traceback.
+ * If we are in the process of panicking, we also attempt to write the
+ * stack backtrace to a statically assigned buffer, to allow the panic
+ * code to find it and write it in to uncompressed pages within the
+ * system crash dump.
+ * Note that the frame for the starting stack pointer value is omitted because
+ * the corresponding %eip is not known.
+ */
+
+extern char *dump_stack_scratch;
+
+
+void
+traceback(caddr_t fpreg)
+{
+ struct frame *fp = (struct frame *)fpreg;
+ struct frame *nextfp;
+ uintptr_t pc, nextpc;
+ ulong_t off;
+ char args[TR_ARG_MAX * 2 + 16], *sym;
+ uint_t offset = 0;
+ uint_t next_offset = 0;
+ char stack_buffer[1024];
+
+ if (!panicstr)
+ printf("traceback: %%fp = %p\n", (void *)fp);
+
+ if (panicstr && !dump_stack_scratch) {
+ printf("Warning - stack not written to the dump buffer\n");
+ }
+
+ fp = (struct frame *)plat_traceback(fpreg);
+ if ((uintptr_t)fp < KERNELBASE)
+ goto out;
+
+ pc = fp->fr_savpc;
+ fp = (struct frame *)fp->fr_savfp;
+
+ while ((uintptr_t)fp >= KERNELBASE) {
+ /*
+ * XX64 Until port is complete tolerate 8-byte aligned
+ * frame pointers but flag with a warning so they can
+ * be fixed.
+ */
+ if (((uintptr_t)fp & (STACK_ALIGN - 1)) != 0) {
+ if (((uintptr_t)fp & (8 - 1)) == 0) {
+ printf(" >> warning! 8-byte"
+ " aligned %%fp = %p\n", (void *)fp);
+ } else {
+ printf(
+ " >> mis-aligned %%fp = %p\n", (void *)fp);
+ break;
+ }
+ }
+
+ args[0] = '\0';
+ nextpc = (uintptr_t)fp->fr_savpc;
+ nextfp = (struct frame *)fp->fr_savfp;
+ if ((sym = kobj_getsymname(pc, &off)) != NULL) {
+ printf("%016lx %s:%s+%lx (%s)\n", (uintptr_t)fp,
+ mod_containing_pc((caddr_t)pc), sym, off, args);
+ (void) snprintf(stack_buffer, sizeof (stack_buffer),
+ "%s:%s+%lx (%s) | ",
+ mod_containing_pc((caddr_t)pc), sym, off, args);
+ } else {
+ printf("%016lx %lx (%s)\n",
+ (uintptr_t)fp, pc, args);
+ (void) snprintf(stack_buffer, sizeof (stack_buffer),
+ "%lx (%s) | ", pc, args);
+ }
+
+ if (panicstr && dump_stack_scratch) {
+ next_offset = offset + strlen(stack_buffer);
+ if (next_offset < STACK_BUF_SIZE) {
+ bcopy(stack_buffer, dump_stack_scratch + offset,
+ strlen(stack_buffer));
+ offset = next_offset;
+ } else {
+ /*
+ * In attempting to save the panic stack
+ * to the dumpbuf we have overflowed that area.
+ * Print a warning and continue to printf the
+ * stack to the msgbuf
+ */
+ printf("Warning: stack in the dump buffer"
+ " may be incomplete\n");
+ offset = next_offset;
+ }
+ }
+
+ pc = nextpc;
+ fp = nextfp;
+ }
+out:
+ if (!panicstr) {
+ printf("end of traceback\n");
+ DELAY(2 * MICROSEC);
+ } else if (dump_stack_scratch) {
+ dump_stack_scratch[offset] = '\0';
+ }
+}
+
+
+/*
+ * Generate a stack backtrace from a saved register set.
+ */
+void
+traceregs(struct regs *rp)
+{
+ traceback((caddr_t)rp->r_fp);
+}
+
+void
+exec_set_sp(size_t stksize)
+{
+ klwp_t *lwp = ttolwp(curthread);
+
+ lwptoregs(lwp)->r_sp = (uintptr_t)curproc->p_usrstack - stksize;
+}
+
+hrtime_t
+gethrtime_waitfree(void)
+{
+ return (dtrace_gethrtime());
+}
+
+hrtime_t
+gethrtime(void)
+{
+ return (gethrtimef());
+}
+
+hrtime_t
+gethrtime_unscaled(void)
+{
+ return (gethrtimeunscaledf());
+}
+
+void
+scalehrtime(hrtime_t *hrt)
+{
+ scalehrtimef(hrt);
+}
+
+uint64_t
+unscalehrtime(hrtime_t nsecs)
+{
+ return (unscalehrtimef(nsecs));
+}
+
+void
+gethrestime(timespec_t *tp)
+{
+ gethrestimef(tp);
+}
+
+/*
+ * Part of the implementation of hres_tick(); this routine is
+ * easier in C than assembler .. called with the hres_lock held.
+ *
+ * XX64 Many of these timekeeping variables need to be extern'ed in a header
+ */
+
+#include <sys/time.h>
+#include <sys/machlock.h>
+
+extern int one_sec;
+extern int max_hres_adj;
+
+void
+__adj_hrestime(void)
+{
+ long long adj;
+
+ if (hrestime_adj == 0)
+ adj = 0;
+ else if (hrestime_adj > 0) {
+ if (hrestime_adj < max_hres_adj)
+ adj = hrestime_adj;
+ else
+ adj = max_hres_adj;
+ } else {
+ if (hrestime_adj < -max_hres_adj)
+ adj = -max_hres_adj;
+ else
+ adj = hrestime_adj;
+ }
+
+ timedelta -= adj;
+ hrestime_adj = timedelta;
+ hrestime.tv_nsec += adj;
+
+ while (hrestime.tv_nsec >= NANOSEC) {
+ one_sec++;
+ hrestime.tv_sec++;
+ hrestime.tv_nsec -= NANOSEC;
+ }
+}
+
+/*
+ * Wrapper functions to maintain backwards compatibility
+ */
+int
+xcopyin(const void *uaddr, void *kaddr, size_t count)
+{
+ return (xcopyin_nta(uaddr, kaddr, count, UIO_COPY_CACHED));
+}
+
+int
+xcopyout(const void *kaddr, void *uaddr, size_t count)
+{
+ return (xcopyout_nta(kaddr, uaddr, count, UIO_COPY_CACHED));
+}
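
The least obvious part of archdep.c above is the x87 tag-word handling: fnsave keeps two tag bits per register (3 meaning empty), while fxsave keeps a single "not empty" bit per register, which is why fnsave_to_fxsave() compresses the tag word and fxsave_to_fnsave() has to re-derive it from the register contents relative to the FPU top-of-stack. A minimal standalone sketch of the compression direction, in plain C with a made-up tag value rather than the illumos structures:

/*
 * Standalone sketch (not illumos code): compress a 16-bit x87 tag word
 * into the 8-bit fxsave-style tag byte, mirroring the loop in
 * fnsave_to_fxsave().  A bit is set for every register whose two-bit
 * tag is not 3 (empty).
 */
#include <stdio.h>
#include <stdint.h>

static uint8_t
compress_tagword(uint16_t ftw)
{
	uint8_t fctw = 0;
	int i;

	for (i = 0; i < 8; i++, ftw >>= 2) {
		if ((ftw & 3) != 3)
			fctw |= (uint8_t)(1 << i);
	}
	return (fctw);
}

int
main(void)
{
	/* hypothetical tag word: st0 valid, st1 zero, st2 special, rest empty */
	uint16_t ftw = 0xffe4;

	printf("0x%04x -> 0x%02x\n", ftw, compress_tagword(ftw));	/* 0x07 */
	return (0);
}

The expansion direction cannot simply be inverted, which is why the kernel code above inspects the sign and exponent bits of each physical register to decide between the valid, zero and special tags.
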
diff --git a/usr/src/uts/intel/os/bootdev.c b/usr/src/uts/intel/os/bootdev.c
new file mode 100644
index 0000000000..02f31efd56
--- /dev/null
+++ b/usr/src/uts/intel/os/bootdev.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+
+/* internal global data */
+static struct modlmisc modlmisc = {
+ &mod_miscops, "bootdev misc module"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, (void *)&modlmisc, NULL
+};
+
+int
+_init()
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini()
+{
+ return (mod_remove(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Convert a prom device path to an equivalent path in /devices.
+ * Does not deal with aliases. Does deal with pathnames which
+ * are not fully qualified. This routine is generalized
+ * to work across several flavors of OBP.
+ */
+int
+i_promname_to_devname(char *prom_name, char *ret_buf)
+{
+ if (prom_name == NULL || ret_buf == NULL ||
+ (strlen(prom_name) >= MAXPATHLEN)) {
+ return (EINVAL);
+ }
+ if (i_ddi_prompath_to_devfspath(prom_name, ret_buf) != DDI_SUCCESS)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * If bootstring contains a device path, we need to convert to a format
+ * the prom will understand. To do so, we convert the existing path to
+ * a prom-compatible path and return the value of new_path. If the
+ * caller specifies new_path as NULL, we allocate an appropriately
+ * sized new_path on behalf of the caller. If the caller invokes this
+ * function with new_path = NULL, they must do so from a context in
+ * which it is safe to perform a sleeping memory allocation.
+ *
+ * NOTE: Intel does not have a real PROM, so the implementation
+ * simply returns a copy of the string passed in.
+ */
+char *
+i_convert_boot_device_name(char *cur_path, char *new_path, size_t *len)
+{
+ if (new_path != NULL) {
+ (void) snprintf(new_path, *len, "%s", cur_path);
+ return (new_path);
+ } else {
+ *len = strlen(cur_path) + 1;
+ new_path = kmem_alloc(*len, KM_SLEEP);
+ (void) snprintf(new_path, *len, "%s", cur_path);
+ return (new_path);
+ }
+}
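
i_convert_boot_device_name() above supports two calling conventions: the caller either supplies a buffer (with its size in *len) or passes new_path == NULL and receives a kmem_alloc()ed copy whose size is written back through *len. A userland sketch of the same contract, with malloc() standing in for kmem_alloc(KM_SLEEP) and a made-up device path:

/*
 * Userland sketch (not illumos code) of the i_convert_boot_device_name()
 * calling conventions: caller-supplied buffer vs. allocation on behalf
 * of the caller, with the allocated length returned through *len.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
convert_path(const char *cur_path, char *new_path, size_t *len)
{
	if (new_path != NULL) {
		(void) snprintf(new_path, *len, "%s", cur_path);
		return (new_path);
	}
	*len = strlen(cur_path) + 1;
	new_path = malloc(*len);	/* kmem_alloc(*len, KM_SLEEP) in the kernel */
	if (new_path != NULL)
		(void) snprintf(new_path, *len, "%s", cur_path);
	return (new_path);
}

int
main(void)
{
	const char *boot = "/pci@0,0/disk@0";	/* hypothetical boot path */
	char buf[64];
	size_t len = sizeof (buf);
	char *p;

	printf("%s\n", convert_path(boot, buf, &len));
	p = convert_path(boot, NULL, &len);
	printf("%s (len %zu)\n", p, len);
	free(p);
	return (0);
}

On x86 the conversion is an identity copy because there is no real PROM; the interface exists so common code can share a code path with SPARC.
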
diff --git a/usr/src/uts/intel/os/comm_page_util.c b/usr/src/uts/intel/os/comm_page_util.c
new file mode 100644
index 0000000000..f286bee7f6
--- /dev/null
+++ b/usr/src/uts/intel/os/comm_page_util.c
@@ -0,0 +1,62 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+
+#include <sys/types.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <vm/as.h>
+#include <vm/seg_umap.h>
+
+#if !defined(__xpv)
+#include <sys/comm_page.h>
+#endif /* !defined(__xpv) */
+
+/*
+ * Map in the comm page.
+ *
+ * The contents of the comm page are only defined on non-xpv x86 at this time.
+ * Furthermore, the data is only valid in userspace (32-bit or 64-bit) when
+ * mapped from a 64-bit kernel.
+ * See: "uts/i86pc/sys/comm_page.h"
+ */
+caddr_t
+comm_page_mapin()
+{
+#if !defined(__xpv)
+ proc_t *p = curproc;
+ caddr_t addr = NULL;
+ size_t len = COMM_PAGE_SIZE;
+ uint_t prot = PROT_USER | PROT_READ;
+ segumap_crargs_t suarg;
+
+ map_addr(&addr, len, (offset_t)0, 1, 0);
+ if (addr == NULL || valid_usr_range(addr, len, prot, p->p_as,
+ p->p_as->a_userlimit) != RANGE_OKAY) {
+ return (NULL);
+ }
+
+ suarg.kaddr = (caddr_t)&comm_page;
+ suarg.prot = suarg.maxprot = prot;
+ if (as_map(p->p_as, addr, len, segumap_create, &suarg) != 0) {
+ return (NULL);
+ }
+ return (addr);
+#else /* !defined(__xpv) */
+ return (NULL);
+#endif /* !defined(__xpv) */
+}
diff --git a/usr/src/uts/intel/os/copy_subr.c b/usr/src/uts/intel/os/copy_subr.c
new file mode 100644
index 0000000000..b69f052e68
--- /dev/null
+++ b/usr/src/uts/intel/os/copy_subr.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Miscellaneous C routines for copying data around without
+ * descending into assembler. Compilers are pretty good at
+ * scheduling instructions, and humans are pretty hopeless at
+ * writing correct assembler.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+
+/*
+ * copyinstr_noerr and copyoutstr_noerr can be implemented completely
+ * in C on machines with shared user and kernel context.
+ */
+static int
+copystr_nofault(const char *src, char *dst, size_t maxlength,
+ size_t *lencopied)
+{
+ int error = 0;
+ size_t leftover;
+
+ if ((leftover = maxlength) == 0)
+ error = ENAMETOOLONG;
+ else
+ do {
+ leftover--;
+ if ((*dst++ = *src++) == '\0')
+ break;
+ if (leftover == 0) {
+ error = ENAMETOOLONG;
+ break;
+ }
+ /*CONSTCOND*/
+ } while (1);
+
+ if (lencopied)
+ *lencopied = maxlength - leftover;
+ return (error);
+}
+
+
+int
+copyinstr_noerr(const char *uaddr, char *kaddr, size_t maxlength,
+ size_t *lencopied)
+{
+ char *ua = (char *)uaddr;
+
+ ASSERT((uintptr_t)kaddr > kernelbase);
+
+ if ((uintptr_t)ua > kernelbase) {
+ /*
+ * force fault at kernelbase
+ */
+ ua = (char *)kernelbase;
+ }
+ return (copystr_nofault(ua, kaddr, maxlength, lencopied));
+}
+
+int
+copyoutstr_noerr(const char *kaddr, char *uaddr, size_t maxlength,
+ size_t *lencopied)
+{
+ char *ua = (char *)uaddr;
+
+ ASSERT((uintptr_t)kaddr > kernelbase);
+
+ if ((uintptr_t)ua > kernelbase) {
+ /*
+ * force fault at kernelbase
+ */
+ ua = (char *)kernelbase;
+ }
+ return (copystr_nofault(kaddr, ua, maxlength, lencopied));
+}
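
copystr_nofault() above implements the usual bounded string-copy contract: copy at most maxlength bytes including the terminating NUL, return ENAMETOOLONG when the source does not fit (including the maxlength == 0 case), and report the number of bytes actually copied through lencopied. A quick userland sketch of that contract (not the kernel routine itself):

/*
 * Userland sketch (not illumos code) of the copystr_nofault() contract:
 * bounded copy including the NUL, ENAMETOOLONG on overflow, bytes
 * copied reported through *lencopied.
 */
#include <stdio.h>
#include <errno.h>
#include <stddef.h>

static int
bounded_copystr(const char *src, char *dst, size_t maxlength, size_t *lencopied)
{
	size_t leftover = maxlength;
	int error = 0;

	if (leftover == 0) {
		error = ENAMETOOLONG;
	} else {
		for (;;) {
			leftover--;
			if ((*dst++ = *src++) == '\0')
				break;
			if (leftover == 0) {
				error = ENAMETOOLONG;
				break;
			}
		}
	}
	if (lencopied != NULL)
		*lencopied = maxlength - leftover;
	return (error);
}

int
main(void)
{
	char buf[8];
	size_t n;

	/* fits: returns 0 and copies 7 bytes ("passwd" plus the NUL) */
	printf("%d %zu\n", bounded_copystr("passwd", buf, sizeof (buf), &n), n);
	/* does not fit: returns ENAMETOOLONG with 8 bytes copied, no NUL */
	printf("%d %zu\n", bounded_copystr("kernelbase", buf, sizeof (buf), &n), n);
	return (0);
}

The _noerr wrappers then only have to clamp the user address at kernelbase, so a stray kernel-space pointer faults predictably instead of being copied.
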
diff --git a/usr/src/uts/intel/os/cpc_subr.c b/usr/src/uts/intel/os/cpc_subr.c
new file mode 100644
index 0000000000..71e1ebaeee
--- /dev/null
+++ b/usr/src/uts/intel/os/cpc_subr.c
@@ -0,0 +1,274 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2021 Joyent, Inc.
+ */
+
+/*
+ * x86-specific routines used by the CPU Performance counter driver.
+ */
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/regset.h>
+#include <sys/privregs.h>
+#include <sys/x86_archext.h>
+#include <sys/cpuvar.h>
+#include <sys/machcpuvar.h>
+#include <sys/archsystm.h>
+#include <sys/cpc_pcbe.h>
+#include <sys/cpc_impl.h>
+#include <sys/x_call.h>
+#include <sys/cmn_err.h>
+#include <sys/cmt.h>
+#include <sys/spl.h>
+#include <sys/apic.h>
+
+static const uint64_t allstopped = 0;
+static kcpc_ctx_t *(*overflow_intr_handler)(caddr_t);
+
+/* Do threads share performance monitoring hardware? */
+static int strands_perfmon_shared = 0;
+
+int kcpc_hw_overflow_intr_installed; /* set by APIC code */
+extern kcpc_ctx_t *kcpc_overflow_intr(caddr_t arg, uint64_t bitmap);
+
+extern int kcpc_counts_include_idle; /* Project Private /etc/system variable */
+
+void (*kcpc_hw_enable_cpc_intr)(void); /* set by APIC code */
+
+int
+kcpc_hw_add_ovf_intr(kcpc_ctx_t *(*handler)(caddr_t))
+{
+ if (x86_type != X86_TYPE_P6)
+ return (0);
+ overflow_intr_handler = handler;
+ return (ipltospl(APIC_PCINT_IPL));
+}
+
+void
+kcpc_hw_rem_ovf_intr(void)
+{
+ overflow_intr_handler = NULL;
+}
+
+/*
+ * Hook used on P4 systems to catch online/offline events.
+ */
+/*ARGSUSED*/
+static int
+kcpc_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
+{
+ pg_cmt_t *chip_pg;
+ int active_cpus_cnt;
+
+ if (what != CPU_ON)
+ return (0);
+
+ /*
+ * If any CPU-bound contexts exist, we don't need to invalidate
+ * anything, as no per-LWP contexts can coexist.
+ */
+ if (kcpc_cpuctx || dtrace_cpc_in_use)
+ return (0);
+
+ /*
+ * If this chip now has more than 1 active cpu, we must invalidate all
+ * contexts in the system.
+ */
+ chip_pg = (pg_cmt_t *)pghw_find_pg(cpu[cpuid], PGHW_CHIP);
+ if (chip_pg != NULL) {
+ active_cpus_cnt = GROUP_SIZE(&chip_pg->cmt_cpus_actv);
+ if (active_cpus_cnt > 1)
+ kcpc_invalidate_all();
+ }
+
+ return (0);
+}
+
+static kmutex_t cpu_setup_lock; /* protects setup_registered */
+static int setup_registered;
+
+
+void
+kcpc_hw_init(cpu_t *cp)
+{
+ kthread_t *t = cp->cpu_idle_thread;
+ uint32_t versionid;
+ struct cpuid_regs cpuid;
+
+ strands_perfmon_shared = 0;
+ if (is_x86_feature(x86_featureset, X86FSET_HTT)) {
+ if (cpuid_getvendor(cpu[0]) == X86_VENDOR_Intel) {
+ /*
+ * Intel processors that support Architectural
+ * Performance Monitoring Version 3 have per strand
+ * performance monitoring hardware.
+ * Hence we can allow use of performance counters on
+ * multiple strands on the same core simultaneously.
+ */
+ cpuid.cp_eax = 0x0;
+ (void) __cpuid_insn(&cpuid);
+ if (cpuid.cp_eax < 0xa) {
+ strands_perfmon_shared = 1;
+ } else {
+ cpuid.cp_eax = 0xa;
+ (void) __cpuid_insn(&cpuid);
+
+ versionid = cpuid.cp_eax & 0xFF;
+ if (versionid < 3) {
+ strands_perfmon_shared = 1;
+ }
+ }
+ } else if (cpuid_getvendor(cpu[0]) == X86_VENDOR_AMD ||
+ cpuid_getvendor(cpu[0]) == X86_VENDOR_HYGON) {
+ /*
+ * On AMD systems with HT, all of the performance
+ * monitors exist on a per-logical CPU basis.
+ */
+ strands_perfmon_shared = 0;
+ } else {
+ strands_perfmon_shared = 1;
+ }
+ }
+
+ if (strands_perfmon_shared) {
+ mutex_enter(&cpu_setup_lock);
+ if (setup_registered == 0) {
+ mutex_enter(&cpu_lock);
+ register_cpu_setup_func(kcpc_cpu_setup, NULL);
+ mutex_exit(&cpu_lock);
+ setup_registered = 1;
+ }
+ mutex_exit(&cpu_setup_lock);
+ }
+
+ mutex_init(&cp->cpu_cpc_ctxlock, "cpu_cpc_ctxlock", MUTEX_DEFAULT, 0);
+
+ if (kcpc_counts_include_idle)
+ return;
+
+ installctx(t, cp, kcpc_idle_save, kcpc_idle_restore,
+ NULL, NULL, NULL, NULL, NULL);
+}
+
+void
+kcpc_hw_fini(cpu_t *cp)
+{
+ ASSERT(cp->cpu_idle_thread == NULL);
+
+ mutex_destroy(&cp->cpu_cpc_ctxlock);
+}
+
+#define BITS(v, u, l) \
+ (((v) >> (l)) & ((1 << (1 + (u) - (l))) - 1))
+
+#define PCBE_NAMELEN 30 /* Enough Room for pcbe.manuf.model.family.stepping */
+
+/*
+ * Examine the processor and load an appropriate PCBE.
+ */
+int
+kcpc_hw_load_pcbe(void)
+{
+ return (kcpc_pcbe_tryload(cpuid_getvendorstr(CPU), cpuid_getfamily(CPU),
+ cpuid_getmodel(CPU), cpuid_getstep(CPU)));
+}
+
+/*
+ * Called by the generic framework to check if it's OK to bind a set to a CPU.
+ */
+int
+kcpc_hw_cpu_hook(processorid_t cpuid, ulong_t *kcpc_cpumap)
+{
+ cpu_t *cpu, *p;
+ pg_t *chip_pg;
+ pg_cpu_itr_t itr;
+
+ if (!strands_perfmon_shared)
+ return (0);
+
+ /*
+ * Only one logical CPU on each Pentium 4 HT CPU may be bound to at
+ * once.
+ *
+ * This loop is protected by holding cpu_lock, in order to properly
+ * access the cpu_t of the desired cpu.
+ */
+ mutex_enter(&cpu_lock);
+ if ((cpu = cpu_get(cpuid)) == NULL) {
+ mutex_exit(&cpu_lock);
+ return (-1);
+ }
+
+ chip_pg = (pg_t *)pghw_find_pg(cpu, PGHW_CHIP);
+
+ PG_CPU_ITR_INIT(chip_pg, itr);
+ while ((p = pg_cpu_next(&itr)) != NULL) {
+ if (p == cpu)
+ continue;
+ if (BT_TEST(kcpc_cpumap, p->cpu_id)) {
+ mutex_exit(&cpu_lock);
+ return (-1);
+ }
+ }
+
+ mutex_exit(&cpu_lock);
+ return (0);
+}
+
+/*
+ * Called by the generic framework to check if it's OK to bind a set to an LWP.
+ */
+int
+kcpc_hw_lwp_hook(void)
+{
+ pg_cmt_t *chip;
+ group_t *chips;
+ group_iter_t i;
+
+ if (!strands_perfmon_shared)
+ return (0);
+
+ /*
+ * Only one CPU per chip may be online.
+ */
+ mutex_enter(&cpu_lock);
+
+ chips = pghw_set_lookup(PGHW_CHIP);
+ if (chips == NULL) {
+ mutex_exit(&cpu_lock);
+ return (0);
+ }
+
+ group_iter_init(&i);
+ while ((chip = group_iterate(chips, &i)) != NULL) {
+ if (GROUP_SIZE(&chip->cmt_cpus_actv) > 1) {
+ mutex_exit(&cpu_lock);
+ return (-1);
+ }
+ }
+
+ mutex_exit(&cpu_lock);
+ return (0);
+}
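
The Intel branch of kcpc_hw_init() above decides whether HT strands share performance-counter hardware by checking the architectural performance-monitoring version in CPUID leaf 0xA: counters are treated as shared unless the version is at least 3 (or the leaf is absent). A userland sketch of the same probe, assuming an x86 build with GCC or clang and using the compiler's <cpuid.h> helper in place of the kernel's __cpuid_insn():

/*
 * Userland sketch (not illumos code): read the architectural perfmon
 * version from CPUID leaf 0xA the way kcpc_hw_init() does.
 * __get_cpuid() returns 0 when the leaf is unsupported, matching the
 * kernel's "cp_eax < 0xa" fallback.
 */
#include <stdio.h>
#include <cpuid.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int versionid = 0;
	int shared;

	if (__get_cpuid(0xa, &eax, &ebx, &ecx, &edx) == 0) {
		shared = 1;			/* no leaf 0xA: assume shared */
	} else {
		versionid = eax & 0xff;
		shared = (versionid < 3);
	}
	printf("perfmon version %u, strands share counters: %s\n",
	    versionid, shared ? "yes" : "no");
	return (0);
}

As in the kernel code, this check only applies to Intel parts; AMD and Hygon processors provide per-strand counters regardless, and the CPU-setup hook machinery above only matters when strands_perfmon_shared ends up set.
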
diff --git a/usr/src/uts/intel/os/ddi_i86.c b/usr/src/uts/intel/os/ddi_i86.c
new file mode 100644
index 0000000000..f135d0673c
--- /dev/null
+++ b/usr/src/uts/intel/os/ddi_i86.c
@@ -0,0 +1,1903 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ */
+
+#include <sys/conf.h>
+#include <sys/kmem.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ddifm.h>
+#include <sys/fm/io/ddi.h>
+#include <sys/fm/protocol.h>
+#include <sys/ontrap.h>
+
+
+/*
+ * DDI DMA Engine functions for x86.
+ * These functions are more naturally generic, but do not apply to SPARC.
+ */
+
+int
+ddi_dmae_alloc(dev_info_t *dip, int chnl, int (*dmae_waitfp)(), caddr_t arg)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_ACQUIRE,
+ (off_t *)dmae_waitfp, (size_t *)arg,
+ (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+int
+ddi_dmae_release(dev_info_t *dip, int chnl)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_FREE, 0, 0,
+ (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+int
+ddi_dmae_getattr(dev_info_t *dip, ddi_dma_attr_t *attrp)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_GETATTR, 0, 0,
+ (caddr_t *)attrp, 0));
+}
+
+int
+ddi_dmae_1stparty(dev_info_t *dip, int chnl)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_1STPTY, 0, 0,
+ (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+int
+ddi_dmae_prog(dev_info_t *dip, struct ddi_dmae_req *dmaereqp,
+ ddi_dma_cookie_t *cookiep, int chnl)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_PROG, (off_t *)dmaereqp,
+ (size_t *)cookiep, (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+int
+ddi_dmae_swsetup(dev_info_t *dip, struct ddi_dmae_req *dmaereqp,
+ ddi_dma_cookie_t *cookiep, int chnl)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_SWSETUP, (off_t *)dmaereqp,
+ (size_t *)cookiep, (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+int
+ddi_dmae_swstart(dev_info_t *dip, int chnl)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_SWSTART, 0, 0,
+ (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+int
+ddi_dmae_stop(dev_info_t *dip, int chnl)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_STOP, 0, 0,
+ (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+int
+ddi_dmae_enable(dev_info_t *dip, int chnl)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_ENABLE, 0, 0,
+ (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+int
+ddi_dmae_disable(dev_info_t *dip, int chnl)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_DISABLE, 0, 0,
+ (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+int
+ddi_dmae_getcnt(dev_info_t *dip, int chnl, int *countp)
+{
+ return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_GETCNT, 0, (size_t *)countp,
+ (caddr_t *)(uintptr_t)chnl, 0));
+}
+
+/*
+ * implementation specific access handle and routines:
+ */
+
+static uintptr_t impl_acc_hdl_id = 0;
+
+/*
+ * access handle allocator
+ */
+ddi_acc_hdl_t *
+impl_acc_hdl_get(ddi_acc_handle_t hdl)
+{
+ /*
+	 * Recast to ddi_acc_hdl_t instead of casting to ddi_acc_impl_t
+	 * and then returning ah_platform_private.
+	 *
+	 * This optimization relies on ddi_acc_hdl_t being the first
+	 * member of ddi_acc_impl_t.
+ */
+ return ((ddi_acc_hdl_t *)hdl);
+}
+
+ddi_acc_handle_t
+impl_acc_hdl_alloc(int (*waitfp)(caddr_t), caddr_t arg)
+{
+ ddi_acc_impl_t *hp;
+ on_trap_data_t *otp;
+ int sleepflag;
+
+ sleepflag = ((waitfp == (int (*)())KM_SLEEP) ? KM_SLEEP : KM_NOSLEEP);
+ /*
+ * Allocate and initialize the data access handle and error status.
+ */
+ if ((hp = kmem_zalloc(sizeof (ddi_acc_impl_t), sleepflag)) == NULL)
+ goto fail;
+ if ((hp->ahi_err = (ndi_err_t *)kmem_zalloc(
+ sizeof (ndi_err_t), sleepflag)) == NULL) {
+ kmem_free(hp, sizeof (ddi_acc_impl_t));
+ goto fail;
+ }
+ if ((otp = (on_trap_data_t *)kmem_zalloc(
+ sizeof (on_trap_data_t), sleepflag)) == NULL) {
+ kmem_free(hp->ahi_err, sizeof (ndi_err_t));
+ kmem_free(hp, sizeof (ddi_acc_impl_t));
+ goto fail;
+ }
+ hp->ahi_err->err_ontrap = otp;
+ hp->ahi_common.ah_platform_private = (void *)hp;
+
+ return ((ddi_acc_handle_t)hp);
+fail:
+ if ((waitfp != (int (*)())KM_SLEEP) &&
+ (waitfp != (int (*)())KM_NOSLEEP))
+ ddi_set_callback(waitfp, arg, &impl_acc_hdl_id);
+ return (NULL);
+}
+
+void
+impl_acc_hdl_free(ddi_acc_handle_t handle)
+{
+ ddi_acc_impl_t *hp;
+
+ /*
+ * The supplied (ddi_acc_handle_t) is actually a (ddi_acc_impl_t *),
+ * because that's what we allocated in impl_acc_hdl_alloc() above.
+ */
+ hp = (ddi_acc_impl_t *)handle;
+ if (hp) {
+ kmem_free(hp->ahi_err->err_ontrap, sizeof (on_trap_data_t));
+ kmem_free(hp->ahi_err, sizeof (ndi_err_t));
+ kmem_free(hp, sizeof (ddi_acc_impl_t));
+ if (impl_acc_hdl_id)
+ ddi_run_callback(&impl_acc_hdl_id);
+ }
+}
+
+/*
+ * Function used to check if a given access handle owns the failing address.
+ * Called by ndi_fmc_error, when we detect a PIO error.
+ */
+/* ARGSUSED */
+static int
+impl_acc_check(dev_info_t *dip, const void *handle, const void *addr,
+ const void *not_used)
+{
+ pfn_t pfn, fault_pfn;
+ ddi_acc_hdl_t *hp;
+
+ hp = impl_acc_hdl_get((ddi_acc_handle_t)handle);
+
+ ASSERT(hp);
+
+ if (addr != NULL) {
+ pfn = hp->ah_pfn;
+ fault_pfn = mmu_btop(*(uint64_t *)addr);
+ if (fault_pfn >= pfn && fault_pfn < (pfn + hp->ah_pnum))
+ return (DDI_FM_NONFATAL);
+ }
+ return (DDI_FM_UNKNOWN);
+}
+
+void
+impl_acc_err_init(ddi_acc_hdl_t *handlep)
+{
+ int fmcap;
+ ndi_err_t *errp;
+ on_trap_data_t *otp;
+ ddi_acc_impl_t *hp = (ddi_acc_impl_t *)handlep;
+
+ fmcap = ddi_fm_capable(handlep->ah_dip);
+
+ if (handlep->ah_acc.devacc_attr_version < DDI_DEVICE_ATTR_V1 ||
+ !DDI_FM_ACC_ERR_CAP(fmcap)) {
+ handlep->ah_acc.devacc_attr_access = DDI_DEFAULT_ACC;
+ } else if (handlep->ah_acc.devacc_attr_access == DDI_FLAGERR_ACC &&
+ hp->ahi_scan == NULL) {
+ handlep->ah_acc.devacc_attr_access = DDI_DEFAULT_ACC;
+ } else if (DDI_FM_ACC_ERR_CAP(fmcap)) {
+ if (handlep->ah_acc.devacc_attr_access == DDI_DEFAULT_ACC) {
+ if (handlep->ah_xfermodes)
+ return;
+ i_ddi_drv_ereport_post(handlep->ah_dip, DVR_EFMCAP,
+ NULL, DDI_NOSLEEP);
+ } else {
+ errp = hp->ahi_err;
+ otp = (on_trap_data_t *)errp->err_ontrap;
+ otp->ot_handle = (void *)(hp);
+ otp->ot_prot = OT_DATA_ACCESS;
+ errp->err_status = DDI_FM_OK;
+ errp->err_expected = DDI_FM_ERR_UNEXPECTED;
+ errp->err_cf = impl_acc_check;
+ }
+ }
+}
+
+/* ARGSUSED */
+int
+impl_dma_check(dev_info_t *dip, const void *handle, const void *pci_hdl,
+ const void *not_used)
+{
+ return (DDI_FM_UNKNOWN);
+}
+
+void
+impl_acc_hdl_init(ddi_acc_hdl_t *handlep)
+{
+ ddi_acc_impl_t *hp;
+ int fmcap;
+ int devacc_attr_access;
+
+ if (!handlep)
+ return;
+ fmcap = ddi_fm_capable(handlep->ah_dip);
+ if (handlep->ah_acc.devacc_attr_version < DDI_DEVICE_ATTR_V1 ||
+ !DDI_FM_ACC_ERR_CAP(fmcap))
+ devacc_attr_access = DDI_DEFAULT_ACC;
+ else
+ devacc_attr_access = handlep->ah_acc.devacc_attr_access;
+
+ hp = (ddi_acc_impl_t *)handlep->ah_platform_private;
+
+ /*
+ * Can only do FLAGERR if scan callback is set up. This should
+ * also guarantee that the peekpoke_mutex and err_mutex are defined.
+ */
+ if (devacc_attr_access == DDI_FLAGERR_ACC && hp->ahi_scan == NULL)
+ devacc_attr_access = DDI_DEFAULT_ACC;
+
+ switch (devacc_attr_access) {
+ case DDI_CAUTIOUS_ACC:
+ hp->ahi_get8 = i_ddi_caut_get8;
+ hp->ahi_put8 = i_ddi_caut_put8;
+ hp->ahi_rep_get8 = i_ddi_caut_rep_get8;
+ hp->ahi_rep_put8 = i_ddi_caut_rep_put8;
+ hp->ahi_get16 = i_ddi_caut_get16;
+ hp->ahi_get32 = i_ddi_caut_get32;
+ hp->ahi_put16 = i_ddi_caut_put16;
+ hp->ahi_put32 = i_ddi_caut_put32;
+ hp->ahi_rep_get16 = i_ddi_caut_rep_get16;
+ hp->ahi_rep_get32 = i_ddi_caut_rep_get32;
+ hp->ahi_rep_put16 = i_ddi_caut_rep_put16;
+ hp->ahi_rep_put32 = i_ddi_caut_rep_put32;
+ hp->ahi_get64 = i_ddi_caut_get64;
+ hp->ahi_put64 = i_ddi_caut_put64;
+ hp->ahi_rep_get64 = i_ddi_caut_rep_get64;
+ hp->ahi_rep_put64 = i_ddi_caut_rep_put64;
+ break;
+ case DDI_FLAGERR_ACC:
+ if (hp->ahi_acc_attr & DDI_ACCATTR_IO_SPACE) {
+ hp->ahi_get8 = i_ddi_prot_io_get8;
+ hp->ahi_put8 = i_ddi_prot_io_put8;
+ hp->ahi_rep_get8 = i_ddi_prot_io_rep_get8;
+ hp->ahi_rep_put8 = i_ddi_prot_io_rep_put8;
+
+ /* temporarily set these 64-bit functions to no-ops */
+ hp->ahi_get64 = i_ddi_io_get64;
+ hp->ahi_put64 = i_ddi_io_put64;
+ hp->ahi_rep_get64 = i_ddi_io_rep_get64;
+ hp->ahi_rep_put64 = i_ddi_io_rep_put64;
+
+ /*
+ * check for BIG endian access
+ */
+ if (handlep->ah_acc.devacc_attr_endian_flags ==
+ DDI_STRUCTURE_BE_ACC) {
+ hp->ahi_get16 = i_ddi_prot_io_swap_get16;
+ hp->ahi_get32 = i_ddi_prot_io_swap_get32;
+ hp->ahi_put16 = i_ddi_prot_io_swap_put16;
+ hp->ahi_put32 = i_ddi_prot_io_swap_put32;
+ hp->ahi_rep_get16 =
+ i_ddi_prot_io_swap_rep_get16;
+ hp->ahi_rep_get32 =
+ i_ddi_prot_io_swap_rep_get32;
+ hp->ahi_rep_put16 =
+ i_ddi_prot_io_swap_rep_put16;
+ hp->ahi_rep_put32 =
+ i_ddi_prot_io_swap_rep_put32;
+ } else {
+ hp->ahi_acc_attr |= DDI_ACCATTR_DIRECT;
+ hp->ahi_get16 = i_ddi_prot_io_get16;
+ hp->ahi_get32 = i_ddi_prot_io_get32;
+ hp->ahi_put16 = i_ddi_prot_io_put16;
+ hp->ahi_put32 = i_ddi_prot_io_put32;
+ hp->ahi_rep_get16 = i_ddi_prot_io_rep_get16;
+ hp->ahi_rep_get32 = i_ddi_prot_io_rep_get32;
+ hp->ahi_rep_put16 = i_ddi_prot_io_rep_put16;
+ hp->ahi_rep_put32 = i_ddi_prot_io_rep_put32;
+ }
+
+ } else if (hp->ahi_acc_attr & DDI_ACCATTR_CPU_VADDR) {
+
+ hp->ahi_get8 = i_ddi_prot_vaddr_get8;
+ hp->ahi_put8 = i_ddi_prot_vaddr_put8;
+ hp->ahi_rep_get8 = i_ddi_prot_vaddr_rep_get8;
+ hp->ahi_rep_put8 = i_ddi_prot_vaddr_rep_put8;
+
+ /*
+ * check for BIG endian access
+ */
+ if (handlep->ah_acc.devacc_attr_endian_flags ==
+ DDI_STRUCTURE_BE_ACC) {
+
+ hp->ahi_get16 = i_ddi_prot_vaddr_swap_get16;
+ hp->ahi_get32 = i_ddi_prot_vaddr_swap_get32;
+ hp->ahi_get64 = i_ddi_prot_vaddr_swap_get64;
+ hp->ahi_put16 = i_ddi_prot_vaddr_swap_put16;
+ hp->ahi_put32 = i_ddi_prot_vaddr_swap_put32;
+ hp->ahi_put64 = i_ddi_prot_vaddr_swap_put64;
+ hp->ahi_rep_get16 =
+ i_ddi_prot_vaddr_swap_rep_get16;
+ hp->ahi_rep_get32 =
+ i_ddi_prot_vaddr_swap_rep_get32;
+ hp->ahi_rep_get64 =
+ i_ddi_prot_vaddr_swap_rep_get64;
+ hp->ahi_rep_put16 =
+ i_ddi_prot_vaddr_swap_rep_put16;
+ hp->ahi_rep_put32 =
+ i_ddi_prot_vaddr_swap_rep_put32;
+ hp->ahi_rep_put64 =
+ i_ddi_prot_vaddr_swap_rep_put64;
+ } else {
+ hp->ahi_acc_attr |= DDI_ACCATTR_DIRECT;
+ hp->ahi_get16 = i_ddi_prot_vaddr_get16;
+ hp->ahi_get32 = i_ddi_prot_vaddr_get32;
+ hp->ahi_get64 = i_ddi_prot_vaddr_get64;
+ hp->ahi_put16 = i_ddi_prot_vaddr_put16;
+ hp->ahi_put32 = i_ddi_prot_vaddr_put32;
+ hp->ahi_put64 = i_ddi_prot_vaddr_put64;
+ hp->ahi_rep_get16 = i_ddi_prot_vaddr_rep_get16;
+ hp->ahi_rep_get32 = i_ddi_prot_vaddr_rep_get32;
+ hp->ahi_rep_get64 = i_ddi_prot_vaddr_rep_get64;
+ hp->ahi_rep_put16 = i_ddi_prot_vaddr_rep_put16;
+ hp->ahi_rep_put32 = i_ddi_prot_vaddr_rep_put32;
+ hp->ahi_rep_put64 = i_ddi_prot_vaddr_rep_put64;
+ }
+ }
+ break;
+ case DDI_DEFAULT_ACC:
+ if (hp->ahi_acc_attr & DDI_ACCATTR_IO_SPACE) {
+ hp->ahi_get8 = i_ddi_io_get8;
+ hp->ahi_put8 = i_ddi_io_put8;
+ hp->ahi_rep_get8 = i_ddi_io_rep_get8;
+ hp->ahi_rep_put8 = i_ddi_io_rep_put8;
+
+ /* temporarily set these 64-bit functions to no-ops */
+ hp->ahi_get64 = i_ddi_io_get64;
+ hp->ahi_put64 = i_ddi_io_put64;
+ hp->ahi_rep_get64 = i_ddi_io_rep_get64;
+ hp->ahi_rep_put64 = i_ddi_io_rep_put64;
+
+ /*
+ * check for BIG endian access
+ */
+ if (handlep->ah_acc.devacc_attr_endian_flags ==
+ DDI_STRUCTURE_BE_ACC) {
+ hp->ahi_get16 = i_ddi_io_swap_get16;
+ hp->ahi_get32 = i_ddi_io_swap_get32;
+ hp->ahi_put16 = i_ddi_io_swap_put16;
+ hp->ahi_put32 = i_ddi_io_swap_put32;
+ hp->ahi_rep_get16 = i_ddi_io_swap_rep_get16;
+ hp->ahi_rep_get32 = i_ddi_io_swap_rep_get32;
+ hp->ahi_rep_put16 = i_ddi_io_swap_rep_put16;
+ hp->ahi_rep_put32 = i_ddi_io_swap_rep_put32;
+ } else {
+ hp->ahi_acc_attr |= DDI_ACCATTR_DIRECT;
+ hp->ahi_get16 = i_ddi_io_get16;
+ hp->ahi_get32 = i_ddi_io_get32;
+ hp->ahi_put16 = i_ddi_io_put16;
+ hp->ahi_put32 = i_ddi_io_put32;
+ hp->ahi_rep_get16 = i_ddi_io_rep_get16;
+ hp->ahi_rep_get32 = i_ddi_io_rep_get32;
+ hp->ahi_rep_put16 = i_ddi_io_rep_put16;
+ hp->ahi_rep_put32 = i_ddi_io_rep_put32;
+ }
+
+ } else if (hp->ahi_acc_attr & DDI_ACCATTR_CPU_VADDR) {
+
+ hp->ahi_get8 = i_ddi_vaddr_get8;
+ hp->ahi_put8 = i_ddi_vaddr_put8;
+ hp->ahi_rep_get8 = i_ddi_vaddr_rep_get8;
+ hp->ahi_rep_put8 = i_ddi_vaddr_rep_put8;
+
+ /*
+ * check for BIG endian access
+ */
+ if (handlep->ah_acc.devacc_attr_endian_flags ==
+ DDI_STRUCTURE_BE_ACC) {
+
+ hp->ahi_get16 = i_ddi_vaddr_swap_get16;
+ hp->ahi_get32 = i_ddi_vaddr_swap_get32;
+ hp->ahi_get64 = i_ddi_vaddr_swap_get64;
+ hp->ahi_put16 = i_ddi_vaddr_swap_put16;
+ hp->ahi_put32 = i_ddi_vaddr_swap_put32;
+ hp->ahi_put64 = i_ddi_vaddr_swap_put64;
+ hp->ahi_rep_get16 = i_ddi_vaddr_swap_rep_get16;
+ hp->ahi_rep_get32 = i_ddi_vaddr_swap_rep_get32;
+ hp->ahi_rep_get64 = i_ddi_vaddr_swap_rep_get64;
+ hp->ahi_rep_put16 = i_ddi_vaddr_swap_rep_put16;
+ hp->ahi_rep_put32 = i_ddi_vaddr_swap_rep_put32;
+ hp->ahi_rep_put64 = i_ddi_vaddr_swap_rep_put64;
+ } else {
+ hp->ahi_acc_attr |= DDI_ACCATTR_DIRECT;
+ hp->ahi_get16 = i_ddi_vaddr_get16;
+ hp->ahi_get32 = i_ddi_vaddr_get32;
+ hp->ahi_get64 = i_ddi_vaddr_get64;
+ hp->ahi_put16 = i_ddi_vaddr_put16;
+ hp->ahi_put32 = i_ddi_vaddr_put32;
+ hp->ahi_put64 = i_ddi_vaddr_put64;
+ hp->ahi_rep_get16 = i_ddi_vaddr_rep_get16;
+ hp->ahi_rep_get32 = i_ddi_vaddr_rep_get32;
+ hp->ahi_rep_get64 = i_ddi_vaddr_rep_get64;
+ hp->ahi_rep_put16 = i_ddi_vaddr_rep_put16;
+ hp->ahi_rep_put32 = i_ddi_vaddr_rep_put32;
+ hp->ahi_rep_put64 = i_ddi_vaddr_rep_put64;
+ }
+ }
+ break;
+ }
+ hp->ahi_fault_check = i_ddi_acc_fault_check;
+ hp->ahi_fault_notify = i_ddi_acc_fault_notify;
+ hp->ahi_fault = 0;
+ impl_acc_err_init(handlep);
+}
+
+/*
+ * The following are low-level routines for data access.
+ *
+ * All of these routines should be implemented in assembly. Those
+ * that have been rewritten can be found in ~ml/ddi_i86_asm.s
+ */
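+
+/*
+ * None of these helpers is called directly by drivers.  impl_acc_hdl_init()
+ * above installs them into the ahi_get8/ahi_put8/ahi_rep_get8 (and wider)
+ * function pointers of the ddi_acc_impl_t, and the public DDI access
+ * routines simply indirect through the handle, roughly:
+ *
+ *	uint8_t
+ *	ddi_get8(ddi_acc_handle_t handle, uint8_t *addr)
+ *	{
+ *		ddi_acc_impl_t *hdlp = (ddi_acc_impl_t *)handle;
+ *
+ *		return (hdlp->ahi_get8(hdlp, addr));
+ *	}
+ *
+ * ddi_io_rep_get8() later in this file dispatches the same way.
+ */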
+
+/*ARGSUSED*/
+uint16_t
+i_ddi_vaddr_swap_get16(ddi_acc_impl_t *hdlp, uint16_t *addr)
+{
+ return (ddi_swap16(*addr));
+}
+
+/*ARGSUSED*/
+uint16_t
+i_ddi_io_swap_get16(ddi_acc_impl_t *hdlp, uint16_t *addr)
+{
+ return (ddi_swap16(inw((uintptr_t)addr)));
+}
+
+/*ARGSUSED*/
+uint32_t
+i_ddi_vaddr_swap_get32(ddi_acc_impl_t *hdlp, uint32_t *addr)
+{
+ return (ddi_swap32(*addr));
+}
+
+/*ARGSUSED*/
+uint32_t
+i_ddi_io_swap_get32(ddi_acc_impl_t *hdlp, uint32_t *addr)
+{
+ return (ddi_swap32(inl((uintptr_t)addr)));
+}
+
+/*ARGSUSED*/
+uint64_t
+i_ddi_vaddr_swap_get64(ddi_acc_impl_t *hdlp, uint64_t *addr)
+{
+ return (ddi_swap64(*addr));
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_swap_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value)
+{
+ *addr = ddi_swap16(value);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_io_swap_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value)
+{
+ outw((uintptr_t)addr, ddi_swap16(value));
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_swap_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, uint32_t value)
+{
+ *addr = ddi_swap32(value);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_io_swap_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, uint32_t value)
+{
+ outl((uintptr_t)addr, ddi_swap32(value));
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_swap_put64(ddi_acc_impl_t *hdlp, uint64_t *addr, uint64_t value)
+{
+ *addr = ddi_swap64(value);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_rep_get8(ddi_acc_impl_t *hdlp, uint8_t *host_addr,
+ uint8_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint8_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *h++ = *d++;
+ else
+ for (; repcount; repcount--)
+ *h++ = *d;
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *h++ = *d++;
+ else
+ for (; repcount; repcount--)
+ *h++ = *d;
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_swap_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *h++ = ddi_swap16(*d++);
+ else
+ for (; repcount; repcount--)
+ *h++ = ddi_swap16(*d);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_io_swap_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--, port += 2)
+ *h++ = ddi_swap16(inw(port));
+ else
+ for (; repcount; repcount--)
+ *h++ = ddi_swap16(inw(port));
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *h++ = *d++;
+ else
+ for (; repcount; repcount--)
+ *h++ = *d;
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_swap_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *h++ = ddi_swap32(*d++);
+ else
+ for (; repcount; repcount--)
+ *h++ = ddi_swap32(*d);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_io_swap_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--, port += 4)
+ *h++ = ddi_swap32(inl(port));
+ else
+ for (; repcount; repcount--)
+ *h++ = ddi_swap32(inl(port));
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint64_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *h++ = *d++;
+ else
+ for (; repcount; repcount--)
+ *h++ = *d;
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_swap_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint64_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *h++ = ddi_swap64(*d++);
+ else
+ for (; repcount; repcount--)
+ *h++ = ddi_swap64(*d);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_rep_put8(ddi_acc_impl_t *hdlp, uint8_t *host_addr,
+ uint8_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint8_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = *h++;
+ else
+ for (; repcount; repcount--)
+ *d = *h++;
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = *h++;
+ else
+ for (; repcount; repcount--)
+ *d = *h++;
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_swap_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = ddi_swap16(*h++);
+ else
+ for (; repcount; repcount--)
+ *d = ddi_swap16(*h++);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_io_swap_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--, port += 2)
+ outw(port, ddi_swap16(*h++));
+ else
+ for (; repcount; repcount--)
+ outw(port, ddi_swap16(*h++));
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = *h++;
+ else
+ for (; repcount; repcount--)
+ *d = *h++;
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_swap_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = ddi_swap32(*h++);
+ else
+ for (; repcount; repcount--)
+ *d = ddi_swap32(*h++);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_io_swap_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--, port += 4)
+ outl(port, ddi_swap32(*h++));
+ else
+ for (; repcount; repcount--)
+ outl(port, ddi_swap32(*h++));
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint64_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = *h++;
+ else
+ for (; repcount; repcount--)
+ *d = *h++;
+}
+
+/*ARGSUSED*/
+void
+i_ddi_vaddr_swap_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint64_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = ddi_swap64(*h++);
+ else
+ for (; repcount; repcount--)
+ *d = ddi_swap64(*h++);
+}
+
+/*ARGSUSED*/
+uint64_t
+i_ddi_io_get64(ddi_acc_impl_t *hdlp, uint64_t *addr)
+{
+ panic("ddi_get64 from i/o space");
+ /*NOTREACHED*/
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_io_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, uint64_t value)
+{
+ panic("ddi_put64 to i/o space");
+ /*NOTREACHED*/
+}
+
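+/*
+ * Called by the protected (DDI_FLAGERR_ACC) access routines below whenever a
+ * read returns all-ones, which may indicate a faulted or missing device.
+ * Build a ddi_fm_error_t, invoke the bus's registered scan callback
+ * (ahi_scan) under the handle's error mutex, and if the scan reports a
+ * problem, record it in the handle's ndi_err_t for the FMA error-handling
+ * path.
+ */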
+void
+do_scan(ddi_acc_impl_t *hdlp)
+{
+ ddi_fm_error_t de;
+ ndi_err_t *errp = (ndi_err_t *)hdlp->ahi_err;
+
+ bzero(&de, sizeof (ddi_fm_error_t));
+ de.fme_version = DDI_FME_VERSION;
+ de.fme_ena = fm_ena_generate(0, FM_ENA_FMT1);
+ de.fme_flag = DDI_FM_ERR_UNEXPECTED;
+
+ mutex_enter(hdlp->ahi_err_mutexp);
+ hdlp->ahi_scan(hdlp->ahi_scan_dip, &de);
+ if (de.fme_status != DDI_FM_OK) {
+ errp->err_ena = de.fme_ena;
+ errp->err_expected = de.fme_flag;
+ errp->err_status = DDI_FM_NONFATAL;
+ }
+ mutex_exit(hdlp->ahi_err_mutexp);
+}
+
+/*ARGSUSED*/
+uint8_t
+i_ddi_prot_vaddr_get8(ddi_acc_impl_t *hdlp, uint8_t *addr)
+{
+ uint8_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = *addr;
+ if (val == 0xff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint16_t
+i_ddi_prot_vaddr_get16(ddi_acc_impl_t *hdlp, uint16_t *addr)
+{
+ uint16_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = *addr;
+ if (val == 0xffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint32_t
+i_ddi_prot_vaddr_get32(ddi_acc_impl_t *hdlp, uint32_t *addr)
+{
+ uint32_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = *addr;
+ if (val == 0xffffffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint64_t
+i_ddi_prot_vaddr_get64(ddi_acc_impl_t *hdlp, uint64_t *addr)
+{
+ uint64_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = *addr;
+ if (val == 0xffffffffffffffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint8_t
+i_ddi_prot_io_get8(ddi_acc_impl_t *hdlp, uint8_t *addr)
+{
+ uint8_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = inb((uintptr_t)addr);
+ if (val == 0xff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint16_t
+i_ddi_prot_io_get16(ddi_acc_impl_t *hdlp, uint16_t *addr)
+{
+ uint16_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = inw((uintptr_t)addr);
+ if (val == 0xffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint32_t
+i_ddi_prot_io_get32(ddi_acc_impl_t *hdlp, uint32_t *addr)
+{
+ uint32_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = inl((uintptr_t)addr);
+ if (val == 0xffffffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint16_t
+i_ddi_prot_vaddr_swap_get16(ddi_acc_impl_t *hdlp, uint16_t *addr)
+{
+ uint16_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = ddi_swap16(*addr);
+ if (val == 0xffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint16_t
+i_ddi_prot_io_swap_get16(ddi_acc_impl_t *hdlp, uint16_t *addr)
+{
+ uint16_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = ddi_swap16(inw((uintptr_t)addr));
+ if (val == 0xffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint32_t
+i_ddi_prot_vaddr_swap_get32(ddi_acc_impl_t *hdlp, uint32_t *addr)
+{
+ uint32_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = ddi_swap32(*addr);
+ if (val == 0xffffffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint32_t
+i_ddi_prot_io_swap_get32(ddi_acc_impl_t *hdlp, uint32_t *addr)
+{
+ uint32_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = ddi_swap32(inl((uintptr_t)addr));
+ if (val == 0xffffffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+uint64_t
+i_ddi_prot_vaddr_swap_get64(ddi_acc_impl_t *hdlp, uint64_t *addr)
+{
+ uint64_t val;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ val = ddi_swap64(*addr);
+ if (val == 0xffffffffffffffff)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+
+ return (val);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_put8(ddi_acc_impl_t *hdlp, uint8_t *addr, uint8_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ *addr = value;
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_put8(ddi_acc_impl_t *hdlp, uint8_t *addr, uint8_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ outb((uintptr_t)addr, value);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ *addr = value;
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ outw((uintptr_t)addr, value);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_put32(ddi_acc_impl_t *hdlp, uint32_t *addr,
+ uint32_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ *addr = value;
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, uint32_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ outl((uintptr_t)addr, value);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_put64(ddi_acc_impl_t *hdlp, uint64_t *addr,
+ uint64_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ *addr = value;
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_swap_put16(ddi_acc_impl_t *hdlp, uint16_t *addr,
+ uint16_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ *addr = ddi_swap16(value);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_swap_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ outw((uintptr_t)addr, ddi_swap16(value));
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_swap_put32(ddi_acc_impl_t *hdlp, uint32_t *addr,
+ uint32_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ *addr = ddi_swap32(value);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_swap_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, uint32_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ outl((uintptr_t)addr, ddi_swap32(value));
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_swap_put64(ddi_acc_impl_t *hdlp, uint64_t *addr,
+ uint64_t value)
+{
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ *addr = ddi_swap64(value);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_rep_get8(ddi_acc_impl_t *hdlp, uint8_t *host_addr,
+ uint8_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint8_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--, port++)
+ if ((*h++ = inb(port)) == 0xff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = inb(port)) == 0xff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint16_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--, port += 2)
+ if ((*h++ = inw(port)) == 0xffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = inw(port)) == 0xffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint32_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--, port += 4)
+ if ((*h++ = inl(port)) == 0xffffffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = inl(port)) == 0xffffffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_rep_get8(ddi_acc_impl_t *hdlp, uint8_t *host_addr,
+ uint8_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint8_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--)
+ if ((*h++ = *d++) == 0xff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = *d) == 0xff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint16_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--)
+ if ((*h++ = *d++) == 0xffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = *d) == 0xffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_swap_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint16_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--)
+ if ((*h++ = ddi_swap16(*d++)) == 0xffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = ddi_swap16(*d)) == 0xffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_swap_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint16_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--, port += 2)
+ if ((*h++ = ddi_swap16(inw(port))) == 0xffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = ddi_swap16(inw(port))) == 0xffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint32_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--)
+ if ((*h++ = *d++) == 0xffffffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = *d) == 0xffffffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_swap_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint32_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--)
+ if ((*h++ = ddi_swap32(*d++)) == 0xffffffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = ddi_swap32(*d)) == 0xffffffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_swap_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint32_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--, port += 4)
+ if ((*h++ = ddi_swap32(inl(port))) == 0xffffffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = ddi_swap32(inl(port))) == 0xffffffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint64_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--)
+ if ((*h++ = *d++) == 0xffffffffffffffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = *d) == 0xffffffffffffffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_swap_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ int fail = 0;
+ uint64_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR) {
+ for (; repcount; repcount--)
+ if ((*h++ = ddi_swap64(*d++)) == 0xffffffffffffffff)
+ fail = 1;
+ } else {
+ for (; repcount; repcount--)
+ if ((*h++ = ddi_swap64(*d)) == 0xffffffffffffffff)
+ fail = 1;
+ }
+ if (fail == 1)
+ do_scan(hdlp);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_rep_put8(ddi_acc_impl_t *hdlp, uint8_t *host_addr,
+ uint8_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint8_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = *h++;
+ else
+ for (; repcount; repcount--)
+ *d = *h++;
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_rep_put8(ddi_acc_impl_t *hdlp, uint8_t *host_addr,
+ uint8_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint8_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--, port++)
+ outb(port, *h++);
+ else
+ for (; repcount; repcount--)
+ outb(port, *h++);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = *h++;
+ else
+ for (; repcount; repcount--)
+ *d = *h++;
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--, port += 2)
+ outw(port, *h++);
+ else
+ for (; repcount; repcount--)
+ outw(port, *h++);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_swap_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = ddi_swap16(*h++);
+ else
+ for (; repcount; repcount--)
+ *d = ddi_swap16(*h++);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_swap_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr,
+ uint16_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint16_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--, port += 2)
+ outw(port, ddi_swap16(*h++));
+ else
+ for (; repcount; repcount--)
+ outw(port, ddi_swap16(*h++));
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = *h++;
+ else
+ for (; repcount; repcount--)
+ *d = *h++;
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--, port += 4)
+ outl(port, *h++);
+ else
+ for (; repcount; repcount--)
+ outl(port, *h++);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_swap_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = ddi_swap32(*h++);
+ else
+ for (; repcount; repcount--)
+ *d = ddi_swap32(*h++);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_io_swap_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr,
+ uint32_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint32_t *h;
+ uintptr_t port;
+
+ h = host_addr;
+ port = (uintptr_t)dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--, port += 4)
+ outl(port, ddi_swap32(*h++));
+ else
+ for (; repcount; repcount--)
+ outl(port, ddi_swap32(*h++));
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint64_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = *h++;
+ else
+ for (; repcount; repcount--)
+ *d = *h++;
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_prot_vaddr_swap_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ uint64_t *h, *d;
+
+ h = host_addr;
+ d = dev_addr;
+
+ mutex_enter(hdlp->ahi_peekpoke_mutexp);
+ if (flags == DDI_DEV_AUTOINCR)
+ for (; repcount; repcount--)
+ *d++ = ddi_swap64(*h++);
+ else
+ for (; repcount; repcount--)
+ *d = ddi_swap64(*h++);
+ mutex_exit(hdlp->ahi_peekpoke_mutexp);
+}
+
+void
+ddi_io_rep_get8(ddi_acc_handle_t handle,
+ uint8_t *host_addr, uint8_t *dev_addr, size_t repcount)
+{
+ (((ddi_acc_impl_t *)handle)->ahi_rep_get8)
+ ((ddi_acc_impl_t *)handle, host_addr, dev_addr,
+ repcount, DDI_DEV_NO_AUTOINCR);
+}
+
+void
+ddi_io_rep_get16(ddi_acc_handle_t handle,
+ uint16_t *host_addr, uint16_t *dev_addr, size_t repcount)
+{
+ (((ddi_acc_impl_t *)handle)->ahi_rep_get16)
+ ((ddi_acc_impl_t *)handle, host_addr, dev_addr,
+ repcount, DDI_DEV_NO_AUTOINCR);
+}
+
+void
+ddi_io_rep_get32(ddi_acc_handle_t handle,
+ uint32_t *host_addr, uint32_t *dev_addr, size_t repcount)
+{
+ (((ddi_acc_impl_t *)handle)->ahi_rep_get32)
+ ((ddi_acc_impl_t *)handle, host_addr, dev_addr,
+ repcount, DDI_DEV_NO_AUTOINCR);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_io_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ cmn_err(CE_PANIC, "ddi_rep_get64 from i/o space");
+}
+
+void
+ddi_io_rep_put8(ddi_acc_handle_t handle,
+ uint8_t *host_addr, uint8_t *dev_addr, size_t repcount)
+{
+ (((ddi_acc_impl_t *)handle)->ahi_rep_put8)
+ ((ddi_acc_impl_t *)handle, host_addr, dev_addr,
+ repcount, DDI_DEV_NO_AUTOINCR);
+}
+
+void
+ddi_io_rep_put16(ddi_acc_handle_t handle,
+ uint16_t *host_addr, uint16_t *dev_addr, size_t repcount)
+{
+ (((ddi_acc_impl_t *)handle)->ahi_rep_put16)
+ ((ddi_acc_impl_t *)handle, host_addr, dev_addr,
+ repcount, DDI_DEV_NO_AUTOINCR);
+}
+
+void
+ddi_io_rep_put32(ddi_acc_handle_t handle,
+ uint32_t *host_addr, uint32_t *dev_addr, size_t repcount)
+{
+ (((ddi_acc_impl_t *)handle)->ahi_rep_put32)
+ ((ddi_acc_impl_t *)handle, host_addr, dev_addr,
+ repcount, DDI_DEV_NO_AUTOINCR);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_io_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr,
+ uint64_t *dev_addr, size_t repcount, uint_t flags)
+{
+ cmn_err(CE_PANIC, "ddi_rep_put64 to i/o space");
+}
+
+/*
+ * These next two functions could be translated into assembler someday
+ */
+int
+ddi_check_acc_handle(ddi_acc_handle_t handle)
+{
+ ddi_acc_impl_t *hdlp = (ddi_acc_impl_t *)handle;
+ return (((*hdlp->ahi_fault_check)(hdlp) == DDI_SUCCESS) ? DDI_SUCCESS :
+ DDI_FAILURE);
+}
+
+int
+i_ddi_acc_fault_check(ddi_acc_impl_t *hdlp)
+{
+ /* Default version, just returns flag value */
+ return (hdlp->ahi_fault);
+}
+
+/*ARGSUSED*/
+void
+i_ddi_acc_fault_notify(ddi_acc_impl_t *hdlp)
+{
+ /* Default version, does nothing for now */
+}
+
+void
+i_ddi_acc_set_fault(ddi_acc_handle_t handle)
+{
+ ddi_acc_impl_t *hdlp = (ddi_acc_impl_t *)handle;
+
+ if (!hdlp->ahi_fault) {
+ hdlp->ahi_fault = 1;
+ (*hdlp->ahi_fault_notify)(hdlp);
+ }
+}
+
+void
+i_ddi_acc_clr_fault(ddi_acc_handle_t handle)
+{
+ ddi_acc_impl_t *hdlp = (ddi_acc_impl_t *)handle;
+
+ if (hdlp->ahi_fault) {
+ hdlp->ahi_fault = 0;
+ (*hdlp->ahi_fault_notify)(hdlp);
+ }
+}
diff --git a/usr/src/uts/intel/os/desctbls.c b/usr/src/uts/intel/os/desctbls.c
new file mode 100644
index 0000000000..35345c3fe8
--- /dev/null
+++ b/usr/src/uts/intel/os/desctbls.c
@@ -0,0 +1,1218 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 1992 Terrence R. Lambert.
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/tss.h>
+#include <sys/segments.h>
+#include <sys/trap.h>
+#include <sys/cpuvar.h>
+#include <sys/bootconf.h>
+#include <sys/x86_archext.h>
+#include <sys/controlregs.h>
+#include <sys/archsystm.h>
+#include <sys/machsystm.h>
+#include <sys/kobj.h>
+#include <sys/cmn_err.h>
+#include <sys/reboot.h>
+#include <sys/kdi.h>
+#include <sys/mach_mmu.h>
+#include <sys/systm.h>
+#include <sys/note.h>
+
+#ifdef __xpv
+#include <sys/hypervisor.h>
+#include <vm/as.h>
+#endif
+
+#include <sys/promif.h>
+#include <sys/bootinfo.h>
+#include <vm/kboot_mmu.h>
+#include <vm/hat_pte.h>
+
+/*
+ * cpu0 and default tables and structures.
+ */
+user_desc_t *gdt0;
+#if !defined(__xpv)
+desctbr_t gdt0_default_r;
+#endif
+
+gate_desc_t *idt0; /* interrupt descriptor table */
+
+tss_t *ktss0; /* kernel task state structure */
+
+
+user_desc_t zero_udesc; /* base zero user desc native procs */
+user_desc_t null_udesc; /* null user descriptor */
+system_desc_t null_sdesc; /* null system descriptor */
+
+user_desc_t zero_u32desc; /* 32-bit compatibility procs */
+
+user_desc_t ucs_on;
+user_desc_t ucs_off;
+user_desc_t ucs32_on;
+user_desc_t ucs32_off;
+
+/*
+ * If the size of this is changed, you must update hat_pcp_setup() and the
+ * definitions in exception.s
+ */
+extern char dblfault_stack0[DEFAULTSTKSZ];
+extern char nmi_stack0[DEFAULTSTKSZ];
+extern char mce_stack0[DEFAULTSTKSZ];
+
+extern void fast_null(void);
+extern hrtime_t get_hrtime(void);
+extern hrtime_t gethrvtime(void);
+extern hrtime_t get_hrestime(void);
+extern uint64_t getlgrp(void);
+
+void (*(fasttable[]))(void) = {
+ fast_null, /* T_FNULL routine */
+ fast_null, /* T_FGETFP routine (initially null) */
+ fast_null, /* T_FSETFP routine (initially null) */
+ (void (*)())(uintptr_t)get_hrtime, /* T_GETHRTIME */
+ (void (*)())(uintptr_t)gethrvtime, /* T_GETHRVTIME */
+ (void (*)())(uintptr_t)get_hrestime, /* T_GETHRESTIME */
+ (void (*)())(uintptr_t)getlgrp /* T_GETLGRP */
+};
+
+/*
+ * Structure containing pre-computed descriptors to allow us to temporarily
+ * interpose on a standard handler.
+ */
+struct interposing_handler {
+ int ih_inum;
+ gate_desc_t ih_interp_desc;
+ gate_desc_t ih_default_desc;
+};
+
+/*
+ * The brand infrastructure interposes on two handlers, and we use one as a
+ * NULL signpost.
+ */
+static struct interposing_handler brand_tbl[2];
+
+/*
+ * software prototypes for default local descriptor table
+ */
+
+/*
+ * Routines for loading segment descriptors in a format the hardware
+ * can understand.
+ */
+
+/*
+ * In long mode we have the new L or long mode attribute bit
+ * for code segments. Only the conforming bit in type is used along
+ * with the descriptor privilege level (DPL) and present bits. Default
+ * operand size must
+ * be zero when in long mode. In 32-bit compatibility mode all fields
+ * are treated as in legacy mode. For data segments while in long mode
+ * only the present bit is loaded.
+ */
+void
+set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
+ uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
+{
+ ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);
+ /* This should never be a "system" segment. */
+ ASSERT3U(type & SDT_S, !=, 0);
+
+ /*
+ * 64-bit long mode.
+ */
+ if (lmode == SDP_LONG)
+ dp->usd_def32 = 0; /* 32-bit operands only */
+ else
+ /*
+ * 32-bit compatibility mode.
+ */
+ dp->usd_def32 = defopsz; /* 0 = 16, 1 = 32-bit ops */
+
+ /*
+ * We should always set the "accessed" bit (SDT_A), otherwise the CPU
+ * will write to the GDT whenever we change segment registers around.
+ * With KPTI on, the GDT is read-only in the user page table, which
+ * causes crashes if we don't set this.
+ */
+ ASSERT3U(type & SDT_A, !=, 0);
+
+ dp->usd_long = lmode; /* 64-bit mode */
+ dp->usd_type = type;
+ dp->usd_dpl = dpl;
+ dp->usd_p = 1;
+ dp->usd_gran = gran; /* 0 = bytes, 1 = pages */
+
+ dp->usd_lobase = (uintptr_t)base;
+ dp->usd_midbase = (uintptr_t)base >> 16;
+ dp->usd_hibase = (uintptr_t)base >> (16 + 8);
+ dp->usd_lolimit = size;
+ dp->usd_hilimit = (uintptr_t)size >> 16;
+}
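+
+/*
+ * For example, init_gdt_common() below builds the 64-bit and 32-bit user
+ * code segments as:
+ *
+ *	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
+ *	    SDP_PAGES, SDP_OP32);
+ *	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
+ *	    SEL_UPL, SDP_PAGES, SDP_OP32);
+ *
+ * Note that the defopsz (SDP_OP32) argument is ignored in the SDP_LONG case,
+ * as described above.
+ */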
+
+/*
+ * Install system segment descriptor for LDT and TSS segments.
+ */
+
+void
+set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
+ uint_t dpl)
+{
+ dp->ssd_lolimit = size;
+ dp->ssd_hilimit = (uintptr_t)size >> 16;
+
+ dp->ssd_lobase = (uintptr_t)base;
+ dp->ssd_midbase = (uintptr_t)base >> 16;
+ dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
+ dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);
+
+ dp->ssd_type = type;
+ dp->ssd_zero1 = 0; /* must be zero */
+ dp->ssd_zero2 = 0;
+ dp->ssd_dpl = dpl;
+ dp->ssd_p = 1;
+ dp->ssd_gran = 0; /* force byte units */
+}
+
+void *
+get_ssd_base(system_desc_t *dp)
+{
+ uintptr_t base;
+
+ base = (uintptr_t)dp->ssd_lobase |
+ (uintptr_t)dp->ssd_midbase << 16 |
+ (uintptr_t)dp->ssd_hibase << (16 + 8) |
+ (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
+ return ((void *)base);
+}
+
+/*
+ * Install gate segment descriptor for interrupt, trap, call and task gates.
+ *
+ * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
+ * all interrupts. We have different ISTs for each class of exceptions that are
+ * most likely to occur while handling an existing exception; while many of
+ * these are just going to panic, it's nice not to trample on the existing
+ * exception state for debugging purposes.
+ *
+ * Normal interrupts are all redirected unconditionally to the KPTI trampoline
+ * stack space. This unifies the trampoline handling between user and kernel
+ * space (and avoids the need to touch %gs).
+ *
+ * The KDI IDT entries *all* use the DBG IST: consider single-stepping
+ * tr_pftrap, when a read from KMDB causes another #PF. Without its own IST,
+ * this would stomp on the kernel's mcpu_kpti_flt frame.
+ */
+uint_t
+idt_vector_to_ist(uint_t vector)
+{
+#if defined(__xpv)
+ _NOTE(ARGUNUSED(vector));
+ return (IST_NONE);
+#else
+ switch (vector) {
+ /* These should always use IST even without KPTI enabled. */
+ case T_DBLFLT:
+ return (IST_DF);
+ case T_NMIFLT:
+ return (IST_NMI);
+ case T_MCE:
+ return (IST_MCE);
+
+ case T_BPTFLT:
+ case T_SGLSTP:
+ if (kpti_enable == 1) {
+ return (IST_DBG);
+ }
+ return (IST_NONE);
+ case T_STKFLT:
+ case T_GPFLT:
+ case T_PGFLT:
+ if (kpti_enable == 1) {
+ return (IST_NESTABLE);
+ }
+ return (IST_NONE);
+ default:
+ if (kpti_enable == 1) {
+ return (IST_DEFAULT);
+ }
+ return (IST_NONE);
+ }
+#endif
+}
+
+void
+set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
+ uint_t type, uint_t dpl, uint_t ist)
+{
+ dp->sgd_looffset = (uintptr_t)func;
+ dp->sgd_hioffset = (uintptr_t)func >> 16;
+ dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
+ dp->sgd_selector = (uint16_t)sel;
+ dp->sgd_ist = ist;
+ dp->sgd_type = type;
+ dp->sgd_dpl = dpl;
+ dp->sgd_p = 1;
+}
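+
+/*
+ * For example, init_idt_common() below installs the page-fault handler as an
+ * interrupt gate in the kernel code segment, selecting the KPTI trampoline
+ * variant when KPTI is enabled and the IST computed by idt_vector_to_ist():
+ *
+ *	set_gatesegd(&idt[T_PGFLT],
+ *	    (kpti_enable == 1) ? &tr_pftrap : &pftrap,
+ *	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
+ */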
+
+/*
+ * Updates a single user descriptor in the GDT of the current cpu.
+ * Caller is responsible for preventing cpu migration.
+ */
+
+void
+gdt_update_usegd(uint_t sidx, user_desc_t *udp)
+{
+#if defined(DEBUG)
+ /* This should never be a "system" segment, but it might be null. */
+ if (udp->usd_p != 0 || udp->usd_type != 0) {
+ ASSERT3U(udp->usd_type & SDT_S, !=, 0);
+ }
+ /*
+ * We should always set the "accessed" bit (SDT_A), otherwise the CPU
+ * will write to the GDT whenever we change segment registers around.
+ * With KPTI on, the GDT is read-only in the user page table, which
+ * causes crashes if we don't set this.
+ */
+ if (udp->usd_p != 0 || udp->usd_type != 0) {
+ ASSERT3U(udp->usd_type & SDT_A, !=, 0);
+ }
+#endif
+
+#if defined(__xpv)
+ uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx;
+
+ if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp))
+ panic("gdt_update_usegd: HYPERVISOR_update_descriptor");
+
+#else /* __xpv */
+ CPU->cpu_gdt[sidx] = *udp;
+#endif /* __xpv */
+}
+
+/*
+ * Writes the single descriptor pointed to by udp into the process's
+ * LDT entry pointed to by ldp.
+ */
+int
+ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
+{
+#if defined(DEBUG)
+ /* This should never be a "system" segment, but it might be null. */
+ if (udp->usd_p != 0 || udp->usd_type != 0) {
+ ASSERT3U(udp->usd_type & SDT_S, !=, 0);
+ }
+ /*
+ * We should always set the "accessed" bit (SDT_A), otherwise the CPU
+ * will write to the LDT whenever we change segment registers around.
+ * With KPTI on, the LDT is read-only in the user page table, which
+ * causes crashes if we don't set this.
+ */
+ if (udp->usd_p != 0 || udp->usd_type != 0) {
+ ASSERT3U(udp->usd_type & SDT_A, !=, 0);
+ }
+#endif
+
+#if defined(__xpv)
+ uint64_t dpa;
+
+ dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) |
+ ((uintptr_t)ldp & PAGEOFFSET);
+
+ /*
+ * The hypervisor is a little more restrictive about what it
+ * supports in the LDT.
+ */
+ if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0)
+ return (EINVAL);
+
+#else /* __xpv */
+ *ldp = *udp;
+
+#endif /* __xpv */
+ return (0);
+}
+
+#if defined(__xpv)
+
+/*
+ * Converts a hw format gate descriptor into pseudo-IDT format for the hypervisor.
+ * Returns true if a valid entry was written.
+ */
+int
+xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg)
+{
+ trap_info_t *ti = ti_arg; /* XXPV Aargh - segments.h comment */
+
+ /*
+ * skip holes in the IDT
+ */
+ if (GATESEG_GETOFFSET(sgd) == 0)
+ return (0);
+
+ ASSERT(sgd->sgd_type == SDT_SYSIGT);
+ ti->vector = vec;
+ TI_SET_DPL(ti, sgd->sgd_dpl);
+
+ /*
+ * Is this an interrupt gate?
+ */
+ if (sgd->sgd_type == SDT_SYSIGT) {
+ /* LINTED */
+ TI_SET_IF(ti, 1);
+ }
+ ti->cs = sgd->sgd_selector;
+ ti->cs |= SEL_KPL; /* force into ring 3. see KCS_SEL */
+ ti->address = GATESEG_GETOFFSET(sgd);
+ return (1);
+}
+
+/*
+ * Convert a single hw format gate descriptor and write it into our virtual IDT.
+ */
+void
+xen_idt_write(gate_desc_t *sgd, uint_t vec)
+{
+ trap_info_t trapinfo[2];
+
+ bzero(trapinfo, sizeof (trapinfo));
+ if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0)
+ return;
+ if (xen_set_trap_table(trapinfo) != 0)
+ panic("xen_idt_write: xen_set_trap_table() failed");
+}
+
+#endif /* __xpv */
+
+
+/*
+ * Build kernel GDT.
+ */
+
+static void
+init_gdt_common(user_desc_t *gdt)
+{
+ int i;
+
+ /*
+ * 64-bit kernel code segment.
+ */
+ set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
+ SDP_PAGES, SDP_OP32);
+
+ /*
+ * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
+ * mode, but we set it here to 0xFFFF so that we can use the SYSRET
+ * instruction to return from system calls back to 32-bit applications.
+ * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
+ * descriptors. We therefore must ensure that the kernel uses something,
+ * though it will be ignored by hardware, that is compatible with 32-bit
+ * apps. For the same reason we must set the default op size of this
+ * descriptor to 32-bit operands.
+ */
+ set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
+ SEL_KPL, SDP_PAGES, SDP_OP32);
+ gdt[GDT_KDATA].usd_def32 = 1;
+
+ /*
+ * 64-bit user code segment.
+ */
+ set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
+ SDP_PAGES, SDP_OP32);
+
+ /*
+ * 32-bit user code segment.
+ */
+ set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
+ SEL_UPL, SDP_PAGES, SDP_OP32);
+
+ /*
+ * See gdt_ucode32() and gdt_ucode_native().
+ */
+ ucs_on = ucs_off = gdt[GDT_UCODE];
+ ucs_off.usd_p = 0; /* forces #np fault */
+
+ ucs32_on = ucs32_off = gdt[GDT_U32CODE];
+ ucs32_off.usd_p = 0; /* forces #np fault */
+
+ /*
+ * 32 and 64 bit data segments can actually share the same descriptor.
+ * In long mode only the present bit is checked but all other fields
+ * are loaded. But in compatibility mode all fields are interpreted
+ * as in legacy mode so they must be set correctly for a 32-bit data
+ * segment.
+ */
+ set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
+ SDP_PAGES, SDP_OP32);
+
+#if !defined(__xpv)
+
+ /*
+ * The 64-bit kernel has no default LDT. By default, the LDT descriptor
+ * in the GDT is 0.
+ */
+
+ /*
+ * Kernel TSS
+ */
+ set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
+ sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);
+
+#endif /* !__xpv */
+
+ /*
+ * Initialize fs and gs descriptors for 32 bit processes.
+ * Only attributes and limits are initialized, the effective
+ * base address is programmed via fsbase/gsbase.
+ */
+ set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
+ SEL_UPL, SDP_PAGES, SDP_OP32);
+ set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
+ SEL_UPL, SDP_PAGES, SDP_OP32);
+
+ /*
+ * Initialize the descriptors set aside for brand usage.
+ * Only attributes and limits are initialized.
+ */
+ for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
+ set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
+ SEL_UPL, SDP_PAGES, SDP_OP32);
+
+ /*
+ * Initialize convenient zero base user descriptors for clearing
+ * lwp private %fs and %gs descriptors in GDT. See setregs() for
+ * an example.
+ */
+ set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
+ SDP_BYTES, SDP_OP32);
+ set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
+ SDP_PAGES, SDP_OP32);
+}
+
+#if defined(__xpv)
+
+static user_desc_t *
+init_gdt(void)
+{
+ uint64_t gdtpa;
+ ulong_t ma[1]; /* XXPV should be a memory_t */
+ ulong_t addr;
+
+#if !defined(__lint)
+ /*
+ * Our gdt is never larger than a single page.
+ */
+ ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
+#endif
+ gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
+ PAGESIZE, PAGESIZE);
+ bzero(gdt0, PAGESIZE);
+
+ init_gdt_common(gdt0);
+
+ /*
+ * XXX Since we never invoke kmdb until after the kernel takes
+ * over the descriptor tables why not have it use the kernel's
+ * selectors?
+ */
+ if (boothowto & RB_DEBUG) {
+ set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
+ SEL_KPL, SDP_PAGES, SDP_OP32);
+ set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
+ SEL_KPL, SDP_PAGES, SDP_OP32);
+ }
+
+ /*
+ * Clear write permission for page containing the gdt and install it.
+ */
+ gdtpa = pfn_to_pa(va_to_pfn(gdt0));
+ ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
+ kbm_read_only((uintptr_t)gdt0, gdtpa);
+ xen_set_gdt(ma, NGDT);
+
+ /*
+ * Reload the segment registers to use the new GDT.
+ * On 64-bit, fixup KCS_SEL to be in ring 3.
+ * See KCS_SEL in segments.h.
+ */
+ load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);
+
+ /*
+ * setup %gs for kernel
+ */
+ xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);
+
+ /*
+ * XX64 We should never dereference off "other gsbase" or
+ * "fsbase". So, we should arrange to point FSBASE and
+ * KGSBASE somewhere truly awful e.g. point it at the last
+ * valid address below the hole so that any attempts to index
+ * off them cause an exception.
+ *
+ * For now, point it at 8G -- at least it should be unmapped
+ * until some 64-bit processes run.
+ */
+ addr = 0x200000000ul;
+ xen_set_segment_base(SEGBASE_FS, addr);
+ xen_set_segment_base(SEGBASE_GS_USER, addr);
+ xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);
+
+ return (gdt0);
+}
+
+#else /* __xpv */
+
+static user_desc_t *
+init_gdt(void)
+{
+ desctbr_t r_bgdt, r_gdt;
+ user_desc_t *bgdt;
+
+#if !defined(__lint)
+ /*
+ * Our gdt is never larger than a single page.
+ */
+ ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
+#endif
+ gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
+ PAGESIZE, PAGESIZE);
+ bzero(gdt0, PAGESIZE);
+
+ init_gdt_common(gdt0);
+
+ /*
+ * Copy in from boot's gdt to our gdt.
+ * Entry 0 is the null descriptor by definition.
+ */
+ rd_gdtr(&r_bgdt);
+ bgdt = (user_desc_t *)r_bgdt.dtr_base;
+ if (bgdt == NULL)
+ panic("null boot gdt");
+
+ gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
+ gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
+ gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
+ gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
+ gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];
+
+ /*
+ * Install our new GDT
+ */
+ r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
+ r_gdt.dtr_base = (uintptr_t)gdt0;
+ wr_gdtr(&r_gdt);
+
+ /*
+ * Reload the segment registers to use the new GDT
+ */
+ load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);
+
+ /*
+ * setup %gs for kernel
+ */
+ wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);
+
+ /*
+ * XX64 We should never dereference off "other gsbase" or
+ * "fsbase". So, we should arrange to point FSBASE and
+ * KGSBASE somewhere truly awful e.g. point it at the last
+ * valid address below the hole so that any attempts to index
+ * off them cause an exception.
+ *
+ * For now, point it at 8G -- at least it should be unmapped
+ * until some 64-bit processes run.
+ */
+ wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
+ wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);
+ return (gdt0);
+}
+
+#endif /* __xpv */
+
+
+/*
+ * Build kernel IDT.
+ *
+ * Note that for amd64 we pretty much require every gate to be an interrupt
+ * gate which blocks interrupts atomically on entry; that's because of our
+ * dependency on using 'swapgs' every time we come into the kernel to find
+ * the cpu structure. If we get interrupted just before doing that, %cs could
+ * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
+ * %gsbase is really still pointing at something in userland. Bad things will
+ * ensue. We use interrupt gates for i386 as well, even though this is not
+ * required for some traps.
+ *
+ * Perhaps they should have invented a trap gate that does an atomic swapgs?
+ */
+static void
+init_idt_common(gate_desc_t *idt)
+{
+ set_gatesegd(&idt[T_ZERODIV],
+ (kpti_enable == 1) ? &tr_div0trap : &div0trap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
+ set_gatesegd(&idt[T_SGLSTP],
+ (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
+ set_gatesegd(&idt[T_NMIFLT],
+ (kpti_enable == 1) ? &tr_nmiint : &nmiint,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
+ set_gatesegd(&idt[T_BPTFLT],
+ (kpti_enable == 1) ? &tr_brktrap : &brktrap,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
+ set_gatesegd(&idt[T_OVFLW],
+ (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
+ set_gatesegd(&idt[T_BOUNDFLT],
+ (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
+ set_gatesegd(&idt[T_ILLINST],
+ (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
+ set_gatesegd(&idt[T_NOEXTFLT],
+ (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));
+
+ /*
+ * double fault handler.
+ *
+ * Note that on the hypervisor a guest does not receive #df faults.
+ * Instead a failsafe event is injected into the guest if its selectors
+ * and/or stack is in a broken state. See xen_failsafe_callback.
+ */
+#if !defined(__xpv)
+ set_gatesegd(&idt[T_DBLFLT],
+ (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
+#endif /* !__xpv */
+
+ /*
+ * T_EXTOVRFLT coprocessor-segment-overrun not supported.
+ */
+ set_gatesegd(&idt[T_TSSFLT],
+ (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
+ set_gatesegd(&idt[T_SEGFLT],
+ (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
+ set_gatesegd(&idt[T_STKFLT],
+ (kpti_enable == 1) ? &tr_stktrap : &stktrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
+ set_gatesegd(&idt[T_GPFLT],
+ (kpti_enable == 1) ? &tr_gptrap : &gptrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
+ set_gatesegd(&idt[T_PGFLT],
+ (kpti_enable == 1) ? &tr_pftrap : &pftrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
+ set_gatesegd(&idt[T_EXTERRFLT],
+ (kpti_enable == 1) ? &tr_ndperr : &ndperr,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
+ set_gatesegd(&idt[T_ALIGNMENT],
+ (kpti_enable == 1) ? &tr_achktrap : &achktrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
+ set_gatesegd(&idt[T_MCE],
+ (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
+ set_gatesegd(&idt[T_SIMDFPE],
+ (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));
+
+ /*
+ * install fast trap handler at 210.
+ */
+ set_gatesegd(&idt[T_FASTTRAP],
+ (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));
+
+ /*
+ * System call handler.
+ */
+ set_gatesegd(&idt[T_SYSCALLINT],
+ (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));
+
+ /*
+ * Install the DTrace interrupt handler for the pid provider.
+ */
+ set_gatesegd(&idt[T_DTRACE_RET],
+ (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));
+
+ /*
+ * Prepare interposing descriptor for the syscall handler
+ * and cache copy of the default descriptor.
+ */
+ brand_tbl[0].ih_inum = T_SYSCALLINT;
+ brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];
+
+ set_gatesegd(&(brand_tbl[0].ih_interp_desc),
+ (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
+ &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
+ idt_vector_to_ist(T_SYSCALLINT));
+
+ brand_tbl[1].ih_inum = 0;
+}
+
+#if defined(__xpv)
+
+static void
+init_idt(gate_desc_t *idt)
+{
+ init_idt_common(idt);
+}
+
+#else /* __xpv */
+
+static void
+init_idt(gate_desc_t *idt)
+{
+ char ivctname[80];
+ void (*ivctptr)(void);
+ int i;
+
+ /*
+ * Initialize entire table with 'reserved' trap and then overwrite
+ * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
+ * since it can only be generated on a 386 processor. 15 is also
+ * unsupported and reserved.
+ */
+#if !defined(__xpv)
+ for (i = 0; i < NIDT; i++) {
+ set_gatesegd(&idt[i],
+ (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL,
+ idt_vector_to_ist(T_RESVTRAP));
+ }
+#else
+ for (i = 0; i < NIDT; i++) {
+ set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
+ IST_NONE);
+ }
+#endif
+
+ /*
+ * 20-31 reserved
+ */
+#if !defined(__xpv)
+ for (i = 20; i < 32; i++) {
+ set_gatesegd(&idt[i],
+ (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL,
+ idt_vector_to_ist(T_INVALTRAP));
+ }
+#else
+ for (i = 20; i < 32; i++) {
+ set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
+ IST_NONE);
+ }
+#endif
+
+ /*
+ * interrupts 32 - 255
+ */
+ for (i = 32; i < 256; i++) {
+#if !defined(__xpv)
+ (void) snprintf(ivctname, sizeof (ivctname),
+ (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
+#else
+ (void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
+#endif
+ ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
+ if (ivctptr == NULL)
+ panic("kobj_getsymvalue(%s) failed", ivctname);
+
+ set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
+ idt_vector_to_ist(i));
+ }
+
+ /*
+ * Now install the common ones. Note that it will overlay some
+ * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
+ */
+ init_idt_common(idt);
+}
+
+#endif /* __xpv */
+
+/*
+ * The kernel does not deal with LDTs unless a user explicitly creates
+ * one. Under normal circumstances, the LDTR contains 0. Any process attempting
+ * to reference the LDT will therefore cause a #gp. System calls made via the
+ * obsolete lcall mechanism are emulated by the #gp fault handler.
+ */
+static void
+init_ldt(void)
+{
+#if defined(__xpv)
+ xen_set_ldt(NULL, 0);
+#else
+ wr_ldtr(0);
+#endif
+}
+
+#if !defined(__xpv)
+
+static void
+init_tss(void)
+{
+ extern struct cpu cpus[];
+
+ /*
+ * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
+ * context switch but it'll be overwritten with this same value anyway.
+ */
+ if (kpti_enable == 1) {
+ ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
+ }
+
+ /* Set up the IST stacks for double fault, NMI, MCE. */
+ ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
+ ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
+ ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];
+
+ /*
+ * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
+ * enabled), and also for KDI (always).
+ */
+ ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
+
+ if (kpti_enable == 1) {
+ /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
+ ktss0->tss_ist5 =
+ (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
+
+ /* This IST stack is used for all other intrs (for KPTI). */
+ ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
+ }
+
+ /*
+	 * Set the I/O bitmap offset to the size of the TSS segment so that
+	 * there is no I/O permission map. This forces all user I/O
+	 * instructions to generate a #gp fault.
+ */
+ ktss0->tss_bitmapbase = sizeof (*ktss0);
+
+ /*
+ * Point %tr to descriptor for ktss0 in gdt.
+ */
+ wr_tsr(KTSS_SEL);
+}
+
+#endif /* !__xpv */
+
+#if defined(__xpv)
+
+void
+init_desctbls(void)
+{
+ uint_t vec;
+ user_desc_t *gdt;
+
+ /*
+ * Setup and install our GDT.
+ */
+ gdt = init_gdt();
+
+ /*
+ * Store static pa of gdt to speed up pa_to_ma() translations
+ * on lwp context switches.
+ */
+ ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
+ CPU->cpu_gdt = gdt;
+ CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt));
+
+ /*
+ * Setup and install our IDT.
+ */
+#if !defined(__lint)
+ ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
+#endif
+ idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
+ PAGESIZE, PAGESIZE);
+ bzero(idt0, PAGESIZE);
+ init_idt(idt0);
+ for (vec = 0; vec < NIDT; vec++)
+ xen_idt_write(&idt0[vec], vec);
+
+ CPU->cpu_idt = idt0;
+
+ /*
+ * set default kernel stack
+ */
+ xen_stack_switch(KDS_SEL,
+ (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]);
+
+ xen_init_callbacks();
+
+ init_ldt();
+}
+
+#else /* __xpv */
+
+void
+init_desctbls(void)
+{
+ user_desc_t *gdt;
+ desctbr_t idtr;
+
+ /*
+ * Allocate IDT and TSS structures on unique pages for better
+ * performance in virtual machines.
+ */
+#if !defined(__lint)
+ ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
+#endif
+ idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
+ PAGESIZE, PAGESIZE);
+ bzero(idt0, PAGESIZE);
+#if !defined(__lint)
+ ASSERT(sizeof (*ktss0) <= PAGESIZE);
+#endif
+ ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
+ PAGESIZE, PAGESIZE);
+ bzero(ktss0, PAGESIZE);
+
+
+ /*
+ * Setup and install our GDT.
+ */
+ gdt = init_gdt();
+ ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
+ CPU->cpu_gdt = gdt;
+
+ /*
+ * Initialize this CPU's LDT.
+ */
+ CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
+ LDT_CPU_SIZE, PAGESIZE);
+ bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
+ CPU->cpu_m.mcpu_ldt_len = 0;
+
+ /*
+ * Setup and install our IDT.
+ */
+ init_idt(idt0);
+
+ idtr.dtr_base = (uintptr_t)idt0;
+ idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
+ wr_idtr(&idtr);
+ CPU->cpu_idt = idt0;
+
+
+ init_tss();
+ CPU->cpu_tss = ktss0;
+ init_ldt();
+
+	/* Stash this so that the NMI, MCE, #DF and KDI handlers can use it. */
+ kpti_safe_cr3 = (uint64_t)getcr3();
+}
+
+#endif /* __xpv */
+
+#ifndef __xpv
+/*
+ * As per Intel SDM Vol 3 27.5.2, the GDTR limit is reset to 64KB on a VM exit,
+ * so we have to fix it up ourselves.
+ *
+ * The caller may still need to make sure that it can't go off-CPU with the
+ * incorrect limit before calling this (e.g. by disabling preemption).
+ */
+void
+reset_gdtr_limit(void)
+{
+ ulong_t flags = intr_clear();
+ desctbr_t gdtr;
+
+ rd_gdtr(&gdtr);
+ gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1;
+ wr_gdtr(&gdtr);
+
+ intr_restore(flags);
+}
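+
+/*
+ * For illustration, a minimal sketch of the calling pattern described above;
+ * exactly where the limit was truncated is left hypothetical:
+ *
+ *	kpreempt_disable();
+ *	... return from a path that left the truncated GDTR limit ...
+ *	reset_gdtr_limit();
+ *	kpreempt_enable();
+ */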
+#endif /* !__xpv */
+
+/*
+ * In the early kernel, we need to set up a simple GDT to run on.
+ *
+ * XXPV Can dboot use this too? See dboot_gdt.s
+ */
+void
+init_boot_gdt(user_desc_t *bgdt)
+{
+ set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
+ SDP_PAGES, SDP_OP32);
+ set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
+ SDP_PAGES, SDP_OP32);
+}
+
+/*
+ * Enable interpositioning on the system call path by rewriting the
+ * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
+ * the branded entry points.
+ */
+void
+brand_interpositioning_enable(void)
+{
+ gate_desc_t *idt = CPU->cpu_idt;
+ int i;
+
+ ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
+
+ for (i = 0; brand_tbl[i].ih_inum; i++) {
+ idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
+#if defined(__xpv)
+ xen_idt_write(&idt[brand_tbl[i].ih_inum],
+ brand_tbl[i].ih_inum);
+#endif
+ }
+
+#if defined(__xpv)
+
+ /*
+	 * Currently the hypervisor only supports 64-bit syscalls via the
+	 * syscall instruction. The 32-bit syscalls are handled by the
+	 * interrupt gate above.
+ */
+ xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
+ CALLBACKF_mask_events);
+
+#else
+
+ if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
+ if (kpti_enable == 1) {
+ wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
+ wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
+ } else {
+ wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
+ wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
+ }
+ }
+
+#endif
+
+ if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
+ if (kpti_enable == 1) {
+ wrmsr(MSR_INTC_SEP_EIP,
+ (uintptr_t)tr_brand_sys_sysenter);
+ } else {
+ wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
+ }
+ }
+}
+
+/*
+ * Disable interpositioning on the system call path by rewriting the
+ * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
+ * the standard entry points, which bypass the interpositioning hooks.
+ */
+void
+brand_interpositioning_disable(void)
+{
+ gate_desc_t *idt = CPU->cpu_idt;
+ int i;
+
+ ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
+
+ for (i = 0; brand_tbl[i].ih_inum; i++) {
+ idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
+#if defined(__xpv)
+ xen_idt_write(&idt[brand_tbl[i].ih_inum],
+ brand_tbl[i].ih_inum);
+#endif
+ }
+
+#if defined(__xpv)
+
+ /*
+ * See comment above in brand_interpositioning_enable.
+ */
+ xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
+ CALLBACKF_mask_events);
+
+#else
+
+ if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
+ if (kpti_enable == 1) {
+ wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
+ wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
+ } else {
+ wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
+ wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
+ }
+ }
+
+#endif
+
+ if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
+ if (kpti_enable == 1) {
+ wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
+ } else {
+ wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
+ }
+ }
+}
diff --git a/usr/src/uts/intel/os/fpu.c b/usr/src/uts/intel/os/fpu.c
new file mode 100644
index 0000000000..0037f49f85
--- /dev/null
+++ b/usr/src/uts/intel/os/fpu.c
@@ -0,0 +1,1506 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2021 Joyent, Inc.
+ * Copyright 2021 RackTop Systems, Inc.
+ */
+
+/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
+/* All Rights Reserved */
+
+/* Copyright (c) 1987, 1988 Microsoft Corporation */
+/* All Rights Reserved */
+
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ * All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/signal.h>
+#include <sys/regset.h>
+#include <sys/privregs.h>
+#include <sys/psw.h>
+#include <sys/trap.h>
+#include <sys/fault.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/pcb.h>
+#include <sys/lwp.h>
+#include <sys/cpuvar.h>
+#include <sys/thread.h>
+#include <sys/disp.h>
+#include <sys/fp.h>
+#include <sys/siginfo.h>
+#include <sys/archsystm.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/x86_archext.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kfpu.h>
+
+/*
+ * FPU Management Overview
+ * -----------------------
+ *
+ * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
+ * however, many aspects of its life as a coprocessor are still around in x86.
+ *
+ * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
+ * While that state still exists, there is much more that is covered by the FPU.
+ * Today, this includes not just traditional FPU state, but also supervisor only
+ * state. The following state is currently managed and covered logically by the
+ * idea of the FPU registers:
+ *
+ * o Traditional x87 FPU
+ * o Vector Registers (%xmm, %ymm, %zmm)
+ * o Memory Protection Extensions (MPX) Bounds Registers
+ * o Protected Key Rights Registers (PKRU)
+ * o Processor Trace data
+ *
+ * The rest of this covers how the FPU is managed and controlled, how state is
+ * saved and restored between threads, interactions with hypervisors, and other
+ * information exported to user land through aux vectors. A lot of background
+ * information is here to synthesize major parts of the Intel SDM, but
+ * unfortunately, it is not a replacement for reading it.
+ *
+ * FPU Control Registers
+ * ---------------------
+ *
+ * Because the x87 FPU began its life as a co-processor and the FPU was
+ * optional, there are several bits that show up in %cr0 that we have to
+ * manipulate when dealing with the FPU. These are:
+ *
+ * o CR0.ET The 'extension type' bit. This was used originally to indicate
+ * that the FPU co-processor was present. Now it is forced on for
+ * compatibility. This is often used to verify whether or not the
+ * FPU is present.
+ *
+ * o CR0.NE The 'native error' bit. Used to indicate that native error
+ * mode should be enabled. This indicates that we should take traps
+ * on FPU errors. The OS enables this early in boot.
+ *
+ * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not
+ * wait/fwait instructions generate a #NM if CR0.TS is set.
+ *
+ * o CR0.EM The 'Emulation' bit. This is used to cause floating point
+ * operations (x87 through SSE4) to trap with a #UD so they can be
+ * emulated. The system never sets this bit, but makes sure it is
+ * clear on processor start up.
+ *
+ * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating
+ * point operation will generate a #NM. An fwait will as well,
+ * depending on the value in CR0.MP.
+ *
+ * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
+ * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
+ * complicated role. Historically it has been used to allow running systems to
+ * restore the FPU registers lazily. This will be discussed in greater depth
+ * later on.
+ *
+ * %cr4 is also used as part of the FPU control. Specifically we need to worry
+ * about the following bits in the system:
+ *
+ * o CR4.OSFXSR This bit is used to indicate that the OS understands and
+ * supports the execution of the fxsave and fxrstor
+ * instructions. This bit is required to be set to enable
+ * the use of the SSE->SSE4 instructions.
+ *
+ * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand
+ * and take a SIMD floating point exception (#XM). This bit
+ * is always enabled by the system.
+ *
+ * o CR4.OSXSAVE This bit is used to indicate that the OS understands and
+ * supports the execution of the xsave and xrstor family of
+ * instructions. This bit is required to use any of the AVX
+ * and newer feature sets.
+ *
+ * Because all supported processors are 64-bit, they'll always support the XMM
+ * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
+ * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
+ *
+ * %xcr0 is used to manage the behavior of the xsave feature set and is only
+ * present on the system if xsave is supported. %xcr0 is read and written via
+ * the xgetbv and xsetbv instructions. Each bit in %xcr0 refers to a different
+ * component of the xsave state and controls whether or not that information
+ * is saved and restored. For newer feature sets like AVX and MPX, it also
+ * controls whether or not the corresponding instructions can be executed
+ * (much like CR4.OSFXSR does for the SSE feature sets).
+ *
+ * Everything in %xcr0 covers features available to users. There is also the
+ * IA32_XSS MSR which is used to control supervisor-only features that are still
+ * part of the xsave state. Bits that can be set in %xcr0 are reserved in
+ * IA32_XSS and vice versa. This is an important property that is particularly
+ * relevant to how the xsave instructions operate.
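+ *
+ * A minimal sketch of reading %xcr0 (get_xcr() and XFEATURE_ENABLED_MASK are
+ * used later in this file; the AVX test is purely illustrative):
+ *
+ *	uint64_t xcr0 = get_xcr(XFEATURE_ENABLED_MASK);
+ *	if (xcr0 & XFEATURE_AVX)
+ *		... the OS has enabled saving and restoring %ymm state ...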
+ *
+ * Save Mechanisms
+ * ---------------
+ *
+ * When switching between running threads the FPU state needs to be saved and
+ * restored by the OS. If this state was not saved, users would rightfully
+ * complain about corrupt state. There are three mechanisms that exist on the
+ * processor for saving and restoring these state images:
+ *
+ * o fsave
+ * o fxsave
+ * o xsave
+ *
+ * fsave saves and restores only the x87 FPU and is the oldest of these
+ * mechanisms. This mechanism is never used in the kernel today because we are
+ * always running on systems that support fxsave.
+ *
+ * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
+ * state to be saved and restored to and from a struct fxsave_state. This is the
+ * default mechanism that is used to save and restore the FPU on amd64. An
+ * important aspect of fxsave that was different from the original i386 fsave
+ * mechanism is that the restoring of FPU state with pending exceptions will not
+ * generate an exception; it will be deferred to the next use of the FPU.
+ *
+ * The final and by far the most complex mechanism is that of the xsave set.
+ * xsave allows for saving and restoring all of the traditional x86 pieces (x87
+ * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
+ * registers.
+ *
+ * Data is saved and restored into and out of a struct xsave_state. The first
+ * part of the struct xsave_state is equivalent to the struct fxsave_state.
+ * After that, there is a header which is used to describe the remaining
+ * portions of the state. The header is a 64-byte value of which the first two
+ * uint64_t values are defined and the rest are reserved and must be zero. The
+ * first uint64_t is the xstate_bv member. This describes which values in the
+ * xsave_state are actually valid and present. This is updated on a save and
+ * used on restore. The second member is the xcomp_bv member. Its last bit
+ * determines whether or not a compressed version of the structure is used.
+ *
+ * When the uncompressed structure is used (currently the only format we
+ * support), then each state component is at a fixed offset in the structure,
+ * even if it is not being used. For example, if you only saved the AVX related
+ * state, but did not save the MPX related state, the offset would not change
+ * for any component. With the compressed format, components that aren't used
+ * are all elided (though the x87 and SSE state are always there).
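+ *
+ * For the uncompressed format the layout is therefore (sizes here follow the
+ * AVX_XSAVE_SIZE definition further down; offsets of components beyond the
+ * header are enumerated by cpuid):
+ *
+ *	offset   0	struct fxsave_state (legacy x87 and SSE state)
+ *	offset 512	xstate_bv, xcomp_bv and reserved space (64-byte header)
+ *	offset 576	extended components (%ymm and beyond) at fixed offsets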
+ *
+ * Unlike fxsave which saves all state, the xsave family does not always save
+ * and restore all the state that could be covered by the xsave_state. The
+ * instructions all take an argument which is a mask of what to consider. This
+ * is the same mask that will be used in the xstate_bv vector and it is also the
+ * same values that are present in %xcr0 and IA32_XSS, though IA32_XSS is only
+ * considered by the xsaves and xrstors instructions.
+ *
+ * When a save or restore is requested, a bitwise and is performed between the
+ * requested bits and those that have been enabled in %xcr0. Only the bits that
+ * match that are then saved or restored. Others will be silently ignored by
+ * the processor. This idea is used often in the OS. We will always request that
+ * we save and restore all of the state, but only those portions that are
+ * actually enabled in %xcr0 will be touched.
+ *
+ * If a feature has been asked to be restored that is not set in the xstate_bv
+ * feature vector of the save state, then it will be set to its initial state by
+ * the processor (usually zeros). Also, when asked to save state, the processor
+ * may not write out data that is in its initial state as an optimization. This
+ * optimization only applies to saving data and not to restoring data.
+ *
+ * There are a few different variants of the xsave and xrstor instruction. They
+ * are:
+ *
+ * o xsave This is the original save instruction. It will save all of the
+ * requested data in the xsave state structure. It only saves data
+ * in the uncompressed (xcomp_bv[63] is zero) format. It may be
+ * executed at all privilege levels.
+ *
+ * o xrstor This is the original restore instruction. It will restore all of
+ * the requested data. The xrstor function can handle both the
+ * compressed and uncompressed formats. It may be executed at all
+ * privilege levels.
+ *
+ * o xsaveopt This is a variant of the xsave instruction that employs
+ * optimizations to try and only write out state that has been
+ * modified since the last time an xrstor instruction was called.
+ * The processor tracks a tuple of information about the last
+ * xrstor and tries to ensure that the same buffer is being used
+ * when this optimization is being used. However, because of the
+ *		when this optimization is being used. However, because it
+ *		tracks the xrstor buffer by its address, it is not suitable
+ *		for use if that buffer can be easily reused.
+ * rtld. It may be executed at all privilege levels.
+ *
+ * o xsavec This is a variant of the xsave instruction that writes out the
+ * compressed form of the xsave_state. Otherwise it behaves as
+ * xsave. It may be executed at all privilege levels.
+ *
+ * o xsaves This is a variant of the xsave instruction. It is similar to
+ * xsavec in that it always writes the compressed form of the
+ * buffer. Unlike all the other forms, this instruction looks at
+ * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
+ * what to save and restore. xsaves also implements the same
+ * optimization that xsaveopt does around modified pieces. User
+ * land may not execute the instruction.
+ *
+ * o xrstors This is a variant of the xrstor instruction. Similar to xsaves
+ * it can save and restore both the user and privileged states.
+ * Unlike xrstor it can only operate on the compressed form.
+ * User land may not execute the instruction.
+ *
+ * Based on all of these, the kernel has a precedence for what it will use.
+ * Basically, xsaves (which is not currently supported) would be preferred to
+ * xsaveopt, which is preferred to xsave. A similar scheme is used when
+ * informing rtld (more later) about what it should use. xsavec is preferred
+ * to xsave. xsaveopt is not
+ * recommended due to the modified optimization not being appropriate for this
+ * use.
+ *
+ * Finally, there is one last gotcha with the xsave state. Importantly, some
+ * AMD processors did not always save and restore some of the FPU exception
+ * state the way Intel processors did. In those cases the OS makes up for it
+ * itself.
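+ *
+ * A minimal sketch of how the save mechanism and mask end up being applied
+ * (fp_save() below does this in full; error handling is omitted here):
+ *
+ *	if (fp_save_mech == FP_XSAVE)
+ *		xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
+ *	else
+ *		fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);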
+ *
+ * FPU Initialization
+ * ------------------
+ *
+ * One difference with the FPU registers is that not all threads have FPU state,
+ * only those that have an lwp. Generally this means kernel threads, which all
+ * share p0 and its lwp, do not have FPU state. Though there are definitely
+ * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
+ * and lwp interchangeably, just think of thread meaning a thread that has a
+ * lwp.
+ *
+ * Each lwp has its FPU state allocated in its pcb (process control block). The
+ * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
+ * dynamically at start up based on the save mechanism that we're using and the
+ * amount of memory required for it. This is dynamic because the xsave_state
+ * size varies based on the supported feature set.
+ *
+ * The hardware side of the FPU is initialized early in boot before we mount the
+ * root file system. This is effectively done in fpu_probe(). This is where we
+ * make the final decision about what the save and restore mechanisms we should
+ * use are, create the fpsave_cachep kmem cache, and initialize a number of
+ * function pointers that use save and restoring logic.
+ *
+ * The thread/lwp side is a little more involved. There are two different
+ * things that we need to concern ourselves with. The first is how the FPU
+ * resources are allocated and the second is how the FPU state is initialized
+ * for a given lwp.
+ *
+ * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
+ * This is always called unconditionally by the system as part of creating an
+ * LWP.
+ *
+ * There are three different initialization paths that we deal with. The first
+ * is when we are executing a new process. As part of exec all of the register
+ * state is reset. The exec case is particularly important because init is born
+ * like Athena, sprouting from the head of the kernel, without any true parent
+ * to fork from. The second is used whenever we fork or create a new lwp. The
+ * third is to deal with special lwps like the agent lwp.
+ *
+ * During exec, we will call fp_exec() which will initialize and set up the FPU
+ * state for the process. That will fill in the initial state for the FPU and
+ * also set that state in the FPU itself. As part of fp_exec() we also install a
+ * thread context operations vector that takes care of dealing with the saving
+ * and restoring of the FPU. These context handlers will also be called whenever
+ * an lwp is created or forked. In those cases, to initialize the FPU we will
+ * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
+ * operations vector for the new thread.
+ *
+ * Next we'll end up in the context operation fp_new_lwp(). This saves the
+ * current thread's state, initializes the new thread's state, and copies over
+ * the relevant parts of the originating thread's state. It's at this point that
+ * we also install the FPU context operations into the new thread, which ensures
+ * that all future threads that are descendants of the current one get the
+ * thread context operations (unless they call exec).
+ *
+ * To deal with some things like the agent lwp, we double check the state of the
+ * FPU in sys_rtt_common() to make sure that it has been enabled before
+ * returning to user land. In general, this path should be rare, but it's useful
+ * for the odd lwp here and there.
+ *
+ * The FPU state will remain valid most of the time. There are times that
+ * the state will be rewritten: for example in restorecontext, due to /proc, or
+ * when the lwp calls exec(). Whether the context is being freed or we are
+ * resetting the state, we will call fp_free() to disable the FPU and our
+ * context.
+ *
+ * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
+ * state by calling fp_lwp_cleanup().
+ *
+ * Kernel FPU Multiplexing
+ * -----------------------
+ *
+ * Just as the kernel has to maintain all of the general purpose registers when
+ * switching between scheduled threads, the same is true of the FPU registers.
+ *
+ * When a thread has FPU state, it also has a set of context operations
+ * installed. These context operations take care of making sure that the FPU is
+ * properly saved and restored during a context switch (fpsave_ctxt and
+ * fprestore_ctxt respectively). This means that the current implementation of
+ * the FPU is 'eager', when a thread is running the CPU will have its FPU state
+ * loaded. While this is always true when executing in userland, there are a few
+ * cases where this is not true in the kernel.
+ *
+ * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
+ * employed. This meant that the FPU would be saved on a context switch and the
+ * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
+ * then take a #NM trap, at which point we would restore the FPU from the save
+ * area and return to user land. Given the frequency of use of the FPU alone by
+ * libc, there's no point returning to user land just to trap again.
+ *
+ * There are a few cases though where the FPU state may need to be changed for a
+ * thread on its behalf. The most notable cases are in the case of processes
+ * using /proc, restorecontext, forking, etc. In all of these cases the kernel
+ * will force a thread's FPU state to be saved into the PCB through the fp_save()
+ * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the
+ * pcb. This indicates that the save state holds currently valid data. As a side
+ * effect of this, CR0.TS will be set. To make sure that all of the state is
+ * updated before returning to user land, in these cases, we set a flag on the
+ * PCB that says the FPU needs to be updated. This will make sure that we take
+ * the slow path out of a system call to fix things up for the thread. Due to
+ * the fact that this is a rather rare case, effectively setting the equivalent
+ * of t_postsys is acceptable.
+ *
+ * CR0.TS will be set after a save occurs and cleared when a restore occurs.
+ * Generally this means it will be cleared immediately by the new thread that is
+ * running in a context switch. However, this isn't the case for kernel threads.
+ * They currently operate with CR0.TS set as no kernel state is restored for
+ * them. This means that using the FPU will cause a #NM and panic.
+ *
+ * The FPU_VALID flag on the currently executing thread's pcb is meant to track
+ * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
+ * However, because we eagerly restore, the only time that CR0.TS should be set
+ * for a non-kernel thread is during operations where it will be cleared before
+ * returning to user land; importantly, the only data in the FPU at that point
+ * is the thread's own.
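+ *
+ * A minimal sketch of the forced-save path described above (fp_save() is
+ * defined later in this file; the /proc access itself is elided):
+ *
+ *	fp_save(fp);	flushes the live state into the pcb, sets FPU_VALID
+ *			and marks the pcb so the FPU is refreshed on the way
+ *			back to user land
+ *	... read or modify the saved copy in the pcb ...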
+ *
+ * Kernel FPU Usage
+ * ----------------
+ *
+ * Traditionally the kernel never used the FPU since it had no need for
+ * floating point operations. However, modern FPU hardware supports a variety
+ * of SIMD extensions which can speed up code such as parity calculations or
+ * encryption.
+ *
+ * To allow the kernel to take advantage of these features, the
+ * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped
+ * around any usage of the FPU by the kernel to ensure that user-level context
+ * is properly saved/restored, as well as to properly set up the FPU for use by
+ * the kernel. There are a variety of ways this wrapping can be used, as
+ * discussed in this section below.
+ *
+ * When kernel_fpu_begin() and kernel_fpu_end() are used for extended
+ * operations, the kernel_fpu_alloc() function should be used to allocate a
+ * kfpu_state_t structure that is used to save/restore the thread's kernel FPU
+ * state. This structure is not tied to any thread. That is, different threads
+ * can reuse the same kfpu_state_t structure, although not concurrently. A
+ * kfpu_state_t structure is freed by the kernel_fpu_free() function.
+ *
+ * In some cases, the kernel may need to use the FPU for a short operation
+ * without the overhead to manage a kfpu_state_t structure and without
+ * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE
+ * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags
+ * parameter. This indicates that there is no kfpu_state_t. When used this way,
+ * kernel preemption should be disabled by the caller (kpreempt_disable) before
+ * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end().
+ * For this usage, it is important to limit the kernel's FPU use to short
+ * operations. The tradeoff between using the FPU without a kfpu_state_t
+ * structure vs. the overhead of allowing a context switch while using the FPU
+ * should be carefully considered on a case by case basis.
+ *
+ * In other cases, kernel threads have an LWP, but never execute in user space.
+ * In this situation, the LWP's pcb_fpu area can be used to save/restore the
+ * kernel's FPU state if the thread is context switched, instead of having to
+ * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the
+ * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to
+ * enable this behavior. It is the caller's responsibility to ensure that this
+ * is only used for a kernel thread which never executes in user space.
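+ *
+ * A minimal sketch of the KFPU_NO_STATE pattern described above (the SIMD
+ * work in the middle is hypothetical):
+ *
+ *	kpreempt_disable();
+ *	kernel_fpu_begin(NULL, KFPU_NO_STATE);
+ *	... short SIMD sequence ...
+ *	kernel_fpu_end(NULL, KFPU_NO_STATE);
+ *	kpreempt_enable();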
+ *
+ * FPU Exceptions
+ * --------------
+ *
+ * Certain operations can cause the kernel to take traps due to FPU activity.
+ * Generally these events will cause a user process to receive a SIGFPE and
+ * the kernel receives it in kernel context, we will die. Traditionally the #NM
+ * (Device Not Available / No Math) exception generated by CR0.TS would have
+ * caused us to restore the FPU. Now it is a fatal event regardless of whether
+ * or not user land causes it.
+ *
+ * While there are some cases where the kernel uses the FPU, it is up to the
+ * kernel to use the FPU in a way such that it cannot receive a trap or to use
+ * the appropriate trap protection mechanisms.
+ *
+ * Hypervisors
+ * -----------
+ *
+ * When providing support for hypervisors things are a little bit more
+ * complicated because the FPU is not virtualized at all. This means that they
+ * need to save and restore the FPU and %xcr0 across entry and exit to the
+ * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
+ * allow us to use the full native state to make sure that we are always saving
+ * and restoring the full FPU that the host sees, even when the guest is using a
+ * subset.
+ *
+ * One tricky aspect of this is that the guest may be using a subset of %xcr0
+ * and therefore changing our %xcr0 on the fly. It is vital that when we're
+ * saving and restoring the FPU that we always use the largest %xcr0 contents
+ * otherwise we will end up leaving behind data in it.
+ *
+ * ELF PLT Support
+ * ---------------
+ *
+ * rtld has to preserve a subset of the FPU when it is saving and restoring
+ * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
+ * more information. As a result, we set up an aux vector that contains
+ * information about what save and restore mechanisms it should be using and
+ * the sizing thereof based on what the kernel supports. This is passed down in
+ * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
+ * initialized in fpu_subr.c.
+ */
+
+kmem_cache_t *fpsave_cachep;
+
+/* Legacy fxsave layout + xsave header + ymm */
+#define AVX_XSAVE_SIZE (512 + 64 + 256)
+
+/*
+ * Various sanity checks.
+ */
+CTASSERT(sizeof (struct fxsave_state) == 512);
+CTASSERT(sizeof (struct fnsave_state) == 108);
+CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
+CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
+
+/*
+ * This structure is the x86 implementation of the kernel FPU that is defined in
+ * uts/common/sys/kfpu.h.
+ */
+
+typedef enum kfpu_flags {
+ /*
+ * This indicates that the save state has initial FPU data.
+ */
+ KFPU_F_INITIALIZED = 0x01
+} kfpu_flags_t;
+
+struct kfpu_state {
+ fpu_ctx_t kfpu_ctx;
+ kfpu_flags_t kfpu_flags;
+ kthread_t *kfpu_curthread;
+};
+
+/*
+ * Initial kfpu state for SSE/SSE2 used by fpinit()
+ */
+const struct fxsave_state sse_initial = {
+ FPU_CW_INIT, /* fx_fcw */
+ 0, /* fx_fsw */
+ 0, /* fx_fctw */
+ 0, /* fx_fop */
+ 0, /* fx_rip */
+ 0, /* fx_rdp */
+ SSE_MXCSR_INIT /* fx_mxcsr */
+ /* rest of structure is zero */
+};
+
+/*
+ * Initial kfpu state for AVX used by fpinit()
+ */
+const struct xsave_state avx_initial = {
+ /*
+ * The definition below needs to be identical with sse_initial
+ * defined above.
+ */
+ {
+ FPU_CW_INIT, /* fx_fcw */
+ 0, /* fx_fsw */
+ 0, /* fx_fctw */
+ 0, /* fx_fop */
+ 0, /* fx_rip */
+ 0, /* fx_rdp */
+ SSE_MXCSR_INIT /* fx_mxcsr */
+ /* rest of structure is zero */
+ },
+ /*
+ * bit0 = 1 for XSTATE_BV to indicate that legacy fields are valid,
+ * and CPU should initialize XMM/YMM.
+ */
+ 1,
+ 0 /* xs_xcomp_bv */
+ /* rest of structure is zero */
+};
+
+/*
+ * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
+ * the #gp exception caused by setting unsupported bits in the
+ * MXCSR register
+ */
+uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
+
+/*
+ * Initial kfpu state for x87 used by fpinit()
+ */
+const struct fnsave_state x87_initial = {
+ FPU_CW_INIT, /* f_fcw */
+ 0, /* __f_ign0 */
+ 0, /* f_fsw */
+ 0, /* __f_ign1 */
+ 0xffff, /* f_ftw */
+ /* rest of structure is zero */
+};
+
+/*
+ * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we
+ * have an XSAVE-capable chip in fpu_probe.
+ */
+void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
+void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
+
+/*
+ * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
+ */
+void (*xsavep)(struct xsave_state *, uint64_t) = xsave;
+
+static int fpe_sicode(uint_t);
+static int fpe_simd_sicode(uint_t);
+
+/*
+ * Copy the state of parent lwp's floating point context into the new lwp.
+ * Invoked for both fork() and lwp_create().
+ *
+ * Note that we inherit -only- the control state (e.g. exception masks,
+ * rounding, precision control, etc.); the FPU registers are otherwise
+ * reset to their initial state.
+ */
+static void
+fp_new_lwp(kthread_id_t t, kthread_id_t ct)
+{
+ struct fpu_ctx *fp; /* parent fpu context */
+ struct fpu_ctx *cfp; /* new fpu context */
+ struct fxsave_state *fx, *cfx;
+ struct xsave_state *cxs;
+
+ ASSERT(fp_kind != FP_NO);
+
+ fp = &t->t_lwp->lwp_pcb.pcb_fpu;
+ cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;
+
+ /*
+ * If the parent FPU state is still in the FPU hw then save it;
+ * conveniently, fp_save() already does this for us nicely.
+ */
+ fp_save(fp);
+
+ cfp->fpu_flags = FPU_EN | FPU_VALID;
+ cfp->fpu_regs.kfpu_status = 0;
+ cfp->fpu_regs.kfpu_xstatus = 0;
+
+ /*
+ * Make sure that the child's FPU is cleaned up and made ready for user
+ * land.
+ */
+ PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);
+
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ fx = fp->fpu_regs.kfpu_u.kfpu_fx;
+ cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
+ bcopy(&sse_initial, cfx, sizeof (*cfx));
+ cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
+ cfx->fx_fcw = fx->fx_fcw;
+ break;
+
+ case FP_XSAVE:
+ cfp->fpu_xsave_mask = fp->fpu_xsave_mask;
+
+ VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
+
+ fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
+ cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
+ cfx = &cxs->xs_fxsave;
+
+ bcopy(&avx_initial, cxs, sizeof (*cxs));
+ cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
+ cfx->fx_fcw = fx->fx_fcw;
+ cxs->xs_xstate_bv |= (get_xcr(XFEATURE_ENABLED_MASK) &
+ XFEATURE_FP_INITIAL);
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+
+ /*
+ * Mark that both the parent and child need to have the FPU cleaned up
+ * before returning to user land.
+ */
+
+ installctx(ct, cfp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
+ fp_new_lwp, NULL, fp_free, NULL);
+}
+
+/*
+ * Free any state associated with floating point context.
+ * Fp_free can be called in three cases:
+ * 1) from reaper -> thread_free -> freectx-> fp_free
+ * fp context belongs to a thread on deathrow
+ * nothing to do, thread will never be resumed
+ * thread calling ctxfree is reaper
+ *
+ * 2) from exec -> freectx -> fp_free
+ * fp context belongs to the current thread
+ * must disable fpu, thread calling ctxfree is curthread
+ *
+ * 3) from restorecontext -> setfpregs -> fp_free
+ * we have a modified context in the memory (lwp->pcb_fpu)
+ * disable fpu and release the fp context for the CPU
+ *
+ */
+/*ARGSUSED*/
+void
+fp_free(struct fpu_ctx *fp, int isexec)
+{
+ ASSERT(fp_kind != FP_NO);
+
+ if (fp->fpu_flags & FPU_VALID)
+ return;
+
+ kpreempt_disable();
+ /*
+ * We want to do fpsave rather than fpdisable so that we can
+ * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit
+ */
+ fp->fpu_flags |= FPU_VALID;
+ /* If for current thread disable FP to track FPU_VALID */
+ if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
+ /* Clear errors if any to prevent frstor from complaining */
+ (void) fperr_reset();
+ if (fp_kind & __FP_SSE)
+ (void) fpxerr_reset();
+ fpdisable();
+ }
+ kpreempt_enable();
+}
+
+/*
+ * Store the floating point state and disable the floating point unit.
+ */
+void
+fp_save(struct fpu_ctx *fp)
+{
+ ASSERT(fp_kind != FP_NO);
+
+ kpreempt_disable();
+ if (!fp || fp->fpu_flags & FPU_VALID ||
+ (fp->fpu_flags & FPU_EN) == 0) {
+ kpreempt_enable();
+ return;
+ }
+ ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);
+
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
+ break;
+
+ case FP_XSAVE:
+ xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+
+ fp->fpu_flags |= FPU_VALID;
+
+ /*
+ * We save the FPU as part of forking, execing, modifications via /proc,
+ * restorecontext, etc. As such, we need to make sure that we return to
+ * userland with valid state in the FPU. If we're context switched out
+ * before we hit sys_rtt_common() we'll end up having restored the FPU
+ * as part of the context ops operations. The restore logic always makes
+ * sure that FPU_VALID is set before doing a restore so we don't restore
+ * it a second time.
+ */
+ PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);
+
+ kpreempt_enable();
+}
+
+/*
+ * Restore the FPU context for the thread:
+ * The possibilities are:
+ * 1. No active FPU context: Load the new context into the FPU hw
+ * and enable the FPU.
+ */
+void
+fp_restore(struct fpu_ctx *fp)
+{
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
+ break;
+
+ case FP_XSAVE:
+ xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+
+ fp->fpu_flags &= ~FPU_VALID;
+}
+
+/*
+ * Reset the FPU such that it is in a valid state for a new thread that is
+ * coming out of exec. The FPU will be in a usable state at this point. At this
+ * point we know that the FPU state has already been allocated and if this
+ * wasn't an init process, then it will have had fp_free() previously called.
+ */
+void
+fp_exec(void)
+{
+ struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
+ struct ctxop *ctx = installctx_preallocate();
+
+ if (fp_save_mech == FP_XSAVE) {
+ fp->fpu_xsave_mask = XFEATURE_FP_ALL;
+ }
+
+ /*
+ * Make sure that we're not preempted in the middle of initializing the
+ * FPU on CPU.
+ */
+ kpreempt_disable();
+ installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
+ fp_new_lwp, NULL, fp_free, ctx);
+ fpinit();
+ fp->fpu_flags = FPU_EN;
+ kpreempt_enable();
+}
+
+
+/*
+ * Seeds the initial state for the current thread. The possibilities are:
+ * 1. Another process has modified the FPU state before we have done any
+ * initialization: Load the FPU state from the LWP state.
+ * 2. The FPU state has not been externally modified: Load a clean state.
+ */
+void
+fp_seed(void)
+{
+ struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
+
+ ASSERT(curthread->t_preempt >= 1);
+ ASSERT((fp->fpu_flags & FPU_EN) == 0);
+
+ /*
+ * Always initialize a new context and initialize the hardware.
+ */
+ if (fp_save_mech == FP_XSAVE) {
+ fp->fpu_xsave_mask = XFEATURE_FP_ALL;
+ }
+
+ installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
+ fp_new_lwp, NULL, fp_free, NULL);
+ fpinit();
+
+ /*
+ * If FPU_VALID is set, it means someone has modified registers via
+ * /proc. In this case, restore the current lwp's state.
+ */
+ if (fp->fpu_flags & FPU_VALID)
+ fp_restore(fp);
+
+ ASSERT((fp->fpu_flags & FPU_VALID) == 0);
+ fp->fpu_flags = FPU_EN;
+}
+
+/*
+ * When using xsave/xrstor, these three functions are used by the lwp code to
+ * manage the memory for the xsave area.
+ */
+void
+fp_lwp_init(struct _klwp *lwp)
+{
+ struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
+
+ /*
+ * We keep a copy of the pointer in lwp_fpu so that we can restore the
+ * value in forklwp() after we duplicate the parent's LWP state.
+ */
+ lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
+ kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
+
+ if (fp_save_mech == FP_XSAVE) {
+ /*
+		 * We bzero since the fpinit() code path will only
+		 * partially initialize the xsave area using avx_initial.
+ */
+ ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
+ bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
+ }
+}
+
+void
+fp_lwp_cleanup(struct _klwp *lwp)
+{
+ struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
+
+ if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
+ kmem_cache_free(fpsave_cachep,
+ fp->fpu_regs.kfpu_u.kfpu_generic);
+ lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
+ }
+}
+
+/*
+ * Called during the process of forklwp(). The kfpu_u pointer will have been
+ * overwritten while copying the parent's LWP structure. We have a valid copy
+ * stashed in the child's lwp_fpu which we use to restore the correct value.
+ */
+void
+fp_lwp_dup(struct _klwp *lwp)
+{
+ void *xp = lwp->lwp_fpu;
+ size_t sz;
+
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ sz = sizeof (struct fxsave_state);
+ break;
+ case FP_XSAVE:
+ sz = cpuid_get_xsave_size();
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+
+ /* copy the parent's values into the new lwp's struct */
+ bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
+ /* now restore the pointer */
+ lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
+}
+
+/*
+ * Handle a processor extension error fault.
+ * Returns non-zero for error.
+ */
+
+/*ARGSUSED*/
+int
+fpexterrflt(struct regs *rp)
+{
+ uint32_t fpcw, fpsw;
+ fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
+
+ ASSERT(fp_kind != FP_NO);
+
+ /*
+ * Now we can enable the interrupts.
+	 * (NOTE: x87 fp exceptions come through an interrupt gate)
+ */
+ sti();
+
+ if (!fpu_exists)
+ return (FPE_FLTINV);
+
+ /*
+ * Do an unconditional save of the FP state. If it's dirty (TS=0),
+ * it'll be saved into the fpu context area passed in (that of the
+	 * current thread). If it's not dirty (it may not be, due to an
+	 * intervening save caused by a context switch between the sti()
+	 * above and here), then it's safe to just use the stored values in
+ * the context save area to determine the cause of the fault.
+ */
+ fp_save(fp);
+
+ /* clear exception flags in saved state, as if by fnclex */
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
+ fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
+ fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
+ break;
+
+ case FP_XSAVE:
+ fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
+ fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
+ fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
+ /*
+ * Always set LEGACY_FP as it may have been cleared by XSAVE
+ * instruction
+ */
+ fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+
+ fp->fpu_regs.kfpu_status = fpsw;
+
+ if ((fpsw & FPS_ES) == 0)
+ return (0); /* No exception */
+
+ /*
+ * "and" the exception flags with the complement of the mask
+ * bits to determine which exception occurred
+ */
+ return (fpe_sicode(fpsw & ~fpcw & 0x3f));
+}
+
+/*
+ * Handle an SSE/SSE2 precise exception.
+ * Returns a non-zero sicode for error.
+ */
+/*ARGSUSED*/
+int
+fpsimderrflt(struct regs *rp)
+{
+ uint32_t mxcsr, xmask;
+ fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
+
+ ASSERT(fp_kind & __FP_SSE);
+
+ /*
+ * NOTE: Interrupts are disabled during execution of this
+ * function. They are enabled by the caller in trap.c.
+ */
+
+ /*
+	 * If there is no FP unit, the only way we could have gotten here
+	 * is via a user executing an INT $19 instruction, so there is
+ * no fault in that case.
+ */
+ if (!fpu_exists)
+ return (0);
+
+ /*
+ * Do an unconditional save of the FP state. If it's dirty (TS=0),
+ * it'll be saved into the fpu context area passed in (that of the
+ * current thread). If it's not dirty, then it's safe to just use
+ * the stored values in the context save area to determine the
+ * cause of the fault.
+ */
+ fp_save(fp); /* save the FPU state */
+
+ if (fp_save_mech == FP_XSAVE) {
+ mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
+ fp->fpu_regs.kfpu_status =
+ fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
+ } else {
+ mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
+ fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
+ }
+ fp->fpu_regs.kfpu_xstatus = mxcsr;
+
+ /*
+ * compute the mask that determines which conditions can cause
+ * a #xm exception, and use this to clean the status bits so that
+ * we can identify the true cause of this one.
+ */
+ xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
+ return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
+}
+
+/*
+ * In the unlikely event that someone is relying on this subcode being
+ * FPE_FLTILL for denormalize exceptions, it can always be patched back
+ * again to restore old behaviour.
+ */
+int fpe_fltden = FPE_FLTDEN;
+
+/*
+ * Map from the FPU status word to the FP exception si_code.
+ */
+static int
+fpe_sicode(uint_t sw)
+{
+ if (sw & FPS_IE)
+ return (FPE_FLTINV);
+ if (sw & FPS_ZE)
+ return (FPE_FLTDIV);
+ if (sw & FPS_DE)
+ return (fpe_fltden);
+ if (sw & FPS_OE)
+ return (FPE_FLTOVF);
+ if (sw & FPS_UE)
+ return (FPE_FLTUND);
+ if (sw & FPS_PE)
+ return (FPE_FLTRES);
+ return (FPE_FLTINV); /* default si_code for other exceptions */
+}
+
+/*
+ * Map from the SSE status word to the FP exception si_code.
+ */
+static int
+fpe_simd_sicode(uint_t sw)
+{
+ if (sw & SSE_IE)
+ return (FPE_FLTINV);
+ if (sw & SSE_ZE)
+ return (FPE_FLTDIV);
+ if (sw & SSE_DE)
+ return (FPE_FLTDEN);
+ if (sw & SSE_OE)
+ return (FPE_FLTOVF);
+ if (sw & SSE_UE)
+ return (FPE_FLTUND);
+ if (sw & SSE_PE)
+ return (FPE_FLTRES);
+ return (FPE_FLTINV); /* default si_code for other exceptions */
+}
+
+/*
+ * This routine is invoked as part of libc's __fpstart implementation
+ * via sysi86(2).
+ *
+ * It may be called -before- any context has been assigned, in which case
+ * we try to avoid touching the hardware. Or it may be invoked well
+ * after the context has been assigned and fiddled with, in which case
+ * just tweak it directly.
+ */
+void
+fpsetcw(uint16_t fcw, uint32_t mxcsr)
+{
+ struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
+ struct fxsave_state *fx;
+
+ if (!fpu_exists || fp_kind == FP_NO)
+ return;
+
+ if ((fp->fpu_flags & FPU_EN) == 0) {
+ if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
+ /*
+ * Common case. Floating point unit not yet
+ * enabled, and kernel already intends to initialize
+ * the hardware the way the caller wants.
+ */
+ return;
+ }
+ /*
+ * Hmm. Userland wants a different default.
+ * Do a fake "first trap" to establish the context, then
+ * handle as if we already had a context before we came in.
+ */
+ kpreempt_disable();
+ fp_seed();
+ kpreempt_enable();
+ }
+
+ /*
+ * Ensure that the current hardware state is flushed back to the
+ * pcb, then modify that copy. Next use of the fp will
+ * restore the context.
+ */
+ fp_save(fp);
+
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ fx = fp->fpu_regs.kfpu_u.kfpu_fx;
+ fx->fx_fcw = fcw;
+ fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
+ break;
+
+ case FP_XSAVE:
+ fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
+ fx->fx_fcw = fcw;
+ fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
+ /*
+ * Always set LEGACY_FP as it may have been cleared by XSAVE
+ * instruction
+ */
+ fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+}
+
+static void
+kernel_fpu_fpstate_init(kfpu_state_t *kfpu)
+{
+ struct xsave_state *xs;
+
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx,
+ sizeof (struct fxsave_state));
+ kfpu->kfpu_ctx.fpu_xsave_mask = 0;
+ break;
+ case FP_XSAVE:
+ xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs;
+ bzero(xs, cpuid_get_xsave_size());
+ bcopy(&avx_initial, xs, sizeof (*xs));
+ xs->xs_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE;
+ kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL;
+ break;
+ default:
+ panic("invalid fp_save_mech");
+ }
+
+ /*
+ * Set the corresponding flags that the system expects on the FPU state
+ * to indicate that this is our state. The FPU_EN flag is required to
+ * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly
+ * not set below as it represents that this state is being suppressed
+ * by the kernel.
+ */
+ kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID;
+ kfpu->kfpu_flags |= KFPU_F_INITIALIZED;
+}
+
+kfpu_state_t *
+kernel_fpu_alloc(int kmflags)
+{
+ kfpu_state_t *kfpu;
+
+ if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) {
+ return (NULL);
+ }
+
+ kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic =
+ kmem_cache_alloc(fpsave_cachep, kmflags);
+ if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) {
+ kmem_free(kfpu, sizeof (kfpu_state_t));
+ return (NULL);
+ }
+
+ kernel_fpu_fpstate_init(kfpu);
+
+ return (kfpu);
+}
+
+void
+kernel_fpu_free(kfpu_state_t *kfpu)
+{
+ kmem_cache_free(fpsave_cachep,
+ kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic);
+ kmem_free(kfpu, sizeof (kfpu_state_t));
+}
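A minimal allocation-lifecycle sketch for these two routines; the surrounding function is hypothetical, and kernel_fpu_alloc() can return NULL when called with KM_NOSLEEP.

static int
example_kfpu_setup(kfpu_state_t **kfpup)
{
	/* Allocate once up front; the state is (re)initialized lazily. */
	kfpu_state_t *kfpu = kernel_fpu_alloc(KM_NOSLEEP);

	if (kfpu == NULL)
		return (ENOMEM);
	*kfpup = kfpu;
	return (0);
	/* ... later, when finished with it: kernel_fpu_free(kfpu); */
}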
+
+static void
+kernel_fpu_ctx_save(void *arg)
+{
+ kfpu_state_t *kfpu = arg;
+ fpu_ctx_t *pf;
+
+ if (kfpu == NULL) {
+ /*
+ * A NULL kfpu implies this is a kernel thread with an LWP and
+ * no user-level FPU usage. Use the lwp fpu save area.
+ */
+ pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
+
+ ASSERT(curthread->t_procp->p_flag & SSYS);
+ ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
+
+ fp_save(pf);
+ } else {
+ pf = &kfpu->kfpu_ctx;
+
+ ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
+ ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0);
+
+ /*
+ * Note, we can't use fp_save because it assumes that we're
+ * saving to the thread's PCB and not somewhere else. Because
+ * this is a different FPU context, we instead have to do this
+ * ourselves.
+ */
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx);
+ break;
+ case FP_XSAVE:
+ xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask);
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ }
+
+ /*
+ * Because we have saved context here, our save state is no
+ * longer valid and therefore needs to be reinitialized.
+ */
+ kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED;
+ }
+
+ pf->fpu_flags |= FPU_VALID;
+
+ /*
+ * Clear KFPU flag. This allows swtch to check for improper kernel
+ * usage of the FPU (i.e. switching to a new thread while the old
+ * thread was in the kernel and using the FPU, but did not perform a
+ * context save).
+ */
+ curthread->t_flag &= ~T_KFPU;
+}
+
+static void
+kernel_fpu_ctx_restore(void *arg)
+{
+ kfpu_state_t *kfpu = arg;
+ fpu_ctx_t *pf;
+
+ if (kfpu == NULL) {
+ /*
+ * A NULL kfpu implies this is a kernel thread with an LWP and
+ * no user-level FPU usage. Use the lwp fpu save area.
+ */
+ pf = &curthread->t_lwp->lwp_pcb.pcb_fpu;
+
+ ASSERT(curthread->t_procp->p_flag & SSYS);
+ ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
+ } else {
+ pf = &kfpu->kfpu_ctx;
+
+ ASSERT3P(kfpu->kfpu_curthread, ==, curthread);
+ ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0);
+ }
+
+ fp_restore(pf);
+ curthread->t_flag |= T_KFPU;
+}
+
+/*
+ * Validate that the thread is not switching off-cpu while actively using the
+ * FPU within the kernel.
+ */
+void
+kernel_fpu_no_swtch(void)
+{
+ if ((curthread->t_flag & T_KFPU) != 0) {
+ panic("curthread swtch-ing while the kernel is using the FPU");
+ }
+}
+
+void
+kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags)
+{
+ klwp_t *pl = curthread->t_lwp;
+ struct ctxop *ctx;
+
+ if ((curthread->t_flag & T_KFPU) != 0) {
+ panic("curthread attempting to nest kernel FPU states");
+ }
+
+ /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */
+ ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) !=
+ (KFPU_USE_LWP | KFPU_NO_STATE));
+
+ if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) {
+ /*
+ * Since we don't have a kfpu_state or usable lwp pcb_fpu to
+ * hold our kernel FPU context, we depend on the caller doing
+ * kpreempt_disable for the duration of our FPU usage. This
+ * should only be done for very short periods of time.
+ */
+ ASSERT(curthread->t_preempt > 0);
+ ASSERT(kfpu == NULL);
+
+ if (pl != NULL) {
+ /*
+ * We might have already saved once so FPU_VALID could
+ * be set. This is handled in fp_save.
+ */
+ fp_save(&pl->lwp_pcb.pcb_fpu);
+ pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
+ }
+
+ curthread->t_flag |= T_KFPU;
+
+ /* Always restore the fpu to the initial state. */
+ fpinit();
+
+ return;
+ }
+
+ /*
+ * We either have a kfpu, or are using the LWP pcb_fpu for context ops.
+ */
+
+ if ((flags & KFPU_USE_LWP) == 0) {
+ if (kfpu->kfpu_curthread != NULL)
+ panic("attempting to reuse kernel FPU state at %p when "
+ "another thread already is using", kfpu);
+
+ if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
+ kernel_fpu_fpstate_init(kfpu);
+
+ kfpu->kfpu_curthread = curthread;
+ }
+
+ /*
+ * Not all threads may have an active LWP. If they do and we're not
+ * going to re-use the LWP, then we should go ahead and save the state.
+ * We must also note that the fpu is now being used by the kernel and
+ * therefore we do not want to manage the fpu state via the user-level
+ * thread's context handlers.
+ *
+ * We might have already saved once (due to a prior use of the kernel
+ * FPU or another code path) so FPU_VALID could be set. This is handled
+ * by fp_save, as is the FPU_EN check.
+ */
+ ctx = installctx_preallocate();
+ kpreempt_disable();
+ if (pl != NULL) {
+ if ((flags & KFPU_USE_LWP) == 0)
+ fp_save(&pl->lwp_pcb.pcb_fpu);
+ pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL;
+ }
+
+ /*
+ * Set the context operations for kernel FPU usage. Note that this is
+ * done with a preallocated buffer and under kpreempt_disable because
+ * without a preallocated buffer, installctx does a sleeping
+ * allocation. We haven't finished initializing our kernel FPU state
+ * yet, and in the rare case that we happen to save/restore just as
+ * installctx() exits its own kpreempt_enable() internal call, we
+ * guard against restoring an uninitialized buffer (0xbaddcafe).
+ */
+ installctx(curthread, kfpu, kernel_fpu_ctx_save, kernel_fpu_ctx_restore,
+ NULL, NULL, NULL, NULL, ctx);
+
+ curthread->t_flag |= T_KFPU;
+
+ if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) {
+ /*
+ * For pure kernel threads with an LWP, we can use the LWP's
+ * pcb_fpu to save/restore context.
+ */
+ fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu;
+
+ VERIFY(curthread->t_procp->p_flag & SSYS);
+ VERIFY(kfpu == NULL);
+ ASSERT((pf->fpu_flags & FPU_EN) == 0);
+
+ /* Always restore the fpu to the initial state. */
+ if (fp_save_mech == FP_XSAVE)
+ pf->fpu_xsave_mask = XFEATURE_FP_ALL;
+ fpinit();
+ pf->fpu_flags = FPU_EN | FPU_KERNEL;
+ } else {
+ /* initialize the kfpu state */
+ kernel_fpu_ctx_restore(kfpu);
+ }
+ kpreempt_enable();
+}
+
+void
+kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags)
+{
+ ulong_t iflags;
+
+ if ((curthread->t_flag & T_KFPU) == 0) {
+ panic("curthread attempting to clear kernel FPU state "
+ "without using it");
+ }
+
+ /*
+ * General comments on why the rest of this function is structured the
+ * way it is. Be aware that there is a lot of subtlety here.
+ *
+ * If a user-level thread ever uses the fpu while in the kernel, then
+ * we cannot call fpdisable since that does STTS. That will set the
+ * ts bit in %cr0 which will cause an exception if anything touches the
+ * fpu. However, the user-level context switch handler (fpsave_ctxt)
+ * needs to access the fpu to save the registers into the pcb.
+ * fpsave_ctxt relies on CLTS having been done (by fprestore_ctxt) to
+ * clear the ts bit when the thread was context switched onto the CPU.
+ *
+ * Calling fpdisable only affects the current CPU's %cr0 register.
+ *
+ * During removectx and kpreempt_enable, we can voluntarily context
+ * switch, so the CPU we were on when we entered this function might
+ * not be the same one we're on when we return from removectx or end
+ * the function. Note there can be user-level context switch handlers
+ * still installed if this is a user-level thread.
+ *
+ * We also must be careful in the unlikely chance we're running in an
+ * interrupt thread, since we can't leave the CPU's %cr0 TS state set
+ * incorrectly for the "real" thread to resume on this CPU.
+ */
+
+ if ((flags & KFPU_NO_STATE) == 0) {
+ kpreempt_disable();
+ } else {
+ ASSERT(curthread->t_preempt > 0);
+ }
+
+ curthread->t_flag &= ~T_KFPU;
+
+ /*
+ * When we are ending things, we explicitly don't save the current
+ * kernel FPU state back to the temporary state. The kfpu API is not
+ * intended to be a permanent save location.
+ *
+ * If this is a user-level thread and we were to context switch
+ * before returning to user-land, fpsave_ctxt will be a no-op since we
+ * already saved the user-level FPU state the first time we ran
+ * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over
+ * the user-level fpu state). The fpsave_ctxt functions only save if
+ * FPU_VALID is not already set. fp_save also sets PCB_SET_UPDATE_FPU so
+ * fprestore_ctxt will be done in sys_rtt_common when the thread
+ * finally returns to user-land.
+ */
+
+ if ((curthread->t_procp->p_flag & SSYS) != 0 &&
+ curthread->t_intr == NULL) {
+ /*
+ * A kernel thread which is not an interrupt thread, so we
+ * STTS now.
+ */
+ fpdisable();
+ }
+
+ if ((flags & KFPU_NO_STATE) == 0) {
+ removectx(curthread, kfpu, kernel_fpu_ctx_save,
+ kernel_fpu_ctx_restore, NULL, NULL, NULL, NULL);
+
+ if (kfpu != NULL) {
+ if (kfpu->kfpu_curthread != curthread) {
+ panic("attempting to end kernel FPU state "
+ "for %p, but active thread is not "
+ "curthread", kfpu);
+ } else {
+ kfpu->kfpu_curthread = NULL;
+ }
+ }
+
+ kpreempt_enable();
+ }
+
+ if (curthread->t_lwp != NULL) {
+ uint_t f;
+
+ if (flags & KFPU_USE_LWP) {
+ f = FPU_EN | FPU_KERNEL;
+ } else {
+ f = FPU_KERNEL;
+ }
+ curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f;
+ }
+}
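Tying the pieces together, a hedged sketch of the two usage patterns described by the comments in kernel_fpu_begin(): a preallocated kfpu_state_t for work that may block or be preempted between uses, and KFPU_NO_STATE under kpreempt_disable() for very short, non-blocking sequences. The wrapper function names are made up for illustration.

/* Pattern 1: preallocated state; safe across preemption between uses. */
static void
example_simd_work(kfpu_state_t *kfpu)
{
	kernel_fpu_begin(kfpu, 0);
	/* ... SSE/AVX work ... */
	kernel_fpu_end(kfpu, 0);
}

/* Pattern 2: no backing state; the caller must prevent preemption. */
static void
example_quick_simd_work(void)
{
	kpreempt_disable();
	kernel_fpu_begin(NULL, KFPU_NO_STATE);
	/* ... a short burst of SSE/AVX work, no blocking ... */
	kernel_fpu_end(NULL, KFPU_NO_STATE);
	kpreempt_enable();
}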
diff --git a/usr/src/uts/intel/os/sendsig.c b/usr/src/uts/intel/os/sendsig.c
new file mode 100644
index 0000000000..e3d60eb62b
--- /dev/null
+++ b/usr/src/uts/intel/os/sendsig.c
@@ -0,0 +1,589 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
+/* All Rights Reserved */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/signal.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/mman.h>
+#include <sys/class.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/buf.h>
+#include <sys/kmem.h>
+#include <sys/cred.h>
+#include <sys/archsystm.h>
+#include <sys/vmparam.h>
+#include <sys/prsystm.h>
+#include <sys/reboot.h>
+#include <sys/uadmin.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/session.h>
+#include <sys/ucontext.h>
+#include <sys/dnlc.h>
+#include <sys/var.h>
+#include <sys/cmn_err.h>
+#include <sys/debugreg.h>
+#include <sys/thread.h>
+#include <sys/vtrace.h>
+#include <sys/consdev.h>
+#include <sys/psw.h>
+#include <sys/regset.h>
+
+#include <sys/privregs.h>
+
+#include <sys/stack.h>
+#include <sys/swap.h>
+#include <vm/hat.h>
+#include <vm/anon.h>
+#include <vm/as.h>
+#include <vm/page.h>
+#include <vm/seg.h>
+#include <vm/seg_kmem.h>
+#include <vm/seg_map.h>
+#include <vm/seg_vn.h>
+#include <sys/exec.h>
+#include <sys/acct.h>
+#include <sys/core.h>
+#include <sys/corectl.h>
+#include <sys/modctl.h>
+#include <sys/tuneable.h>
+#include <c2/audit.h>
+#include <sys/bootconf.h>
+#include <sys/dumphdr.h>
+#include <sys/promif.h>
+#include <sys/systeminfo.h>
+#include <sys/kdi.h>
+#include <sys/contract_impl.h>
+#include <sys/x86_archext.h>
+
+/*
+ * Construct the execution environment for the user's signal
+ * handler and arrange for control to be given to it on return
+ * to userland. The library code now calls setcontext() to
+ * clean up after the signal handler, so sigret() is no longer
+ * needed.
+ *
+ * (The various 'volatile' declarations are needed to ensure that values
+ * are correct on the error return from on_fault().)
+ */
+
+
+/*
+ * An amd64 signal frame looks like this on the stack:
+ *
+ * old %rsp:
+ * <128 bytes of untouched stack space>
+ * <a siginfo_t [optional]>
+ * <a ucontext_t>
+ * <siginfo_t *>
+ * <signal number>
+ * new %rsp: <return address (deliberately invalid)>
+ *
+ * The signal number and siginfo_t pointer are only pushed onto the stack in
+ * order to allow stack backtraces. The actual signal handling code expects the
+ * arguments in registers.
+ */
+
+struct sigframe {
+ caddr_t retaddr;
+ long signo;
+ siginfo_t *sip;
+};
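As a quick restatement of the alignment argument used later in sendsig(): the frame is three 8-byte fields, so sizeof (struct sigframe) is 24, and 24 % 16 == 8, which is exactly what leaves the ucontext_t placed directly after the frame on a 16-byte boundary. The same check expressed at compile time (illustrative only, assuming the usual CTASSERT() macro; the function itself uses a runtime ASSERT):

CTASSERT(sizeof (struct sigframe) == 3 * sizeof (uint64_t));
CTASSERT((sizeof (struct sigframe) % 16) == 8);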
+
+int
+sendsig(int sig, k_siginfo_t *sip, void (*hdlr)())
+{
+ volatile int minstacksz;
+ int newstack;
+ label_t ljb;
+ volatile caddr_t sp;
+ caddr_t fp;
+ volatile struct regs *rp;
+ volatile greg_t upc;
+ volatile proc_t *p = ttoproc(curthread);
+ struct as *as = p->p_as;
+ klwp_t *lwp = ttolwp(curthread);
+ ucontext_t *volatile tuc = NULL;
+ ucontext_t *uc;
+ siginfo_t *sip_addr;
+ volatile int watched;
+
+ /*
+ * This routine is utterly dependent upon STACK_ALIGN being
+ * 16 and STACK_ENTRY_ALIGN being 8. Let's just acknowledge
+ * that and require it.
+ */
+
+#if STACK_ALIGN != 16 || STACK_ENTRY_ALIGN != 8
+#error "sendsig() amd64 did not find the expected stack alignments"
+#endif
+
+ rp = lwptoregs(lwp);
+ upc = rp->r_pc;
+
+ /*
+ * Since we're setting up to run the signal handler we have to
+ * arrange that the stack at entry to the handler is (only)
+ * STACK_ENTRY_ALIGN (i.e. 8) byte aligned so that when the handler
+ * executes its push of %rbp, the stack realigns to STACK_ALIGN
+ * (i.e. 16) correctly.
+ *
+ * The new sp will point to the sigframe and the ucontext_t. The
+ * above means that sp (and thus sigframe) will be 8-byte aligned,
+ * but not 16-byte aligned. ucontext_t, however, contains %xmm regs
+ * which must be 16-byte aligned. Because of this, for correct
+ * alignment, sigframe must be a multiple of 8-bytes in length, but
+ * not 16-bytes. This will place ucontext_t at a nice 16-byte boundary.
+ */
+
+ /* LINTED: logical expression always true: op "||" */
+ ASSERT((sizeof (struct sigframe) % 16) == 8);
+
+ minstacksz = sizeof (struct sigframe) + SA(sizeof (*uc));
+ if (sip != NULL)
+ minstacksz += SA(sizeof (siginfo_t));
+ ASSERT((minstacksz & (STACK_ENTRY_ALIGN - 1ul)) == 0);
+
+ /*
+ * Figure out whether we will be handling this signal on
+ * an alternate stack specified by the user. Then allocate
+ * and validate the stack requirements for the signal handler
+ * context. on_fault will catch any faults.
+ */
+ newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) &&
+ !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE));
+
+ if (newstack) {
+ fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) +
+ SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN);
+ } else {
+ /*
+ * Drop below the 128-byte reserved region of the stack frame
+ * we're interrupting.
+ */
+ fp = (caddr_t)rp->r_sp - STACK_RESERVE;
+ }
+
+ /*
+ * Force proper stack pointer alignment, even in the face of a
+ * misaligned stack pointer from user-level before the signal.
+ */
+ fp = (caddr_t)((uintptr_t)fp & ~(STACK_ENTRY_ALIGN - 1ul));
+
+ /*
+ * Most of the time during normal execution, the stack pointer
+ * is aligned on a STACK_ALIGN (i.e. 16 byte) boundary. However,
+ * (for example) just after a call instruction (which pushes
+ * the return address), the caller's stack misaligns until the
+ * 'push %rbp' happens in the callee prolog. So while we should
+ * expect the stack pointer to be always at least STACK_ENTRY_ALIGN
+ * aligned, we should -not- expect it to always be STACK_ALIGN aligned.
+ * We now adjust to ensure that the new sp is aligned to
+ * STACK_ENTRY_ALIGN but not to STACK_ALIGN.
+ */
+ sp = fp - minstacksz;
+ if (((uintptr_t)sp & (STACK_ALIGN - 1ul)) == 0) {
+ sp -= STACK_ENTRY_ALIGN;
+ minstacksz = fp - sp;
+ }
+
+ /*
+ * Now, make sure the resulting signal frame address is sane
+ */
+ if (sp >= as->a_userlimit || fp >= as->a_userlimit) {
+#ifdef DEBUG
+ printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n",
+ PTOU(p)->u_comm, p->p_pid, sig);
+ printf("sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n",
+ (void *)sp, (void *)hdlr, (uintptr_t)upc);
+ printf("sp above USERLIMIT\n");
+#endif
+ return (0);
+ }
+
+ watched = watch_disable_addr((caddr_t)sp, minstacksz, S_WRITE);
+
+ if (on_fault(&ljb))
+ goto badstack;
+
+ if (sip != NULL) {
+ zoneid_t zoneid;
+
+ fp -= SA(sizeof (siginfo_t));
+ uzero(fp, sizeof (siginfo_t));
+ if (SI_FROMUSER(sip) &&
+ (zoneid = p->p_zone->zone_id) != GLOBAL_ZONEID &&
+ zoneid != sip->si_zoneid) {
+ k_siginfo_t sani_sip = *sip;
+
+ sani_sip.si_pid = p->p_zone->zone_zsched->p_pid;
+ sani_sip.si_uid = 0;
+ sani_sip.si_ctid = -1;
+ sani_sip.si_zoneid = zoneid;
+ copyout_noerr(&sani_sip, fp, sizeof (sani_sip));
+ } else
+ copyout_noerr(sip, fp, sizeof (*sip));
+ sip_addr = (siginfo_t *)fp;
+
+ if (sig == SIGPROF &&
+ curthread->t_rprof != NULL &&
+ curthread->t_rprof->rp_anystate) {
+ /*
+ * We stand on our head to deal with
+ * the real time profiling signal.
+ * Fill in the stuff that doesn't fit
+ * in a normal k_siginfo structure.
+ */
+ int i = sip->si_nsysarg;
+
+ while (--i >= 0)
+ sulword_noerr(
+ (ulong_t *)&(sip_addr->si_sysarg[i]),
+ (ulong_t)lwp->lwp_arg[i]);
+ copyout_noerr(curthread->t_rprof->rp_state,
+ sip_addr->si_mstate,
+ sizeof (curthread->t_rprof->rp_state));
+ }
+ } else
+ sip_addr = NULL;
+
+ /*
+ * save the current context on the user stack directly after the
+ * sigframe. Since sigframe is 8-byte-but-not-16-byte aligned,
+ * and since sizeof (struct sigframe) is 24, this guarantees
+ * 16-byte alignment for ucontext_t and its %xmm registers.
+ */
+ uc = (ucontext_t *)(sp + sizeof (struct sigframe));
+ tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP);
+ no_fault();
+ savecontext(tuc, &lwp->lwp_sigoldmask);
+ if (on_fault(&ljb))
+ goto badstack;
+ copyout_noerr(tuc, uc, sizeof (*tuc));
+ kmem_free(tuc, sizeof (*tuc));
+ tuc = NULL;
+
+ lwp->lwp_oldcontext = (uintptr_t)uc;
+
+ if (newstack) {
+ lwp->lwp_sigaltstack.ss_flags |= SS_ONSTACK;
+ if (lwp->lwp_ustack)
+ copyout_noerr(&lwp->lwp_sigaltstack,
+ (stack_t *)lwp->lwp_ustack, sizeof (stack_t));
+ }
+
+ /*
+ * Set up signal handler return and stack linkage
+ */
+ {
+ struct sigframe frame;
+
+ /*
+ * ensure we never return "normally"
+ */
+ frame.retaddr = (caddr_t)(uintptr_t)-1L;
+ frame.signo = sig;
+ frame.sip = sip_addr;
+ copyout_noerr(&frame, sp, sizeof (frame));
+ }
+
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE);
+
+ /*
+ * Set up user registers for execution of signal handler.
+ */
+ rp->r_sp = (greg_t)sp;
+ rp->r_pc = (greg_t)hdlr;
+ rp->r_ps = PSL_USER | (rp->r_ps & PS_IOPL);
+
+ rp->r_rdi = sig;
+ rp->r_rsi = (uintptr_t)sip_addr;
+ rp->r_rdx = (uintptr_t)uc;
+
+ if ((rp->r_cs & 0xffff) != UCS_SEL ||
+ (rp->r_ss & 0xffff) != UDS_SEL) {
+ /*
+ * Try our best to deliver the signal.
+ */
+ rp->r_cs = UCS_SEL;
+ rp->r_ss = UDS_SEL;
+ }
+
+ /*
+ * Don't set lwp_eosys here. sendsig() is called via psig() after
+ * lwp_eosys is handled, so setting it here would affect the next
+ * system call.
+ */
+ return (1);
+
+badstack:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE);
+ if (tuc)
+ kmem_free(tuc, sizeof (*tuc));
+#ifdef DEBUG
+ printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n",
+ PTOU(p)->u_comm, p->p_pid, sig);
+ printf("on fault, sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n",
+ (void *)sp, (void *)hdlr, (uintptr_t)upc);
+#endif
+ return (0);
+}
+
+#ifdef _SYSCALL32_IMPL
+
+/*
+ * An i386 SVR4/ABI signal frame looks like this on the stack:
+ *
+ * old %esp:
+ * <a siginfo32_t [optional]>
+ * <a ucontext32_t>
+ * <pointer to that ucontext32_t>
+ * <pointer to that siginfo32_t>
+ * <signo>
+ * new %esp: <return address (deliberately invalid)>
+ */
+struct sigframe32 {
+ caddr32_t retaddr;
+ uint32_t signo;
+ caddr32_t sip;
+ caddr32_t ucp;
+};
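The practical difference between the two frames: on amd64 the handler receives its arguments in %rdi/%rsi/%rdx (set by sendsig() above) and the on-stack signo/sip exist only so debuggers can reconstruct backtraces, while on i386 the handler genuinely reads signo, sip and ucp from the words laid out by struct sigframe32. Either way, the userland view is the usual three-argument handler; a sketch of that view (not part of this change) is:

/* Userland view (illustrative): the same prototype in both models. */
static void
example_handler(int sig, siginfo_t *sip, void *ucp)
{
	ucontext_t *uc = ucp;

	/* ... inspect uc->uc_mcontext ...; on return, libc calls setcontext() */
	(void) sig;
	(void) sip;
	(void) uc;
}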
+
+int
+sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)())
+{
+ volatile int minstacksz;
+ int newstack;
+ label_t ljb;
+ volatile caddr_t sp;
+ caddr_t fp;
+ volatile struct regs *rp;
+ volatile greg_t upc;
+ volatile proc_t *p = ttoproc(curthread);
+ klwp_t *lwp = ttolwp(curthread);
+ ucontext32_t *volatile tuc = NULL;
+ ucontext32_t *uc;
+ siginfo32_t *sip_addr;
+ volatile int watched;
+
+ rp = lwptoregs(lwp);
+ upc = rp->r_pc;
+
+ minstacksz = SA32(sizeof (struct sigframe32)) + SA32(sizeof (*uc));
+ if (sip != NULL)
+ minstacksz += SA32(sizeof (siginfo32_t));
+ ASSERT((minstacksz & (STACK_ALIGN32 - 1)) == 0);
+
+ /*
+ * Figure out whether we will be handling this signal on
+ * an alternate stack specified by the user. Then allocate
+ * and validate the stack requirements for the signal handler
+ * context. on_fault will catch any faults.
+ */
+ newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) &&
+ !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE));
+
+ if (newstack) {
+ fp = (caddr_t)(SA32((uintptr_t)lwp->lwp_sigaltstack.ss_sp) +
+ SA32(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN32);
+ } else if ((rp->r_ss & 0xffff) != UDS_SEL) {
+ user_desc_t *ldt;
+ /*
+ * If the stack segment selector is -not- pointing at
+ * the UDS_SEL descriptor and we have an LDT entry for
+ * it instead, add the base address to find the effective va.
+ */
+ if ((ldt = p->p_ldt) != NULL)
+ fp = (caddr_t)rp->r_sp +
+ USEGD_GETBASE(&ldt[SELTOIDX(rp->r_ss)]);
+ else
+ fp = (caddr_t)rp->r_sp;
+ } else
+ fp = (caddr_t)rp->r_sp;
+
+ /*
+ * Force proper stack pointer alignment, even in the face of a
+ * misaligned stack pointer from user-level before the signal.
+ * Don't use the SA32() macro because that rounds up, not down.
+ */
+ fp = (caddr_t)((uintptr_t)fp & ~(STACK_ALIGN32 - 1));
+ sp = fp - minstacksz;
+
+ /*
+ * Make sure lwp hasn't trashed its stack
+ */
+ if (sp >= (caddr_t)(uintptr_t)USERLIMIT32 ||
+ fp >= (caddr_t)(uintptr_t)USERLIMIT32) {
+#ifdef DEBUG
+ printf("sendsig32: bad signal stack cmd=%s, pid=%d, sig=%d\n",
+ PTOU(p)->u_comm, p->p_pid, sig);
+ printf("sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n",
+ (void *)sp, (void *)hdlr, (uintptr_t)upc);
+ printf("sp above USERLIMIT\n");
+#endif
+ return (0);
+ }
+
+ watched = watch_disable_addr((caddr_t)sp, minstacksz, S_WRITE);
+
+ if (on_fault(&ljb))
+ goto badstack;
+
+ if (sip != NULL) {
+ siginfo32_t si32;
+ zoneid_t zoneid;
+
+ siginfo_kto32(sip, &si32);
+ if (SI_FROMUSER(sip) &&
+ (zoneid = p->p_zone->zone_id) != GLOBAL_ZONEID &&
+ zoneid != sip->si_zoneid) {
+ si32.si_pid = p->p_zone->zone_zsched->p_pid;
+ si32.si_uid = 0;
+ si32.si_ctid = -1;
+ si32.si_zoneid = zoneid;
+ }
+ fp -= SA32(sizeof (si32));
+ uzero(fp, sizeof (si32));
+ copyout_noerr(&si32, fp, sizeof (si32));
+ sip_addr = (siginfo32_t *)fp;
+
+ if (sig == SIGPROF &&
+ curthread->t_rprof != NULL &&
+ curthread->t_rprof->rp_anystate) {
+ /*
+ * We stand on our head to deal with
+ * the real-time profiling signal.
+ * Fill in the stuff that doesn't fit
+ * in a normal k_siginfo structure.
+ */
+ int i = sip->si_nsysarg;
+
+ while (--i >= 0)
+ suword32_noerr(&(sip_addr->si_sysarg[i]),
+ (uint32_t)lwp->lwp_arg[i]);
+ copyout_noerr(curthread->t_rprof->rp_state,
+ sip_addr->si_mstate,
+ sizeof (curthread->t_rprof->rp_state));
+ }
+ } else
+ sip_addr = NULL;
+
+ /* save the current context on the user stack */
+ fp -= SA32(sizeof (*tuc));
+ uc = (ucontext32_t *)fp;
+ tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP);
+ no_fault();
+ savecontext32(tuc, &lwp->lwp_sigoldmask);
+ if (on_fault(&ljb))
+ goto badstack;
+ copyout_noerr(tuc, uc, sizeof (*tuc));
+ kmem_free(tuc, sizeof (*tuc));
+ tuc = NULL;
+
+ lwp->lwp_oldcontext = (uintptr_t)uc;
+
+ if (newstack) {
+ lwp->lwp_sigaltstack.ss_flags |= SS_ONSTACK;
+ if (lwp->lwp_ustack) {
+ stack32_t stk32;
+
+ stk32.ss_sp = (caddr32_t)(uintptr_t)
+ lwp->lwp_sigaltstack.ss_sp;
+ stk32.ss_size = (size32_t)
+ lwp->lwp_sigaltstack.ss_size;
+ stk32.ss_flags = (int32_t)
+ lwp->lwp_sigaltstack.ss_flags;
+ copyout_noerr(&stk32,
+ (stack32_t *)lwp->lwp_ustack, sizeof (stk32));
+ }
+ }
+
+ /*
+ * Set up signal handler arguments
+ */
+ {
+ struct sigframe32 frame32;
+
+ frame32.sip = (caddr32_t)(uintptr_t)sip_addr;
+ frame32.ucp = (caddr32_t)(uintptr_t)uc;
+ frame32.signo = sig;
+ frame32.retaddr = 0xffffffff; /* never return! */
+ copyout_noerr(&frame32, sp, sizeof (frame32));
+ }
+
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE);
+
+ rp->r_sp = (greg_t)(uintptr_t)sp;
+ rp->r_pc = (greg_t)(uintptr_t)hdlr;
+ rp->r_ps = PSL_USER | (rp->r_ps & PS_IOPL);
+
+ if ((rp->r_cs & 0xffff) != U32CS_SEL ||
+ (rp->r_ss & 0xffff) != UDS_SEL) {
+ /*
+ * Try our best to deliver the signal.
+ */
+ rp->r_cs = U32CS_SEL;
+ rp->r_ss = UDS_SEL;
+ }
+
+ /*
+ * Don't set lwp_eosys here. sendsig() is called via psig() after
+ * lwp_eosys is handled, so setting it here would affect the next
+ * system call.
+ */
+ return (1);
+
+badstack:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE);
+ if (tuc)
+ kmem_free(tuc, sizeof (*tuc));
+#ifdef DEBUG
+ printf("sendsig32: bad signal stack cmd=%s pid=%d, sig=%d\n",
+ PTOU(p)->u_comm, p->p_pid, sig);
+ printf("on fault, sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n",
+ (void *)sp, (void *)hdlr, (uintptr_t)upc);
+#endif
+ return (0);
+}
+
+#endif /* _SYSCALL32_IMPL */
diff --git a/usr/src/uts/intel/os/sundep.c b/usr/src/uts/intel/os/sundep.c
new file mode 100644
index 0000000000..80e149f01b
--- /dev/null
+++ b/usr/src/uts/intel/os/sundep.c
@@ -0,0 +1,1012 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2021 Joyent, Inc.
+ */
+
+/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
+/* All Rights Reserved */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/signal.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/mman.h>
+#include <sys/class.h>
+#include <sys/proc.h>
+#include <sys/procfs.h>
+#include <sys/buf.h>
+#include <sys/kmem.h>
+#include <sys/cred.h>
+#include <sys/archsystm.h>
+#include <sys/vmparam.h>
+#include <sys/prsystm.h>
+#include <sys/reboot.h>
+#include <sys/uadmin.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/session.h>
+#include <sys/ucontext.h>
+#include <sys/dnlc.h>
+#include <sys/var.h>
+#include <sys/cmn_err.h>
+#include <sys/debugreg.h>
+#include <sys/thread.h>
+#include <sys/vtrace.h>
+#include <sys/consdev.h>
+#include <sys/psw.h>
+#include <sys/regset.h>
+#include <sys/privregs.h>
+#include <sys/cpu.h>
+#include <sys/stack.h>
+#include <sys/swap.h>
+#include <vm/hat.h>
+#include <vm/anon.h>
+#include <vm/as.h>
+#include <vm/page.h>
+#include <vm/seg.h>
+#include <vm/seg_kmem.h>
+#include <vm/seg_map.h>
+#include <vm/seg_vn.h>
+#include <sys/exec.h>
+#include <sys/acct.h>
+#include <sys/core.h>
+#include <sys/corectl.h>
+#include <sys/modctl.h>
+#include <sys/tuneable.h>
+#include <c2/audit.h>
+#include <sys/bootconf.h>
+#include <sys/brand.h>
+#include <sys/dumphdr.h>
+#include <sys/promif.h>
+#include <sys/systeminfo.h>
+#include <sys/kdi.h>
+#include <sys/contract_impl.h>
+#include <sys/x86_archext.h>
+#include <sys/segments.h>
+#include <sys/ontrap.h>
+#include <sys/cpu.h>
+#ifdef __xpv
+#include <sys/hypervisor.h>
+#endif
+
+/*
+ * Compare the version of boot that boot says it is against
+ * the version of boot the kernel expects.
+ */
+int
+check_boot_version(int boots_version)
+{
+ if (boots_version == BO_VERSION)
+ return (0);
+
+ prom_printf("Wrong boot interface - kernel needs v%d found v%d\n",
+ BO_VERSION, boots_version);
+ prom_panic("halting");
+ /*NOTREACHED*/
+}
+
+/*
+ * Process the physical installed list for boot.
+ * Finds:
+ * 1) the pfn of the highest installed physical page,
+ * 2) the number of pages installed
+ * 3) the number of distinct contiguous memory ranges these pages fall into.
+ */
+void
+installed_top_size_ex(
+ struct memlist *list, /* pointer to start of installed list */
+ pfn_t *high_pfn, /* return ptr for top value */
+ pgcnt_t *pgcnt, /* return ptr for sum of installed pages */
+ int *ranges) /* return ptr for the count of contig. ranges */
+{
+ pfn_t top = 0;
+ pgcnt_t sumpages = 0;
+ pfn_t highp; /* high page in a chunk */
+ int cnt = 0;
+
+ for (; list; list = list->ml_next) {
+ ++cnt;
+ highp = (list->ml_address + list->ml_size - 1) >> PAGESHIFT;
+ if (top < highp)
+ top = highp;
+ sumpages += btop(list->ml_size);
+ }
+
+ *high_pfn = top;
+ *pgcnt = sumpages;
+ *ranges = cnt;
+}
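A worked example of what the routine reports, using a hypothetical two-entry installed list (all addresses and sizes below are made up):

static void
example_top_size(void)
{
	/* Two made-up ranges: [0, +256 MiB) and [4 GiB, +256 MiB). */
	static struct memlist ml1 = {
		.ml_address = 0x100000000ULL, .ml_size = 0x10000000ULL,
	};
	static struct memlist ml0 = {
		.ml_address = 0, .ml_size = 0x10000000ULL, .ml_next = &ml1,
	};
	pfn_t high;
	pgcnt_t pages;
	int nranges;

	installed_top_size_ex(&ml0, &high, &pages, &nranges);
	/* With 4 KiB pages: high == 0x10ffff, pages == 0x20000, nranges == 2 */
}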
+
+void
+installed_top_size(
+ struct memlist *list, /* pointer to start of installed list */
+ pfn_t *high_pfn, /* return ptr for top value */
+ pgcnt_t *pgcnt) /* return ptr for sum of installed pages */
+{
+ int ranges;
+
+ installed_top_size_ex(list, high_pfn, pgcnt, &ranges);
+}
+
+void
+phys_install_has_changed(void)
+{}
+
+/*
+ * Copy in a memory list from boot to kernel, with a filter function
+ * to remove pages. The filter function can increase the address and/or
+ * decrease the size to filter out pages. It will also align addresses and
+ * sizes to PAGESIZE.
+ */
+void
+copy_memlist_filter(
+ struct memlist *src,
+ struct memlist **dstp,
+ void (*filter)(uint64_t *, uint64_t *))
+{
+ struct memlist *dst, *prev;
+ uint64_t addr;
+ uint64_t size;
+ uint64_t eaddr;
+
+ dst = *dstp;
+ prev = dst;
+
+ /*
+ * Move through the memlist applying a filter against
+ * each range of memory. Note that we may apply the
+ * filter multiple times against each memlist entry.
+ */
+ for (; src; src = src->ml_next) {
+ addr = P2ROUNDUP(src->ml_address, PAGESIZE);
+ eaddr = P2ALIGN(src->ml_address + src->ml_size, PAGESIZE);
+ while (addr < eaddr) {
+ size = eaddr - addr;
+ if (filter != NULL)
+ filter(&addr, &size);
+ if (size == 0)
+ break;
+ dst->ml_address = addr;
+ dst->ml_size = size;
+ dst->ml_next = 0;
+ if (prev == dst) {
+ dst->ml_prev = 0;
+ dst++;
+ } else {
+ dst->ml_prev = prev;
+ prev->ml_next = dst;
+ dst++;
+ prev++;
+ }
+ addr += size;
+ }
+ }
+
+ *dstp = dst;
+}
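A hedged example of a filter callback matching the contract above (it may only raise the address and/or shrink the size); this one carves a hypothetical reserved hole out of the copied list. The range macros are made-up values, and the filter may be invoked repeatedly against the same source entry, so it only trims up to the next boundary each time.

#define	EX_RSVD_BASE	0x000A0000ULL	/* made-up reserved range */
#define	EX_RSVD_END	0x00100000ULL

static void
example_rsvd_filter(uint64_t *addrp, uint64_t *sizep)
{
	uint64_t start = *addrp;
	uint64_t end = start + *sizep;

	if (end <= EX_RSVD_BASE || start >= EX_RSVD_END)
		return;				/* no overlap */

	if (start < EX_RSVD_BASE) {
		/* Keep the part below the hole; the loop re-invokes us. */
		*sizep = EX_RSVD_BASE - start;
	} else {
		/* Skip past the hole (the size may become 0). */
		*addrp = EX_RSVD_END;
		*sizep = (end > EX_RSVD_END) ? end - EX_RSVD_END : 0;
	}
}

It would be handed in as the filter argument, e.g. copy_memlist_filter(src, &dst, example_rsvd_filter).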
+
+/*
+ * Kernel setup code, called from startup().
+ */
+void
+kern_setup1(void)
+{
+ proc_t *pp;
+
+ pp = &p0;
+
+ proc_sched = pp;
+
+ /*
+ * Initialize process 0 data structures
+ */
+ pp->p_stat = SRUN;
+ pp->p_flag = SSYS;
+
+ pp->p_pidp = &pid0;
+ pp->p_pgidp = &pid0;
+ pp->p_sessp = &session0;
+ pp->p_tlist = &t0;
+ pid0.pid_pglink = pp;
+ pid0.pid_pgtail = pp;
+
+ /*
+ * XXX - we assume that the u-area is zeroed out except for
+ * ttolwp(curthread)->lwp_regs.
+ */
+ PTOU(curproc)->u_cmask = (mode_t)CMASK;
+
+ thread_init(); /* init thread_free list */
+ pid_init(); /* initialize pid (proc) table */
+ contract_init(); /* initialize contracts */
+
+ init_pages_pp_maximum();
+}
+
+/*
+ * Load a procedure into a thread.
+ */
+void
+thread_load(kthread_t *t, void (*start)(), caddr_t arg, size_t len)
+{
+ caddr_t sp;
+ size_t framesz;
+ caddr_t argp;
+ long *p;
+ extern void thread_start();
+
+ /*
+ * Push a "c" call frame onto the stack to represent
+ * the caller of "start".
+ */
+ sp = t->t_stk;
+ ASSERT(((uintptr_t)t->t_stk & (STACK_ENTRY_ALIGN - 1)) == 0);
+ if (len != 0) {
+ /*
+ * the object that arg points at is copied into the
+ * caller's frame.
+ */
+ framesz = SA(len);
+ sp -= framesz;
+ ASSERT(sp > t->t_stkbase);
+ argp = sp + SA(MINFRAME);
+ bcopy(arg, argp, len);
+ arg = argp;
+ }
+ /*
+ * Set up arguments (arg and len) on the caller's stack frame.
+ */
+ p = (long *)sp;
+
+ *--p = 0; /* fake call */
+ *--p = 0; /* null frame pointer terminates stack trace */
+ *--p = (long)len;
+ *--p = (intptr_t)arg;
+ *--p = (intptr_t)start;
+
+ /*
+ * initialize thread to resume at thread_start() which will
+ * turn around and invoke (*start)(arg, len).
+ */
+ t->t_pc = (uintptr_t)thread_start;
+ t->t_sp = (uintptr_t)p;
+
+ ASSERT((t->t_sp & (STACK_ENTRY_ALIGN - 1)) == 0);
+}
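For reference, this is the stack image thread_load() leaves behind, restated as a diagram derived from the pushes above; thread_start() pops start, arg and len and then calls (*start)(arg, len).

/*
 * Higher addresses at the top; each lower slot is one long.
 *
 *	t->t_stk
 *	  ... SA(len)-byte frame; *arg copied to sp + SA(MINFRAME)
 *	      when len != 0 ...
 *	sp:
 *		0	fake return address
 *		0	NULL frame pointer (terminates stack traces)
 *		len
 *		arg	-> the copy above (or the original pointer if len == 0)
 *	t->t_sp:
 *		start
 */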
+
+/*
+ * load user registers into lwp.
+ */
+/*ARGSUSED2*/
+void
+lwp_load(klwp_t *lwp, gregset_t grp, uintptr_t thrptr)
+{
+ struct regs *rp = lwptoregs(lwp);
+
+ setgregs(lwp, grp);
+ rp->r_ps = PSL_USER;
+
+ /*
+ * For 64-bit lwps, we allow one magic %fs selector value, and one
+ * magic %gs selector to point anywhere in the address space using
+ * %fsbase and %gsbase behind the scenes. libc uses %fs to point
+ * at the ulwp_t structure.
+ *
+ * For 32-bit lwps, libc wedges its lwp thread pointer into the
+ * ucontext ESP slot (which is otherwise irrelevant to setting a
+ * ucontext) and LWPGS_SEL value into gregs[REG_GS]. This is so
+ * syslwp_create() can atomically setup %gs.
+ *
+ * See setup_context() in libc.
+ */
+#ifdef _SYSCALL32_IMPL
+ if (lwp_getdatamodel(lwp) == DATAMODEL_ILP32) {
+ if (grp[REG_GS] == LWPGS_SEL)
+ (void) lwp_setprivate(lwp, _LWP_GSBASE, thrptr);
+ } else {
+ /*
+ * See lwp_setprivate in kernel and setup_context in libc.
+ *
+ * Currently libc constructs a ucontext from whole cloth for
+ * every new (not main) lwp created. For 64 bit processes
+ * %fsbase is directly set to point to current thread pointer.
+ * In the past (solaris 10) %fs was also set LWPFS_SEL to
+ * indicate %fsbase. Now we use the null GDT selector for
+ * this purpose. LWP[FS|GS]_SEL are only intended for 32 bit
+ * processes. To ease transition we support older libcs in
+ * the newer kernel by forcing %fs or %gs selector to null
+ * by calling lwp_setprivate if LWP[FS|GS]_SEL is passed in
+ * the ucontext. This should be ripped out at some future
+ * date. Another fix would be for libc to do a getcontext
+ * and inherit the null %fs/%gs from the current context but
+ * that means an extra system call and could hurt performance.
+ */
+ if (grp[REG_FS] == 0x1bb) /* hard code legacy LWPFS_SEL */
+ (void) lwp_setprivate(lwp, _LWP_FSBASE,
+ (uintptr_t)grp[REG_FSBASE]);
+
+ if (grp[REG_GS] == 0x1c3) /* hard code legacy LWPGS_SEL */
+ (void) lwp_setprivate(lwp, _LWP_GSBASE,
+ (uintptr_t)grp[REG_GSBASE]);
+ }
+#else
+ if (grp[GS] == LWPGS_SEL)
+ (void) lwp_setprivate(lwp, _LWP_GSBASE, thrptr);
+#endif
+
+ lwp->lwp_eosys = JUSTRETURN;
+ lwptot(lwp)->t_post_sys = 1;
+}
+
+/*
+ * set syscall()'s return values for a lwp.
+ */
+void
+lwp_setrval(klwp_t *lwp, int v1, int v2)
+{
+ lwptoregs(lwp)->r_ps &= ~PS_C;
+ lwptoregs(lwp)->r_r0 = v1;
+ lwptoregs(lwp)->r_r1 = v2;
+}
+
+/*
+ * set syscall()'s return values for a lwp.
+ */
+void
+lwp_setsp(klwp_t *lwp, caddr_t sp)
+{
+ lwptoregs(lwp)->r_sp = (intptr_t)sp;
+}
+
+/*
+ * Copy regs from parent to child.
+ */
+void
+lwp_forkregs(klwp_t *lwp, klwp_t *clwp)
+{
+ struct pcb *pcb = &clwp->lwp_pcb;
+ struct regs *rp = lwptoregs(lwp);
+
+ if (!PCB_NEED_UPDATE_SEGS(pcb)) {
+ pcb->pcb_ds = rp->r_ds;
+ pcb->pcb_es = rp->r_es;
+ pcb->pcb_fs = rp->r_fs;
+ pcb->pcb_gs = rp->r_gs;
+ PCB_SET_UPDATE_SEGS(pcb);
+ lwptot(clwp)->t_post_sys = 1;
+ }
+ ASSERT(lwptot(clwp)->t_post_sys);
+
+ fp_lwp_dup(clwp);
+
+ bcopy(lwp->lwp_regs, clwp->lwp_regs, sizeof (struct regs));
+}
+
+/*
+ * This function is currently unused on x86.
+ */
+/*ARGSUSED*/
+void
+lwp_freeregs(klwp_t *lwp, int isexec)
+{}
+
+/*
+ * This function is currently unused on x86.
+ */
+void
+lwp_pcb_exit(void)
+{}
+
+/*
+ * Lwp context ops for segment registers.
+ */
+
+/*
+ * Every time we come into the kernel (syscall, interrupt or trap
+ * but not fast-traps) we capture the current values of the user's
+ * segment registers into the lwp's reg structure. This includes
+ * lcall for i386 generic system call support since it is handled
+ * as a segment-not-present trap.
+ *
+ * Here we save the current values from the lwp regs into the pcb
+ * and OR PCB_UPDATE_SEGS (1) into pcb->pcb_rupdate to tell the rest
+ * of the kernel that the pcb copy of the segment registers is the
+ * current one, ensuring it is reloaded via update_sregs on the lwp's
+ * next trip to user land. Finally we set t_post_sys to ensure that no
+ * system call fast-path's its way out of the kernel via sysret.
+ *
+ * (This means that we need to have interrupts disabled when we
+ * test t->t_post_sys in the syscall handlers; if the test fails,
+ * we need to keep interrupts disabled until we return to userland
+ * so we can't be switched away.)
+ *
+ * As a result of all this, we don't really have to do a whole lot
+ * if the thread is just mucking about in the kernel, switching on
+ * and off the cpu for whatever reason it feels like. And yet we
+ * still preserve fast syscalls, because if we -don't- get
+ * descheduled, we never come here either.
+ */
+
+#define VALID_LWP_DESC(udp) ((udp)->usd_type == SDT_MEMRWA && \
+ (udp)->usd_p == 1 && (udp)->usd_dpl == SEL_UPL)
+
+/*ARGSUSED*/
+void
+lwp_segregs_save(klwp_t *lwp)
+{
+ pcb_t *pcb = &lwp->lwp_pcb;
+ struct regs *rp;
+
+ ASSERT(VALID_LWP_DESC(&pcb->pcb_fsdesc));
+ ASSERT(VALID_LWP_DESC(&pcb->pcb_gsdesc));
+
+ if (!PCB_NEED_UPDATE_SEGS(pcb)) {
+ rp = lwptoregs(lwp);
+
+ /*
+ * If there's no update already pending, capture the current
+ * %ds/%es/%fs/%gs values from lwp's regs in case the user
+ * changed them; %fsbase and %gsbase are privileged so the
+ * kernel versions of these registers in pcb_fsbase and
+ * pcb_gsbase are always up-to-date.
+ */
+ pcb->pcb_ds = rp->r_ds;
+ pcb->pcb_es = rp->r_es;
+ pcb->pcb_fs = rp->r_fs;
+ pcb->pcb_gs = rp->r_gs;
+ PCB_SET_UPDATE_SEGS(pcb);
+ lwp->lwp_thread->t_post_sys = 1;
+ }
+
+#if !defined(__xpv) /* XXPV not sure if we can re-read gdt? */
+ ASSERT(bcmp(&CPU->cpu_gdt[GDT_LWPFS], &lwp->lwp_pcb.pcb_fsdesc,
+ sizeof (lwp->lwp_pcb.pcb_fsdesc)) == 0);
+ ASSERT(bcmp(&CPU->cpu_gdt[GDT_LWPGS], &lwp->lwp_pcb.pcb_gsdesc,
+ sizeof (lwp->lwp_pcb.pcb_gsdesc)) == 0);
+#endif
+}
+
+/*
+ * Update the segment registers with new values from the pcb.
+ *
+ * We have to do this carefully, and in the following order,
+ * in case any of the selectors points at a bogus descriptor.
+ * If they do, we'll catch trap with on_trap and return 1.
+ * returns 0 on success.
+ *
+ * This is particularly tricky for %gs.
+ * This routine must be executed under a cli.
+ */
+int
+update_sregs(struct regs *rp, klwp_t *lwp)
+{
+ pcb_t *pcb = &lwp->lwp_pcb;
+ ulong_t kgsbase;
+ on_trap_data_t otd;
+ int rc = 0;
+
+ if (!on_trap(&otd, OT_SEGMENT_ACCESS)) {
+
+#if defined(__xpv)
+ /*
+ * On the hypervisor this is easy. The hypercall below will
+ * swapgs and load %gs with the user selector. If the user
+ * selector is bad the hypervisor will catch the fault and
+ * load %gs with the null selector instead. Either way the
+ * kernel's gsbase is not damaged.
+ */
+ kgsbase = (ulong_t)CPU;
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL,
+ pcb->pcb_gs) != 0) {
+ no_trap();
+ return (1);
+ }
+
+ rp->r_gs = pcb->pcb_gs;
+ ASSERT((cpu_t *)kgsbase == CPU);
+
+#else /* __xpv */
+
+ /*
+ * A little more complicated running native.
+ */
+ kgsbase = (ulong_t)CPU;
+ __set_gs(pcb->pcb_gs);
+
+ /*
+ * If __set_gs fails it's because the new %gs is a bad %gs,
+ * we'll be taking a trap but with the original %gs and %gsbase
+ * undamaged (i.e. pointing at curcpu).
+ *
+ * We've just mucked up the kernel's gsbase. Oops. In
+ * particular we can't take any traps at all. Make the newly
+ * computed gsbase be the hidden gs via swapgs, and fix
+ * the kernel's gsbase back again. Later, when we return to
+ * userland we'll swapgs again restoring gsbase just loaded
+ * above.
+ */
+ __asm__ __volatile__("mfence; swapgs");
+
+ rp->r_gs = pcb->pcb_gs;
+
+ /*
+ * Restore kernel's gsbase. Note that this also serializes any
+ * attempted speculation from loading the user-controlled
+ * %gsbase.
+ */
+ wrmsr(MSR_AMD_GSBASE, kgsbase);
+
+#endif /* __xpv */
+
+ /*
+ * Only override the descriptor base address if
+ * r_gs == LWPGS_SEL or if r_gs == NULL. A note on
+ * NULL descriptors -- 32-bit programs take faults
+ * if they dereference NULL descriptors; however,
+ * when 64-bit programs load them into %fs or %gs,
+ * they DON'T fault -- only the base address remains
+ * whatever it was from the last load. Urk.
+ *
+ * XXX - note that lwp_setprivate now sets %fs/%gs to the
+ * null selector for 64 bit processes. Whereas before
+ * %fs/%gs were set to LWP(FS|GS)_SEL regardless of
+ * the process's data model. For now we check for both
+ * values so that the kernel can also support the older
+ * libc. This should be ripped out at some point in the
+ * future.
+ */
+ if (pcb->pcb_gs == LWPGS_SEL || pcb->pcb_gs == 0) {
+#if defined(__xpv)
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER,
+ pcb->pcb_gsbase)) {
+ no_trap();
+ return (1);
+ }
+#else
+ wrmsr(MSR_AMD_KGSBASE, pcb->pcb_gsbase);
+#endif
+ }
+
+ __set_ds(pcb->pcb_ds);
+ rp->r_ds = pcb->pcb_ds;
+
+ __set_es(pcb->pcb_es);
+ rp->r_es = pcb->pcb_es;
+
+ __set_fs(pcb->pcb_fs);
+ rp->r_fs = pcb->pcb_fs;
+
+ /*
+ * Same as for %gs
+ */
+ if (pcb->pcb_fs == LWPFS_SEL || pcb->pcb_fs == 0) {
+#if defined(__xpv)
+ if (HYPERVISOR_set_segment_base(SEGBASE_FS,
+ pcb->pcb_fsbase)) {
+ no_trap();
+ return (1);
+ }
+#else
+ wrmsr(MSR_AMD_FSBASE, pcb->pcb_fsbase);
+#endif
+ }
+
+ } else {
+ cli();
+ rc = 1;
+ }
+ no_trap();
+ return (rc);
+}
+
+/*
+ * Make sure any stale selectors are cleared from the segment registers
+ * by putting KDS_SEL (the kernel's default %ds gdt selector) into them.
+ * This is necessary because the kernel itself does not use %es, %fs, nor
+ * %ds. (%cs and %ss are necessary, and are set up by the kernel - along with
+ * %gs - to point to the current cpu struct.) If we enter kmdb while in the
+ * kernel and resume with a stale ldt or brandz selector sitting there in a
+ * segment register, kmdb will #gp fault if the stale selector points to,
+ * for example, an ldt in the context of another process.
+ *
+ * WARNING: Intel and AMD chips behave differently when storing
+ * the null selector into %fs and %gs while in long mode. On AMD
+ * chips fsbase and gsbase are not cleared. But on Intel chips, storing
+ * a null selector into %fs or %gs has the side effect of clearing
+ * fsbase or gsbase. For that reason we use KDS_SEL, which has
+ * consistent behavior between AMD and Intel.
+ *
+ * Caller responsible for preventing cpu migration.
+ */
+void
+reset_sregs(void)
+{
+ ulong_t kgsbase = (ulong_t)CPU;
+
+ ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);
+
+ cli();
+ __set_gs(KGS_SEL);
+
+ /*
+ * restore kernel gsbase
+ */
+#if defined(__xpv)
+ xen_set_segment_base(SEGBASE_GS_KERNEL, kgsbase);
+#else
+ wrmsr(MSR_AMD_GSBASE, kgsbase);
+#endif
+
+ sti();
+
+ __set_ds(KDS_SEL);
+ __set_es(0 | SEL_KPL); /* selector RPL not ring 0 on hypervisor */
+ __set_fs(KFS_SEL);
+}
+
+
+#ifdef _SYSCALL32_IMPL
+
+/*
+ * Make it impossible for a process to change its data model.
+ * We do this by toggling the present bits for the 32 and
+ * 64-bit user code descriptors. That way if a user lwp attempts
+ * to change its data model (by using the wrong code descriptor in
+ * %cs) it will fault immediately. This also allows us to simplify
+ * assertions and checks in the kernel.
+ */
+
+static void
+gdt_ucode_model(model_t model)
+{
+ kpreempt_disable();
+ if (model == DATAMODEL_NATIVE) {
+ gdt_update_usegd(GDT_UCODE, &ucs_on);
+ gdt_update_usegd(GDT_U32CODE, &ucs32_off);
+ } else {
+ gdt_update_usegd(GDT_U32CODE, &ucs32_on);
+ gdt_update_usegd(GDT_UCODE, &ucs_off);
+ }
+ kpreempt_enable();
+}
+
+#endif /* _SYSCALL32_IMPL */
+
+/*
+ * Restore lwp private fs and gs segment descriptors
+ * on current cpu's GDT.
+ */
+static void
+lwp_segregs_restore(klwp_t *lwp)
+{
+ pcb_t *pcb = &lwp->lwp_pcb;
+
+ ASSERT(VALID_LWP_DESC(&pcb->pcb_fsdesc));
+ ASSERT(VALID_LWP_DESC(&pcb->pcb_gsdesc));
+
+#ifdef _SYSCALL32_IMPL
+ gdt_ucode_model(DATAMODEL_NATIVE);
+#endif
+
+ gdt_update_usegd(GDT_LWPFS, &pcb->pcb_fsdesc);
+ gdt_update_usegd(GDT_LWPGS, &pcb->pcb_gsdesc);
+
+}
+
+#ifdef _SYSCALL32_IMPL
+
+static void
+lwp_segregs_restore32(klwp_t *lwp)
+{
+ /*LINTED*/
+ cpu_t *cpu = CPU;
+ pcb_t *pcb = &lwp->lwp_pcb;
+
+ ASSERT(VALID_LWP_DESC(&lwp->lwp_pcb.pcb_fsdesc));
+ ASSERT(VALID_LWP_DESC(&lwp->lwp_pcb.pcb_gsdesc));
+
+ gdt_ucode_model(DATAMODEL_ILP32);
+ gdt_update_usegd(GDT_LWPFS, &pcb->pcb_fsdesc);
+ gdt_update_usegd(GDT_LWPGS, &pcb->pcb_gsdesc);
+}
+
+#endif /* _SYSCALL32_IMPL */
+
+/*
+ * If this is a process in a branded zone, then we want it to use the brand
+ * syscall entry points instead of the standard Solaris entry points. This
+ * routine must be called when a new lwp is created within a branded zone
+ * or when an existing lwp moves into a branded zone via a zone_enter()
+ * operation.
+ */
+void
+lwp_attach_brand_hdlrs(klwp_t *lwp)
+{
+ kthread_t *t = lwptot(lwp);
+
+ ASSERT(PROC_IS_BRANDED(lwptoproc(lwp)));
+
+ ASSERT(removectx(t, NULL, brand_interpositioning_disable,
+ brand_interpositioning_enable, NULL, NULL,
+ brand_interpositioning_disable, NULL) == 0);
+ installctx(t, NULL, brand_interpositioning_disable,
+ brand_interpositioning_enable, NULL, NULL,
+ brand_interpositioning_disable, NULL, NULL);
+
+ if (t == curthread) {
+ kpreempt_disable();
+ brand_interpositioning_enable();
+ kpreempt_enable();
+ }
+}
+
+/*
+ * If this is a process in a branded zone, then we want it to disable the
+ * brand syscall entry points. This routine must be called when the last
+ * lwp in a process is exiting in proc_exit().
+ */
+void
+lwp_detach_brand_hdlrs(klwp_t *lwp)
+{
+ kthread_t *t = lwptot(lwp);
+
+ ASSERT(PROC_IS_BRANDED(lwptoproc(lwp)));
+ if (t == curthread)
+ kpreempt_disable();
+
+ /* Remove the original context handlers */
+ VERIFY(removectx(t, NULL, brand_interpositioning_disable,
+ brand_interpositioning_enable, NULL, NULL,
+ brand_interpositioning_disable, NULL) != 0);
+
+ if (t == curthread) {
+ /* Cleanup our MSR and IDT entries. */
+ brand_interpositioning_disable();
+ kpreempt_enable();
+ }
+}
+
+/*
+ * Add any lwp-associated context handlers to the lwp at the beginning
+ * of the lwp's useful life.
+ *
+ * All paths which create lwp's invoke lwp_create(); lwp_create()
+ * invokes lwp_stk_init() which initializes the stack, sets up
+ * lwp_regs, and invokes this routine.
+ *
+ * All paths which destroy lwp's invoke lwp_exit() to rip the lwp
+ * apart and put it on 'lwp_deathrow'; if the lwp is destroyed it
+ * ends up in thread_free() which invokes freectx(t, 0) before
+ * invoking lwp_stk_fini(). When the lwp is recycled from death
+ * row, lwp_stk_fini() is invoked, then thread_free(), and thus
+ * freectx(t, 0) as before.
+ *
+ * In the case of exec, the surviving lwp is thoroughly scrubbed
+ * clean; exec invokes freectx(t, 1) to destroy associated contexts.
+ * On the way back to the new image, it invokes setregs() which
+ * in turn invokes this routine.
+ */
+void
+lwp_installctx(klwp_t *lwp)
+{
+ kthread_t *t = lwptot(lwp);
+ int thisthread = t == curthread;
+#ifdef _SYSCALL32_IMPL
+ void (*restop)(klwp_t *) = lwp_getdatamodel(lwp) == DATAMODEL_NATIVE ?
+ lwp_segregs_restore : lwp_segregs_restore32;
+#else
+ void (*restop)(klwp_t *) = lwp_segregs_restore;
+#endif
+ struct ctxop *ctx;
+
+ /*
+ * Install the basic lwp context handlers on each lwp.
+ *
+ * On the amd64 kernel, the context handlers are responsible for
+ * virtualizing %ds, %es, %fs, and %gs to the lwp. The register
+ * values are only ever changed via sys_rtt when the
+ * PCB_UPDATE_SEGS bit (1) is set in pcb->pcb_rupdate. Only
+ * sys_rtt gets to clear the bit.
+ *
+ * On the i386 kernel, the context handlers are responsible for
+ * virtualizing %gs/%fs to the lwp by updating the per-cpu GDTs
+ */
+ ASSERT(removectx(t, lwp, lwp_segregs_save, restop,
+ NULL, NULL, NULL, NULL) == 0);
+ if (thisthread) {
+ ctx = installctx_preallocate();
+ kpreempt_disable();
+ } else {
+ ctx = NULL;
+ }
+ installctx(t, lwp, lwp_segregs_save, restop,
+ NULL, NULL, NULL, NULL, ctx);
+ if (thisthread) {
+ /*
+ * Since we're the right thread, set the values in the GDT
+ */
+ restop(lwp);
+ kpreempt_enable();
+ }
+
+ /*
+ * If we have sysenter/sysexit instructions enabled, we need
+ * to ensure that the hardware mechanism is kept up-to-date with the
+ * lwp's kernel stack pointer across context switches.
+ *
+ * sep_save zeros the sysenter stack pointer msr; sep_restore sets
+ * it to the lwp's kernel stack pointer (kstktop).
+ */
+ if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
+ caddr_t kstktop = (caddr_t)lwp->lwp_regs;
+ ASSERT(removectx(t, kstktop,
+ sep_save, sep_restore, NULL, NULL, NULL, NULL) == 0);
+
+ if (thisthread) {
+ ctx = installctx_preallocate();
+ kpreempt_disable();
+ } else {
+ ctx = NULL;
+ }
+ installctx(t, kstktop,
+ sep_save, sep_restore, NULL, NULL, NULL, NULL, ctx);
+ if (thisthread) {
+ /*
+ * We're the right thread, so set the stack pointer
+ * for the first sysenter instruction to use
+ */
+ sep_restore(kstktop);
+ kpreempt_enable();
+ }
+ }
+
+ if (PROC_IS_BRANDED(ttoproc(t)))
+ lwp_attach_brand_hdlrs(lwp);
+}
+
+/*
+ * Clear registers on exec(2).
+ */
+void
+setregs(uarg_t *args)
+{
+ struct regs *rp;
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ pcb_t *pcb = &lwp->lwp_pcb;
+ greg_t sp;
+
+ /*
+ * Initialize user registers
+ */
+ (void) save_syscall_args(); /* copy args from registers first */
+ rp = lwptoregs(lwp);
+ sp = rp->r_sp;
+ bzero(rp, sizeof (*rp));
+
+ rp->r_ss = UDS_SEL;
+ rp->r_sp = sp;
+ rp->r_pc = args->entry;
+ rp->r_ps = PSL_USER;
+
+ pcb->pcb_fs = pcb->pcb_gs = 0;
+ pcb->pcb_fsbase = pcb->pcb_gsbase = 0;
+
+ if (ttoproc(t)->p_model == DATAMODEL_NATIVE) {
+
+ rp->r_cs = UCS_SEL;
+
+ /*
+ * Only allow 64-bit user code descriptor to be present.
+ */
+ gdt_ucode_model(DATAMODEL_NATIVE);
+
+ /*
+ * Arrange that the virtualized %fs and %gs GDT descriptors
+ * have a well-defined initial state (present, ring 3
+ * and of type data).
+ */
+ pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_udesc;
+
+ /*
+ * thrptr is either NULL or a value used by DTrace.
+ * 64-bit processes use %fs as their "thread" register.
+ */
+ if (args->thrptr)
+ (void) lwp_setprivate(lwp, _LWP_FSBASE, args->thrptr);
+
+ } else {
+
+ rp->r_cs = U32CS_SEL;
+ rp->r_ds = rp->r_es = UDS_SEL;
+
+ /*
+ * only allow 32-bit user code selector to be present.
+ */
+ gdt_ucode_model(DATAMODEL_ILP32);
+
+ pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_u32desc;
+
+ /*
+ * thrptr is either NULL or a value used by DTrace.
+ * 32-bit processes use %gs as their "thread" register.
+ */
+ if (args->thrptr)
+ (void) lwp_setprivate(lwp, _LWP_GSBASE, args->thrptr);
+
+ }
+
+ pcb->pcb_ds = rp->r_ds;
+ pcb->pcb_es = rp->r_es;
+ PCB_SET_UPDATE_SEGS(pcb);
+
+ lwp->lwp_eosys = JUSTRETURN;
+ t->t_post_sys = 1;
+
+ /*
+ * Add the lwp context handlers that virtualize segment registers,
+ * and/or system call stacks etc.
+ */
+ lwp_installctx(lwp);
+
+ /*
+ * Reset the FPU flags and then initialize the FPU for this lwp.
+ */
+ fp_exec();
+}
+
+user_desc_t *
+cpu_get_gdt(void)
+{
+ return (CPU->cpu_gdt);
+}
+
+
+#if !defined(lwp_getdatamodel)
+
+/*
+ * Return the datamodel of the given lwp.
+ */
+/*ARGSUSED*/
+model_t
+lwp_getdatamodel(klwp_t *lwp)
+{
+ return (lwp->lwp_procp->p_model);
+}
+
+#endif /* !lwp_getdatamodel */
+
+#if !defined(get_udatamodel)
+
+model_t
+get_udatamodel(void)
+{
+ return (curproc->p_model);
+}
+
+#endif /* !get_udatamodel */
diff --git a/usr/src/uts/intel/os/syscall.c b/usr/src/uts/intel/os/syscall.c
new file mode 100644
index 0000000000..6cf4293ff4
--- /dev/null
+++ b/usr/src/uts/intel/os/syscall.c
@@ -0,0 +1,1397 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/param.h>
+#include <sys/vmparam.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/signal.h>
+#include <sys/stack.h>
+#include <sys/cred.h>
+#include <sys/cmn_err.h>
+#include <sys/user.h>
+#include <sys/privregs.h>
+#include <sys/psw.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/modctl.h>
+#include <sys/var.h>
+#include <sys/inline.h>
+#include <sys/syscall.h>
+#include <sys/ucontext.h>
+#include <sys/cpuvar.h>
+#include <sys/siginfo.h>
+#include <sys/trap.h>
+#include <sys/vtrace.h>
+#include <sys/sysinfo.h>
+#include <sys/procfs.h>
+#include <sys/prsystm.h>
+#include <c2/audit.h>
+#include <sys/modctl.h>
+#include <sys/aio_impl.h>
+#include <sys/tnf.h>
+#include <sys/tnf_probe.h>
+#include <sys/copyops.h>
+#include <sys/priv.h>
+#include <sys/msacct.h>
+
+int syscalltrace = 0;
+#ifdef SYSCALLTRACE
+static kmutex_t systrace_lock; /* syscall tracing lock */
+#else
+#define syscalltrace 0
+#endif /* SYSCALLTRACE */
+
+typedef int64_t (*llfcn_t)(); /* function returning long long */
+
+int pre_syscall(void);
+void post_syscall(long rval1, long rval2);
+static krwlock_t *lock_syscall(struct sysent *, uint_t);
+void deferred_singlestep_trap(caddr_t);
+
+#ifdef _SYSCALL32_IMPL
+#define LWP_GETSYSENT(lwp) \
+ (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE ? sysent : sysent32)
+#else
+#define LWP_GETSYSENT(lwp) (sysent)
+#endif
+
+/*
+ * If watchpoints are active, don't make copying in of
+ * system call arguments take a read watchpoint trap.
+ */
+static int
+copyin_args(struct regs *rp, long *ap, uint_t nargs)
+{
+ greg_t *sp = 1 + (greg_t *)rp->r_sp; /* skip ret addr */
+
+ ASSERT(nargs <= MAXSYSARGS);
+
+ return (copyin_nowatch(sp, ap, nargs * sizeof (*sp)));
+}
+
+#if defined(_SYSCALL32_IMPL)
+static int
+copyin_args32(struct regs *rp, long *ap, uint_t nargs)
+{
+ greg32_t *sp = 1 + (greg32_t *)rp->r_sp; /* skip ret addr */
+ uint32_t a32[MAXSYSARGS];
+ int rc;
+
+ ASSERT(nargs <= MAXSYSARGS);
+
+ if ((rc = copyin_nowatch(sp, a32, nargs * sizeof (*sp))) == 0) {
+ uint32_t *a32p = &a32[0];
+
+ while (nargs--)
+ *ap++ = (ulong_t)*a32p++;
+ }
+ return (rc);
+}
+#define COPYIN_ARGS32 copyin_args32
+#else
+#define COPYIN_ARGS32 copyin_args
+#endif
+
+/*
+ * Error handler for system calls where arg copy gets fault.
+ */
+static longlong_t
+syscall_err()
+{
+ return (0);
+}
+
+/*
+ * Corresponding sysent entry to allow syscall_entry caller
+ * to invoke syscall_err.
+ */
+static struct sysent sysent_err = {
+ 0, SE_32RVAL1, NULL, NULL, (llfcn_t)syscall_err
+};
+
+/*
+ * Called from syscall() when a non-trivial 32-bit system call occurs.
+ * Sets up the args and returns a pointer to the handler.
+ */
+struct sysent *
+syscall_entry(kthread_t *t, long *argp)
+{
+ klwp_t *lwp = ttolwp(t);
+ struct regs *rp = lwptoregs(lwp);
+ unsigned int code;
+ struct sysent *callp;
+ struct sysent *se = LWP_GETSYSENT(lwp);
+ int error = 0;
+ uint_t nargs;
+
+ ASSERT(t == curthread && curthread->t_schedflag & TS_DONT_SWAP);
+
+ lwp->lwp_ru.sysc++;
+ lwp->lwp_eosys = NORMALRETURN; /* assume this will be normal */
+
+ /*
+ * Set lwp_ap to point to the args, even if none are needed for this
+ * system call. This is for the loadable-syscall case where the
+ * number of args won't be known until the system call is loaded, and
+ * also maintains a non-NULL lwp_ap setup for get_syscall_args(). Note
+ * that lwp_ap MUST be set to a non-NULL value _BEFORE_ t_sysnum is
+ * set to non-zero; otherwise get_syscall_args(), seeing a non-zero
+ * t_sysnum for this thread, will charge ahead and dereference lwp_ap.
+ */
+ lwp->lwp_ap = argp; /* for get_syscall_args */
+
+ code = rp->r_r0;
+ t->t_sysnum = (short)code;
+ callp = code >= NSYSCALL ? &nosys_ent : se + code;
+
+ if ((t->t_pre_sys | syscalltrace) != 0) {
+ error = pre_syscall();
+
+ /*
+ * pre_syscall() has taken care so that lwp_ap is current;
+ * it either points to syscall-entry-saved amd64 regs,
+ * or it points to lwp_arg[], which has been re-copied from
+ * the ia32 ustack, but either way, it's a current copy after
+ * /proc has possibly mucked with the syscall args.
+ */
+
+ if (error)
+ return (&sysent_err); /* use dummy handler */
+ }
+
+ /*
+ * Fetch the system call arguments to the kernel stack copy used
+ * for syscall handling.
+ * Note: for loadable system calls the number of arguments required
+ * may not be known at this point, and will be zero if the system call
+ * was never loaded. Once the system call has been loaded, the number
+ * of args is not allowed to be changed.
+ */
+ if ((nargs = (uint_t)callp->sy_narg) != 0 &&
+ COPYIN_ARGS32(rp, argp, nargs)) {
+ (void) set_errno(EFAULT);
+ return (&sysent_err); /* use dummy handler */
+ }
+
+ return (callp); /* return sysent entry for caller */
+}
+
+void
+syscall_exit(kthread_t *t, long rval1, long rval2)
+{
+ /*
+ * Handle signals and other post-call events if necessary.
+ */
+ if ((t->t_post_sys_ast | syscalltrace) == 0) {
+ klwp_t *lwp = ttolwp(t);
+ struct regs *rp = lwptoregs(lwp);
+
+ /*
+ * Normal return.
+ * Clear error indication and set return values.
+ */
+ rp->r_ps &= ~PS_C; /* reset carry bit */
+ rp->r_r0 = rval1;
+ rp->r_r1 = rval2;
+ lwp->lwp_state = LWP_USER;
+ } else {
+ post_syscall(rval1, rval2);
+ }
+ t->t_sysnum = 0; /* invalidate args */
+}
+
+/*
+ * Perform pre-system-call processing, including stopping for tracing,
+ * auditing, etc.
+ *
+ * This routine is called only if the t_pre_sys flag is set. Any condition
+ * requiring pre-syscall handling must set the t_pre_sys flag. If the
+ * condition is persistent, this routine will repost t_pre_sys.
+ */
+int
+pre_syscall()
+{
+ kthread_t *t = curthread;
+ unsigned code = t->t_sysnum;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ int repost;
+
+ t->t_pre_sys = repost = 0; /* clear pre-syscall processing flag */
+
+ ASSERT(t->t_schedflag & TS_DONT_SWAP);
+
+#if defined(DEBUG)
+ /*
+	 * On the i386 kernel, lwp_ap points at the piece of the thread
+	 * stack that we copy the user's arguments into.
+ *
+ * On the amd64 kernel, the syscall arguments in the rdi..r9
+ * registers should be pointed at by lwp_ap. If the args need to
+ * be copied so that those registers can be changed without losing
+ * the ability to get the args for /proc, they can be saved by
+ * save_syscall_args(), and lwp_ap will be restored by post_syscall().
+ */
+ if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+#if defined(_LP64)
+ ASSERT(lwp->lwp_ap == (long *)&lwptoregs(lwp)->r_rdi);
+ } else {
+#endif
+ ASSERT((caddr_t)lwp->lwp_ap > t->t_stkbase &&
+ (caddr_t)lwp->lwp_ap < t->t_stk);
+ }
+#endif /* DEBUG */
+
+ /*
+ * Make sure the thread is holding the latest credentials for the
+ * process. The credentials in the process right now apply to this
+ * thread for the entire system call.
+ */
+ if (t->t_cred != p->p_cred) {
+ cred_t *oldcred = t->t_cred;
+ /*
+ * DTrace accesses t_cred in probe context. t_cred must
+ * always be either NULL, or point to a valid, allocated cred
+ * structure.
+ */
+ t->t_cred = crgetcred();
+ crfree(oldcred);
+ }
+
+ /*
+ * From the proc(4) manual page:
+ * When entry to a system call is being traced, the traced process
+ * stops after having begun the call to the system but before the
+ * system call arguments have been fetched from the process.
+ */
+ if (PTOU(p)->u_systrap) {
+ if (prismember(&PTOU(p)->u_entrymask, code)) {
+ mutex_enter(&p->p_lock);
+ /*
+ * Recheck stop condition, now that lock is held.
+ */
+ if (PTOU(p)->u_systrap &&
+ prismember(&PTOU(p)->u_entrymask, code)) {
+ stop(PR_SYSENTRY, code);
+
+ /*
+ * /proc may have modified syscall args,
+ * either in regs for amd64 or on ustack
+ * for ia32. Either way, arrange to
+ * copy them again, both for the syscall
+ * handler and for other consumers in
+ * post_syscall (like audit). Here, we
+ * only do amd64, and just set lwp_ap
+ * back to the kernel-entry stack copy;
+ * the syscall ml code redoes
+ * move-from-regs to set up for the
+ * syscall handler after we return. For
+ * ia32, save_syscall_args() below makes
+ * an lwp_ap-accessible copy.
+ */
+#if defined(_LP64)
+ if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+ lwp->lwp_argsaved = 0;
+ lwp->lwp_ap =
+ (long *)&lwptoregs(lwp)->r_rdi;
+ }
+#endif
+ }
+ mutex_exit(&p->p_lock);
+ }
+ repost = 1;
+ }
+
+ /*
+ * ia32 kernel, or ia32 proc on amd64 kernel: keep args in
+ * lwp_arg for post-syscall processing, regardless of whether
+ * they might have been changed in /proc above.
+ */
+#if defined(_LP64)
+ if (lwp_getdatamodel(lwp) != DATAMODEL_NATIVE)
+#endif
+ (void) save_syscall_args();
+
+ if (lwp->lwp_sysabort) {
+ /*
+ * lwp_sysabort may have been set via /proc while the process
+ * was stopped on PR_SYSENTRY. If so, abort the system call.
+ * Override any error from the copyin() of the arguments.
+ */
+ lwp->lwp_sysabort = 0;
+ (void) set_errno(EINTR); /* forces post_sys */
+ t->t_pre_sys = 1; /* repost anyway */
+ return (1); /* don't do system call, return EINTR */
+ }
+
+ /*
+ * begin auditing for this syscall if the c2audit module is loaded
+ * and auditing is enabled
+ */
+ if (audit_active == C2AUDIT_LOADED) {
+ uint32_t auditing = au_zone_getstate(NULL);
+
+ if (auditing & AU_AUDIT_MASK) {
+ int error;
+ if (error = audit_start(T_SYSCALL, code, auditing, \
+ 0, lwp)) {
+ t->t_pre_sys = 1; /* repost anyway */
+ (void) set_errno(error);
+ return (1);
+ }
+ repost = 1;
+ }
+ }
+
+#ifndef NPROBE
+ /* Kernel probe */
+ if (tnf_tracing_active) {
+ TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */,
+ tnf_sysnum, sysnum, t->t_sysnum);
+ t->t_post_sys = 1; /* make sure post_syscall runs */
+ repost = 1;
+ }
+#endif /* NPROBE */
+
+#ifdef SYSCALLTRACE
+ if (syscalltrace) {
+ int i;
+ long *ap;
+ char *cp;
+ char *sysname;
+ struct sysent *callp;
+
+ if (code >= NSYSCALL)
+ callp = &nosys_ent; /* nosys has no args */
+ else
+ callp = LWP_GETSYSENT(lwp) + code;
+ (void) save_syscall_args();
+ mutex_enter(&systrace_lock);
+ printf("%d: ", p->p_pid);
+ if (code >= NSYSCALL) {
+ printf("0x%x", code);
+ } else {
+ sysname = mod_getsysname(code);
+ printf("%s[0x%x/0x%p]", sysname == NULL ? "NULL" :
+ sysname, code, callp->sy_callc);
+ }
+ cp = "(";
+ for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) {
+ printf("%s%lx", cp, *ap);
+ cp = ", ";
+ }
+ if (i)
+ printf(")");
+ printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread);
+ mutex_exit(&systrace_lock);
+ }
+#endif /* SYSCALLTRACE */
+
+ /*
+ * If there was a continuing reason for pre-syscall processing,
+ * set the t_pre_sys flag for the next system call.
+ */
+ if (repost)
+ t->t_pre_sys = 1;
+ lwp->lwp_error = 0; /* for old drivers */
+ lwp->lwp_badpriv = PRIV_NONE;
+ return (0);
+}
+
+
+/*
+ * Post-syscall processing. Perform abnormal system call completion
+ * actions such as /proc tracing, profiling, signals, preemption, etc.
+ *
+ * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set.
+ * Any condition requiring pre-syscall handling must set one of these.
+ * If the condition is persistent, this routine will repost t_post_sys.
+ */
+void
+post_syscall(long rval1, long rval2)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ struct regs *rp = lwptoregs(lwp);
+ uint_t error;
+ uint_t code = t->t_sysnum;
+ int repost = 0;
+ int proc_stop = 0; /* non-zero if stopping */
+ int sigprof = 0; /* non-zero if sending SIGPROF */
+
+ t->t_post_sys = 0;
+
+ error = lwp->lwp_errno;
+
+ /*
+	 * Code can be zero if this is a new LWP returning after a forkall(),
+	 * other than the LWP which matches the one in the parent that called
+	 * forkall(). In these LWPs, skip most of post-syscall activity.
+ */
+ if (code == 0)
+ goto sig_check;
+ /*
+ * If the trace flag is set, mark the lwp to take a single-step trap
+	 * on return to user level (below). The x86 lcall interface and
+	 * sysenter have already done this and turned off the flag, but the
+	 * amd64 syscall interface has not.
+ */
+ if (rp->r_ps & PS_T) {
+ lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
+ rp->r_ps &= ~PS_T;
+ aston(curthread);
+ }
+
+ /* put out audit record for this syscall */
+ if (AU_AUDITING()) {
+ rval_t rval;
+
+ /* XX64 -- truncation of 64-bit return values? */
+ rval.r_val1 = (int)rval1;
+ rval.r_val2 = (int)rval2;
+ audit_finish(T_SYSCALL, code, error, &rval);
+ repost = 1;
+ }
+
+ if (curthread->t_pdmsg != NULL) {
+ char *m = curthread->t_pdmsg;
+
+ uprintf("%s", m);
+ kmem_free(m, strlen(m) + 1);
+ curthread->t_pdmsg = NULL;
+ }
+
+ /*
+ * If we're going to stop for /proc tracing, set the flag and
+ * save the arguments so that the return values don't smash them.
+ */
+ if (PTOU(p)->u_systrap) {
+ if (prismember(&PTOU(p)->u_exitmask, code)) {
+ if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
+ (void) save_syscall_args();
+ proc_stop = 1;
+ }
+ repost = 1;
+ }
+
+ /*
+ * Similarly check to see if SIGPROF might be sent.
+ */
+ if (curthread->t_rprof != NULL &&
+ curthread->t_rprof->rp_anystate != 0) {
+ if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
+ (void) save_syscall_args();
+ sigprof = 1;
+ }
+
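+	/*
+	 * For a normal return, mirror the fast path in syscall_exit():
+	 * on success clear the carry flag and set the return registers;
+	 * on error set the carry flag and place the (possibly remapped)
+	 * errno in the first return register.
+	 */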
+ if (lwp->lwp_eosys == NORMALRETURN) {
+ if (error == 0) {
+#ifdef SYSCALLTRACE
+ if (syscalltrace) {
+ mutex_enter(&systrace_lock);
+ printf(
+ "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n",
+ p->p_pid, rval1, rval2, curthread);
+ mutex_exit(&systrace_lock);
+ }
+#endif /* SYSCALLTRACE */
+ rp->r_ps &= ~PS_C;
+ rp->r_r0 = rval1;
+ rp->r_r1 = rval2;
+ } else {
+ int sig;
+#ifdef SYSCALLTRACE
+ if (syscalltrace) {
+ mutex_enter(&systrace_lock);
+ printf("%d: error=%d, id 0x%p\n",
+ p->p_pid, error, curthread);
+ mutex_exit(&systrace_lock);
+ }
+#endif /* SYSCALLTRACE */
+ if (error == EINTR && t->t_activefd.a_stale)
+ error = EBADF;
+ if (error == EINTR &&
+ (sig = lwp->lwp_cursig) != 0 &&
+ sigismember(&PTOU(p)->u_sigrestart, sig) &&
+ PTOU(p)->u_signal[sig - 1] != SIG_DFL &&
+ PTOU(p)->u_signal[sig - 1] != SIG_IGN)
+ error = ERESTART;
+ rp->r_r0 = error;
+ rp->r_ps |= PS_C;
+ }
+ }
+
+ /*
+ * From the proc(4) manual page:
+ * When exit from a system call is being traced, the traced process
+ * stops on completion of the system call just prior to checking for
+ * signals and returning to user level. At this point all return
+ * values have been stored into the traced process's saved registers.
+ */
+ if (proc_stop) {
+ mutex_enter(&p->p_lock);
+ if (PTOU(p)->u_systrap &&
+ prismember(&PTOU(p)->u_exitmask, code))
+ stop(PR_SYSEXIT, code);
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * If we are the parent returning from a successful
+ * vfork, wait for the child to exec or exit.
+ * This code must be here and not in the bowels of the system
+ * so that /proc can intercept exit from vfork in a timely way.
+ */
+ if (t->t_flag & T_VFPARENT) {
+ ASSERT(code == SYS_vfork || code == SYS_forksys);
+ ASSERT(rp->r_r1 == 0 && error == 0);
+ vfwait((pid_t)rval1);
+ t->t_flag &= ~T_VFPARENT;
+ }
+
+ /*
+ * If profiling is active, bill the current PC in user-land
+ * and keep reposting until profiling is disabled.
+ */
+ if (p->p_prof.pr_scale) {
+ if (lwp->lwp_oweupc)
+ profil_tick(rp->r_pc);
+ repost = 1;
+ }
+
+sig_check:
+ /*
+ * Reset flag for next time.
+ * We must do this after stopping on PR_SYSEXIT
+ * because /proc uses the information in lwp_eosys.
+ */
+ lwp->lwp_eosys = NORMALRETURN;
+ clear_stale_fd();
+ t->t_flag &= ~T_FORKALL;
+
+ if (t->t_astflag | t->t_sig_check) {
+ /*
+ * Turn off the AST flag before checking all the conditions that
+ * may have caused an AST. This flag is on whenever a signal or
+ * unusual condition should be handled after the next trap or
+ * syscall.
+ */
+ astoff(t);
+ /*
+ * If a single-step trap occurred on a syscall (see trap())
+ * recognize it now. Do this before checking for signals
+ * because deferred_singlestep_trap() may generate a SIGTRAP to
+ * the LWP or may otherwise mark the LWP to call issig(FORREAL).
+ */
+ if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING)
+ deferred_singlestep_trap((caddr_t)rp->r_pc);
+
+ t->t_sig_check = 0;
+
+ /*
+ * The following check is legal for the following reasons:
+		 *	1) The thread we are checking is ourselves, so there is
+		 *	    no way the proc can go away.
+ * 2) The only time we need to be protected by the
+ * lock is if the binding is changed.
+ *
+ * Note we will still take the lock and check the binding
+ * if the condition was true without the lock held. This
+ * prevents lock contention among threads owned by the
+ * same proc.
+ */
+
+ if (curthread->t_proc_flag & TP_CHANGEBIND) {
+ mutex_enter(&p->p_lock);
+ if (curthread->t_proc_flag & TP_CHANGEBIND) {
+ timer_lwpbind();
+ curthread->t_proc_flag &= ~TP_CHANGEBIND;
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+		 * For kaio requests on the special kaio poll queue,
+		 * copy out their results to user memory.
+ */
+ if (p->p_aio)
+ aio_cleanup(0);
+ /*
+ * If this LWP was asked to hold, call holdlwp(), which will
+ * stop. holdlwps() sets this up and calls pokelwps() which
+ * sets the AST flag.
+ *
+ * Also check TP_EXITLWP, since this is used by fresh new LWPs
+ * through lwp_rtt(). That flag is set if the lwp_create(2)
+ * syscall failed after creating the LWP.
+ */
+ if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP))
+ holdlwp();
+
+ /*
+ * All code that sets signals and makes ISSIG_PENDING
+ * evaluate true must set t_sig_check afterwards.
+ */
+ if (ISSIG_PENDING(t, lwp, p)) {
+ if (issig(FORREAL))
+ psig();
+ t->t_sig_check = 1; /* recheck next time */
+ }
+
+ if (sigprof) {
+ int nargs = (code > 0 && code < NSYSCALL)?
+ LWP_GETSYSENT(lwp)[code].sy_narg : 0;
+ realsigprof(code, nargs, error);
+ t->t_sig_check = 1; /* recheck next time */
+ }
+
+ /*
+ * If a performance counter overflow interrupt was
+ * delivered *during* the syscall, then re-enable the
+ * AST so that we take a trip through trap() to cause
+ * the SIGEMT to be delivered.
+ */
+ if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW)
+ aston(t);
+
+ /*
+ * /proc can't enable/disable the trace bit itself
+ * because that could race with the call gate used by
+ * system calls via "lcall". If that happened, an
+ * invalid EFLAGS would result. prstep()/prnostep()
+ * therefore schedule an AST for the purpose.
+ */
+ if (lwp->lwp_pcb.pcb_flags & REQUEST_STEP) {
+ lwp->lwp_pcb.pcb_flags &= ~REQUEST_STEP;
+ rp->r_ps |= PS_T;
+ }
+ if (lwp->lwp_pcb.pcb_flags & REQUEST_NOSTEP) {
+ lwp->lwp_pcb.pcb_flags &= ~REQUEST_NOSTEP;
+ rp->r_ps &= ~PS_T;
+ }
+ }
+
+ lwp->lwp_errno = 0; /* clear error for next time */
+
+#ifndef NPROBE
+ /* Kernel probe */
+ if (tnf_tracing_active) {
+ TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */,
+ tnf_long, rval1, rval1,
+ tnf_long, rval2, rval2,
+ tnf_long, errno, (long)error);
+ repost = 1;
+ }
+#endif /* NPROBE */
+
+ /*
+ * Set state to LWP_USER here so preempt won't give us a kernel
+ * priority if it occurs after this point. Call CL_TRAPRET() to
+ * restore the user-level priority.
+ *
+ * It is important that no locks (other than spinlocks) be entered
+ * after this point before returning to user mode (unless lwp_state
+ * is set back to LWP_SYS).
+ *
+ * XXX Sampled times past this point are charged to the user.
+ */
+ lwp->lwp_state = LWP_USER;
+
+ if (t->t_trapret) {
+ t->t_trapret = 0;
+ thread_lock(t);
+ CL_TRAPRET(t);
+ thread_unlock(t);
+ }
+ if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ)
+ preempt();
+ prunstop();
+
+ lwp->lwp_errno = 0; /* clear error for next time */
+
+ /*
+ * The thread lock must be held in order to clear sysnum and reset
+ * lwp_ap atomically with respect to other threads in the system that
+ * may be looking at the args via lwp_ap from get_syscall_args().
+ */
+
+ thread_lock(t);
+ t->t_sysnum = 0; /* no longer in a system call */
+
+ if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+#if defined(_LP64)
+ /*
+ * In case the args were copied to the lwp, reset the
+ * pointer so the next syscall will have the right
+ * lwp_ap pointer.
+ */
+ lwp->lwp_ap = (long *)&rp->r_rdi;
+ } else {
+#endif
+ lwp->lwp_ap = NULL; /* reset on every syscall entry */
+ }
+ thread_unlock(t);
+
+ lwp->lwp_argsaved = 0;
+
+ /*
+ * If there was a continuing reason for post-syscall processing,
+ * set the t_post_sys flag for the next system call.
+ */
+ if (repost)
+ t->t_post_sys = 1;
+
+ /*
+ * If there is a ustack registered for this lwp, and the stack rlimit
+ * has been altered, read in the ustack. If the saved stack rlimit
+ * matches the bounds of the ustack, update the ustack to reflect
+ * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable
+ * stack checking by setting the size to 0.
+ */
+ if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) {
+ rlim64_t new_size;
+ caddr_t top;
+ stack_t stk;
+ struct rlimit64 rl;
+
+ mutex_enter(&p->p_lock);
+ new_size = p->p_stk_ctl;
+ top = p->p_usrstack;
+ (void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl);
+ mutex_exit(&p->p_lock);
+
+ if (rl.rlim_cur == RLIM64_INFINITY)
+ new_size = 0;
+
+ if (copyin((stack_t *)lwp->lwp_ustack, &stk,
+ sizeof (stack_t)) == 0 &&
+ (stk.ss_size == lwp->lwp_old_stk_ctl ||
+ stk.ss_size == 0) &&
+ stk.ss_sp == top - stk.ss_size) {
+ stk.ss_sp = (void *)((uintptr_t)stk.ss_sp +
+ stk.ss_size - (uintptr_t)new_size);
+ stk.ss_size = new_size;
+
+ (void) copyout(&stk, (stack_t *)lwp->lwp_ustack,
+ sizeof (stack_t));
+ }
+
+ lwp->lwp_old_stk_ctl = 0;
+ }
+}
+
+/*
+ * Called from post_syscall() when a deferred singlestep is to be taken.
+ */
+void
+deferred_singlestep_trap(caddr_t pc)
+{
+ proc_t *p = ttoproc(curthread);
+ klwp_t *lwp = ttolwp(curthread);
+ pcb_t *pcb = &lwp->lwp_pcb;
+ uint_t fault = 0;
+ k_siginfo_t siginfo;
+
+ bzero(&siginfo, sizeof (siginfo));
+
+ /*
+ * If both NORMAL_STEP and WATCH_STEP are in
+ * effect, give precedence to WATCH_STEP.
+ * If neither is set, user must have set the
+ * PS_T bit in %efl; treat this as NORMAL_STEP.
+ */
+ if ((fault = undo_watch_step(&siginfo)) == 0 &&
+ ((pcb->pcb_flags & NORMAL_STEP) ||
+ !(pcb->pcb_flags & WATCH_STEP))) {
+ siginfo.si_signo = SIGTRAP;
+ siginfo.si_code = TRAP_TRACE;
+ siginfo.si_addr = pc;
+ fault = FLTTRACE;
+ }
+ pcb->pcb_flags &= ~(DEBUG_PENDING|NORMAL_STEP|WATCH_STEP);
+
+ if (fault) {
+ /*
+		 * Remember the fault and fault address
+ * for real-time (SIGPROF) profiling.
+ */
+ lwp->lwp_lastfault = fault;
+ lwp->lwp_lastfaddr = siginfo.si_addr;
+ /*
+ * If a debugger has declared this fault to be an
+ * event of interest, stop the lwp. Otherwise just
+ * deliver the associated signal.
+ */
+ if (prismember(&p->p_fltmask, fault) &&
+ stop_on_fault(fault, &siginfo) == 0)
+ siginfo.si_signo = 0;
+ }
+
+ if (siginfo.si_signo)
+ trapsig(&siginfo, 1);
+}
+
+/*
+ * nonexistent system call-- signal lwp (may want to handle it)
+ * flag error if lwp won't see signal immediately
+ */
+int64_t
+nosys(void)
+{
+ tsignal(curthread, SIGSYS);
+ return (set_errno(ENOSYS));
+}
+
+int
+nosys32(void)
+{
+ return (nosys());
+}
+
+/*
+ * Execute a 32-bit system call on behalf of the current thread.
+ */
+void
+dosyscall(void)
+{
+ /*
+ * Need space on the stack to store syscall arguments.
+ */
+ long syscall_args[MAXSYSARGS];
+ struct sysent *se;
+ int64_t ret;
+
+ syscall_mstate(LMS_TRAP, LMS_SYSTEM);
+
+ ASSERT(curproc->p_model == DATAMODEL_ILP32);
+
+ CPU_STATS_ENTER_K();
+ CPU_STATS_ADDQ(CPU, sys, syscall, 1);
+ CPU_STATS_EXIT_K();
+
+ se = syscall_entry(curthread, syscall_args);
+
+ /*
+ * syscall_entry() copied all 8 arguments into syscall_args.
+ */
+ ret = se->sy_callc(syscall_args[0], syscall_args[1], syscall_args[2],
+ syscall_args[3], syscall_args[4], syscall_args[5], syscall_args[6],
+ syscall_args[7]);
+
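+	/*
+	 * Split the 64-bit return value across the two 32-bit return
+	 * values: the low 32 bits become rval1, the high 32 bits rval2.
+	 */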
+ syscall_exit(curthread, (int)ret & 0xffffffffu, (int)(ret >> 32));
+ syscall_mstate(LMS_SYSTEM, LMS_TRAP);
+}
+
+/*
+ * Get the arguments to the current system call. See comment atop
+ * save_syscall_args() regarding lwp_ap usage.
+ */
+
+uint_t
+get_syscall_args(klwp_t *lwp, long *argp, int *nargsp)
+{
+ kthread_t *t = lwptot(lwp);
+ ulong_t mask = 0xfffffffful;
+ uint_t code;
+ long *ap;
+ int nargs;
+
+#if defined(_LP64)
+ if (lwp_getdatamodel(lwp) == DATAMODEL_LP64)
+ mask = 0xfffffffffffffffful;
+#endif
+
+ /*
+ * The thread lock must be held while looking at the arguments to ensure
+ * they don't go away via post_syscall().
+ * get_syscall_args() is the only routine to read them which is callable
+ * outside the LWP in question and hence the only one that must be
+ * synchronized in this manner.
+ */
+ thread_lock(t);
+
+ code = t->t_sysnum;
+ ap = lwp->lwp_ap;
+
+ thread_unlock(t);
+
+ if (code != 0 && code < NSYSCALL) {
+ nargs = LWP_GETSYSENT(lwp)[code].sy_narg;
+
+ ASSERT(nargs <= MAXSYSARGS);
+
+ *nargsp = nargs;
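+		/* Copy the args, truncating each to 32 bits for ILP32 lwps. */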
+ while (nargs-- > 0)
+ *argp++ = *ap++ & mask;
+ } else {
+ *nargsp = 0;
+ }
+
+ return (code);
+}
+
+#ifdef _SYSCALL32_IMPL
+/*
+ * Get the arguments to the current 32-bit system call.
+ */
+uint_t
+get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp)
+{
+ long args[MAXSYSARGS];
+ uint_t i, code;
+
+ code = get_syscall_args(lwp, args, nargsp);
+
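+	/* Narrow each long argument to 32 bits for the ILP32 view. */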
+ for (i = 0; i != *nargsp; i++)
+ *argp++ = (int)args[i];
+ return (code);
+}
+#endif
+
+/*
+ * Save the system call arguments in a safe place.
+ *
+ * On the i386 kernel:
+ *
+ * Copy the user's args prior to changing the stack or stack pointer.
+ * This is so /proc will be able to get a valid copy of the
+ * args from the user stack even after the user stack has been changed.
+ * Note that the kernel stack copy of the args may also have been
+ * changed by a system call handler which takes C-style arguments.
+ *
+ * Note that this may be called by stop() from trap(). In that case
+ * t_sysnum will be zero (syscall_exit clears it), so no args will be
+ * copied.
+ *
+ * On the amd64 kernel:
+ *
+ * For 64-bit applications, lwp->lwp_ap normally points to %rdi..%r9
+ * in the reg structure. If the user is going to change the argument
+ * registers, rax, or the stack and might want to get the args (for
+ * /proc tracing), it must copy the args elsewhere via save_syscall_args().
+ *
+ * For 32-bit applications, lwp->lwp_ap normally points to a copy of
+ * the system call arguments on the kernel stack made from the user
+ * stack. Copy the args prior to changing the stack or stack pointer.
+ * This is so /proc will be able to get a valid copy of the args
+ * from the user stack even after that stack has been changed.
+ *
+ * This may be called from stop() even when we're not in a system call.
+ * Since there's no easy way to tell, this must be safe (not panic).
+ * If the copyins get data faults, return non-zero.
+ */
+int
+save_syscall_args()
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ uint_t code = t->t_sysnum;
+ uint_t nargs;
+
+ if (lwp->lwp_argsaved || code == 0)
+ return (0); /* args already saved or not needed */
+
+ if (code >= NSYSCALL) {
+ nargs = 0; /* illegal syscall */
+ } else {
+ struct sysent *se = LWP_GETSYSENT(lwp);
+ struct sysent *callp = se + code;
+
+ nargs = callp->sy_narg;
+ if (LOADABLE_SYSCALL(callp) && nargs == 0) {
+ krwlock_t *module_lock;
+
+ /*
+ * Find out how many arguments the system
+ * call uses.
+ *
+ * We have the property that loaded syscalls
+ * never change the number of arguments they
+ * use after they've been loaded once. This
+ * allows us to stop for /proc tracing without
+ * holding the module lock.
+ * /proc is assured that sy_narg is valid.
+ */
+ module_lock = lock_syscall(se, code);
+ nargs = callp->sy_narg;
+ rw_exit(module_lock);
+ }
+ }
+
+ /*
+ * Fetch the system call arguments.
+ */
+ if (nargs == 0)
+ goto out;
+
+ ASSERT(nargs <= MAXSYSARGS);
+
+ if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+#if defined(_LP64)
+ struct regs *rp = lwptoregs(lwp);
+
+ lwp->lwp_arg[0] = rp->r_rdi;
+ lwp->lwp_arg[1] = rp->r_rsi;
+ lwp->lwp_arg[2] = rp->r_rdx;
+ lwp->lwp_arg[3] = rp->r_rcx;
+ lwp->lwp_arg[4] = rp->r_r8;
+ lwp->lwp_arg[5] = rp->r_r9;
+ if (nargs > 6 && copyin_args(rp, &lwp->lwp_arg[6], nargs - 6))
+ return (-1);
+ } else {
+#endif
+ if (COPYIN_ARGS32(lwptoregs(lwp), lwp->lwp_arg, nargs))
+ return (-1);
+ }
+out:
+ lwp->lwp_ap = lwp->lwp_arg;
+ lwp->lwp_argsaved = 1;
+ t->t_post_sys = 1; /* so lwp_ap will be reset */
+ return (0);
+}
+
+void
+reset_syscall_args(void)
+{
+ ttolwp(curthread)->lwp_argsaved = 0;
+}
+
+/*
+ * Call a system call which takes a pointer to the user args struct and
+ * a pointer to the return values. This is a bit slower than the standard
+ * C arg-passing method in some cases.
+ */
+int64_t
+syscall_ap(void)
+{
+ uint_t error;
+ struct sysent *callp;
+ rval_t rval;
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ struct regs *rp = lwptoregs(lwp);
+
+ callp = LWP_GETSYSENT(lwp) + t->t_sysnum;
+
+ /*
+ * If the arguments don't fit in registers %rdi-%r9, make sure they
+ * have been copied to the lwp_arg array.
+ */
+ if (callp->sy_narg > 6 && save_syscall_args())
+ return ((int64_t)set_errno(EFAULT));
+
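+	/*
+	 * Seed the return values: r_val1 starts out zero and r_val2 keeps
+	 * the current contents of the second return register.
+	 */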
+ rval.r_val1 = 0;
+ rval.r_val2 = rp->r_r1;
+ lwp->lwp_error = 0; /* for old drivers */
+ error = (*(callp->sy_call))(lwp->lwp_ap, &rval);
+ if (error)
+ return ((longlong_t)set_errno(error));
+ return (rval.r_vals);
+}
+
+/*
+ * Load system call module.
+ * Returns with pointer to held read lock for module.
+ */
+static krwlock_t *
+lock_syscall(struct sysent *table, uint_t code)
+{
+ krwlock_t *module_lock;
+ struct modctl *modp;
+ int id;
+ struct sysent *callp;
+
+ callp = table + code;
+ module_lock = callp->sy_lock;
+
+ /*
+ * Optimization to only call modload if we don't have a loaded
+ * syscall.
+ */
+ rw_enter(module_lock, RW_READER);
+ if (LOADED_SYSCALL(callp))
+ return (module_lock);
+ rw_exit(module_lock);
+
+ for (;;) {
+ if ((id = modload("sys", syscallnames[code])) == -1)
+ break;
+
+ /*
+ * If we loaded successfully at least once, the modctl
+ * will still be valid, so we try to grab it by filename.
+ * If this call fails, it's because the mod_filename
+ * was changed after the call to modload() (mod_hold_by_name()
+ * is the likely culprit). We can safely just take
+ * another lap if this is the case; the modload() will
+ * change the mod_filename back to one by which we can
+ * find the modctl.
+ */
+ modp = mod_find_by_filename("sys", syscallnames[code]);
+
+ if (modp == NULL)
+ continue;
+
+ mutex_enter(&mod_lock);
+
+ if (!modp->mod_installed) {
+ mutex_exit(&mod_lock);
+ continue;
+ }
+ break;
+ }
+ rw_enter(module_lock, RW_READER);
+
+ if (id != -1)
+ mutex_exit(&mod_lock);
+
+ return (module_lock);
+}
+
+/*
+ * Loadable syscall support.
+ * If needed, load the module, then reserve it by holding a read
+ * lock for the duration of the call.
+ * Later, if the syscall is not unloadable, it could patch the vector.
+ */
+/*ARGSUSED*/
+int64_t
+loadable_syscall(
+ long a0, long a1, long a2, long a3,
+ long a4, long a5, long a6, long a7)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ int64_t rval;
+ struct sysent *callp;
+ struct sysent *se = LWP_GETSYSENT(lwp);
+ krwlock_t *module_lock;
+ int code, error = 0;
+
+ code = curthread->t_sysnum;
+ callp = se + code;
+
+ /*
+ * Try to autoload the system call if necessary
+ */
+ module_lock = lock_syscall(se, code);
+
+ /*
+ * we've locked either the loaded syscall or nosys
+ */
+
+ if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+#if defined(_LP64)
+ if (callp->sy_flags & SE_ARGC) {
+ rval = (int64_t)(*callp->sy_call)(a0, a1, a2, a3,
+ a4, a5);
+ } else {
+ rval = syscall_ap();
+ }
+ } else {
+#endif
+ /*
+ * Now that it's loaded, make sure enough args were copied.
+ */
+ if (COPYIN_ARGS32(lwptoregs(lwp), lwp->lwp_ap, callp->sy_narg))
+ error = EFAULT;
+ if (error) {
+ rval = set_errno(error);
+ } else if (callp->sy_flags & SE_ARGC) {
+ rval = (int64_t)(*callp->sy_call)(lwp->lwp_ap[0],
+ lwp->lwp_ap[1], lwp->lwp_ap[2], lwp->lwp_ap[3],
+ lwp->lwp_ap[4], lwp->lwp_ap[5]);
+ } else {
+ rval = syscall_ap();
+ }
+ }
+
+ rw_exit(module_lock);
+ return (rval);
+}
+
+/*
+ * Indirect syscall handled in libc on x86 architectures
+ */
+int64_t
+indir()
+{
+ return (nosys());
+}
+
+/*
+ * set_errno - set an error return from the current system call.
+ * This could be a macro.
+ * This returns the value it is passed, so that the caller can
+ * use tail-recursion-elimination and do return (set_errno(ERRNO));
+ */
+uint_t
+set_errno(uint_t error)
+{
+ ASSERT(error != 0); /* must not be used to clear errno */
+
+ curthread->t_post_sys = 1; /* have post_syscall do error return */
+ return (ttolwp(curthread)->lwp_errno = error);
+}
+
+/*
+ * set_proc_pre_sys - Set pre-syscall processing for entire process.
+ */
+void
+set_proc_pre_sys(proc_t *p)
+{
+ kthread_t *t;
+ kthread_t *first;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ t = first = p->p_tlist;
+ do {
+ t->t_pre_sys = 1;
+ } while ((t = t->t_forw) != first);
+}
+
+/*
+ * set_proc_post_sys - Set post-syscall processing for entire process.
+ */
+void
+set_proc_post_sys(proc_t *p)
+{
+ kthread_t *t;
+ kthread_t *first;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ t = first = p->p_tlist;
+ do {
+ t->t_post_sys = 1;
+ } while ((t = t->t_forw) != first);
+}
+
+/*
+ * set_proc_sys - Set pre- and post-syscall processing for entire process.
+ */
+void
+set_proc_sys(proc_t *p)
+{
+ kthread_t *t;
+ kthread_t *first;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ t = first = p->p_tlist;
+ do {
+ t->t_pre_sys = 1;
+ t->t_post_sys = 1;
+ } while ((t = t->t_forw) != first);
+}
+
+/*
+ * set_all_proc_sys - set pre- and post-syscall processing flags for all
+ * user processes.
+ *
+ * This is needed when auditing, tracing, or other facilities which affect
+ * all processes are turned on.
+ */
+void
+set_all_proc_sys()
+{
+ kthread_t *t;
+ kthread_t *first;
+
+ mutex_enter(&pidlock);
+ t = first = curthread;
+ do {
+ t->t_pre_sys = 1;
+ t->t_post_sys = 1;
+ } while ((t = t->t_next) != first);
+ mutex_exit(&pidlock);
+}
+
+/*
+ * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for
+ * all user processes running in the zone of the current process
+ *
+ * This is needed when auditing, tracing, or other facilities which affect
+ * all processes are turned on.
+ */
+void
+set_all_zone_usr_proc_sys(zoneid_t zoneid)
+{
+ proc_t *p;
+ kthread_t *t;
+
+ mutex_enter(&pidlock);
+ for (p = practive; p != NULL; p = p->p_next) {
+ /* skip kernel and incomplete processes */
+ if (p->p_exec == NULLVP || p->p_as == &kas ||
+ p->p_stat == SIDL || p->p_stat == SZOMB ||
+ (p->p_flag & (SSYS | SEXITING | SEXITLWPS)))
+ continue;
+ /*
+		 * Only processes in the given zone (or in all zones,
+		 * when ALL_ZONES is passed) are taken into account.
+ */
+ if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) {
+ mutex_enter(&p->p_lock);
+ if ((t = p->p_tlist) == NULL) {
+ mutex_exit(&p->p_lock);
+ continue;
+ }
+ /*
+ * Set pre- and post-syscall processing flags
+ * for all threads of the process
+ */
+ do {
+ t->t_pre_sys = 1;
+ t->t_post_sys = 1;
+ } while (p->p_tlist != (t = t->t_forw));
+ mutex_exit(&p->p_lock);
+ }
+ }
+ mutex_exit(&pidlock);
+}
+
+/*
+ * set_proc_ast - Set asynchronous service trap (AST) flag for all
+ * threads in process.
+ */
+void
+set_proc_ast(proc_t *p)
+{
+ kthread_t *t;
+ kthread_t *first;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ t = first = p->p_tlist;
+ do {
+ aston(t);
+ } while ((t = t->t_forw) != first);
+}
diff --git a/usr/src/uts/intel/os/sysi86.c b/usr/src/uts/intel/os/sysi86.c
new file mode 100644
index 0000000000..b107afddfb
--- /dev/null
+++ b/usr/src/uts/intel/os/sysi86.c
@@ -0,0 +1,850 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2021 Joyent, Inc.
+ */
+
+/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
+/* All Rights Reserved */
+
+/* Copyright (c) 1987, 1988 Microsoft Corporation */
+/* All Rights Reserved */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/signal.h>
+#include <sys/errno.h>
+#include <sys/fault.h>
+#include <sys/syscall.h>
+#include <sys/cpuvar.h>
+#include <sys/sysi86.h>
+#include <sys/psw.h>
+#include <sys/cred.h>
+#include <sys/policy.h>
+#include <sys/thread.h>
+#include <sys/debug.h>
+#include <sys/ontrap.h>
+#include <sys/privregs.h>
+#include <sys/x86_archext.h>
+#include <sys/vmem.h>
+#include <sys/kmem.h>
+#include <sys/mman.h>
+#include <sys/archsystm.h>
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_kmem.h>
+#include <vm/faultcode.h>
+#include <sys/fp.h>
+#include <sys/cmn_err.h>
+#include <sys/segments.h>
+#include <sys/clock.h>
+#include <vm/hat_i86.h>
+#if defined(__xpv)
+#include <sys/hypervisor.h>
+#include <sys/note.h>
+#endif
+
+static void ldt_alloc(proc_t *, uint_t);
+static void ldt_free(proc_t *);
+static void ldt_dup(proc_t *, proc_t *);
+static void ldt_grow(proc_t *, uint_t);
+
+/*
+ * sysi86 System Call
+ */
+
+/* ARGSUSED */
+int
+sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
+{
+ struct ssd ssd;
+ int error = 0;
+ int c;
+ proc_t *pp = curproc;
+
+ switch (cmd) {
+
+ /*
+ * The SI86V86 subsystem call of the SYSI86 system call
+ * supports only one subcode -- V86SC_IOPL.
+ */
+ case SI86V86:
+ if (arg1 == V86SC_IOPL) {
+#if defined(__xpv)
+ struct ctxop *ctx;
+#endif
+ struct regs *rp = lwptoregs(ttolwp(curthread));
+ greg_t oldpl = rp->r_ps & PS_IOPL;
+ greg_t newpl = arg2 & PS_IOPL;
+
+ /*
+ * Must be privileged to run this system call
+ * if giving more io privilege.
+ */
+ if (newpl > oldpl && (error =
+ secpolicy_sys_config(CRED(), B_FALSE)) != 0)
+ return (set_errno(error));
+#if defined(__xpv)
+ ctx = installctx_preallocate();
+ kpreempt_disable();
+ installctx(curthread, NULL, xen_disable_user_iopl,
+ xen_enable_user_iopl, NULL, NULL,
+ xen_disable_user_iopl, NULL, ctx);
+ xen_enable_user_iopl();
+ kpreempt_enable();
+#else
+ rp->r_ps ^= oldpl ^ newpl;
+#endif
+ } else
+ error = EINVAL;
+ break;
+
+ /*
+ * Set a segment descriptor
+ */
+ case SI86DSCR:
+ /*
+ * There are considerable problems here manipulating
+ * resources shared by many running lwps. Get everyone
+ * into a safe state before changing the LDT.
+ */
+ if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) {
+ error = EINTR;
+ break;
+ }
+
+ if (get_udatamodel() == DATAMODEL_LP64) {
+ error = EINVAL;
+ break;
+ }
+
+ if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) {
+ error = EFAULT;
+ break;
+ }
+
+ error = setdscr(&ssd);
+
+ mutex_enter(&pp->p_lock);
+ if (curthread != pp->p_agenttp)
+ continuelwps(pp);
+ mutex_exit(&pp->p_lock);
+ break;
+
+ case SI86FPHW:
+ c = fp_kind & 0xff;
+ if (suword32((void *)arg1, c) == -1)
+ error = EFAULT;
+ break;
+
+ case SI86FPSTART:
+ /*
+ * arg1 is the address of _fp_hw
+ * arg2 is the desired x87 FCW value
+ * arg3 is the desired SSE MXCSR value
+ * a return value of one means SSE hardware, else none.
+ */
+ c = fp_kind & 0xff;
+ if (suword32((void *)arg1, c) == -1) {
+ error = EFAULT;
+ break;
+ }
+ fpsetcw((uint16_t)arg2, (uint32_t)arg3);
+ return ((fp_kind & __FP_SSE) ? 1 : 0);
+
+ /* real time clock management commands */
+
+ case WTODC:
+ if ((error = secpolicy_settime(CRED())) == 0) {
+ timestruc_t ts;
+ mutex_enter(&tod_lock);
+ gethrestime(&ts);
+ tod_set(ts);
+ mutex_exit(&tod_lock);
+ }
+ break;
+
+/* Give some timezone playing room */
+#define ONEWEEK (7 * 24 * 60 * 60)
+
+ case SGMTL:
+ /*
+ * Called from 32 bit land, negative values
+ * are not sign extended, so we do that here
+ * by casting it to an int and back. We also
+ * clamp the value to within reason and detect
+ * when a 64 bit call overflows an int.
+ */
+ if ((error = secpolicy_settime(CRED())) == 0) {
+ int newlag = (int)arg1;
+
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() == DATAMODEL_NATIVE &&
+ (long)newlag != (long)arg1) {
+ error = EOVERFLOW;
+ } else
+#endif
+ if (newlag >= -ONEWEEK && newlag <= ONEWEEK)
+ sgmtl(newlag);
+ else
+ error = EOVERFLOW;
+ }
+ break;
+
+ case GGMTL:
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (sulword((void *)arg1, ggmtl()) == -1)
+ error = EFAULT;
+#ifdef _SYSCALL32_IMPL
+ } else {
+ time_t gmtl;
+
+ if ((gmtl = ggmtl()) > INT32_MAX) {
+ /*
+ * Since gmt_lag can at most be
+ * +/- 12 hours, something is
+ * *seriously* messed up here.
+ */
+ error = EOVERFLOW;
+ } else if (suword32((void *)arg1, (int32_t)gmtl) == -1)
+ error = EFAULT;
+#endif
+ }
+ break;
+
+ case RTCSYNC:
+ if ((error = secpolicy_settime(CRED())) == 0)
+ rtcsync();
+ break;
+
+ /* END OF real time clock management commands */
+
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error == 0 ? 0 : set_errno(error));
+}
+
+void
+usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel)
+{
+ ssd->bo = USEGD_GETBASE(usd);
+ ssd->ls = USEGD_GETLIMIT(usd);
+ ssd->sel = sel;
+
+ /*
+ * set type, dpl and present bits.
+ */
+ ssd->acc1 = usd->usd_type;
+ ssd->acc1 |= usd->usd_dpl << 5;
+ ssd->acc1 |= usd->usd_p << (5 + 2);
+
+ /*
+ * set avl, DB and granularity bits.
+ */
+ ssd->acc2 = usd->usd_avl;
+
+ ssd->acc2 |= usd->usd_long << 1;
+
+ ssd->acc2 |= usd->usd_def32 << (1 + 1);
+ ssd->acc2 |= usd->usd_gran << (1 + 1 + 1);
+}
+
+static void
+ssd_to_usd(struct ssd *ssd, user_desc_t *usd)
+{
+
+ ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0);
+
+ USEGD_SETBASE(usd, ssd->bo);
+ USEGD_SETLIMIT(usd, ssd->ls);
+
+ /*
+ * Set type, dpl and present bits.
+ *
+ * Force the "accessed" bit to on so that we don't run afoul of
+ * KPTI.
+ */
+ usd->usd_type = ssd->acc1 | SDT_A;
+ usd->usd_dpl = ssd->acc1 >> 5;
+ usd->usd_p = ssd->acc1 >> (5 + 2);
+
+ ASSERT(usd->usd_type >= SDT_MEMRO);
+ ASSERT(usd->usd_dpl == SEL_UPL);
+
+ /*
+ * 64-bit code selectors are never allowed in the LDT.
+ * Reserved bit is always 0 on 32-bit systems.
+ */
+ usd->usd_long = 0;
+
+ /*
+ * set avl, DB and granularity bits.
+ */
+ usd->usd_avl = ssd->acc2;
+ usd->usd_def32 = ssd->acc2 >> (1 + 1);
+ usd->usd_gran = ssd->acc2 >> (1 + 1 + 1);
+}
+
+
+
+/*
+ * Load LDT register with the current process's LDT.
+ */
+static void
+ldt_load(void)
+{
+#if defined(__xpv)
+ xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1);
+#else
+ size_t len;
+ system_desc_t desc;
+
+ /*
+ * Before we can use the LDT on this CPU, we must install the LDT in the
+ * user mapping table.
+ */
+ len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
+ bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
+ CPU->cpu_m.mcpu_ldt_len = len;
+ set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
+ *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;
+
+ wr_ldtr(ULDT_SEL);
+#endif
+}
+
+/*
+ * Store a NULL selector in the LDTR. All subsequent illegal references to
+ * the LDT will result in a #gp.
+ */
+void
+ldt_unload(void)
+{
+#if defined(__xpv)
+ xen_set_ldt(NULL, 0);
+#else
+ *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
+ wr_ldtr(0);
+
+ bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
+ CPU->cpu_m.mcpu_ldt_len = 0;
+#endif
+}
+
+/*ARGSUSED*/
+static void
+ldt_savectx(proc_t *p)
+{
+ ASSERT(p->p_ldt != NULL);
+ ASSERT(p == curproc);
+
+ /*
+ * The 64-bit kernel must be sure to clear any stale ldt
+ * selectors when context switching away from a process that
+ * has a private ldt. Consider the following example:
+ *
+	 * Wine creates an ldt descriptor and points a segment register
+	 * to it.
+	 *
+	 * We then context switch away from the wine lwp to a kernel
+	 * thread and hit a breakpoint in the kernel with kmdb.
+	 *
+	 * When we continue and resume from kmdb we will #gp
+	 * fault, since kmdb will have saved the stale ldt selector
+	 * from wine and will try to restore it, but we are no longer in
+	 * the context of the wine process and do not have our
+	 * ldtr register pointing to the private ldt.
+ */
+ reset_sregs();
+
+ ldt_unload();
+ cpu_fast_syscall_enable();
+}
+
+static void
+ldt_restorectx(proc_t *p)
+{
+ ASSERT(p->p_ldt != NULL);
+ ASSERT(p == curproc);
+
+ ldt_load();
+ cpu_fast_syscall_disable();
+}
+
+/*
+ * At exec time, we need to clear up our LDT context and re-enable fast syscalls
+ * for the new process image.
+ *
+ * The same is true for the other case, where we have:
+ *
+ * proc_exit()
+ * ->exitpctx()->ldt_savectx()
+ * ->freepctx()->ldt_freectx()
+ *
+ * Because pre-emption is not prevented between the two callbacks, we could have
+ * come off CPU, and brought back LDT context when coming back on CPU via
+ * ldt_restorectx().
+ */
+/* ARGSUSED */
+static void
+ldt_freectx(proc_t *p, int isexec)
+{
+ ASSERT(p->p_ldt != NULL);
+ ASSERT(p == curproc);
+
+ kpreempt_disable();
+ ldt_free(p);
+ cpu_fast_syscall_enable();
+ kpreempt_enable();
+}
+
+/*
+ * Install ctx op that ensures syscall/sysenter are disabled.
+ * See comments below.
+ *
+ * When a thread with a private LDT forks, the new process
+ * must have the LDT context ops installed.
+ */
+/* ARGSUSED */
+static void
+ldt_installctx(proc_t *p, proc_t *cp)
+{
+ proc_t *targ = p;
+ kthread_t *t;
+
+ /*
+ * If this is a fork, operate on the child process.
+ */
+ if (cp != NULL) {
+ targ = cp;
+ ldt_dup(p, cp);
+ }
+
+ /*
+ * The process context ops expect the target process as their argument.
+ */
+ ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx,
+ ldt_installctx, ldt_savectx, ldt_freectx) == 0);
+
+ installpctx(targ, targ, ldt_savectx, ldt_restorectx,
+ ldt_installctx, ldt_savectx, ldt_freectx);
+
+ /*
+ * We've just disabled fast system call and return instructions; take
+ * the slow path out to make sure we don't try to use one to return
+ * back to user. We must set t_post_sys for every thread in the
+ * process to make sure none of them escape out via fast return.
+ */
+
+ mutex_enter(&targ->p_lock);
+ t = targ->p_tlist;
+ do {
+ t->t_post_sys = 1;
+ } while ((t = t->t_forw) != targ->p_tlist);
+ mutex_exit(&targ->p_lock);
+}
+
+int
+setdscr(struct ssd *ssd)
+{
+ ushort_t seli; /* selector index */
+ user_desc_t *ldp; /* descriptor pointer */
+ user_desc_t ndesc; /* new descriptor */
+ proc_t *pp = curproc;
+ int rc = 0;
+
+ /*
+ * LDT segments: executable and data at DPL 3 only.
+ */
+ if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel))
+ return (EINVAL);
+
+ /*
+ * check the selector index.
+ */
+ seli = SELTOIDX(ssd->sel);
+ if (seli >= MAXNLDT || seli < LDT_UDBASE)
+ return (EINVAL);
+
+ ndesc = null_udesc;
+ mutex_enter(&pp->p_ldtlock);
+
+ /*
+	 * If this is the first time for this process then set up a
+ * private LDT for it.
+ */
+ if (pp->p_ldt == NULL) {
+ ldt_alloc(pp, seli);
+
+ /*
+ * Now that this process has a private LDT, the use of
+ * the syscall/sysret and sysenter/sysexit instructions
+		 * is forbidden for this process because they destroy
+ * the contents of %cs and %ss segment registers.
+ *
+		 * Explicitly disable them here and add a context handler
+ * to the process. Note that disabling
+ * them here means we can't use sysret or sysexit on
+ * the way out of this system call - so we force this
+ * thread to take the slow path (which doesn't make use
+ * of sysenter or sysexit) back out.
+ */
+ kpreempt_disable();
+ ldt_installctx(pp, NULL);
+ cpu_fast_syscall_disable();
+ ASSERT(curthread->t_post_sys != 0);
+ kpreempt_enable();
+
+ } else if (seli > pp->p_ldtlimit) {
+ ASSERT(pp->p_pctx != NULL);
+
+ /*
+ * Increase size of ldt to include seli.
+ */
+ ldt_grow(pp, seli);
+ }
+
+ ASSERT(seli <= pp->p_ldtlimit);
+ ldp = &pp->p_ldt[seli];
+
+ /*
+ * On the 64-bit kernel, this is where things get more subtle.
+ * Recall that in the 64-bit kernel, when we enter the kernel we
+ * deliberately -don't- reload the segment selectors we came in on
+ * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
+ * and the underlying descriptors are essentially ignored by the
+ * hardware in long mode - except for the base that we override with
+ * the gsbase MSRs.
+ *
+ * However, there's one unfortunate issue with this rosy picture --
+ * a descriptor that's not marked as 'present' will still generate
+ * an #np when loading a segment register.
+ *
+ * Consider this case. An lwp creates a harmless LDT entry, points
+	 * one of its segment registers at it, then tells the kernel (here)
+ * to delete it. In the 32-bit kernel, the #np will happen on the
+ * way back to userland where we reload the segment registers, and be
+ * handled in kern_gpfault(). In the 64-bit kernel, the same thing
+ * will happen in the normal case too. However, if we're trying to
+ * use a debugger that wants to save and restore the segment registers,
+	 * and the debugger thinks that we have valid segment registers, we
+ * have the problem that the debugger will try and restore the
+ * segment register that points at the now 'not present' descriptor
+ * and will take a #np right there.
+ *
+ * We should obviously fix the debugger to be paranoid about
+ * -not- restoring segment registers that point to bad descriptors;
+ * however we can prevent the problem here if we check to see if any
+ * of the segment registers are still pointing at the thing we're
+	 * destroying; if they are, return an error instead. (That also seems
+	 * like a much better failure mode than SIGKILL and a core file
+	 * from kern_gpfault().)
+ */
+ if (SI86SSD_PRES(ssd) == 0) {
+ kthread_t *t;
+ int bad = 0;
+
+ /*
+ * Look carefully at the segment registers of every lwp
+ * in the process (they're all stopped by our caller).
+ * If we're about to invalidate a descriptor that's still
+ * being referenced by *any* of them, return an error,
+ * rather than having them #gp on their way out of the kernel.
+ */
+ ASSERT(pp->p_lwprcnt == 1);
+
+ mutex_enter(&pp->p_lock);
+ t = pp->p_tlist;
+ do {
+ klwp_t *lwp = ttolwp(t);
+ struct regs *rp = lwp->lwp_regs;
+ pcb_t *pcb = &lwp->lwp_pcb;
+
+ if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) {
+ bad = 1;
+ break;
+ }
+
+ if (PCB_NEED_UPDATE_SEGS(pcb)) {
+ if (ssd->sel == pcb->pcb_ds ||
+ ssd->sel == pcb->pcb_es ||
+ ssd->sel == pcb->pcb_fs ||
+ ssd->sel == pcb->pcb_gs) {
+ bad = 1;
+ break;
+ }
+ } else {
+ if (ssd->sel == rp->r_ds ||
+ ssd->sel == rp->r_es ||
+ ssd->sel == rp->r_fs ||
+ ssd->sel == rp->r_gs) {
+ bad = 1;
+ break;
+ }
+ }
+
+ } while ((t = t->t_forw) != pp->p_tlist);
+ mutex_exit(&pp->p_lock);
+
+ if (bad) {
+ mutex_exit(&pp->p_ldtlock);
+ return (EBUSY);
+ }
+ }
+
+ /*
+ * If acc1 is zero, clear the descriptor (including the 'present' bit).
+ * Make sure we update the CPU-private copy of the LDT.
+ */
+ if (ssd->acc1 == 0) {
+ rc = ldt_update_segd(ldp, &null_udesc);
+ kpreempt_disable();
+ ldt_load();
+ kpreempt_enable();
+ mutex_exit(&pp->p_ldtlock);
+ return (rc);
+ }
+
+ /*
+ * Check segment type, allow segment not present and
+ * only user DPL (3).
+ */
+ if (SI86SSD_DPL(ssd) != SEL_UPL) {
+ mutex_exit(&pp->p_ldtlock);
+ return (EINVAL);
+ }
+
+ /*
+ * Do not allow 32-bit applications to create 64-bit mode code
+ * segments.
+ */
+ if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 &&
+ SI86SSD_ISLONG(ssd)) {
+ mutex_exit(&pp->p_ldtlock);
+ return (EINVAL);
+ }
+
+ /*
+ * Set up a code or data user segment descriptor, making sure to update
+ * the CPU-private copy of the LDT.
+ */
+ if (SI86SSD_ISUSEG(ssd)) {
+ ssd_to_usd(ssd, &ndesc);
+ rc = ldt_update_segd(ldp, &ndesc);
+ kpreempt_disable();
+ ldt_load();
+ kpreempt_enable();
+ mutex_exit(&pp->p_ldtlock);
+ return (rc);
+ }
+
+ mutex_exit(&pp->p_ldtlock);
+ return (EINVAL);
+}
+
+/*
+ * Allocate new LDT for process just large enough to contain seli. Note we
+ * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the
+ * implementation and because on the hypervisor it's required, since the LDT
+ * must live on pages that have PROT_WRITE removed and which are given to the
+ * hypervisor.
+ *
+ * Note that we don't actually load the LDT into the current CPU here: it's done
+ * later by our caller.
+ */
+static void
+ldt_alloc(proc_t *pp, uint_t seli)
+{
+ user_desc_t *ldt;
+ size_t ldtsz;
+ uint_t nsels;
+
+ ASSERT(MUTEX_HELD(&pp->p_ldtlock));
+ ASSERT(pp->p_ldt == NULL);
+ ASSERT(pp->p_ldtlimit == 0);
+
+ /*
+ * Allocate new LDT just large enough to contain seli. The LDT must
+ * always be allocated in units of pages for KPTI.
+ */
+ ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
+ nsels = ldtsz / sizeof (user_desc_t);
+ ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
+
+ ldt = kmem_zalloc(ldtsz, KM_SLEEP);
+ ASSERT(IS_P2ALIGNED(ldt, PAGESIZE));
+
+#if defined(__xpv)
+ if (xen_ldt_setprot(ldt, ldtsz, PROT_READ))
+ panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed");
+#endif
+
+ pp->p_ldt = ldt;
+ pp->p_ldtlimit = nsels - 1;
+}
+
+static void
+ldt_free(proc_t *pp)
+{
+ user_desc_t *ldt;
+ size_t ldtsz;
+
+ ASSERT(pp->p_ldt != NULL);
+
+ mutex_enter(&pp->p_ldtlock);
+ ldt = pp->p_ldt;
+ ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
+
+ ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE));
+
+ pp->p_ldt = NULL;
+ pp->p_ldtlimit = 0;
+ mutex_exit(&pp->p_ldtlock);
+
+ if (pp == curproc) {
+ kpreempt_disable();
+ ldt_unload();
+ kpreempt_enable();
+ }
+
+#if defined(__xpv)
+ /*
+ * We are not allowed to make the ldt writable until after
+ * we tell the hypervisor to unload it.
+ */
+ if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE))
+ panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
+#endif
+
+ kmem_free(ldt, ldtsz);
+}
+
+/*
+ * On fork copy new ldt for child.
+ */
+static void
+ldt_dup(proc_t *pp, proc_t *cp)
+{
+ size_t ldtsz;
+
+ ASSERT(pp->p_ldt != NULL);
+ ASSERT(cp != curproc);
+
+ /*
+ * I assume the parent's ldt can't increase since we're in a fork.
+ */
+ mutex_enter(&pp->p_ldtlock);
+ mutex_enter(&cp->p_ldtlock);
+
+ ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
+
+ ldt_alloc(cp, pp->p_ldtlimit);
+
+#if defined(__xpv)
+ /*
+ * Make child's ldt writable so it can be copied into from
+ * parent's ldt. This works since ldt_alloc above did not load
+	 * the ldt since it's for the child process. If we tried to make
+	 * an LDT writable that is loaded in hardware, the setprot operation
+ * would fail.
+ */
+ if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE))
+ panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
+#endif
+
+ bcopy(pp->p_ldt, cp->p_ldt, ldtsz);
+
+#if defined(__xpv)
+ if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ))
+ panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed");
+#endif
+ mutex_exit(&cp->p_ldtlock);
+ mutex_exit(&pp->p_ldtlock);
+
+}
+
+/*
+ * Note that we don't actually load the LDT into the current CPU here: it's done
+ * later by our caller - unless we take an error. This works out because
+ * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT
+ * (and therefore can't be using the freed old LDT), and by definition if the
+ * new entry didn't pass validation, then the proc shouldn't be referencing an
+ * entry in the extended region.
+ */
+static void
+ldt_grow(proc_t *pp, uint_t seli)
+{
+ user_desc_t *oldt, *nldt;
+ uint_t nsels;
+ size_t oldtsz, nldtsz;
+
+ ASSERT(MUTEX_HELD(&pp->p_ldtlock));
+ ASSERT(pp->p_ldt != NULL);
+ ASSERT(pp->p_ldtlimit != 0);
+
+ /*
+ * Allocate larger LDT just large enough to contain seli. The LDT must
+ * always be allocated in units of pages for KPTI.
+ */
+ nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
+ nsels = nldtsz / sizeof (user_desc_t);
+ ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT);
+ ASSERT(nsels > pp->p_ldtlimit);
+
+ oldt = pp->p_ldt;
+ oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
+
+ nldt = kmem_zalloc(nldtsz, KM_SLEEP);
+ ASSERT(IS_P2ALIGNED(nldt, PAGESIZE));
+
+ bcopy(oldt, nldt, oldtsz);
+
+ /*
+ * unload old ldt.
+ */
+ kpreempt_disable();
+ ldt_unload();
+ kpreempt_enable();
+
+#if defined(__xpv)
+
+ /*
+ * Make old ldt writable and new ldt read only.
+ */
+ if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
+ panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
+
+ if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
+ panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
+#endif
+
+ pp->p_ldt = nldt;
+ pp->p_ldtlimit = nsels - 1;
+
+ kmem_free(oldt, oldtsz);
+}