| author | Richard Lowe <richlowe@richlowe.net> | 2021-06-04 15:15:12 -0500 |
| --- | --- | --- |
| committer | Richard Lowe <richlowe@richlowe.net> | 2021-08-16 12:46:39 -0500 |
| commit | f0089e391b2bc4be2755f1a1b51fb4cd9b8f3988 (patch) | |
| tree | c4ac2f5e703ed459d50bcee7ddb38a993d961520 /usr/src/uts/intel/os | |
| parent | d083fed0c91296a88878f7a468910ad5b5c888ea (diff) | |
| download | illumos-gate-f0089e391b2bc4be2755f1a1b51fb4cd9b8f3988.tar.gz | |
13941 intel code and headers should not look ia32 specific
Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Patrick Mooney <pmooney@pfmooney.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src/uts/intel/os')
| -rw-r--r-- | usr/src/uts/intel/os/archdep.c | 1240 |
| -rw-r--r-- | usr/src/uts/intel/os/bootdev.c | 100 |
| -rw-r--r-- | usr/src/uts/intel/os/comm_page_util.c | 62 |
| -rw-r--r-- | usr/src/uts/intel/os/copy_subr.c | 102 |
| -rw-r--r-- | usr/src/uts/intel/os/cpc_subr.c | 274 |
| -rw-r--r-- | usr/src/uts/intel/os/ddi_i86.c | 1903 |
| -rw-r--r-- | usr/src/uts/intel/os/desctbls.c | 1218 |
| -rw-r--r-- | usr/src/uts/intel/os/fpu.c | 1506 |
| -rw-r--r-- | usr/src/uts/intel/os/sendsig.c | 589 |
| -rw-r--r-- | usr/src/uts/intel/os/sundep.c | 1012 |
| -rw-r--r-- | usr/src/uts/intel/os/syscall.c | 1397 |
| -rw-r--r-- | usr/src/uts/intel/os/sysi86.c | 850 |
12 files changed, 10253 insertions, 0 deletions
diff --git a/usr/src/uts/intel/os/archdep.c b/usr/src/uts/intel/os/archdep.c new file mode 100644 index 0000000000..14d20bb487 --- /dev/null +++ b/usr/src/uts/intel/os/archdep.c @@ -0,0 +1,1240 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Copyright (c) 2018, Joyent, Inc. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/vmparam.h> +#include <sys/systm.h> +#include <sys/signal.h> +#include <sys/stack.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/frame.h> +#include <sys/proc.h> +#include <sys/psw.h> +#include <sys/siginfo.h> +#include <sys/cpuvar.h> +#include <sys/asm_linkage.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/bootconf.h> +#include <sys/archsystm.h> +#include <sys/debug.h> +#include <sys/elf.h> +#include <sys/spl.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/modctl.h> +#include <sys/kobj.h> +#include <sys/panic.h> +#include <sys/reboot.h> +#include <sys/time.h> +#include <sys/fp.h> +#include <sys/x86_archext.h> +#include <sys/auxv.h> +#include <sys/auxv_386.h> +#include <sys/dtrace.h> +#include <sys/brand.h> +#include <sys/machbrand.h> +#include <sys/cmn_err.h> + +/* + * Map an fnsave-formatted save area into an fxsave-formatted save area. + * + * Most fields are the same width, content and semantics. However + * the tag word is compressed. + */ +static void +fnsave_to_fxsave(const struct fnsave_state *fn, struct fxsave_state *fx) +{ + uint_t i, tagbits; + + fx->fx_fcw = fn->f_fcw; + fx->fx_fsw = fn->f_fsw; + + /* + * copy element by element (because of holes) + */ + for (i = 0; i < 8; i++) + bcopy(&fn->f_st[i].fpr_16[0], &fx->fx_st[i].fpr_16[0], + sizeof (fn->f_st[0].fpr_16)); /* 80-bit x87-style floats */ + + /* + * synthesize compressed tag bits + */ + fx->fx_fctw = 0; + for (tagbits = fn->f_ftw, i = 0; i < 8; i++, tagbits >>= 2) + if ((tagbits & 3) != 3) + fx->fx_fctw |= (1 << i); + + fx->fx_fop = fn->f_fop; + + fx->fx_rip = (uint64_t)fn->f_eip; + fx->fx_rdp = (uint64_t)fn->f_dp; +} + +/* + * Map from an fxsave-format save area to an fnsave-format save area. 
+ */ +static void +fxsave_to_fnsave(const struct fxsave_state *fx, struct fnsave_state *fn) +{ + uint_t i, top, tagbits; + + fn->f_fcw = fx->fx_fcw; + fn->__f_ign0 = 0; + fn->f_fsw = fx->fx_fsw; + fn->__f_ign1 = 0; + + top = (fx->fx_fsw & FPS_TOP) >> 11; + + /* + * copy element by element (because of holes) + */ + for (i = 0; i < 8; i++) + bcopy(&fx->fx_st[i].fpr_16[0], &fn->f_st[i].fpr_16[0], + sizeof (fn->f_st[0].fpr_16)); /* 80-bit x87-style floats */ + + /* + * synthesize uncompressed tag bits + */ + fn->f_ftw = 0; + for (tagbits = fx->fx_fctw, i = 0; i < 8; i++, tagbits >>= 1) { + uint_t ibit, expo; + const uint16_t *fpp; + static const uint16_t zero[5] = { 0, 0, 0, 0, 0 }; + + if ((tagbits & 1) == 0) { + fn->f_ftw |= 3 << (i << 1); /* empty */ + continue; + } + + /* + * (tags refer to *physical* registers) + */ + fpp = &fx->fx_st[(i - top + 8) & 7].fpr_16[0]; + ibit = fpp[3] >> 15; + expo = fpp[4] & 0x7fff; + + if (ibit && expo != 0 && expo != 0x7fff) + continue; /* valid fp number */ + + if (bcmp(fpp, &zero, sizeof (zero))) + fn->f_ftw |= 2 << (i << 1); /* NaN */ + else + fn->f_ftw |= 1 << (i << 1); /* fp zero */ + } + + fn->f_fop = fx->fx_fop; + + fn->__f_ign2 = 0; + fn->f_eip = (uint32_t)fx->fx_rip; + fn->f_cs = U32CS_SEL; + fn->f_dp = (uint32_t)fx->fx_rdp; + fn->f_ds = UDS_SEL; + fn->__f_ign3 = 0; +} + +/* + * Map from an fpregset_t into an fxsave-format save area + */ +static void +fpregset_to_fxsave(const fpregset_t *fp, struct fxsave_state *fx) +{ + bcopy(fp, fx, sizeof (*fx)); + /* + * avoid useless #gp exceptions - mask reserved bits + */ + fx->fx_mxcsr &= sse_mxcsr_mask; +} + +/* + * Map from an fxsave-format save area into a fpregset_t + */ +static void +fxsave_to_fpregset(const struct fxsave_state *fx, fpregset_t *fp) +{ + bcopy(fx, fp, sizeof (*fx)); +} + +#if defined(_SYSCALL32_IMPL) +static void +fpregset32_to_fxsave(const fpregset32_t *fp, struct fxsave_state *fx) +{ + const struct fpchip32_state *fc = &fp->fp_reg_set.fpchip_state; + + fnsave_to_fxsave((const struct fnsave_state *)fc, fx); + /* + * avoid useless #gp exceptions - mask reserved bits + */ + fx->fx_mxcsr = sse_mxcsr_mask & fc->mxcsr; + bcopy(&fc->xmm[0], &fx->fx_xmm[0], sizeof (fc->xmm)); +} + +static void +fxsave_to_fpregset32(const struct fxsave_state *fx, fpregset32_t *fp) +{ + struct fpchip32_state *fc = &fp->fp_reg_set.fpchip_state; + + fxsave_to_fnsave(fx, (struct fnsave_state *)fc); + fc->mxcsr = fx->fx_mxcsr; + bcopy(&fx->fx_xmm[0], &fc->xmm[0], sizeof (fc->xmm)); +} + +static void +fpregset_nto32(const fpregset_t *src, fpregset32_t *dst) +{ + fxsave_to_fpregset32((struct fxsave_state *)src, dst); + dst->fp_reg_set.fpchip_state.status = + src->fp_reg_set.fpchip_state.status; + dst->fp_reg_set.fpchip_state.xstatus = + src->fp_reg_set.fpchip_state.xstatus; +} + +static void +fpregset_32ton(const fpregset32_t *src, fpregset_t *dst) +{ + fpregset32_to_fxsave(src, (struct fxsave_state *)dst); + dst->fp_reg_set.fpchip_state.status = + src->fp_reg_set.fpchip_state.status; + dst->fp_reg_set.fpchip_state.xstatus = + src->fp_reg_set.fpchip_state.xstatus; +} +#endif + +/* + * Set floating-point registers from a native fpregset_t. + */ +void +setfpregs(klwp_t *lwp, fpregset_t *fp) +{ + struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; + + if (fpu->fpu_flags & FPU_EN) { + if (!(fpu->fpu_flags & FPU_VALID)) { + /* + * FPU context is still active, release the + * ownership. 
+ */ + fp_free(fpu, 0); + } + } + /* + * Else: if we are trying to change the FPU state of a thread which + * hasn't yet initialized floating point, store the state in + * the pcb and indicate that the state is valid. When the + * thread enables floating point, it will use this state instead + * of the default state. + */ + + switch (fp_save_mech) { + case FP_FXSAVE: + fpregset_to_fxsave(fp, fpu->fpu_regs.kfpu_u.kfpu_fx); + fpu->fpu_regs.kfpu_xstatus = + fp->fp_reg_set.fpchip_state.xstatus; + break; + + case FP_XSAVE: + fpregset_to_fxsave(fp, + &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave); + fpu->fpu_regs.kfpu_xstatus = + fp->fp_reg_set.fpchip_state.xstatus; + fpu->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= + (XFEATURE_LEGACY_FP | XFEATURE_SSE); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } + + fpu->fpu_regs.kfpu_status = fp->fp_reg_set.fpchip_state.status; + fpu->fpu_flags |= FPU_VALID; + PCB_SET_UPDATE_FPU(&lwp->lwp_pcb); +} + +/* + * Get floating-point registers into a native fpregset_t. + */ +void +getfpregs(klwp_t *lwp, fpregset_t *fp) +{ + struct fpu_ctx *fpu = &lwp->lwp_pcb.pcb_fpu; + + kpreempt_disable(); + if (fpu->fpu_flags & FPU_EN) { + /* + * If we have FPU hw and the thread's pcb doesn't have + * a valid FPU state then get the state from the hw. + */ + if (fpu_exists && ttolwp(curthread) == lwp && + !(fpu->fpu_flags & FPU_VALID)) + fp_save(fpu); /* get the current FPU state */ + } + + /* + * There are 3 possible cases we have to be aware of here: + * + * 1. FPU is enabled. FPU state is stored in the current LWP. + * + * 2. FPU is not enabled, and there have been no intervening /proc + * modifications. Return initial FPU state. + * + * 3. FPU is not enabled, but a /proc consumer has modified FPU state. + * FPU state is stored in the current LWP. + */ + if ((fpu->fpu_flags & FPU_EN) || (fpu->fpu_flags & FPU_VALID)) { + /* + * Cases 1 and 3. + */ + switch (fp_save_mech) { + case FP_FXSAVE: + fxsave_to_fpregset(fpu->fpu_regs.kfpu_u.kfpu_fx, fp); + fp->fp_reg_set.fpchip_state.xstatus = + fpu->fpu_regs.kfpu_xstatus; + break; + case FP_XSAVE: + fxsave_to_fpregset( + &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave, fp); + fp->fp_reg_set.fpchip_state.xstatus = + fpu->fpu_regs.kfpu_xstatus; + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } + fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status; + } else { + /* + * Case 2. + */ + switch (fp_save_mech) { + case FP_FXSAVE: + case FP_XSAVE: + /* + * For now, we don't have any AVX specific field in ABI. + * If we add any in the future, we need to initial them + * as well. + */ + fxsave_to_fpregset(&sse_initial, fp); + fp->fp_reg_set.fpchip_state.xstatus = + fpu->fpu_regs.kfpu_xstatus; + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } + fp->fp_reg_set.fpchip_state.status = fpu->fpu_regs.kfpu_status; + } + kpreempt_enable(); +} + +#if defined(_SYSCALL32_IMPL) + +/* + * Set floating-point registers from an fpregset32_t. + */ +void +setfpregs32(klwp_t *lwp, fpregset32_t *fp) +{ + fpregset_t fpregs; + + fpregset_32ton(fp, &fpregs); + setfpregs(lwp, &fpregs); +} + +/* + * Get floating-point registers into an fpregset32_t. 
+ */ +void +getfpregs32(klwp_t *lwp, fpregset32_t *fp) +{ + fpregset_t fpregs; + + getfpregs(lwp, &fpregs); + fpregset_nto32(&fpregs, fp); +} + +#endif /* _SYSCALL32_IMPL */ + +/* + * Return the general registers + */ +void +getgregs(klwp_t *lwp, gregset_t grp) +{ + struct regs *rp = lwptoregs(lwp); + struct pcb *pcb = &lwp->lwp_pcb; + int thisthread = lwptot(lwp) == curthread; + + grp[REG_RDI] = rp->r_rdi; + grp[REG_RSI] = rp->r_rsi; + grp[REG_RDX] = rp->r_rdx; + grp[REG_RCX] = rp->r_rcx; + grp[REG_R8] = rp->r_r8; + grp[REG_R9] = rp->r_r9; + grp[REG_RAX] = rp->r_rax; + grp[REG_RBX] = rp->r_rbx; + grp[REG_RBP] = rp->r_rbp; + grp[REG_R10] = rp->r_r10; + grp[REG_R11] = rp->r_r11; + grp[REG_R12] = rp->r_r12; + grp[REG_R13] = rp->r_r13; + grp[REG_R14] = rp->r_r14; + grp[REG_R15] = rp->r_r15; + grp[REG_FSBASE] = pcb->pcb_fsbase; + grp[REG_GSBASE] = pcb->pcb_gsbase; + if (thisthread) + kpreempt_disable(); + if (PCB_NEED_UPDATE_SEGS(pcb)) { + grp[REG_DS] = pcb->pcb_ds; + grp[REG_ES] = pcb->pcb_es; + grp[REG_FS] = pcb->pcb_fs; + grp[REG_GS] = pcb->pcb_gs; + } else { + grp[REG_DS] = rp->r_ds; + grp[REG_ES] = rp->r_es; + grp[REG_FS] = rp->r_fs; + grp[REG_GS] = rp->r_gs; + } + if (thisthread) + kpreempt_enable(); + grp[REG_TRAPNO] = rp->r_trapno; + grp[REG_ERR] = rp->r_err; + grp[REG_RIP] = rp->r_rip; + grp[REG_CS] = rp->r_cs; + grp[REG_SS] = rp->r_ss; + grp[REG_RFL] = rp->r_rfl; + grp[REG_RSP] = rp->r_rsp; +} + +#if defined(_SYSCALL32_IMPL) + +void +getgregs32(klwp_t *lwp, gregset32_t grp) +{ + struct regs *rp = lwptoregs(lwp); + struct pcb *pcb = &lwp->lwp_pcb; + int thisthread = lwptot(lwp) == curthread; + + if (thisthread) + kpreempt_disable(); + if (PCB_NEED_UPDATE_SEGS(pcb)) { + grp[GS] = (uint16_t)pcb->pcb_gs; + grp[FS] = (uint16_t)pcb->pcb_fs; + grp[DS] = (uint16_t)pcb->pcb_ds; + grp[ES] = (uint16_t)pcb->pcb_es; + } else { + grp[GS] = (uint16_t)rp->r_gs; + grp[FS] = (uint16_t)rp->r_fs; + grp[DS] = (uint16_t)rp->r_ds; + grp[ES] = (uint16_t)rp->r_es; + } + if (thisthread) + kpreempt_enable(); + grp[EDI] = (greg32_t)rp->r_rdi; + grp[ESI] = (greg32_t)rp->r_rsi; + grp[EBP] = (greg32_t)rp->r_rbp; + grp[ESP] = 0; + grp[EBX] = (greg32_t)rp->r_rbx; + grp[EDX] = (greg32_t)rp->r_rdx; + grp[ECX] = (greg32_t)rp->r_rcx; + grp[EAX] = (greg32_t)rp->r_rax; + grp[TRAPNO] = (greg32_t)rp->r_trapno; + grp[ERR] = (greg32_t)rp->r_err; + grp[EIP] = (greg32_t)rp->r_rip; + grp[CS] = (uint16_t)rp->r_cs; + grp[EFL] = (greg32_t)rp->r_rfl; + grp[UESP] = (greg32_t)rp->r_rsp; + grp[SS] = (uint16_t)rp->r_ss; +} + +void +ucontext_32ton(const ucontext32_t *src, ucontext_t *dst) +{ + mcontext_t *dmc = &dst->uc_mcontext; + const mcontext32_t *smc = &src->uc_mcontext; + + bzero(dst, sizeof (*dst)); + dst->uc_flags = src->uc_flags; + dst->uc_link = (ucontext_t *)(uintptr_t)src->uc_link; + + bcopy(&src->uc_sigmask, &dst->uc_sigmask, sizeof (dst->uc_sigmask)); + + dst->uc_stack.ss_sp = (void *)(uintptr_t)src->uc_stack.ss_sp; + dst->uc_stack.ss_size = (size_t)src->uc_stack.ss_size; + dst->uc_stack.ss_flags = src->uc_stack.ss_flags; + + dmc->gregs[REG_GS] = (greg_t)(uint32_t)smc->gregs[GS]; + dmc->gregs[REG_FS] = (greg_t)(uint32_t)smc->gregs[FS]; + dmc->gregs[REG_ES] = (greg_t)(uint32_t)smc->gregs[ES]; + dmc->gregs[REG_DS] = (greg_t)(uint32_t)smc->gregs[DS]; + dmc->gregs[REG_RDI] = (greg_t)(uint32_t)smc->gregs[EDI]; + dmc->gregs[REG_RSI] = (greg_t)(uint32_t)smc->gregs[ESI]; + dmc->gregs[REG_RBP] = (greg_t)(uint32_t)smc->gregs[EBP]; + dmc->gregs[REG_RBX] = (greg_t)(uint32_t)smc->gregs[EBX]; + dmc->gregs[REG_RDX] = 
(greg_t)(uint32_t)smc->gregs[EDX]; + dmc->gregs[REG_RCX] = (greg_t)(uint32_t)smc->gregs[ECX]; + dmc->gregs[REG_RAX] = (greg_t)(uint32_t)smc->gregs[EAX]; + dmc->gregs[REG_TRAPNO] = (greg_t)(uint32_t)smc->gregs[TRAPNO]; + dmc->gregs[REG_ERR] = (greg_t)(uint32_t)smc->gregs[ERR]; + dmc->gregs[REG_RIP] = (greg_t)(uint32_t)smc->gregs[EIP]; + dmc->gregs[REG_CS] = (greg_t)(uint32_t)smc->gregs[CS]; + dmc->gregs[REG_RFL] = (greg_t)(uint32_t)smc->gregs[EFL]; + dmc->gregs[REG_RSP] = (greg_t)(uint32_t)smc->gregs[UESP]; + dmc->gregs[REG_SS] = (greg_t)(uint32_t)smc->gregs[SS]; + + /* + * A valid fpregs is only copied in if uc.uc_flags has UC_FPU set + * otherwise there is no guarantee that anything in fpregs is valid. + */ + if (src->uc_flags & UC_FPU) + fpregset_32ton(&src->uc_mcontext.fpregs, + &dst->uc_mcontext.fpregs); +} + +#endif /* _SYSCALL32_IMPL */ + +/* + * Return the user-level PC. + * If in a system call, return the address of the syscall trap. + */ +greg_t +getuserpc() +{ + greg_t upc = lwptoregs(ttolwp(curthread))->r_pc; + uint32_t insn; + + if (curthread->t_sysnum == 0) + return (upc); + + /* + * We might've gotten here from sysenter (0xf 0x34), + * syscall (0xf 0x5) or lcall (0x9a 0 0 0 0 0x27 0). + * + * Go peek at the binary to figure it out.. + */ + if (fuword32((void *)(upc - 2), &insn) != -1 && + (insn & 0xffff) == 0x340f || (insn & 0xffff) == 0x050f) + return (upc - 2); + return (upc - 7); +} + +/* + * Protect segment registers from non-user privilege levels and GDT selectors + * other than USER_CS, USER_DS and lwp FS and GS values. If the segment + * selector is non-null and not USER_CS/USER_DS, we make sure that the + * TI bit is set to point into the LDT and that the RPL is set to 3. + * + * Since struct regs stores each 16-bit segment register as a 32-bit greg_t, we + * also explicitly zero the top 16 bits since they may be coming from the + * user's address space via setcontext(2) or /proc. + * + * Note about null selector. When running on the hypervisor if we allow a + * process to set its %cs to null selector with RPL of 0 the hypervisor will + * crash the domain. If running on bare metal we would get a #gp fault and + * be able to kill the process and continue on. Therefore we make sure to + * force RPL to SEL_UPL even for null selector when setting %cs. + */ + +#if defined(IS_CS) || defined(IS_NOT_CS) +#error "IS_CS and IS_NOT_CS already defined" +#endif + +#define IS_CS 1 +#define IS_NOT_CS 0 + +/*ARGSUSED*/ +static greg_t +fix_segreg(greg_t sr, int iscs, model_t datamodel) +{ + switch (sr &= 0xffff) { + + case 0: + if (iscs == IS_CS) + return (0 | SEL_UPL); + else + return (0); + + /* + * If lwp attempts to switch data model then force their + * code selector to be null selector. + */ + case U32CS_SEL: + if (datamodel == DATAMODEL_NATIVE) + return (0 | SEL_UPL); + else + return (sr); + + case UCS_SEL: + if (datamodel == DATAMODEL_ILP32) + return (0 | SEL_UPL); + /*FALLTHROUGH*/ + case UDS_SEL: + case LWPFS_SEL: + case LWPGS_SEL: + case SEL_UPL: + return (sr); + default: + break; + } + + /* + * Force it into the LDT in ring 3 for 32-bit processes, which by + * default do not have an LDT, so that any attempt to use an invalid + * selector will reference the (non-existant) LDT, and cause a #gp + * fault for the process. + * + * 64-bit processes get the null gdt selector since they + * are not allowed to have a private LDT. 
+ */ + if (datamodel == DATAMODEL_ILP32) { + return (sr | SEL_TI_LDT | SEL_UPL); + } else { + if (iscs == IS_CS) + return (0 | SEL_UPL); + else + return (0); + } + +} + +/* + * Set general registers. + */ +void +setgregs(klwp_t *lwp, gregset_t grp) +{ + struct regs *rp = lwptoregs(lwp); + model_t datamodel = lwp_getdatamodel(lwp); + + struct pcb *pcb = &lwp->lwp_pcb; + int thisthread = lwptot(lwp) == curthread; + + if (datamodel == DATAMODEL_NATIVE) { + if (thisthread) + (void) save_syscall_args(); /* copy the args */ + + rp->r_rdi = grp[REG_RDI]; + rp->r_rsi = grp[REG_RSI]; + rp->r_rdx = grp[REG_RDX]; + rp->r_rcx = grp[REG_RCX]; + rp->r_r8 = grp[REG_R8]; + rp->r_r9 = grp[REG_R9]; + rp->r_rax = grp[REG_RAX]; + rp->r_rbx = grp[REG_RBX]; + rp->r_rbp = grp[REG_RBP]; + rp->r_r10 = grp[REG_R10]; + rp->r_r11 = grp[REG_R11]; + rp->r_r12 = grp[REG_R12]; + rp->r_r13 = grp[REG_R13]; + rp->r_r14 = grp[REG_R14]; + rp->r_r15 = grp[REG_R15]; + rp->r_trapno = grp[REG_TRAPNO]; + rp->r_err = grp[REG_ERR]; + rp->r_rip = grp[REG_RIP]; + /* + * Setting %cs or %ss to anything else is quietly but + * quite definitely forbidden! + */ + rp->r_cs = UCS_SEL; + rp->r_ss = UDS_SEL; + rp->r_rsp = grp[REG_RSP]; + + if (thisthread) + kpreempt_disable(); + + pcb->pcb_ds = UDS_SEL; + pcb->pcb_es = UDS_SEL; + + /* + * 64-bit processes -are- allowed to set their fsbase/gsbase + * values directly, but only if they're using the segment + * selectors that allow that semantic. + * + * (32-bit processes must use lwp_set_private().) + */ + pcb->pcb_fsbase = grp[REG_FSBASE]; + pcb->pcb_gsbase = grp[REG_GSBASE]; + pcb->pcb_fs = fix_segreg(grp[REG_FS], IS_NOT_CS, datamodel); + pcb->pcb_gs = fix_segreg(grp[REG_GS], IS_NOT_CS, datamodel); + + /* + * Ensure that we go out via update_sregs + */ + PCB_SET_UPDATE_SEGS(pcb); + lwptot(lwp)->t_post_sys = 1; + if (thisthread) + kpreempt_enable(); +#if defined(_SYSCALL32_IMPL) + } else { + rp->r_rdi = (uint32_t)grp[REG_RDI]; + rp->r_rsi = (uint32_t)grp[REG_RSI]; + rp->r_rdx = (uint32_t)grp[REG_RDX]; + rp->r_rcx = (uint32_t)grp[REG_RCX]; + rp->r_rax = (uint32_t)grp[REG_RAX]; + rp->r_rbx = (uint32_t)grp[REG_RBX]; + rp->r_rbp = (uint32_t)grp[REG_RBP]; + rp->r_trapno = (uint32_t)grp[REG_TRAPNO]; + rp->r_err = (uint32_t)grp[REG_ERR]; + rp->r_rip = (uint32_t)grp[REG_RIP]; + + rp->r_cs = fix_segreg(grp[REG_CS], IS_CS, datamodel); + rp->r_ss = fix_segreg(grp[REG_DS], IS_NOT_CS, datamodel); + + rp->r_rsp = (uint32_t)grp[REG_RSP]; + + if (thisthread) + kpreempt_disable(); + + pcb->pcb_ds = fix_segreg(grp[REG_DS], IS_NOT_CS, datamodel); + pcb->pcb_es = fix_segreg(grp[REG_ES], IS_NOT_CS, datamodel); + + /* + * (See fsbase/gsbase commentary above) + */ + pcb->pcb_fs = fix_segreg(grp[REG_FS], IS_NOT_CS, datamodel); + pcb->pcb_gs = fix_segreg(grp[REG_GS], IS_NOT_CS, datamodel); + + /* + * Ensure that we go out via update_sregs + */ + PCB_SET_UPDATE_SEGS(pcb); + lwptot(lwp)->t_post_sys = 1; + if (thisthread) + kpreempt_enable(); +#endif + } + + /* + * Only certain bits of the flags register can be modified. + */ + rp->r_rfl = (rp->r_rfl & ~PSL_USERMASK) | + (grp[REG_RFL] & PSL_USERMASK); +} + +/* + * Determine whether eip is likely to have an interrupt frame + * on the stack. We do this by comparing the address to the + * range of addresses spanned by several well-known routines. 
+ */ +extern void _interrupt(); +extern void _allsyscalls(); +extern void _cmntrap(); +extern void fakesoftint(); + +extern size_t _interrupt_size; +extern size_t _allsyscalls_size; +extern size_t _cmntrap_size; +extern size_t _fakesoftint_size; + +/* + * Get a pc-only stacktrace. Used for kmem_alloc() buffer ownership tracking. + * Returns MIN(current stack depth, pcstack_limit). + */ +int +getpcstack(pc_t *pcstack, int pcstack_limit) +{ + struct frame *fp = (struct frame *)getfp(); + struct frame *nextfp, *minfp, *stacktop; + int depth = 0; + int on_intr; + uintptr_t pc; + + if ((on_intr = CPU_ON_INTR(CPU)) != 0) + stacktop = (struct frame *)(CPU->cpu_intr_stack + SA(MINFRAME)); + else + stacktop = (struct frame *)curthread->t_stk; + minfp = fp; + + pc = ((struct regs *)fp)->r_pc; + + while (depth < pcstack_limit) { + nextfp = (struct frame *)fp->fr_savfp; + pc = fp->fr_savpc; + if (nextfp <= minfp || nextfp >= stacktop) { + if (on_intr) { + /* + * Hop from interrupt stack to thread stack. + */ + stacktop = (struct frame *)curthread->t_stk; + minfp = (struct frame *)curthread->t_stkbase; + on_intr = 0; + continue; + } + break; + } + pcstack[depth++] = (pc_t)pc; + fp = nextfp; + minfp = fp; + } + return (depth); +} + +/* + * The following ELF header fields are defined as processor-specific + * in the V8 ABI: + * + * e_ident[EI_DATA] encoding of the processor-specific + * data in the object file + * e_machine processor identification + * e_flags processor-specific flags associated + * with the file + */ + +/* + * The value of at_flags reflects a platform's cpu module support. + * at_flags is used to check for allowing a binary to execute and + * is passed as the value of the AT_FLAGS auxiliary vector. + */ +int at_flags = 0; + +/* + * Check the processor-specific fields of an ELF header. + * + * returns 1 if the fields are valid, 0 otherwise + */ +/*ARGSUSED2*/ +int +elfheadcheck( + unsigned char e_data, + Elf32_Half e_machine, + Elf32_Word e_flags) +{ + if (e_data != ELFDATA2LSB) + return (0); + if (e_machine == EM_AMD64) + return (1); + return (e_machine == EM_386); +} + +uint_t auxv_hwcap_include = 0; /* patch to enable unrecognized features */ +uint_t auxv_hwcap_include_2 = 0; /* second word */ +uint_t auxv_hwcap_exclude = 0; /* patch for broken cpus, debugging */ +uint_t auxv_hwcap_exclude_2 = 0; /* second word */ +#if defined(_SYSCALL32_IMPL) +uint_t auxv_hwcap32_include = 0; /* ditto for 32-bit apps */ +uint_t auxv_hwcap32_include_2 = 0; /* ditto for 32-bit apps */ +uint_t auxv_hwcap32_exclude = 0; /* ditto for 32-bit apps */ +uint_t auxv_hwcap32_exclude_2 = 0; /* ditto for 32-bit apps */ +#endif + +/* + * Gather information about the processor and place it into auxv_hwcap + * so that it can be exported to the linker via the aux vector. + * + * We use this seemingly complicated mechanism so that we can ensure + * that /etc/system can be used to override what the system can or + * cannot discover for itself. + */ +void +bind_hwcap(void) +{ + uint_t cpu_hwcap_flags[2]; + cpuid_pass4(NULL, cpu_hwcap_flags); + + auxv_hwcap = (auxv_hwcap_include | cpu_hwcap_flags[0]) & + ~auxv_hwcap_exclude; + auxv_hwcap_2 = (auxv_hwcap_include_2 | cpu_hwcap_flags[1]) & + ~auxv_hwcap_exclude_2; + + /* + * On AMD processors, sysenter just doesn't work at all + * when the kernel is in long mode. On IA-32e processors + * it does, but there's no real point in all the alternate + * mechanism when syscall works on both. + * + * Besides, the kernel's sysenter handler is expecting a + * 32-bit lwp ... 
+ */ + auxv_hwcap &= ~AV_386_SEP; + + if (auxv_hwcap_include || auxv_hwcap_exclude || auxv_hwcap_include_2 || + auxv_hwcap_exclude_2) { + /* + * The below assignment is regrettably required to get lint + * to accept the validity of our format string. The format + * string is in fact valid, but whatever intelligence in lint + * understands the cmn_err()-specific %b appears to have an + * off-by-one error: it (mistakenly) complains about bit + * number 32 (even though this is explicitly permitted). + * Normally, one would will away such warnings with a "LINTED" + * directive, but for reasons unclear and unknown, lint + * refuses to be assuaged in this case. Fortunately, lint + * doesn't pretend to have solved the Halting Problem -- + * and as soon as the format string is programmatic, it + * knows enough to shut up. + */ + char *fmt = "?user ABI extensions: %b\n"; + cmn_err(CE_CONT, fmt, auxv_hwcap, FMT_AV_386); + fmt = "?user ABI extensions (word 2): %b\n"; + cmn_err(CE_CONT, fmt, auxv_hwcap_2, FMT_AV_386_2); + } + +#if defined(_SYSCALL32_IMPL) + auxv_hwcap32 = (auxv_hwcap32_include | cpu_hwcap_flags[0]) & + ~auxv_hwcap32_exclude; + auxv_hwcap32_2 = (auxv_hwcap32_include_2 | cpu_hwcap_flags[1]) & + ~auxv_hwcap32_exclude_2; + + /* + * If this is an amd64 architecture machine from Intel, then + * syscall -doesn't- work in compatibility mode, only sysenter does. + * + * Sigh. + */ + if (!cpuid_syscall32_insn(NULL)) + auxv_hwcap32 &= ~AV_386_AMD_SYSC; + + /* + * 32-bit processes can -always- use the lahf/sahf instructions + */ + auxv_hwcap32 |= AV_386_AHF; + + /* + * 32-bit processes can -never- use fsgsbase instructions. + */ + auxv_hwcap32_2 &= ~AV_386_2_FSGSBASE; + + if (auxv_hwcap32_include || auxv_hwcap32_exclude || + auxv_hwcap32_include_2 || auxv_hwcap32_exclude_2) { + /* + * See the block comment in the cmn_err() of auxv_hwcap, above. + */ + char *fmt = "?32-bit user ABI extensions: %b\n"; + cmn_err(CE_CONT, fmt, auxv_hwcap32, FMT_AV_386); + fmt = "?32-bit user ABI extensions (word 2): %b\n"; + cmn_err(CE_CONT, fmt, auxv_hwcap32_2, FMT_AV_386_2); + } +#endif +} + +/* + * sync_icache() - this is called + * in proc/fs/prusrio.c. x86 has an unified cache and therefore + * this is a nop. + */ +/* ARGSUSED */ +void +sync_icache(caddr_t addr, uint_t len) +{ + /* Do nothing for now */ +} + +/*ARGSUSED*/ +void +sync_data_memory(caddr_t va, size_t len) +{ + /* Not implemented for this platform */ +} + +int +__ipltospl(int ipl) +{ + return (ipltospl(ipl)); +} + +/* + * The panic code invokes panic_saveregs() to record the contents of a + * regs structure into the specified panic_data structure for debuggers. 
+ */ +void +panic_saveregs(panic_data_t *pdp, struct regs *rp) +{ + panic_nv_t *pnv = PANICNVGET(pdp); + + struct cregs creg; + + getcregs(&creg); + + PANICNVADD(pnv, "rdi", rp->r_rdi); + PANICNVADD(pnv, "rsi", rp->r_rsi); + PANICNVADD(pnv, "rdx", rp->r_rdx); + PANICNVADD(pnv, "rcx", rp->r_rcx); + PANICNVADD(pnv, "r8", rp->r_r8); + PANICNVADD(pnv, "r9", rp->r_r9); + PANICNVADD(pnv, "rax", rp->r_rax); + PANICNVADD(pnv, "rbx", rp->r_rbx); + PANICNVADD(pnv, "rbp", rp->r_rbp); + PANICNVADD(pnv, "r10", rp->r_r10); + PANICNVADD(pnv, "r11", rp->r_r11); + PANICNVADD(pnv, "r12", rp->r_r12); + PANICNVADD(pnv, "r13", rp->r_r13); + PANICNVADD(pnv, "r14", rp->r_r14); + PANICNVADD(pnv, "r15", rp->r_r15); + PANICNVADD(pnv, "fsbase", rdmsr(MSR_AMD_FSBASE)); + PANICNVADD(pnv, "gsbase", rdmsr(MSR_AMD_GSBASE)); + PANICNVADD(pnv, "ds", rp->r_ds); + PANICNVADD(pnv, "es", rp->r_es); + PANICNVADD(pnv, "fs", rp->r_fs); + PANICNVADD(pnv, "gs", rp->r_gs); + PANICNVADD(pnv, "trapno", rp->r_trapno); + PANICNVADD(pnv, "err", rp->r_err); + PANICNVADD(pnv, "rip", rp->r_rip); + PANICNVADD(pnv, "cs", rp->r_cs); + PANICNVADD(pnv, "rflags", rp->r_rfl); + PANICNVADD(pnv, "rsp", rp->r_rsp); + PANICNVADD(pnv, "ss", rp->r_ss); + PANICNVADD(pnv, "gdt_hi", (uint64_t)(creg.cr_gdt._l[3])); + PANICNVADD(pnv, "gdt_lo", (uint64_t)(creg.cr_gdt._l[0])); + PANICNVADD(pnv, "idt_hi", (uint64_t)(creg.cr_idt._l[3])); + PANICNVADD(pnv, "idt_lo", (uint64_t)(creg.cr_idt._l[0])); + + PANICNVADD(pnv, "ldt", creg.cr_ldt); + PANICNVADD(pnv, "task", creg.cr_task); + PANICNVADD(pnv, "cr0", creg.cr_cr0); + PANICNVADD(pnv, "cr2", creg.cr_cr2); + PANICNVADD(pnv, "cr3", creg.cr_cr3); + if (creg.cr_cr4) + PANICNVADD(pnv, "cr4", creg.cr_cr4); + + PANICNVSET(pdp, pnv); +} + +#define TR_ARG_MAX 6 /* Max args to print, same as SPARC */ + + +/* + * Print a stack backtrace using the specified frame pointer. We delay two + * seconds before continuing, unless this is the panic traceback. + * If we are in the process of panicking, we also attempt to write the + * stack backtrace to a staticly assigned buffer, to allow the panic + * code to find it and write it in to uncompressed pages within the + * system crash dump. + * Note that the frame for the starting stack pointer value is omitted because + * the corresponding %eip is not known. + */ + +extern char *dump_stack_scratch; + + +void +traceback(caddr_t fpreg) +{ + struct frame *fp = (struct frame *)fpreg; + struct frame *nextfp; + uintptr_t pc, nextpc; + ulong_t off; + char args[TR_ARG_MAX * 2 + 16], *sym; + uint_t offset = 0; + uint_t next_offset = 0; + char stack_buffer[1024]; + + if (!panicstr) + printf("traceback: %%fp = %p\n", (void *)fp); + + if (panicstr && !dump_stack_scratch) { + printf("Warning - stack not written to the dump buffer\n"); + } + + fp = (struct frame *)plat_traceback(fpreg); + if ((uintptr_t)fp < KERNELBASE) + goto out; + + pc = fp->fr_savpc; + fp = (struct frame *)fp->fr_savfp; + + while ((uintptr_t)fp >= KERNELBASE) { + /* + * XX64 Until port is complete tolerate 8-byte aligned + * frame pointers but flag with a warning so they can + * be fixed. + */ + if (((uintptr_t)fp & (STACK_ALIGN - 1)) != 0) { + if (((uintptr_t)fp & (8 - 1)) == 0) { + printf(" >> warning! 
8-byte" + " aligned %%fp = %p\n", (void *)fp); + } else { + printf( + " >> mis-aligned %%fp = %p\n", (void *)fp); + break; + } + } + + args[0] = '\0'; + nextpc = (uintptr_t)fp->fr_savpc; + nextfp = (struct frame *)fp->fr_savfp; + if ((sym = kobj_getsymname(pc, &off)) != NULL) { + printf("%016lx %s:%s+%lx (%s)\n", (uintptr_t)fp, + mod_containing_pc((caddr_t)pc), sym, off, args); + (void) snprintf(stack_buffer, sizeof (stack_buffer), + "%s:%s+%lx (%s) | ", + mod_containing_pc((caddr_t)pc), sym, off, args); + } else { + printf("%016lx %lx (%s)\n", + (uintptr_t)fp, pc, args); + (void) snprintf(stack_buffer, sizeof (stack_buffer), + "%lx (%s) | ", pc, args); + } + + if (panicstr && dump_stack_scratch) { + next_offset = offset + strlen(stack_buffer); + if (next_offset < STACK_BUF_SIZE) { + bcopy(stack_buffer, dump_stack_scratch + offset, + strlen(stack_buffer)); + offset = next_offset; + } else { + /* + * In attempting to save the panic stack + * to the dumpbuf we have overflowed that area. + * Print a warning and continue to printf the + * stack to the msgbuf + */ + printf("Warning: stack in the dump buffer" + " may be incomplete\n"); + offset = next_offset; + } + } + + pc = nextpc; + fp = nextfp; + } +out: + if (!panicstr) { + printf("end of traceback\n"); + DELAY(2 * MICROSEC); + } else if (dump_stack_scratch) { + dump_stack_scratch[offset] = '\0'; + } +} + + +/* + * Generate a stack backtrace from a saved register set. + */ +void +traceregs(struct regs *rp) +{ + traceback((caddr_t)rp->r_fp); +} + +void +exec_set_sp(size_t stksize) +{ + klwp_t *lwp = ttolwp(curthread); + + lwptoregs(lwp)->r_sp = (uintptr_t)curproc->p_usrstack - stksize; +} + +hrtime_t +gethrtime_waitfree(void) +{ + return (dtrace_gethrtime()); +} + +hrtime_t +gethrtime(void) +{ + return (gethrtimef()); +} + +hrtime_t +gethrtime_unscaled(void) +{ + return (gethrtimeunscaledf()); +} + +void +scalehrtime(hrtime_t *hrt) +{ + scalehrtimef(hrt); +} + +uint64_t +unscalehrtime(hrtime_t nsecs) +{ + return (unscalehrtimef(nsecs)); +} + +void +gethrestime(timespec_t *tp) +{ + gethrestimef(tp); +} + +/* + * Part of the implementation of hres_tick(); this routine is + * easier in C than assembler .. called with the hres_lock held. 
+ * + * XX64 Many of these timekeeping variables need to be extern'ed in a header + */ + +#include <sys/time.h> +#include <sys/machlock.h> + +extern int one_sec; +extern int max_hres_adj; + +void +__adj_hrestime(void) +{ + long long adj; + + if (hrestime_adj == 0) + adj = 0; + else if (hrestime_adj > 0) { + if (hrestime_adj < max_hres_adj) + adj = hrestime_adj; + else + adj = max_hres_adj; + } else { + if (hrestime_adj < -max_hres_adj) + adj = -max_hres_adj; + else + adj = hrestime_adj; + } + + timedelta -= adj; + hrestime_adj = timedelta; + hrestime.tv_nsec += adj; + + while (hrestime.tv_nsec >= NANOSEC) { + one_sec++; + hrestime.tv_sec++; + hrestime.tv_nsec -= NANOSEC; + } +} + +/* + * Wrapper functions to maintain backwards compability + */ +int +xcopyin(const void *uaddr, void *kaddr, size_t count) +{ + return (xcopyin_nta(uaddr, kaddr, count, UIO_COPY_CACHED)); +} + +int +xcopyout(const void *kaddr, void *uaddr, size_t count) +{ + return (xcopyout_nta(kaddr, uaddr, count, UIO_COPY_CACHED)); +} diff --git a/usr/src/uts/intel/os/bootdev.c b/usr/src/uts/intel/os/bootdev.c new file mode 100644 index 0000000000..02f31efd56 --- /dev/null +++ b/usr/src/uts/intel/os/bootdev.c @@ -0,0 +1,100 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/modctl.h> +#include <sys/sunddi.h> + +/* internal global data */ +static struct modlmisc modlmisc = { + &mod_miscops, "bootdev misc module" +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlmisc, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * convert a prom device path to an equivalent path in /devices + * Does not deal with aliases. Does deal with pathnames which + * are not fully qualified. This routine is generalized + * to work across several flavors of OBP + */ +int +i_promname_to_devname(char *prom_name, char *ret_buf) +{ + if (prom_name == NULL || ret_buf == NULL || + (strlen(prom_name) >= MAXPATHLEN)) { + return (EINVAL); + } + if (i_ddi_prompath_to_devfspath(prom_name, ret_buf) != DDI_SUCCESS) + return (EINVAL); + + return (0); +} + +/* + * If bootstring contains a device path, we need to convert to a format + * the prom will understand. To do so, we convert the existing path to + * a prom-compatible path and return the value of new_path. If the + * caller specifies new_path as NULL, we allocate an appropriately + * sized new_path on behalf of the caller. 
If the caller invokes this + * function with new_path = NULL, they must do so from a context in + * which it is safe to perform a sleeping memory allocation. + * + * NOTE: Intel does not have a real PROM, so the implementation + * simply returns a copy of the string passed in. + */ +char * +i_convert_boot_device_name(char *cur_path, char *new_path, size_t *len) +{ + if (new_path != NULL) { + (void) snprintf(new_path, *len, "%s", cur_path); + return (new_path); + } else { + *len = strlen(cur_path) + 1; + new_path = kmem_alloc(*len, KM_SLEEP); + (void) snprintf(new_path, *len, "%s", cur_path); + return (new_path); + } +} diff --git a/usr/src/uts/intel/os/comm_page_util.c b/usr/src/uts/intel/os/comm_page_util.c new file mode 100644 index 0000000000..f286bee7f6 --- /dev/null +++ b/usr/src/uts/intel/os/comm_page_util.c @@ -0,0 +1,62 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + + +#include <sys/types.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <vm/as.h> +#include <vm/seg_umap.h> + +#if !defined(__xpv) +#include <sys/comm_page.h> +#endif /* !defined(__xpv) */ + +/* + * Map in the comm page. + * + * The contents of the comm page are only defined on non-xpv x86 at this time. + * Furthermore, the data is only valid in userspace (32-bit or 64-bit) when + * mapped from a 64-bit kernel. + * See: "uts/i86pc/sys/comm_page.h" + */ +caddr_t +comm_page_mapin() +{ +#if !defined(__xpv) + proc_t *p = curproc; + caddr_t addr = NULL; + size_t len = COMM_PAGE_SIZE; + uint_t prot = PROT_USER | PROT_READ; + segumap_crargs_t suarg; + + map_addr(&addr, len, (offset_t)0, 1, 0); + if (addr == NULL || valid_usr_range(addr, len, prot, p->p_as, + p->p_as->a_userlimit) != RANGE_OKAY) { + return (NULL); + } + + suarg.kaddr = (caddr_t)&comm_page; + suarg.prot = suarg.maxprot = prot; + if (as_map(p->p_as, addr, len, segumap_create, &suarg) != 0) { + return (NULL); + } + return (addr); +#else /* !defined(__xpv) */ + return (NULL); +#endif /* !defined(__xpv) */ +} diff --git a/usr/src/uts/intel/os/copy_subr.c b/usr/src/uts/intel/os/copy_subr.c new file mode 100644 index 0000000000..b69f052e68 --- /dev/null +++ b/usr/src/uts/intel/os/copy_subr.c @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Miscellaneous C routines for copying data around without + * descending into assembler. Compilers are pretty good at + * scheduling instructions, and humans are pretty hopeless at + * writing correct assembler. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/param.h> + +/* + * copyinstr_noerr and copyoutstr_noerr can be implemented completely + * in C on machines with shared user and kernel context. + */ +static int +copystr_nofault(const char *src, char *dst, size_t maxlength, + size_t *lencopied) +{ + int error = 0; + size_t leftover; + + if ((leftover = maxlength) == 0) + error = ENAMETOOLONG; + else + do { + leftover--; + if ((*dst++ = *src++) == '\0') + break; + if (leftover == 0) { + error = ENAMETOOLONG; + break; + } + /*CONSTCOND*/ + } while (1); + + if (lencopied) + *lencopied = maxlength - leftover; + return (error); +} + + +int +copyinstr_noerr(const char *uaddr, char *kaddr, size_t maxlength, + size_t *lencopied) +{ + char *ua = (char *)uaddr; + + ASSERT((uintptr_t)kaddr > kernelbase); + + if ((uintptr_t)ua > kernelbase) { + /* + * force fault at kernelbase + */ + ua = (char *)kernelbase; + } + return (copystr_nofault(ua, kaddr, maxlength, lencopied)); +} + +int +copyoutstr_noerr(const char *kaddr, char *uaddr, size_t maxlength, + size_t *lencopied) +{ + char *ua = (char *)uaddr; + + ASSERT((uintptr_t)kaddr > kernelbase); + + if ((uintptr_t)ua > kernelbase) { + /* + * force fault at kernelbase + */ + ua = (char *)kernelbase; + } + return (copystr_nofault(kaddr, ua, maxlength, lencopied)); +} diff --git a/usr/src/uts/intel/os/cpc_subr.c b/usr/src/uts/intel/os/cpc_subr.c new file mode 100644 index 0000000000..71e1ebaeee --- /dev/null +++ b/usr/src/uts/intel/os/cpc_subr.c @@ -0,0 +1,274 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2021 Joyent, Inc. + */ + +/* + * x86-specific routines used by the CPU Performance counter driver. 
+ */ + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/x86_archext.h> +#include <sys/cpuvar.h> +#include <sys/machcpuvar.h> +#include <sys/archsystm.h> +#include <sys/cpc_pcbe.h> +#include <sys/cpc_impl.h> +#include <sys/x_call.h> +#include <sys/cmn_err.h> +#include <sys/cmt.h> +#include <sys/spl.h> +#include <sys/apic.h> + +static const uint64_t allstopped = 0; +static kcpc_ctx_t *(*overflow_intr_handler)(caddr_t); + +/* Do threads share performance monitoring hardware? */ +static int strands_perfmon_shared = 0; + +int kcpc_hw_overflow_intr_installed; /* set by APIC code */ +extern kcpc_ctx_t *kcpc_overflow_intr(caddr_t arg, uint64_t bitmap); + +extern int kcpc_counts_include_idle; /* Project Private /etc/system variable */ + +void (*kcpc_hw_enable_cpc_intr)(void); /* set by APIC code */ + +int +kcpc_hw_add_ovf_intr(kcpc_ctx_t *(*handler)(caddr_t)) +{ + if (x86_type != X86_TYPE_P6) + return (0); + overflow_intr_handler = handler; + return (ipltospl(APIC_PCINT_IPL)); +} + +void +kcpc_hw_rem_ovf_intr(void) +{ + overflow_intr_handler = NULL; +} + +/* + * Hook used on P4 systems to catch online/offline events. + */ +/*ARGSUSED*/ +static int +kcpc_cpu_setup(cpu_setup_t what, int cpuid, void *arg) +{ + pg_cmt_t *chip_pg; + int active_cpus_cnt; + + if (what != CPU_ON) + return (0); + + /* + * If any CPU-bound contexts exist, we don't need to invalidate + * anything, as no per-LWP contexts can coexist. + */ + if (kcpc_cpuctx || dtrace_cpc_in_use) + return (0); + + /* + * If this chip now has more than 1 active cpu, we must invalidate all + * contexts in the system. + */ + chip_pg = (pg_cmt_t *)pghw_find_pg(cpu[cpuid], PGHW_CHIP); + if (chip_pg != NULL) { + active_cpus_cnt = GROUP_SIZE(&chip_pg->cmt_cpus_actv); + if (active_cpus_cnt > 1) + kcpc_invalidate_all(); + } + + return (0); +} + +static kmutex_t cpu_setup_lock; /* protects setup_registered */ +static int setup_registered; + + +void +kcpc_hw_init(cpu_t *cp) +{ + kthread_t *t = cp->cpu_idle_thread; + uint32_t versionid; + struct cpuid_regs cpuid; + + strands_perfmon_shared = 0; + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + if (cpuid_getvendor(cpu[0]) == X86_VENDOR_Intel) { + /* + * Intel processors that support Architectural + * Performance Monitoring Version 3 have per strand + * performance monitoring hardware. + * Hence we can allow use of performance counters on + * multiple strands on the same core simultaneously. + */ + cpuid.cp_eax = 0x0; + (void) __cpuid_insn(&cpuid); + if (cpuid.cp_eax < 0xa) { + strands_perfmon_shared = 1; + } else { + cpuid.cp_eax = 0xa; + (void) __cpuid_insn(&cpuid); + + versionid = cpuid.cp_eax & 0xFF; + if (versionid < 3) { + strands_perfmon_shared = 1; + } + } + } else if (cpuid_getvendor(cpu[0]) == X86_VENDOR_AMD || + cpuid_getvendor(cpu[0]) == X86_VENDOR_HYGON) { + /* + * On AMD systems with HT, all of the performance + * monitors exist on a per-logical CPU basis. 
+ */ + strands_perfmon_shared = 0; + } else { + strands_perfmon_shared = 1; + } + } + + if (strands_perfmon_shared) { + mutex_enter(&cpu_setup_lock); + if (setup_registered == 0) { + mutex_enter(&cpu_lock); + register_cpu_setup_func(kcpc_cpu_setup, NULL); + mutex_exit(&cpu_lock); + setup_registered = 1; + } + mutex_exit(&cpu_setup_lock); + } + + mutex_init(&cp->cpu_cpc_ctxlock, "cpu_cpc_ctxlock", MUTEX_DEFAULT, 0); + + if (kcpc_counts_include_idle) + return; + + installctx(t, cp, kcpc_idle_save, kcpc_idle_restore, + NULL, NULL, NULL, NULL, NULL); +} + +void +kcpc_hw_fini(cpu_t *cp) +{ + ASSERT(cp->cpu_idle_thread == NULL); + + mutex_destroy(&cp->cpu_cpc_ctxlock); +} + +#define BITS(v, u, l) \ + (((v) >> (l)) & ((1 << (1 + (u) - (l))) - 1)) + +#define PCBE_NAMELEN 30 /* Enough Room for pcbe.manuf.model.family.stepping */ + +/* + * Examine the processor and load an appropriate PCBE. + */ +int +kcpc_hw_load_pcbe(void) +{ + return (kcpc_pcbe_tryload(cpuid_getvendorstr(CPU), cpuid_getfamily(CPU), + cpuid_getmodel(CPU), cpuid_getstep(CPU))); +} + +/* + * Called by the generic framework to check if it's OK to bind a set to a CPU. + */ +int +kcpc_hw_cpu_hook(processorid_t cpuid, ulong_t *kcpc_cpumap) +{ + cpu_t *cpu, *p; + pg_t *chip_pg; + pg_cpu_itr_t itr; + + if (!strands_perfmon_shared) + return (0); + + /* + * Only one logical CPU on each Pentium 4 HT CPU may be bound to at + * once. + * + * This loop is protected by holding cpu_lock, in order to properly + * access the cpu_t of the desired cpu. + */ + mutex_enter(&cpu_lock); + if ((cpu = cpu_get(cpuid)) == NULL) { + mutex_exit(&cpu_lock); + return (-1); + } + + chip_pg = (pg_t *)pghw_find_pg(cpu, PGHW_CHIP); + + PG_CPU_ITR_INIT(chip_pg, itr); + while ((p = pg_cpu_next(&itr)) != NULL) { + if (p == cpu) + continue; + if (BT_TEST(kcpc_cpumap, p->cpu_id)) { + mutex_exit(&cpu_lock); + return (-1); + } + } + + mutex_exit(&cpu_lock); + return (0); +} + +/* + * Called by the generic framework to check if it's OK to bind a set to an LWP. + */ +int +kcpc_hw_lwp_hook(void) +{ + pg_cmt_t *chip; + group_t *chips; + group_iter_t i; + + if (!strands_perfmon_shared) + return (0); + + /* + * Only one CPU per chip may be online. + */ + mutex_enter(&cpu_lock); + + chips = pghw_set_lookup(PGHW_CHIP); + if (chips == NULL) { + mutex_exit(&cpu_lock); + return (0); + } + + group_iter_init(&i); + while ((chip = group_iterate(chips, &i)) != NULL) { + if (GROUP_SIZE(&chip->cmt_cpus_actv) > 1) { + mutex_exit(&cpu_lock); + return (-1); + } + } + + mutex_exit(&cpu_lock); + return (0); +} diff --git a/usr/src/uts/intel/os/ddi_i86.c b/usr/src/uts/intel/os/ddi_i86.c new file mode 100644 index 0000000000..f135d0673c --- /dev/null +++ b/usr/src/uts/intel/os/ddi_i86.c @@ -0,0 +1,1903 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2014 Garrett D'Amore <garrett@damore.org> + */ + +#include <sys/conf.h> +#include <sys/kmem.h> +#include <sys/ddi_impldefs.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ddifm.h> +#include <sys/fm/io/ddi.h> +#include <sys/fm/protocol.h> +#include <sys/ontrap.h> + + +/* + * DDI DMA Engine functions for x86. + * These functions are more naturally generic, but do not apply to SPARC. + */ + +int +ddi_dmae_alloc(dev_info_t *dip, int chnl, int (*dmae_waitfp)(), caddr_t arg) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_ACQUIRE, + (off_t *)dmae_waitfp, (size_t *)arg, + (caddr_t *)(uintptr_t)chnl, 0)); +} + +int +ddi_dmae_release(dev_info_t *dip, int chnl) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_FREE, 0, 0, + (caddr_t *)(uintptr_t)chnl, 0)); +} + +int +ddi_dmae_getattr(dev_info_t *dip, ddi_dma_attr_t *attrp) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_GETATTR, 0, 0, + (caddr_t *)attrp, 0)); +} + +int +ddi_dmae_1stparty(dev_info_t *dip, int chnl) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_1STPTY, 0, 0, + (caddr_t *)(uintptr_t)chnl, 0)); +} + +int +ddi_dmae_prog(dev_info_t *dip, struct ddi_dmae_req *dmaereqp, + ddi_dma_cookie_t *cookiep, int chnl) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_PROG, (off_t *)dmaereqp, + (size_t *)cookiep, (caddr_t *)(uintptr_t)chnl, 0)); +} + +int +ddi_dmae_swsetup(dev_info_t *dip, struct ddi_dmae_req *dmaereqp, + ddi_dma_cookie_t *cookiep, int chnl) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_SWSETUP, (off_t *)dmaereqp, + (size_t *)cookiep, (caddr_t *)(uintptr_t)chnl, 0)); +} + +int +ddi_dmae_swstart(dev_info_t *dip, int chnl) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_SWSTART, 0, 0, + (caddr_t *)(uintptr_t)chnl, 0)); +} + +int +ddi_dmae_stop(dev_info_t *dip, int chnl) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_STOP, 0, 0, + (caddr_t *)(uintptr_t)chnl, 0)); +} + +int +ddi_dmae_enable(dev_info_t *dip, int chnl) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_ENABLE, 0, 0, + (caddr_t *)(uintptr_t)chnl, 0)); +} + +int +ddi_dmae_disable(dev_info_t *dip, int chnl) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_DISABLE, 0, 0, + (caddr_t *)(uintptr_t)chnl, 0)); +} + +int +ddi_dmae_getcnt(dev_info_t *dip, int chnl, int *countp) +{ + return (ddi_dma_mctl(dip, dip, 0, DDI_DMA_E_GETCNT, 0, (size_t *)countp, + (caddr_t *)(uintptr_t)chnl, 0)); +} + +/* + * implementation specific access handle and routines: + */ + +static uintptr_t impl_acc_hdl_id = 0; + +/* + * access handle allocator + */ +ddi_acc_hdl_t * +impl_acc_hdl_get(ddi_acc_handle_t hdl) +{ + /* + * recast to ddi_acc_hdl_t instead of + * casting to ddi_acc_impl_t and then return the ah_platform_private + * + * this optimization based on the ddi_acc_hdl_t is the + * first member of the ddi_acc_impl_t. + */ + return ((ddi_acc_hdl_t *)hdl); +} + +ddi_acc_handle_t +impl_acc_hdl_alloc(int (*waitfp)(caddr_t), caddr_t arg) +{ + ddi_acc_impl_t *hp; + on_trap_data_t *otp; + int sleepflag; + + sleepflag = ((waitfp == (int (*)())KM_SLEEP) ? KM_SLEEP : KM_NOSLEEP); + /* + * Allocate and initialize the data access handle and error status. 
+ */ + if ((hp = kmem_zalloc(sizeof (ddi_acc_impl_t), sleepflag)) == NULL) + goto fail; + if ((hp->ahi_err = (ndi_err_t *)kmem_zalloc( + sizeof (ndi_err_t), sleepflag)) == NULL) { + kmem_free(hp, sizeof (ddi_acc_impl_t)); + goto fail; + } + if ((otp = (on_trap_data_t *)kmem_zalloc( + sizeof (on_trap_data_t), sleepflag)) == NULL) { + kmem_free(hp->ahi_err, sizeof (ndi_err_t)); + kmem_free(hp, sizeof (ddi_acc_impl_t)); + goto fail; + } + hp->ahi_err->err_ontrap = otp; + hp->ahi_common.ah_platform_private = (void *)hp; + + return ((ddi_acc_handle_t)hp); +fail: + if ((waitfp != (int (*)())KM_SLEEP) && + (waitfp != (int (*)())KM_NOSLEEP)) + ddi_set_callback(waitfp, arg, &impl_acc_hdl_id); + return (NULL); +} + +void +impl_acc_hdl_free(ddi_acc_handle_t handle) +{ + ddi_acc_impl_t *hp; + + /* + * The supplied (ddi_acc_handle_t) is actually a (ddi_acc_impl_t *), + * because that's what we allocated in impl_acc_hdl_alloc() above. + */ + hp = (ddi_acc_impl_t *)handle; + if (hp) { + kmem_free(hp->ahi_err->err_ontrap, sizeof (on_trap_data_t)); + kmem_free(hp->ahi_err, sizeof (ndi_err_t)); + kmem_free(hp, sizeof (ddi_acc_impl_t)); + if (impl_acc_hdl_id) + ddi_run_callback(&impl_acc_hdl_id); + } +} + +/* + * Function used to check if a given access handle owns the failing address. + * Called by ndi_fmc_error, when we detect a PIO error. + */ +/* ARGSUSED */ +static int +impl_acc_check(dev_info_t *dip, const void *handle, const void *addr, + const void *not_used) +{ + pfn_t pfn, fault_pfn; + ddi_acc_hdl_t *hp; + + hp = impl_acc_hdl_get((ddi_acc_handle_t)handle); + + ASSERT(hp); + + if (addr != NULL) { + pfn = hp->ah_pfn; + fault_pfn = mmu_btop(*(uint64_t *)addr); + if (fault_pfn >= pfn && fault_pfn < (pfn + hp->ah_pnum)) + return (DDI_FM_NONFATAL); + } + return (DDI_FM_UNKNOWN); +} + +void +impl_acc_err_init(ddi_acc_hdl_t *handlep) +{ + int fmcap; + ndi_err_t *errp; + on_trap_data_t *otp; + ddi_acc_impl_t *hp = (ddi_acc_impl_t *)handlep; + + fmcap = ddi_fm_capable(handlep->ah_dip); + + if (handlep->ah_acc.devacc_attr_version < DDI_DEVICE_ATTR_V1 || + !DDI_FM_ACC_ERR_CAP(fmcap)) { + handlep->ah_acc.devacc_attr_access = DDI_DEFAULT_ACC; + } else if (handlep->ah_acc.devacc_attr_access == DDI_FLAGERR_ACC && + hp->ahi_scan == NULL) { + handlep->ah_acc.devacc_attr_access = DDI_DEFAULT_ACC; + } else if (DDI_FM_ACC_ERR_CAP(fmcap)) { + if (handlep->ah_acc.devacc_attr_access == DDI_DEFAULT_ACC) { + if (handlep->ah_xfermodes) + return; + i_ddi_drv_ereport_post(handlep->ah_dip, DVR_EFMCAP, + NULL, DDI_NOSLEEP); + } else { + errp = hp->ahi_err; + otp = (on_trap_data_t *)errp->err_ontrap; + otp->ot_handle = (void *)(hp); + otp->ot_prot = OT_DATA_ACCESS; + errp->err_status = DDI_FM_OK; + errp->err_expected = DDI_FM_ERR_UNEXPECTED; + errp->err_cf = impl_acc_check; + } + } +} + +/* ARGSUSED */ +int +impl_dma_check(dev_info_t *dip, const void *handle, const void *pci_hdl, + const void *not_used) +{ + return (DDI_FM_UNKNOWN); +} + +void +impl_acc_hdl_init(ddi_acc_hdl_t *handlep) +{ + ddi_acc_impl_t *hp; + int fmcap; + int devacc_attr_access; + + if (!handlep) + return; + fmcap = ddi_fm_capable(handlep->ah_dip); + if (handlep->ah_acc.devacc_attr_version < DDI_DEVICE_ATTR_V1 || + !DDI_FM_ACC_ERR_CAP(fmcap)) + devacc_attr_access = DDI_DEFAULT_ACC; + else + devacc_attr_access = handlep->ah_acc.devacc_attr_access; + + hp = (ddi_acc_impl_t *)handlep->ah_platform_private; + + /* + * Can only do FLAGERR if scan callback is set up. This should + * also guarantee that the peekpoke_mutex and err_mutex are defined. 
+ */ + if (devacc_attr_access == DDI_FLAGERR_ACC && hp->ahi_scan == NULL) + devacc_attr_access = DDI_DEFAULT_ACC; + + switch (devacc_attr_access) { + case DDI_CAUTIOUS_ACC: + hp->ahi_get8 = i_ddi_caut_get8; + hp->ahi_put8 = i_ddi_caut_put8; + hp->ahi_rep_get8 = i_ddi_caut_rep_get8; + hp->ahi_rep_put8 = i_ddi_caut_rep_put8; + hp->ahi_get16 = i_ddi_caut_get16; + hp->ahi_get32 = i_ddi_caut_get32; + hp->ahi_put16 = i_ddi_caut_put16; + hp->ahi_put32 = i_ddi_caut_put32; + hp->ahi_rep_get16 = i_ddi_caut_rep_get16; + hp->ahi_rep_get32 = i_ddi_caut_rep_get32; + hp->ahi_rep_put16 = i_ddi_caut_rep_put16; + hp->ahi_rep_put32 = i_ddi_caut_rep_put32; + hp->ahi_get64 = i_ddi_caut_get64; + hp->ahi_put64 = i_ddi_caut_put64; + hp->ahi_rep_get64 = i_ddi_caut_rep_get64; + hp->ahi_rep_put64 = i_ddi_caut_rep_put64; + break; + case DDI_FLAGERR_ACC: + if (hp->ahi_acc_attr & DDI_ACCATTR_IO_SPACE) { + hp->ahi_get8 = i_ddi_prot_io_get8; + hp->ahi_put8 = i_ddi_prot_io_put8; + hp->ahi_rep_get8 = i_ddi_prot_io_rep_get8; + hp->ahi_rep_put8 = i_ddi_prot_io_rep_put8; + + /* temporary set these 64 functions to no-ops */ + hp->ahi_get64 = i_ddi_io_get64; + hp->ahi_put64 = i_ddi_io_put64; + hp->ahi_rep_get64 = i_ddi_io_rep_get64; + hp->ahi_rep_put64 = i_ddi_io_rep_put64; + + /* + * check for BIG endian access + */ + if (handlep->ah_acc.devacc_attr_endian_flags == + DDI_STRUCTURE_BE_ACC) { + hp->ahi_get16 = i_ddi_prot_io_swap_get16; + hp->ahi_get32 = i_ddi_prot_io_swap_get32; + hp->ahi_put16 = i_ddi_prot_io_swap_put16; + hp->ahi_put32 = i_ddi_prot_io_swap_put32; + hp->ahi_rep_get16 = + i_ddi_prot_io_swap_rep_get16; + hp->ahi_rep_get32 = + i_ddi_prot_io_swap_rep_get32; + hp->ahi_rep_put16 = + i_ddi_prot_io_swap_rep_put16; + hp->ahi_rep_put32 = + i_ddi_prot_io_swap_rep_put32; + } else { + hp->ahi_acc_attr |= DDI_ACCATTR_DIRECT; + hp->ahi_get16 = i_ddi_prot_io_get16; + hp->ahi_get32 = i_ddi_prot_io_get32; + hp->ahi_put16 = i_ddi_prot_io_put16; + hp->ahi_put32 = i_ddi_prot_io_put32; + hp->ahi_rep_get16 = i_ddi_prot_io_rep_get16; + hp->ahi_rep_get32 = i_ddi_prot_io_rep_get32; + hp->ahi_rep_put16 = i_ddi_prot_io_rep_put16; + hp->ahi_rep_put32 = i_ddi_prot_io_rep_put32; + } + + } else if (hp->ahi_acc_attr & DDI_ACCATTR_CPU_VADDR) { + + hp->ahi_get8 = i_ddi_prot_vaddr_get8; + hp->ahi_put8 = i_ddi_prot_vaddr_put8; + hp->ahi_rep_get8 = i_ddi_prot_vaddr_rep_get8; + hp->ahi_rep_put8 = i_ddi_prot_vaddr_rep_put8; + + /* + * check for BIG endian access + */ + if (handlep->ah_acc.devacc_attr_endian_flags == + DDI_STRUCTURE_BE_ACC) { + + hp->ahi_get16 = i_ddi_prot_vaddr_swap_get16; + hp->ahi_get32 = i_ddi_prot_vaddr_swap_get32; + hp->ahi_get64 = i_ddi_prot_vaddr_swap_get64; + hp->ahi_put16 = i_ddi_prot_vaddr_swap_put16; + hp->ahi_put32 = i_ddi_prot_vaddr_swap_put32; + hp->ahi_put64 = i_ddi_prot_vaddr_swap_put64; + hp->ahi_rep_get16 = + i_ddi_prot_vaddr_swap_rep_get16; + hp->ahi_rep_get32 = + i_ddi_prot_vaddr_swap_rep_get32; + hp->ahi_rep_get64 = + i_ddi_prot_vaddr_swap_rep_get64; + hp->ahi_rep_put16 = + i_ddi_prot_vaddr_swap_rep_put16; + hp->ahi_rep_put32 = + i_ddi_prot_vaddr_swap_rep_put32; + hp->ahi_rep_put64 = + i_ddi_prot_vaddr_swap_rep_put64; + } else { + hp->ahi_acc_attr |= DDI_ACCATTR_DIRECT; + hp->ahi_get16 = i_ddi_prot_vaddr_get16; + hp->ahi_get32 = i_ddi_prot_vaddr_get32; + hp->ahi_get64 = i_ddi_prot_vaddr_get64; + hp->ahi_put16 = i_ddi_prot_vaddr_put16; + hp->ahi_put32 = i_ddi_prot_vaddr_put32; + hp->ahi_put64 = i_ddi_prot_vaddr_put64; + hp->ahi_rep_get16 = i_ddi_prot_vaddr_rep_get16; + hp->ahi_rep_get32 = 
i_ddi_prot_vaddr_rep_get32; + hp->ahi_rep_get64 = i_ddi_prot_vaddr_rep_get64; + hp->ahi_rep_put16 = i_ddi_prot_vaddr_rep_put16; + hp->ahi_rep_put32 = i_ddi_prot_vaddr_rep_put32; + hp->ahi_rep_put64 = i_ddi_prot_vaddr_rep_put64; + } + } + break; + case DDI_DEFAULT_ACC: + if (hp->ahi_acc_attr & DDI_ACCATTR_IO_SPACE) { + hp->ahi_get8 = i_ddi_io_get8; + hp->ahi_put8 = i_ddi_io_put8; + hp->ahi_rep_get8 = i_ddi_io_rep_get8; + hp->ahi_rep_put8 = i_ddi_io_rep_put8; + + /* temporary set these 64 functions to no-ops */ + hp->ahi_get64 = i_ddi_io_get64; + hp->ahi_put64 = i_ddi_io_put64; + hp->ahi_rep_get64 = i_ddi_io_rep_get64; + hp->ahi_rep_put64 = i_ddi_io_rep_put64; + + /* + * check for BIG endian access + */ + if (handlep->ah_acc.devacc_attr_endian_flags == + DDI_STRUCTURE_BE_ACC) { + hp->ahi_get16 = i_ddi_io_swap_get16; + hp->ahi_get32 = i_ddi_io_swap_get32; + hp->ahi_put16 = i_ddi_io_swap_put16; + hp->ahi_put32 = i_ddi_io_swap_put32; + hp->ahi_rep_get16 = i_ddi_io_swap_rep_get16; + hp->ahi_rep_get32 = i_ddi_io_swap_rep_get32; + hp->ahi_rep_put16 = i_ddi_io_swap_rep_put16; + hp->ahi_rep_put32 = i_ddi_io_swap_rep_put32; + } else { + hp->ahi_acc_attr |= DDI_ACCATTR_DIRECT; + hp->ahi_get16 = i_ddi_io_get16; + hp->ahi_get32 = i_ddi_io_get32; + hp->ahi_put16 = i_ddi_io_put16; + hp->ahi_put32 = i_ddi_io_put32; + hp->ahi_rep_get16 = i_ddi_io_rep_get16; + hp->ahi_rep_get32 = i_ddi_io_rep_get32; + hp->ahi_rep_put16 = i_ddi_io_rep_put16; + hp->ahi_rep_put32 = i_ddi_io_rep_put32; + } + + } else if (hp->ahi_acc_attr & DDI_ACCATTR_CPU_VADDR) { + + hp->ahi_get8 = i_ddi_vaddr_get8; + hp->ahi_put8 = i_ddi_vaddr_put8; + hp->ahi_rep_get8 = i_ddi_vaddr_rep_get8; + hp->ahi_rep_put8 = i_ddi_vaddr_rep_put8; + + /* + * check for BIG endian access + */ + if (handlep->ah_acc.devacc_attr_endian_flags == + DDI_STRUCTURE_BE_ACC) { + + hp->ahi_get16 = i_ddi_vaddr_swap_get16; + hp->ahi_get32 = i_ddi_vaddr_swap_get32; + hp->ahi_get64 = i_ddi_vaddr_swap_get64; + hp->ahi_put16 = i_ddi_vaddr_swap_put16; + hp->ahi_put32 = i_ddi_vaddr_swap_put32; + hp->ahi_put64 = i_ddi_vaddr_swap_put64; + hp->ahi_rep_get16 = i_ddi_vaddr_swap_rep_get16; + hp->ahi_rep_get32 = i_ddi_vaddr_swap_rep_get32; + hp->ahi_rep_get64 = i_ddi_vaddr_swap_rep_get64; + hp->ahi_rep_put16 = i_ddi_vaddr_swap_rep_put16; + hp->ahi_rep_put32 = i_ddi_vaddr_swap_rep_put32; + hp->ahi_rep_put64 = i_ddi_vaddr_swap_rep_put64; + } else { + hp->ahi_acc_attr |= DDI_ACCATTR_DIRECT; + hp->ahi_get16 = i_ddi_vaddr_get16; + hp->ahi_get32 = i_ddi_vaddr_get32; + hp->ahi_get64 = i_ddi_vaddr_get64; + hp->ahi_put16 = i_ddi_vaddr_put16; + hp->ahi_put32 = i_ddi_vaddr_put32; + hp->ahi_put64 = i_ddi_vaddr_put64; + hp->ahi_rep_get16 = i_ddi_vaddr_rep_get16; + hp->ahi_rep_get32 = i_ddi_vaddr_rep_get32; + hp->ahi_rep_get64 = i_ddi_vaddr_rep_get64; + hp->ahi_rep_put16 = i_ddi_vaddr_rep_put16; + hp->ahi_rep_put32 = i_ddi_vaddr_rep_put32; + hp->ahi_rep_put64 = i_ddi_vaddr_rep_put64; + } + } + break; + } + hp->ahi_fault_check = i_ddi_acc_fault_check; + hp->ahi_fault_notify = i_ddi_acc_fault_notify; + hp->ahi_fault = 0; + impl_acc_err_init(handlep); +} + +/* + * The followings are low-level routines for data access. + * + * All of these routines should be implemented in assembly. 
Those + * that have been rewritten be found in ~ml/ddi_i86_asm.s + */ + +/*ARGSUSED*/ +uint16_t +i_ddi_vaddr_swap_get16(ddi_acc_impl_t *hdlp, uint16_t *addr) +{ + return (ddi_swap16(*addr)); +} + +/*ARGSUSED*/ +uint16_t +i_ddi_io_swap_get16(ddi_acc_impl_t *hdlp, uint16_t *addr) +{ + return (ddi_swap16(inw((uintptr_t)addr))); +} + +/*ARGSUSED*/ +uint32_t +i_ddi_vaddr_swap_get32(ddi_acc_impl_t *hdlp, uint32_t *addr) +{ + return (ddi_swap32(*addr)); +} + +/*ARGSUSED*/ +uint32_t +i_ddi_io_swap_get32(ddi_acc_impl_t *hdlp, uint32_t *addr) +{ + return (ddi_swap32(inl((uintptr_t)addr))); +} + +/*ARGSUSED*/ +uint64_t +i_ddi_vaddr_swap_get64(ddi_acc_impl_t *hdlp, uint64_t *addr) +{ + return (ddi_swap64(*addr)); +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_swap_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value) +{ + *addr = ddi_swap16(value); +} + +/*ARGSUSED*/ +void +i_ddi_io_swap_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value) +{ + outw((uintptr_t)addr, ddi_swap16(value)); +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_swap_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, uint32_t value) +{ + *addr = ddi_swap32(value); +} + +/*ARGSUSED*/ +void +i_ddi_io_swap_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, uint32_t value) +{ + outl((uintptr_t)addr, ddi_swap32(value)); +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_swap_put64(ddi_acc_impl_t *hdlp, uint64_t *addr, uint64_t value) +{ + *addr = ddi_swap64(value); +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_rep_get8(ddi_acc_impl_t *hdlp, uint8_t *host_addr, + uint8_t *dev_addr, size_t repcount, uint_t flags) +{ + uint8_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *h++ = *d++; + else + for (; repcount; repcount--) + *h++ = *d; +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *h++ = *d++; + else + for (; repcount; repcount--) + *h++ = *d; +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_swap_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *h++ = ddi_swap16(*d++); + else + for (; repcount; repcount--) + *h++ = ddi_swap16(*d); +} + +/*ARGSUSED*/ +void +i_ddi_io_swap_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--, port += 2) + *h++ = ddi_swap16(inw(port)); + else + for (; repcount; repcount--) + *h++ = ddi_swap16(inw(port)); +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *h++ = *d++; + else + for (; repcount; repcount--) + *h++ = *d; +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_swap_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *h++ = ddi_swap32(*d++); + else + for (; repcount; repcount--) + *h++ = ddi_swap32(*d); +} + 
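/*
 * A minimal sketch of how the byte-swapping accessors above are reached:
 * a driver that maps a big-endian device with DDI_STRUCTURE_BE_ACC has its
 * ddi_get16()/ddi_put16() calls vectored to the i_ddi_*_swap_* routines by
 * impl_acc_hdl_init().  The "xx" names, register number and offset below
 * are hypothetical, and the usual <sys/ddi.h>/<sys/sunddi.h> headers are
 * assumed.
 */

static ddi_device_acc_attr_t xx_be_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_STRUCTURE_BE_ACC,
	DDI_STRICTORDER_ACC
};

static int
xx_read_status(dev_info_t *dip, uint16_t *statusp)
{
	ddi_acc_handle_t h;
	caddr_t regs;

	if (ddi_regs_map_setup(dip, 1, &regs, 0, 0, &xx_be_attr,
	    &h) != DDI_SUCCESS)
		return (DDI_FAILURE);

	/* For a memory-mapped range this lands in i_ddi_vaddr_swap_get16() */
	*statusp = ddi_get16(h, (uint16_t *)(regs + 0x10));

	ddi_regs_map_free(&h);
	return (DDI_SUCCESS);
}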
+/*ARGSUSED*/ +void +i_ddi_io_swap_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--, port += 4) + *h++ = ddi_swap32(inl(port)); + else + for (; repcount; repcount--) + *h++ = ddi_swap32(inl(port)); +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + uint64_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *h++ = *d++; + else + for (; repcount; repcount--) + *h++ = *d; +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_swap_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + uint64_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *h++ = ddi_swap64(*d++); + else + for (; repcount; repcount--) + *h++ = ddi_swap64(*d); +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_rep_put8(ddi_acc_impl_t *hdlp, uint8_t *host_addr, + uint8_t *dev_addr, size_t repcount, uint_t flags) +{ + uint8_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = *h++; + else + for (; repcount; repcount--) + *d = *h++; +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = *h++; + else + for (; repcount; repcount--) + *d = *h++; +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_swap_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = ddi_swap16(*h++); + else + for (; repcount; repcount--) + *d = ddi_swap16(*h++); +} + +/*ARGSUSED*/ +void +i_ddi_io_swap_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--, port += 2) + outw(port, ddi_swap16(*h++)); + else + for (; repcount; repcount--) + outw(port, ddi_swap16(*h++)); +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = *h++; + else + for (; repcount; repcount--) + *d = *h++; +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_swap_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = ddi_swap32(*h++); + else + for (; repcount; repcount--) + *d = ddi_swap32(*h++); +} + +/*ARGSUSED*/ +void +i_ddi_io_swap_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--, port += 4) + outl(port, 
ddi_swap32(*h++)); + else + for (; repcount; repcount--) + outl(port, ddi_swap32(*h++)); +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + uint64_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = *h++; + else + for (; repcount; repcount--) + *d = *h++; +} + +/*ARGSUSED*/ +void +i_ddi_vaddr_swap_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + uint64_t *h, *d; + + h = host_addr; + d = dev_addr; + + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = ddi_swap64(*h++); + else + for (; repcount; repcount--) + *d = ddi_swap64(*h++); +} + +/*ARGSUSED*/ +uint64_t +i_ddi_io_get64(ddi_acc_impl_t *hdlp, uint64_t *addr) +{ + panic("ddi_get64 from i/o space"); + /*NOTREACHED*/ + return (0); +} + +/*ARGSUSED*/ +void +i_ddi_io_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, uint64_t value) +{ + panic("ddi_put64 to i/o space"); + /*NOTREACHED*/ +} + +void +do_scan(ddi_acc_impl_t *hdlp) +{ + ddi_fm_error_t de; + ndi_err_t *errp = (ndi_err_t *)hdlp->ahi_err; + + bzero(&de, sizeof (ddi_fm_error_t)); + de.fme_version = DDI_FME_VERSION; + de.fme_ena = fm_ena_generate(0, FM_ENA_FMT1); + de.fme_flag = DDI_FM_ERR_UNEXPECTED; + + mutex_enter(hdlp->ahi_err_mutexp); + hdlp->ahi_scan(hdlp->ahi_scan_dip, &de); + if (de.fme_status != DDI_FM_OK) { + errp->err_ena = de.fme_ena; + errp->err_expected = de.fme_flag; + errp->err_status = DDI_FM_NONFATAL; + } + mutex_exit(hdlp->ahi_err_mutexp); +} + +/*ARGSUSED*/ +uint8_t +i_ddi_prot_vaddr_get8(ddi_acc_impl_t *hdlp, uint8_t *addr) +{ + uint8_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = *addr; + if (val == 0xff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint16_t +i_ddi_prot_vaddr_get16(ddi_acc_impl_t *hdlp, uint16_t *addr) +{ + uint16_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = *addr; + if (val == 0xffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint32_t +i_ddi_prot_vaddr_get32(ddi_acc_impl_t *hdlp, uint32_t *addr) +{ + uint32_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = *addr; + if (val == 0xffffffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint64_t +i_ddi_prot_vaddr_get64(ddi_acc_impl_t *hdlp, uint64_t *addr) +{ + uint64_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = *addr; + if (val == 0xffffffffffffffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint8_t +i_ddi_prot_io_get8(ddi_acc_impl_t *hdlp, uint8_t *addr) +{ + uint8_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = inb((uintptr_t)addr); + if (val == 0xff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint16_t +i_ddi_prot_io_get16(ddi_acc_impl_t *hdlp, uint16_t *addr) +{ + uint16_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = inw((uintptr_t)addr); + if (val == 0xffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint32_t +i_ddi_prot_io_get32(ddi_acc_impl_t *hdlp, uint32_t *addr) +{ + uint32_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = inl((uintptr_t)addr); + if (val == 0xffffffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + 
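/*
 * The all-ones checks above only arrange for a scan of outstanding errors;
 * a driver using DDI_FLAGERR_ACC is still expected to ask whether the
 * access actually failed before trusting the data.  A minimal sketch,
 * assuming <sys/ddifm.h> and hypothetical "xx" names and offset:
 */

static int
xx_read_reg_checked(ddi_acc_handle_t h, caddr_t regs, uint32_t *valp)
{
	ddi_fm_error_t de;

	*valp = ddi_get32(h, (uint32_t *)(regs + 0x20));

	ddi_fm_acc_err_get(h, &de, DDI_FME_VERSION);
	if (de.fme_status != DDI_FM_OK) {
		/* The value may be all-ones junk; clear and report failure */
		ddi_fm_acc_err_clear(h, DDI_FME_VERSION);
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}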
+/*ARGSUSED*/ +uint16_t +i_ddi_prot_vaddr_swap_get16(ddi_acc_impl_t *hdlp, uint16_t *addr) +{ + uint16_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = ddi_swap16(*addr); + if (val == 0xffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint16_t +i_ddi_prot_io_swap_get16(ddi_acc_impl_t *hdlp, uint16_t *addr) +{ + uint16_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = ddi_swap16(inw((uintptr_t)addr)); + if (val == 0xffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint32_t +i_ddi_prot_vaddr_swap_get32(ddi_acc_impl_t *hdlp, uint32_t *addr) +{ + uint32_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = ddi_swap32(*addr); + if (val == 0xffffffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint32_t +i_ddi_prot_io_swap_get32(ddi_acc_impl_t *hdlp, uint32_t *addr) +{ + uint32_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = ddi_swap32(inl((uintptr_t)addr)); + if (val == 0xffffffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +uint64_t +i_ddi_prot_vaddr_swap_get64(ddi_acc_impl_t *hdlp, uint64_t *addr) +{ + uint64_t val; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + val = ddi_swap64(*addr); + if (val == 0xffffffffffffffff) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); + + return (val); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_put8(ddi_acc_impl_t *hdlp, uint8_t *addr, uint8_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + *addr = value; + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_put8(ddi_acc_impl_t *hdlp, uint8_t *addr, uint8_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + outb((uintptr_t)addr, value); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + *addr = value; + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + outw((uintptr_t)addr, value); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, + uint32_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + *addr = value; + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, uint32_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + outl((uintptr_t)addr, value); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_put64(ddi_acc_impl_t *hdlp, uint64_t *addr, + uint64_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + *addr = value; + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_swap_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, + uint16_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + *addr = ddi_swap16(value); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_swap_put16(ddi_acc_impl_t *hdlp, uint16_t *addr, uint16_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + outw((uintptr_t)addr, ddi_swap16(value)); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_swap_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, + uint32_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + *addr = 
ddi_swap32(value); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_swap_put32(ddi_acc_impl_t *hdlp, uint32_t *addr, uint32_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + outl((uintptr_t)addr, ddi_swap32(value)); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_swap_put64(ddi_acc_impl_t *hdlp, uint64_t *addr, + uint64_t value) +{ + mutex_enter(hdlp->ahi_peekpoke_mutexp); + *addr = ddi_swap64(value); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_rep_get8(ddi_acc_impl_t *hdlp, uint8_t *host_addr, + uint8_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint8_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--, port++) + if ((*h++ = inb(port)) == 0xff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = inb(port)) == 0xff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint16_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--, port += 2) + if ((*h++ = inw(port)) == 0xffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = inw(port)) == 0xffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint32_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--, port += 4) + if ((*h++ = inl(port)) == 0xffffffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = inl(port)) == 0xffffffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_rep_get8(ddi_acc_impl_t *hdlp, uint8_t *host_addr, + uint8_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint8_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--) + if ((*h++ = *d++) == 0xff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = *d) == 0xff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint16_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--) + if ((*h++ = *d++) == 0xffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = *d) == 0xffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_swap_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint16_t *h, *d; + + h = host_addr; + d = 
dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--) + if ((*h++ = ddi_swap16(*d++)) == 0xffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = ddi_swap16(*d)) == 0xffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_swap_rep_get16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint16_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--, port += 2) + if ((*h++ = ddi_swap16(inw(port))) == 0xffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = ddi_swap16(inw(port))) == 0xffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint32_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--) + if ((*h++ = *d++) == 0xffffffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = *d) == 0xffffffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_swap_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint32_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--) + if ((*h++ = ddi_swap32(*d++)) == 0xffffffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = ddi_swap32(*d)) == 0xffffffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_swap_rep_get32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint32_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--, port += 4) + if ((*h++ = ddi_swap32(inl(port))) == 0xffffffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = ddi_swap32(inl(port))) == 0xffffffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint64_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) { + for (; repcount; repcount--) + if ((*h++ = *d++) == 0xffffffffffffffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = *d) == 0xffffffffffffffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_swap_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + int fail = 0; + uint64_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == 
DDI_DEV_AUTOINCR) { + for (; repcount; repcount--) + if ((*h++ = ddi_swap64(*d++)) == 0xffffffffffffffff) + fail = 1; + } else { + for (; repcount; repcount--) + if ((*h++ = ddi_swap64(*d)) == 0xffffffffffffffff) + fail = 1; + } + if (fail == 1) + do_scan(hdlp); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_rep_put8(ddi_acc_impl_t *hdlp, uint8_t *host_addr, + uint8_t *dev_addr, size_t repcount, uint_t flags) +{ + uint8_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = *h++; + else + for (; repcount; repcount--) + *d = *h++; + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_rep_put8(ddi_acc_impl_t *hdlp, uint8_t *host_addr, + uint8_t *dev_addr, size_t repcount, uint_t flags) +{ + uint8_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--, port++) + outb(port, *h++); + else + for (; repcount; repcount--) + outb(port, *h++); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = *h++; + else + for (; repcount; repcount--) + *d = *h++; + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--, port += 2) + outw(port, *h++); + else + for (; repcount; repcount--) + outw(port, *h++); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_swap_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = ddi_swap16(*h++); + else + for (; repcount; repcount--) + *d = ddi_swap16(*h++); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_swap_rep_put16(ddi_acc_impl_t *hdlp, uint16_t *host_addr, + uint16_t *dev_addr, size_t repcount, uint_t flags) +{ + uint16_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--, port += 2) + outw(port, ddi_swap16(*h++)); + else + for (; repcount; repcount--) + outw(port, ddi_swap16(*h++)); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = *h++; + else + for (; repcount; repcount--) + *d = *h++; + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t 
*dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--, port += 4) + outl(port, *h++); + else + for (; repcount; repcount--) + outl(port, *h++); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_swap_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = ddi_swap32(*h++); + else + for (; repcount; repcount--) + *d = ddi_swap32(*h++); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_io_swap_rep_put32(ddi_acc_impl_t *hdlp, uint32_t *host_addr, + uint32_t *dev_addr, size_t repcount, uint_t flags) +{ + uint32_t *h; + uintptr_t port; + + h = host_addr; + port = (uintptr_t)dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--, port += 4) + outl(port, ddi_swap32(*h++)); + else + for (; repcount; repcount--) + outl(port, ddi_swap32(*h++)); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + uint64_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = *h++; + else + for (; repcount; repcount--) + *d = *h++; + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +/*ARGSUSED*/ +void +i_ddi_prot_vaddr_swap_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + uint64_t *h, *d; + + h = host_addr; + d = dev_addr; + + mutex_enter(hdlp->ahi_peekpoke_mutexp); + if (flags == DDI_DEV_AUTOINCR) + for (; repcount; repcount--) + *d++ = ddi_swap64(*h++); + else + for (; repcount; repcount--) + *d = ddi_swap64(*h++); + mutex_exit(hdlp->ahi_peekpoke_mutexp); +} + +void +ddi_io_rep_get8(ddi_acc_handle_t handle, + uint8_t *host_addr, uint8_t *dev_addr, size_t repcount) +{ + (((ddi_acc_impl_t *)handle)->ahi_rep_get8) + ((ddi_acc_impl_t *)handle, host_addr, dev_addr, + repcount, DDI_DEV_NO_AUTOINCR); +} + +void +ddi_io_rep_get16(ddi_acc_handle_t handle, + uint16_t *host_addr, uint16_t *dev_addr, size_t repcount) +{ + (((ddi_acc_impl_t *)handle)->ahi_rep_get16) + ((ddi_acc_impl_t *)handle, host_addr, dev_addr, + repcount, DDI_DEV_NO_AUTOINCR); +} + +void +ddi_io_rep_get32(ddi_acc_handle_t handle, + uint32_t *host_addr, uint32_t *dev_addr, size_t repcount) +{ + (((ddi_acc_impl_t *)handle)->ahi_rep_get32) + ((ddi_acc_impl_t *)handle, host_addr, dev_addr, + repcount, DDI_DEV_NO_AUTOINCR); +} + +/*ARGSUSED*/ +void +i_ddi_io_rep_get64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + cmn_err(CE_PANIC, "ddi_rep_get64 from i/o space"); +} + +void +ddi_io_rep_put8(ddi_acc_handle_t handle, + uint8_t *host_addr, uint8_t *dev_addr, size_t repcount) +{ + (((ddi_acc_impl_t *)handle)->ahi_rep_put8) + ((ddi_acc_impl_t *)handle, host_addr, dev_addr, + repcount, DDI_DEV_NO_AUTOINCR); +} + +void +ddi_io_rep_put16(ddi_acc_handle_t handle, + uint16_t *host_addr, uint16_t *dev_addr, size_t repcount) +{ + (((ddi_acc_impl_t *)handle)->ahi_rep_put16) + ((ddi_acc_impl_t *)handle, 
host_addr, dev_addr, + repcount, DDI_DEV_NO_AUTOINCR); +} + +void +ddi_io_rep_put32(ddi_acc_handle_t handle, + uint32_t *host_addr, uint32_t *dev_addr, size_t repcount) +{ + (((ddi_acc_impl_t *)handle)->ahi_rep_put32) + ((ddi_acc_impl_t *)handle, host_addr, dev_addr, + repcount, DDI_DEV_NO_AUTOINCR); +} + +/*ARGSUSED*/ +void +i_ddi_io_rep_put64(ddi_acc_impl_t *hdlp, uint64_t *host_addr, + uint64_t *dev_addr, size_t repcount, uint_t flags) +{ + cmn_err(CE_PANIC, "ddi_rep_put64 to i/o space"); +} + +/* + * These next two functions could be translated into assembler someday + */ +int +ddi_check_acc_handle(ddi_acc_handle_t handle) +{ + ddi_acc_impl_t *hdlp = (ddi_acc_impl_t *)handle; + return (((*hdlp->ahi_fault_check)(hdlp) == DDI_SUCCESS) ? DDI_SUCCESS : + DDI_FAILURE); +} + +int +i_ddi_acc_fault_check(ddi_acc_impl_t *hdlp) +{ + /* Default version, just returns flag value */ + return (hdlp->ahi_fault); +} + +/*ARGSUSED*/ +void +i_ddi_acc_fault_notify(ddi_acc_impl_t *hdlp) +{ + /* Default version, does nothing for now */ +} + +void +i_ddi_acc_set_fault(ddi_acc_handle_t handle) +{ + ddi_acc_impl_t *hdlp = (ddi_acc_impl_t *)handle; + + if (!hdlp->ahi_fault) { + hdlp->ahi_fault = 1; + (*hdlp->ahi_fault_notify)(hdlp); + } +} + +void +i_ddi_acc_clr_fault(ddi_acc_handle_t handle) +{ + ddi_acc_impl_t *hdlp = (ddi_acc_impl_t *)handle; + + if (hdlp->ahi_fault) { + hdlp->ahi_fault = 0; + (*hdlp->ahi_fault_notify)(hdlp); + } +} diff --git a/usr/src/uts/intel/os/desctbls.c b/usr/src/uts/intel/os/desctbls.c new file mode 100644 index 0000000000..35345c3fe8 --- /dev/null +++ b/usr/src/uts/intel/os/desctbls.c @@ -0,0 +1,1218 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * Copyright 2018 Joyent, Inc. All rights reserved. + */ + +/* + * Copyright (c) 1992 Terrence R. Lambert. + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 + */ + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/tss.h> +#include <sys/segments.h> +#include <sys/trap.h> +#include <sys/cpuvar.h> +#include <sys/bootconf.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> +#include <sys/archsystm.h> +#include <sys/machsystm.h> +#include <sys/kobj.h> +#include <sys/cmn_err.h> +#include <sys/reboot.h> +#include <sys/kdi.h> +#include <sys/mach_mmu.h> +#include <sys/systm.h> +#include <sys/note.h> + +#ifdef __xpv +#include <sys/hypervisor.h> +#include <vm/as.h> +#endif + +#include <sys/promif.h> +#include <sys/bootinfo.h> +#include <vm/kboot_mmu.h> +#include <vm/hat_pte.h> + +/* + * cpu0 and default tables and structures. + */ +user_desc_t *gdt0; +#if !defined(__xpv) +desctbr_t gdt0_default_r; +#endif + +gate_desc_t *idt0; /* interrupt descriptor table */ + +tss_t *ktss0; /* kernel task state structure */ + + +user_desc_t zero_udesc; /* base zero user desc native procs */ +user_desc_t null_udesc; /* null user descriptor */ +system_desc_t null_sdesc; /* null system descriptor */ + +user_desc_t zero_u32desc; /* 32-bit compatibility procs */ + +user_desc_t ucs_on; +user_desc_t ucs_off; +user_desc_t ucs32_on; +user_desc_t ucs32_off; + +/* + * If the size of this is changed, you must update hat_pcp_setup() and the + * definitions in exception.s + */ +extern char dblfault_stack0[DEFAULTSTKSZ]; +extern char nmi_stack0[DEFAULTSTKSZ]; +extern char mce_stack0[DEFAULTSTKSZ]; + +extern void fast_null(void); +extern hrtime_t get_hrtime(void); +extern hrtime_t gethrvtime(void); +extern hrtime_t get_hrestime(void); +extern uint64_t getlgrp(void); + +void (*(fasttable[]))(void) = { + fast_null, /* T_FNULL routine */ + fast_null, /* T_FGETFP routine (initially null) */ + fast_null, /* T_FSETFP routine (initially null) */ + (void (*)())(uintptr_t)get_hrtime, /* T_GETHRTIME */ + (void (*)())(uintptr_t)gethrvtime, /* T_GETHRVTIME */ + (void (*)())(uintptr_t)get_hrestime, /* T_GETHRESTIME */ + (void (*)())(uintptr_t)getlgrp /* T_GETLGRP */ +}; + +/* + * Structure containing pre-computed descriptors to allow us to temporarily + * interpose on a standard handler. 
+ */ +struct interposing_handler { + int ih_inum; + gate_desc_t ih_interp_desc; + gate_desc_t ih_default_desc; +}; + +/* + * The brand infrastructure interposes on two handlers, and we use one as a + * NULL signpost. + */ +static struct interposing_handler brand_tbl[2]; + +/* + * software prototypes for default local descriptor table + */ + +/* + * Routines for loading segment descriptors in format the hardware + * can understand. + */ + +/* + * In long mode we have the new L or long mode attribute bit + * for code segments. Only the conforming bit in type is used along + * with descriptor priority and present bits. Default operand size must + * be zero when in long mode. In 32-bit compatibility mode all fields + * are treated as in legacy mode. For data segments while in long mode + * only the present bit is loaded. + */ +void +set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size, + uint_t type, uint_t dpl, uint_t gran, uint_t defopsz) +{ + ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG); + /* This should never be a "system" segment. */ + ASSERT3U(type & SDT_S, !=, 0); + + /* + * 64-bit long mode. + */ + if (lmode == SDP_LONG) + dp->usd_def32 = 0; /* 32-bit operands only */ + else + /* + * 32-bit compatibility mode. + */ + dp->usd_def32 = defopsz; /* 0 = 16, 1 = 32-bit ops */ + + /* + * We should always set the "accessed" bit (SDT_A), otherwise the CPU + * will write to the GDT whenever we change segment registers around. + * With KPTI on, the GDT is read-only in the user page table, which + * causes crashes if we don't set this. + */ + ASSERT3U(type & SDT_A, !=, 0); + + dp->usd_long = lmode; /* 64-bit mode */ + dp->usd_type = type; + dp->usd_dpl = dpl; + dp->usd_p = 1; + dp->usd_gran = gran; /* 0 = bytes, 1 = pages */ + + dp->usd_lobase = (uintptr_t)base; + dp->usd_midbase = (uintptr_t)base >> 16; + dp->usd_hibase = (uintptr_t)base >> (16 + 8); + dp->usd_lolimit = size; + dp->usd_hilimit = (uintptr_t)size >> 16; +} + +/* + * Install system segment descriptor for LDT and TSS segments. + */ + +void +set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type, + uint_t dpl) +{ + dp->ssd_lolimit = size; + dp->ssd_hilimit = (uintptr_t)size >> 16; + + dp->ssd_lobase = (uintptr_t)base; + dp->ssd_midbase = (uintptr_t)base >> 16; + dp->ssd_hibase = (uintptr_t)base >> (16 + 8); + dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8); + + dp->ssd_type = type; + dp->ssd_zero1 = 0; /* must be zero */ + dp->ssd_zero2 = 0; + dp->ssd_dpl = dpl; + dp->ssd_p = 1; + dp->ssd_gran = 0; /* force byte units */ +} + +void * +get_ssd_base(system_desc_t *dp) +{ + uintptr_t base; + + base = (uintptr_t)dp->ssd_lobase | + (uintptr_t)dp->ssd_midbase << 16 | + (uintptr_t)dp->ssd_hibase << (16 + 8) | + (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8); + return ((void *)base); +} + +/* + * Install gate segment descriptor for interrupt, trap, call and task gates. + * + * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on + * all interrupts. We have different ISTs for each class of exceptions that are + * most likely to occur while handling an existing exception; while many of + * these are just going to panic, it's nice not to trample on the existing + * exception state for debugging purposes. + * + * Normal interrupts are all redirected unconditionally to the KPTI trampoline + * stack space. This unifies the trampoline handling between user and kernel + * space (and avoids the need to touch %gs). 
+ * + * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when + * we do a read from KMDB that cause another #PF. Without its own IST, this + * would stomp on the kernel's mcpu_kpti_flt frame. + */ +uint_t +idt_vector_to_ist(uint_t vector) +{ +#if defined(__xpv) + _NOTE(ARGUNUSED(vector)); + return (IST_NONE); +#else + switch (vector) { + /* These should always use IST even without KPTI enabled. */ + case T_DBLFLT: + return (IST_DF); + case T_NMIFLT: + return (IST_NMI); + case T_MCE: + return (IST_MCE); + + case T_BPTFLT: + case T_SGLSTP: + if (kpti_enable == 1) { + return (IST_DBG); + } + return (IST_NONE); + case T_STKFLT: + case T_GPFLT: + case T_PGFLT: + if (kpti_enable == 1) { + return (IST_NESTABLE); + } + return (IST_NONE); + default: + if (kpti_enable == 1) { + return (IST_DEFAULT); + } + return (IST_NONE); + } +#endif +} + +void +set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel, + uint_t type, uint_t dpl, uint_t ist) +{ + dp->sgd_looffset = (uintptr_t)func; + dp->sgd_hioffset = (uintptr_t)func >> 16; + dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16); + dp->sgd_selector = (uint16_t)sel; + dp->sgd_ist = ist; + dp->sgd_type = type; + dp->sgd_dpl = dpl; + dp->sgd_p = 1; +} + +/* + * Updates a single user descriptor in the the GDT of the current cpu. + * Caller is responsible for preventing cpu migration. + */ + +void +gdt_update_usegd(uint_t sidx, user_desc_t *udp) +{ +#if defined(DEBUG) + /* This should never be a "system" segment, but it might be null. */ + if (udp->usd_p != 0 || udp->usd_type != 0) { + ASSERT3U(udp->usd_type & SDT_S, !=, 0); + } + /* + * We should always set the "accessed" bit (SDT_A), otherwise the CPU + * will write to the GDT whenever we change segment registers around. + * With KPTI on, the GDT is read-only in the user page table, which + * causes crashes if we don't set this. + */ + if (udp->usd_p != 0 || udp->usd_type != 0) { + ASSERT3U(udp->usd_type & SDT_A, !=, 0); + } +#endif + +#if defined(__xpv) + uint64_t dpa = CPU->cpu_m.mcpu_gdtpa + sizeof (*udp) * sidx; + + if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp)) + panic("gdt_update_usegd: HYPERVISOR_update_descriptor"); + +#else /* __xpv */ + CPU->cpu_gdt[sidx] = *udp; +#endif /* __xpv */ +} + +/* + * Writes single descriptor pointed to by udp into a processes + * LDT entry pointed to by ldp. + */ +int +ldt_update_segd(user_desc_t *ldp, user_desc_t *udp) +{ +#if defined(DEBUG) + /* This should never be a "system" segment, but it might be null. */ + if (udp->usd_p != 0 || udp->usd_type != 0) { + ASSERT3U(udp->usd_type & SDT_S, !=, 0); + } + /* + * We should always set the "accessed" bit (SDT_A), otherwise the CPU + * will write to the LDT whenever we change segment registers around. + * With KPTI on, the LDT is read-only in the user page table, which + * causes crashes if we don't set this. + */ + if (udp->usd_p != 0 || udp->usd_type != 0) { + ASSERT3U(udp->usd_type & SDT_A, !=, 0); + } +#endif + +#if defined(__xpv) + uint64_t dpa; + + dpa = mmu_ptob(hat_getpfnum(kas.a_hat, (caddr_t)ldp)) | + ((uintptr_t)ldp & PAGEOFFSET); + + /* + * The hypervisor is a little more restrictive about what it + * supports in the LDT. + */ + if (HYPERVISOR_update_descriptor(pa_to_ma(dpa), *(uint64_t *)udp) != 0) + return (EINVAL); + +#else /* __xpv */ + *ldp = *udp; + +#endif /* __xpv */ + return (0); +} + +#if defined(__xpv) + +/* + * Converts hw format gate descriptor into pseudo-IDT format for the hypervisor. + * Returns true if a valid entry was written. 
+ */ +int +xen_idt_to_trap_info(uint_t vec, gate_desc_t *sgd, void *ti_arg) +{ + trap_info_t *ti = ti_arg; /* XXPV Aargh - segments.h comment */ + + /* + * skip holes in the IDT + */ + if (GATESEG_GETOFFSET(sgd) == 0) + return (0); + + ASSERT(sgd->sgd_type == SDT_SYSIGT); + ti->vector = vec; + TI_SET_DPL(ti, sgd->sgd_dpl); + + /* + * Is this an interrupt gate? + */ + if (sgd->sgd_type == SDT_SYSIGT) { + /* LINTED */ + TI_SET_IF(ti, 1); + } + ti->cs = sgd->sgd_selector; + ti->cs |= SEL_KPL; /* force into ring 3. see KCS_SEL */ + ti->address = GATESEG_GETOFFSET(sgd); + return (1); +} + +/* + * Convert a single hw format gate descriptor and write it into our virtual IDT. + */ +void +xen_idt_write(gate_desc_t *sgd, uint_t vec) +{ + trap_info_t trapinfo[2]; + + bzero(trapinfo, sizeof (trapinfo)); + if (xen_idt_to_trap_info(vec, sgd, &trapinfo[0]) == 0) + return; + if (xen_set_trap_table(trapinfo) != 0) + panic("xen_idt_write: xen_set_trap_table() failed"); +} + +#endif /* __xpv */ + + +/* + * Build kernel GDT. + */ + +static void +init_gdt_common(user_desc_t *gdt) +{ + int i; + + /* + * 64-bit kernel code segment. + */ + set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL, + SDP_PAGES, SDP_OP32); + + /* + * 64-bit kernel data segment. The limit attribute is ignored in 64-bit + * mode, but we set it here to 0xFFFF so that we can use the SYSRET + * instruction to return from system calls back to 32-bit applications. + * SYSRET doesn't update the base, limit, or attributes of %ss or %ds + * descriptors. We therefore must ensure that the kernel uses something, + * though it will be ignored by hardware, that is compatible with 32-bit + * apps. For the same reason we must set the default op size of this + * descriptor to 32-bit operands. + */ + set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA, + SEL_KPL, SDP_PAGES, SDP_OP32); + gdt[GDT_KDATA].usd_def32 = 1; + + /* + * 64-bit user code segment. + */ + set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL, + SDP_PAGES, SDP_OP32); + + /* + * 32-bit user code segment. + */ + set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA, + SEL_UPL, SDP_PAGES, SDP_OP32); + + /* + * See gdt_ucode32() and gdt_ucode_native(). + */ + ucs_on = ucs_off = gdt[GDT_UCODE]; + ucs_off.usd_p = 0; /* forces #np fault */ + + ucs32_on = ucs32_off = gdt[GDT_U32CODE]; + ucs32_off.usd_p = 0; /* forces #np fault */ + + /* + * 32 and 64 bit data segments can actually share the same descriptor. + * In long mode only the present bit is checked but all other fields + * are loaded. But in compatibility mode all fields are interpreted + * as in legacy mode so they must be set correctly for a 32-bit data + * segment. + */ + set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL, + SDP_PAGES, SDP_OP32); + +#if !defined(__xpv) + + /* + * The 64-bit kernel has no default LDT. By default, the LDT descriptor + * in the GDT is 0. + */ + + /* + * Kernel TSS + */ + set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0, + sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL); + +#endif /* !__xpv */ + + /* + * Initialize fs and gs descriptors for 32 bit processes. + * Only attributes and limits are initialized, the effective + * base address is programmed via fsbase/gsbase. + */ + set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA, + SEL_UPL, SDP_PAGES, SDP_OP32); + set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA, + SEL_UPL, SDP_PAGES, SDP_OP32); + + /* + * Initialize the descriptors set aside for brand usage. 
+ * Only attributes and limits are initialized. + */ + for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++) + set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA, + SEL_UPL, SDP_PAGES, SDP_OP32); + + /* + * Initialize convenient zero base user descriptors for clearing + * lwp private %fs and %gs descriptors in GDT. See setregs() for + * an example. + */ + set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL, + SDP_BYTES, SDP_OP32); + set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL, + SDP_PAGES, SDP_OP32); +} + +#if defined(__xpv) + +static user_desc_t * +init_gdt(void) +{ + uint64_t gdtpa; + ulong_t ma[1]; /* XXPV should be a memory_t */ + ulong_t addr; + +#if !defined(__lint) + /* + * Our gdt is never larger than a single page. + */ + ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE); +#endif + gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA, + PAGESIZE, PAGESIZE); + bzero(gdt0, PAGESIZE); + + init_gdt_common(gdt0); + + /* + * XXX Since we never invoke kmdb until after the kernel takes + * over the descriptor tables why not have it use the kernel's + * selectors? + */ + if (boothowto & RB_DEBUG) { + set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, + SEL_KPL, SDP_PAGES, SDP_OP32); + set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, + SEL_KPL, SDP_PAGES, SDP_OP32); + } + + /* + * Clear write permission for page containing the gdt and install it. + */ + gdtpa = pfn_to_pa(va_to_pfn(gdt0)); + ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT); + kbm_read_only((uintptr_t)gdt0, gdtpa); + xen_set_gdt(ma, NGDT); + + /* + * Reload the segment registers to use the new GDT. + * On 64-bit, fixup KCS_SEL to be in ring 3. + * See KCS_SEL in segments.h. + */ + load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL); + + /* + * setup %gs for kernel + */ + xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]); + + /* + * XX64 We should never dereference off "other gsbase" or + * "fsbase". So, we should arrange to point FSBASE and + * KGSBASE somewhere truly awful e.g. point it at the last + * valid address below the hole so that any attempts to index + * off them cause an exception. + * + * For now, point it at 8G -- at least it should be unmapped + * until some 64-bit processes run. + */ + addr = 0x200000000ul; + xen_set_segment_base(SEGBASE_FS, addr); + xen_set_segment_base(SEGBASE_GS_USER, addr); + xen_set_segment_base(SEGBASE_GS_USER_SEL, 0); + + return (gdt0); +} + +#else /* __xpv */ + +static user_desc_t * +init_gdt(void) +{ + desctbr_t r_bgdt, r_gdt; + user_desc_t *bgdt; + +#if !defined(__lint) + /* + * Our gdt is never larger than a single page. + */ + ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE); +#endif + gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA, + PAGESIZE, PAGESIZE); + bzero(gdt0, PAGESIZE); + + init_gdt_common(gdt0); + + /* + * Copy in from boot's gdt to our gdt. + * Entry 0 is the null descriptor by definition. 
+ */ + rd_gdtr(&r_bgdt); + bgdt = (user_desc_t *)r_bgdt.dtr_base; + if (bgdt == NULL) + panic("null boot gdt"); + + gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA]; + gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE]; + gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE]; + gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA]; + gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE]; + + /* + * Install our new GDT + */ + r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1; + r_gdt.dtr_base = (uintptr_t)gdt0; + wr_gdtr(&r_gdt); + + /* + * Reload the segment registers to use the new GDT + */ + load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL); + + /* + * setup %gs for kernel + */ + wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]); + + /* + * XX64 We should never dereference off "other gsbase" or + * "fsbase". So, we should arrange to point FSBASE and + * KGSBASE somewhere truly awful e.g. point it at the last + * valid address below the hole so that any attempts to index + * off them cause an exception. + * + * For now, point it at 8G -- at least it should be unmapped + * until some 64-bit processes run. + */ + wrmsr(MSR_AMD_FSBASE, 0x200000000ul); + wrmsr(MSR_AMD_KGSBASE, 0x200000000ul); + return (gdt0); +} + +#endif /* __xpv */ + + +/* + * Build kernel IDT. + * + * Note that for amd64 we pretty much require every gate to be an interrupt + * gate which blocks interrupts atomically on entry; that's because of our + * dependency on using 'swapgs' every time we come into the kernel to find + * the cpu structure. If we get interrupted just before doing that, %cs could + * be in kernel mode (so that the trap prolog doesn't do a swapgs), but + * %gsbase is really still pointing at something in userland. Bad things will + * ensue. We also use interrupt gates for i386 as well even though this is not + * required for some traps. + * + * Perhaps they should have invented a trap gate that does an atomic swapgs? + */ +static void +init_idt_common(gate_desc_t *idt) +{ + set_gatesegd(&idt[T_ZERODIV], + (kpti_enable == 1) ? &tr_div0trap : &div0trap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV)); + set_gatesegd(&idt[T_SGLSTP], + (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP)); + set_gatesegd(&idt[T_NMIFLT], + (kpti_enable == 1) ? &tr_nmiint : &nmiint, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT)); + set_gatesegd(&idt[T_BPTFLT], + (kpti_enable == 1) ? &tr_brktrap : &brktrap, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT)); + set_gatesegd(&idt[T_OVFLW], + (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW)); + set_gatesegd(&idt[T_BOUNDFLT], + (kpti_enable == 1) ? &tr_boundstrap : &boundstrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT)); + set_gatesegd(&idt[T_ILLINST], + (kpti_enable == 1) ? &tr_invoptrap : &invoptrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST)); + set_gatesegd(&idt[T_NOEXTFLT], + (kpti_enable == 1) ? &tr_ndptrap : &ndptrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT)); + + /* + * double fault handler. + * + * Note that on the hypervisor a guest does not receive #df faults. + * Instead a failsafe event is injected into the guest if its selectors + * and/or stack is in a broken state. See xen_failsafe_callback. + */ +#if !defined(__xpv) + set_gatesegd(&idt[T_DBLFLT], + (kpti_enable == 1) ? 
&tr_syserrtrap : &syserrtrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT)); +#endif /* !__xpv */ + + /* + * T_EXTOVRFLT coprocessor-segment-overrun not supported. + */ + set_gatesegd(&idt[T_TSSFLT], + (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT)); + set_gatesegd(&idt[T_SEGFLT], + (kpti_enable == 1) ? &tr_segnptrap : &segnptrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT)); + set_gatesegd(&idt[T_STKFLT], + (kpti_enable == 1) ? &tr_stktrap : &stktrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT)); + set_gatesegd(&idt[T_GPFLT], + (kpti_enable == 1) ? &tr_gptrap : &gptrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT)); + set_gatesegd(&idt[T_PGFLT], + (kpti_enable == 1) ? &tr_pftrap : &pftrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT)); + set_gatesegd(&idt[T_EXTERRFLT], + (kpti_enable == 1) ? &tr_ndperr : &ndperr, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT)); + set_gatesegd(&idt[T_ALIGNMENT], + (kpti_enable == 1) ? &tr_achktrap : &achktrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT)); + set_gatesegd(&idt[T_MCE], + (kpti_enable == 1) ? &tr_mcetrap : &mcetrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE)); + set_gatesegd(&idt[T_SIMDFPE], + (kpti_enable == 1) ? &tr_xmtrap : &xmtrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE)); + + /* + * install fast trap handler at 210. + */ + set_gatesegd(&idt[T_FASTTRAP], + (kpti_enable == 1) ? &tr_fasttrap : &fasttrap, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP)); + + /* + * System call handler. + */ + set_gatesegd(&idt[T_SYSCALLINT], + (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT)); + + /* + * Install the DTrace interrupt handler for the pid provider. + */ + set_gatesegd(&idt[T_DTRACE_RET], + (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET)); + + /* + * Prepare interposing descriptor for the syscall handler + * and cache copy of the default descriptor. + */ + brand_tbl[0].ih_inum = T_SYSCALLINT; + brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT]; + + set_gatesegd(&(brand_tbl[0].ih_interp_desc), + (kpti_enable == 1) ? &tr_brand_sys_syscall_int : + &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL, + idt_vector_to_ist(T_SYSCALLINT)); + + brand_tbl[1].ih_inum = 0; +} + +#if defined(__xpv) + +static void +init_idt(gate_desc_t *idt) +{ + init_idt_common(idt); +} + +#else /* __xpv */ + +static void +init_idt(gate_desc_t *idt) +{ + char ivctname[80]; + void (*ivctptr)(void); + int i; + + /* + * Initialize entire table with 'reserved' trap and then overwrite + * specific entries. T_EXTOVRFLT (9) is unsupported and reserved + * since it can only be generated on a 386 processor. 15 is also + * unsupported and reserved. + */ +#if !defined(__xpv) + for (i = 0; i < NIDT; i++) { + set_gatesegd(&idt[i], + (kpti_enable == 1) ? &tr_resvtrap : &resvtrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, + idt_vector_to_ist(T_RESVTRAP)); + } +#else + for (i = 0; i < NIDT; i++) { + set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, + IST_NONE); + } +#endif + + /* + * 20-31 reserved + */ +#if !defined(__xpv) + for (i = 20; i < 32; i++) { + set_gatesegd(&idt[i], + (kpti_enable == 1) ? 
&tr_invaltrap : &invaltrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, + idt_vector_to_ist(T_INVALTRAP)); + } +#else + for (i = 20; i < 32; i++) { + set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, + IST_NONE); + } +#endif + + /* + * interrupts 32 - 255 + */ + for (i = 32; i < 256; i++) { +#if !defined(__xpv) + (void) snprintf(ivctname, sizeof (ivctname), + (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i); +#else + (void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i); +#endif + ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0); + if (ivctptr == NULL) + panic("kobj_getsymvalue(%s) failed", ivctname); + + set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL, + idt_vector_to_ist(i)); + } + + /* + * Now install the common ones. Note that it will overlay some + * entries installed above like T_SYSCALLINT, T_FASTTRAP etc. + */ + init_idt_common(idt); +} + +#endif /* __xpv */ + +/* + * The kernel does not deal with LDTs unless a user explicitly creates + * one. Under normal circumstances, the LDTR contains 0. Any process attempting + * to reference the LDT will therefore cause a #gp. System calls made via the + * obsolete lcall mechanism are emulated by the #gp fault handler. + */ +static void +init_ldt(void) +{ +#if defined(__xpv) + xen_set_ldt(NULL, 0); +#else + wr_ldtr(0); +#endif +} + +#if !defined(__xpv) + +static void +init_tss(void) +{ + extern struct cpu cpus[]; + + /* + * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each + * context switch but it'll be overwritten with this same value anyway. + */ + if (kpti_enable == 1) { + ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp; + } + + /* Set up the IST stacks for double fault, NMI, MCE. */ + ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)]; + ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)]; + ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)]; + + /* + * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is + * enabled), and also for KDI (always). + */ + ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp; + + if (kpti_enable == 1) { + /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */ + ktss0->tss_ist5 = + (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp; + + /* This IST stack is used for all other intrs (for KPTI). */ + ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp; + } + + /* + * Set I/O bit map offset equal to size of TSS segment limit + * for no I/O permission map. This will force all user I/O + * instructions to generate #gp fault. + */ + ktss0->tss_bitmapbase = sizeof (*ktss0); + + /* + * Point %tr to descriptor for ktss0 in gdt. + */ + wr_tsr(KTSS_SEL); +} + +#endif /* !__xpv */ + +#if defined(__xpv) + +void +init_desctbls(void) +{ + uint_t vec; + user_desc_t *gdt; + + /* + * Setup and install our GDT. + */ + gdt = init_gdt(); + + /* + * Store static pa of gdt to speed up pa_to_ma() translations + * on lwp context switches. + */ + ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE)); + CPU->cpu_gdt = gdt; + CPU->cpu_m.mcpu_gdtpa = pfn_to_pa(va_to_pfn(gdt)); + + /* + * Setup and install our IDT. 
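 *
 * (Editorial illustration; not part of this commit.) Every gate that
 * init_idt_common() installs above follows the same pattern: use the
 * KPTI trampoline when kpti_enable is set, and derive the IST index
 * from the vector. For a hypothetical vector T_FOO with handlers
 * footrap and tr_footrap, an entry would read:
 *
 *	set_gatesegd(&idt[T_FOO],
 *	    (kpti_enable == 1) ? &tr_footrap : &footrap,
 *	    KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_FOO));
 *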
+ */ +#if !defined(__lint) + ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE); +#endif + idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA, + PAGESIZE, PAGESIZE); + bzero(idt0, PAGESIZE); + init_idt(idt0); + for (vec = 0; vec < NIDT; vec++) + xen_idt_write(&idt0[vec], vec); + + CPU->cpu_idt = idt0; + + /* + * set default kernel stack + */ + xen_stack_switch(KDS_SEL, + (ulong_t)&dblfault_stack0[sizeof (dblfault_stack0)]); + + xen_init_callbacks(); + + init_ldt(); +} + +#else /* __xpv */ + +void +init_desctbls(void) +{ + user_desc_t *gdt; + desctbr_t idtr; + + /* + * Allocate IDT and TSS structures on unique pages for better + * performance in virtual machines. + */ +#if !defined(__lint) + ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE); +#endif + idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA, + PAGESIZE, PAGESIZE); + bzero(idt0, PAGESIZE); +#if !defined(__lint) + ASSERT(sizeof (*ktss0) <= PAGESIZE); +#endif + ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA, + PAGESIZE, PAGESIZE); + bzero(ktss0, PAGESIZE); + + + /* + * Setup and install our GDT. + */ + gdt = init_gdt(); + ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE)); + CPU->cpu_gdt = gdt; + + /* + * Initialize this CPU's LDT. + */ + CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA, + LDT_CPU_SIZE, PAGESIZE); + bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE); + CPU->cpu_m.mcpu_ldt_len = 0; + + /* + * Setup and install our IDT. + */ + init_idt(idt0); + + idtr.dtr_base = (uintptr_t)idt0; + idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1; + wr_idtr(&idtr); + CPU->cpu_idt = idt0; + + + init_tss(); + CPU->cpu_tss = ktss0; + init_ldt(); + + /* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */ + kpti_safe_cr3 = (uint64_t)getcr3(); +} + +#endif /* __xpv */ + +#ifndef __xpv +/* + * As per Intel Vol 3 27.5.2, the GDTR limit is reset to 64Kb on a VM exit, so + * we have to manually fix it up ourselves. + * + * The caller may still need to make sure that it can't go off-CPU with the + * incorrect limit, before calling this (such as disabling pre-emption). + */ +void +reset_gdtr_limit(void) +{ + ulong_t flags = intr_clear(); + desctbr_t gdtr; + + rd_gdtr(&gdtr); + gdtr.dtr_limit = (sizeof (user_desc_t) * NGDT) - 1; + wr_gdtr(&gdtr); + + intr_restore(flags); +} +#endif /* __xpv */ + +/* + * In the early kernel, we need to set up a simple GDT to run on. + * + * XXPV Can dboot use this too? See dboot_gdt.s + */ +void +init_boot_gdt(user_desc_t *bgdt) +{ + set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL, + SDP_PAGES, SDP_OP32); + set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL, + SDP_PAGES, SDP_OP32); +} + +/* + * Enable interpositioning on the system call path by rewriting the + * sys{call|enter} MSRs and the syscall-related entries in the IDT to use + * the branded entry points. + */ +void +brand_interpositioning_enable(void) +{ + gate_desc_t *idt = CPU->cpu_idt; + int i; + + ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL); + + for (i = 0; brand_tbl[i].ih_inum; i++) { + idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc; +#if defined(__xpv) + xen_idt_write(&idt[brand_tbl[i].ih_inum], + brand_tbl[i].ih_inum); +#endif + } + +#if defined(__xpv) + + /* + * Currently the hypervisor only supports 64-bit syscalls via + * syscall instruction. The 32-bit syscalls are handled by + * interrupt gate above. 
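 *
 * (Editorial note; not part of this commit.) As the ASSERT at the top
 * of this function implies, a caller is expected to stay on-CPU while
 * the hooks are swapped; one way to satisfy that is simply:
 *
 *	kpreempt_disable();
 *	brand_interpositioning_enable();	(or _disable())
 *	kpreempt_enable();
 *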
+ */ + xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall, + CALLBACKF_mask_events); + +#else + + if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) { + if (kpti_enable == 1) { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32); + } else { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32); + } + } + +#endif + + if (is_x86_feature(x86_featureset, X86FSET_SEP)) { + if (kpti_enable == 1) { + wrmsr(MSR_INTC_SEP_EIP, + (uintptr_t)tr_brand_sys_sysenter); + } else { + wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter); + } + } +} + +/* + * Disable interpositioning on the system call path by rewriting the + * sys{call|enter} MSRs and the syscall-related entries in the IDT to use + * the standard entry points, which bypass the interpositioning hooks. + */ +void +brand_interpositioning_disable(void) +{ + gate_desc_t *idt = CPU->cpu_idt; + int i; + + ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL); + + for (i = 0; brand_tbl[i].ih_inum; i++) { + idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc; +#if defined(__xpv) + xen_idt_write(&idt[brand_tbl[i].ih_inum], + brand_tbl[i].ih_inum); +#endif + } + +#if defined(__xpv) + + /* + * See comment above in brand_interpositioning_enable. + */ + xen_set_callback(sys_syscall, CALLBACKTYPE_syscall, + CALLBACKF_mask_events); + +#else + + if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) { + if (kpti_enable == 1) { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32); + } else { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32); + } + } + +#endif + + if (is_x86_feature(x86_featureset, X86FSET_SEP)) { + if (kpti_enable == 1) { + wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter); + } else { + wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter); + } + } +} diff --git a/usr/src/uts/intel/os/fpu.c b/usr/src/uts/intel/os/fpu.c new file mode 100644 index 0000000000..0037f49f85 --- /dev/null +++ b/usr/src/uts/intel/os/fpu.c @@ -0,0 +1,1506 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2021 Joyent, Inc. + * Copyright 2021 RackTop Systems, Inc. + */ + +/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ +/* All Rights Reserved */ + +/* Copyright (c) 1987, 1988 Microsoft Corporation */ +/* All Rights Reserved */ + +/* + * Copyright (c) 2009, Intel Corporation. + * All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/signal.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/psw.h> +#include <sys/trap.h> +#include <sys/fault.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/pcb.h> +#include <sys/lwp.h> +#include <sys/cpuvar.h> +#include <sys/thread.h> +#include <sys/disp.h> +#include <sys/fp.h> +#include <sys/siginfo.h> +#include <sys/archsystm.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <sys/x86_archext.h> +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <sys/kfpu.h> + +/* + * FPU Management Overview + * ----------------------- + * + * The x86 FPU has evolved substantially since its days as the x87 coprocessor; + * however, many aspects of its life as a coprocessor are still around in x86. + * + * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU. + * While that state still exists, there is much more that is covered by the FPU. + * Today, this includes not just traditional FPU state, but also supervisor only + * state. The following state is currently managed and covered logically by the + * idea of the FPU registers: + * + * o Traditional x87 FPU + * o Vector Registers (%xmm, %ymm, %zmm) + * o Memory Protection Extensions (MPX) Bounds Registers + * o Protected Key Rights Registers (PKRU) + * o Processor Trace data + * + * The rest of this covers how the FPU is managed and controlled, how state is + * saved and restored between threads, interactions with hypervisors, and other + * information exported to user land through aux vectors. A lot of background + * information is here to synthesize major parts of the Intel SDM, but + * unfortunately, it is not a replacement for reading it. + * + * FPU Control Registers + * --------------------- + * + * Because the x87 FPU began its life as a co-processor and the FPU was + * optional there are several bits that show up in %cr0 that we have to + * manipulate when dealing with the FPU. These are: + * + * o CR0.ET The 'extension type' bit. This was used originally to indicate + * that the FPU co-processor was present. Now it is forced on for + * compatibility. This is often used to verify whether or not the + * FPU is present. + * + * o CR0.NE The 'native error' bit. Used to indicate that native error + * mode should be enabled. This indicates that we should take traps + * on FPU errors. The OS enables this early in boot. + * + * o CR0.MP The 'Monitor Coprocessor' bit. Used to control whether or not + * wait/fwait instructions generate a #NM if CR0.TS is set. + * + * o CR0.EM The 'Emulation' bit. This is used to cause floating point + * operations (x87 through SSE4) to trap with a #UD so they can be + * emulated. The system never sets this bit, but makes sure it is + * clear on processor start up. + * + * o CR0.TS The 'Task Switched' bit. When this is turned on, a floating + * point operation will generate a #NM. An fwait will as well, + * depending on the value in CR0.MP. + * + * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by + * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more + * complicated role. Historically it has been used to allow running systems to + * restore the FPU registers lazily. This will be discussed in greater depth + * later on. + * + * %cr4 is also used as part of the FPU control. 
Specifically we need to worry + * about the following bits in the system: + * + * o CR4.OSFXSR This bit is used to indicate that the OS understands and + * supports the execution of the fxsave and fxrstor + * instructions. This bit is required to be set to enable + * the use of the SSE->SSE4 instructions. + * + * o CR4.OSXMMEXCPT This bit is used to indicate that the OS can understand + * and take a SIMD floating point exception (#XM). This bit + * is always enabled by the system. + * + * o CR4.OSXSAVE This bit is used to indicate that the OS understands and + * supports the execution of the xsave and xrstor family of + * instructions. This bit is required to use any of the AVX + * and newer feature sets. + * + * Because all supported processors are 64-bit, they'll always support the XMM + * extensions and we will enable both CR4.OXFXSR and CR4.OSXMMEXCPT in boot. + * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid. + * + * %xcr0 is used to manage the behavior of the xsave feature set and is only + * present on the system if xsave is supported. %xcr0 is read and written to + * through by the xgetbv and xsetbv instructions. This register is present + * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a + * different component of the xsave state and controls whether or not that + * information is saved and restored. For newer feature sets like AVX and MPX, + * it also controls whether or not the corresponding instructions can be + * executed (much like CR0.OSFXSR does for the SSE feature sets). + * + * Everything in %xcr0 is around features available to users. There is also the + * IA32_XSS MSR which is used to control supervisor-only features that are still + * part of the xsave state. Bits that can be set in %xcr0 are reserved in + * IA32_XSS and vice versa. This is an important property that is particularly + * relevant to how the xsave instructions operate. + * + * Save Mechanisms + * --------------- + * + * When switching between running threads the FPU state needs to be saved and + * restored by the OS. If this state was not saved, users would rightfully + * complain about corrupt state. There are three mechanisms that exist on the + * processor for saving and restoring these state images: + * + * o fsave + * o fxsave + * o xsave + * + * fsave saves and restores only the x87 FPU and is the oldest of these + * mechanisms. This mechanism is never used in the kernel today because we are + * always running on systems that support fxsave. + * + * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register + * state to be saved and restored to and from a struct fxsave_state. This is the + * default mechanism that is used to save and restore the FPU on amd64. An + * important aspect of fxsave that was different from the original i386 fsave + * mechanism is that the restoring of FPU state with pending exceptions will not + * generate an exception, it will be deferred to the next use of the FPU. + * + * The final and by far the most complex mechanism is that of the xsave set. + * xsave allows for saving and restoring all of the traditional x86 pieces (x87 + * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc. + * registers. + * + * Data is saved and restored into and out of a struct xsave_state. The first + * part of the struct xsave_state is equivalent to the struct fxsave_state. + * After that, there is a header which is used to describe the remaining + * portions of the state. 
The header is a 64-byte value of which the first two + * uint64_t values are defined and the rest are reserved and must be zero. The + * first uint64_t is the xstate_bv member. This describes which values in the + * xsave_state are actually valid and present. This is updated on a save and + * used on restore. The second member is the xcomp_bv member. Its last bit + * determines whether or not a compressed version of the structure is used. + * + * When the uncompressed structure is used (currently the only format we + * support), then each state component is at a fixed offset in the structure, + * even if it is not being used. For example, if you only saved the AVX related + * state, but did not save the MPX related state, the offset would not change + * for any component. With the compressed format, components that aren't used + * are all elided (though the x87 and SSE state are always there). + * + * Unlike fxsave which saves all state, the xsave family does not always save + * and restore all the state that could be covered by the xsave_state. The + * instructions all take an argument which is a mask of what to consider. This + * is the same mask that will be used in the xstate_bv vector and it is also the + * same values that are present in %xcr0 and IA32_XSS. Though IA32_XSS is only + * considered with the xsaves and xrstors instructions. + * + * When a save or restore is requested, a bitwise and is performed between the + * requested bits and those that have been enabled in %xcr0. Only the bits that + * match that are then saved or restored. Others will be silently ignored by + * the processor. This idea is used often in the OS. We will always request that + * we save and restore all of the state, but only those portions that are + * actually enabled in %xcr0 will be touched. + * + * If a feature has been asked to be restored that is not set in the xstate_bv + * feature vector of the save state, then it will be set to its initial state by + * the processor (usually zeros). Also, when asked to save state, the processor + * may not write out data that is in its initial state as an optimization. This + * optimization only applies to saving data and not to restoring data. + * + * There are a few different variants of the xsave and xrstor instruction. They + * are: + * + * o xsave This is the original save instruction. It will save all of the + * requested data in the xsave state structure. It only saves data + * in the uncompressed (xcomp_bv[63] is zero) format. It may be + * executed at all privilege levels. + * + * o xrstor This is the original restore instruction. It will restore all of + * the requested data. The xrstor function can handle both the + * compressed and uncompressed formats. It may be executed at all + * privilege levels. + * + * o xsaveopt This is a variant of the xsave instruction that employs + * optimizations to try and only write out state that has been + * modified since the last time an xrstor instruction was called. + * The processor tracks a tuple of information about the last + * xrstor and tries to ensure that the same buffer is being used + * when this optimization is being used. However, because of the + * way that it tracks the xrstor buffer based on the address of it, + * it is not suitable for use if that buffer can be easily reused. + * The most common case is trying to save data to the stack in + * rtld. It may be executed at all privilege levels. 
+ * + * o xsavec This is a variant of the xsave instruction that writes out the + * compressed form of the xsave_state. Otherwise it behaves as + * xsave. It may be executed at all privilege levels. + * + * o xsaves This is a variant of the xsave instruction. It is similar to + * xsavec in that it always writes the compressed form of the + * buffer. Unlike all the other forms, this instruction looks at + * both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine + * what to save and restore. xsaves also implements the same + * optimization that xsaveopt does around modified pieces. User + * land may not execute the instruction. + * + * o xrstors This is a variant of the xrstor instruction. Similar to xsaves + * it can save and restore both the user and privileged states. + * Unlike xrstor it can only operate on the compressed form. + * User land may not execute the instruction. + * + * Based on all of these, the kernel has a precedence for what it will use. + * Basically, xsaves (not supported) is preferred to xsaveopt, which is + * preferred to xsave. A similar scheme is used when informing rtld (more later) + * about what it should use. xsavec is preferred to xsave. xsaveopt is not + * recommended due to the modified optimization not being appropriate for this + * use. + * + * Finally, there is one last gotcha with the xsave state. Importantly some AMD + * processors did not always save and restore some of the FPU exception state in + * some cases like Intel did. In those cases the OS will make up for this fact + * itself. + * + * FPU Initialization + * ------------------ + * + * One difference with the FPU registers is that not all threads have FPU state, + * only those that have an lwp. Generally this means kernel threads, which all + * share p0 and its lwp, do not have FPU state. Though there are definitely + * exceptions such as kcfpoold. In the rest of this discussion we'll use thread + * and lwp interchangeably, just think of thread meaning a thread that has a + * lwp. + * + * Each lwp has its FPU state allocated in its pcb (process control block). The + * actual storage comes from the fpsave_cachep kmem cache. This cache is sized + * dynamically at start up based on the save mechanism that we're using and the + * amount of memory required for it. This is dynamic because the xsave_state + * size varies based on the supported feature set. + * + * The hardware side of the FPU is initialized early in boot before we mount the + * root file system. This is effectively done in fpu_probe(). This is where we + * make the final decision about what the save and restore mechanisms we should + * use are, create the fpsave_cachep kmem cache, and initialize a number of + * function pointers that use save and restoring logic. + * + * The thread/lwp side is a a little more involved. There are two different + * things that we need to concern ourselves with. The first is how the FPU + * resources are allocated and the second is how the FPU state is initialized + * for a given lwp. + * + * We allocate the FPU save state from our kmem cache as part of lwp_fp_init(). + * This is always called unconditionally by the system as part of creating an + * LWP. + * + * There are three different initialization paths that we deal with. The first + * is when we are executing a new process. As part of exec all of the register + * state is reset. The exec case is particularly important because init is born + * like Athena, sprouting from the head of the kernel, without any true parent + * to fork from. 
The second is used whenever we fork or create a new lwp. The + * third is to deal with special lwps like the agent lwp. + * + * During exec, we will call fp_exec() which will initialize and set up the FPU + * state for the process. That will fill in the initial state for the FPU and + * also set that state in the FPU itself. As part of fp_exec() we also install a + * thread context operations vector that takes care of dealing with the saving + * and restoring of the FPU. These context handlers will also be called whenever + * an lwp is created or forked. In those cases, to initialize the FPU we will + * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context + * operations vector for the new thread. + * + * Next we'll end up in the context operation fp_new_lwp(). This saves the + * current thread's state, initializes the new thread's state, and copies over + * the relevant parts of the originating thread's state. It's as this point that + * we also install the FPU context operations into the new thread, which ensures + * that all future threads that are descendants of the current one get the + * thread context operations (unless they call exec). + * + * To deal with some things like the agent lwp, we double check the state of the + * FPU in sys_rtt_common() to make sure that it has been enabled before + * returning to user land. In general, this path should be rare, but it's useful + * for the odd lwp here and there. + * + * The FPU state will remain valid most of the time. There are times that + * the state will be rewritten. For example in restorecontext, due to /proc, or + * the lwp calls exec(). Whether the context is being freed or we are resetting + * the state, we will call fp_free() to disable the FPU and our context. + * + * Finally, when the lwp is destroyed, it will actually destroy and free the FPU + * state by calling fp_lwp_cleanup(). + * + * Kernel FPU Multiplexing + * ----------------------- + * + * Just as the kernel has to maintain all of the general purpose registers when + * switching between scheduled threads, the same is true of the FPU registers. + * + * When a thread has FPU state, it also has a set of context operations + * installed. These context operations take care of making sure that the FPU is + * properly saved and restored during a context switch (fpsave_ctxt and + * fprestore_ctxt respectively). This means that the current implementation of + * the FPU is 'eager', when a thread is running the CPU will have its FPU state + * loaded. While this is always true when executing in userland, there are a few + * cases where this is not true in the kernel. + * + * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was + * employed. This meant that the FPU would be saved on a context switch and the + * CR0.TS bit would be set. When a thread next tried to use the FPU, it would + * then take a #NM trap, at which point we would restore the FPU from the save + * area and return to user land. Given the frequency of use of the FPU alone by + * libc, there's no point returning to user land just to trap again. + * + * There are a few cases though where the FPU state may need to be changed for a + * thread on its behalf. The most notable cases are in the case of processes + * using /proc, restorecontext, forking, etc. In all of these cases the kernel + * will force a threads FPU state to be saved into the PCB through the fp_save() + * function. Whenever the FPU is saved, then the FPU_VALID flag is set on the + * pcb. 
This indicates that the save state holds currently valid data. As a side + * effect of this, CR0.TS will be set. To make sure that all of the state is + * updated before returning to user land, in these cases, we set a flag on the + * PCB that says the FPU needs to be updated. This will make sure that we take + * the slow path out of a system call to fix things up for the thread. Due to + * the fact that this is a rather rare case, effectively setting the equivalent + * of t_postsys is acceptable. + * + * CR0.TS will be set after a save occurs and cleared when a restore occurs. + * Generally this means it will be cleared immediately by the new thread that is + * running in a context switch. However, this isn't the case for kernel threads. + * They currently operate with CR0.TS set as no kernel state is restored for + * them. This means that using the FPU will cause a #NM and panic. + * + * The FPU_VALID flag on the currently executing thread's pcb is meant to track + * what the value of CR0.TS should be. If it is set, then CR0.TS will be set. + * However, because we eagerly restore, the only time that CR0.TS should be set + * for a non-kernel thread is during operations where it will be cleared before + * returning to user land and importantly, the only data that is in it is its + * own. + * + * Kernel FPU Usage + * ---------------- + * + * Traditionally the kernel never used the FPU since it had no need for + * floating point operations. However, modern FPU hardware supports a variety + * of SIMD extensions which can speed up code such as parity calculations or + * encryption. + * + * To allow the kernel to take advantage of these features, the + * kernel_fpu_begin() and kernel_fpu_end() functions should be wrapped + * around any usage of the FPU by the kernel to ensure that user-level context + * is properly saved/restored, as well as to properly setup the FPU for use by + * the kernel. There are a variety of ways this wrapping can be used, as + * discussed in this section below. + * + * When kernel_fpu_begin() and kernel_fpu_end() are used for extended + * operations, the kernel_fpu_alloc() function should be used to allocate a + * kfpu_state_t structure that is used to save/restore the thread's kernel FPU + * state. This structure is not tied to any thread. That is, different threads + * can reuse the same kfpu_state_t structure, although not concurrently. A + * kfpu_state_t structure is freed by the kernel_fpu_free() function. + * + * In some cases, the kernel may need to use the FPU for a short operation + * without the overhead to manage a kfpu_state_t structure and without + * allowing for a context switch off the FPU. In this case the KFPU_NO_STATE + * bit can be set in the kernel_fpu_begin() and kernel_fpu_end() flags + * parameter. This indicates that there is no kfpu_state_t. When used this way, + * kernel preemption should be disabled by the caller (kpreempt_disable) before + * calling kernel_fpu_begin(), and re-enabled after calling kernel_fpu_end(). + * For this usage, it is important to limit the kernel's FPU use to short + * operations. The tradeoff between using the FPU without a kfpu_state_t + * structure vs. the overhead of allowing a context switch while using the FPU + * should be carefully considered on a case by case basis. + * + * In other cases, kernel threads have an LWP, but never execute in user space. 
+ * In this situation, the LWP's pcb_fpu area can be used to save/restore the + * kernel's FPU state if the thread is context switched, instead of having to + * allocate and manage a kfpu_state_t structure. The KFPU_USE_LWP bit in the + * kernel_fpu_begin() and kernel_fpu_end() flags parameter is used to + * enable this behavior. It is the caller's responsibility to ensure that this + * is only used for a kernel thread which never executes in user space. + * + * FPU Exceptions + * -------------- + * + * Certain operations can cause the kernel to take traps due to FPU activity. + * Generally these events will cause a user process to receive a SIGFPU and if + * the kernel receives it in kernel context, we will die. Traditionally the #NM + * (Device Not Available / No Math) exception generated by CR0.TS would have + * caused us to restore the FPU. Now it is a fatal event regardless of whether + * or not user land causes it. + * + * While there are some cases where the kernel uses the FPU, it is up to the + * kernel to use the FPU in a way such that it cannot receive a trap or to use + * the appropriate trap protection mechanisms. + * + * Hypervisors + * ----------- + * + * When providing support for hypervisors things are a little bit more + * complicated because the FPU is not virtualized at all. This means that they + * need to save and restore the FPU and %xcr0 across entry and exit to the + * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These + * allow us to use the full native state to make sure that we are always saving + * and restoring the full FPU that the host sees, even when the guest is using a + * subset. + * + * One tricky aspect of this is that the guest may be using a subset of %xcr0 + * and therefore changing our %xcr0 on the fly. It is vital that when we're + * saving and restoring the FPU that we always use the largest %xcr0 contents + * otherwise we will end up leaving behind data in it. + * + * ELF PLT Support + * --------------- + * + * rtld has to preserve a subset of the FPU when it is saving and restoring + * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for + * more information. As a result, we set up an aux vector that contains + * information about what save and restore mechanisms it should be using and + * the sizing thereof based on what the kernel supports. This is passed down in + * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is + * initialized in fpu_subr.c. + */ + +kmem_cache_t *fpsave_cachep; + +/* Legacy fxsave layout + xsave header + ymm */ +#define AVX_XSAVE_SIZE (512 + 64 + 256) + +/* + * Various sanity checks. + */ +CTASSERT(sizeof (struct fxsave_state) == 512); +CTASSERT(sizeof (struct fnsave_state) == 108); +CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0); +CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE); + +/* + * This structure is the x86 implementation of the kernel FPU that is defined in + * uts/common/sys/kfpu.h. + */ + +typedef enum kfpu_flags { + /* + * This indicates that the save state has initial FPU data. 
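 *
 * (Editorial note; not part of this commit.) This flag is set by
 * kernel_fpu_fpstate_init() and cleared by kernel_fpu_ctx_save() once
 * the pristine image has been overwritten; that is what lets
 * kernel_fpu_begin() lazily re-seed a reused kfpu_state_t:
 *
 *	if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0)
 *		kernel_fpu_fpstate_init(kfpu);
 *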
+ */ + KFPU_F_INITIALIZED = 0x01 +} kfpu_flags_t; + +struct kfpu_state { + fpu_ctx_t kfpu_ctx; + kfpu_flags_t kfpu_flags; + kthread_t *kfpu_curthread; +}; + +/* + * Initial kfpu state for SSE/SSE2 used by fpinit() + */ +const struct fxsave_state sse_initial = { + FPU_CW_INIT, /* fx_fcw */ + 0, /* fx_fsw */ + 0, /* fx_fctw */ + 0, /* fx_fop */ + 0, /* fx_rip */ + 0, /* fx_rdp */ + SSE_MXCSR_INIT /* fx_mxcsr */ + /* rest of structure is zero */ +}; + +/* + * Initial kfpu state for AVX used by fpinit() + */ +const struct xsave_state avx_initial = { + /* + * The definition below needs to be identical with sse_initial + * defined above. + */ + { + FPU_CW_INIT, /* fx_fcw */ + 0, /* fx_fsw */ + 0, /* fx_fctw */ + 0, /* fx_fop */ + 0, /* fx_rip */ + 0, /* fx_rdp */ + SSE_MXCSR_INIT /* fx_mxcsr */ + /* rest of structure is zero */ + }, + /* + * bit0 = 1 for XSTATE_BV to indicate that legacy fields are valid, + * and CPU should initialize XMM/YMM. + */ + 1, + 0 /* xs_xcomp_bv */ + /* rest of structure is zero */ +}; + +/* + * mxcsr_mask value (possibly reset in fpu_probe); used to avoid + * the #gp exception caused by setting unsupported bits in the + * MXCSR register + */ +uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT; + +/* + * Initial kfpu state for x87 used by fpinit() + */ +const struct fnsave_state x87_initial = { + FPU_CW_INIT, /* f_fcw */ + 0, /* __f_ign0 */ + 0, /* f_fsw */ + 0, /* __f_ign1 */ + 0xffff, /* f_ftw */ + /* rest of structure is zero */ +}; + +/* + * This vector is patched to xsave_ctxt() or xsaveopt_ctxt() if we discover we + * have an XSAVE-capable chip in fpu_probe. + */ +void (*fpsave_ctxt)(void *) = fpxsave_ctxt; +void (*fprestore_ctxt)(void *) = fpxrestore_ctxt; + +/* + * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable. + */ +void (*xsavep)(struct xsave_state *, uint64_t) = xsave; + +static int fpe_sicode(uint_t); +static int fpe_simd_sicode(uint_t); + +/* + * Copy the state of parent lwp's floating point context into the new lwp. + * Invoked for both fork() and lwp_create(). + * + * Note that we inherit -only- the control state (e.g. exception masks, + * rounding, precision control, etc.); the FPU registers are otherwise + * reset to their initial state. + */ +static void +fp_new_lwp(kthread_id_t t, kthread_id_t ct) +{ + struct fpu_ctx *fp; /* parent fpu context */ + struct fpu_ctx *cfp; /* new fpu context */ + struct fxsave_state *fx, *cfx; + struct xsave_state *cxs; + + ASSERT(fp_kind != FP_NO); + + fp = &t->t_lwp->lwp_pcb.pcb_fpu; + cfp = &ct->t_lwp->lwp_pcb.pcb_fpu; + + /* + * If the parent FPU state is still in the FPU hw then save it; + * conveniently, fp_save() already does this for us nicely. + */ + fp_save(fp); + + cfp->fpu_flags = FPU_EN | FPU_VALID; + cfp->fpu_regs.kfpu_status = 0; + cfp->fpu_regs.kfpu_xstatus = 0; + + /* + * Make sure that the child's FPU is cleaned up and made ready for user + * land. 
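 *
 * (Editorial note; not part of this commit.) "Control state only" means
 * the child keeps the parent's rounding, precision and exception-mask
 * settings but none of its sticky exception flags or register contents,
 * which is why the per-mechanism copies below mask MXCSR with
 * ~SSE_MXCSR_EFLAGS:
 *
 *	cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
 *	cfx->fx_fcw = fx->fx_fcw;
 *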
+ */ + PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb); + + switch (fp_save_mech) { + case FP_FXSAVE: + fx = fp->fpu_regs.kfpu_u.kfpu_fx; + cfx = cfp->fpu_regs.kfpu_u.kfpu_fx; + bcopy(&sse_initial, cfx, sizeof (*cfx)); + cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS; + cfx->fx_fcw = fx->fx_fcw; + break; + + case FP_XSAVE: + cfp->fpu_xsave_mask = fp->fpu_xsave_mask; + + VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL); + + fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave; + cxs = cfp->fpu_regs.kfpu_u.kfpu_xs; + cfx = &cxs->xs_fxsave; + + bcopy(&avx_initial, cxs, sizeof (*cxs)); + cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS; + cfx->fx_fcw = fx->fx_fcw; + cxs->xs_xstate_bv |= (get_xcr(XFEATURE_ENABLED_MASK) & + XFEATURE_FP_INITIAL); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } + + /* + * Mark that both the parent and child need to have the FPU cleaned up + * before returning to user land. + */ + + installctx(ct, cfp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp, + fp_new_lwp, NULL, fp_free, NULL); +} + +/* + * Free any state associated with floating point context. + * Fp_free can be called in three cases: + * 1) from reaper -> thread_free -> freectx-> fp_free + * fp context belongs to a thread on deathrow + * nothing to do, thread will never be resumed + * thread calling ctxfree is reaper + * + * 2) from exec -> freectx -> fp_free + * fp context belongs to the current thread + * must disable fpu, thread calling ctxfree is curthread + * + * 3) from restorecontext -> setfpregs -> fp_free + * we have a modified context in the memory (lwp->pcb_fpu) + * disable fpu and release the fp context for the CPU + * + */ +/*ARGSUSED*/ +void +fp_free(struct fpu_ctx *fp, int isexec) +{ + ASSERT(fp_kind != FP_NO); + + if (fp->fpu_flags & FPU_VALID) + return; + + kpreempt_disable(); + /* + * We want to do fpsave rather than fpdisable so that we can + * keep the fpu_flags as FPU_VALID tracking the CR0_TS bit + */ + fp->fpu_flags |= FPU_VALID; + /* If for current thread disable FP to track FPU_VALID */ + if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) { + /* Clear errors if any to prevent frstor from complaining */ + (void) fperr_reset(); + if (fp_kind & __FP_SSE) + (void) fpxerr_reset(); + fpdisable(); + } + kpreempt_enable(); +} + +/* + * Store the floating point state and disable the floating point unit. + */ +void +fp_save(struct fpu_ctx *fp) +{ + ASSERT(fp_kind != FP_NO); + + kpreempt_disable(); + if (!fp || fp->fpu_flags & FPU_VALID || + (fp->fpu_flags & FPU_EN) == 0) { + kpreempt_enable(); + return; + } + ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu); + + switch (fp_save_mech) { + case FP_FXSAVE: + fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx); + break; + + case FP_XSAVE: + xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } + + fp->fpu_flags |= FPU_VALID; + + /* + * We save the FPU as part of forking, execing, modifications via /proc, + * restorecontext, etc. As such, we need to make sure that we return to + * userland with valid state in the FPU. If we're context switched out + * before we hit sys_rtt_common() we'll end up having restored the FPU + * as part of the context ops operations. The restore logic always makes + * sure that FPU_VALID is set before doing a restore so we don't restore + * it a second time. + */ + PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb); + + kpreempt_enable(); +} + +/* + * Restore the FPU context for the thread: + * The possibilities are: + * 1. 
No active FPU context: Load the new context into the FPU hw + * and enable the FPU. + */ +void +fp_restore(struct fpu_ctx *fp) +{ + switch (fp_save_mech) { + case FP_FXSAVE: + fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx); + break; + + case FP_XSAVE: + xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } + + fp->fpu_flags &= ~FPU_VALID; +} + +/* + * Reset the FPU such that it is in a valid state for a new thread that is + * coming out of exec. The FPU will be in a usable state at this point. At this + * point we know that the FPU state has already been allocated and if this + * wasn't an init process, then it will have had fp_free() previously called. + */ +void +fp_exec(void) +{ + struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; + struct ctxop *ctx = installctx_preallocate(); + + if (fp_save_mech == FP_XSAVE) { + fp->fpu_xsave_mask = XFEATURE_FP_ALL; + } + + /* + * Make sure that we're not preempted in the middle of initializing the + * FPU on CPU. + */ + kpreempt_disable(); + installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp, + fp_new_lwp, NULL, fp_free, ctx); + fpinit(); + fp->fpu_flags = FPU_EN; + kpreempt_enable(); +} + + +/* + * Seeds the initial state for the current thread. The possibilities are: + * 1. Another process has modified the FPU state before we have done any + * initialization: Load the FPU state from the LWP state. + * 2. The FPU state has not been externally modified: Load a clean state. + */ +void +fp_seed(void) +{ + struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; + + ASSERT(curthread->t_preempt >= 1); + ASSERT((fp->fpu_flags & FPU_EN) == 0); + + /* + * Always initialize a new context and initialize the hardware. + */ + if (fp_save_mech == FP_XSAVE) { + fp->fpu_xsave_mask = XFEATURE_FP_ALL; + } + + installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp, + fp_new_lwp, NULL, fp_free, NULL); + fpinit(); + + /* + * If FPU_VALID is set, it means someone has modified registers via + * /proc. In this case, restore the current lwp's state. + */ + if (fp->fpu_flags & FPU_VALID) + fp_restore(fp); + + ASSERT((fp->fpu_flags & FPU_VALID) == 0); + fp->fpu_flags = FPU_EN; +} + +/* + * When using xsave/xrstor, these three functions are used by the lwp code to + * manage the memory for the xsave area. + */ +void +fp_lwp_init(struct _klwp *lwp) +{ + struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu; + + /* + * We keep a copy of the pointer in lwp_fpu so that we can restore the + * value in forklwp() after we duplicate the parent's LWP state. + */ + lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = + kmem_cache_alloc(fpsave_cachep, KM_SLEEP); + + if (fp_save_mech == FP_XSAVE) { + /* + * + * We bzero since the fpinit() code path will only + * partially initialize the xsave area using avx_inital. + */ + ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state)); + bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size()); + } +} + +void +fp_lwp_cleanup(struct _klwp *lwp) +{ + struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu; + + if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) { + kmem_cache_free(fpsave_cachep, + fp->fpu_regs.kfpu_u.kfpu_generic); + lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL; + } +} + +/* + * Called during the process of forklwp(). The kfpu_u pointer will have been + * overwritten while copying the parent's LWP structure. We have a valid copy + * stashed in the child's lwp_fpu which we use to restore the correct value. 
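 *
 * (Editorial illustration; not part of this commit.) In forklwp() terms
 * the expected ordering is roughly:
 *
 *	fp_lwp_init(clwp);	buffer allocated, pointer stashed in lwp_fpu
 *	(parent pcb copied)	kfpu_u.kfpu_generic now aliases the parent's
 *	fp_lwp_dup(clwp);	parent state copied in, pointer repaired
 *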
+ */ +void +fp_lwp_dup(struct _klwp *lwp) +{ + void *xp = lwp->lwp_fpu; + size_t sz; + + switch (fp_save_mech) { + case FP_FXSAVE: + sz = sizeof (struct fxsave_state); + break; + case FP_XSAVE: + sz = cpuid_get_xsave_size(); + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } + + /* copy the parent's values into the new lwp's struct */ + bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz); + /* now restore the pointer */ + lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp; +} + +/* + * Handle a processor extension error fault + * Returns non zero for error. + */ + +/*ARGSUSED*/ +int +fpexterrflt(struct regs *rp) +{ + uint32_t fpcw, fpsw; + fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; + + ASSERT(fp_kind != FP_NO); + + /* + * Now we can enable the interrupts. + * (NOTE: x87 fp exceptions come thru interrupt gate) + */ + sti(); + + if (!fpu_exists) + return (FPE_FLTINV); + + /* + * Do an unconditional save of the FP state. If it's dirty (TS=0), + * it'll be saved into the fpu context area passed in (that of the + * current thread). If it's not dirty (it may not be, due to + * an intervening save due to a context switch between the sti(), + * above and here, then it's safe to just use the stored values in + * the context save area to determine the cause of the fault. + */ + fp_save(fp); + + /* clear exception flags in saved state, as if by fnclex */ + switch (fp_save_mech) { + case FP_FXSAVE: + fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw; + fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw; + fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS; + break; + + case FP_XSAVE: + fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw; + fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw; + fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS; + /* + * Always set LEGACY_FP as it may have been cleared by XSAVE + * instruction + */ + fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP; + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } + + fp->fpu_regs.kfpu_status = fpsw; + + if ((fpsw & FPS_ES) == 0) + return (0); /* No exception */ + + /* + * "and" the exception flags with the complement of the mask + * bits to determine which exception occurred + */ + return (fpe_sicode(fpsw & ~fpcw & 0x3f)); +} + +/* + * Handle an SSE/SSE2 precise exception. + * Returns a non-zero sicode for error. + */ +/*ARGSUSED*/ +int +fpsimderrflt(struct regs *rp) +{ + uint32_t mxcsr, xmask; + fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu; + + ASSERT(fp_kind & __FP_SSE); + + /* + * NOTE: Interrupts are disabled during execution of this + * function. They are enabled by the caller in trap.c. + */ + + /* + * The only way we could have gotten here if there is no FP unit + * is via a user executing an INT $19 instruction, so there is + * no fault in that case. + */ + if (!fpu_exists) + return (0); + + /* + * Do an unconditional save of the FP state. If it's dirty (TS=0), + * it'll be saved into the fpu context area passed in (that of the + * current thread). If it's not dirty, then it's safe to just use + * the stored values in the context save area to determine the + * cause of the fault. 
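 *
 * (Editorial worked example; not part of this commit.) MXCSR keeps the
 * exception flags in bits 0-5 and the corresponding mask bits in bits
 * 7-12, so shifting the masks right by 7 lines them up with the flags.
 * An unmasked divide-by-zero (ZE, bit 2, set; ZM, bit 9, clear) gives:
 *
 *	xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;	ZE clear in xmask
 *	(mxcsr & SSE_MXCSR_EFLAGS) & ~xmask		== SSE_ZE
 *
 * which fpe_simd_sicode() maps to FPE_FLTDIV.
 *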
+ */ + fp_save(fp); /* save the FPU state */ + + if (fp_save_mech == FP_XSAVE) { + mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr; + fp->fpu_regs.kfpu_status = + fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw; + } else { + mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr; + fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw; + } + fp->fpu_regs.kfpu_xstatus = mxcsr; + + /* + * compute the mask that determines which conditions can cause + * a #xm exception, and use this to clean the status bits so that + * we can identify the true cause of this one. + */ + xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS; + return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask)); +} + +/* + * In the unlikely event that someone is relying on this subcode being + * FPE_FLTILL for denormalize exceptions, it can always be patched back + * again to restore old behaviour. + */ +int fpe_fltden = FPE_FLTDEN; + +/* + * Map from the FPU status word to the FP exception si_code. + */ +static int +fpe_sicode(uint_t sw) +{ + if (sw & FPS_IE) + return (FPE_FLTINV); + if (sw & FPS_ZE) + return (FPE_FLTDIV); + if (sw & FPS_DE) + return (fpe_fltden); + if (sw & FPS_OE) + return (FPE_FLTOVF); + if (sw & FPS_UE) + return (FPE_FLTUND); + if (sw & FPS_PE) + return (FPE_FLTRES); + return (FPE_FLTINV); /* default si_code for other exceptions */ +} + +/* + * Map from the SSE status word to the FP exception si_code. + */ +static int +fpe_simd_sicode(uint_t sw) +{ + if (sw & SSE_IE) + return (FPE_FLTINV); + if (sw & SSE_ZE) + return (FPE_FLTDIV); + if (sw & SSE_DE) + return (FPE_FLTDEN); + if (sw & SSE_OE) + return (FPE_FLTOVF); + if (sw & SSE_UE) + return (FPE_FLTUND); + if (sw & SSE_PE) + return (FPE_FLTRES); + return (FPE_FLTINV); /* default si_code for other exceptions */ +} + +/* + * This routine is invoked as part of libc's __fpstart implementation + * via sysi86(2). + * + * It may be called -before- any context has been assigned in which case + * we try and avoid touching the hardware. Or it may be invoked well + * after the context has been assigned and fiddled with, in which case + * just tweak it directly. + */ +void +fpsetcw(uint16_t fcw, uint32_t mxcsr) +{ + struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu; + struct fxsave_state *fx; + + if (!fpu_exists || fp_kind == FP_NO) + return; + + if ((fp->fpu_flags & FPU_EN) == 0) { + if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) { + /* + * Common case. Floating point unit not yet + * enabled, and kernel already intends to initialize + * the hardware the way the caller wants. + */ + return; + } + /* + * Hmm. Userland wants a different default. + * Do a fake "first trap" to establish the context, then + * handle as if we already had a context before we came in. + */ + kpreempt_disable(); + fp_seed(); + kpreempt_enable(); + } + + /* + * Ensure that the current hardware state is flushed back to the + * pcb, then modify that copy. Next use of the fp will + * restore the context. 
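 *
 * (Editorial note; not part of this commit.) This is the usual
 * save-then-edit flow: fp_save() parks the live registers in the pcb
 * and arranges (via PCB_SET_UPDATE_FPU) for a reload on return to
 * userland, and the new control words are applied to the in-memory
 * image. The sse_mxcsr_mask masking below keeps unsupported MXCSR bits
 * from being set, which would otherwise #gp on the eventual restore:
 *
 *	fx->fx_fcw = fcw;
 *	fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
 *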
+ */ + fp_save(fp); + + switch (fp_save_mech) { + case FP_FXSAVE: + fx = fp->fpu_regs.kfpu_u.kfpu_fx; + fx->fx_fcw = fcw; + fx->fx_mxcsr = sse_mxcsr_mask & mxcsr; + break; + + case FP_XSAVE: + fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave; + fx->fx_fcw = fcw; + fx->fx_mxcsr = sse_mxcsr_mask & mxcsr; + /* + * Always set LEGACY_FP as it may have been cleared by XSAVE + * instruction + */ + fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP; + break; + default: + panic("Invalid fp_save_mech"); + /*NOTREACHED*/ + } +} + +static void +kernel_fpu_fpstate_init(kfpu_state_t *kfpu) +{ + struct xsave_state *xs; + + switch (fp_save_mech) { + case FP_FXSAVE: + bcopy(&sse_initial, kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_fx, + sizeof (struct fxsave_state)); + kfpu->kfpu_ctx.fpu_xsave_mask = 0; + break; + case FP_XSAVE: + xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs; + bzero(xs, cpuid_get_xsave_size()); + bcopy(&avx_initial, xs, sizeof (*xs)); + xs->xs_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; + kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL; + break; + default: + panic("invalid fp_save_mech"); + } + + /* + * Set the corresponding flags that the system expects on the FPU state + * to indicate that this is our state. The FPU_EN flag is required to + * indicate that FPU usage is allowed. The FPU_KERN flag is explicitly + * not set below as it represents that this state is being suppressed + * by the kernel. + */ + kfpu->kfpu_ctx.fpu_flags = FPU_EN | FPU_VALID; + kfpu->kfpu_flags |= KFPU_F_INITIALIZED; +} + +kfpu_state_t * +kernel_fpu_alloc(int kmflags) +{ + kfpu_state_t *kfpu; + + if ((kfpu = kmem_zalloc(sizeof (kfpu_state_t), kmflags)) == NULL) { + return (NULL); + } + + kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic = + kmem_cache_alloc(fpsave_cachep, kmflags); + if (kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic == NULL) { + kmem_free(kfpu, sizeof (kfpu_state_t)); + return (NULL); + } + + kernel_fpu_fpstate_init(kfpu); + + return (kfpu); +} + +void +kernel_fpu_free(kfpu_state_t *kfpu) +{ + kmem_cache_free(fpsave_cachep, + kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_generic); + kmem_free(kfpu, sizeof (kfpu_state_t)); +} + +static void +kernel_fpu_ctx_save(void *arg) +{ + kfpu_state_t *kfpu = arg; + fpu_ctx_t *pf; + + if (kfpu == NULL) { + /* + * A NULL kfpu implies this is a kernel thread with an LWP and + * no user-level FPU usage. Use the lwp fpu save area. + */ + pf = &curthread->t_lwp->lwp_pcb.pcb_fpu; + + ASSERT(curthread->t_procp->p_flag & SSYS); + ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0); + + fp_save(pf); + } else { + pf = &kfpu->kfpu_ctx; + + ASSERT3P(kfpu->kfpu_curthread, ==, curthread); + ASSERT3U(pf->fpu_flags & FPU_VALID, ==, 0); + + /* + * Note, we can't use fp_save because it assumes that we're + * saving to the thread's PCB and not somewhere else. Because + * this is a different FPU context, we instead have to do this + * ourselves. + */ + switch (fp_save_mech) { + case FP_FXSAVE: + fpxsave(pf->fpu_regs.kfpu_u.kfpu_fx); + break; + case FP_XSAVE: + xsavep(pf->fpu_regs.kfpu_u.kfpu_xs, pf->fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + } + + /* + * Because we have saved context here, our save state is no + * longer valid and therefore needs to be reinitialized. + */ + kfpu->kfpu_flags &= ~KFPU_F_INITIALIZED; + } + + pf->fpu_flags |= FPU_VALID; + + /* + * Clear KFPU flag. This allows swtch to check for improper kernel + * usage of the FPU (i.e. 
switching to a new thread while the old + * thread was in the kernel and using the FPU, but did not perform a + * context save). + */ + curthread->t_flag &= ~T_KFPU; +} + +static void +kernel_fpu_ctx_restore(void *arg) +{ + kfpu_state_t *kfpu = arg; + fpu_ctx_t *pf; + + if (kfpu == NULL) { + /* + * A NULL kfpu implies this is a kernel thread with an LWP and + * no user-level FPU usage. Use the lwp fpu save area. + */ + pf = &curthread->t_lwp->lwp_pcb.pcb_fpu; + + ASSERT(curthread->t_procp->p_flag & SSYS); + ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0); + } else { + pf = &kfpu->kfpu_ctx; + + ASSERT3P(kfpu->kfpu_curthread, ==, curthread); + ASSERT3U(pf->fpu_flags & FPU_VALID, !=, 0); + } + + fp_restore(pf); + curthread->t_flag |= T_KFPU; +} + +/* + * Validate that the thread is not switching off-cpu while actively using the + * FPU within the kernel. + */ +void +kernel_fpu_no_swtch(void) +{ + if ((curthread->t_flag & T_KFPU) != 0) { + panic("curthread swtch-ing while the kernel is using the FPU"); + } +} + +void +kernel_fpu_begin(kfpu_state_t *kfpu, uint_t flags) +{ + klwp_t *pl = curthread->t_lwp; + struct ctxop *ctx; + + if ((curthread->t_flag & T_KFPU) != 0) { + panic("curthread attempting to nest kernel FPU states"); + } + + /* KFPU_USE_LWP and KFPU_NO_STATE are mutually exclusive. */ + ASSERT((flags & (KFPU_USE_LWP | KFPU_NO_STATE)) != + (KFPU_USE_LWP | KFPU_NO_STATE)); + + if ((flags & KFPU_NO_STATE) == KFPU_NO_STATE) { + /* + * Since we don't have a kfpu_state or usable lwp pcb_fpu to + * hold our kernel FPU context, we depend on the caller doing + * kpreempt_disable for the duration of our FPU usage. This + * should only be done for very short periods of time. + */ + ASSERT(curthread->t_preempt > 0); + ASSERT(kfpu == NULL); + + if (pl != NULL) { + /* + * We might have already saved once so FPU_VALID could + * be set. This is handled in fp_save. + */ + fp_save(&pl->lwp_pcb.pcb_fpu); + pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL; + } + + curthread->t_flag |= T_KFPU; + + /* Always restore the fpu to the initial state. */ + fpinit(); + + return; + } + + /* + * We either have a kfpu, or are using the LWP pcb_fpu for context ops. + */ + + if ((flags & KFPU_USE_LWP) == 0) { + if (kfpu->kfpu_curthread != NULL) + panic("attempting to reuse kernel FPU state at %p when " + "another thread already is using", kfpu); + + if ((kfpu->kfpu_flags & KFPU_F_INITIALIZED) == 0) + kernel_fpu_fpstate_init(kfpu); + + kfpu->kfpu_curthread = curthread; + } + + /* + * Not all threads may have an active LWP. If they do and we're not + * going to re-use the LWP, then we should go ahead and save the state. + * We must also note that the fpu is now being used by the kernel and + * therefore we do not want to manage the fpu state via the user-level + * thread's context handlers. + * + * We might have already saved once (due to a prior use of the kernel + * FPU or another code path) so FPU_VALID could be set. This is handled + * by fp_save, as is the FPU_EN check. + */ + ctx = installctx_preallocate(); + kpreempt_disable(); + if (pl != NULL) { + if ((flags & KFPU_USE_LWP) == 0) + fp_save(&pl->lwp_pcb.pcb_fpu); + pl->lwp_pcb.pcb_fpu.fpu_flags |= FPU_KERNEL; + } + + /* + * Set the context operations for kernel FPU usage. Note that this is + * done with a preallocated buffer and under kpreempt_disable because + * without a preallocated buffer, installctx does a sleeping + * allocation. 
We haven't finished initializing our kernel FPU state + * yet, and in the rare case that we happen to save/restore just as + * installctx() exits its own kpreempt_enable() internal call, we + * guard against restoring an uninitialized buffer (0xbaddcafe). + */ + installctx(curthread, kfpu, kernel_fpu_ctx_save, kernel_fpu_ctx_restore, + NULL, NULL, NULL, NULL, ctx); + + curthread->t_flag |= T_KFPU; + + if ((flags & KFPU_USE_LWP) == KFPU_USE_LWP) { + /* + * For pure kernel threads with an LWP, we can use the LWP's + * pcb_fpu to save/restore context. + */ + fpu_ctx_t *pf = &pl->lwp_pcb.pcb_fpu; + + VERIFY(curthread->t_procp->p_flag & SSYS); + VERIFY(kfpu == NULL); + ASSERT((pf->fpu_flags & FPU_EN) == 0); + + /* Always restore the fpu to the initial state. */ + if (fp_save_mech == FP_XSAVE) + pf->fpu_xsave_mask = XFEATURE_FP_ALL; + fpinit(); + pf->fpu_flags = FPU_EN | FPU_KERNEL; + } else { + /* initialize the kfpu state */ + kernel_fpu_ctx_restore(kfpu); + } + kpreempt_enable(); +} + +void +kernel_fpu_end(kfpu_state_t *kfpu, uint_t flags) +{ + ulong_t iflags; + + if ((curthread->t_flag & T_KFPU) == 0) { + panic("curthread attempting to clear kernel FPU state " + "without using it"); + } + + /* + * General comments on why the rest of this function is structured the + * way it is. Be aware that there is a lot of subtlety here. + * + * If a user-level thread ever uses the fpu while in the kernel, then + * we cannot call fpdisable since that does STTS. That will set the + * ts bit in %cr0 which will cause an exception if anything touches the + * fpu. However, the user-level context switch handler (fpsave_ctxt) + * needs to access the fpu to save the registers into the pcb. + * fpsave_ctxt relies on CLTS having been done to clear the ts bit in + * fprestore_ctxt when the thread context switched onto the CPU. + * + * Calling fpdisable only effects the current CPU's %cr0 register. + * + * During removectx and kpreempt_enable, we can voluntarily context + * switch, so the CPU we were on when we entered this function might + * not be the same one we're on when we return from removectx or end + * the function. Note there can be user-level context switch handlers + * still installed if this is a user-level thread. + * + * We also must be careful in the unlikely chance we're running in an + * interrupt thread, since we can't leave the CPU's %cr0 TS state set + * incorrectly for the "real" thread to resume on this CPU. + */ + + if ((flags & KFPU_NO_STATE) == 0) { + kpreempt_disable(); + } else { + ASSERT(curthread->t_preempt > 0); + } + + curthread->t_flag &= ~T_KFPU; + + /* + * When we are ending things, we explicitly don't save the current + * kernel FPU state back to the temporary state. The kfpu API is not + * intended to be a permanent save location. + * + * If this is a user-level thread and we were to context switch + * before returning to user-land, fpsave_ctxt will be a no-op since we + * already saved the user-level FPU state the first time we run + * kernel_fpu_begin (i.e. we won't save the bad kernel fpu state over + * the user-level fpu state). The fpsave_ctxt functions only save if + * FPU_VALID is not already set. fp_save also set PCB_SET_UPDATE_FPU so + * fprestore_ctxt will be done in sys_rtt_common when the thread + * finally returns to user-land. + */ + + if ((curthread->t_procp->p_flag & SSYS) != 0 && + curthread->t_intr == NULL) { + /* + * A kernel thread which is not an interrupt thread, so we + * STTS now. 
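Taken together, the kernel FPU API above (kernel_fpu_alloc/begin/end/free) supports two usage patterns. A minimal sketch of a hypothetical in-kernel consumer follows; the allocation flag and the work inside the brackets are placeholders, not part of this change:

	/*
	 * Pattern 1: preallocated state; the context ops installed by
	 * kernel_fpu_begin() save and restore it across switches.
	 */
	kfpu_state_t *kf = kernel_fpu_alloc(KM_SLEEP);
	if (kf != NULL) {
		kernel_fpu_begin(kf, 0);
		/* ... SIMD work here ... */
		kernel_fpu_end(kf, 0);
		kernel_fpu_free(kf);
	}

	/*
	 * Pattern 2: no saved state; the caller must already hold
	 * kpreempt_disable() and keep the window very short.
	 */
	kpreempt_disable();
	kernel_fpu_begin(NULL, KFPU_NO_STATE);
	/* ... brief FPU use, no blocking, no voluntary switch ... */
	kernel_fpu_end(NULL, KFPU_NO_STATE);
	kpreempt_enable();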
+ */ + fpdisable(); + } + + if ((flags & KFPU_NO_STATE) == 0) { + removectx(curthread, kfpu, kernel_fpu_ctx_save, + kernel_fpu_ctx_restore, NULL, NULL, NULL, NULL); + + if (kfpu != NULL) { + if (kfpu->kfpu_curthread != curthread) { + panic("attempting to end kernel FPU state " + "for %p, but active thread is not " + "curthread", kfpu); + } else { + kfpu->kfpu_curthread = NULL; + } + } + + kpreempt_enable(); + } + + if (curthread->t_lwp != NULL) { + uint_t f; + + if (flags & KFPU_USE_LWP) { + f = FPU_EN | FPU_KERNEL; + } else { + f = FPU_KERNEL; + } + curthread->t_lwp->lwp_pcb.pcb_fpu.fpu_flags &= ~f; + } +} diff --git a/usr/src/uts/intel/os/sendsig.c b/usr/src/uts/intel/os/sendsig.c new file mode 100644 index 0000000000..e3d60eb62b --- /dev/null +++ b/usr/src/uts/intel/os/sendsig.c @@ -0,0 +1,589 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ +/* All Rights Reserved */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/signal.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/mman.h> +#include <sys/class.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/buf.h> +#include <sys/kmem.h> +#include <sys/cred.h> +#include <sys/archsystm.h> +#include <sys/vmparam.h> +#include <sys/prsystm.h> +#include <sys/reboot.h> +#include <sys/uadmin.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/session.h> +#include <sys/ucontext.h> +#include <sys/dnlc.h> +#include <sys/var.h> +#include <sys/cmn_err.h> +#include <sys/debugreg.h> +#include <sys/thread.h> +#include <sys/vtrace.h> +#include <sys/consdev.h> +#include <sys/psw.h> +#include <sys/regset.h> + +#include <sys/privregs.h> + +#include <sys/stack.h> +#include <sys/swap.h> +#include <vm/hat.h> +#include <vm/anon.h> +#include <vm/as.h> +#include <vm/page.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> +#include <vm/seg_map.h> +#include <vm/seg_vn.h> +#include <sys/exec.h> +#include <sys/acct.h> +#include <sys/core.h> +#include <sys/corectl.h> +#include <sys/modctl.h> +#include <sys/tuneable.h> +#include <c2/audit.h> +#include <sys/bootconf.h> +#include <sys/dumphdr.h> +#include <sys/promif.h> +#include <sys/systeminfo.h> +#include <sys/kdi.h> +#include <sys/contract_impl.h> +#include <sys/x86_archext.h> + +/* + * Construct the execution environment for the user's signal + * handler and arrange for control to be given to it on return + * to userland. 
The library code now calls setcontext() to + * clean up after the signal handler, so sigret() is no longer + * needed. + * + * (The various 'volatile' declarations are need to ensure that values + * are correct on the error return from on_fault().) + */ + + +/* + * An amd64 signal frame looks like this on the stack: + * + * old %rsp: + * <128 bytes of untouched stack space> + * <a siginfo_t [optional]> + * <a ucontext_t> + * <siginfo_t *> + * <signal number> + * new %rsp: <return address (deliberately invalid)> + * + * The signal number and siginfo_t pointer are only pushed onto the stack in + * order to allow stack backtraces. The actual signal handling code expects the + * arguments in registers. + */ + +struct sigframe { + caddr_t retaddr; + long signo; + siginfo_t *sip; +}; + +int +sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) +{ + volatile int minstacksz; + int newstack; + label_t ljb; + volatile caddr_t sp; + caddr_t fp; + volatile struct regs *rp; + volatile greg_t upc; + volatile proc_t *p = ttoproc(curthread); + struct as *as = p->p_as; + klwp_t *lwp = ttolwp(curthread); + ucontext_t *volatile tuc = NULL; + ucontext_t *uc; + siginfo_t *sip_addr; + volatile int watched; + + /* + * This routine is utterly dependent upon STACK_ALIGN being + * 16 and STACK_ENTRY_ALIGN being 8. Let's just acknowledge + * that and require it. + */ + +#if STACK_ALIGN != 16 || STACK_ENTRY_ALIGN != 8 +#error "sendsig() amd64 did not find the expected stack alignments" +#endif + + rp = lwptoregs(lwp); + upc = rp->r_pc; + + /* + * Since we're setting up to run the signal handler we have to + * arrange that the stack at entry to the handler is (only) + * STACK_ENTRY_ALIGN (i.e. 8) byte aligned so that when the handler + * executes its push of %rbp, the stack realigns to STACK_ALIGN + * (i.e. 16) correctly. + * + * The new sp will point to the sigframe and the ucontext_t. The + * above means that sp (and thus sigframe) will be 8-byte aligned, + * but not 16-byte aligned. ucontext_t, however, contains %xmm regs + * which must be 16-byte aligned. Because of this, for correct + * alignment, sigframe must be a multiple of 8-bytes in length, but + * not 16-bytes. This will place ucontext_t at a nice 16-byte boundary. + */ + + /* LINTED: logical expression always true: op "||" */ + ASSERT((sizeof (struct sigframe) % 16) == 8); + + minstacksz = sizeof (struct sigframe) + SA(sizeof (*uc)); + if (sip != NULL) + minstacksz += SA(sizeof (siginfo_t)); + ASSERT((minstacksz & (STACK_ENTRY_ALIGN - 1ul)) == 0); + + /* + * Figure out whether we will be handling this signal on + * an alternate stack specified by the user. Then allocate + * and validate the stack requirements for the signal handler + * context. on_fault will catch any faults. + */ + newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) && + !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE)); + + if (newstack) { + fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) + + SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN); + } else { + /* + * Drop below the 128-byte reserved region of the stack frame + * we're interrupting. + */ + fp = (caddr_t)rp->r_sp - STACK_RESERVE; + } + + /* + * Force proper stack pointer alignment, even in the face of a + * misaligned stack pointer from user-level before the signal. + */ + fp = (caddr_t)((uintptr_t)fp & ~(STACK_ENTRY_ALIGN - 1ul)); + + /* + * Most of the time during normal execution, the stack pointer + * is aligned on a STACK_ALIGN (i.e. 16 byte) boundary. 
However, + * (for example) just after a call instruction (which pushes + * the return address), the callers stack misaligns until the + * 'push %rbp' happens in the callee prolog. So while we should + * expect the stack pointer to be always at least STACK_ENTRY_ALIGN + * aligned, we should -not- expect it to always be STACK_ALIGN aligned. + * We now adjust to ensure that the new sp is aligned to + * STACK_ENTRY_ALIGN but not to STACK_ALIGN. + */ + sp = fp - minstacksz; + if (((uintptr_t)sp & (STACK_ALIGN - 1ul)) == 0) { + sp -= STACK_ENTRY_ALIGN; + minstacksz = fp - sp; + } + + /* + * Now, make sure the resulting signal frame address is sane + */ + if (sp >= as->a_userlimit || fp >= as->a_userlimit) { +#ifdef DEBUG + printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n", + PTOU(p)->u_comm, p->p_pid, sig); + printf("sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n", + (void *)sp, (void *)hdlr, (uintptr_t)upc); + printf("sp above USERLIMIT\n"); +#endif + return (0); + } + + watched = watch_disable_addr((caddr_t)sp, minstacksz, S_WRITE); + + if (on_fault(&ljb)) + goto badstack; + + if (sip != NULL) { + zoneid_t zoneid; + + fp -= SA(sizeof (siginfo_t)); + uzero(fp, sizeof (siginfo_t)); + if (SI_FROMUSER(sip) && + (zoneid = p->p_zone->zone_id) != GLOBAL_ZONEID && + zoneid != sip->si_zoneid) { + k_siginfo_t sani_sip = *sip; + + sani_sip.si_pid = p->p_zone->zone_zsched->p_pid; + sani_sip.si_uid = 0; + sani_sip.si_ctid = -1; + sani_sip.si_zoneid = zoneid; + copyout_noerr(&sani_sip, fp, sizeof (sani_sip)); + } else + copyout_noerr(sip, fp, sizeof (*sip)); + sip_addr = (siginfo_t *)fp; + + if (sig == SIGPROF && + curthread->t_rprof != NULL && + curthread->t_rprof->rp_anystate) { + /* + * We stand on our head to deal with + * the real time profiling signal. + * Fill in the stuff that doesn't fit + * in a normal k_siginfo structure. + */ + int i = sip->si_nsysarg; + + while (--i >= 0) + sulword_noerr( + (ulong_t *)&(sip_addr->si_sysarg[i]), + (ulong_t)lwp->lwp_arg[i]); + copyout_noerr(curthread->t_rprof->rp_state, + sip_addr->si_mstate, + sizeof (curthread->t_rprof->rp_state)); + } + } else + sip_addr = NULL; + + /* + * save the current context on the user stack directly after the + * sigframe. Since sigframe is 8-byte-but-not-16-byte aligned, + * and since sizeof (struct sigframe) is 24, this guarantees + * 16-byte alignment for ucontext_t and its %xmm registers. + */ + uc = (ucontext_t *)(sp + sizeof (struct sigframe)); + tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP); + no_fault(); + savecontext(tuc, &lwp->lwp_sigoldmask); + if (on_fault(&ljb)) + goto badstack; + copyout_noerr(tuc, uc, sizeof (*tuc)); + kmem_free(tuc, sizeof (*tuc)); + tuc = NULL; + + lwp->lwp_oldcontext = (uintptr_t)uc; + + if (newstack) { + lwp->lwp_sigaltstack.ss_flags |= SS_ONSTACK; + if (lwp->lwp_ustack) + copyout_noerr(&lwp->lwp_sigaltstack, + (stack_t *)lwp->lwp_ustack, sizeof (stack_t)); + } + + /* + * Set up signal handler return and stack linkage + */ + { + struct sigframe frame; + + /* + * ensure we never return "normally" + */ + frame.retaddr = (caddr_t)(uintptr_t)-1L; + frame.signo = sig; + frame.sip = sip_addr; + copyout_noerr(&frame, sp, sizeof (frame)); + } + + no_fault(); + if (watched) + watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE); + + /* + * Set up user registers for execution of signal handler. 
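For reference, the three registers loaded just below follow the amd64 argument-passing convention for the usual SA_SIGINFO handler prototype; a sketch of the user-level view (the handler body is illustrative only):

	void
	handler(int sig, siginfo_t *sip, void *ucp)	/* %rdi, %rsi, %rdx */
	{
		ucontext_t *uc = ucp;

		/* examine sip and uc->uc_mcontext as needed, then return; */
		/* libc's return path calls setcontext(uc) to resume. */
	}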
+ */ + rp->r_sp = (greg_t)sp; + rp->r_pc = (greg_t)hdlr; + rp->r_ps = PSL_USER | (rp->r_ps & PS_IOPL); + + rp->r_rdi = sig; + rp->r_rsi = (uintptr_t)sip_addr; + rp->r_rdx = (uintptr_t)uc; + + if ((rp->r_cs & 0xffff) != UCS_SEL || + (rp->r_ss & 0xffff) != UDS_SEL) { + /* + * Try our best to deliver the signal. + */ + rp->r_cs = UCS_SEL; + rp->r_ss = UDS_SEL; + } + + /* + * Don't set lwp_eosys here. sendsig() is called via psig() after + * lwp_eosys is handled, so setting it here would affect the next + * system call. + */ + return (1); + +badstack: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE); + if (tuc) + kmem_free(tuc, sizeof (*tuc)); +#ifdef DEBUG + printf("sendsig: bad signal stack cmd=%s, pid=%d, sig=%d\n", + PTOU(p)->u_comm, p->p_pid, sig); + printf("on fault, sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n", + (void *)sp, (void *)hdlr, (uintptr_t)upc); +#endif + return (0); +} + +#ifdef _SYSCALL32_IMPL + +/* + * An i386 SVR4/ABI signal frame looks like this on the stack: + * + * old %esp: + * <a siginfo32_t [optional]> + * <a ucontext32_t> + * <pointer to that ucontext32_t> + * <pointer to that siginfo32_t> + * <signo> + * new %esp: <return address (deliberately invalid)> + */ +struct sigframe32 { + caddr32_t retaddr; + uint32_t signo; + caddr32_t sip; + caddr32_t ucp; +}; + +int +sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) +{ + volatile int minstacksz; + int newstack; + label_t ljb; + volatile caddr_t sp; + caddr_t fp; + volatile struct regs *rp; + volatile greg_t upc; + volatile proc_t *p = ttoproc(curthread); + klwp_t *lwp = ttolwp(curthread); + ucontext32_t *volatile tuc = NULL; + ucontext32_t *uc; + siginfo32_t *sip_addr; + volatile int watched; + + rp = lwptoregs(lwp); + upc = rp->r_pc; + + minstacksz = SA32(sizeof (struct sigframe32)) + SA32(sizeof (*uc)); + if (sip != NULL) + minstacksz += SA32(sizeof (siginfo32_t)); + ASSERT((minstacksz & (STACK_ALIGN32 - 1)) == 0); + + /* + * Figure out whether we will be handling this signal on + * an alternate stack specified by the user. Then allocate + * and validate the stack requirements for the signal handler + * context. on_fault will catch any faults. + */ + newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) && + !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE)); + + if (newstack) { + fp = (caddr_t)(SA32((uintptr_t)lwp->lwp_sigaltstack.ss_sp) + + SA32(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN32); + } else if ((rp->r_ss & 0xffff) != UDS_SEL) { + user_desc_t *ldt; + /* + * If the stack segment selector is -not- pointing at + * the UDS_SEL descriptor and we have an LDT entry for + * it instead, add the base address to find the effective va. + */ + if ((ldt = p->p_ldt) != NULL) + fp = (caddr_t)rp->r_sp + + USEGD_GETBASE(&ldt[SELTOIDX(rp->r_ss)]); + else + fp = (caddr_t)rp->r_sp; + } else + fp = (caddr_t)rp->r_sp; + + /* + * Force proper stack pointer alignment, even in the face of a + * misaligned stack pointer from user-level before the signal. + * Don't use the SA32() macro because that rounds up, not down. 
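A quick worked example of why the mask applied just below is used rather than SA32(), assuming STACK_ALIGN32 is 4 and using an arbitrary illustrative address:

	fp                        = 0x0804abce
	fp & ~(STACK_ALIGN32 - 1) = 0x0804abcc	(rounds down, below the interrupted frame)
	SA32(fp)                  = 0x0804abd0	(rounds up, into the interrupted frame's data)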
+ */ + fp = (caddr_t)((uintptr_t)fp & ~(STACK_ALIGN32 - 1)); + sp = fp - minstacksz; + + /* + * Make sure lwp hasn't trashed its stack + */ + if (sp >= (caddr_t)(uintptr_t)USERLIMIT32 || + fp >= (caddr_t)(uintptr_t)USERLIMIT32) { +#ifdef DEBUG + printf("sendsig32: bad signal stack cmd=%s, pid=%d, sig=%d\n", + PTOU(p)->u_comm, p->p_pid, sig); + printf("sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n", + (void *)sp, (void *)hdlr, (uintptr_t)upc); + printf("sp above USERLIMIT\n"); +#endif + return (0); + } + + watched = watch_disable_addr((caddr_t)sp, minstacksz, S_WRITE); + + if (on_fault(&ljb)) + goto badstack; + + if (sip != NULL) { + siginfo32_t si32; + zoneid_t zoneid; + + siginfo_kto32(sip, &si32); + if (SI_FROMUSER(sip) && + (zoneid = p->p_zone->zone_id) != GLOBAL_ZONEID && + zoneid != sip->si_zoneid) { + si32.si_pid = p->p_zone->zone_zsched->p_pid; + si32.si_uid = 0; + si32.si_ctid = -1; + si32.si_zoneid = zoneid; + } + fp -= SA32(sizeof (si32)); + uzero(fp, sizeof (si32)); + copyout_noerr(&si32, fp, sizeof (si32)); + sip_addr = (siginfo32_t *)fp; + + if (sig == SIGPROF && + curthread->t_rprof != NULL && + curthread->t_rprof->rp_anystate) { + /* + * We stand on our head to deal with + * the real-time profiling signal. + * Fill in the stuff that doesn't fit + * in a normal k_siginfo structure. + */ + int i = sip->si_nsysarg; + + while (--i >= 0) + suword32_noerr(&(sip_addr->si_sysarg[i]), + (uint32_t)lwp->lwp_arg[i]); + copyout_noerr(curthread->t_rprof->rp_state, + sip_addr->si_mstate, + sizeof (curthread->t_rprof->rp_state)); + } + } else + sip_addr = NULL; + + /* save the current context on the user stack */ + fp -= SA32(sizeof (*tuc)); + uc = (ucontext32_t *)fp; + tuc = kmem_alloc(sizeof (*tuc), KM_SLEEP); + no_fault(); + savecontext32(tuc, &lwp->lwp_sigoldmask); + if (on_fault(&ljb)) + goto badstack; + copyout_noerr(tuc, uc, sizeof (*tuc)); + kmem_free(tuc, sizeof (*tuc)); + tuc = NULL; + + lwp->lwp_oldcontext = (uintptr_t)uc; + + if (newstack) { + lwp->lwp_sigaltstack.ss_flags |= SS_ONSTACK; + if (lwp->lwp_ustack) { + stack32_t stk32; + + stk32.ss_sp = (caddr32_t)(uintptr_t) + lwp->lwp_sigaltstack.ss_sp; + stk32.ss_size = (size32_t) + lwp->lwp_sigaltstack.ss_size; + stk32.ss_flags = (int32_t) + lwp->lwp_sigaltstack.ss_flags; + copyout_noerr(&stk32, + (stack32_t *)lwp->lwp_ustack, sizeof (stk32)); + } + } + + /* + * Set up signal handler arguments + */ + { + struct sigframe32 frame32; + + frame32.sip = (caddr32_t)(uintptr_t)sip_addr; + frame32.ucp = (caddr32_t)(uintptr_t)uc; + frame32.signo = sig; + frame32.retaddr = 0xffffffff; /* never return! */ + copyout_noerr(&frame32, sp, sizeof (frame32)); + } + + no_fault(); + if (watched) + watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE); + + rp->r_sp = (greg_t)(uintptr_t)sp; + rp->r_pc = (greg_t)(uintptr_t)hdlr; + rp->r_ps = PSL_USER | (rp->r_ps & PS_IOPL); + + if ((rp->r_cs & 0xffff) != U32CS_SEL || + (rp->r_ss & 0xffff) != UDS_SEL) { + /* + * Try our best to deliver the signal. + */ + rp->r_cs = U32CS_SEL; + rp->r_ss = UDS_SEL; + } + + /* + * Don't set lwp_eosys here. sendsig() is called via psig() after + * lwp_eosys is handled, so setting it here would affect the next + * system call. 
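For comparison with the amd64 path, the 32-bit handler receives the values copied out above as ordinary stack arguments rather than in registers; conceptually (user-level view, illustrative only):

	void
	handler(int sig, siginfo_t *sip, void *ucp);

	/*
	 * On entry %esp points at sigframe32.retaddr, so sig, sip and ucp
	 * line up with the signo, sip and ucp members of struct sigframe32.
	 */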
+ */ + return (1); + +badstack: + no_fault(); + if (watched) + watch_enable_addr((caddr_t)sp, minstacksz, S_WRITE); + if (tuc) + kmem_free(tuc, sizeof (*tuc)); +#ifdef DEBUG + printf("sendsig32: bad signal stack cmd=%s pid=%d, sig=%d\n", + PTOU(p)->u_comm, p->p_pid, sig); + printf("on fault, sigsp = 0x%p, action = 0x%p, upc = 0x%lx\n", + (void *)sp, (void *)hdlr, (uintptr_t)upc); +#endif + return (0); +} + +#endif /* _SYSCALL32_IMPL */ diff --git a/usr/src/uts/intel/os/sundep.c b/usr/src/uts/intel/os/sundep.c new file mode 100644 index 0000000000..80e149f01b --- /dev/null +++ b/usr/src/uts/intel/os/sundep.c @@ -0,0 +1,1012 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2021 Joyent, Inc. + */ + +/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ +/* All Rights Reserved */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/signal.h> +#include <sys/systm.h> +#include <sys/user.h> +#include <sys/mman.h> +#include <sys/class.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/buf.h> +#include <sys/kmem.h> +#include <sys/cred.h> +#include <sys/archsystm.h> +#include <sys/vmparam.h> +#include <sys/prsystm.h> +#include <sys/reboot.h> +#include <sys/uadmin.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/session.h> +#include <sys/ucontext.h> +#include <sys/dnlc.h> +#include <sys/var.h> +#include <sys/cmn_err.h> +#include <sys/debugreg.h> +#include <sys/thread.h> +#include <sys/vtrace.h> +#include <sys/consdev.h> +#include <sys/psw.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/cpu.h> +#include <sys/stack.h> +#include <sys/swap.h> +#include <vm/hat.h> +#include <vm/anon.h> +#include <vm/as.h> +#include <vm/page.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> +#include <vm/seg_map.h> +#include <vm/seg_vn.h> +#include <sys/exec.h> +#include <sys/acct.h> +#include <sys/core.h> +#include <sys/corectl.h> +#include <sys/modctl.h> +#include <sys/tuneable.h> +#include <c2/audit.h> +#include <sys/bootconf.h> +#include <sys/brand.h> +#include <sys/dumphdr.h> +#include <sys/promif.h> +#include <sys/systeminfo.h> +#include <sys/kdi.h> +#include <sys/contract_impl.h> +#include <sys/x86_archext.h> +#include <sys/segments.h> +#include <sys/ontrap.h> +#include <sys/cpu.h> +#ifdef __xpv +#include <sys/hypervisor.h> +#endif + +/* + * Compare the version of boot that boot says it is against + * the version of boot the kernel expects. 
+ */ +int +check_boot_version(int boots_version) +{ + if (boots_version == BO_VERSION) + return (0); + + prom_printf("Wrong boot interface - kernel needs v%d found v%d\n", + BO_VERSION, boots_version); + prom_panic("halting"); + /*NOTREACHED*/ +} + +/* + * Process the physical installed list for boot. + * Finds: + * 1) the pfn of the highest installed physical page, + * 2) the number of pages installed + * 3) the number of distinct contiguous regions these pages fall into. + * 4) the number of contiguous memory ranges + */ +void +installed_top_size_ex( + struct memlist *list, /* pointer to start of installed list */ + pfn_t *high_pfn, /* return ptr for top value */ + pgcnt_t *pgcnt, /* return ptr for sum of installed pages */ + int *ranges) /* return ptr for the count of contig. ranges */ +{ + pfn_t top = 0; + pgcnt_t sumpages = 0; + pfn_t highp; /* high page in a chunk */ + int cnt = 0; + + for (; list; list = list->ml_next) { + ++cnt; + highp = (list->ml_address + list->ml_size - 1) >> PAGESHIFT; + if (top < highp) + top = highp; + sumpages += btop(list->ml_size); + } + + *high_pfn = top; + *pgcnt = sumpages; + *ranges = cnt; +} + +void +installed_top_size( + struct memlist *list, /* pointer to start of installed list */ + pfn_t *high_pfn, /* return ptr for top value */ + pgcnt_t *pgcnt) /* return ptr for sum of installed pages */ +{ + int ranges; + + installed_top_size_ex(list, high_pfn, pgcnt, &ranges); +} + +void +phys_install_has_changed(void) +{} + +/* + * Copy in a memory list from boot to kernel, with a filter function + * to remove pages. The filter function can increase the address and/or + * decrease the size to filter out pages. It will also align addresses and + * sizes to PAGESIZE. + */ +void +copy_memlist_filter( + struct memlist *src, + struct memlist **dstp, + void (*filter)(uint64_t *, uint64_t *)) +{ + struct memlist *dst, *prev; + uint64_t addr; + uint64_t size; + uint64_t eaddr; + + dst = *dstp; + prev = dst; + + /* + * Move through the memlist applying a filter against + * each range of memory. Note that we may apply the + * filter multiple times against each memlist entry. + */ + for (; src; src = src->ml_next) { + addr = P2ROUNDUP(src->ml_address, PAGESIZE); + eaddr = P2ALIGN(src->ml_address + src->ml_size, PAGESIZE); + while (addr < eaddr) { + size = eaddr - addr; + if (filter != NULL) + filter(&addr, &size); + if (size == 0) + break; + dst->ml_address = addr; + dst->ml_size = size; + dst->ml_next = 0; + if (prev == dst) { + dst->ml_prev = 0; + dst++; + } else { + dst->ml_prev = prev; + prev->ml_next = dst; + dst++; + prev++; + } + addr += size; + } + } + + *dstp = dst; +} + +/* + * Kernel setup code, called from startup(). + */ +void +kern_setup1(void) +{ + proc_t *pp; + + pp = &p0; + + proc_sched = pp; + + /* + * Initialize process 0 data structures + */ + pp->p_stat = SRUN; + pp->p_flag = SSYS; + + pp->p_pidp = &pid0; + pp->p_pgidp = &pid0; + pp->p_sessp = &session0; + pp->p_tlist = &t0; + pid0.pid_pglink = pp; + pid0.pid_pgtail = pp; + + /* + * XXX - we asssume that the u-area is zeroed out except for + * ttolwp(curthread)->lwp_regs. + */ + PTOU(curproc)->u_cmask = (mode_t)CMASK; + + thread_init(); /* init thread_free list */ + pid_init(); /* initialize pid (proc) table */ + contract_init(); /* initialize contracts */ + + init_pages_pp_maximum(); +} + +/* + * Load a procedure into a thread. 
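To make the copy-in behaviour of thread_load() below concrete, here is a hypothetical caller; it assumes the usual thread_create() interface, which hands its arg and len straight through to thread_load() (the names and the job structure are invented for illustration):

	typedef struct job {
		int	j_id;
	} job_t;

	/* New thread's entry point: arg points at a private copy of the job. */
	static void
	job_worker(void *arg, size_t len)
	{
		job_t *jp = arg;

		ASSERT(len == sizeof (job_t));
		cmn_err(CE_CONT, "job %d started\n", jp->j_id);
		thread_exit();
	}

	static void
	start_job(void)
	{
		job_t j = { 42 };

		/*
		 * The structure is copied onto the new thread's stack before
		 * thread_create() returns, so the local may safely go away.
		 */
		(void) thread_create(NULL, 0, job_worker, &j, sizeof (j),
		    &p0, TS_RUN, minclsyspri);
	}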
+ */ +void +thread_load(kthread_t *t, void (*start)(), caddr_t arg, size_t len) +{ + caddr_t sp; + size_t framesz; + caddr_t argp; + long *p; + extern void thread_start(); + + /* + * Push a "c" call frame onto the stack to represent + * the caller of "start". + */ + sp = t->t_stk; + ASSERT(((uintptr_t)t->t_stk & (STACK_ENTRY_ALIGN - 1)) == 0); + if (len != 0) { + /* + * the object that arg points at is copied into the + * caller's frame. + */ + framesz = SA(len); + sp -= framesz; + ASSERT(sp > t->t_stkbase); + argp = sp + SA(MINFRAME); + bcopy(arg, argp, len); + arg = argp; + } + /* + * Set up arguments (arg and len) on the caller's stack frame. + */ + p = (long *)sp; + + *--p = 0; /* fake call */ + *--p = 0; /* null frame pointer terminates stack trace */ + *--p = (long)len; + *--p = (intptr_t)arg; + *--p = (intptr_t)start; + + /* + * initialize thread to resume at thread_start() which will + * turn around and invoke (*start)(arg, len). + */ + t->t_pc = (uintptr_t)thread_start; + t->t_sp = (uintptr_t)p; + + ASSERT((t->t_sp & (STACK_ENTRY_ALIGN - 1)) == 0); +} + +/* + * load user registers into lwp. + */ +/*ARGSUSED2*/ +void +lwp_load(klwp_t *lwp, gregset_t grp, uintptr_t thrptr) +{ + struct regs *rp = lwptoregs(lwp); + + setgregs(lwp, grp); + rp->r_ps = PSL_USER; + + /* + * For 64-bit lwps, we allow one magic %fs selector value, and one + * magic %gs selector to point anywhere in the address space using + * %fsbase and %gsbase behind the scenes. libc uses %fs to point + * at the ulwp_t structure. + * + * For 32-bit lwps, libc wedges its lwp thread pointer into the + * ucontext ESP slot (which is otherwise irrelevant to setting a + * ucontext) and LWPGS_SEL value into gregs[REG_GS]. This is so + * syslwp_create() can atomically setup %gs. + * + * See setup_context() in libc. + */ +#ifdef _SYSCALL32_IMPL + if (lwp_getdatamodel(lwp) == DATAMODEL_ILP32) { + if (grp[REG_GS] == LWPGS_SEL) + (void) lwp_setprivate(lwp, _LWP_GSBASE, thrptr); + } else { + /* + * See lwp_setprivate in kernel and setup_context in libc. + * + * Currently libc constructs a ucontext from whole cloth for + * every new (not main) lwp created. For 64 bit processes + * %fsbase is directly set to point to current thread pointer. + * In the past (solaris 10) %fs was also set LWPFS_SEL to + * indicate %fsbase. Now we use the null GDT selector for + * this purpose. LWP[FS|GS]_SEL are only intended for 32 bit + * processes. To ease transition we support older libcs in + * the newer kernel by forcing %fs or %gs selector to null + * by calling lwp_setprivate if LWP[FS|GS]_SEL is passed in + * the ucontext. This is should be ripped out at some future + * date. Another fix would be for libc to do a getcontext + * and inherit the null %fs/%gs from the current context but + * that means an extra system call and could hurt performance. + */ + if (grp[REG_FS] == 0x1bb) /* hard code legacy LWPFS_SEL */ + (void) lwp_setprivate(lwp, _LWP_FSBASE, + (uintptr_t)grp[REG_FSBASE]); + + if (grp[REG_GS] == 0x1c3) /* hard code legacy LWPGS_SEL */ + (void) lwp_setprivate(lwp, _LWP_GSBASE, + (uintptr_t)grp[REG_GSBASE]); + } +#else + if (grp[GS] == LWPGS_SEL) + (void) lwp_setprivate(lwp, _LWP_GSBASE, thrptr); +#endif + + lwp->lwp_eosys = JUSTRETURN; + lwptot(lwp)->t_post_sys = 1; +} + +/* + * set syscall()'s return values for a lwp. + */ +void +lwp_setrval(klwp_t *lwp, int v1, int v2) +{ + lwptoregs(lwp)->r_ps &= ~PS_C; + lwptoregs(lwp)->r_r0 = v1; + lwptoregs(lwp)->r_r1 = v2; +} + +/* + * set syscall()'s return values for a lwp. 
+ */ +void +lwp_setsp(klwp_t *lwp, caddr_t sp) +{ + lwptoregs(lwp)->r_sp = (intptr_t)sp; +} + +/* + * Copy regs from parent to child. + */ +void +lwp_forkregs(klwp_t *lwp, klwp_t *clwp) +{ + struct pcb *pcb = &clwp->lwp_pcb; + struct regs *rp = lwptoregs(lwp); + + if (!PCB_NEED_UPDATE_SEGS(pcb)) { + pcb->pcb_ds = rp->r_ds; + pcb->pcb_es = rp->r_es; + pcb->pcb_fs = rp->r_fs; + pcb->pcb_gs = rp->r_gs; + PCB_SET_UPDATE_SEGS(pcb); + lwptot(clwp)->t_post_sys = 1; + } + ASSERT(lwptot(clwp)->t_post_sys); + + fp_lwp_dup(clwp); + + bcopy(lwp->lwp_regs, clwp->lwp_regs, sizeof (struct regs)); +} + +/* + * This function is currently unused on x86. + */ +/*ARGSUSED*/ +void +lwp_freeregs(klwp_t *lwp, int isexec) +{} + +/* + * This function is currently unused on x86. + */ +void +lwp_pcb_exit(void) +{} + +/* + * Lwp context ops for segment registers. + */ + +/* + * Every time we come into the kernel (syscall, interrupt or trap + * but not fast-traps) we capture the current values of the user's + * segment registers into the lwp's reg structure. This includes + * lcall for i386 generic system call support since it is handled + * as a segment-not-present trap. + * + * Here we save the current values from the lwp regs into the pcb + * and or PCB_UPDATE_SEGS (1) in pcb->pcb_rupdate to tell the rest + * of the kernel that the pcb copy of the segment registers is the + * current one. This ensures the lwp's next trip to user land via + * update_sregs. Finally we set t_post_sys to ensure that no + * system call fast-path's its way out of the kernel via sysret. + * + * (This means that we need to have interrupts disabled when we + * test t->t_post_sys in the syscall handlers; if the test fails, + * we need to keep interrupts disabled until we return to userland + * so we can't be switched away.) + * + * As a result of all this, we don't really have to do a whole lot + * if the thread is just mucking about in the kernel, switching on + * and off the cpu for whatever reason it feels like. And yet we + * still preserve fast syscalls, cause if we -don't- get + * descheduled, we never come here either. + */ + +#define VALID_LWP_DESC(udp) ((udp)->usd_type == SDT_MEMRWA && \ + (udp)->usd_p == 1 && (udp)->usd_dpl == SEL_UPL) + +/*ARGSUSED*/ +void +lwp_segregs_save(klwp_t *lwp) +{ + pcb_t *pcb = &lwp->lwp_pcb; + struct regs *rp; + + ASSERT(VALID_LWP_DESC(&pcb->pcb_fsdesc)); + ASSERT(VALID_LWP_DESC(&pcb->pcb_gsdesc)); + + if (!PCB_NEED_UPDATE_SEGS(pcb)) { + rp = lwptoregs(lwp); + + /* + * If there's no update already pending, capture the current + * %ds/%es/%fs/%gs values from lwp's regs in case the user + * changed them; %fsbase and %gsbase are privileged so the + * kernel versions of these registers in pcb_fsbase and + * pcb_gsbase are always up-to-date. + */ + pcb->pcb_ds = rp->r_ds; + pcb->pcb_es = rp->r_es; + pcb->pcb_fs = rp->r_fs; + pcb->pcb_gs = rp->r_gs; + PCB_SET_UPDATE_SEGS(pcb); + lwp->lwp_thread->t_post_sys = 1; + } + +#if !defined(__xpv) /* XXPV not sure if we can re-read gdt? */ + ASSERT(bcmp(&CPU->cpu_gdt[GDT_LWPFS], &lwp->lwp_pcb.pcb_fsdesc, + sizeof (lwp->lwp_pcb.pcb_fsdesc)) == 0); + ASSERT(bcmp(&CPU->cpu_gdt[GDT_LWPGS], &lwp->lwp_pcb.pcb_gsdesc, + sizeof (lwp->lwp_pcb.pcb_gsdesc)) == 0); +#endif +} + +/* + * Update the segment registers with new values from the pcb. + * + * We have to do this carefully, and in the following order, + * in case any of the selectors points at a bogus descriptor. + * If they do, we'll catch trap with on_trap and return 1. + * returns 0 on success. 
+ * + * This is particularly tricky for %gs. + * This routine must be executed under a cli. + */ +int +update_sregs(struct regs *rp, klwp_t *lwp) +{ + pcb_t *pcb = &lwp->lwp_pcb; + ulong_t kgsbase; + on_trap_data_t otd; + int rc = 0; + + if (!on_trap(&otd, OT_SEGMENT_ACCESS)) { + +#if defined(__xpv) + /* + * On the hyervisor this is easy. The hypercall below will + * swapgs and load %gs with the user selector. If the user + * selector is bad the hypervisor will catch the fault and + * load %gs with the null selector instead. Either way the + * kernel's gsbase is not damaged. + */ + kgsbase = (ulong_t)CPU; + if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, + pcb->pcb_gs) != 0) { + no_trap(); + return (1); + } + + rp->r_gs = pcb->pcb_gs; + ASSERT((cpu_t *)kgsbase == CPU); + +#else /* __xpv */ + + /* + * A little more complicated running native. + */ + kgsbase = (ulong_t)CPU; + __set_gs(pcb->pcb_gs); + + /* + * If __set_gs fails it's because the new %gs is a bad %gs, + * we'll be taking a trap but with the original %gs and %gsbase + * undamaged (i.e. pointing at curcpu). + * + * We've just mucked up the kernel's gsbase. Oops. In + * particular we can't take any traps at all. Make the newly + * computed gsbase be the hidden gs via swapgs, and fix + * the kernel's gsbase back again. Later, when we return to + * userland we'll swapgs again restoring gsbase just loaded + * above. + */ + __asm__ __volatile__("mfence; swapgs"); + + rp->r_gs = pcb->pcb_gs; + + /* + * Restore kernel's gsbase. Note that this also serializes any + * attempted speculation from loading the user-controlled + * %gsbase. + */ + wrmsr(MSR_AMD_GSBASE, kgsbase); + +#endif /* __xpv */ + + /* + * Only override the descriptor base address if + * r_gs == LWPGS_SEL or if r_gs == NULL. A note on + * NULL descriptors -- 32-bit programs take faults + * if they deference NULL descriptors; however, + * when 64-bit programs load them into %fs or %gs, + * they DONT fault -- only the base address remains + * whatever it was from the last load. Urk. + * + * XXX - note that lwp_setprivate now sets %fs/%gs to the + * null selector for 64 bit processes. Whereas before + * %fs/%gs were set to LWP(FS|GS)_SEL regardless of + * the process's data model. For now we check for both + * values so that the kernel can also support the older + * libc. This should be ripped out at some point in the + * future. + */ + if (pcb->pcb_gs == LWPGS_SEL || pcb->pcb_gs == 0) { +#if defined(__xpv) + if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER, + pcb->pcb_gsbase)) { + no_trap(); + return (1); + } +#else + wrmsr(MSR_AMD_KGSBASE, pcb->pcb_gsbase); +#endif + } + + __set_ds(pcb->pcb_ds); + rp->r_ds = pcb->pcb_ds; + + __set_es(pcb->pcb_es); + rp->r_es = pcb->pcb_es; + + __set_fs(pcb->pcb_fs); + rp->r_fs = pcb->pcb_fs; + + /* + * Same as for %gs + */ + if (pcb->pcb_fs == LWPFS_SEL || pcb->pcb_fs == 0) { +#if defined(__xpv) + if (HYPERVISOR_set_segment_base(SEGBASE_FS, + pcb->pcb_fsbase)) { + no_trap(); + return (1); + } +#else + wrmsr(MSR_AMD_FSBASE, pcb->pcb_fsbase); +#endif + } + + } else { + cli(); + rc = 1; + } + no_trap(); + return (rc); +} + +/* + * Make sure any stale selectors are cleared from the segment registers + * by putting KDS_SEL (the kernel's default %ds gdt selector) into them. + * This is necessary because the kernel itself does not use %es, %fs, nor + * %ds. (%cs and %ss are necessary, and are set up by the kernel - along with + * %gs - to point to the current cpu struct.) 
If we enter kmdb while in the + * kernel and resume with a stale ldt or brandz selector sitting there in a + * segment register, kmdb will #gp fault if the stale selector points to, + * for example, an ldt in the context of another process. + * + * WARNING: Intel and AMD chips behave differently when storing + * the null selector into %fs and %gs while in long mode. On AMD + * chips fsbase and gsbase are not cleared. But on Intel chips, storing + * a null selector into %fs or %gs has the side effect of clearing + * fsbase or gsbase. For that reason we use KDS_SEL, which has + * consistent behavor between AMD and Intel. + * + * Caller responsible for preventing cpu migration. + */ +void +reset_sregs(void) +{ + ulong_t kgsbase = (ulong_t)CPU; + + ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL); + + cli(); + __set_gs(KGS_SEL); + + /* + * restore kernel gsbase + */ +#if defined(__xpv) + xen_set_segment_base(SEGBASE_GS_KERNEL, kgsbase); +#else + wrmsr(MSR_AMD_GSBASE, kgsbase); +#endif + + sti(); + + __set_ds(KDS_SEL); + __set_es(0 | SEL_KPL); /* selector RPL not ring 0 on hypervisor */ + __set_fs(KFS_SEL); +} + + +#ifdef _SYSCALL32_IMPL + +/* + * Make it impossible for a process to change its data model. + * We do this by toggling the present bits for the 32 and + * 64-bit user code descriptors. That way if a user lwp attempts + * to change its data model (by using the wrong code descriptor in + * %cs) it will fault immediately. This also allows us to simplify + * assertions and checks in the kernel. + */ + +static void +gdt_ucode_model(model_t model) +{ + kpreempt_disable(); + if (model == DATAMODEL_NATIVE) { + gdt_update_usegd(GDT_UCODE, &ucs_on); + gdt_update_usegd(GDT_U32CODE, &ucs32_off); + } else { + gdt_update_usegd(GDT_U32CODE, &ucs32_on); + gdt_update_usegd(GDT_UCODE, &ucs_off); + } + kpreempt_enable(); +} + +#endif /* _SYSCALL32_IMPL */ + +/* + * Restore lwp private fs and gs segment descriptors + * on current cpu's GDT. + */ +static void +lwp_segregs_restore(klwp_t *lwp) +{ + pcb_t *pcb = &lwp->lwp_pcb; + + ASSERT(VALID_LWP_DESC(&pcb->pcb_fsdesc)); + ASSERT(VALID_LWP_DESC(&pcb->pcb_gsdesc)); + +#ifdef _SYSCALL32_IMPL + gdt_ucode_model(DATAMODEL_NATIVE); +#endif + + gdt_update_usegd(GDT_LWPFS, &pcb->pcb_fsdesc); + gdt_update_usegd(GDT_LWPGS, &pcb->pcb_gsdesc); + +} + +#ifdef _SYSCALL32_IMPL + +static void +lwp_segregs_restore32(klwp_t *lwp) +{ + /*LINTED*/ + cpu_t *cpu = CPU; + pcb_t *pcb = &lwp->lwp_pcb; + + ASSERT(VALID_LWP_DESC(&lwp->lwp_pcb.pcb_fsdesc)); + ASSERT(VALID_LWP_DESC(&lwp->lwp_pcb.pcb_gsdesc)); + + gdt_ucode_model(DATAMODEL_ILP32); + gdt_update_usegd(GDT_LWPFS, &pcb->pcb_fsdesc); + gdt_update_usegd(GDT_LWPGS, &pcb->pcb_gsdesc); +} + +#endif /* _SYSCALL32_IMPL */ + +/* + * If this is a process in a branded zone, then we want it to use the brand + * syscall entry points instead of the standard Solaris entry points. This + * routine must be called when a new lwp is created within a branded zone + * or when an existing lwp moves into a branded zone via a zone_enter() + * operation. 
+ */ +void +lwp_attach_brand_hdlrs(klwp_t *lwp) +{ + kthread_t *t = lwptot(lwp); + + ASSERT(PROC_IS_BRANDED(lwptoproc(lwp))); + + ASSERT(removectx(t, NULL, brand_interpositioning_disable, + brand_interpositioning_enable, NULL, NULL, + brand_interpositioning_disable, NULL) == 0); + installctx(t, NULL, brand_interpositioning_disable, + brand_interpositioning_enable, NULL, NULL, + brand_interpositioning_disable, NULL, NULL); + + if (t == curthread) { + kpreempt_disable(); + brand_interpositioning_enable(); + kpreempt_enable(); + } +} + +/* + * If this is a process in a branded zone, then we want it to disable the + * brand syscall entry points. This routine must be called when the last + * lwp in a process is exiting in proc_exit(). + */ +void +lwp_detach_brand_hdlrs(klwp_t *lwp) +{ + kthread_t *t = lwptot(lwp); + + ASSERT(PROC_IS_BRANDED(lwptoproc(lwp))); + if (t == curthread) + kpreempt_disable(); + + /* Remove the original context handlers */ + VERIFY(removectx(t, NULL, brand_interpositioning_disable, + brand_interpositioning_enable, NULL, NULL, + brand_interpositioning_disable, NULL) != 0); + + if (t == curthread) { + /* Cleanup our MSR and IDT entries. */ + brand_interpositioning_disable(); + kpreempt_enable(); + } +} + +/* + * Add any lwp-associated context handlers to the lwp at the beginning + * of the lwp's useful life. + * + * All paths which create lwp's invoke lwp_create(); lwp_create() + * invokes lwp_stk_init() which initializes the stack, sets up + * lwp_regs, and invokes this routine. + * + * All paths which destroy lwp's invoke lwp_exit() to rip the lwp + * apart and put it on 'lwp_deathrow'; if the lwp is destroyed it + * ends up in thread_free() which invokes freectx(t, 0) before + * invoking lwp_stk_fini(). When the lwp is recycled from death + * row, lwp_stk_fini() is invoked, then thread_free(), and thus + * freectx(t, 0) as before. + * + * In the case of exec, the surviving lwp is thoroughly scrubbed + * clean; exec invokes freectx(t, 1) to destroy associated contexts. + * On the way back to the new image, it invokes setregs() which + * in turn invokes this routine. + */ +void +lwp_installctx(klwp_t *lwp) +{ + kthread_t *t = lwptot(lwp); + int thisthread = t == curthread; +#ifdef _SYSCALL32_IMPL + void (*restop)(klwp_t *) = lwp_getdatamodel(lwp) == DATAMODEL_NATIVE ? + lwp_segregs_restore : lwp_segregs_restore32; +#else + void (*restop)(klwp_t *) = lwp_segregs_restore; +#endif + struct ctxop *ctx; + + /* + * Install the basic lwp context handlers on each lwp. + * + * On the amd64 kernel, the context handlers are responsible for + * virtualizing %ds, %es, %fs, and %gs to the lwp. The register + * values are only ever changed via sys_rtt when the + * PCB_UPDATE_SEGS bit (1) is set in pcb->pcb_rupdate. Only + * sys_rtt gets to clear the bit. + * + * On the i386 kernel, the context handlers are responsible for + * virtualizing %gs/%fs to the lwp by updating the per-cpu GDTs + */ + ASSERT(removectx(t, lwp, lwp_segregs_save, restop, + NULL, NULL, NULL, NULL) == 0); + if (thisthread) { + ctx = installctx_preallocate(); + kpreempt_disable(); + } else { + ctx = NULL; + } + installctx(t, lwp, lwp_segregs_save, restop, + NULL, NULL, NULL, NULL, ctx); + if (thisthread) { + /* + * Since we're the right thread, set the values in the GDT + */ + restop(lwp); + kpreempt_enable(); + } + + /* + * If we have sysenter/sysexit instructions enabled, we need + * to ensure that the hardware mechanism is kept up-to-date with the + * lwp's kernel stack pointer across context switches. 
+ * + * sep_save zeros the sysenter stack pointer msr; sep_restore sets + * it to the lwp's kernel stack pointer (kstktop). + */ + if (is_x86_feature(x86_featureset, X86FSET_SEP)) { + caddr_t kstktop = (caddr_t)lwp->lwp_regs; + ASSERT(removectx(t, kstktop, + sep_save, sep_restore, NULL, NULL, NULL, NULL) == 0); + + if (thisthread) { + ctx = installctx_preallocate(); + kpreempt_disable(); + } else { + ctx = NULL; + } + installctx(t, kstktop, + sep_save, sep_restore, NULL, NULL, NULL, NULL, ctx); + if (thisthread) { + /* + * We're the right thread, so set the stack pointer + * for the first sysenter instruction to use + */ + sep_restore(kstktop); + kpreempt_enable(); + } + } + + if (PROC_IS_BRANDED(ttoproc(t))) + lwp_attach_brand_hdlrs(lwp); +} + +/* + * Clear registers on exec(2). + */ +void +setregs(uarg_t *args) +{ + struct regs *rp; + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + pcb_t *pcb = &lwp->lwp_pcb; + greg_t sp; + + /* + * Initialize user registers + */ + (void) save_syscall_args(); /* copy args from registers first */ + rp = lwptoregs(lwp); + sp = rp->r_sp; + bzero(rp, sizeof (*rp)); + + rp->r_ss = UDS_SEL; + rp->r_sp = sp; + rp->r_pc = args->entry; + rp->r_ps = PSL_USER; + + pcb->pcb_fs = pcb->pcb_gs = 0; + pcb->pcb_fsbase = pcb->pcb_gsbase = 0; + + if (ttoproc(t)->p_model == DATAMODEL_NATIVE) { + + rp->r_cs = UCS_SEL; + + /* + * Only allow 64-bit user code descriptor to be present. + */ + gdt_ucode_model(DATAMODEL_NATIVE); + + /* + * Arrange that the virtualized %fs and %gs GDT descriptors + * have a well-defined initial state (present, ring 3 + * and of type data). + */ + pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_udesc; + + /* + * thrptr is either NULL or a value used by DTrace. + * 64-bit processes use %fs as their "thread" register. + */ + if (args->thrptr) + (void) lwp_setprivate(lwp, _LWP_FSBASE, args->thrptr); + + } else { + + rp->r_cs = U32CS_SEL; + rp->r_ds = rp->r_es = UDS_SEL; + + /* + * only allow 32-bit user code selector to be present. + */ + gdt_ucode_model(DATAMODEL_ILP32); + + pcb->pcb_fsdesc = pcb->pcb_gsdesc = zero_u32desc; + + /* + * thrptr is either NULL or a value used by DTrace. + * 32-bit processes use %gs as their "thread" register. + */ + if (args->thrptr) + (void) lwp_setprivate(lwp, _LWP_GSBASE, args->thrptr); + + } + + pcb->pcb_ds = rp->r_ds; + pcb->pcb_es = rp->r_es; + PCB_SET_UPDATE_SEGS(pcb); + + lwp->lwp_eosys = JUSTRETURN; + t->t_post_sys = 1; + + /* + * Add the lwp context handlers that virtualize segment registers, + * and/or system call stacks etc. + */ + lwp_installctx(lwp); + + /* + * Reset the FPU flags and then initialize the FPU for this lwp. + */ + fp_exec(); +} + +user_desc_t * +cpu_get_gdt(void) +{ + return (CPU->cpu_gdt); +} + + +#if !defined(lwp_getdatamodel) + +/* + * Return the datamodel of the given lwp. + */ +/*ARGSUSED*/ +model_t +lwp_getdatamodel(klwp_t *lwp) +{ + return (lwp->lwp_procp->p_model); +} + +#endif /* !lwp_getdatamodel */ + +#if !defined(get_udatamodel) + +model_t +get_udatamodel(void) +{ + return (curproc->p_model); +} + +#endif /* !get_udatamodel */ diff --git a/usr/src/uts/intel/os/syscall.c b/usr/src/uts/intel/os/syscall.c new file mode 100644 index 0000000000..6cf4293ff4 --- /dev/null +++ b/usr/src/uts/intel/os/syscall.c @@ -0,0 +1,1397 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/param.h> +#include <sys/vmparam.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/signal.h> +#include <sys/stack.h> +#include <sys/cred.h> +#include <sys/cmn_err.h> +#include <sys/user.h> +#include <sys/privregs.h> +#include <sys/psw.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/var.h> +#include <sys/inline.h> +#include <sys/syscall.h> +#include <sys/ucontext.h> +#include <sys/cpuvar.h> +#include <sys/siginfo.h> +#include <sys/trap.h> +#include <sys/vtrace.h> +#include <sys/sysinfo.h> +#include <sys/procfs.h> +#include <sys/prsystm.h> +#include <c2/audit.h> +#include <sys/modctl.h> +#include <sys/aio_impl.h> +#include <sys/tnf.h> +#include <sys/tnf_probe.h> +#include <sys/copyops.h> +#include <sys/priv.h> +#include <sys/msacct.h> + +int syscalltrace = 0; +#ifdef SYSCALLTRACE +static kmutex_t systrace_lock; /* syscall tracing lock */ +#else +#define syscalltrace 0 +#endif /* SYSCALLTRACE */ + +typedef int64_t (*llfcn_t)(); /* function returning long long */ + +int pre_syscall(void); +void post_syscall(long rval1, long rval2); +static krwlock_t *lock_syscall(struct sysent *, uint_t); +void deferred_singlestep_trap(caddr_t); + +#ifdef _SYSCALL32_IMPL +#define LWP_GETSYSENT(lwp) \ + (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE ? sysent : sysent32) +#else +#define LWP_GETSYSENT(lwp) (sysent) +#endif + +/* + * If watchpoints are active, don't make copying in of + * system call arguments take a read watchpoint trap. + */ +static int +copyin_args(struct regs *rp, long *ap, uint_t nargs) +{ + greg_t *sp = 1 + (greg_t *)rp->r_sp; /* skip ret addr */ + + ASSERT(nargs <= MAXSYSARGS); + + return (copyin_nowatch(sp, ap, nargs * sizeof (*sp))); +} + +#if defined(_SYSCALL32_IMPL) +static int +copyin_args32(struct regs *rp, long *ap, uint_t nargs) +{ + greg32_t *sp = 1 + (greg32_t *)rp->r_sp; /* skip ret addr */ + uint32_t a32[MAXSYSARGS]; + int rc; + + ASSERT(nargs <= MAXSYSARGS); + + if ((rc = copyin_nowatch(sp, a32, nargs * sizeof (*sp))) == 0) { + uint32_t *a32p = &a32[0]; + + while (nargs--) + *ap++ = (ulong_t)*a32p++; + } + return (rc); +} +#define COPYIN_ARGS32 copyin_args32 +#else +#define COPYIN_ARGS32 copyin_args +#endif + +/* + * Error handler for system calls where arg copy gets fault. + */ +static longlong_t +syscall_err() +{ + return (0); +} + +/* + * Corresponding sysent entry to allow syscall_entry caller + * to invoke syscall_err. + */ +static struct sysent sysent_err = { + 0, SE_32RVAL1, NULL, NULL, (llfcn_t)syscall_err +}; + +/* + * Called from syscall() when a non-trivial 32-bit system call occurs. + * Sets up the args and returns a pointer to the handler. 
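Roughly how the syscall-entry glue is expected to drive the syscall_entry() routine defined just below together with syscall_exit(); a sketch only, with the hypothetical handle_syscall() standing in for the real assembler trampoline and its C support:

	static void
	handle_syscall(void)
	{
		long args[MAXSYSARGS];
		struct sysent *callp;
		rval_t rval;

		callp = syscall_entry(curthread, args);

		/*
		 * If argument copy-in or pre-syscall processing failed,
		 * callp is the dummy sysent_err entry and the error has
		 * already been set; calling through it is harmless.
		 */
		rval.r_vals = (*callp->sy_callc)(args[0], args[1], args[2],
		    args[3], args[4], args[5], args[6], args[7]);

		syscall_exit(curthread, rval.r_val1, rval.r_val2);
	}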
+ */ +struct sysent * +syscall_entry(kthread_t *t, long *argp) +{ + klwp_t *lwp = ttolwp(t); + struct regs *rp = lwptoregs(lwp); + unsigned int code; + struct sysent *callp; + struct sysent *se = LWP_GETSYSENT(lwp); + int error = 0; + uint_t nargs; + + ASSERT(t == curthread && curthread->t_schedflag & TS_DONT_SWAP); + + lwp->lwp_ru.sysc++; + lwp->lwp_eosys = NORMALRETURN; /* assume this will be normal */ + + /* + * Set lwp_ap to point to the args, even if none are needed for this + * system call. This is for the loadable-syscall case where the + * number of args won't be known until the system call is loaded, and + * also maintains a non-NULL lwp_ap setup for get_syscall_args(). Note + * that lwp_ap MUST be set to a non-NULL value _BEFORE_ t_sysnum is + * set to non-zero; otherwise get_syscall_args(), seeing a non-zero + * t_sysnum for this thread, will charge ahead and dereference lwp_ap. + */ + lwp->lwp_ap = argp; /* for get_syscall_args */ + + code = rp->r_r0; + t->t_sysnum = (short)code; + callp = code >= NSYSCALL ? &nosys_ent : se + code; + + if ((t->t_pre_sys | syscalltrace) != 0) { + error = pre_syscall(); + + /* + * pre_syscall() has taken care so that lwp_ap is current; + * it either points to syscall-entry-saved amd64 regs, + * or it points to lwp_arg[], which has been re-copied from + * the ia32 ustack, but either way, it's a current copy after + * /proc has possibly mucked with the syscall args. + */ + + if (error) + return (&sysent_err); /* use dummy handler */ + } + + /* + * Fetch the system call arguments to the kernel stack copy used + * for syscall handling. + * Note: for loadable system calls the number of arguments required + * may not be known at this point, and will be zero if the system call + * was never loaded. Once the system call has been loaded, the number + * of args is not allowed to be changed. + */ + if ((nargs = (uint_t)callp->sy_narg) != 0 && + COPYIN_ARGS32(rp, argp, nargs)) { + (void) set_errno(EFAULT); + return (&sysent_err); /* use dummy handler */ + } + + return (callp); /* return sysent entry for caller */ +} + +void +syscall_exit(kthread_t *t, long rval1, long rval2) +{ + /* + * Handle signals and other post-call events if necessary. + */ + if ((t->t_post_sys_ast | syscalltrace) == 0) { + klwp_t *lwp = ttolwp(t); + struct regs *rp = lwptoregs(lwp); + + /* + * Normal return. + * Clear error indication and set return values. + */ + rp->r_ps &= ~PS_C; /* reset carry bit */ + rp->r_r0 = rval1; + rp->r_r1 = rval2; + lwp->lwp_state = LWP_USER; + } else { + post_syscall(rval1, rval2); + } + t->t_sysnum = 0; /* invalidate args */ +} + +/* + * Perform pre-system-call processing, including stopping for tracing, + * auditing, etc. + * + * This routine is called only if the t_pre_sys flag is set. Any condition + * requiring pre-syscall handling must set the t_pre_sys flag. If the + * condition is persistent, this routine will repost t_pre_sys. + */ +int +pre_syscall() +{ + kthread_t *t = curthread; + unsigned code = t->t_sysnum; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + int repost; + + t->t_pre_sys = repost = 0; /* clear pre-syscall processing flag */ + + ASSERT(t->t_schedflag & TS_DONT_SWAP); + +#if defined(DEBUG) + /* + * On the i386 kernel, lwp_ap points at the piece of the thread + * stack that we copy the users arguments into. + * + * On the amd64 kernel, the syscall arguments in the rdi..r9 + * registers should be pointed at by lwp_ap. 
If the args need to + * be copied so that those registers can be changed without losing + * the ability to get the args for /proc, they can be saved by + * save_syscall_args(), and lwp_ap will be restored by post_syscall(). + */ + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { +#if defined(_LP64) + ASSERT(lwp->lwp_ap == (long *)&lwptoregs(lwp)->r_rdi); + } else { +#endif + ASSERT((caddr_t)lwp->lwp_ap > t->t_stkbase && + (caddr_t)lwp->lwp_ap < t->t_stk); + } +#endif /* DEBUG */ + + /* + * Make sure the thread is holding the latest credentials for the + * process. The credentials in the process right now apply to this + * thread for the entire system call. + */ + if (t->t_cred != p->p_cred) { + cred_t *oldcred = t->t_cred; + /* + * DTrace accesses t_cred in probe context. t_cred must + * always be either NULL, or point to a valid, allocated cred + * structure. + */ + t->t_cred = crgetcred(); + crfree(oldcred); + } + + /* + * From the proc(4) manual page: + * When entry to a system call is being traced, the traced process + * stops after having begun the call to the system but before the + * system call arguments have been fetched from the process. + */ + if (PTOU(p)->u_systrap) { + if (prismember(&PTOU(p)->u_entrymask, code)) { + mutex_enter(&p->p_lock); + /* + * Recheck stop condition, now that lock is held. + */ + if (PTOU(p)->u_systrap && + prismember(&PTOU(p)->u_entrymask, code)) { + stop(PR_SYSENTRY, code); + + /* + * /proc may have modified syscall args, + * either in regs for amd64 or on ustack + * for ia32. Either way, arrange to + * copy them again, both for the syscall + * handler and for other consumers in + * post_syscall (like audit). Here, we + * only do amd64, and just set lwp_ap + * back to the kernel-entry stack copy; + * the syscall ml code redoes + * move-from-regs to set up for the + * syscall handler after we return. For + * ia32, save_syscall_args() below makes + * an lwp_ap-accessible copy. + */ +#if defined(_LP64) + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { + lwp->lwp_argsaved = 0; + lwp->lwp_ap = + (long *)&lwptoregs(lwp)->r_rdi; + } +#endif + } + mutex_exit(&p->p_lock); + } + repost = 1; + } + + /* + * ia32 kernel, or ia32 proc on amd64 kernel: keep args in + * lwp_arg for post-syscall processing, regardless of whether + * they might have been changed in /proc above. + */ +#if defined(_LP64) + if (lwp_getdatamodel(lwp) != DATAMODEL_NATIVE) +#endif + (void) save_syscall_args(); + + if (lwp->lwp_sysabort) { + /* + * lwp_sysabort may have been set via /proc while the process + * was stopped on PR_SYSENTRY. If so, abort the system call. + * Override any error from the copyin() of the arguments. 
+ */ + lwp->lwp_sysabort = 0; + (void) set_errno(EINTR); /* forces post_sys */ + t->t_pre_sys = 1; /* repost anyway */ + return (1); /* don't do system call, return EINTR */ + } + + /* + * begin auditing for this syscall if the c2audit module is loaded + * and auditing is enabled + */ + if (audit_active == C2AUDIT_LOADED) { + uint32_t auditing = au_zone_getstate(NULL); + + if (auditing & AU_AUDIT_MASK) { + int error; + if (error = audit_start(T_SYSCALL, code, auditing, \ + 0, lwp)) { + t->t_pre_sys = 1; /* repost anyway */ + (void) set_errno(error); + return (1); + } + repost = 1; + } + } + +#ifndef NPROBE + /* Kernel probe */ + if (tnf_tracing_active) { + TNF_PROBE_1(syscall_start, "syscall thread", /* CSTYLED */, + tnf_sysnum, sysnum, t->t_sysnum); + t->t_post_sys = 1; /* make sure post_syscall runs */ + repost = 1; + } +#endif /* NPROBE */ + +#ifdef SYSCALLTRACE + if (syscalltrace) { + int i; + long *ap; + char *cp; + char *sysname; + struct sysent *callp; + + if (code >= NSYSCALL) + callp = &nosys_ent; /* nosys has no args */ + else + callp = LWP_GETSYSENT(lwp) + code; + (void) save_syscall_args(); + mutex_enter(&systrace_lock); + printf("%d: ", p->p_pid); + if (code >= NSYSCALL) { + printf("0x%x", code); + } else { + sysname = mod_getsysname(code); + printf("%s[0x%x/0x%p]", sysname == NULL ? "NULL" : + sysname, code, callp->sy_callc); + } + cp = "("; + for (i = 0, ap = lwp->lwp_ap; i < callp->sy_narg; i++, ap++) { + printf("%s%lx", cp, *ap); + cp = ", "; + } + if (i) + printf(")"); + printf(" %s id=0x%p\n", PTOU(p)->u_comm, curthread); + mutex_exit(&systrace_lock); + } +#endif /* SYSCALLTRACE */ + + /* + * If there was a continuing reason for pre-syscall processing, + * set the t_pre_sys flag for the next system call. + */ + if (repost) + t->t_pre_sys = 1; + lwp->lwp_error = 0; /* for old drivers */ + lwp->lwp_badpriv = PRIV_NONE; + return (0); +} + + +/* + * Post-syscall processing. Perform abnormal system call completion + * actions such as /proc tracing, profiling, signals, preemption, etc. + * + * This routine is called only if t_post_sys, t_sig_check, or t_astflag is set. + * Any condition requiring pre-syscall handling must set one of these. + * If the condition is persistent, this routine will repost t_post_sys. + */ +void +post_syscall(long rval1, long rval2) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + struct regs *rp = lwptoregs(lwp); + uint_t error; + uint_t code = t->t_sysnum; + int repost = 0; + int proc_stop = 0; /* non-zero if stopping */ + int sigprof = 0; /* non-zero if sending SIGPROF */ + + t->t_post_sys = 0; + + error = lwp->lwp_errno; + + /* + * Code can be zero if this is a new LWP returning after a forkall(), + * other than the one which matches the one in the parent which called + * forkall(). In these LWPs, skip most of post-syscall activity. + */ + if (code == 0) + goto sig_check; + /* + * If the trace flag is set, mark the lwp to take a single-step trap + * on return to user level (below). The x86 lcall interface and + * sysenter has already done this, and turned off the flag, but + * amd64 syscall interface has not. + */ + if (rp->r_ps & PS_T) { + lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING; + rp->r_ps &= ~PS_T; + aston(curthread); + } + + /* put out audit record for this syscall */ + if (AU_AUDITING()) { + rval_t rval; + + /* XX64 -- truncation of 64-bit return values? 
*/ + rval.r_val1 = (int)rval1; + rval.r_val2 = (int)rval2; + audit_finish(T_SYSCALL, code, error, &rval); + repost = 1; + } + + if (curthread->t_pdmsg != NULL) { + char *m = curthread->t_pdmsg; + + uprintf("%s", m); + kmem_free(m, strlen(m) + 1); + curthread->t_pdmsg = NULL; + } + + /* + * If we're going to stop for /proc tracing, set the flag and + * save the arguments so that the return values don't smash them. + */ + if (PTOU(p)->u_systrap) { + if (prismember(&PTOU(p)->u_exitmask, code)) { + if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) + (void) save_syscall_args(); + proc_stop = 1; + } + repost = 1; + } + + /* + * Similarly check to see if SIGPROF might be sent. + */ + if (curthread->t_rprof != NULL && + curthread->t_rprof->rp_anystate != 0) { + if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) + (void) save_syscall_args(); + sigprof = 1; + } + + if (lwp->lwp_eosys == NORMALRETURN) { + if (error == 0) { +#ifdef SYSCALLTRACE + if (syscalltrace) { + mutex_enter(&systrace_lock); + printf( + "%d: r_val1=0x%lx, r_val2=0x%lx, id 0x%p\n", + p->p_pid, rval1, rval2, curthread); + mutex_exit(&systrace_lock); + } +#endif /* SYSCALLTRACE */ + rp->r_ps &= ~PS_C; + rp->r_r0 = rval1; + rp->r_r1 = rval2; + } else { + int sig; +#ifdef SYSCALLTRACE + if (syscalltrace) { + mutex_enter(&systrace_lock); + printf("%d: error=%d, id 0x%p\n", + p->p_pid, error, curthread); + mutex_exit(&systrace_lock); + } +#endif /* SYSCALLTRACE */ + if (error == EINTR && t->t_activefd.a_stale) + error = EBADF; + if (error == EINTR && + (sig = lwp->lwp_cursig) != 0 && + sigismember(&PTOU(p)->u_sigrestart, sig) && + PTOU(p)->u_signal[sig - 1] != SIG_DFL && + PTOU(p)->u_signal[sig - 1] != SIG_IGN) + error = ERESTART; + rp->r_r0 = error; + rp->r_ps |= PS_C; + } + } + + /* + * From the proc(4) manual page: + * When exit from a system call is being traced, the traced process + * stops on completion of the system call just prior to checking for + * signals and returning to user level. At this point all return + * values have been stored into the traced process's saved registers. + */ + if (proc_stop) { + mutex_enter(&p->p_lock); + if (PTOU(p)->u_systrap && + prismember(&PTOU(p)->u_exitmask, code)) + stop(PR_SYSEXIT, code); + mutex_exit(&p->p_lock); + } + + /* + * If we are the parent returning from a successful + * vfork, wait for the child to exec or exit. + * This code must be here and not in the bowels of the system + * so that /proc can intercept exit from vfork in a timely way. + */ + if (t->t_flag & T_VFPARENT) { + ASSERT(code == SYS_vfork || code == SYS_forksys); + ASSERT(rp->r_r1 == 0 && error == 0); + vfwait((pid_t)rval1); + t->t_flag &= ~T_VFPARENT; + } + + /* + * If profiling is active, bill the current PC in user-land + * and keep reposting until profiling is disabled. + */ + if (p->p_prof.pr_scale) { + if (lwp->lwp_oweupc) + profil_tick(rp->r_pc); + repost = 1; + } + +sig_check: + /* + * Reset flag for next time. + * We must do this after stopping on PR_SYSEXIT + * because /proc uses the information in lwp_eosys. + */ + lwp->lwp_eosys = NORMALRETURN; + clear_stale_fd(); + t->t_flag &= ~T_FORKALL; + + if (t->t_astflag | t->t_sig_check) { + /* + * Turn off the AST flag before checking all the conditions that + * may have caused an AST. This flag is on whenever a signal or + * unusual condition should be handled after the next trap or + * syscall. + */ + astoff(t); + /* + * If a single-step trap occurred on a syscall (see trap()) + * recognize it now. 
Do this before checking for signals + * because deferred_singlestep_trap() may generate a SIGTRAP to + * the LWP or may otherwise mark the LWP to call issig(FORREAL). + */ + if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING) + deferred_singlestep_trap((caddr_t)rp->r_pc); + + t->t_sig_check = 0; + + /* + * The following check is legal for the following reasons: + * 1) The thread we are checking, is ourselves, so there is + * no way the proc can go away. + * 2) The only time we need to be protected by the + * lock is if the binding is changed. + * + * Note we will still take the lock and check the binding + * if the condition was true without the lock held. This + * prevents lock contention among threads owned by the + * same proc. + */ + + if (curthread->t_proc_flag & TP_CHANGEBIND) { + mutex_enter(&p->p_lock); + if (curthread->t_proc_flag & TP_CHANGEBIND) { + timer_lwpbind(); + curthread->t_proc_flag &= ~TP_CHANGEBIND; + } + mutex_exit(&p->p_lock); + } + + /* + * for kaio requests on the special kaio poll queue, + * copyout their results to user memory. + */ + if (p->p_aio) + aio_cleanup(0); + /* + * If this LWP was asked to hold, call holdlwp(), which will + * stop. holdlwps() sets this up and calls pokelwps() which + * sets the AST flag. + * + * Also check TP_EXITLWP, since this is used by fresh new LWPs + * through lwp_rtt(). That flag is set if the lwp_create(2) + * syscall failed after creating the LWP. + */ + if (ISHOLD(p) || (t->t_proc_flag & TP_EXITLWP)) + holdlwp(); + + /* + * All code that sets signals and makes ISSIG_PENDING + * evaluate true must set t_sig_check afterwards. + */ + if (ISSIG_PENDING(t, lwp, p)) { + if (issig(FORREAL)) + psig(); + t->t_sig_check = 1; /* recheck next time */ + } + + if (sigprof) { + int nargs = (code > 0 && code < NSYSCALL)? + LWP_GETSYSENT(lwp)[code].sy_narg : 0; + realsigprof(code, nargs, error); + t->t_sig_check = 1; /* recheck next time */ + } + + /* + * If a performance counter overflow interrupt was + * delivered *during* the syscall, then re-enable the + * AST so that we take a trip through trap() to cause + * the SIGEMT to be delivered. + */ + if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW) + aston(t); + + /* + * /proc can't enable/disable the trace bit itself + * because that could race with the call gate used by + * system calls via "lcall". If that happened, an + * invalid EFLAGS would result. prstep()/prnostep() + * therefore schedule an AST for the purpose. + */ + if (lwp->lwp_pcb.pcb_flags & REQUEST_STEP) { + lwp->lwp_pcb.pcb_flags &= ~REQUEST_STEP; + rp->r_ps |= PS_T; + } + if (lwp->lwp_pcb.pcb_flags & REQUEST_NOSTEP) { + lwp->lwp_pcb.pcb_flags &= ~REQUEST_NOSTEP; + rp->r_ps &= ~PS_T; + } + } + + lwp->lwp_errno = 0; /* clear error for next time */ + +#ifndef NPROBE + /* Kernel probe */ + if (tnf_tracing_active) { + TNF_PROBE_3(syscall_end, "syscall thread", /* CSTYLED */, + tnf_long, rval1, rval1, + tnf_long, rval2, rval2, + tnf_long, errno, (long)error); + repost = 1; + } +#endif /* NPROBE */ + + /* + * Set state to LWP_USER here so preempt won't give us a kernel + * priority if it occurs after this point. Call CL_TRAPRET() to + * restore the user-level priority. + * + * It is important that no locks (other than spinlocks) be entered + * after this point before returning to user mode (unless lwp_state + * is set back to LWP_SYS). + * + * XXX Sampled times past this point are charged to the user. 
+ */ + lwp->lwp_state = LWP_USER; + + if (t->t_trapret) { + t->t_trapret = 0; + thread_lock(t); + CL_TRAPRET(t); + thread_unlock(t); + } + if (CPU->cpu_runrun || t->t_schedflag & TS_ANYWAITQ) + preempt(); + prunstop(); + + lwp->lwp_errno = 0; /* clear error for next time */ + + /* + * The thread lock must be held in order to clear sysnum and reset + * lwp_ap atomically with respect to other threads in the system that + * may be looking at the args via lwp_ap from get_syscall_args(). + */ + + thread_lock(t); + t->t_sysnum = 0; /* no longer in a system call */ + + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { +#if defined(_LP64) + /* + * In case the args were copied to the lwp, reset the + * pointer so the next syscall will have the right + * lwp_ap pointer. + */ + lwp->lwp_ap = (long *)&rp->r_rdi; + } else { +#endif + lwp->lwp_ap = NULL; /* reset on every syscall entry */ + } + thread_unlock(t); + + lwp->lwp_argsaved = 0; + + /* + * If there was a continuing reason for post-syscall processing, + * set the t_post_sys flag for the next system call. + */ + if (repost) + t->t_post_sys = 1; + + /* + * If there is a ustack registered for this lwp, and the stack rlimit + * has been altered, read in the ustack. If the saved stack rlimit + * matches the bounds of the ustack, update the ustack to reflect + * the new rlimit. If the new stack rlimit is RLIM_INFINITY, disable + * stack checking by setting the size to 0. + */ + if (lwp->lwp_ustack != 0 && lwp->lwp_old_stk_ctl != 0) { + rlim64_t new_size; + caddr_t top; + stack_t stk; + struct rlimit64 rl; + + mutex_enter(&p->p_lock); + new_size = p->p_stk_ctl; + top = p->p_usrstack; + (void) rctl_rlimit_get(rctlproc_legacy[RLIMIT_STACK], p, &rl); + mutex_exit(&p->p_lock); + + if (rl.rlim_cur == RLIM64_INFINITY) + new_size = 0; + + if (copyin((stack_t *)lwp->lwp_ustack, &stk, + sizeof (stack_t)) == 0 && + (stk.ss_size == lwp->lwp_old_stk_ctl || + stk.ss_size == 0) && + stk.ss_sp == top - stk.ss_size) { + stk.ss_sp = (void *)((uintptr_t)stk.ss_sp + + stk.ss_size - (uintptr_t)new_size); + stk.ss_size = new_size; + + (void) copyout(&stk, (stack_t *)lwp->lwp_ustack, + sizeof (stack_t)); + } + + lwp->lwp_old_stk_ctl = 0; + } +} + +/* + * Called from post_syscall() when a deferred singlestep is to be taken. + */ +void +deferred_singlestep_trap(caddr_t pc) +{ + proc_t *p = ttoproc(curthread); + klwp_t *lwp = ttolwp(curthread); + pcb_t *pcb = &lwp->lwp_pcb; + uint_t fault = 0; + k_siginfo_t siginfo; + + bzero(&siginfo, sizeof (siginfo)); + + /* + * If both NORMAL_STEP and WATCH_STEP are in + * effect, give precedence to WATCH_STEP. + * If neither is set, user must have set the + * PS_T bit in %efl; treat this as NORMAL_STEP. + */ + if ((fault = undo_watch_step(&siginfo)) == 0 && + ((pcb->pcb_flags & NORMAL_STEP) || + !(pcb->pcb_flags & WATCH_STEP))) { + siginfo.si_signo = SIGTRAP; + siginfo.si_code = TRAP_TRACE; + siginfo.si_addr = pc; + fault = FLTTRACE; + } + pcb->pcb_flags &= ~(DEBUG_PENDING|NORMAL_STEP|WATCH_STEP); + + if (fault) { + /* + * Remember the fault and fault adddress + * for real-time (SIGPROF) profiling. + */ + lwp->lwp_lastfault = fault; + lwp->lwp_lastfaddr = siginfo.si_addr; + /* + * If a debugger has declared this fault to be an + * event of interest, stop the lwp. Otherwise just + * deliver the associated signal. 
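+		 *
+		 * (A debugger typically declares that interest by including
+		 * FLTTRACE in the fault set it installs with the proc(4)
+		 * PCSFAULT control message; p_fltmask below holds that set.)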
+ */ + if (prismember(&p->p_fltmask, fault) && + stop_on_fault(fault, &siginfo) == 0) + siginfo.si_signo = 0; + } + + if (siginfo.si_signo) + trapsig(&siginfo, 1); +} + +/* + * nonexistent system call-- signal lwp (may want to handle it) + * flag error if lwp won't see signal immediately + */ +int64_t +nosys(void) +{ + tsignal(curthread, SIGSYS); + return (set_errno(ENOSYS)); +} + +int +nosys32(void) +{ + return (nosys()); +} + +/* + * Execute a 32-bit system call on behalf of the current thread. + */ +void +dosyscall(void) +{ + /* + * Need space on the stack to store syscall arguments. + */ + long syscall_args[MAXSYSARGS]; + struct sysent *se; + int64_t ret; + + syscall_mstate(LMS_TRAP, LMS_SYSTEM); + + ASSERT(curproc->p_model == DATAMODEL_ILP32); + + CPU_STATS_ENTER_K(); + CPU_STATS_ADDQ(CPU, sys, syscall, 1); + CPU_STATS_EXIT_K(); + + se = syscall_entry(curthread, syscall_args); + + /* + * syscall_entry() copied all 8 arguments into syscall_args. + */ + ret = se->sy_callc(syscall_args[0], syscall_args[1], syscall_args[2], + syscall_args[3], syscall_args[4], syscall_args[5], syscall_args[6], + syscall_args[7]); + + syscall_exit(curthread, (int)ret & 0xffffffffu, (int)(ret >> 32)); + syscall_mstate(LMS_SYSTEM, LMS_TRAP); +} + +/* + * Get the arguments to the current system call. See comment atop + * save_syscall_args() regarding lwp_ap usage. + */ + +uint_t +get_syscall_args(klwp_t *lwp, long *argp, int *nargsp) +{ + kthread_t *t = lwptot(lwp); + ulong_t mask = 0xfffffffful; + uint_t code; + long *ap; + int nargs; + +#if defined(_LP64) + if (lwp_getdatamodel(lwp) == DATAMODEL_LP64) + mask = 0xfffffffffffffffful; +#endif + + /* + * The thread lock must be held while looking at the arguments to ensure + * they don't go away via post_syscall(). + * get_syscall_args() is the only routine to read them which is callable + * outside the LWP in question and hence the only one that must be + * synchronized in this manner. + */ + thread_lock(t); + + code = t->t_sysnum; + ap = lwp->lwp_ap; + + thread_unlock(t); + + if (code != 0 && code < NSYSCALL) { + nargs = LWP_GETSYSENT(lwp)[code].sy_narg; + + ASSERT(nargs <= MAXSYSARGS); + + *nargsp = nargs; + while (nargs-- > 0) + *argp++ = *ap++ & mask; + } else { + *nargsp = 0; + } + + return (code); +} + +#ifdef _SYSCALL32_IMPL +/* + * Get the arguments to the current 32-bit system call. + */ +uint_t +get_syscall32_args(klwp_t *lwp, int *argp, int *nargsp) +{ + long args[MAXSYSARGS]; + uint_t i, code; + + code = get_syscall_args(lwp, args, nargsp); + + for (i = 0; i != *nargsp; i++) + *argp++ = (int)args[i]; + return (code); +} +#endif + +/* + * Save the system call arguments in a safe place. + * + * On the i386 kernel: + * + * Copy the users args prior to changing the stack or stack pointer. + * This is so /proc will be able to get a valid copy of the + * args from the user stack even after the user stack has been changed. + * Note that the kernel stack copy of the args may also have been + * changed by a system call handler which takes C-style arguments. + * + * Note that this may be called by stop() from trap(). In that case + * t_sysnum will be zero (syscall_exit clears it), so no args will be + * copied. + * + * On the amd64 kernel: + * + * For 64-bit applications, lwp->lwp_ap normally points to %rdi..%r9 + * in the reg structure. If the user is going to change the argument + * registers, rax, or the stack and might want to get the args (for + * /proc tracing), it must copy the args elsewhere via save_syscall_args(). 
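+ *
+ * (Concretely, the copy made below fills lwp_arg[0..5] from the saved
+ * %rdi, %rsi, %rdx, %rcx, %r8 and %r9, and fetches any arguments beyond
+ * six from the user stack with copyin_args().)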
+ * + * For 32-bit applications, lwp->lwp_ap normally points to a copy of + * the system call arguments on the kernel stack made from the user + * stack. Copy the args prior to change the stack or stack pointer. + * This is so /proc will be able to get a valid copy of the args + * from the user stack even after that stack has been changed. + * + * This may be called from stop() even when we're not in a system call. + * Since there's no easy way to tell, this must be safe (not panic). + * If the copyins get data faults, return non-zero. + */ +int +save_syscall_args() +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + uint_t code = t->t_sysnum; + uint_t nargs; + + if (lwp->lwp_argsaved || code == 0) + return (0); /* args already saved or not needed */ + + if (code >= NSYSCALL) { + nargs = 0; /* illegal syscall */ + } else { + struct sysent *se = LWP_GETSYSENT(lwp); + struct sysent *callp = se + code; + + nargs = callp->sy_narg; + if (LOADABLE_SYSCALL(callp) && nargs == 0) { + krwlock_t *module_lock; + + /* + * Find out how many arguments the system + * call uses. + * + * We have the property that loaded syscalls + * never change the number of arguments they + * use after they've been loaded once. This + * allows us to stop for /proc tracing without + * holding the module lock. + * /proc is assured that sy_narg is valid. + */ + module_lock = lock_syscall(se, code); + nargs = callp->sy_narg; + rw_exit(module_lock); + } + } + + /* + * Fetch the system call arguments. + */ + if (nargs == 0) + goto out; + + ASSERT(nargs <= MAXSYSARGS); + + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { +#if defined(_LP64) + struct regs *rp = lwptoregs(lwp); + + lwp->lwp_arg[0] = rp->r_rdi; + lwp->lwp_arg[1] = rp->r_rsi; + lwp->lwp_arg[2] = rp->r_rdx; + lwp->lwp_arg[3] = rp->r_rcx; + lwp->lwp_arg[4] = rp->r_r8; + lwp->lwp_arg[5] = rp->r_r9; + if (nargs > 6 && copyin_args(rp, &lwp->lwp_arg[6], nargs - 6)) + return (-1); + } else { +#endif + if (COPYIN_ARGS32(lwptoregs(lwp), lwp->lwp_arg, nargs)) + return (-1); + } +out: + lwp->lwp_ap = lwp->lwp_arg; + lwp->lwp_argsaved = 1; + t->t_post_sys = 1; /* so lwp_ap will be reset */ + return (0); +} + +void +reset_syscall_args(void) +{ + ttolwp(curthread)->lwp_argsaved = 0; +} + +/* + * Call a system call which takes a pointer to the user args struct and + * a pointer to the return values. This is a bit slower than the standard + * C arg-passing method in some cases. + */ +int64_t +syscall_ap(void) +{ + uint_t error; + struct sysent *callp; + rval_t rval; + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + struct regs *rp = lwptoregs(lwp); + + callp = LWP_GETSYSENT(lwp) + t->t_sysnum; + + /* + * If the arguments don't fit in registers %rdi-%r9, make sure they + * have been copied to the lwp_arg array. + */ + if (callp->sy_narg > 6 && save_syscall_args()) + return ((int64_t)set_errno(EFAULT)); + + rval.r_val1 = 0; + rval.r_val2 = rp->r_r1; + lwp->lwp_error = 0; /* for old drivers */ + error = (*(callp->sy_call))(lwp->lwp_ap, &rval); + if (error) + return ((longlong_t)set_errno(error)); + return (rval.r_vals); +} + +/* + * Load system call module. + * Returns with pointer to held read lock for module. + */ +static krwlock_t * +lock_syscall(struct sysent *table, uint_t code) +{ + krwlock_t *module_lock; + struct modctl *modp; + int id; + struct sysent *callp; + + callp = table + code; + module_lock = callp->sy_lock; + + /* + * Optimization to only call modload if we don't have a loaded + * syscall. 
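+	 *
+	 * (The shape of the loop below: check LOADED_SYSCALL() under the
+	 * read lock, drop it, modload() the module, then re-find the
+	 * modctl and confirm mod_installed before retaking the read lock.)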
+ */ + rw_enter(module_lock, RW_READER); + if (LOADED_SYSCALL(callp)) + return (module_lock); + rw_exit(module_lock); + + for (;;) { + if ((id = modload("sys", syscallnames[code])) == -1) + break; + + /* + * If we loaded successfully at least once, the modctl + * will still be valid, so we try to grab it by filename. + * If this call fails, it's because the mod_filename + * was changed after the call to modload() (mod_hold_by_name() + * is the likely culprit). We can safely just take + * another lap if this is the case; the modload() will + * change the mod_filename back to one by which we can + * find the modctl. + */ + modp = mod_find_by_filename("sys", syscallnames[code]); + + if (modp == NULL) + continue; + + mutex_enter(&mod_lock); + + if (!modp->mod_installed) { + mutex_exit(&mod_lock); + continue; + } + break; + } + rw_enter(module_lock, RW_READER); + + if (id != -1) + mutex_exit(&mod_lock); + + return (module_lock); +} + +/* + * Loadable syscall support. + * If needed, load the module, then reserve it by holding a read + * lock for the duration of the call. + * Later, if the syscall is not unloadable, it could patch the vector. + */ +/*ARGSUSED*/ +int64_t +loadable_syscall( + long a0, long a1, long a2, long a3, + long a4, long a5, long a6, long a7) +{ + klwp_t *lwp = ttolwp(curthread); + int64_t rval; + struct sysent *callp; + struct sysent *se = LWP_GETSYSENT(lwp); + krwlock_t *module_lock; + int code, error = 0; + + code = curthread->t_sysnum; + callp = se + code; + + /* + * Try to autoload the system call if necessary + */ + module_lock = lock_syscall(se, code); + + /* + * we've locked either the loaded syscall or nosys + */ + + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { +#if defined(_LP64) + if (callp->sy_flags & SE_ARGC) { + rval = (int64_t)(*callp->sy_call)(a0, a1, a2, a3, + a4, a5); + } else { + rval = syscall_ap(); + } + } else { +#endif + /* + * Now that it's loaded, make sure enough args were copied. + */ + if (COPYIN_ARGS32(lwptoregs(lwp), lwp->lwp_ap, callp->sy_narg)) + error = EFAULT; + if (error) { + rval = set_errno(error); + } else if (callp->sy_flags & SE_ARGC) { + rval = (int64_t)(*callp->sy_call)(lwp->lwp_ap[0], + lwp->lwp_ap[1], lwp->lwp_ap[2], lwp->lwp_ap[3], + lwp->lwp_ap[4], lwp->lwp_ap[5]); + } else { + rval = syscall_ap(); + } + } + + rw_exit(module_lock); + return (rval); +} + +/* + * Indirect syscall handled in libc on x86 architectures + */ +int64_t +indir() +{ + return (nosys()); +} + +/* + * set_errno - set an error return from the current system call. + * This could be a macro. + * This returns the value it is passed, so that the caller can + * use tail-recursion-elimination and do return (set_errno(ERRNO)); + */ +uint_t +set_errno(uint_t error) +{ + ASSERT(error != 0); /* must not be used to clear errno */ + + curthread->t_post_sys = 1; /* have post_syscall do error return */ + return (ttolwp(curthread)->lwp_errno = error); +} + +/* + * set_proc_pre_sys - Set pre-syscall processing for entire process. + */ +void +set_proc_pre_sys(proc_t *p) +{ + kthread_t *t; + kthread_t *first; + + ASSERT(MUTEX_HELD(&p->p_lock)); + + t = first = p->p_tlist; + do { + t->t_pre_sys = 1; + } while ((t = t->t_forw) != first); +} + +/* + * set_proc_post_sys - Set post-syscall processing for entire process. 
+ */ +void +set_proc_post_sys(proc_t *p) +{ + kthread_t *t; + kthread_t *first; + + ASSERT(MUTEX_HELD(&p->p_lock)); + + t = first = p->p_tlist; + do { + t->t_post_sys = 1; + } while ((t = t->t_forw) != first); +} + +/* + * set_proc_sys - Set pre- and post-syscall processing for entire process. + */ +void +set_proc_sys(proc_t *p) +{ + kthread_t *t; + kthread_t *first; + + ASSERT(MUTEX_HELD(&p->p_lock)); + + t = first = p->p_tlist; + do { + t->t_pre_sys = 1; + t->t_post_sys = 1; + } while ((t = t->t_forw) != first); +} + +/* + * set_all_proc_sys - set pre- and post-syscall processing flags for all + * user processes. + * + * This is needed when auditing, tracing, or other facilities which affect + * all processes are turned on. + */ +void +set_all_proc_sys() +{ + kthread_t *t; + kthread_t *first; + + mutex_enter(&pidlock); + t = first = curthread; + do { + t->t_pre_sys = 1; + t->t_post_sys = 1; + } while ((t = t->t_next) != first); + mutex_exit(&pidlock); +} + +/* + * set_all_zone_usr_proc_sys - set pre- and post-syscall processing flags for + * all user processes running in the zone of the current process + * + * This is needed when auditing, tracing, or other facilities which affect + * all processes are turned on. + */ +void +set_all_zone_usr_proc_sys(zoneid_t zoneid) +{ + proc_t *p; + kthread_t *t; + + mutex_enter(&pidlock); + for (p = practive; p != NULL; p = p->p_next) { + /* skip kernel and incomplete processes */ + if (p->p_exec == NULLVP || p->p_as == &kas || + p->p_stat == SIDL || p->p_stat == SZOMB || + (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) + continue; + /* + * Only processes in the given zone (eventually in + * all zones) are taken into account + */ + if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) { + mutex_enter(&p->p_lock); + if ((t = p->p_tlist) == NULL) { + mutex_exit(&p->p_lock); + continue; + } + /* + * Set pre- and post-syscall processing flags + * for all threads of the process + */ + do { + t->t_pre_sys = 1; + t->t_post_sys = 1; + } while (p->p_tlist != (t = t->t_forw)); + mutex_exit(&p->p_lock); + } + } + mutex_exit(&pidlock); +} + +/* + * set_proc_ast - Set asynchronous service trap (AST) flag for all + * threads in process. + */ +void +set_proc_ast(proc_t *p) +{ + kthread_t *t; + kthread_t *first; + + ASSERT(MUTEX_HELD(&p->p_lock)); + + t = first = p->p_tlist; + do { + aston(t); + } while ((t = t->t_forw) != first); +} diff --git a/usr/src/uts/intel/os/sysi86.c b/usr/src/uts/intel/os/sysi86.c new file mode 100644 index 0000000000..b107afddfb --- /dev/null +++ b/usr/src/uts/intel/os/sysi86.c @@ -0,0 +1,850 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2021 Joyent, Inc. 
+ */ + +/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ +/* All Rights Reserved */ + +/* Copyright (c) 1987, 1988 Microsoft Corporation */ +/* All Rights Reserved */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/signal.h> +#include <sys/errno.h> +#include <sys/fault.h> +#include <sys/syscall.h> +#include <sys/cpuvar.h> +#include <sys/sysi86.h> +#include <sys/psw.h> +#include <sys/cred.h> +#include <sys/policy.h> +#include <sys/thread.h> +#include <sys/debug.h> +#include <sys/ontrap.h> +#include <sys/privregs.h> +#include <sys/x86_archext.h> +#include <sys/vmem.h> +#include <sys/kmem.h> +#include <sys/mman.h> +#include <sys/archsystm.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> +#include <vm/faultcode.h> +#include <sys/fp.h> +#include <sys/cmn_err.h> +#include <sys/segments.h> +#include <sys/clock.h> +#include <vm/hat_i86.h> +#if defined(__xpv) +#include <sys/hypervisor.h> +#include <sys/note.h> +#endif + +static void ldt_alloc(proc_t *, uint_t); +static void ldt_free(proc_t *); +static void ldt_dup(proc_t *, proc_t *); +static void ldt_grow(proc_t *, uint_t); + +/* + * sysi86 System Call + */ + +/* ARGSUSED */ +int +sysi86(short cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3) +{ + struct ssd ssd; + int error = 0; + int c; + proc_t *pp = curproc; + + switch (cmd) { + + /* + * The SI86V86 subsystem call of the SYSI86 system call + * supports only one subcode -- V86SC_IOPL. + */ + case SI86V86: + if (arg1 == V86SC_IOPL) { +#if defined(__xpv) + struct ctxop *ctx; +#endif + struct regs *rp = lwptoregs(ttolwp(curthread)); + greg_t oldpl = rp->r_ps & PS_IOPL; + greg_t newpl = arg2 & PS_IOPL; + + /* + * Must be privileged to run this system call + * if giving more io privilege. + */ + if (newpl > oldpl && (error = + secpolicy_sys_config(CRED(), B_FALSE)) != 0) + return (set_errno(error)); +#if defined(__xpv) + ctx = installctx_preallocate(); + kpreempt_disable(); + installctx(curthread, NULL, xen_disable_user_iopl, + xen_enable_user_iopl, NULL, NULL, + xen_disable_user_iopl, NULL, ctx); + xen_enable_user_iopl(); + kpreempt_enable(); +#else + rp->r_ps ^= oldpl ^ newpl; +#endif + } else + error = EINVAL; + break; + + /* + * Set a segment descriptor + */ + case SI86DSCR: + /* + * There are considerable problems here manipulating + * resources shared by many running lwps. Get everyone + * into a safe state before changing the LDT. + */ + if (curthread != pp->p_agenttp && !holdlwps(SHOLDFORK1)) { + error = EINTR; + break; + } + + if (get_udatamodel() == DATAMODEL_LP64) { + error = EINVAL; + break; + } + + if (copyin((caddr_t)arg1, &ssd, sizeof (ssd)) < 0) { + error = EFAULT; + break; + } + + error = setdscr(&ssd); + + mutex_enter(&pp->p_lock); + if (curthread != pp->p_agenttp) + continuelwps(pp); + mutex_exit(&pp->p_lock); + break; + + case SI86FPHW: + c = fp_kind & 0xff; + if (suword32((void *)arg1, c) == -1) + error = EFAULT; + break; + + case SI86FPSTART: + /* + * arg1 is the address of _fp_hw + * arg2 is the desired x87 FCW value + * arg3 is the desired SSE MXCSR value + * a return value of one means SSE hardware, else none. + */ + c = fp_kind & 0xff; + if (suword32((void *)arg1, c) == -1) { + error = EFAULT; + break; + } + fpsetcw((uint16_t)arg2, (uint32_t)arg3); + return ((fp_kind & __FP_SSE) ? 
1 : 0); + + /* real time clock management commands */ + + case WTODC: + if ((error = secpolicy_settime(CRED())) == 0) { + timestruc_t ts; + mutex_enter(&tod_lock); + gethrestime(&ts); + tod_set(ts); + mutex_exit(&tod_lock); + } + break; + +/* Give some timezone playing room */ +#define ONEWEEK (7 * 24 * 60 * 60) + + case SGMTL: + /* + * Called from 32 bit land, negative values + * are not sign extended, so we do that here + * by casting it to an int and back. We also + * clamp the value to within reason and detect + * when a 64 bit call overflows an int. + */ + if ((error = secpolicy_settime(CRED())) == 0) { + int newlag = (int)arg1; + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() == DATAMODEL_NATIVE && + (long)newlag != (long)arg1) { + error = EOVERFLOW; + } else +#endif + if (newlag >= -ONEWEEK && newlag <= ONEWEEK) + sgmtl(newlag); + else + error = EOVERFLOW; + } + break; + + case GGMTL: + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (sulword((void *)arg1, ggmtl()) == -1) + error = EFAULT; +#ifdef _SYSCALL32_IMPL + } else { + time_t gmtl; + + if ((gmtl = ggmtl()) > INT32_MAX) { + /* + * Since gmt_lag can at most be + * +/- 12 hours, something is + * *seriously* messed up here. + */ + error = EOVERFLOW; + } else if (suword32((void *)arg1, (int32_t)gmtl) == -1) + error = EFAULT; +#endif + } + break; + + case RTCSYNC: + if ((error = secpolicy_settime(CRED())) == 0) + rtcsync(); + break; + + /* END OF real time clock management commands */ + + default: + error = EINVAL; + break; + } + return (error == 0 ? 0 : set_errno(error)); +} + +void +usd_to_ssd(user_desc_t *usd, struct ssd *ssd, selector_t sel) +{ + ssd->bo = USEGD_GETBASE(usd); + ssd->ls = USEGD_GETLIMIT(usd); + ssd->sel = sel; + + /* + * set type, dpl and present bits. + */ + ssd->acc1 = usd->usd_type; + ssd->acc1 |= usd->usd_dpl << 5; + ssd->acc1 |= usd->usd_p << (5 + 2); + + /* + * set avl, DB and granularity bits. + */ + ssd->acc2 = usd->usd_avl; + + ssd->acc2 |= usd->usd_long << 1; + + ssd->acc2 |= usd->usd_def32 << (1 + 1); + ssd->acc2 |= usd->usd_gran << (1 + 1 + 1); +} + +static void +ssd_to_usd(struct ssd *ssd, user_desc_t *usd) +{ + + ASSERT(bcmp(usd, &null_udesc, sizeof (*usd)) == 0); + + USEGD_SETBASE(usd, ssd->bo); + USEGD_SETLIMIT(usd, ssd->ls); + + /* + * Set type, dpl and present bits. + * + * Force the "accessed" bit to on so that we don't run afoul of + * KPTI. + */ + usd->usd_type = ssd->acc1 | SDT_A; + usd->usd_dpl = ssd->acc1 >> 5; + usd->usd_p = ssd->acc1 >> (5 + 2); + + ASSERT(usd->usd_type >= SDT_MEMRO); + ASSERT(usd->usd_dpl == SEL_UPL); + + /* + * 64-bit code selectors are never allowed in the LDT. + * Reserved bit is always 0 on 32-bit systems. + */ + usd->usd_long = 0; + + /* + * set avl, DB and granularity bits. + */ + usd->usd_avl = ssd->acc2; + usd->usd_def32 = ssd->acc2 >> (1 + 1); + usd->usd_gran = ssd->acc2 >> (1 + 1 + 1); +} + + + +/* + * Load LDT register with the current process's LDT. + */ +static void +ldt_load(void) +{ +#if defined(__xpv) + xen_set_ldt(curproc->p_ldt, curproc->p_ldtlimit + 1); +#else + size_t len; + system_desc_t desc; + + /* + * Before we can use the LDT on this CPU, we must install the LDT in the + * user mapping table. + */ + len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t); + bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len); + CPU->cpu_m.mcpu_ldt_len = len; + set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL); + *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc; + + wr_ldtr(ULDT_SEL); +#endif +} + +/* + * Store a NULL selector in the LDTR. 
All subsequent illegal references to + * the LDT will result in a #gp. + */ +void +ldt_unload(void) +{ +#if defined(__xpv) + xen_set_ldt(NULL, 0); +#else + *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc; + wr_ldtr(0); + + bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len); + CPU->cpu_m.mcpu_ldt_len = 0; +#endif +} + +/*ARGSUSED*/ +static void +ldt_savectx(proc_t *p) +{ + ASSERT(p->p_ldt != NULL); + ASSERT(p == curproc); + + /* + * The 64-bit kernel must be sure to clear any stale ldt + * selectors when context switching away from a process that + * has a private ldt. Consider the following example: + * + * Wine creats a ldt descriptor and points a segment register + * to it. + * + * We then context switch away from wine lwp to kernel + * thread and hit breakpoint in kernel with kmdb + * + * When we continue and resume from kmdb we will #gp + * fault since kmdb will have saved the stale ldt selector + * from wine and will try to restore it but we are no longer in + * the context of the wine process and do not have our + * ldtr register pointing to the private ldt. + */ + reset_sregs(); + + ldt_unload(); + cpu_fast_syscall_enable(); +} + +static void +ldt_restorectx(proc_t *p) +{ + ASSERT(p->p_ldt != NULL); + ASSERT(p == curproc); + + ldt_load(); + cpu_fast_syscall_disable(); +} + +/* + * At exec time, we need to clear up our LDT context and re-enable fast syscalls + * for the new process image. + * + * The same is true for the other case, where we have: + * + * proc_exit() + * ->exitpctx()->ldt_savectx() + * ->freepctx()->ldt_freectx() + * + * Because pre-emption is not prevented between the two callbacks, we could have + * come off CPU, and brought back LDT context when coming back on CPU via + * ldt_restorectx(). + */ +/* ARGSUSED */ +static void +ldt_freectx(proc_t *p, int isexec) +{ + ASSERT(p->p_ldt != NULL); + ASSERT(p == curproc); + + kpreempt_disable(); + ldt_free(p); + cpu_fast_syscall_enable(); + kpreempt_enable(); +} + +/* + * Install ctx op that ensures syscall/sysenter are disabled. + * See comments below. + * + * When a thread with a private LDT forks, the new process + * must have the LDT context ops installed. + */ +/* ARGSUSED */ +static void +ldt_installctx(proc_t *p, proc_t *cp) +{ + proc_t *targ = p; + kthread_t *t; + + /* + * If this is a fork, operate on the child process. + */ + if (cp != NULL) { + targ = cp; + ldt_dup(p, cp); + } + + /* + * The process context ops expect the target process as their argument. + */ + ASSERT(removepctx(targ, targ, ldt_savectx, ldt_restorectx, + ldt_installctx, ldt_savectx, ldt_freectx) == 0); + + installpctx(targ, targ, ldt_savectx, ldt_restorectx, + ldt_installctx, ldt_savectx, ldt_freectx); + + /* + * We've just disabled fast system call and return instructions; take + * the slow path out to make sure we don't try to use one to return + * back to user. We must set t_post_sys for every thread in the + * process to make sure none of them escape out via fast return. + */ + + mutex_enter(&targ->p_lock); + t = targ->p_tlist; + do { + t->t_post_sys = 1; + } while ((t = t->t_forw) != targ->p_tlist); + mutex_exit(&targ->p_lock); +} + +int +setdscr(struct ssd *ssd) +{ + ushort_t seli; /* selector index */ + user_desc_t *ldp; /* descriptor pointer */ + user_desc_t ndesc; /* new descriptor */ + proc_t *pp = curproc; + int rc = 0; + + /* + * LDT segments: executable and data at DPL 3 only. + */ + if (!SELISLDT(ssd->sel) || !SELISUPL(ssd->sel)) + return (EINVAL); + + /* + * check the selector index. 
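+	 *
+	 * (An x86 selector encodes (index << 3) | TI | RPL, so an LDT
+	 * selector at RPL 3 with index 7, for example, is
+	 * (7 << 3) | 0x4 | 0x3 == 0x3f, and SELTOIDX() recovers the 7.)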
+ */ + seli = SELTOIDX(ssd->sel); + if (seli >= MAXNLDT || seli < LDT_UDBASE) + return (EINVAL); + + ndesc = null_udesc; + mutex_enter(&pp->p_ldtlock); + + /* + * If this is the first time for this process then setup a + * private LDT for it. + */ + if (pp->p_ldt == NULL) { + ldt_alloc(pp, seli); + + /* + * Now that this process has a private LDT, the use of + * the syscall/sysret and sysenter/sysexit instructions + * is forbidden for this processes because they destroy + * the contents of %cs and %ss segment registers. + * + * Explicity disable them here and add a context handler + * to the process. Note that disabling + * them here means we can't use sysret or sysexit on + * the way out of this system call - so we force this + * thread to take the slow path (which doesn't make use + * of sysenter or sysexit) back out. + */ + kpreempt_disable(); + ldt_installctx(pp, NULL); + cpu_fast_syscall_disable(); + ASSERT(curthread->t_post_sys != 0); + kpreempt_enable(); + + } else if (seli > pp->p_ldtlimit) { + ASSERT(pp->p_pctx != NULL); + + /* + * Increase size of ldt to include seli. + */ + ldt_grow(pp, seli); + } + + ASSERT(seli <= pp->p_ldtlimit); + ldp = &pp->p_ldt[seli]; + + /* + * On the 64-bit kernel, this is where things get more subtle. + * Recall that in the 64-bit kernel, when we enter the kernel we + * deliberately -don't- reload the segment selectors we came in on + * for %ds, %es, %fs or %gs. Messing with selectors is expensive, + * and the underlying descriptors are essentially ignored by the + * hardware in long mode - except for the base that we override with + * the gsbase MSRs. + * + * However, there's one unfortunate issue with this rosy picture -- + * a descriptor that's not marked as 'present' will still generate + * an #np when loading a segment register. + * + * Consider this case. An lwp creates a harmless LDT entry, points + * one of it's segment registers at it, then tells the kernel (here) + * to delete it. In the 32-bit kernel, the #np will happen on the + * way back to userland where we reload the segment registers, and be + * handled in kern_gpfault(). In the 64-bit kernel, the same thing + * will happen in the normal case too. However, if we're trying to + * use a debugger that wants to save and restore the segment registers, + * and the debugger things that we have valid segment registers, we + * have the problem that the debugger will try and restore the + * segment register that points at the now 'not present' descriptor + * and will take a #np right there. + * + * We should obviously fix the debugger to be paranoid about + * -not- restoring segment registers that point to bad descriptors; + * however we can prevent the problem here if we check to see if any + * of the segment registers are still pointing at the thing we're + * destroying; if they are, return an error instead. (That also seems + * a lot better failure mode than SIGKILL and a core file + * from kern_gpfault() too.) + */ + if (SI86SSD_PRES(ssd) == 0) { + kthread_t *t; + int bad = 0; + + /* + * Look carefully at the segment registers of every lwp + * in the process (they're all stopped by our caller). + * If we're about to invalidate a descriptor that's still + * being referenced by *any* of them, return an error, + * rather than having them #gp on their way out of the kernel. 
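+		 *
+		 * (Both places a stale selector can hide are checked below:
+		 * the live values in the saved regs, and, when
+		 * PCB_NEED_UPDATE_SEGS is set, the pcb_ds/pcb_es/pcb_fs/
+		 * pcb_gs copies that will be reloaded on the way back out.)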
+ */ + ASSERT(pp->p_lwprcnt == 1); + + mutex_enter(&pp->p_lock); + t = pp->p_tlist; + do { + klwp_t *lwp = ttolwp(t); + struct regs *rp = lwp->lwp_regs; + pcb_t *pcb = &lwp->lwp_pcb; + + if (ssd->sel == rp->r_cs || ssd->sel == rp->r_ss) { + bad = 1; + break; + } + + if (PCB_NEED_UPDATE_SEGS(pcb)) { + if (ssd->sel == pcb->pcb_ds || + ssd->sel == pcb->pcb_es || + ssd->sel == pcb->pcb_fs || + ssd->sel == pcb->pcb_gs) { + bad = 1; + break; + } + } else { + if (ssd->sel == rp->r_ds || + ssd->sel == rp->r_es || + ssd->sel == rp->r_fs || + ssd->sel == rp->r_gs) { + bad = 1; + break; + } + } + + } while ((t = t->t_forw) != pp->p_tlist); + mutex_exit(&pp->p_lock); + + if (bad) { + mutex_exit(&pp->p_ldtlock); + return (EBUSY); + } + } + + /* + * If acc1 is zero, clear the descriptor (including the 'present' bit). + * Make sure we update the CPU-private copy of the LDT. + */ + if (ssd->acc1 == 0) { + rc = ldt_update_segd(ldp, &null_udesc); + kpreempt_disable(); + ldt_load(); + kpreempt_enable(); + mutex_exit(&pp->p_ldtlock); + return (rc); + } + + /* + * Check segment type, allow segment not present and + * only user DPL (3). + */ + if (SI86SSD_DPL(ssd) != SEL_UPL) { + mutex_exit(&pp->p_ldtlock); + return (EINVAL); + } + + /* + * Do not allow 32-bit applications to create 64-bit mode code + * segments. + */ + if (SI86SSD_ISUSEG(ssd) && ((SI86SSD_TYPE(ssd) >> 3) & 1) == 1 && + SI86SSD_ISLONG(ssd)) { + mutex_exit(&pp->p_ldtlock); + return (EINVAL); + } + + /* + * Set up a code or data user segment descriptor, making sure to update + * the CPU-private copy of the LDT. + */ + if (SI86SSD_ISUSEG(ssd)) { + ssd_to_usd(ssd, &ndesc); + rc = ldt_update_segd(ldp, &ndesc); + kpreempt_disable(); + ldt_load(); + kpreempt_enable(); + mutex_exit(&pp->p_ldtlock); + return (rc); + } + + mutex_exit(&pp->p_ldtlock); + return (EINVAL); +} + +/* + * Allocate new LDT for process just large enough to contain seli. Note we + * allocate and grow LDT in PAGESIZE chunks. We do this to simplify the + * implementation and because on the hypervisor it's required, since the LDT + * must live on pages that have PROT_WRITE removed and which are given to the + * hypervisor. + * + * Note that we don't actually load the LDT into the current CPU here: it's done + * later by our caller. + */ +static void +ldt_alloc(proc_t *pp, uint_t seli) +{ + user_desc_t *ldt; + size_t ldtsz; + uint_t nsels; + + ASSERT(MUTEX_HELD(&pp->p_ldtlock)); + ASSERT(pp->p_ldt == NULL); + ASSERT(pp->p_ldtlimit == 0); + + /* + * Allocate new LDT just large enough to contain seli. The LDT must + * always be allocated in units of pages for KPTI. 
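+	 *
+	 * (Worked example, assuming 4K pages and 8-byte user_desc_t
+	 * entries: seli == 7 rounds (7 + 1) * 8 == 64 bytes up to one
+	 * 4096-byte page, i.e. 512 entries; seli == 600 rounds 4808 bytes
+	 * up to two pages, i.e. 1024 entries.)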
+ */ + ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); + nsels = ldtsz / sizeof (user_desc_t); + ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT); + + ldt = kmem_zalloc(ldtsz, KM_SLEEP); + ASSERT(IS_P2ALIGNED(ldt, PAGESIZE)); + +#if defined(__xpv) + if (xen_ldt_setprot(ldt, ldtsz, PROT_READ)) + panic("ldt_alloc:xen_ldt_setprot(PROT_READ) failed"); +#endif + + pp->p_ldt = ldt; + pp->p_ldtlimit = nsels - 1; +} + +static void +ldt_free(proc_t *pp) +{ + user_desc_t *ldt; + size_t ldtsz; + + ASSERT(pp->p_ldt != NULL); + + mutex_enter(&pp->p_ldtlock); + ldt = pp->p_ldt; + ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); + + ASSERT(IS_P2ALIGNED(ldtsz, PAGESIZE)); + + pp->p_ldt = NULL; + pp->p_ldtlimit = 0; + mutex_exit(&pp->p_ldtlock); + + if (pp == curproc) { + kpreempt_disable(); + ldt_unload(); + kpreempt_enable(); + } + +#if defined(__xpv) + /* + * We are not allowed to make the ldt writable until after + * we tell the hypervisor to unload it. + */ + if (xen_ldt_setprot(ldt, ldtsz, PROT_READ | PROT_WRITE)) + panic("ldt_free:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); +#endif + + kmem_free(ldt, ldtsz); +} + +/* + * On fork copy new ldt for child. + */ +static void +ldt_dup(proc_t *pp, proc_t *cp) +{ + size_t ldtsz; + + ASSERT(pp->p_ldt != NULL); + ASSERT(cp != curproc); + + /* + * I assume the parent's ldt can't increase since we're in a fork. + */ + mutex_enter(&pp->p_ldtlock); + mutex_enter(&cp->p_ldtlock); + + ldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); + + ldt_alloc(cp, pp->p_ldtlimit); + +#if defined(__xpv) + /* + * Make child's ldt writable so it can be copied into from + * parent's ldt. This works since ldt_alloc above did not load + * the ldt since its for the child process. If we tried to make + * an LDT writable that is loaded in hw the setprot operation + * would fail. + */ + if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ | PROT_WRITE)) + panic("ldt_dup:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed"); +#endif + + bcopy(pp->p_ldt, cp->p_ldt, ldtsz); + +#if defined(__xpv) + if (xen_ldt_setprot(cp->p_ldt, ldtsz, PROT_READ)) + panic("ldt_dup:xen_ldt_setprot(PROT_READ) failed"); +#endif + mutex_exit(&cp->p_ldtlock); + mutex_exit(&pp->p_ldtlock); + +} + +/* + * Note that we don't actually load the LDT into the current CPU here: it's done + * later by our caller - unless we take an error. This works out because + * ldt_load() does a copy of ->p_ldt instead of directly loading it into the GDT + * (and therefore can't be using the freed old LDT), and by definition if the + * new entry didn't pass validation, then the proc shouldn't be referencing an + * entry in the extended region. + */ +static void +ldt_grow(proc_t *pp, uint_t seli) +{ + user_desc_t *oldt, *nldt; + uint_t nsels; + size_t oldtsz, nldtsz; + + ASSERT(MUTEX_HELD(&pp->p_ldtlock)); + ASSERT(pp->p_ldt != NULL); + ASSERT(pp->p_ldtlimit != 0); + + /* + * Allocate larger LDT just large enough to contain seli. The LDT must + * always be allocated in units of pages for KPTI. + */ + nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); + nsels = nldtsz / sizeof (user_desc_t); + ASSERT(nsels >= MINNLDT && nsels <= MAXNLDT); + ASSERT(nsels > pp->p_ldtlimit); + + oldt = pp->p_ldt; + oldtsz = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); + + nldt = kmem_zalloc(nldtsz, KM_SLEEP); + ASSERT(IS_P2ALIGNED(nldt, PAGESIZE)); + + bcopy(oldt, nldt, oldtsz); + + /* + * unload old ldt. 
+	 */
+	kpreempt_disable();
+	ldt_unload();
+	kpreempt_enable();
+
+#if defined(__xpv)
+
+	/*
+	 * Make old ldt writable and new ldt read only.
+	 */
+	if (xen_ldt_setprot(oldt, oldtsz, PROT_READ | PROT_WRITE))
+		panic("ldt_grow:xen_ldt_setprot(PROT_READ|PROT_WRITE) failed");
+
+	if (xen_ldt_setprot(nldt, nldtsz, PROT_READ))
+		panic("ldt_grow:xen_ldt_setprot(PROT_READ) failed");
+#endif
+
+	pp->p_ldt = nldt;
+	pp->p_ldtlimit = nsels - 1;
+
+	kmem_free(oldt, oldtsz);
+}