Diffstat (limited to 'usr/src')
45 files changed, 2919 insertions, 1666 deletions
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c index ef89d23312..c6ac1d2967 100644 --- a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c +++ b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ /* @@ -46,6 +48,9 @@ #include <vm/page.h> #include <vm/hat_i86.h> +#define VA_SIGN_BIT (1UL << 47) +#define VA_SIGN_EXTEND(va) (((va) ^ VA_SIGN_BIT) - VA_SIGN_BIT) + struct pfn2pp { pfn_t pfn; page_t *pp; @@ -398,13 +403,6 @@ pte2mfn(x86pte_t pte, uint_t level) return (mfn); } -/* - * Print a PTE in more human friendly way. The PTE is assumed to be in - * a level 0 page table, unless -l specifies another level. - * - * The PTE value can be specified as the -p option, since on a 32 bit kernel - * with PAE running it's larger than a uintptr_t. - */ static int do_pte_dcmd(int level, uint64_t pte) { @@ -414,13 +412,14 @@ do_pte_dcmd(int level, uint64_t pte) int pat_index = 0; pfn_t mfn; - mdb_printf("pte=%llr: ", pte); - if (PTE_GET(pte, mmu.pt_nx)) - mdb_printf("noexec "); + mdb_printf("pte=0x%llr: ", pte); mfn = pte2mfn(pte, level); mdb_printf("%s=0x%lr ", is_xpv ? "mfn" : "pfn", mfn); + if (PTE_GET(pte, mmu.pt_nx)) + mdb_printf("noexec "); + if (PTE_GET(pte, PT_NOCONSIST)) mdb_printf("noconsist "); @@ -476,52 +475,34 @@ do_pte_dcmd(int level, uint64_t pte) /* * Print a PTE in more human friendly way. The PTE is assumed to be in * a level 0 page table, unless -l specifies another level. - * - * The PTE value can be specified as the -p option, since on a 32 bit kernel - * with PAE running it's larger than a uintptr_t. */ /*ARGSUSED*/ int pte_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - int level = 0; - uint64_t pte = 0; - char *level_str = NULL; - char *pte_str = NULL; + uint64_t level = 0; init_mmu(); if (mmu.num_level == 0) return (DCMD_ERR); + if ((flags & DCMD_ADDRSPEC) == 0) + return (DCMD_USAGE); + if (mdb_getopts(argc, argv, - 'p', MDB_OPT_STR, &pte_str, - 'l', MDB_OPT_STR, &level_str) != argc) + 'l', MDB_OPT_UINT64, &level) != argc) return (DCMD_USAGE); - /* - * parse the PTE to decode, if it's 0, we don't do anything - */ - if (pte_str != NULL) { - pte = mdb_strtoull(pte_str); - } else { - if ((flags & DCMD_ADDRSPEC) == 0) - return (DCMD_USAGE); - pte = addr; + if (level > mmu.max_level) { + mdb_warn("invalid level %lu\n", level); + return (DCMD_ERR); } - if (pte == 0) - return (DCMD_OK); - /* - * parse the level if supplied - */ - if (level_str != NULL) { - level = mdb_strtoull(level_str); - if (level < 0 || level > mmu.max_level) - return (DCMD_ERR); - } + if (addr == 0) + return (DCMD_OK); - return (do_pte_dcmd(level, pte)); + return (do_pte_dcmd((int)level, addr)); } static size_t @@ -537,25 +518,20 @@ static x86pte_t get_pte(hat_t *hat, htable_t *htable, uintptr_t addr) { x86pte_t buf; - x86pte32_t *pte32 = (x86pte32_t *)&buf; - size_t len; - if (htable->ht_flags & HTABLE_VLP) { - uintptr_t ptr = (uintptr_t)hat->hat_vlp_ptes; + if (htable->ht_flags & HTABLE_COPIED) { + uintptr_t ptr = (uintptr_t)hat->hat_copied_ptes; ptr += va2entry(htable, addr) << mmu.pte_size_shift; - len = mdb_vread(&buf, mmu.pte_size, ptr); - } else { - paddr_t paddr = mmu_ptob((paddr_t)htable->ht_pfn); - paddr += va2entry(htable, addr) << mmu.pte_size_shift; - len = mdb_pread(&buf, mmu.pte_size, paddr); + return (*(x86pte_t *)ptr); } - if (len != mmu.pte_size) - return (0); + paddr_t paddr = 
mmu_ptob((paddr_t)htable->ht_pfn); + paddr += va2entry(htable, addr) << mmu.pte_size_shift; - if (mmu.pte_size == sizeof (x86pte_t)) + if ((mdb_pread(&buf, mmu.pte_size, paddr)) == mmu.pte_size) return (buf); - return (*pte32); + + return (0); } static int @@ -621,8 +597,8 @@ do_va2pa(uintptr_t addr, struct as *asp, int print_level, physaddr_t *pap, pte = get_pte(&hat, &htable, addr); if (print_level) { - mdb_printf("\tlevel=%d htable=%p " - "pte=%llr\n", level, ht, pte); + mdb_printf("\tlevel=%d htable=0x%p " + "pte=0x%llr\n", level, ht, pte); } if (!PTE_ISVALID(pte)) { @@ -725,8 +701,6 @@ do_report_maps(pfn_t pfn) int level; int entry; x86pte_t pte; - x86pte_t buf; - x86pte32_t *pte32 = (x86pte32_t *)&buf; physaddr_t paddr; size_t len; @@ -796,14 +770,10 @@ do_report_maps(pfn_t pfn) base >= kernelbase) continue; - len = mdb_pread(&buf, mmu.pte_size, + len = mdb_pread(&pte, mmu.pte_size, paddr + entry * mmu.pte_size); if (len != mmu.pte_size) return (DCMD_ERR); - if (mmu.pte_size == sizeof (x86pte_t)) - pte = buf; - else - pte = *pte32; if ((pte & PT_VALID) == 0) continue; @@ -854,7 +824,7 @@ report_maps_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } static int -do_ptable_dcmd(pfn_t pfn) +do_ptable_dcmd(pfn_t pfn, uint64_t level) { struct hat *hatp; struct hat hat; @@ -862,12 +832,10 @@ do_ptable_dcmd(pfn_t pfn) htable_t htable; uintptr_t base; int h; - int level; int entry; uintptr_t pagesize; x86pte_t pte; x86pte_t buf; - x86pte32_t *pte32 = (x86pte32_t *)&buf; physaddr_t paddr; size_t len; @@ -912,14 +880,21 @@ do_ptable_dcmd(pfn_t pfn) found_it: if (htable.ht_pfn == pfn) { mdb_printf("htable=%p\n", ht); - level = htable.ht_level; + if (level == (uint64_t)-1) { + level = htable.ht_level; + } else if (htable.ht_level != level) { + mdb_warn("htable has level %d but forcing level %lu\n", + htable.ht_level, level); + } base = htable.ht_vaddr; pagesize = mmu.level_size[level]; } else { - mdb_printf("Unknown pagetable - assuming level/addr 0"); - level = 0; /* assume level == 0 for PFN */ + if (level == (uint64_t)-1) + level = 0; + mdb_warn("couldn't find matching htable, using level=%lu, " + "base address=0x0\n", level); base = 0; - pagesize = MMU_PAGESIZE; + pagesize = mmu.level_size[level]; } paddr = mmu_ptob((physaddr_t)pfn); @@ -928,15 +903,13 @@ found_it: paddr + entry * mmu.pte_size); if (len != mmu.pte_size) return (DCMD_ERR); - if (mmu.pte_size == sizeof (x86pte_t)) pte = buf; - else - pte = *pte32; if (pte == 0) continue; - mdb_printf("[%3d] va=%p ", entry, base + entry * pagesize); + mdb_printf("[%3d] va=0x%p ", entry, + VA_SIGN_EXTEND(base + entry * pagesize)); do_pte_dcmd(level, pte); } @@ -953,6 +926,7 @@ ptable_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { pfn_t pfn; uint_t mflag = 0; + uint64_t level = (uint64_t)-1; init_mmu(); @@ -963,14 +937,20 @@ ptable_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_USAGE); if (mdb_getopts(argc, argv, - 'm', MDB_OPT_SETBITS, TRUE, &mflag, NULL) != argc) + 'm', MDB_OPT_SETBITS, TRUE, &mflag, + 'l', MDB_OPT_UINT64, &level, NULL) != argc) return (DCMD_USAGE); + if (level != (uint64_t)-1 && level > mmu.max_level) { + mdb_warn("invalid level %lu\n", level); + return (DCMD_ERR); + } + pfn = (pfn_t)addr; if (mflag) pfn = mdb_mfn_to_pfn(pfn); - return (do_ptable_dcmd(pfn)); + return (do_ptable_dcmd(pfn, level)); } static int @@ -1031,3 +1011,112 @@ htables_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (do_htables_dcmd(hat)); } + +static 
uintptr_t +entry2va(size_t *entries) +{ + uintptr_t va = 0; + + for (level_t l = mmu.max_level; l >= 0; l--) + va += entries[l] << mmu.level_shift[l]; + + return (VA_SIGN_EXTEND(va)); +} + +static void +ptmap_report(size_t *entries, uintptr_t start, + boolean_t user, boolean_t writable, boolean_t wflag) +{ + uint64_t curva = entry2va(entries); + + mdb_printf("mapped %s,%s range of %lu bytes: %a-%a\n", + user ? "user" : "kernel", writable ? "writable" : "read-only", + curva - start, start, curva - 1); + if (wflag && start >= kernelbase) + (void) mdb_call_dcmd("whatis", start, DCMD_ADDRSPEC, 0, NULL); +} + +int +ptmap_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + physaddr_t paddrs[MAX_NUM_LEVEL] = { 0, }; + size_t entry[MAX_NUM_LEVEL] = { 0, }; + uintptr_t start = (uintptr_t)-1; + boolean_t writable = B_FALSE; + boolean_t user = B_FALSE; + boolean_t wflag = B_FALSE; + level_t curlevel; + + if ((flags & DCMD_ADDRSPEC) == 0) + return (DCMD_USAGE); + + if (mdb_getopts(argc, argv, + 'w', MDB_OPT_SETBITS, TRUE, &wflag, NULL) != argc) + return (DCMD_USAGE); + + init_mmu(); + + if (mmu.num_level == 0) + return (DCMD_ERR); + + curlevel = mmu.max_level; + + paddrs[curlevel] = addr & MMU_PAGEMASK; + + for (;;) { + physaddr_t pte_addr; + x86pte_t pte; + + pte_addr = paddrs[curlevel] + + (entry[curlevel] << mmu.pte_size_shift); + + if (mdb_pread(&pte, sizeof (pte), pte_addr) != sizeof (pte)) { + mdb_warn("couldn't read pte at %p", pte_addr); + return (DCMD_ERR); + } + + if (PTE_GET(pte, PT_VALID) == 0) { + if (start != (uintptr_t)-1) { + ptmap_report(entry, start, + user, writable, wflag); + start = (uintptr_t)-1; + } + } else if (curlevel == 0 || PTE_GET(pte, PT_PAGESIZE)) { + if (start == (uintptr_t)-1) { + start = entry2va(entry); + user = PTE_GET(pte, PT_USER); + writable = PTE_GET(pte, PT_WRITABLE); + } else if (user != PTE_GET(pte, PT_USER) || + writable != PTE_GET(pte, PT_WRITABLE)) { + ptmap_report(entry, start, + user, writable, wflag); + start = entry2va(entry); + user = PTE_GET(pte, PT_USER); + writable = PTE_GET(pte, PT_WRITABLE); + } + } else { + /* Descend a level. */ + physaddr_t pa = mmu_ptob(pte2mfn(pte, curlevel)); + paddrs[--curlevel] = pa; + entry[curlevel] = 0; + continue; + } + + while (++entry[curlevel] == mmu.ptes_per_table) { + /* Ascend back up. */ + entry[curlevel] = 0; + if (curlevel == mmu.max_level) { + if (start != (uintptr_t)-1) { + ptmap_report(entry, start, + user, writable, wflag); + } + goto out; + } + + curlevel++; + } + } + +out: + return (DCMD_OK); +} diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.h b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.h index 3e5476e31e..8d794095c9 100644 --- a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.h +++ b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _I86MMU_H @@ -42,6 +44,9 @@ extern int htables_dcmd(uintptr_t addr, uint_t flags, int argc, extern int ptable_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv); +extern int ptmap_dcmd(uintptr_t addr, uint_t flags, int argc, + const mdb_arg_t *argv); + extern int va2pfn_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv); diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c index 5d8a0f222f..95e588eed6 100644 --- a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c +++ b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c @@ -432,6 +432,9 @@ ttrace_dumpregs(trap_trace_rec_t *rec) mdb_printf(THREEREGS, DUMP(gs), "trp", regs->r_trapno, DUMP(err)); mdb_printf(THREEREGS, DUMP(rip), DUMP(cs), DUMP(rfl)); mdb_printf(THREEREGS, DUMP(rsp), DUMP(ss), "cr2", rec->ttr_cr2); + mdb_printf(" %3s: %16lx %3s: %16lx\n", + "fsb", regs->__r_fsbase, + "gsb", regs->__r_gsbase); mdb_printf("\n"); } @@ -753,7 +756,18 @@ ptable_help(void) "Given a PFN holding a page table, print its contents, and\n" "the address of the corresponding htable structure.\n" "\n" - "-m Interpret the PFN as an MFN (machine frame number)\n"); + "-m Interpret the PFN as an MFN (machine frame number)\n" + "-l force page table level (3 is top)\n"); +} + +static void +ptmap_help(void) +{ + mdb_printf( + "Report all mappings represented by the page table hierarchy\n" + "rooted at the given cr3 value / physical address.\n" + "\n" + "-w run ::whatis on mapping start addresses\n"); } static const char *const scalehrtime_desc = @@ -1000,10 +1014,10 @@ crregs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf("%%cr2 = 0x%08x <%a>\n", cr2, cr2); if ((cr4 & CR4_PCIDE)) { - mdb_printf("%%cr3 = 0x%08x <pfn:%lu pcid:%u>\n", + mdb_printf("%%cr3 = 0x%08x <pfn:0x%lx pcid:%u>\n", cr3 >> MMU_PAGESHIFT, cr3 & MMU_PAGEOFFSET); } else { - mdb_printf("%%cr3 = 0x%08x <pfn:%lu flags:%b>\n", cr3, + mdb_printf("%%cr3 = 0x%08x <pfn:0x%lx flags:%b>\n", cr3, cr3 >> MMU_PAGESHIFT, cr3, cr3_flag_bits); } @@ -1024,9 +1038,11 @@ static const mdb_dcmd_t dcmds[] = { report_maps_dcmd, report_maps_help }, { "htables", "", "Given hat_t *, lists all its htable_t * values", htables_dcmd, htables_help }, - { "ptable", ":[-m]", "Given PFN, dump contents of a page table", + { "ptable", ":[-lm]", "Given PFN, dump contents of a page table", ptable_dcmd, ptable_help }, - { "pte", ":[-p XXXXX] [-l N]", "print human readable page table entry", + { "ptmap", ":", "Given a cr3 value, dump all mappings", + ptmap_dcmd, ptmap_help }, + { "pte", ":[-l N]", "print human readable page table entry", pte_dcmd }, { "pfntomfn", ":", "convert physical page to hypervisor machine page", pfntomfn_dcmd }, diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/unix_sup.s b/usr/src/cmd/mdb/i86pc/modules/unix/unix_sup.s index 407123c7e0..38ddf5cf44 100644 --- a/usr/src/cmd/mdb/i86pc/modules/unix/unix_sup.s +++ b/usr/src/cmd/mdb/i86pc/modules/unix/unix_sup.s @@ -32,6 +32,10 @@ kmdb_unix_getcr0(void) { return (0); } ulong_t +kmdb_unix_getcr3(void) +{ return (0); } + +ulong_t kmdb_unix_getcr4(void) { return (0); } diff --git a/usr/src/cmd/mdb/intel/kmdb/kaif.c b/usr/src/cmd/mdb/intel/kmdb/kaif.c index c1be6aae0f..dda6a94ea6 100644 --- a/usr/src/cmd/mdb/intel/kmdb/kaif.c +++ b/usr/src/cmd/mdb/intel/kmdb/kaif.c @@ -50,6 +50,7 @@ #include <sys/bitmap.h> #include <sys/termios.h> #include <sys/kdi_impl.h> +#include <sys/sysmacros.h> /* * This is the area containing the saved state when we enter @@ -256,11 +257,42 @@ 
kaif_set_register(const char *regname, kreg_t val) return (0); } +/* + * Refuse to single-step or break within any stub that loads a user %cr3 value. + * As the KDI traps are not careful to restore such a %cr3, this can all go + * wrong, both spectacularly and subtly. + */ +static boolean_t +kaif_toxic_text(uintptr_t addr) +{ + static GElf_Sym toxic_syms[1] = { 0, }; + size_t i; + + if (toxic_syms[0].st_name == NULL) { + if (mdb_tgt_lookup_by_name(mdb.m_target, MDB_TGT_OBJ_EXEC, + "tr_iret_user", &toxic_syms[0], NULL) != 0) + warn("couldn't find tr_iret_user\n"); + } + + for (i = 0; i < ARRAY_SIZE(toxic_syms); i++) { + if (addr >= toxic_syms[i].st_value && + addr - toxic_syms[i].st_value < toxic_syms[i].st_size) + return (B_TRUE); + } + + return (B_FALSE); +} + static int kaif_brkpt_arm(uintptr_t addr, mdb_instr_t *instrp) { mdb_instr_t bkpt = KAIF_BREAKPOINT_INSTR; + if (kaif_toxic_text(addr)) { + warn("%a cannot be a breakpoint target\n", addr); + return (set_errno(EMDB_TGTNOTSUP)); + } + if (mdb_tgt_vread(mdb.m_target, instrp, sizeof (mdb_instr_t), addr) != sizeof (mdb_instr_t)) return (-1); /* errno is set for us */ @@ -445,6 +477,11 @@ kaif_step(void) (void) kmdb_dpi_get_register("pc", &pc); + if (kaif_toxic_text(pc)) { + warn("%a cannot be stepped\n", pc); + return (set_errno(EMDB_TGTNOTSUP)); + } + if ((npc = mdb_dis_nextins(mdb.m_disasm, mdb.m_target, MDB_TGT_AS_VIRT, pc)) == pc) { warn("failed to decode instruction at %a for step\n", pc); diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c b/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c index 2465146a38..22c3d1dc6a 100644 --- a/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c +++ b/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c @@ -24,7 +24,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. */ @@ -133,6 +133,9 @@ const mdb_tgt_regdesc_t mdb_amd64_kregs[] = { { "sp", KREG_RSP, MDB_TGT_R_EXPORT | MDB_TGT_R_16 }, { "spl", KREG_RSP, MDB_TGT_R_EXPORT | MDB_TGT_R_8L }, { "ss", KREG_SS, MDB_TGT_R_EXPORT }, + { "gsbase", KREG_GSBASE, MDB_TGT_R_EXPORT }, + { "kgsbase", KREG_KGSBASE, MDB_TGT_R_EXPORT }, + { "cr2", KREG_CR2, MDB_TGT_R_EXPORT }, { NULL, 0, 0 } }; @@ -186,13 +189,13 @@ mdb_amd64_printregs(const mdb_tgt_gregset_t *gregs) (rflags & KREG_EFLAGS_PF_MASK) ? "PF" : "pf", (rflags & KREG_EFLAGS_CF_MASK) ? "CF" : "cf"); - mdb_printf("%24s%%cs = 0x%04x\t%%ds = 0x%04x\t%%es = 0x%04x\n", - " ", kregs[KREG_CS], kregs[KREG_DS], kregs[KREG_ES]); - - mdb_printf("%%trapno = 0x%x\t\t%%fs = 0x%04x\t%%gs = 0x%04x\n", - kregs[KREG_TRAPNO], (kregs[KREG_FS] & 0xffff), - (kregs[KREG_GS] & 0xffff)); - mdb_printf(" %%err = 0x%x\n", kregs[KREG_ERR]); + mdb_printf("%%cs = 0x%04x\t%%ds = 0x%04x\t" + "%%es = 0x%04x\t%%fs = 0x%04x\n", kregs[KREG_CS], kregs[KREG_DS], + kregs[KREG_ES], kregs[KREG_FS] & 0xffff); + mdb_printf("%%gs = 0x%04x\t%%gsbase = 0x%lx\t%%kgsbase = 0x%lx\n", + kregs[KREG_GS] & 0xffff, kregs[KREG_GSBASE], kregs[KREG_KGSBASE]); + mdb_printf("%%trapno = 0x%x\t%%err = 0x%x\t%%cr2 = 0x%lx\n", + kregs[KREG_TRAPNO], kregs[KREG_ERR], kregs[KREG_CR2]); } int diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h b/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h index 4ba5fb567c..8bee68b379 100644 --- a/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h +++ b/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h @@ -21,13 +21,13 @@ /* * Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ #ifndef _MDB_KREG_H #define _MDB_KREG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/kdi_regs.h> #ifndef _ASM #include <sys/types.h> @@ -75,8 +75,11 @@ typedef uint32_t kreg_t; #define KREG_ES KDIREG_ES #define KREG_FS KDIREG_FS #define KREG_GS KDIREG_GS +#define KREG_GSBASE KDIREG_GSBASE +#define KREG_KGSBASE KDIREG_KGSBASE #define KREG_TRAPNO KDIREG_TRAPNO #define KREG_ERR KDIREG_ERR +#define KREG_CR2 KDIREG_CR2 #define KREG_RIP KDIREG_RIP #define KREG_CS KDIREG_CS #define KREG_RFLAGS KDIREG_RFLAGS diff --git a/usr/src/uts/common/sys/sysmacros.h b/usr/src/uts/common/sys/sysmacros.h index 394a716a02..6f5882b54b 100644 --- a/usr/src/uts/common/sys/sysmacros.h +++ b/usr/src/uts/common/sys/sysmacros.h @@ -27,6 +27,8 @@ * Use is subject to license terms. * * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * + * Copyright 2018 Joyent Inc. */ #ifndef _SYS_SYSMACROS_H @@ -367,12 +369,9 @@ extern unsigned char bcd_to_byte[256]; #error One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined #endif /* _BIT_FIELDS_LTOH */ -/* avoid any possibility of clashing with <stddef.h> version */ -#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && !defined(_KMEMUSER) - +#if !defined(ARRAY_SIZE) #define ARRAY_SIZE(x) (sizeof (x) / sizeof (x[0])) - -#endif /* _KERNEL, !_KMEMUSER */ +#endif #ifdef __cplusplus } diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 34db892539..7fc3cfec14 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -75,7 +75,6 @@ CORE_OBJS += \ instr_size.o \ intr.o \ kboot_mmu.o \ - kdi_subr.o \ kdi_idt.o \ kdi_idthdl.o \ kdi_asm.o \ @@ -160,7 +159,8 @@ SPECIAL_OBJS_64 += \ locore.o \ fast_trap_asm.o \ interrupt.o \ - syscall_asm_amd64.o + syscall_asm_amd64.o \ + kpti_trampolines.o SPECIAL_OBJS += $(SPECIAL_OBJS_$(CLASS)) @@ -307,10 +307,9 @@ ASSYM_DEPS += \ swtch.o \ syscall_asm.o \ syscall_asm_amd64.o \ + kpti_trampolines.o \ cpr_wakecode.o CPR_IMPL_OBJS = cpr_impl.o cpr_wakecode.o $(KDI_ASSYM_DEPS:%=$(OBJS_DIR)/%): $(DSF_DIR)/$(OBJS_DIR)/kdi_assym.h - -ASSYM_DEPS += kdi_asm.o diff --git a/usr/src/uts/i86pc/ml/genassym.c b/usr/src/uts/i86pc/ml/genassym.c index 088dd661a3..6d840368d7 100644 --- a/usr/src/uts/i86pc/ml/genassym.c +++ b/usr/src/uts/i86pc/ml/genassym.c @@ -20,6 +20,8 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _GENASSYM @@ -68,8 +70,6 @@ extern void exit(int); int main(int argc, char *argv[]) { - printf("#define\tT_AST 0x%x\n", T_AST); - printf("#define\tLOCK_LEVEL 0x%x\n", LOCK_LEVEL); printf("#define\tCLOCK_LEVEL 0x%x\n", CLOCK_LEVEL); printf("#define\tDISP_LEVEL 0x%x\n", DISP_LEVEL); @@ -109,20 +109,6 @@ main(int argc, char *argv[]) printf("#define\tSSE_MXCSR_EFLAGS 0x%x\n", SSE_MXCSR_EFLAGS); - printf("#define\tFP_487 0x%x\n", FP_487); - printf("#define\tFP_486 0x%x\n", FP_486); - printf("#define\tFPU_CW_INIT 0x%x\n", FPU_CW_INIT); - printf("#define\tFPU_EN 0x%x\n", FPU_EN); - printf("#define\tFPU_VALID 0x%x\n", FPU_VALID); - - printf("#define\tFP_NO 0x%x\n", FP_NO); - printf("#define\tFP_SW 0x%x\n", FP_SW); - printf("#define\tFP_HW 0x%x\n", FP_HW); - printf("#define\tFP_287 0x%x\n", FP_287); - printf("#define\tFP_387 0x%x\n", FP_387); - printf("#define\t__FP_SSE 0x%x\n", __FP_SSE); - - printf("#define\tFP_FNSAVE 0x%x\n", FP_FNSAVE); printf("#define\tFP_FXSAVE 0x%x\n", FP_FXSAVE); printf("#define\tFP_XSAVE 0x%x\n", FP_XSAVE); @@ -154,11 +140,6 @@ main(int argc, char *argv[]) printf("#define\tNSEC_PER_COUNTER_TICK 0x%llx\n", NANOSEC / PIT_HZ); - printf("#define\tPITCTR0_PORT 0x%x\n", PITCTR0_PORT); - printf("#define\tPITCTL_PORT 0x%x\n", PITCTL_PORT); - printf("#define\tPIT_COUNTDOWN 0x%x\n", - PIT_C0 | PIT_LOADMODE | PIT_NDIVMODE); - printf("#define\tNBPW 0x%x\n", NBPW); printf("#define\tDDI_ACCATTR_IO_SPACE 0x%x\n", DDI_ACCATTR_IO_SPACE); diff --git a/usr/src/uts/i86pc/ml/kdi_subr.s b/usr/src/uts/i86pc/ml/kdi_subr.s deleted file mode 100644 index 8ed90ed410..0000000000 --- a/usr/src/uts/i86pc/ml/kdi_subr.s +++ /dev/null @@ -1,160 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/asm_linkage.h> -#include <sys/asm_misc.h> -#include <sys/regset.h> -#include <sys/privregs.h> -#include <sys/psw.h> - -#if defined(__lint) -#include <sys/types.h> -#include <sys/segments.h> -#endif - -#if defined(__lint) - -ulong_t -kdi_getdr0(void) -{ - return (0); -} - -ulong_t -kdi_getdr1(void) -{ - return (0); -} - -ulong_t -kdi_getdr2(void) -{ - return (0); -} - -ulong_t -kdi_getdr3(void) -{ - return (0); -} - -ulong_t -kdi_getdr6(void) -{ - return (0); -} - -ulong_t -kdi_getdr7(void) -{ - return (0); -} - -/*ARGSUSED*/ -void -kdi_setdr0(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr1(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr2(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr3(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr4(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr6(ulong_t value) -{} - -/*ARGSUSED*/ -void -kdi_setdr7(ulong_t value) -{} - -#else - -#if defined(__amd64) - -#define GETDREG(name, r) \ - ENTRY_NP(name); \ - movq r, %rax; \ - ret; \ - SET_SIZE(name) - -#define SETDREG(name, r) \ - ENTRY_NP(name); \ - movq %rdi, r; \ - ret; \ - SET_SIZE(name) - -#elif defined(__i386) - -#define GETDREG(name, r) \ - ENTRY_NP(name); \ - movl r, %eax; \ - ret; \ - SET_SIZE(name) - -#define SETDREG(name, r) \ - ENTRY_NP(name); \ - movl 4(%esp), %eax; \ - movl %eax, r; \ - ret; \ - SET_SIZE(name) - -#endif - - GETDREG(kdi_getdr0, %dr0) - GETDREG(kdi_getdr1, %dr1) - GETDREG(kdi_getdr2, %dr2) - GETDREG(kdi_getdr3, %dr3) - GETDREG(kdi_getdr6, %dr6) - GETDREG(kdi_getdr7, %dr7) - - SETDREG(kdi_setdr0, %dr0) - SETDREG(kdi_setdr1, %dr1) - SETDREG(kdi_setdr2, %dr2) - SETDREG(kdi_setdr3, %dr3) - SETDREG(kdi_setdr6, %dr6) - SETDREG(kdi_setdr7, %dr7) - -#endif /* __lint */ diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s new file mode 100644 index 0000000000..c05718c3ad --- /dev/null +++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s @@ -0,0 +1,713 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * This file contains the trampolines that are used by KPTI in order to be + * able to take interrupts/trap/etc while on the "user" page table. + * + * We don't map the full kernel text into the user page table: instead we + * map this one small section of trampolines (which compiles to ~13 pages). + * These trampolines are set in the IDT always (so they will run no matter + * whether we're on the kernel or user page table), and their primary job is to + * pivot us to the kernel %cr3 and %rsp without ruining everything. + * + * All of these interrupts use the amd64 IST feature when we have KPTI enabled, + * meaning that they will execute with their %rsp set to a known location, even + * if we take them in the kernel. + * + * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST + * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti + * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be + * page-aligned, and we map the page it's on into both page tables. 
Using a + * struct attached to the cpu_t also means that we can use %rsp-relative + * addressing to find anything on the cpu_t, so we don't have to touch %gs or + * GSBASE at all on incoming interrupt trampolines (which can get pretty hairy). + * + * This little struct is where the CPU will push the actual interrupt frame. + * Then, in the trampoline, we change %cr3, then figure out our destination + * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt + * frame). Then we jump to the regular ISR in the kernel text and carry on as + * normal. + * + * We leave the original frame and any spilled regs behind in the kpti_frame + * lazily until we want to return to userland. Then, we clear any spilled + * regs from it, and overwrite the rest with our iret frame. When switching + * this cpu to a different process (in hat_switch), we bzero the whole region to + * make sure nothing can leak between processes. + * + * When we're returning back to the original place we took the interrupt later + * (especially if it was in userland), we have to jmp back to the "return + * trampolines" here, since when we set %cr3 back to the user value, we need to + * be executing from code here in these shared pages and not the main kernel + * text again. Even though it should be fine to iret directly from kernel text + * when returning to kernel code, we make things jmp to a trampoline here just + * for consistency. + * + * Note that with IST, it's very important that we always must have pivoted + * away from the IST stack before we could possibly take any other interrupt + * on the same IST (unless it's an end-of-the-world fault and we don't care + * about coming back from it ever). + * + * This is particularly relevant to the dbgtrap/brktrap trampolines, as they + * regularly have to happen from within trampoline code (e.g. in the sysenter + * single-step case) and then return to the world normally. As a result, these + * two are IST'd to their own kpti_frame right above the normal one (in the same + * page), so they don't clobber their parent interrupt. + * + * To aid with debugging, we also IST the page fault (#PF/pftrap), general + * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to + * their own separate kpti_frame. This ensures that if we take one of these + * due to a bug in trampoline code, we preserve the original trampoline + * state that caused the trap. + * + * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST + * stacks, since they can interrupt another ISR at any time. These stacks are + * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in + * their trampolines (and do it unconditionally), and don't bother pivoting + * away. We're either going into the panic() path, or we're going to return + * straight away without rescheduling, so it's fine to not be on our real + * kthread stack (and some of the state we want to go find it with might be + * corrupt!) + * + * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a + * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to + * point at the PML4 for kas early in boot and never touch it again. Hopefully + * it survives whatever corruption brings down the rest of the kernel! + * + * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64 + * cases) in that they do not push an interrupt frame (and also have some other + * effects). 
In the syscall trampolines, we assume that we can only be taking + * the call from userland and use SWAPGS and an unconditional overwrite of %cr3. + * We do not do any stack pivoting for syscalls (and we leave SYSENTER's + * existing %rsp pivot untouched) -- instead we spill registers into + * %gs:CPU_KPTI_* as we need to. + */ + +/* + * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you + * fix bugs here check to see if they should be fixed there as well. + */ + +#include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/psw.h> +#include <sys/machbrand.h> +#include <sys/param.h> + +#if defined(__lint) + +#include <sys/types.h> +#include <sys/thread.h> +#include <sys/systm.h> + +#else /* __lint */ + +#include <sys/segments.h> +#include <sys/pcb.h> +#include <sys/trap.h> +#include <sys/ftrace.h> +#include <sys/traptrace.h> +#include <sys/clock.h> +#include <sys/model.h> +#include <sys/panic.h> + +#if defined(__xpv) +#include <sys/hypervisor.h> +#endif + +#include "assym.h" + + .data + DGDEF3(kpti_enable, 8, 8) + .fill 1, 8, 1 + +.section ".text"; +.align MMU_PAGESIZE + +.global kpti_tramp_start +kpti_tramp_start: + nop + +/* This will be set by mlsetup, and then double-checked later */ +.global kpti_safe_cr3 +kpti_safe_cr3: + .quad 0 + SET_SIZE(kpti_safe_cr3) + +/* startup_kmem() will overwrite this */ +.global kpti_kbase +kpti_kbase: + .quad KERNELBASE + SET_SIZE(kpti_kbase) + +#define SET_KERNEL_CR3(spillreg) \ + mov %cr3, spillreg; \ + mov spillreg, %gs:CPU_KPTI_TR_CR3; \ + mov %gs:CPU_KPTI_KCR3, spillreg; \ + cmp $0, spillreg; \ + je 2f; \ + mov spillreg, %cr3; \ +2: + +#if DEBUG +#define SET_USER_CR3(spillreg) \ + mov %cr3, spillreg; \ + mov spillreg, %gs:CPU_KPTI_TR_CR3; \ + mov %gs:CPU_KPTI_UCR3, spillreg; \ + mov spillreg, %cr3 +#else +#define SET_USER_CR3(spillreg) \ + mov %gs:CPU_KPTI_UCR3, spillreg; \ + mov spillreg, %cr3 +#endif + +#define PIVOT_KPTI_STK(spillreg) \ + mov %rsp, spillreg; \ + mov %gs:CPU_KPTI_RET_RSP, %rsp; \ + pushq T_FRAMERET_SS(spillreg); \ + pushq T_FRAMERET_RSP(spillreg); \ + pushq T_FRAMERET_RFLAGS(spillreg); \ + pushq T_FRAMERET_CS(spillreg); \ + pushq T_FRAMERET_RIP(spillreg) + + +#define INTERRUPT_TRAMPOLINE_P(errpush) \ + pushq %r13; \ + pushq %r14; \ + subq $KPTI_R14, %rsp; \ + /* Save current %cr3. */ \ + mov %cr3, %r14; \ + mov %r14, KPTI_TR_CR3(%rsp); \ + \ + cmpw $KCS_SEL, KPTI_CS(%rsp); \ + je 3f; \ +1: \ + /* Change to the "kernel" %cr3 */ \ + mov KPTI_KCR3(%rsp), %r14; \ + cmp $0, %r14; \ + je 2f; \ + mov %r14, %cr3; \ +2: \ + /* Get our cpu_t in %r13 */ \ + mov %rsp, %r13; \ + and $(~(MMU_PAGESIZE - 1)), %r13; \ + subq $CPU_KPTI_START, %r13; \ + /* Use top of the kthread stk */ \ + mov CPU_THREAD(%r13), %r14; \ + mov T_STACK(%r14), %r14; \ + addq $REGSIZE+MINFRAME, %r14; \ + jmp 4f; \ +3: \ + /* Check the %rsp in the frame. */ \ + /* Is it above kernel base? 
*/ \ + mov kpti_kbase, %r14; \ + cmp %r14, KPTI_RSP(%rsp); \ + jb 1b; \ + /* Use the %rsp from the trap frame */ \ + mov KPTI_RSP(%rsp), %r14; \ + and $(~0xf), %r14; \ +4: \ + mov %rsp, %r13; \ + /* %r14 contains our destination stk */ \ + mov %r14, %rsp; \ + pushq KPTI_SS(%r13); \ + pushq KPTI_RSP(%r13); \ + pushq KPTI_RFLAGS(%r13); \ + pushq KPTI_CS(%r13); \ + pushq KPTI_RIP(%r13); \ + errpush; \ + mov KPTI_R14(%r13), %r14; \ + mov KPTI_R13(%r13), %r13 + +#define INTERRUPT_TRAMPOLINE_NOERR \ + INTERRUPT_TRAMPOLINE_P(/**/) + +#define INTERRUPT_TRAMPOLINE \ + INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13)) + +/* + * This is used for all interrupts that can plausibly be taken inside another + * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS). + * + * We check for whether we took the interrupt while in another trampoline, in + * which case we need to use the kthread stack. + */ +#define DBG_INTERRUPT_TRAMPOLINE_P(errpush) \ + pushq %r13; \ + pushq %r14; \ + subq $KPTI_R14, %rsp; \ + /* Check for clobbering */ \ + cmp $0, KPTI_FLAG(%rsp); \ + je 1f; \ + /* Don't worry, this totally works */ \ + int $8; \ +1: \ + movq $1, KPTI_FLAG(%rsp); \ + /* Save current %cr3. */ \ + mov %cr3, %r14; \ + mov %r14, KPTI_TR_CR3(%rsp); \ + \ + cmpw $KCS_SEL, KPTI_CS(%rsp); \ + je 4f; \ +2: \ + /* Change to the "kernel" %cr3 */ \ + mov KPTI_KCR3(%rsp), %r14; \ + cmp $0, %r14; \ + je 3f; \ + mov %r14, %cr3; \ +3: \ + /* Get our cpu_t in %r13 */ \ + mov %rsp, %r13; \ + and $(~(MMU_PAGESIZE - 1)), %r13; \ + subq $CPU_KPTI_START, %r13; \ + /* Use top of the kthread stk */ \ + mov CPU_THREAD(%r13), %r14; \ + mov T_STACK(%r14), %r14; \ + addq $REGSIZE+MINFRAME, %r14; \ + jmp 6f; \ +4: \ + /* Check the %rsp in the frame. */ \ + /* Is it above kernel base? */ \ + /* If not, treat as user. */ \ + mov kpti_kbase, %r14; \ + cmp %r14, KPTI_RSP(%rsp); \ + jb 2b; \ + /* Is it within the kpti_frame page? */ \ + /* If it is, treat as user interrupt */ \ + mov %rsp, %r13; \ + and $(~(MMU_PAGESIZE - 1)), %r13; \ + mov KPTI_RSP(%rsp), %r14; \ + and $(~(MMU_PAGESIZE - 1)), %r14; \ + cmp %r13, %r14; \ + je 2b; \ + /* Were we in trampoline code? */ \ + leaq kpti_tramp_start, %r14; \ + cmp %r14, KPTI_RIP(%rsp); \ + jb 5f; \ + leaq kpti_tramp_end, %r14; \ + cmp %r14, KPTI_RIP(%rsp); \ + ja 5f; \ + /* If we were, change %cr3: we might */ \ + /* have interrupted before it did. */ \ + mov KPTI_KCR3(%rsp), %r14; \ + mov %r14, %cr3; \ +5: \ + /* Use the %rsp from the trap frame */ \ + mov KPTI_RSP(%rsp), %r14; \ + and $(~0xf), %r14; \ +6: \ + mov %rsp, %r13; \ + /* %r14 contains our destination stk */ \ + mov %r14, %rsp; \ + pushq KPTI_SS(%r13); \ + pushq KPTI_RSP(%r13); \ + pushq KPTI_RFLAGS(%r13); \ + pushq KPTI_CS(%r13); \ + pushq KPTI_RIP(%r13); \ + errpush; \ + mov KPTI_R14(%r13), %r14; \ + movq $0, KPTI_FLAG(%r13); \ + mov KPTI_R13(%r13), %r13 + +#define DBG_INTERRUPT_TRAMPOLINE_NOERR \ + DBG_INTERRUPT_TRAMPOLINE_P(/**/) + +#define DBG_INTERRUPT_TRAMPOLINE \ + DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13)) + + /* + * These labels (_start and _end) are used by trap.c to determine if + * we took an interrupt like an NMI during the return process. + */ +.global tr_sysc_ret_start +tr_sysc_ret_start: + + /* + * Syscall return trampolines. + * + * These are expected to be called on the kernel %gs. tr_sysret[ql] are + * called after %rsp is changed back to the user value, so we have no + * stack to work with. tr_sysexit has a kernel stack (but has to + * preserve rflags, soooo). 
+ */ + ENTRY_NP(tr_sysretq) + cmpq $1, kpti_enable + jne 1f + + mov %r13, %gs:CPU_KPTI_R13 + SET_USER_CR3(%r13) + mov %gs:CPU_KPTI_R13, %r13 + /* Zero these to make sure they didn't leak from a kernel trap */ + movq $0, %gs:CPU_KPTI_R13 + movq $0, %gs:CPU_KPTI_R14 +1: + swapgs + sysretq + SET_SIZE(tr_sysretq) + + ENTRY_NP(tr_sysretl) + cmpq $1, kpti_enable + jne 1f + + mov %r13, %gs:CPU_KPTI_R13 + SET_USER_CR3(%r13) + mov %gs:CPU_KPTI_R13, %r13 + /* Zero these to make sure they didn't leak from a kernel trap */ + movq $0, %gs:CPU_KPTI_R13 + movq $0, %gs:CPU_KPTI_R14 +1: + SWAPGS + SYSRETL + SET_SIZE(tr_sysretl) + + ENTRY_NP(tr_sysexit) + /* + * Note: we want to preserve RFLAGS across this branch, since sysexit + * (unlike sysret above) does not restore RFLAGS for us. + * + * We still have the real kernel stack (sysexit does restore that), so + * we can use pushfq/popfq. + */ + pushfq + + cmpq $1, kpti_enable + jne 1f + + /* Have to pop it back off now before we change %cr3! */ + popfq + mov %r13, %gs:CPU_KPTI_R13 + SET_USER_CR3(%r13) + mov %gs:CPU_KPTI_R13, %r13 + /* Zero these to make sure they didn't leak from a kernel trap */ + movq $0, %gs:CPU_KPTI_R13 + movq $0, %gs:CPU_KPTI_R14 + jmp 2f +1: + popfq +2: + swapgs + sti + sysexit + SET_SIZE(tr_sysexit) + +.global tr_sysc_ret_end +tr_sysc_ret_end: + + /* + * Syscall entry trampolines. + */ + +#if DEBUG +#define MK_SYSCALL_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + swapgs; \ + mov %r13, %gs:CPU_KPTI_R13; \ + mov %cr3, %r13; \ + mov %r13, %gs:CPU_KPTI_TR_CR3; \ + mov %gs:CPU_KPTI_KCR3, %r13; \ + mov %r13, %cr3; \ + mov %gs:CPU_KPTI_R13, %r13; \ + swapgs; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) +#else +#define MK_SYSCALL_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + swapgs; \ + mov %r13, %gs:CPU_KPTI_R13; \ + mov %gs:CPU_KPTI_KCR3, %r13; \ + mov %r13, %cr3; \ + mov %gs:CPU_KPTI_R13, %r13; \ + swapgs; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) +#endif + + MK_SYSCALL_TRAMPOLINE(sys_syscall) + MK_SYSCALL_TRAMPOLINE(sys_syscall32) + MK_SYSCALL_TRAMPOLINE(brand_sys_syscall) + MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32) + + /* + * SYSENTER is special. The CPU is really not very helpful when it + * comes to preserving and restoring state with it, and as a result + * we have to do all of it by hand. So, since we want to preserve + * RFLAGS, we have to be very careful in these trampolines to not + * clobber any bits in it. That means no cmpqs or branches! 
+ */ + ENTRY_NP(tr_sys_sysenter) + swapgs + mov %r13, %gs:CPU_KPTI_R13 +#if DEBUG + mov %cr3, %r13 + mov %r13, %gs:CPU_KPTI_TR_CR3 +#endif + mov %gs:CPU_KPTI_KCR3, %r13 + mov %r13, %cr3 + mov %gs:CPU_KPTI_R13, %r13 + jmp _sys_sysenter_post_swapgs + SET_SIZE(tr_sys_sysenter) + + ENTRY_NP(tr_brand_sys_sysenter) + swapgs + mov %r13, %gs:CPU_KPTI_R13 +#if DEBUG + mov %cr3, %r13 + mov %r13, %gs:CPU_KPTI_TR_CR3 +#endif + mov %gs:CPU_KPTI_KCR3, %r13 + mov %r13, %cr3 + mov %gs:CPU_KPTI_R13, %r13 + jmp _brand_sys_sysenter_post_swapgs + SET_SIZE(tr_brand_sys_sysenter) + +#define MK_SYSCALL_INT_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + swapgs; \ + mov %r13, %gs:CPU_KPTI_R13; \ + SET_KERNEL_CR3(%r13); \ + mov %gs:CPU_THREAD, %r13; \ + mov T_STACK(%r13), %r13; \ + addq $REGSIZE+MINFRAME, %r13; \ + mov %r13, %rsp; \ + pushq %gs:CPU_KPTI_SS; \ + pushq %gs:CPU_KPTI_RSP; \ + pushq %gs:CPU_KPTI_RFLAGS; \ + pushq %gs:CPU_KPTI_CS; \ + pushq %gs:CPU_KPTI_RIP; \ + mov %gs:CPU_KPTI_R13, %r13; \ + SWAPGS; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int) + MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int) + + /* + * Interrupt/trap return trampolines + */ + +.global tr_intr_ret_start +tr_intr_ret_start: + + ENTRY_NP(tr_iret_auto) + cmpq $1, kpti_enable + jne tr_iret_kernel + cmpw $KCS_SEL, T_FRAMERET_CS(%rsp) + je tr_iret_kernel + jmp tr_iret_user + SET_SIZE(tr_iret_auto) + + ENTRY_NP(tr_iret_kernel) + /* + * Yes, this does nothing extra. But this way we know if we see iret + * elsewhere, then we've failed to properly consider trampolines there. + */ + iretq + SET_SIZE(tr_iret_kernel) + + ENTRY_NP(tr_iret_user) + cmpq $1, kpti_enable + jne 1f + + swapgs + mov %r13, %gs:CPU_KPTI_R13 + PIVOT_KPTI_STK(%r13) + SET_USER_CR3(%r13) + mov %gs:CPU_KPTI_R13, %r13 + /* Zero these to make sure they didn't leak from a kernel trap */ + movq $0, %gs:CPU_KPTI_R13 + movq $0, %gs:CPU_KPTI_R14 + swapgs +1: + iretq + SET_SIZE(tr_iret_user) + +.global tr_intr_ret_end +tr_intr_ret_end: + + /* + * Interrupt/trap entry trampolines + */ + + /* CPU pushed an error code, and ISR wants one */ +#define MK_INTR_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + INTERRUPT_TRAMPOLINE; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + /* CPU didn't push an error code, and ISR doesn't want one */ +#define MK_INTR_TRAMPOLINE_NOERR(isr) \ + ENTRY_NP(tr_/**/isr); \ + push $0; \ + INTERRUPT_TRAMPOLINE_NOERR; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + /* CPU pushed an error code, and ISR wants one */ +#define MK_DBG_INTR_TRAMPOLINE(isr) \ + ENTRY_NP(tr_/**/isr); \ + DBG_INTERRUPT_TRAMPOLINE; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + /* CPU didn't push an error code, and ISR doesn't want one */ +#define MK_DBG_INTR_TRAMPOLINE_NOERR(isr) \ + ENTRY_NP(tr_/**/isr); \ + push $0; \ + DBG_INTERRUPT_TRAMPOLINE_NOERR; \ + jmp isr; \ + SET_SIZE(tr_/**/isr) + + + MK_INTR_TRAMPOLINE_NOERR(div0trap) + MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap) + MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap) + MK_INTR_TRAMPOLINE_NOERR(ovflotrap) + MK_INTR_TRAMPOLINE_NOERR(boundstrap) + MK_INTR_TRAMPOLINE_NOERR(invoptrap) + MK_INTR_TRAMPOLINE_NOERR(ndptrap) + MK_INTR_TRAMPOLINE(invtsstrap) + MK_INTR_TRAMPOLINE(segnptrap) + MK_DBG_INTR_TRAMPOLINE(stktrap) + MK_DBG_INTR_TRAMPOLINE(gptrap) + MK_DBG_INTR_TRAMPOLINE(pftrap) + MK_INTR_TRAMPOLINE_NOERR(resvtrap) + MK_INTR_TRAMPOLINE_NOERR(ndperr) + MK_INTR_TRAMPOLINE(achktrap) + MK_INTR_TRAMPOLINE_NOERR(xmtrap) + MK_INTR_TRAMPOLINE_NOERR(invaltrap) + MK_INTR_TRAMPOLINE_NOERR(fasttrap) + MK_INTR_TRAMPOLINE_NOERR(dtrace_ret) + 
MK_INTR_TRAMPOLINE_NOERR(brand_sys_int80) + MK_INTR_TRAMPOLINE_NOERR(sys_int80) + + /* + * These are special because they can interrupt other traps, and + * each other. We don't need to pivot their stacks, because they have + * dedicated IST stack space, but we need to change %cr3. + */ + ENTRY_NP(tr_nmiint) + pushq %r13 + mov kpti_safe_cr3, %r13 + mov %r13, %cr3 + popq %r13 + jmp nmiint + SET_SIZE(tr_nmiint) + +#if !defined(__xpv) + ENTRY_NP(tr_syserrtrap) + /* + * If we got here we should always have a zero error code pushed. + * The INT $0x8 instr doesn't seem to push one, though, which we use + * as an emergency panic in the other trampolines. So adjust things + * here. + */ + cmpq $0, (%rsp) + je 1f + pushq $0 +1: + pushq %r13 + mov kpti_safe_cr3, %r13 + mov %r13, %cr3 + popq %r13 + jmp syserrtrap + SET_SIZE(tr_syserrtrap) +#endif + + ENTRY_NP(tr_mcetrap) + pushq %r13 + mov kpti_safe_cr3, %r13 + mov %r13, %cr3 + popq %r13 + jmp mcetrap + SET_SIZE(tr_mcetrap) + + /* + * Interrupts start at 32 + */ +#define MKIVCT(n) \ + ENTRY_NP(tr_ivct/**/n) \ + push $0; \ + INTERRUPT_TRAMPOLINE; \ + push $n - 0x20; \ + jmp cmnint; \ + SET_SIZE(tr_ivct/**/n) + + MKIVCT(32); MKIVCT(33); MKIVCT(34); MKIVCT(35); + MKIVCT(36); MKIVCT(37); MKIVCT(38); MKIVCT(39); + MKIVCT(40); MKIVCT(41); MKIVCT(42); MKIVCT(43); + MKIVCT(44); MKIVCT(45); MKIVCT(46); MKIVCT(47); + MKIVCT(48); MKIVCT(49); MKIVCT(50); MKIVCT(51); + MKIVCT(52); MKIVCT(53); MKIVCT(54); MKIVCT(55); + MKIVCT(56); MKIVCT(57); MKIVCT(58); MKIVCT(59); + MKIVCT(60); MKIVCT(61); MKIVCT(62); MKIVCT(63); + MKIVCT(64); MKIVCT(65); MKIVCT(66); MKIVCT(67); + MKIVCT(68); MKIVCT(69); MKIVCT(70); MKIVCT(71); + MKIVCT(72); MKIVCT(73); MKIVCT(74); MKIVCT(75); + MKIVCT(76); MKIVCT(77); MKIVCT(78); MKIVCT(79); + MKIVCT(80); MKIVCT(81); MKIVCT(82); MKIVCT(83); + MKIVCT(84); MKIVCT(85); MKIVCT(86); MKIVCT(87); + MKIVCT(88); MKIVCT(89); MKIVCT(90); MKIVCT(91); + MKIVCT(92); MKIVCT(93); MKIVCT(94); MKIVCT(95); + MKIVCT(96); MKIVCT(97); MKIVCT(98); MKIVCT(99); + MKIVCT(100); MKIVCT(101); MKIVCT(102); MKIVCT(103); + MKIVCT(104); MKIVCT(105); MKIVCT(106); MKIVCT(107); + MKIVCT(108); MKIVCT(109); MKIVCT(110); MKIVCT(111); + MKIVCT(112); MKIVCT(113); MKIVCT(114); MKIVCT(115); + MKIVCT(116); MKIVCT(117); MKIVCT(118); MKIVCT(119); + MKIVCT(120); MKIVCT(121); MKIVCT(122); MKIVCT(123); + MKIVCT(124); MKIVCT(125); MKIVCT(126); MKIVCT(127); + MKIVCT(128); MKIVCT(129); MKIVCT(130); MKIVCT(131); + MKIVCT(132); MKIVCT(133); MKIVCT(134); MKIVCT(135); + MKIVCT(136); MKIVCT(137); MKIVCT(138); MKIVCT(139); + MKIVCT(140); MKIVCT(141); MKIVCT(142); MKIVCT(143); + MKIVCT(144); MKIVCT(145); MKIVCT(146); MKIVCT(147); + MKIVCT(148); MKIVCT(149); MKIVCT(150); MKIVCT(151); + MKIVCT(152); MKIVCT(153); MKIVCT(154); MKIVCT(155); + MKIVCT(156); MKIVCT(157); MKIVCT(158); MKIVCT(159); + MKIVCT(160); MKIVCT(161); MKIVCT(162); MKIVCT(163); + MKIVCT(164); MKIVCT(165); MKIVCT(166); MKIVCT(167); + MKIVCT(168); MKIVCT(169); MKIVCT(170); MKIVCT(171); + MKIVCT(172); MKIVCT(173); MKIVCT(174); MKIVCT(175); + MKIVCT(176); MKIVCT(177); MKIVCT(178); MKIVCT(179); + MKIVCT(180); MKIVCT(181); MKIVCT(182); MKIVCT(183); + MKIVCT(184); MKIVCT(185); MKIVCT(186); MKIVCT(187); + MKIVCT(188); MKIVCT(189); MKIVCT(190); MKIVCT(191); + MKIVCT(192); MKIVCT(193); MKIVCT(194); MKIVCT(195); + MKIVCT(196); MKIVCT(197); MKIVCT(198); MKIVCT(199); + MKIVCT(200); MKIVCT(201); MKIVCT(202); MKIVCT(203); + MKIVCT(204); MKIVCT(205); MKIVCT(206); MKIVCT(207); + MKIVCT(208); MKIVCT(209); MKIVCT(210); MKIVCT(211); + MKIVCT(212); 
MKIVCT(213); MKIVCT(214); MKIVCT(215); + MKIVCT(216); MKIVCT(217); MKIVCT(218); MKIVCT(219); + MKIVCT(220); MKIVCT(221); MKIVCT(222); MKIVCT(223); + MKIVCT(224); MKIVCT(225); MKIVCT(226); MKIVCT(227); + MKIVCT(228); MKIVCT(229); MKIVCT(230); MKIVCT(231); + MKIVCT(232); MKIVCT(233); MKIVCT(234); MKIVCT(235); + MKIVCT(236); MKIVCT(237); MKIVCT(238); MKIVCT(239); + MKIVCT(240); MKIVCT(241); MKIVCT(242); MKIVCT(243); + MKIVCT(244); MKIVCT(245); MKIVCT(246); MKIVCT(247); + MKIVCT(248); MKIVCT(249); MKIVCT(250); MKIVCT(251); + MKIVCT(252); MKIVCT(253); MKIVCT(254); MKIVCT(255); + +.align MMU_PAGESIZE +.global kpti_tramp_end +kpti_tramp_end: + nop + +#endif /* __lint */ diff --git a/usr/src/uts/i86pc/ml/locore.s b/usr/src/uts/i86pc/ml/locore.s index 042818844d..4626dd1492 100644 --- a/usr/src/uts/i86pc/ml/locore.s +++ b/usr/src/uts/i86pc/ml/locore.s @@ -23,7 +23,7 @@ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. */ /* - * Copyright (c) 2016, Joyent, Inc. All rights reserved. + * Copyright (c) 2018 Joyent, Inc. */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ @@ -1186,7 +1186,7 @@ cmntrap() addq %rax, %r12 movq %r12, REGOFF_RIP(%rbp) INTR_POP - IRET + jmp tr_iret_auto /*NOTREACHED*/ 3: leaq dtrace_badflags(%rip), %rdi @@ -1599,7 +1599,7 @@ _no_pending_updates: */ ALTENTRY(sys_rtt_syscall32) USER32_POP - IRET + jmp tr_iret_user /*NOTREACHED*/ ALTENTRY(sys_rtt_syscall) @@ -1608,7 +1608,7 @@ _no_pending_updates: */ USER_POP ALTENTRY(nopop_sys_rtt_syscall) - IRET + jmp tr_iret_user /*NOTREACHED*/ SET_SIZE(nopop_sys_rtt_syscall) @@ -1623,7 +1623,7 @@ _no_pending_updates: * Restore regs before doing iretq to kernel mode */ INTR_POP - IRET + jmp tr_iret_kernel .globl _sys_rtt_end _sys_rtt_end: /*NOTREACHED*/ diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index 406d389000..a1f1e935aa 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -1,7 +1,7 @@ \ \ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. \ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved. -\ Copyright 2016 Joyent, Inc. +\ Copyright 2018 Joyent, Inc. \ \ CDDL HEADER START \ @@ -233,6 +233,44 @@ cpu cpu_m.mcpu_vcpu_info CPU_VCPU_INFO #endif +cpu + cpu_m.mcpu_kpti.kf_kernel_cr3 CPU_KPTI_KCR3 + cpu_m.mcpu_kpti.kf_user_cr3 CPU_KPTI_UCR3 + cpu_m.mcpu_kpti.kf_tr_rsp CPU_KPTI_TR_RSP + cpu_m.mcpu_kpti.kf_tr_cr3 CPU_KPTI_TR_CR3 + cpu_m.mcpu_kpti.kf_r13 CPU_KPTI_R13 + cpu_m.mcpu_kpti.kf_r14 CPU_KPTI_R14 + cpu_m.mcpu_kpti.kf_tr_ret_rsp CPU_KPTI_RET_RSP + + cpu_m.mcpu_kpti.kf_ss CPU_KPTI_SS + cpu_m.mcpu_kpti.kf_rsp CPU_KPTI_RSP + cpu_m.mcpu_kpti.kf_rflags CPU_KPTI_RFLAGS + cpu_m.mcpu_kpti.kf_cs CPU_KPTI_CS + cpu_m.mcpu_kpti.kf_rip CPU_KPTI_RIP + cpu_m.mcpu_kpti.kf_err CPU_KPTI_ERR + + cpu_m.mcpu_pad2 CPU_KPTI_START + cpu_m.mcpu_pad3 CPU_KPTI_END + +kpti_frame + kf_r14 KPTI_R14 + kf_r13 KPTI_R13 + kf_err KPTI_ERR + kf_rip KPTI_RIP + kf_cs KPTI_CS + kf_rflags KPTI_RFLAGS + kf_rsp KPTI_RSP + kf_ss KPTI_SS + + kf_tr_rsp KPTI_TOP + + kf_kernel_cr3 KPTI_KCR3 + kf_user_cr3 KPTI_UCR3 + kf_tr_ret_rsp KPTI_RET_RSP + kf_tr_cr3 KPTI_TR_CR3 + + kf_tr_flag KPTI_FLAG + standard_pic c_curmask c_iplmask diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s index bc9351cada..98f8c8f8da 100644 --- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s +++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. 
All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -491,6 +491,20 @@ noprod_sys_syscall: movq %rbx, REGOFF_GS(%rsp) /* + * If we're trying to use TRAPTRACE though, I take that back: we're + * probably debugging some problem in the SWAPGS logic and want to know + * what the incoming gsbase was. + * + * Since we already did SWAPGS, record the KGSBASE. + */ +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv) + movl $MSR_AMD_KGSBASE, %ecx + rdmsr + movl %eax, REGOFF_GSBASE(%rsp) + movl %edx, REGOFF_GSBASE+4(%rsp) +#endif + + /* * Machine state saved in the regs structure on the stack * First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9 * %eax is the syscall number @@ -705,8 +719,7 @@ _syscall_after_brand: SYSRETQ #else ALTENTRY(nopop_sys_syscall_swapgs_sysretq) - SWAPGS /* user gsbase */ - SYSRETQ + jmp tr_sysretq #endif /*NOTREACHED*/ SET_SIZE(nopop_sys_syscall_swapgs_sysretq) @@ -807,6 +820,20 @@ _syscall32_save: movq %rbx, REGOFF_GS(%rsp) /* + * If we're trying to use TRAPTRACE though, I take that back: we're + * probably debugging some problem in the SWAPGS logic and want to know + * what the incoming gsbase was. + * + * Since we already did SWAPGS, record the KGSBASE. + */ +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv) + movl $MSR_AMD_KGSBASE, %ecx + rdmsr + movl %eax, REGOFF_GSBASE(%rsp) + movl %edx, REGOFF_GSBASE+4(%rsp) +#endif + + /* * Application state saved in the regs structure on the stack * %eax is the syscall number * %rsp is the thread's stack, %r15 is curthread @@ -964,8 +991,7 @@ _syscall32_after_brand: ASSERT_UPCALL_MASK_IS_SET ALTENTRY(nopop_sys_syscall32_swapgs_sysretl) - SWAPGS /* user gsbase */ - SYSRETL + jmp tr_sysretl SET_SIZE(nopop_sys_syscall32_swapgs_sysretl) /*NOTREACHED*/ @@ -1010,23 +1036,22 @@ _full_syscall_postsys32: * this call, as %edx is used by the sysexit instruction. * * One final complication in this routine is its interaction with - * single-stepping in a debugger. For most of the system call mechanisms, - * the CPU automatically clears the single-step flag before we enter the - * kernel. The sysenter mechanism does not clear the flag, so a user - * single-stepping through a libc routine may suddenly find themself - * single-stepping through the kernel. To detect this, kmdb compares the - * trap %pc to the [brand_]sys_enter addresses on each single-step trap. - * If it finds that we have single-stepped to a sysenter entry point, it - * explicitly clears the flag and executes the sys_sysenter routine. + * single-stepping in a debugger. For most of the system call mechanisms, the + * CPU automatically clears the single-step flag before we enter the kernel. + * The sysenter mechanism does not clear the flag, so a user single-stepping + * through a libc routine may suddenly find themself single-stepping through the + * kernel. To detect this, kmdb and trap() both compare the trap %pc to the + * [brand_]sys_enter addresses on each single-step trap. If it finds that we + * have single-stepped to a sysenter entry point, it explicitly clears the flag + * and executes the sys_sysenter routine. * - * One final complication in this final complication is the fact that we - * have two different entry points for sysenter: brand_sys_sysenter and - * sys_sysenter. If we enter at brand_sys_sysenter and start single-stepping - * through the kernel with kmdb, we will eventually hit the instruction at - * sys_sysenter. 
kmdb cannot distinguish between that valid single-step - * and the undesirable one mentioned above. To avoid this situation, we - * simply add a jump over the instruction at sys_sysenter to make it - * impossible to single-step to it. + * One final complication in this final complication is the fact that we have + * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter. + * If we enter at brand_sys_sysenter and start single-stepping through the + * kernel with kmdb, we will eventually hit the instruction at sys_sysenter. + * kmdb cannot distinguish between that valid single-step and the undesirable + * one mentioned above. To avoid this situation, we simply add a jump over the + * instruction at sys_sysenter to make it impossible to single-step to it. */ #if defined(__lint) @@ -1039,6 +1064,7 @@ sys_sysenter() ENTRY_NP(brand_sys_sysenter) SWAPGS /* kernel gsbase */ ALTENTRY(_brand_sys_sysenter_post_swapgs) + BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx)) /* * Jump over sys_sysenter to allow single-stepping as described @@ -1048,13 +1074,17 @@ sys_sysenter() ALTENTRY(sys_sysenter) SWAPGS /* kernel gsbase */ - ALTENTRY(_sys_sysenter_post_swapgs) + movq %gs:CPU_THREAD, %r15 movl $U32CS_SEL, REGOFF_CS(%rsp) movl %ecx, REGOFF_RSP(%rsp) /* wrapper: %esp -> %ecx */ movl %edx, REGOFF_RIP(%rsp) /* wrapper: %eip -> %edx */ + /* + * NOTE: none of the instructions that run before we get here should + * clobber bits in (R)FLAGS! This includes the kpti trampoline. + */ pushfq popq %r10 movl $UDS_SEL, REGOFF_SS(%rsp) @@ -1096,6 +1126,20 @@ sys_sysenter() movq %rbx, REGOFF_GS(%rsp) /* + * If we're trying to use TRAPTRACE though, I take that back: we're + * probably debugging some problem in the SWAPGS logic and want to know + * what the incoming gsbase was. + * + * Since we already did SWAPGS, record the KGSBASE. + */ +#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv) + movl $MSR_AMD_KGSBASE, %ecx + rdmsr + movl %eax, REGOFF_GSBASE(%rsp) + movl %edx, REGOFF_GSBASE+4(%rsp) +#endif + + /* * Application state saved in the regs structure on the stack * %eax is the syscall number * %rsp is the thread's stack, %r15 is curthread @@ -1198,6 +1242,8 @@ sys_sysenter() * If we were, and we ended up on another cpu, or another * lwp got int ahead of us, it could change the segment * registers without us noticing before we return to userland. + * + * This cli is undone in the tr_sysexit trampoline code. */ cli CHECK_POSTSYS_NE(%r15, %r14, %ebx) @@ -1231,16 +1277,14 @@ sys_sysenter() popfq movl REGOFF_RSP(%rsp), %ecx /* sysexit: %ecx -> %esp */ ALTENTRY(sys_sysenter_swapgs_sysexit) - swapgs - sti - sysexit + jmp tr_sysexit SET_SIZE(sys_sysenter_swapgs_sysexit) SET_SIZE(sys_sysenter) SET_SIZE(_sys_sysenter_post_swapgs) SET_SIZE(brand_sys_sysenter) #endif /* __lint */ - + #if defined(__lint) /* * System call via an int80. This entry point is only used by the Linux @@ -1352,10 +1396,13 @@ nopop_syscall_int: * or we could end up breaking branded zone support. See the usage of * this label in lx_brand_int80_callback and sn1_brand_int91_callback * for examples. + * + * We want to swapgs to maintain the invariant that all entries into + * tr_iret_user are done on the user gsbase. 
*/ - ALTENTRY(sys_sysint_swapgs_iret) - SWAPGS /* user gsbase */ - IRET + ALTENTRY(sys_sysint_swapgs_iret) + SWAPGS + jmp tr_iret_user /*NOTREACHED*/ SET_SIZE(sys_sysint_swapgs_iret) SET_SIZE(sys_syscall_int) diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c index 2569812c47..36ec2e4945 100644 --- a/usr/src/uts/i86pc/os/intr.c +++ b/usr/src/uts/i86pc/os/intr.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserverd. + * Copyright (c) 2018 Joyent, Inc. All rights reserverd. */ /* @@ -471,6 +471,21 @@ #include <sys/hypervisor.h> #endif +#if defined(__amd64) && !defined(__xpv) +/* If this fails, then the padding numbers in machcpuvar.h are wrong. */ +CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad)) + < MMU_PAGESIZE); +CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti)) + >= MMU_PAGESIZE); +CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti_dbg)) + < 2 * MMU_PAGESIZE); +CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad2)) + < 2 * MMU_PAGESIZE); +CTASSERT(((sizeof (struct kpti_frame)) & 0xF) == 0); +CTASSERT(((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti_dbg)) + & 0xF) == 0); +CTASSERT((offsetof(struct kpti_frame, kf_tr_rsp) & 0xF) == 0); +#endif #if defined(__xpv) && defined(DEBUG) @@ -1473,6 +1488,21 @@ loop: return (1); } +#if !defined(__xpv) + /* + * Assert that we're not trying to return into the syscall return + * trampolines. Things will go baaaaad if we try to do that. + * + * Note that none of these run with interrupts on, so this should + * never happen (even in the sysexit case the STI doesn't take effect + * until after sysexit finishes). + */ + extern void tr_sysc_ret_start(); + extern void tr_sysc_ret_end(); + ASSERT(!(rp->r_pc >= (uintptr_t)tr_sysc_ret_start && + rp->r_pc <= (uintptr_t)tr_sysc_ret_end)); +#endif + /* * Here if we are returning to supervisor mode. * Check for a kernel preemption request. diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c index 13ccfde671..19f0f5f676 100644 --- a/usr/src/uts/i86pc/os/mlsetup.c +++ b/usr/src/uts/i86pc/os/mlsetup.c @@ -23,7 +23,7 @@ * * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 by Delphix. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* * Copyright (c) 2010, Intel Corporation. @@ -148,6 +148,20 @@ mlsetup(struct regs *rp) else cpuid_feature_edx_exclude = (uint32_t)prop_value; +#if defined(__amd64) && !defined(__xpv) + /* + * Check to see if KPTI has been explicitly enabled or disabled. + * We have to check this before init_desctbls(). + */ + if (bootprop_getval("kpti", &prop_value) != 0) { + kpti_enable = 1; + } else { + kpti_enable = (uint64_t)(prop_value == 1); + prom_printf("unix: forcing kpti to %s due to boot argument\n", + (kpti_enable == 1) ? "ON" : "OFF"); + } +#endif + /* * Initialize idt0, gdt0, ldt0_default, ktss0 and dftss. */ diff --git a/usr/src/uts/i86pc/os/mp_pc.c b/usr/src/uts/i86pc/os/mp_pc.c index 105b1e93dc..4e12703395 100644 --- a/usr/src/uts/i86pc/os/mp_pc.c +++ b/usr/src/uts/i86pc/os/mp_pc.c @@ -26,7 +26,7 @@ * All rights reserved. */ /* - * Copyright 2011 Joyent, Inc. All rights reserved. 
+ * Copyright 2018 Joyent, Inc */ /* @@ -174,13 +174,15 @@ mach_cpucontext_alloc_tables(struct cpu *cp) { tss_t *ntss; struct cpu_tables *ct; + size_t ctsize; /* * Allocate space for stack, tss, gdt and idt. We round the size * allotted for cpu_tables up, so that the TSS is on a unique page. * This is more efficient when running in virtual machines. */ - ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP); + ctsize = P2ROUNDUP(sizeof (*ct), PAGESIZE); + ct = kmem_zalloc(ctsize, KM_SLEEP); if ((uintptr_t)ct & PAGEOFFSET) panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables", cp->cpu_id); @@ -188,16 +190,62 @@ mach_cpucontext_alloc_tables(struct cpu *cp) ntss = cp->cpu_tss = &ct->ct_tss; #if defined(__amd64) + uintptr_t va; + size_t len; /* * #DF (double fault). */ - ntss->tss_ist1 = (uint64_t)&ct->ct_stack[sizeof (ct->ct_stack)]; + ntss->tss_ist1 = (uintptr_t)&ct->ct_stack1[sizeof (ct->ct_stack1)]; + + /* + * #NM (non-maskable interrupt) + */ + ntss->tss_ist2 = (uintptr_t)&ct->ct_stack2[sizeof (ct->ct_stack2)]; + + /* + * #MC (machine check exception / hardware error) + */ + ntss->tss_ist3 = (uintptr_t)&ct->ct_stack3[sizeof (ct->ct_stack3)]; + + /* + * #DB, #BP debug interrupts and KDI/kmdb + */ + ntss->tss_ist4 = (uintptr_t)&cp->cpu_m.mcpu_kpti_dbg.kf_tr_rsp; + + if (kpti_enable == 1) { + /* + * #GP, #PF, #SS fault interrupts + */ + ntss->tss_ist5 = (uintptr_t)&cp->cpu_m.mcpu_kpti_flt.kf_tr_rsp; + + /* + * Used by all other interrupts + */ + ntss->tss_ist6 = (uint64_t)&cp->cpu_m.mcpu_kpti.kf_tr_rsp; + + /* + * On AMD64 we need to make sure that all of the pages of the + * struct cpu_tables are punched through onto the user CPU for + * kpti. + * + * The final page will always be the TSS, so treat that + * separately. + */ + for (va = (uintptr_t)ct, len = ctsize - MMU_PAGESIZE; + len >= MMU_PAGESIZE; + len -= MMU_PAGESIZE, va += MMU_PAGESIZE) { + /* The doublefault stack must be RW */ + hati_cpu_punchin(cp, va, PROT_READ | PROT_WRITE); + } + ASSERT3U((uintptr_t)ntss, ==, va); + hati_cpu_punchin(cp, (uintptr_t)ntss, PROT_READ); + } #elif defined(__i386) ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp = - (uint32_t)&ct->ct_stack[sizeof (ct->ct_stack)]; + (uint32_t)&ct->ct_stack1[sizeof (ct->ct_stack1)]; ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL; diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c index 0fadfb7993..a807be6a40 100644 --- a/usr/src/uts/i86pc/os/mp_startup.c +++ b/usr/src/uts/i86pc/os/mp_startup.c @@ -27,7 +27,7 @@ * All rights reserved. */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 
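
One detail worth calling out in mach_cpucontext_alloc_tables() above: each tss_istN is pointed one element past the end of its stack array, because the hardware pushes downward from that address. A minimal userland sketch of the idiom (toy sizes, nothing kernel-specific):

#include <stdint.h>
#include <stdio.h>

#define	TOY_STKSZ	8192

struct toy_tables {
	char	ct_stack1[TOY_STKSZ];	/* e.g. the double fault stack */
	char	ct_stack2[TOY_STKSZ];
};

int
main(void)
{
	static struct toy_tables ct;

	/* The "top" of the stack is the first byte after the array. */
	uintptr_t ist1 = (uintptr_t)&ct.ct_stack1[sizeof (ct.ct_stack1)];

	printf("stack1 base %p, IST value (top) %p\n",
	    (void *)ct.ct_stack1, (void *)ist1);
	return (0);
}
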
*/ @@ -80,10 +80,10 @@ #include <sys/cpu_module.h> #include <sys/ontrap.h> -struct cpu cpus[1]; /* CPU data */ -struct cpu *cpu[NCPU] = {&cpus[0]}; /* pointers to all CPUs */ -struct cpu *cpu_free_list; /* list for released CPUs */ -cpu_core_t cpu_core[NCPU]; /* cpu_core structures */ +struct cpu cpus[1] __aligned(MMU_PAGESIZE); +struct cpu *cpu[NCPU] = {&cpus[0]}; +struct cpu *cpu_free_list; +cpu_core_t cpu_core[NCPU]; #define cpu_next_free cpu_prev @@ -166,25 +166,23 @@ init_cpu_info(struct cpu *cp) void init_cpu_syscall(struct cpu *cp) { - uint64_t flags; - kpreempt_disable(); -#if defined(__amd64) if (is_x86_feature(x86_featureset, X86FSET_MSR) && is_x86_feature(x86_featureset, X86FSET_ASYSC)) { uint64_t flags; -#if !defined(__lint) +#if !defined(__xpv) /* * The syscall instruction imposes a certain ordering on * segment selectors, so we double-check that ordering * here. */ - ASSERT(KDS_SEL == KCS_SEL + 8); - ASSERT(UDS_SEL == U32CS_SEL + 8); - ASSERT(UCS_SEL == U32CS_SEL + 16); + CTASSERT(KDS_SEL == KCS_SEL + 8); + CTASSERT(UDS_SEL == U32CS_SEL + 8); + CTASSERT(UCS_SEL == U32CS_SEL + 16); #endif + /* * Turn syscall/sysret extensions on. */ @@ -195,8 +193,17 @@ init_cpu_syscall(struct cpu *cp) */ wrmsr(MSR_AMD_STAR, ((uint64_t)(U32CS_SEL << 16 | KCS_SEL)) << 32); - wrmsr(MSR_AMD_LSTAR, (uint64_t)(uintptr_t)sys_syscall); - wrmsr(MSR_AMD_CSTAR, (uint64_t)(uintptr_t)sys_syscall32); + if (kpti_enable == 1) { + wrmsr(MSR_AMD_LSTAR, + (uint64_t)(uintptr_t)tr_sys_syscall); + wrmsr(MSR_AMD_CSTAR, + (uint64_t)(uintptr_t)tr_sys_syscall32); + } else { + wrmsr(MSR_AMD_LSTAR, + (uint64_t)(uintptr_t)sys_syscall); + wrmsr(MSR_AMD_CSTAR, + (uint64_t)(uintptr_t)sys_syscall32); + } /* * This list of flags is masked off the incoming @@ -207,19 +214,15 @@ init_cpu_syscall(struct cpu *cp) flags |= PS_ACHK; wrmsr(MSR_AMD_SFMASK, flags); } -#endif /* - * On 32-bit kernels, we use sysenter/sysexit because it's too - * hard to use syscall/sysret, and it is more portable anyway. - * * On 64-bit kernels on Nocona machines, the 32-bit syscall * variant isn't available to 32-bit applications, but sysenter is. */ if (is_x86_feature(x86_featureset, X86FSET_MSR) && is_x86_feature(x86_featureset, X86FSET_SEP)) { -#if !defined(__lint) +#if !defined(__xpv) /* * The sysenter instruction imposes a certain ordering on * segment selectors, so we double-check that ordering @@ -227,13 +230,10 @@ init_cpu_syscall(struct cpu *cp) * Intel Architecture Software Developer's Manual Volume 2: * Instruction Set Reference" */ - ASSERT(KDS_SEL == KCS_SEL + 8); + CTASSERT(KDS_SEL == KCS_SEL + 8); - ASSERT32(UCS_SEL == ((KCS_SEL + 16) | 3)); - ASSERT32(UDS_SEL == UCS_SEL + 8); - - ASSERT64(U32CS_SEL == ((KCS_SEL + 16) | 3)); - ASSERT64(UDS_SEL == U32CS_SEL + 8); + CTASSERT(U32CS_SEL == ((KCS_SEL + 16) | 3)); + CTASSERT(UDS_SEL == U32CS_SEL + 8); #endif cpu_sep_enable(); @@ -243,7 +243,14 @@ init_cpu_syscall(struct cpu *cp) * via a context handler. */ wrmsr(MSR_INTC_SEP_ESP, 0); - wrmsr(MSR_INTC_SEP_EIP, (uint64_t)(uintptr_t)sys_sysenter); + + if (kpti_enable == 1) { + wrmsr(MSR_INTC_SEP_EIP, + (uint64_t)(uintptr_t)tr_sys_sysenter); + } else { + wrmsr(MSR_INTC_SEP_EIP, + (uint64_t)(uintptr_t)sys_sysenter); + } } kpreempt_enable(); @@ -420,20 +427,20 @@ mp_cpu_configure_common(int cpun, boolean_t boot) #endif /* - * If we have more than one node, each cpu gets a copy of IDT - * local to its node. If this is a Pentium box, we use cpu 0's - * IDT. 
cpu 0's IDT has been made read-only to workaround the - * cmpxchgl register bug + * Allocate pages for the CPU LDT. + */ + cp->cpu_m.mcpu_ldt = kmem_zalloc(LDT_CPU_SIZE, KM_SLEEP); + cp->cpu_m.mcpu_ldt_len = 0; + + /* + * Allocate a per-CPU IDT and initialize the new IDT to the currently + * runing CPU. */ - if (system_hardware.hd_nodes && x86_type != X86_TYPE_P5) { #if !defined(__lint) - ASSERT((sizeof (*CPU->cpu_idt) * NIDT) <= PAGESIZE); + ASSERT((sizeof (*CPU->cpu_idt) * NIDT) <= PAGESIZE); #endif - cp->cpu_idt = kmem_zalloc(PAGESIZE, KM_SLEEP); - bcopy(CPU->cpu_idt, cp->cpu_idt, PAGESIZE); - } else { - cp->cpu_idt = CPU->cpu_idt; - } + cp->cpu_idt = kmem_alloc(PAGESIZE, KM_SLEEP); + bcopy(CPU->cpu_idt, cp->cpu_idt, PAGESIZE); /* * alloc space for cpuid info @@ -571,6 +578,10 @@ mp_cpu_unconfigure_common(struct cpu *cp, int error) kmem_free(cp->cpu_idt, PAGESIZE); cp->cpu_idt = NULL; + kmem_free(cp->cpu_m.mcpu_ldt, LDT_CPU_SIZE); + cp->cpu_m.mcpu_ldt = NULL; + cp->cpu_m.mcpu_ldt_len = 0; + kmem_free(cp->cpu_gdt, PAGESIZE); cp->cpu_gdt = NULL; diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index bfe8c2486b..5e23d2f486 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -23,7 +23,7 @@ * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. - * Copyright (c) 2017 Joyent, Inc. + * Copyright (c) 2018 Joyent, Inc. * Copyright (c) 2015 by Delphix. All rights reserved. */ /* @@ -446,8 +446,10 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t); * 0xFFFFFFFF.FBC00000 |-----------------------| * | Kernel Text | * 0xFFFFFFFF.FB800000 |-----------------------|- KERNEL_TEXT - * |--- GDT ---|- GDT page (GDT_VA) * |--- debug info ---|- debug info (DEBUG_INFO_VA) + * |--- GDT ---|- GDT page (GDT_VA) + * |--- IDT ---|- IDT page (IDT_VA) + * |--- LDT ---|- LDT pages (LDT_VA) * | | * | Core heap | (used for loadable modules) * 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap @@ -959,6 +961,17 @@ kpm_init() panic("segkpm_create segkpm"); rw_exit(&kas.a_lock); + + kpm_enable = 1; + + /* + * As the KPM was disabled while setting up the system, go back and fix + * CPU zero's access to its user page table. This is a bit gross, but + * we have a chicken and egg problem otherwise. + */ + ASSERT(CPU->cpu_hat_info->hci_user_l3ptes == NULL); + CPU->cpu_hat_info->hci_user_l3ptes = + (x86pte_t *)hat_kpm_mapin_pfn(CPU->cpu_hat_info->hci_user_l3pfn); } /* @@ -1422,6 +1435,9 @@ static void startup_kmem(void) { extern void page_set_colorequiv_arr(void); +#if !defined(__xpv) + extern uint64_t kpti_kbase; +#endif PRM_POINT("startup_kmem() starting..."); @@ -1484,6 +1500,9 @@ startup_kmem(void) *(uintptr_t *)&_userlimit = kernelbase; #if defined(__amd64) *(uintptr_t *)&_userlimit -= KERNELBASE - USERLIMIT; +#if !defined(__xpv) + kpti_kbase = kernelbase; +#endif #else *(uintptr_t *)&_userlimit32 = _userlimit; #endif @@ -1491,6 +1510,9 @@ startup_kmem(void) PRM_DEBUG(_userlimit); PRM_DEBUG(_userlimit32); + /* We have to re-do this now that we've modified _userlimit. */ + mmu_calc_user_slots(); + layout_kernel_va(); #if defined(__i386) @@ -2129,32 +2151,6 @@ startup_vm(void) if (boothowto & RB_DEBUG) kdi_dvec_memavail(); - /* - * The following code installs a special page fault handler (#pf) - * to work around a pentium bug. 
- */ -#if !defined(__amd64) && !defined(__xpv) - if (x86_type == X86_TYPE_P5) { - desctbr_t idtr; - gate_desc_t *newidt; - - if ((newidt = kmem_zalloc(MMU_PAGESIZE, KM_NOSLEEP)) == NULL) - panic("failed to install pentium_pftrap"); - - bcopy(idt0, newidt, NIDT * sizeof (*idt0)); - set_gatesegd(&newidt[T_PGFLT], &pentium_pftrap, - KCS_SEL, SDT_SYSIGT, TRP_KPL, 0); - - (void) as_setprot(&kas, (caddr_t)newidt, MMU_PAGESIZE, - PROT_READ | PROT_EXEC); - - CPU->cpu_idt = newidt; - idtr.dtr_base = (uintptr_t)CPU->cpu_idt; - idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1; - wr_idtr(&idtr); - } -#endif /* !__amd64 */ - #if !defined(__xpv) /* * Map page pfn=0 for drivers, such as kd, that need to pick up @@ -2217,10 +2213,8 @@ startup_vm(void) * kpm segment */ segmap_kpm = 0; - if (kpm_desired) { + if (kpm_desired) kpm_init(); - kpm_enable = 1; - } /* * Now create segmap segment. diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c index 192f3dbd32..4b867bac0c 100644 --- a/usr/src/uts/i86pc/os/trap.c +++ b/usr/src/uts/i86pc/os/trap.c @@ -32,7 +32,7 @@ /* */ /* - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -481,7 +481,6 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid) int watchcode; int watchpage; caddr_t vaddr; - int singlestep_twiddle; size_t sz; int ta; #ifdef __amd64 @@ -1103,58 +1102,35 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid) case T_SGLSTP: /* single step/hw breakpoint exception */ - /* Now evaluate how we got here */ +#if !defined(__xpv) + /* + * We'd never normally get here, as kmdb handles its own single + * step traps. There is one nasty exception though, as + * described in more detail in sys_sysenter(). Note that + * checking for all four locations covers both the KPTI and the + * non-KPTI cases correctly: the former will never be found at + * (brand_)sys_sysenter, and vice versa. + */ if (lwp != NULL && (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP)) { - /* - * i386 single-steps even through lcalls which - * change the privilege level. So we take a trap at - * the first instruction in privileged mode. - * - * Set a flag to indicate that upon completion of - * the system call, deal with the single-step trap. - * - * The same thing happens for sysenter, too. - */ - singlestep_twiddle = 0; - if (rp->r_pc == (uintptr_t)sys_sysenter || - rp->r_pc == (uintptr_t)brand_sys_sysenter) { - singlestep_twiddle = 1; -#if defined(__amd64) - /* - * Since we are already on the kernel's - * %gs, on 64-bit systems the sysenter case - * needs to adjust the pc to avoid - * executing the swapgs instruction at the - * top of the handler. 
- */ - if (rp->r_pc == (uintptr_t)sys_sysenter) - rp->r_pc = (uintptr_t) - _sys_sysenter_post_swapgs; - else - rp->r_pc = (uintptr_t) - _brand_sys_sysenter_post_swapgs; -#endif - } -#if defined(__i386) - else if (rp->r_pc == (uintptr_t)sys_call || - rp->r_pc == (uintptr_t)brand_sys_call) { - singlestep_twiddle = 1; - } -#endif - else { - /* not on sysenter/syscall; uregs available */ - if (tudebug && tudebugbpt) - showregs(type, rp, (caddr_t)0); - } - if (singlestep_twiddle) { + if (rp->r_pc == (greg_t)brand_sys_sysenter || + rp->r_pc == (greg_t)sys_sysenter || + rp->r_pc == (greg_t)tr_brand_sys_sysenter || + rp->r_pc == (greg_t)tr_sys_sysenter) { + + rp->r_pc += 0x3; /* sizeof (swapgs) */ + rp->r_ps &= ~PS_T; /* turn off trace */ lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING; ct->t_post_sys = 1; aston(curthread); goto cleanup; + } else { + if (tudebug && tudebugbpt) + showregs(type, rp, (caddr_t)0); } } - /* XXX - needs review on debugger interface? */ +#endif /* !__xpv */ + if (boothowto & RB_DEBUG) debug_enter((char *)NULL); else @@ -1743,16 +1719,16 @@ showregs(uint_t type, struct regs *rp, caddr_t addr) * this clause can be deleted when lint bug 4870403 is fixed * (lint thinks that bit 32 is illegal in a %b format string) */ - printf("cr0: %x cr4: %b\n", + printf("cr0: %x cr4: %b\n", (uint_t)getcr0(), (uint_t)getcr4(), FMT_CR4); #else - printf("cr0: %b cr4: %b\n", + printf("cr0: %b cr4: %b\n", (uint_t)getcr0(), FMT_CR0, (uint_t)getcr4(), FMT_CR4); #endif /* __lint */ - printf("cr2: %lx", getcr2()); + printf("cr2: %lx ", getcr2()); #if !defined(__xpv) - printf("cr3: %lx", getcr3()); + printf("cr3: %lx ", getcr3()); #if defined(__amd64) printf("cr8: %lx\n", getcr8()); #endif @@ -1858,7 +1834,8 @@ instr_is_segregs_pop(caddr_t pc) #endif /* __i386 */ /* - * Test to see if the instruction is part of _sys_rtt. + * Test to see if the instruction is part of _sys_rtt (or the KPTI trampolines + * which are used by _sys_rtt). * * Again on the hypervisor if we try to IRET to user land with a bad code * or stack selector we will get vectored through xen_failsafe_callback. @@ -1870,6 +1847,19 @@ instr_is_sys_rtt(caddr_t pc) { extern void _sys_rtt(), _sys_rtt_end(); +#if defined(__amd64) && !defined(__xpv) + extern void tr_sysc_ret_start(), tr_sysc_ret_end(); + extern void tr_intr_ret_start(), tr_intr_ret_end(); + + if ((uintptr_t)pc >= (uintptr_t)tr_sysc_ret_start && + (uintptr_t)pc <= (uintptr_t)tr_sysc_ret_end) + return (1); + + if ((uintptr_t)pc >= (uintptr_t)tr_intr_ret_start && + (uintptr_t)pc <= (uintptr_t)tr_intr_ret_end) + return (1); +#endif + if ((uintptr_t)pc < (uintptr_t)_sys_rtt || (uintptr_t)pc > (uintptr_t)_sys_rtt_end) return (0); diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h index e2adaaaed9..cf1a252c28 100644 --- a/usr/src/uts/i86pc/sys/machcpuvar.h +++ b/usr/src/uts/i86pc/sys/machcpuvar.h @@ -23,7 +23,7 @@ * Use is subject to license terms. */ /* - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_MACHCPUVAR_H @@ -40,6 +40,9 @@ extern "C" { #include <sys/rm_platter.h> #include <sys/avintr.h> #include <sys/pte.h> +#include <sys/stddef.h> +#include <sys/debug.h> +#include <sys/cpuvar.h> #ifndef _ASM /* @@ -78,6 +81,76 @@ struct xen_evt_data { ulong_t evt_affinity[sizeof (ulong_t) * 8]; /* service on cpu */ }; +struct kpti_frame { + uint64_t kf_lower_redzone; + + /* Stashed value of %cr3 when we entered the trampoline. 
*/ + greg_t kf_tr_cr3; + + /* + * We use %r13-r14 as scratch registers in the trampoline code, + * so stash those here "below" the rest of the stack so they can be + * pushed/popped if needed. + */ + greg_t kf_r14; + greg_t kf_r13; + + /* + * Part of this struct is used as the HW stack frame when taking an + * interrupt on the user page table. The CPU is going to push a bunch + * of regs onto the stack pointer set in the TSS/IDT (which we set to + * &kf_rsp here). + * + * This is only a temporary holding area for them (we'll move them over + * to the real interrupt stack once we've set %cr3). + * + * Note that these must be cleared during a process switch on this cpu. + */ + greg_t kf_err; /* Bottom of initial hw stack frame */ + greg_t kf_rip; + greg_t kf_cs; + greg_t kf_rflags; + greg_t kf_rsp; + greg_t kf_ss; + + greg_t kf_tr_rsp; /* Top of HW stack frame */ + /* We also write this with the %rsp value on tramp entry */ + + /* Written to 0x1 when this kpti_frame is in use. */ + uint64_t kf_tr_flag; + + uint64_t kf_middle_redzone; + + /* + * The things we need to write to %cr3 to change between page tables. + * These live "above" the HW stack. + */ + greg_t kf_kernel_cr3; + greg_t kf_user_cr3; + greg_t kf_tr_ret_rsp; + + uint64_t kf_unused; /* For 16-byte align */ + + uint64_t kf_upper_redzone; +}; + +/* + * This first value, MACHCPU_SIZE is the size of all the members in the cpu_t + * AND struct machcpu, before we get to the mcpu_pad and the kpti area. + * The KPTI is used to contain per-CPU data that is visible in both sets of + * page-tables, and hence must be page-aligned and page-sized. See + * hat_pcp_setup(). + * + * There is a CTASSERT in os/intr.c that checks these numbers. + */ +#if defined(__amd64) +#define MACHCPU_SIZE (572 + 1584) +#else +#define MACHCPU_SIZE (452 + 1328) +#endif +#define MACHCPU_PAD (MMU_PAGESIZE - MACHCPU_SIZE) +#define MACHCPU_PAD2 (MMU_PAGESIZE - 16 - 3 * sizeof (struct kpti_frame)) + struct machcpu { /* * x_call fields - used for interprocessor cross calls @@ -103,6 +176,8 @@ struct machcpu { gate_desc_t *mcpu_idt; /* current IDT */ tss_t *mcpu_tss; /* TSS */ + void *mcpu_ldt; + size_t mcpu_ldt_len; kmutex_t mcpu_ppaddr_mutex; caddr_t mcpu_caddr1; /* per cpu CADDR1 */ @@ -147,6 +222,15 @@ struct machcpu { * The low order bits will be incremented on every interrupt. */ volatile uint32_t mcpu_istamp; + + char mcpu_pad[MACHCPU_PAD]; + + /* This is the start of the page */ + char mcpu_pad2[MACHCPU_PAD2]; + struct kpti_frame mcpu_kpti; + struct kpti_frame mcpu_kpti_flt; + struct kpti_frame mcpu_kpti_dbg; + char mcpu_pad3[16]; }; #define NINTR_THREADS (LOCK_LEVEL-1) /* number of interrupt threads */ @@ -167,7 +251,6 @@ struct machcpu { #define cpu_gdt cpu_m.mcpu_gdt #define cpu_idt cpu_m.mcpu_idt #define cpu_tss cpu_m.mcpu_tss -#define cpu_ldt cpu_m.mcpu_ldt #define cpu_caddr1 cpu_m.mcpu_caddr1 #define cpu_caddr2 cpu_m.mcpu_caddr2 #define cpu_softinfo cpu_m.mcpu_softinfo diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h index 21d725b862..51d7559483 100644 --- a/usr/src/uts/i86pc/sys/machparam.h +++ b/usr/src/uts/i86pc/sys/machparam.h @@ -21,7 +21,7 @@ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015 by Delphix. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1988 AT&T */ @@ -293,7 +293,8 @@ extern "C" { #endif /* __i386 */ /* - * Reserve pages just below KERNEL_TEXT for the GDT, IDT, TSS and debug info. 
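
The amd64 numbers above can be sanity-checked with a little arithmetic. Assuming 8-byte greg_t/uint64_t members, struct kpti_frame as declared here is 18 * 8 = 144 bytes, so the second page of the cpu_t is exactly mcpu_pad2 plus the three kpti_frames plus the 16-byte mcpu_pad3. A stand-alone check of that math (illustrative only, not part of the change):

#include <assert.h>

#define	MMU_PAGESIZE	4096
#define	KPTI_FRAME_SZ	(18 * 8)	/* sizeof (struct kpti_frame) above */
#define	MACHCPU_SIZE	(572 + 1584)	/* amd64 value from this change */
#define	MACHCPU_PAD	(MMU_PAGESIZE - MACHCPU_SIZE)
#define	MACHCPU_PAD2	(MMU_PAGESIZE - 16 - 3 * KPTI_FRAME_SZ)

int
main(void)
{
	/* Each kpti_frame stays 16-byte aligned, as the CTASSERTs demand. */
	assert((KPTI_FRAME_SZ & 0xF) == 0);

	/* Everything before mcpu_pad2 fills out the first page... */
	assert(MACHCPU_SIZE + MACHCPU_PAD == MMU_PAGESIZE);

	/* ...and pad2 + three frames + the 16-byte pad3 fill the second. */
	assert(MACHCPU_PAD2 + 3 * KPTI_FRAME_SZ + 16 == MMU_PAGESIZE);
	return (0);
}
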
+ * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug + * info. * * For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps * of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable @@ -303,7 +304,8 @@ extern "C" { #define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE) #define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE) #define IDT_VA (GDT_VA - MMU_PAGESIZE) -#define KTSS_VA (IDT_VA - MMU_PAGESIZE) +#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE)) +#define KTSS_VA (LDT_VA - MMU_PAGESIZE) #define DFTSS_VA (KTSS_VA - MMU_PAGESIZE) #define MISC_VA_BASE (DFTSS_VA) #define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE) diff --git a/usr/src/uts/i86pc/sys/machprivregs.h b/usr/src/uts/i86pc/sys/machprivregs.h index 3ef6a768a0..53b14a8de8 100644 --- a/usr/src/uts/i86pc/sys/machprivregs.h +++ b/usr/src/uts/i86pc/sys/machprivregs.h @@ -22,13 +22,13 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_MACHPRIVREGS_H #define _SYS_MACHPRIVREGS_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Platform dependent instruction sequences for manipulating * privileged state @@ -77,8 +77,6 @@ extern "C" { #define SYSRETQ sysretq #define SYSRETL sysretl #define SWAPGS swapgs -#define XPV_TRAP_POP /* empty */ -#define XPV_TRAP_PUSH /* empty */ #elif defined(__i386) @@ -86,6 +84,8 @@ extern "C" { #endif /* __i386 */ +#define XPV_TRAP_POP /* empty */ +#define XPV_TRAP_PUSH /* empty */ #define CLEAN_CS /* empty */ @@ -129,7 +129,7 @@ extern "C" { movq REGOFF_RDI(%rsp), %rdi; \ addq $REGOFF_RIP, %rsp -#define FAST_INTR_RETURN iretq +#define FAST_INTR_RETURN jmp tr_iret_user #elif defined(__i386) diff --git a/usr/src/uts/i86pc/sys/rm_platter.h b/usr/src/uts/i86pc/sys/rm_platter.h index ea63abf77d..15ab068854 100644 --- a/usr/src/uts/i86pc/sys/rm_platter.h +++ b/usr/src/uts/i86pc/sys/rm_platter.h @@ -26,7 +26,7 @@ * All rights reserved. */ /* - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_RM_PLATTER_H @@ -113,7 +113,12 @@ typedef struct rm_platter { * Since DEFAULTSTKSIZE is a multiple of PAGESIZE tss will be aligned. */ struct cpu_tables { - char ct_stack[DEFAULTSTKSZ]; + /* IST stacks */ + char ct_stack1[DEFAULTSTKSZ]; /* dblfault */ +#if defined(__amd64) && !defined(__xpv) + char ct_stack2[DEFAULTSTKSZ]; /* nmi */ + char ct_stack3[DEFAULTSTKSZ]; /* mce */ +#endif tss_t ct_tss; }; diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c index e16933dbde..8690c46adf 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.c +++ b/usr/src/uts/i86pc/vm/hat_i86.c @@ -27,7 +27,7 @@ */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2017 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. All rights reserved. * Copyright (c) 2014, 2015 by Delphix. All rights reserved. */ @@ -43,6 +43,191 @@ * Routines used only inside of i86pc/vm start with hati_ for HAT Internal. */ +/* + * amd64 HAT Design + * + * ---------- + * Background + * ---------- + * + * On x86, the address space is shared between a user process and the kernel. + * This is different from SPARC. Conventionally, the kernel lives at the top of + * the address space and the user process gets to enjoy the rest of it. If you + * look at the image of the address map in uts/i86pc/os/startup.c, you'll get a + * rough sense of how the address space is laid out and used. 
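
To make the new reservation concrete: a small stand-alone sketch (not part of the change) that mirrors the machparam.h definitions above, using the amd64 KERNEL_TEXT value shown in the startup.c address map earlier in this diff. Note the LDT gets a 16-page window here, unlike the single pages for the other descriptors:

#include <stdio.h>

#define	MMU_PAGESIZE	4096UL
#define	KERNEL_TEXT	0xFFFFFFFFFB800000UL	/* from the startup.c map */

#define	DEBUG_INFO_VA	(KERNEL_TEXT - MMU_PAGESIZE)
#define	GDT_VA		(DEBUG_INFO_VA - MMU_PAGESIZE)
#define	IDT_VA		(GDT_VA - MMU_PAGESIZE)
#define	LDT_VA		(IDT_VA - (16 * MMU_PAGESIZE))
#define	KTSS_VA		(LDT_VA - MMU_PAGESIZE)
#define	DFTSS_VA	(KTSS_VA - MMU_PAGESIZE)

int
main(void)
{
	printf("DEBUG_INFO_VA %#lx\n", DEBUG_INFO_VA);
	printf("GDT_VA        %#lx\n", GDT_VA);
	printf("IDT_VA        %#lx\n", IDT_VA);
	printf("LDT_VA        %#lx (16 pages)\n", LDT_VA);
	printf("KTSS_VA       %#lx\n", KTSS_VA);
	printf("DFTSS_VA      %#lx\n", DFTSS_VA);
	return (0);
}
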
+ * + * Every unique address space is represented by an instance of a HAT structure + * called a 'hat_t'. In addition to a hat_t structure for each process, there is + * also one that is used for the kernel (kas.a_hat), and each CPU ultimately + * also has a HAT. + * + * Each HAT contains a pointer to its root page table. This root page table is + * what we call an L3 page table in illumos and Intel calls the PML4. It is the + * physical address of the L3 table that we place in the %cr3 register which the + * processor uses. + * + * Each of the many layers of the page table is represented by a structure + * called an htable_t. The htable_t manages a set of 512 8-byte entries. The + * number of entries in a given page table is constant across all different + * level page tables. Note, this is only true on amd64. This has not always been + * the case on x86. + * + * Each entry in a page table, generally referred to as a PTE, may refer to + * another page table or a memory location, depending on the level of the page + * table and the use of large pages. Importantly, the top-level L3 page table + * (PML4) only supports linking to further page tables. This is also true on + * systems which support a 5th level page table (which we do not currently + * support). + * + * Historically, on x86, when a process was running on CPU, the root of the page + * table was inserted into %cr3 on each CPU on which it was currently running. + * When processes would switch (by calling hat_switch()), then the value in %cr3 + * on that CPU would change to that of the new HAT. While this behavior is still + * maintained in the xpv kernel, this is not what is done today. + * + * ------------------- + * Per-CPU Page Tables + * ------------------- + * + * Throughout the system the 64-bit kernel has a notion of what it calls a + * per-CPU page table or PCP. The notion of a per-CPU page table was originally + * introduced as part of the original work to support x86 PAE. On the 64-bit + * kernel, it was originally used for 32-bit processes running on the 64-bit + * kernel. The rationale behind this was that each 32-bit process could have all + * of its memory represented in a single L2 page table as each L2 page table + * entry represents 1 GbE of memory. + * + * Following on from this, the idea was that given that all of the L3 page table + * entries for 32-bit processes are basically going to be identical with the + * exception of the first entry in the page table, why not share those page + * table entries. This gave rise to the idea of a per-CPU page table. + * + * The way this works is that we have a member in the machcpu_t called the + * mcpu_hat_info. That structure contains two different 4k pages: one that + * represents the L3 page table and one that represents an L2 page table. When + * the CPU starts up, the L3 page table entries are copied in from the kernel's + * page table. The L3 kernel entries do not change throughout the lifetime of + * the kernel. The kernel portion of these L3 pages for each CPU have the same + * records, meaning that they point to the same L2 page tables and thus see a + * consistent view of the world. + * + * When a 32-bit process is loaded into this world, we copy the 32-bit process's + * four top-level page table entries into the CPU's L2 page table and then set + * the CPU's first L3 page table entry to point to the CPU's L2 page. + * Specifically, in hat_pcp_update(), we're copying from the process's + * HAT_COPIED_32 HAT into the page tables specific to this CPU. 
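
A simplified restatement of the copy described in this paragraph may help; the real code is hat_pcp_update(), further down in this change, and every name in this userland sketch is an illustrative stand-in:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t x86pte_t;

#define	NCOPIED32	4	/* 4 x 1 GiB L2 slots cover a 32-bit process */

/* Stand-in for MAKEPTP(): an L3 entry pointing at the CPU's L2 page. */
static x86pte_t
toy_makeptp(uint64_t l2pfn)
{
	return ((l2pfn << 12) | 0x1 /* present */);
}

int
main(void)
{
	x86pte_t proc_ptes[NCOPIED32] = { 0x1003, 0x2003, 0x3003, 0x4003 };
	x86pte_t cpu_l2ptes[512] = { 0 };	/* CPU-private L2 table */
	x86pte_t cpu_l3ptes[512] = { 0 };	/* CPU-private root table */
	uint64_t cpu_l2pfn = 0x12345;		/* pfn of the L2 page above */

	/* Copy the process's four top-level entries into the CPU's L2. */
	memcpy(cpu_l2ptes, proc_ptes, sizeof (proc_ptes));

	/* Zero the user slots, then point L3 slot 0 at the CPU's L2 page. */
	memset(cpu_l3ptes, 0, NCOPIED32 * sizeof (x86pte_t));
	cpu_l3ptes[0] = toy_makeptp(cpu_l2pfn);

	printf("L3[0] = %#llx\n", (unsigned long long)cpu_l3ptes[0]);
	return (0);
}
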
+ * + * As part of the implementation of kernel page table isolation, this was also + * extended to 64-bit processes. When a 64-bit process runs, we'll copy their L3 + * PTEs across into the current CPU's L3 page table. (As we can't do the + * first-L3-entry trick for 64-bit processes, ->hci_pcp_l2ptes is unused in this + * case.) + * + * The use of per-CPU page tables has a lot of implementation ramifications. A + * HAT that runs a user process will be flagged with the HAT_COPIED flag to + * indicate that it is using the per-CPU page table functionality. In tandem + * with the HAT, the top-level htable_t will be flagged with the HTABLE_COPIED + * flag. If the HAT represents a 32-bit process, then we will also set the + * HAT_COPIED_32 flag on that hat_t. + * + * These two flags work together. The top-level htable_t when using per-CPU page + * tables is 'virtual'. We never allocate a ptable for this htable_t (i.e. + * ht->ht_pfn is PFN_INVALID). Instead, when we need to modify a PTE in an + * HTABLE_COPIED ptable, x86pte_access_pagetable() will redirect any accesses to + * ht_hat->hat_copied_ptes. + * + * Of course, such a modification won't actually modify the HAT_PCP page tables + * that were copied from the HAT_COPIED htable. When we change the top level + * page table entries (L2 PTEs for a 32-bit process and L3 PTEs for a 64-bit + * process), we need to make sure to trigger hat_pcp_update() on all CPUs that + * are currently tied to this HAT (including the current CPU). + * + * To do this, PCP piggy-backs on TLB invalidation, specifically via the + * hat_tlb_inval() path from link_ptp() and unlink_ptp(). + * + * (Importantly, in all such cases, when this is in operation, the top-level + * entry should not be able to refer to an actual page table entry that can be + * changed and consolidated into a large page. If large page consolidation is + * required here, then there will be much that needs to be reconsidered.) + * + * ----------------------------------------------- + * Kernel Page Table Isolation and the Per-CPU HAT + * ----------------------------------------------- + * + * All Intel CPUs that support speculative execution and paging are subject to a + * series of bugs that have been termed 'Meltdown'. These exploits allow a user + * process to read kernel memory through cache side channels and speculative + * execution. To mitigate this on vulnerable CPUs, we need to use a technique + * called kernel page table isolation. What this requires is that we have two + * different page table roots. When executing in kernel mode, we will use a %cr3 + * value that has both the user and kernel pages. However when executing in user + * mode, we will need to have a %cr3 that has all of the user pages; however, + * only a subset of the kernel pages required to operate. + * + * These kernel pages that we need mapped are: + * + * o Kernel Text that allows us to switch between the cr3 values. + * o The current global descriptor table (GDT) + * o The current interrupt descriptor table (IDT) + * o The current task switching state (TSS) + * o The current local descriptor table (LDT) + * o Stacks and scratch space used by the interrupt handlers + * + * For more information on the stack switching techniques, construction of the + * trampolines, and more, please see i86pc/ml/kpti_trampolines.s. The most + * important part of these mappings are the following two constraints: + * + * o The mappings are all per-CPU (except for read-only text) + * o The mappings are static. 
They are all established before the CPU is + * started (with the exception of the boot CPU). + * + * To facilitate the kernel page table isolation we employ our per-CPU + * page tables discussed in the previous section and add the notion of a per-CPU + * HAT. Fundamentally we have a second page table root. There is both a kernel + * page table (hci_pcp_l3ptes), and a user L3 page table (hci_user_l3ptes). + * Both will have the user page table entries copied into them, the same way + * that we discussed in the section 'Per-CPU Page Tables'. + * + * The complex part of this is how do we construct the set of kernel mappings + * that should be present when running with the user page table. To answer that, + * we add the notion of a per-CPU HAT. This HAT functions like a normal HAT, + * except that it's not really associated with an address space the same way + * that other HATs are. + * + * This HAT lives off of the 'struct hat_cpu_info' which is a member of the + * machcpu in the member hci_user_hat. We use this per-CPU HAT to create the set + * of kernel mappings that should be present on this CPU. The kernel mappings + * are added to the per-CPU HAT through the function hati_cpu_punchin(). Once a + * mapping has been punched in, it may not be punched out. The reason that we + * opt to leverage a HAT structure is that it knows how to allocate and manage + * all of the lower level page tables as required. + * + * Because all of the mappings are present at the beginning of time for this CPU + * and none of the mappings are in the kernel pageable segment, we don't have to + * worry about faulting on these HAT structures and thus the notion of the + * current HAT that we're using is always the appropriate HAT for the process + * (usually a user HAT or the kernel's HAT). + * + * A further constraint we place on the system with these per-CPU HATs is that + * they are not subject to htable_steal(). Because each CPU will have a rather + * fixed number of page tables, the same way that we don't steal from the + * kernel's HAT, it was determined that we should not steal from this HAT due to + * the complications involved and somewhat criminal nature of htable_steal(). + * + * The per-CPU HAT is initialized in hat_pcp_setup() which is called as part of + * onlining the CPU, but before the CPU is actually started. The per-CPU HAT is + * removed in hat_pcp_teardown() which is called when a CPU is being offlined to + * be removed from the system (which is different from what psradm usually + * does). + * + * Finally, once the CPU has been onlined, the set of mappings in the per-CPU + * HAT must not change. The HAT related functions that we call are not meant to + * be called when we're switching between processes. For example, it is quite + * possible that if they were, they would try to grab an htable mutex which + * another thread might have. One needs to treat hat_switch() as though they + * were above LOCK_LEVEL and therefore _must not_ block under any circumstance. + */ + #include <sys/machparam.h> #include <sys/machsystm.h> #include <sys/mman.h> @@ -96,15 +281,18 @@ struct hat_mmu_info mmu; * * For 32 bit PAE support on i86pc, the kernel hat will use the 1st 4 entries * on this 4K page for its top level page table. The remaining groups of - * 4 entries are used for per processor copies of user VLP pagetables for + * 4 entries are used for per processor copies of user PCP pagetables for * running threads. See hat_switch() and reload_pae32() for details. 
* - * vlp_page[0..3] - level==2 PTEs for kernel HAT - * vlp_page[4..7] - level==2 PTEs for user thread on cpu 0 - * vlp_page[8..11] - level==2 PTE for user thread on cpu 1 + * pcp_page[0..3] - level==2 PTEs for kernel HAT + * pcp_page[4..7] - level==2 PTEs for user thread on cpu 0 + * pcp_page[8..11] - level==2 PTE for user thread on cpu 1 * etc... + * + * On the 64-bit kernel, this is the normal root of the page table and there is + * nothing special about it when used for other CPUs. */ -static x86pte_t *vlp_page; +static x86pte_t *pcp_page; /* * forward declaration of internal utility routines @@ -171,7 +359,7 @@ kmutex_t hat_list_lock; kcondvar_t hat_list_cv; kmem_cache_t *hat_cache; kmem_cache_t *hat_hash_cache; -kmem_cache_t *vlp_hash_cache; +kmem_cache_t *hat32_hash_cache; /* * Simple statistics @@ -237,6 +425,32 @@ hati_constructor(void *buf, void *handle, int kmflags) } /* + * Put it at the start of the global list of all hats (used by stealing) + * + * kas.a_hat is not in the list but is instead used to find the + * first and last items in the list. + * + * - kas.a_hat->hat_next points to the start of the user hats. + * The list ends where hat->hat_next == NULL + * + * - kas.a_hat->hat_prev points to the last of the user hats. + * The list begins where hat->hat_prev == NULL + */ +static void +hat_list_append(hat_t *hat) +{ + mutex_enter(&hat_list_lock); + hat->hat_prev = NULL; + hat->hat_next = kas.a_hat->hat_next; + if (hat->hat_next) + hat->hat_next->hat_prev = hat; + else + kas.a_hat->hat_prev = hat; + kas.a_hat->hat_next = hat; + mutex_exit(&hat_list_lock); +} + +/* * Allocate a hat structure for as. We also create the top level * htable and initialize it to contain the kernel hat entries. */ @@ -245,7 +459,7 @@ hat_alloc(struct as *as) { hat_t *hat; htable_t *ht; /* top level htable */ - uint_t use_vlp; + uint_t use_copied; uint_t r; hat_kernel_range_t *rp; uintptr_t va; @@ -253,6 +467,7 @@ hat_alloc(struct as *as) uint_t start; uint_t cnt; htable_t *src; + boolean_t use_hat32_cache; /* * Once we start creating user process HATs we can enable @@ -269,30 +484,71 @@ hat_alloc(struct as *as) #if defined(__xpv) /* - * No VLP stuff on the hypervisor due to the 64-bit split top level + * No PCP stuff on the hypervisor due to the 64-bit split top level * page tables. On 32-bit it's not needed as the hypervisor takes * care of copying the top level PTEs to a below 4Gig page. */ - use_vlp = 0; + use_copied = 0; + use_hat32_cache = B_FALSE; + hat->hat_max_level = mmu.max_level; + hat->hat_num_copied = 0; + hat->hat_flags = 0; #else /* __xpv */ - /* 32 bit processes uses a VLP style hat when running with PAE */ #if defined(__amd64) - use_vlp = (ttoproc(curthread)->p_model == DATAMODEL_ILP32); + + /* + * All processes use HAT_COPIED on the 64-bit kernel if KPTI is + * turned on. 
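
A quick illustration of the pcp_page slot layout described in the comment above for the 32-bit PAE case (userland sketch, not kernel code; reload_pae32(), later in this change, does the same arithmetic): entries 0..3 belong to the kernel HAT and each CPU gets the next group of four.

#include <stdio.h>

#define	NPTES_PER_GROUP	4	/* MAX_COPIED_PTES for 32-bit PAE */

int
main(void)
{
	for (int cpu = 0; cpu < 3; cpu++) {
		int base = (cpu + 1) * NPTES_PER_GROUP;

		printf("cpu %d uses pcp_page[%d..%d]\n",
		    cpu, base, base + NPTES_PER_GROUP - 1);
	}
	return (0);
}
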
+ */ + if (ttoproc(curthread)->p_model == DATAMODEL_ILP32) { + use_copied = 1; + hat->hat_max_level = mmu.max_level32; + hat->hat_num_copied = mmu.num_copied_ents32; + use_hat32_cache = B_TRUE; + hat->hat_flags |= HAT_COPIED_32; + HATSTAT_INC(hs_hat_copied32); + } else if (kpti_enable == 1) { + use_copied = 1; + hat->hat_max_level = mmu.max_level; + hat->hat_num_copied = mmu.num_copied_ents; + use_hat32_cache = B_FALSE; + HATSTAT_INC(hs_hat_copied64); + } else { + use_copied = 0; + use_hat32_cache = B_FALSE; + hat->hat_max_level = mmu.max_level; + hat->hat_num_copied = 0; + hat->hat_flags = 0; + HATSTAT_INC(hs_hat_normal64); + } #elif defined(__i386) - use_vlp = mmu.pae_hat; + use_copied = mmu.pae_hat; + if (use_copied) { + use_hat32_cache = B_TRUE; + hat->hat_num_copied = mmu.num_copied_ents; + HATSTAT_INC(hs_hat_copied32); + } else { + use_hat32_cache = B_FALSE; + hat->hat_num_copied = 0; + } #endif #endif /* __xpv */ - if (use_vlp) { - hat->hat_flags = HAT_VLP; - bzero(hat->hat_vlp_ptes, VLP_SIZE); + if (use_copied) { + hat->hat_flags |= HAT_COPIED; + bzero(hat->hat_copied_ptes, sizeof (hat->hat_copied_ptes)); } /* - * Allocate the htable hash + * Allocate the htable hash. For 32-bit PCP processes we use the + * hat32_hash_cache. However, for 64-bit PCP processes we do not as the + * number of entries that they have to handle is closer to + * hat_hash_cache in count (though there will be more wastage when we + * have more DRAM in the system and thus push down the user address + * range). */ - if ((hat->hat_flags & HAT_VLP)) { - hat->hat_num_hash = mmu.vlp_hash_cnt; - hat->hat_ht_hash = kmem_cache_alloc(vlp_hash_cache, KM_SLEEP); + if (use_hat32_cache) { + hat->hat_num_hash = mmu.hat32_hash_cnt; + hat->hat_ht_hash = kmem_cache_alloc(hat32_hash_cache, KM_SLEEP); } else { hat->hat_num_hash = mmu.hash_cnt; hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP); @@ -310,7 +566,7 @@ hat_alloc(struct as *as) hat->hat_htable = ht; #if defined(__amd64) - if (hat->hat_flags & HAT_VLP) + if (hat->hat_flags & HAT_COPIED) goto init_done; #endif @@ -335,9 +591,9 @@ hat_alloc(struct as *as) start; #if defined(__i386) && !defined(__xpv) - if (ht->ht_flags & HTABLE_VLP) { - bcopy(&vlp_page[start], - &hat->hat_vlp_ptes[start], + if (ht->ht_flags & HTABLE_COPIED) { + bcopy(&pcp_page[start], + &hat->hat_copied_ptes[start], cnt * sizeof (x86pte_t)); continue; } @@ -362,30 +618,54 @@ init_done: #endif XPV_ALLOW_MIGRATE(); + hat_list_append(hat); + + return (hat); +} + +#if !defined(__xpv) +/* + * Cons up a HAT for a CPU. This represents the user mappings. This will have + * various kernel pages punched into it manually. Importantly, this hat is + * ineligible for stealing. We really don't want to deal with this ever + * faulting and figuring out that this is happening, much like we don't with + * kas. + */ +static hat_t * +hat_cpu_alloc(cpu_t *cpu) +{ + hat_t *hat; + htable_t *ht; + + hat = kmem_cache_alloc(hat_cache, KM_SLEEP); + hat->hat_as = NULL; + mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL); + hat->hat_max_level = mmu.max_level; + hat->hat_num_copied = 0; + hat->hat_flags = HAT_PCP; + + hat->hat_num_hash = mmu.hash_cnt; + hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP); + bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *)); + + hat->hat_next = hat->hat_prev = NULL; + /* - * Put it at the start of the global list of all hats (used by stealing) - * - * kas.a_hat is not in the list but is instead used to find the - * first and last items in the list. 
- * - * - kas.a_hat->hat_next points to the start of the user hats. - * The list ends where hat->hat_next == NULL - * - * - kas.a_hat->hat_prev points to the last of the user hats. - * The list begins where hat->hat_prev == NULL + * Because this HAT will only ever be used by the current CPU, we'll go + * ahead and set the CPUSET up to only point to the CPU in question. */ - mutex_enter(&hat_list_lock); - hat->hat_prev = NULL; - hat->hat_next = kas.a_hat->hat_next; - if (hat->hat_next) - hat->hat_next->hat_prev = hat; - else - kas.a_hat->hat_prev = hat; - kas.a_hat->hat_next = hat; - mutex_exit(&hat_list_lock); + CPUSET_ADD(hat->hat_cpus, cpu->cpu_id); + + hat->hat_htable = NULL; + hat->hat_ht_cached = NULL; + ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL); + hat->hat_htable = ht; + + hat_list_append(hat); return (hat); } +#endif /* !__xpv */ /* * process has finished executing but as has not been cleaned up yet. @@ -442,6 +722,7 @@ hat_free_end(hat_t *hat) /* * On the hypervisor, unpin top level page table(s) */ + VERIFY3U(hat->hat_flags & HAT_PCP, ==, 0); xen_unpin(hat->hat_htable->ht_pfn); #if defined(__amd64) xen_unpin(hat->hat_user_ptable); @@ -456,14 +737,25 @@ hat_free_end(hat_t *hat) /* * Decide which kmem cache the hash table came from, then free it. */ - if (hat->hat_flags & HAT_VLP) - cache = vlp_hash_cache; - else + if (hat->hat_flags & HAT_COPIED) { +#if defined(__amd64) + if (hat->hat_flags & HAT_COPIED_32) { + cache = hat32_hash_cache; + } else { + cache = hat_hash_cache; + } +#else + cache = hat32_hash_cache; +#endif + } else { cache = hat_hash_cache; + } kmem_cache_free(cache, hat->hat_ht_hash); hat->hat_ht_hash = NULL; hat->hat_flags = 0; + hat->hat_max_level = 0; + hat->hat_num_copied = 0; kmem_cache_free(hat_cache, hat); } @@ -518,6 +810,53 @@ set_max_page_level() } /* + * Determine the number of slots that are in used in the top-most level page + * table for user memory. This is based on _userlimit. In effect this is similar + * to htable_va2entry, but without the convenience of having an htable. + */ +void +mmu_calc_user_slots(void) +{ + uint_t ent, nptes; + uintptr_t shift; + + nptes = mmu.top_level_count; + shift = _userlimit >> mmu.level_shift[mmu.max_level]; + ent = shift & (nptes - 1); + + /* + * Ent tells us the slot that the page for _userlimit would fit in. We + * need to add one to this to cover the total number of entries. + */ + mmu.top_level_uslots = ent + 1; + +#if defined(__amd64) + /* + * When running 32-bit compatability processes on a 64-bit kernel, we + * will only need to use one slot. + */ + mmu.top_level_uslots32 = 1; + + /* + * Record the number of PCP page table entries that we'll need to copy + * around. For 64-bit processes this is the number of user slots. For + * 32-bit proceses, this is 4 1 GiB pages. + */ + mmu.num_copied_ents = mmu.top_level_uslots; + mmu.num_copied_ents32 = 4; +#elif defined(__xpv) + /* + * + */ + if (mmu.pae_hat) { + mmu.num_copied_ents = 4; + } else { + mmu.num_copied_ents = 0; + } +#endif +} + +/* * Initialize hat data structures based on processor MMU information. */ void @@ -536,6 +875,17 @@ mmu_init(void) (getcr4() & CR4_PGE) != 0) mmu.pt_global = PT_GLOBAL; +#if defined(__amd64) && !defined(__xpv) + /* + * The 64-bit x86 kernel has split user/kernel page tables. As such we + * cannot have the global bit set. The simplest way for us to deal with + * this is to just say that pt_global is zero, so the global bit isn't + * present. 
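
mmu_calc_user_slots() above boils down to a shift and a mask. A stand-alone sketch of that arithmetic (the userlimit value below is purely hypothetical; on amd64 four-level paging each top-level entry covers 2^39 bytes of VA):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const unsigned int nptes = 512;		/* mmu.top_level_count */
	const unsigned int level3_shift = 39;	/* mmu.level_shift[3] */
	uint64_t userlimit = 0x00007fffffffffffULL;	/* hypothetical */

	uint64_t ent = (userlimit >> level3_shift) & (nptes - 1);

	/* 'ent' is the slot userlimit falls into; +1 gives the count. */
	printf("top_level_uslots = %llu\n", (unsigned long long)(ent + 1));
	return (0);
}
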
+ */ + if (kpti_enable == 1) + mmu.pt_global = 0; +#endif + /* * Detect NX and PAE usage. */ @@ -594,6 +944,11 @@ mmu_init(void) mmu.ptes_per_table = 512; mmu.top_level_count = 512; + /* + * 32-bit processes only use 1 GB ptes. + */ + mmu.max_level32 = 2; + mmu.level_shift[0] = 12; mmu.level_shift[1] = 21; mmu.level_shift[2] = 30; @@ -630,6 +985,7 @@ mmu_init(void) } set_max_page_level(); + mmu_calc_user_slots(); mmu_page_sizes = mmu.max_page_level + 1; mmu_exported_page_sizes = mmu.umax_page_level + 1; @@ -665,7 +1021,7 @@ mmu_init(void) mmu.hash_cnt = MMU_PAGESIZE / sizeof (htable_t *); while (mmu.hash_cnt > 16 && mmu.hash_cnt >= max_htables) mmu.hash_cnt >>= 1; - mmu.vlp_hash_cnt = mmu.hash_cnt; + mmu.hat32_hash_cnt = mmu.hash_cnt; #if defined(__amd64) /* @@ -714,14 +1070,15 @@ hat_init() NULL, 0, 0); /* - * VLP hats can use a smaller hash table size on large memroy machines + * 32-bit PCP hats can use a smaller hash table size on large memory + * machines */ - if (mmu.hash_cnt == mmu.vlp_hash_cnt) { - vlp_hash_cache = hat_hash_cache; + if (mmu.hash_cnt == mmu.hat32_hash_cnt) { + hat32_hash_cache = hat_hash_cache; } else { - vlp_hash_cache = kmem_cache_create("HatVlpHash", - mmu.vlp_hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL, - NULL, 0, 0); + hat32_hash_cache = kmem_cache_create("Hat32Hash", + mmu.hat32_hash_cnt * sizeof (htable_t *), 0, NULL, NULL, + NULL, NULL, 0, 0); } /* @@ -738,6 +1095,13 @@ hat_init() CPUSET_ADD(khat_cpuset, CPU->cpu_id); /* + * The kernel HAT doesn't use PCP regardless of architectures. + */ + ASSERT3U(mmu.max_level, >, 0); + kas.a_hat->hat_max_level = mmu.max_level; + kas.a_hat->hat_num_copied = 0; + + /* * The kernel hat's next pointer serves as the head of the hat list . * The kernel hat's prev pointer tracks the last hat on the list for * htable_steal() to use. @@ -769,57 +1133,165 @@ hat_init() KM_SLEEP); } + +extern void kpti_tramp_start(); +extern void kpti_tramp_end(); + +extern void kdi_isr_start(); +extern void kdi_isr_end(); + +extern gate_desc_t kdi_idt[NIDT]; + /* - * Prepare CPU specific pagetables for VLP processes on 64 bit kernels. + * Prepare per-CPU pagetables for all processes on the 64 bit kernel. * * Each CPU has a set of 2 pagetables that are reused for any 32 bit - * process it runs. They are the top level pagetable, hci_vlp_l3ptes, and - * the next to top level table for the bottom 512 Gig, hci_vlp_l2ptes. + * process it runs. They are the top level pagetable, hci_pcp_l3ptes, and + * the next to top level table for the bottom 512 Gig, hci_pcp_l2ptes. */ /*ARGSUSED*/ static void -hat_vlp_setup(struct cpu *cpu) +hat_pcp_setup(struct cpu *cpu) { -#if defined(__amd64) && !defined(__xpv) +#if !defined(__xpv) struct hat_cpu_info *hci = cpu->cpu_hat_info; - pfn_t pfn; + uintptr_t va; + size_t len; /* * allocate the level==2 page table for the bottom most * 512Gig of address space (this is where 32 bit apps live) */ ASSERT(hci != NULL); - hci->hci_vlp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP); + hci->hci_pcp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP); /* * Allocate a top level pagetable and copy the kernel's - * entries into it. Then link in hci_vlp_l2ptes in the 1st entry. 
- */ - hci->hci_vlp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP); - hci->hci_vlp_pfn = - hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l3ptes); - ASSERT(hci->hci_vlp_pfn != PFN_INVALID); - bcopy(vlp_page, hci->hci_vlp_l3ptes, MMU_PAGESIZE); - - pfn = hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l2ptes); - ASSERT(pfn != PFN_INVALID); - hci->hci_vlp_l3ptes[0] = MAKEPTP(pfn, 2); -#endif /* __amd64 && !__xpv */ + * entries into it. Then link in hci_pcp_l2ptes in the 1st entry. + */ + hci->hci_pcp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP); + hci->hci_pcp_l3pfn = + hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l3ptes); + ASSERT3U(hci->hci_pcp_l3pfn, !=, PFN_INVALID); + bcopy(pcp_page, hci->hci_pcp_l3ptes, MMU_PAGESIZE); + + hci->hci_pcp_l2pfn = + hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l2ptes); + ASSERT3U(hci->hci_pcp_l2pfn, !=, PFN_INVALID); + + /* + * Now go through and allocate the user version of these structures. + * Unlike with the kernel version, we allocate a hat to represent the + * top-level page table as that will make it much simpler when we need + * to patch through user entries. + */ + hci->hci_user_hat = hat_cpu_alloc(cpu); + hci->hci_user_l3pfn = hci->hci_user_hat->hat_htable->ht_pfn; + ASSERT3U(hci->hci_user_l3pfn, !=, PFN_INVALID); + hci->hci_user_l3ptes = + (x86pte_t *)hat_kpm_mapin_pfn(hci->hci_user_l3pfn); + + /* Skip the rest of this if KPTI is switched off at boot. */ + if (kpti_enable != 1) + return; + + /* + * OK, now that we have this we need to go through and punch the normal + * holes in the CPU's hat for this. At this point we'll punch in the + * following: + * + * o GDT + * o IDT + * o LDT + * o Trampoline Code + * o machcpu KPTI page + * o kmdb ISR code page (just trampolines) + * + * If this is cpu0, then we also can initialize the following because + * they'll have already been allocated. + * + * o TSS for CPU 0 + * o Double Fault for CPU 0 + * + * The following items have yet to be allocated and have not been + * punched in yet. They will be punched in later: + * + * o TSS (mach_cpucontext_alloc_tables()) + * o Double Fault Stack (mach_cpucontext_alloc_tables()) + */ + hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_gdt, PROT_READ); + hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_idt, PROT_READ); + + /* + * As the KDI IDT is only active during kmdb sessions (including single + * stepping), typically we don't actually need this punched in (we + * consider the routines that switch to the user cr3 to be toxic). But + * if we ever accidentally end up on the user cr3 while on this IDT, + * we'd prefer not to triple fault. + */ + hati_cpu_punchin(cpu, (uintptr_t)&kdi_idt, PROT_READ); + + CTASSERT(((uintptr_t)&kpti_tramp_start % MMU_PAGESIZE) == 0); + CTASSERT(((uintptr_t)&kpti_tramp_end % MMU_PAGESIZE) == 0); + for (va = (uintptr_t)&kpti_tramp_start; + va < (uintptr_t)&kpti_tramp_end; va += MMU_PAGESIZE) { + hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC); + } + + VERIFY3U(((uintptr_t)cpu->cpu_m.mcpu_ldt) % MMU_PAGESIZE, ==, 0); + for (va = (uintptr_t)cpu->cpu_m.mcpu_ldt, len = LDT_CPU_SIZE; + len >= MMU_PAGESIZE; va += MMU_PAGESIZE, len -= MMU_PAGESIZE) { + hati_cpu_punchin(cpu, va, PROT_READ); + } + + /* mcpu_pad2 is the start of the page containing the kpti_frames. */ + hati_cpu_punchin(cpu, (uintptr_t)&cpu->cpu_m.mcpu_pad2[0], + PROT_READ | PROT_WRITE); + + if (cpu == &cpus[0]) { + /* + * CPU0 uses a global for its double fault stack to deal with + * the chicken and egg problem. We need to punch it into its + * user HAT. 
+ */ + extern char dblfault_stack0[]; + + hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_m.mcpu_tss, + PROT_READ); + + for (va = (uintptr_t)dblfault_stack0, + len = DEFAULTSTKSZ; len >= MMU_PAGESIZE; + va += MMU_PAGESIZE, len -= MMU_PAGESIZE) { + hati_cpu_punchin(cpu, va, PROT_READ | PROT_WRITE); + } + } + + CTASSERT(((uintptr_t)&kdi_isr_start % MMU_PAGESIZE) == 0); + CTASSERT(((uintptr_t)&kdi_isr_end % MMU_PAGESIZE) == 0); + for (va = (uintptr_t)&kdi_isr_start; + va < (uintptr_t)&kdi_isr_end; va += MMU_PAGESIZE) { + hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC); + } +#endif /* !__xpv */ } /*ARGSUSED*/ static void -hat_vlp_teardown(cpu_t *cpu) +hat_pcp_teardown(cpu_t *cpu) { -#if defined(__amd64) && !defined(__xpv) +#if !defined(__xpv) struct hat_cpu_info *hci; if ((hci = cpu->cpu_hat_info) == NULL) return; - if (hci->hci_vlp_l2ptes) - kmem_free(hci->hci_vlp_l2ptes, MMU_PAGESIZE); - if (hci->hci_vlp_l3ptes) - kmem_free(hci->hci_vlp_l3ptes, MMU_PAGESIZE); + if (hci->hci_pcp_l2ptes != NULL) + kmem_free(hci->hci_pcp_l2ptes, MMU_PAGESIZE); + if (hci->hci_pcp_l3ptes != NULL) + kmem_free(hci->hci_pcp_l3ptes, MMU_PAGESIZE); + if (hci->hci_user_hat != NULL) { + hat_free_start(hci->hci_user_hat); + hat_free_end(hci->hci_user_hat); + } #endif } @@ -830,6 +1302,8 @@ hat_vlp_teardown(cpu_t *cpu) ++r; \ } +extern uint64_t kpti_safe_cr3; + /* * Finish filling in the kernel hat. * Pre fill in all top level kernel page table entries for the kernel's @@ -915,13 +1389,16 @@ hat_init_finish(void) /* * 32 bit PAE metal kernels use only 4 of the 512 entries in the * page holding the top level pagetable. We use the remainder for - * the "per CPU" page tables for VLP processes. + * the "per CPU" page tables for PCP processes. * Map the top level kernel pagetable into the kernel to make * it easy to use bcopy access these tables. + * + * PAE is required for the 64-bit kernel which uses this as well to + * perform the per-CPU pagetables. See the big theory statement. */ if (mmu.pae_hat) { - vlp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP); - hat_devload(kas.a_hat, (caddr_t)vlp_page, MMU_PAGESIZE, + pcp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP); + hat_devload(kas.a_hat, (caddr_t)pcp_page, MMU_PAGESIZE, kas.a_hat->hat_htable->ht_pfn, #if !defined(__xpv) PROT_WRITE | @@ -929,7 +1406,7 @@ hat_init_finish(void) PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK, HAT_LOAD | HAT_LOAD_NOCONSIST); } - hat_vlp_setup(CPU); + hat_pcp_setup(CPU); /* * Create kmap (cached mappings of kernel PTEs) @@ -942,6 +1419,11 @@ hat_init_finish(void) size = segmapsize; #endif hat_kmap_init((uintptr_t)segmap_start, size); + +#if defined(__amd64) && !defined(__xpv) + ASSERT3U(kas.a_hat->hat_htable->ht_pfn, !=, PFN_INVALID); + ASSERT3U(kpti_safe_cr3, ==, MAKECR3(kas.a_hat->hat_htable->ht_pfn)); +#endif } /* @@ -959,12 +1441,12 @@ reload_pae32(hat_t *hat, cpu_t *cpu) /* * Load the 4 entries of the level 2 page table into this - * cpu's range of the vlp_page and point cr3 at them. + * cpu's range of the pcp_page and point cr3 at them. */ ASSERT(mmu.pae_hat); - src = hat->hat_vlp_ptes; - dest = vlp_page + (cpu->cpu_id + 1) * VLP_NUM_PTES; - for (i = 0; i < VLP_NUM_PTES; ++i) { + src = hat->hat_copied_ptes; + dest = pcp_page + (cpu->cpu_id + 1) * MAX_COPIED_PTES; + for (i = 0; i < MAX_COPIED_PTES; ++i) { for (;;) { pte = dest[i]; if (pte == src[i]) @@ -977,6 +1459,89 @@ reload_pae32(hat_t *hat, cpu_t *cpu) #endif /* + * Update the PCP data on the CPU cpu to the one on the hat. 
If this is a 32-bit + * process, then we must update the L2 pages and then the L3. If this is a + * 64-bit process then we must update the L3 entries. + */ +static void +hat_pcp_update(cpu_t *cpu, const hat_t *hat) +{ + ASSERT3U(hat->hat_flags & HAT_COPIED, !=, 0); + + if ((hat->hat_flags & HAT_COPIED_32) != 0) { + const x86pte_t *l2src; + x86pte_t *l2dst, *l3ptes, *l3uptes; + /* + * This is a 32-bit process. To set this up, we need to do the + * following: + * + * - Copy the 4 L2 PTEs into the dedicated L2 table + * - Zero the user L3 PTEs in the user and kernel page table + * - Set the first L3 PTE to point to the CPU L2 table + */ + l2src = hat->hat_copied_ptes; + l2dst = cpu->cpu_hat_info->hci_pcp_l2ptes; + l3ptes = cpu->cpu_hat_info->hci_pcp_l3ptes; + l3uptes = cpu->cpu_hat_info->hci_user_l3ptes; + + l2dst[0] = l2src[0]; + l2dst[1] = l2src[1]; + l2dst[2] = l2src[2]; + l2dst[3] = l2src[3]; + + /* + * Make sure to use the mmu to get the number of slots. The + * number of PLP entries that this has will always be less as + * it's a 32-bit process. + */ + bzero(l3ptes, sizeof (x86pte_t) * mmu.top_level_uslots); + l3ptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2); + bzero(l3uptes, sizeof (x86pte_t) * mmu.top_level_uslots); + l3uptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2); + } else { + /* + * This is a 64-bit process. To set this up, we need to do the + * following: + * + * - Zero the 4 L2 PTEs in the CPU structure for safety + * - Copy over the new user L3 PTEs into the kernel page table + * - Copy over the new user L3 PTEs into the user page table + */ + ASSERT3S(kpti_enable, ==, 1); + bzero(cpu->cpu_hat_info->hci_pcp_l2ptes, sizeof (x86pte_t) * 4); + bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_pcp_l3ptes, + sizeof (x86pte_t) * mmu.top_level_uslots); + bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_user_l3ptes, + sizeof (x86pte_t) * mmu.top_level_uslots); + } +} + +static void +reset_kpti(struct kpti_frame *fr, uint64_t kcr3) +{ + ASSERT3U(fr->kf_tr_flag, ==, 0); +#if DEBUG + if (fr->kf_kernel_cr3 != 0) { + ASSERT3U(fr->kf_lower_redzone, ==, 0xdeadbeefdeadbeef); + ASSERT3U(fr->kf_middle_redzone, ==, 0xdeadbeefdeadbeef); + ASSERT3U(fr->kf_upper_redzone, ==, 0xdeadbeefdeadbeef); + } +#endif + + bzero(fr, offsetof(struct kpti_frame, kf_kernel_cr3)); + bzero(&fr->kf_unused, sizeof (struct kpti_frame) - + offsetof(struct kpti_frame, kf_unused)); + + fr->kf_kernel_cr3 = kcr3; + fr->kf_user_cr3 = 0; + fr->kf_tr_ret_rsp = (uintptr_t)&fr->kf_tr_rsp; + + fr->kf_lower_redzone = 0xdeadbeefdeadbeef; + fr->kf_middle_redzone = 0xdeadbeefdeadbeef; + fr->kf_upper_redzone = 0xdeadbeefdeadbeef; +} + +/* * Switch to a new active hat, maintaining bit masks to track active CPUs. 
* * On the 32-bit PAE hypervisor, %cr3 is a 64-bit value, on metal it @@ -1010,17 +1575,9 @@ hat_switch(hat_t *hat) /* * now go ahead and load cr3 */ - if (hat->hat_flags & HAT_VLP) { -#if defined(__amd64) - x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes; - - VLP_COPY(hat->hat_vlp_ptes, vlpptep); - newcr3 = MAKECR3(cpu->cpu_hat_info->hci_vlp_pfn); -#elif defined(__i386) - reload_pae32(hat, cpu); - newcr3 = MAKECR3(kas.a_hat->hat_htable->ht_pfn) + - (cpu->cpu_id + 1) * VLP_SIZE; -#endif + if (hat->hat_flags & HAT_COPIED) { + hat_pcp_update(cpu, hat); + newcr3 = MAKECR3(cpu->cpu_hat_info->hci_pcp_l3pfn); } else { newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn); } @@ -1032,7 +1589,7 @@ hat_switch(hat_t *hat) t[0].cmd = MMUEXT_NEW_BASEPTR; t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3)); -#if defined(__amd64) + /* * There's an interesting problem here, as to what to * actually specify when switching to the kernel hat. @@ -1044,7 +1601,7 @@ hat_switch(hat_t *hat) else t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable); ++opcnt; -#endif /* __amd64 */ + if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0) panic("HYPERVISOR_mmu_update() failed"); ASSERT(retcnt == opcnt); @@ -1052,6 +1609,16 @@ hat_switch(hat_t *hat) } #else setcr3(newcr3); + reset_kpti(&cpu->cpu_m.mcpu_kpti, newcr3); + reset_kpti(&cpu->cpu_m.mcpu_kpti_flt, newcr3); + reset_kpti(&cpu->cpu_m.mcpu_kpti_dbg, newcr3); + + if (kpti_enable == 1) { + newcr3 = MAKECR3(cpu->cpu_hat_info->hci_user_l3pfn); + cpu->cpu_m.mcpu_kpti.kf_user_cr3 = newcr3; + cpu->cpu_m.mcpu_kpti_dbg.kf_user_cr3 = newcr3; + cpu->cpu_m.mcpu_kpti_flt.kf_user_cr3 = newcr3; + } #endif ASSERT(cpu == CPU); } @@ -1364,10 +1931,9 @@ hati_pte_map( ASSERT(flags & HAT_LOAD_NOCONSIST); } #if defined(__amd64) - if (ht->ht_flags & HTABLE_VLP) { + if (ht->ht_flags & HTABLE_COPIED) { cpu_t *cpu = CPU; - x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes; - VLP_COPY(hat->hat_vlp_ptes, vlpptep); + hat_pcp_update(cpu, hat); } #endif HTABLE_INC(ht->ht_valid_cnt); @@ -1439,7 +2005,8 @@ hati_load_common( ++curthread->t_hatdepth; ASSERT(curthread->t_hatdepth < 16); - ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as)); + ASSERT(hat == kas.a_hat || (hat->hat_flags & HAT_PCP) != 0 || + AS_LOCK_HELD(hat->hat_as)); if (flags & HAT_LOAD_SHARE) hat->hat_flags |= HAT_SHARED; @@ -1459,15 +2026,23 @@ hati_load_common( ht = htable_create(hat, va, level, NULL); ASSERT(ht != NULL); } + /* + * htable_va2entry checks this condition as well, but it won't include + * much useful info in the panic. So we do it in advance here to include + * all the context. + */ + if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht)) { + panic("hati_load_common: bad htable: va=%p, last page=%p, " + "ht->ht_vaddr=%p, ht->ht_level=%d", (void *)va, + (void *)HTABLE_LAST_PAGE(ht), (void *)ht->ht_vaddr, + (int)ht->ht_level); + } entry = htable_va2entry(va, ht); /* * a bunch of paranoid error checking */ ASSERT(ht->ht_busy > 0); - if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht)) - panic("hati_load_common: bad htable %p, va %p", - (void *)ht, (void *)va); ASSERT(ht->ht_level == level); /* @@ -1958,14 +2533,12 @@ hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) /* * Otherwise we reload cr3 to effect a complete TLB flush. 
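
For readers following the KPTI bookkeeping in hat_switch() above, a condensed restatement with toy types (illustration only): on a non-xpv amd64 kernel the kernel-side root is what gets loaded into %cr3 and recorded in every kpti_frame's kf_kernel_cr3, while the user root is recorded only when kpti_enable is set.

#include <stdint.h>
#include <stdio.h>

struct toy_kpti_frame {
	uint64_t	kf_kernel_cr3;
	uint64_t	kf_user_cr3;
};

struct toy_cpu {
	uint64_t		hci_pcp_l3pfn;	/* kernel-visible root */
	uint64_t		hci_user_l3pfn;	/* user (KPTI) root */
	struct toy_kpti_frame	kpti, kpti_flt, kpti_dbg;
};

/* Stand-in for MAKECR3(): pfn of the root table -> %cr3 value. */
static uint64_t
toy_makecr3(uint64_t pfn)
{
	return (pfn << 12);
}

static void
toy_hat_switch(struct toy_cpu *cpu, int kpti_enable)
{
	uint64_t kcr3 = toy_makecr3(cpu->hci_pcp_l3pfn);

	/* The kernel-side root is recorded in all three frames. */
	cpu->kpti.kf_kernel_cr3 = kcr3;
	cpu->kpti_flt.kf_kernel_cr3 = kcr3;
	cpu->kpti_dbg.kf_kernel_cr3 = kcr3;

	if (kpti_enable == 1) {
		uint64_t ucr3 = toy_makecr3(cpu->hci_user_l3pfn);

		cpu->kpti.kf_user_cr3 = ucr3;
		cpu->kpti_flt.kf_user_cr3 = ucr3;
		cpu->kpti_dbg.kf_user_cr3 = ucr3;
	}
}

int
main(void)
{
	struct toy_cpu c = { .hci_pcp_l3pfn = 0x100, .hci_user_l3pfn = 0x200 };

	toy_hat_switch(&c, 1);
	printf("kernel cr3 %#llx, user cr3 %#llx\n",
	    (unsigned long long)c.kpti.kf_kernel_cr3,
	    (unsigned long long)c.kpti.kf_user_cr3);
	return (0);
}
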
* - * A reload of cr3 on a VLP process also means we must also recopy in - * the pte values from the struct hat + * A reload of cr3 when using PCP also means we must also recopy in the + * pte values from the struct hat */ - if (hat->hat_flags & HAT_VLP) { + if (hat->hat_flags & HAT_COPIED) { #if defined(__amd64) - x86pte_t *vlpptep = CPU->cpu_hat_info->hci_vlp_l2ptes; - - VLP_COPY(hat->hat_vlp_ptes, vlpptep); + hat_pcp_update(CPU, hat); #elif defined(__i386) reload_pae32(hat, CPU); #endif @@ -4075,7 +4648,7 @@ hat_cpu_online(struct cpu *cpup) { if (cpup != CPU) { x86pte_cpu_init(cpup); - hat_vlp_setup(cpup); + hat_pcp_setup(cpup); } CPUSET_ATOMIC_ADD(khat_cpuset, cpup->cpu_id); } @@ -4090,7 +4663,7 @@ hat_cpu_offline(struct cpu *cpup) ASSERT(cpup != CPU); CPUSET_ATOMIC_DEL(khat_cpuset, cpup->cpu_id); - hat_vlp_teardown(cpup); + hat_pcp_teardown(cpup); x86pte_cpu_fini(cpup); } @@ -4406,7 +4979,7 @@ hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp) #ifndef __xpv void hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs, - offset_t kpm_pages_off) + offset_t kpm_pages_off) { _NOTE(ARGUNUSED(nkpmpgs, kpm_pages_off)); pfn_t base, end; @@ -4465,7 +5038,7 @@ hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp) void hat_kpm_split_mseg_update(struct memseg *msp, struct memseg **mspp, - struct memseg *lo, struct memseg *mid, struct memseg *hi) + struct memseg *lo, struct memseg *mid, struct memseg *hi) { _NOTE(ARGUNUSED(msp, mspp, lo, mid, hi)); ASSERT(0); @@ -4537,3 +5110,32 @@ hat_release_mapping(hat_t *hat, caddr_t addr) XPV_ALLOW_MIGRATE(); } #endif /* __xpv */ + +/* + * Helper function to punch in a mapping that we need with the specified + * attributes. + */ +void +hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs) +{ + int ret; + pfn_t pfn; + hat_t *cpu_hat = cpu->cpu_hat_info->hci_user_hat; + + ASSERT3S(kpti_enable, ==, 1); + ASSERT3P(cpu_hat, !=, NULL); + ASSERT3U(cpu_hat->hat_flags & HAT_PCP, ==, HAT_PCP); + ASSERT3U(va & (MMU_PAGESIZE - 1), ==, 0); + + pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); + VERIFY3U(pfn, !=, PFN_INVALID); + + /* + * We purposefully don't try to find the page_t. This means that this + * will be marked PT_NOCONSIST; however, given that this is pretty much + * a static mapping that we're using we should be relatively OK. + */ + attrs |= HAT_STORECACHING_OK; + ret = hati_load_common(cpu_hat, va, NULL, attrs, 0, 0, pfn); + VERIFY3S(ret, ==, 0); +} diff --git a/usr/src/uts/i86pc/vm/hat_i86.h b/usr/src/uts/i86pc/vm/hat_i86.h index fdbb9346bc..2bcac4ec61 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.h +++ b/usr/src/uts/i86pc/vm/hat_i86.h @@ -24,6 +24,7 @@ */ /* * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. */ #ifndef _VM_HAT_I86_H @@ -62,19 +63,32 @@ extern "C" { */ /* - * VLP processes have a 32 bit address range, so their top level is 2 and - * with only 4 PTEs in that table. + * Maximum number of per-CPU pagetable entries that we'll need to cache in the + * HAT. See the big theory statement in uts/i86pc/vm/hat_i86.c for more + * information. */ -#define VLP_LEVEL (2) -#define VLP_NUM_PTES (4) -#define VLP_SIZE (VLP_NUM_PTES * sizeof (x86pte_t)) -#define TOP_LEVEL(h) (((h)->hat_flags & HAT_VLP) ? VLP_LEVEL : mmu.max_level) -#define VLP_COPY(fromptep, toptep) { \ - toptep[0] = fromptep[0]; \ - toptep[1] = fromptep[1]; \ - toptep[2] = fromptep[2]; \ - toptep[3] = fromptep[3]; \ -} +#if defined(__xpv) +/* + * The Xen hypervisor does not use per-CPU pagetables (PCP). 
Define a single + * struct member for it at least to make life easier and not make the member + * conditional. + */ +#define MAX_COPIED_PTES 1 +#else +#if defined(__amd64) +/* + * The 64-bit kernel may have up to 512 PTEs present in it for a given process. + */ +#define MAX_COPIED_PTES 512 +#elif defined(__i386) +/* + * The 32-bit kernel always uses 4 PTEs for this. + */ +#define MAX_COPIED_PTES 4 +#endif /* __amd64 */ +#endif /* __xpv */ + +#define TOP_LEVEL(h) (((h)->hat_max_level)) /* * The hat struct exists for each address space. @@ -87,13 +101,15 @@ struct hat { pgcnt_t hat_ism_pgcnt; cpuset_t hat_cpus; uint16_t hat_flags; + uint8_t hat_max_level; /* top level of this HAT */ + uint_t hat_num_copied; /* Actual num of hat_copied_ptes[] */ htable_t *hat_htable; /* top level htable */ struct hat *hat_next; struct hat *hat_prev; uint_t hat_num_hash; /* number of htable hash buckets */ htable_t **hat_ht_hash; /* htable hash buckets */ htable_t *hat_ht_cached; /* cached free htables */ - x86pte_t hat_vlp_ptes[VLP_NUM_PTES]; + x86pte_t hat_copied_ptes[MAX_COPIED_PTES]; #if defined(__amd64) && defined(__xpv) pfn_t hat_user_ptable; /* alt top ptable for user mode */ #endif @@ -106,14 +122,16 @@ typedef struct hat hat_t; atomic_dec_ulong(&(hat)->hat_pages_mapped[level]); /* - * Flags for the hat_flags field + * Flags for the hat_flags field. For more information, please see the big + * theory statement on the HAT design in uts/i86pc/vm/hat_i86.c. * * HAT_FREEING - set when HAT is being destroyed - mostly used to detect that * demap()s can be avoided. * - * HAT_VLP - indicates a 32 bit process has a virtual address range less than - * the hardware's physical address range. (VLP->Virtual Less-than Physical) - * Note - never used on the hypervisor. + * HAT_COPIED - Indicates this HAT is a source for per-cpu page tables: see the + * big comment in hat_i86.c for a description. + * + * HAT_COPIED_32 - HAT_COPIED, but for an ILP32 process. * * HAT_VICTIM - This is set while a hat is being examined for page table * stealing and prevents it from being freed. @@ -121,12 +139,17 @@ typedef struct hat hat_t; * HAT_SHARED - The hat has exported it's page tables via hat_share() * * HAT_PINNED - On the hypervisor, indicates the top page table has been pinned. + * + * HAT_PCP - Used for the per-cpu user page table (i.e. associated with a CPU, + * not a process). */ #define HAT_FREEING (0x0001) -#define HAT_VLP (0x0002) -#define HAT_VICTIM (0x0004) -#define HAT_SHARED (0x0008) -#define HAT_PINNED (0x0010) +#define HAT_VICTIM (0x0002) +#define HAT_SHARED (0x0004) +#define HAT_PINNED (0x0008) +#define HAT_COPIED (0x0010) +#define HAT_COPIED_32 (0x0020) +#define HAT_PCP (0x0040) /* * Additional platform attribute for hat_devload() to force no caching. 
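Since hat_num_copied and hat_copied_ptes[] replace the old fixed four-entry VLP array, consumers should iterate by the per-HAT count rather than a constant. A hypothetical debugging helper (not part of the change; cmn_err() is used purely for illustration) shows the intended usage:

	static void
	dump_copied_ptes(const hat_t *hat)
	{
		uint_t i;

		if ((hat->hat_flags & HAT_COPIED) == 0)
			return;

		/* Only the first hat_num_copied slots are meaningful. */
		for (i = 0; i < hat->hat_num_copied; i++) {
			cmn_err(CE_CONT, "slot %u: " FMT_PTE "\n", i,
			    hat->hat_copied_ptes[i]);
		}
	}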
@@ -155,6 +178,9 @@ struct hatstats { ulong_t hs_hm_steals; ulong_t hs_hm_steal_exam; ulong_t hs_tlb_inval_delayed; + ulong_t hs_hat_copied64; + ulong_t hs_hat_copied32; + ulong_t hs_hat_normal64; }; extern struct hatstats hatstat; #ifdef DEBUG @@ -240,6 +266,11 @@ extern void hat_kmap_init(uintptr_t base, size_t len); extern hment_t *hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry); +#if defined(__amd64) +extern void hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs); +extern void mmu_calc_user_slots(void); +#endif + #if !defined(__xpv) /* * routines to deal with delayed TLB invalidations for idle CPUs diff --git a/usr/src/uts/i86pc/vm/hat_pte.h b/usr/src/uts/i86pc/vm/hat_pte.h index 7b078b0435..121d96cf84 100644 --- a/usr/src/uts/i86pc/vm/hat_pte.h +++ b/usr/src/uts/i86pc/vm/hat_pte.h @@ -21,7 +21,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2017 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. */ #ifndef _VM_HAT_PTE_H @@ -175,10 +175,18 @@ struct hat_mmu_info { uint_t max_page_level; /* maximum level at which we can map a page */ uint_t umax_page_level; /* max user page map level */ uint_t ptes_per_table; /* # of entries in lower level page tables */ - uint_t top_level_count; /* # of entries in top most level page table */ + uint_t top_level_count; /* # of entries in top-level page table */ + uint_t top_level_uslots; /* # of user slots in top-level page table */ + uint_t num_copied_ents; /* # of PCP-copied PTEs to create */ +#if defined(__amd64) + /* 32-bit versions of values */ + uint_t top_level_uslots32; + uint_t max_level32; + uint_t num_copied_ents32; +#endif - uint_t hash_cnt; /* cnt of entries in htable_hash_cache */ - uint_t vlp_hash_cnt; /* cnt of entries in vlp htable_hash_cache */ + uint_t hash_cnt; /* cnt of entries in htable_hash_cache */ + uint_t hat32_hash_cnt; /* cnt of entries in 32-bit htable_hash_cache */ uint_t pae_hat; /* either 0 or 1 */ diff --git a/usr/src/uts/i86pc/vm/htable.c b/usr/src/uts/i86pc/vm/htable.c index 44e66ddfc1..b294597eba 100644 --- a/usr/src/uts/i86pc/vm/htable.c +++ b/usr/src/uts/i86pc/vm/htable.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014 by Delphix. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -621,11 +621,15 @@ htable_steal(uint_t cnt, boolean_t reap) * stale PTEs either here or under hat_unload() when we * steal and unload the same page table in competing * threads. + * + * We skip HATs that belong to CPUs, to make our lives + * simpler. */ - while (hat != NULL && - (hat->hat_flags & - (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0) + while (hat != NULL && (hat->hat_flags & + (HAT_VICTIM | HAT_SHARED | HAT_FREEING | + HAT_PCP)) != 0) { hat = hat->hat_next; + } if (hat == NULL) break; @@ -668,8 +672,8 @@ htable_steal(uint_t cnt, boolean_t reap) continue; ASSERT(ht->ht_hat == hat); #if defined(__xpv) && defined(__amd64) - if (!(ht->ht_flags & HTABLE_VLP) && - ht->ht_level == mmu.max_level) { + ASSERT(!(ht->ht_flags & HTABLE_COPIED)); + if (ht->ht_level == mmu.max_level) { ptable_free(hat->hat_user_ptable); hat->hat_user_ptable = PFN_INVALID; } @@ -779,7 +783,7 @@ htable_alloc( htable_t *shared) { htable_t *ht = NULL; - uint_t is_vlp; + uint_t is_copied; uint_t is_bare = 0; uint_t need_to_zero = 1; int kmflags = (can_steal_post_boot ? 
KM_NOSLEEP : KM_SLEEP); @@ -787,8 +791,9 @@ htable_alloc( if (level < 0 || level > TOP_LEVEL(hat)) panic("htable_alloc(): level %d out of range\n", level); - is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL; - if (is_vlp || shared != NULL) + is_copied = (hat->hat_flags & HAT_COPIED) && + level == hat->hat_max_level; + if (is_copied || shared != NULL) is_bare = 1; /* @@ -930,10 +935,10 @@ htable_alloc( } /* - * setup flags, etc. for VLP htables + * setup flags, etc. for copied page tables. */ - if (is_vlp) { - ht->ht_flags |= HTABLE_VLP; + if (is_copied) { + ht->ht_flags |= HTABLE_COPIED; ASSERT(ht->ht_pfn == PFN_INVALID); need_to_zero = 0; } @@ -984,7 +989,7 @@ htable_free(htable_t *ht) !(ht->ht_flags & HTABLE_SHARED_PFN) && (use_boot_reserve || (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) { - ASSERT((ht->ht_flags & HTABLE_VLP) == 0); + ASSERT((ht->ht_flags & HTABLE_COPIED) == 0); ASSERT(ht->ht_pfn != PFN_INVALID); hat_enter(hat); ht->ht_next = hat->hat_ht_cached; @@ -999,7 +1004,7 @@ htable_free(htable_t *ht) */ if (ht->ht_flags & HTABLE_SHARED_PFN) { ASSERT(ht->ht_pfn != PFN_INVALID); - } else if (!(ht->ht_flags & HTABLE_VLP)) { + } else if (!(ht->ht_flags & HTABLE_COPIED)) { ptable_free(ht->ht_pfn); #if defined(__amd64) && defined(__xpv) if (ht->ht_level == mmu.max_level && hat != NULL) { @@ -1111,15 +1116,15 @@ unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr) found, expect); /* - * When a top level VLP page table entry changes, we must issue - * a reload of cr3 on all processors. + * When a top level PTE changes for a copied htable, we must trigger a + * hat_pcp_update() on all HAT CPUs. * - * If we don't need do do that, then we still have to INVLPG against - * an address covered by the inner page table, as the latest processors + * If we don't need do do that, then we still have to INVLPG against an + * address covered by the inner page table, as the latest processors * have TLB-like caches for non-leaf page table entries. */ if (!(hat->hat_flags & HAT_FREEING)) { - hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ? + hat_tlb_inval(hat, (higher->ht_flags & HTABLE_COPIED) ? DEMAP_ALL_ADDR : old->ht_vaddr); } @@ -1148,15 +1153,17 @@ link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr) panic("HAT: ptp not 0, found=" FMT_PTE, found); /* - * When any top level VLP page table entry changes, we must issue - * a reload of cr3 on all processors using it. + * When a top level PTE changes for a copied htable, we must trigger a + * hat_pcp_update() on all HAT CPUs. + * * We also need to do this for the kernel hat on PAE 32 bit kernel. 
*/ if ( #ifdef __i386 - (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) || + (higher->ht_hat == kas.a_hat && + higher->ht_level == higher->ht_hat->hat_max_level) || #endif - (higher->ht_flags & HTABLE_VLP)) + (higher->ht_flags & HTABLE_COPIED)) hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR); } @@ -1295,7 +1302,8 @@ htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level) * 32 bit address spaces on 64 bit kernels need to check * for overflow of the 32 bit address space */ - if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32)) + if ((hat->hat_flags & HAT_COPIED_32) && + vaddr >= ((uint64_t)1 << 32)) return (NULL); #endif base = 0; @@ -1943,10 +1951,12 @@ static x86pte_t * x86pte_access_pagetable(htable_t *ht, uint_t index) { /* - * VLP pagetables are contained in the hat_t + * HTABLE_COPIED pagetables are contained in the hat_t */ - if (ht->ht_flags & HTABLE_VLP) - return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index)); + if (ht->ht_flags & HTABLE_COPIED) { + ASSERT3U(index, <, ht->ht_hat->hat_num_copied); + return (PT_INDEX_PTR(ht->ht_hat->hat_copied_ptes, index)); + } return (x86pte_mapin(ht->ht_pfn, index, ht)); } @@ -2026,10 +2036,7 @@ x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht) static void x86pte_release_pagetable(htable_t *ht) { - /* - * nothing to do for VLP htables - */ - if (ht->ht_flags & HTABLE_VLP) + if (ht->ht_flags & HTABLE_COPIED) return; x86pte_mapout(); @@ -2189,7 +2196,7 @@ x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new) maddr_t ma; if (!IN_XPV_PANIC()) { - ASSERT(!(ht->ht_flags & HTABLE_VLP)); /* no VLP yet */ + ASSERT(!(ht->ht_flags & HTABLE_COPIED)); ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry)); t[0].ptr = ma | MMU_NORMAL_PT_UPDATE; t[0].val = new; @@ -2346,7 +2353,7 @@ x86pte_update( /* * Copy page tables - this is just a little more complicated than the * previous routines. Note that it's also not atomic! It also is never - * used for VLP pagetables. + * used for HTABLE_COPIED pagetables. */ void x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count) @@ -2358,8 +2365,8 @@ x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count) x86pte_t pte; ASSERT(khat_running); - ASSERT(!(dest->ht_flags & HTABLE_VLP)); - ASSERT(!(src->ht_flags & HTABLE_VLP)); + ASSERT(!(dest->ht_flags & HTABLE_COPIED)); + ASSERT(!(src->ht_flags & HTABLE_COPIED)); ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN)); ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN)); @@ -2450,7 +2457,7 @@ x86pte_zero(htable_t *dest, uint_t entry, uint_t count) * Map in the page table to be zeroed. */ ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN)); - ASSERT(!(dest->ht_flags & HTABLE_VLP)); + ASSERT(!(dest->ht_flags & HTABLE_COPIED)); /* * On the hypervisor we don't use x86pte_access_pagetable() since @@ -2504,7 +2511,7 @@ hat_dump(void) for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) { for (h = 0; h < hat->hat_num_hash; ++h) { for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) { - if ((ht->ht_flags & HTABLE_VLP) == 0) + if ((ht->ht_flags & HTABLE_COPIED) == 0) dump_page(ht->ht_pfn); } } diff --git a/usr/src/uts/i86pc/vm/htable.h b/usr/src/uts/i86pc/vm/htable.h index 6377beef94..d9b91189c9 100644 --- a/usr/src/uts/i86pc/vm/htable.h +++ b/usr/src/uts/i86pc/vm/htable.h @@ -24,6 +24,7 @@ */ /* * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _VM_HTABLE_H @@ -85,12 +86,13 @@ typedef struct htable htable_t; /* * Flags values for htable ht_flags field: * - * HTABLE_VLP - this is the top level htable of a VLP HAT. + * HTABLE_COPIED - This is the top level htable of a HAT being used with per-CPU + * pagetables. * * HTABLE_SHARED_PFN - this htable had its PFN assigned from sharing another * htable. Used by hat_share() for ISM. */ -#define HTABLE_VLP (0x01) +#define HTABLE_COPIED (0x01) #define HTABLE_SHARED_PFN (0x02) /* @@ -106,14 +108,19 @@ typedef struct htable htable_t; ((uintptr_t)(hat) >> 4)) & ((hat)->hat_num_hash - 1)) /* - * Each CPU gets a unique hat_cpu_info structure in cpu_hat_info. + * Each CPU gets a unique hat_cpu_info structure in cpu_hat_info. For more + * information on its use and members, see uts/i86pc/vm/hat_i86.c. */ struct hat_cpu_info { kmutex_t hci_mutex; /* mutex to ensure sequential usage */ #if defined(__amd64) - pfn_t hci_vlp_pfn; /* pfn of hci_vlp_l3ptes */ - x86pte_t *hci_vlp_l3ptes; /* VLP Level==3 pagetable (top) */ - x86pte_t *hci_vlp_l2ptes; /* VLP Level==2 pagetable */ + pfn_t hci_pcp_l3pfn; /* pfn of hci_pcp_l3ptes */ + pfn_t hci_pcp_l2pfn; /* pfn of hci_pcp_l2ptes */ + x86pte_t *hci_pcp_l3ptes; /* PCP Level==3 pagetable (top) */ + x86pte_t *hci_pcp_l2ptes; /* PCP Level==2 pagetable */ + struct hat *hci_user_hat; /* CPU specific HAT */ + pfn_t hci_user_l3pfn; /* pfn of hci_user_l3ptes */ + x86pte_t *hci_user_l3ptes; /* PCP User L3 pagetable */ #endif /* __amd64 */ }; @@ -127,7 +134,8 @@ struct hat_cpu_info { * XX64 - The check for the VA hole needs to be better generalized. */ #if defined(__amd64) -#define HTABLE_NUM_PTES(ht) (((ht)->ht_flags & HTABLE_VLP) ? 4 : 512) +#define HTABLE_NUM_PTES(ht) (((ht)->ht_flags & HTABLE_COPIED) ? \ + (((ht)->ht_level == mmu.max_level) ? 512 : 4) : 512) #define HTABLE_LAST_PAGE(ht) \ ((ht)->ht_level == mmu.max_level ? ((uintptr_t)0UL - MMU_PAGESIZE) :\ diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files index 8fdda3652d..a576b2f0a8 100644 --- a/usr/src/uts/i86xpv/Makefile.files +++ b/usr/src/uts/i86xpv/Makefile.files @@ -22,6 +22,8 @@ # # Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. # +# Copyright 2018 Joyent, Inc. +# # This Makefile defines file modules in the directory uts/i86xpv # and its children. 
These are the source files which are i86xpv @@ -65,7 +67,6 @@ CORE_OBJS += \ instr_size.o \ intr.o \ kboot_mmu.o \ - kdi_subr.o \ kdi_idt.o \ kdi_idthdl.o \ kdi_asm.o \ @@ -160,7 +161,8 @@ SPECIAL_OBJS_64 += \ locore.o \ fast_trap_asm.o \ interrupt.o \ - syscall_asm_amd64.o + syscall_asm_amd64.o \ + kpti_trampolines.o SPECIAL_OBJS += $(SPECIAL_OBJS_$(CLASS)) @@ -252,5 +254,3 @@ ASSYM_DEPS += \ syscall_asm_amd64.o $(KDI_ASSYM_DEPS:%=$(OBJS_DIR)/%): $(DSF_DIR)/$(OBJS_DIR)/kdi_assym.h - -ASSYM_DEPS += kdi_asm.o diff --git a/usr/src/uts/intel/Makefile.rules b/usr/src/uts/intel/Makefile.rules index 0054b66bef..bb63d03166 100644 --- a/usr/src/uts/intel/Makefile.rules +++ b/usr/src/uts/intel/Makefile.rules @@ -272,9 +272,6 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/intel/kdi/%.c $(OBJS_DIR)/%.o: $(UTSBASE)/intel/kdi/%.s $(COMPILE.s) -o $@ $< -$(OBJS_DIR)/%.o: $(UTSBASE)/intel/kdi/$(SUBARCH_DIR)/%.s - $(COMPILE.s) -o $@ $< - $(OBJS_DIR)/%.o: $(UTSBASE)/intel/zfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -489,9 +486,6 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/intel/kdi/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/intel/kdi/%.s @($(LHEAD) $(LINT.s) $< $(LTAIL)) -$(LINTS_DIR)/%.ln: $(UTSBASE)/intel/kdi/$(SUBARCH_DIR)/%.s - @($(LHEAD) $(LINT.s) $< $(LTAIL)) - $(LINTS_DIR)/%.ln: $(UTSBASE)/intel/nskern/%.s @($(LHEAD) $(LINT.s) $< $(LTAIL)) diff --git a/usr/src/uts/intel/amd64/sys/kdi_regs.h b/usr/src/uts/intel/amd64/sys/kdi_regs.h index 945e0f8c95..d7c4e87807 100644 --- a/usr/src/uts/intel/amd64/sys/kdi_regs.h +++ b/usr/src/uts/intel/amd64/sys/kdi_regs.h @@ -33,8 +33,6 @@ extern "C" { #endif -#define KDIREG_NGREG 31 - /* * A modified version of struct regs layout. */ @@ -59,17 +57,20 @@ extern "C" { #define KDIREG_FSBASE 17 #define KDIREG_GSBASE 18 #define KDIREG_KGSBASE 19 -#define KDIREG_DS 20 -#define KDIREG_ES 21 -#define KDIREG_FS 22 -#define KDIREG_GS 23 -#define KDIREG_TRAPNO 24 -#define KDIREG_ERR 25 -#define KDIREG_RIP 26 -#define KDIREG_CS 27 -#define KDIREG_RFLAGS 28 -#define KDIREG_RSP 29 -#define KDIREG_SS 30 +#define KDIREG_CR2 20 +#define KDIREG_DS 21 +#define KDIREG_ES 22 +#define KDIREG_FS 23 +#define KDIREG_GS 24 +#define KDIREG_TRAPNO 25 +#define KDIREG_ERR 26 +#define KDIREG_RIP 27 +#define KDIREG_CS 28 +#define KDIREG_RFLAGS 29 +#define KDIREG_RSP 30 +#define KDIREG_SS 31 + +#define KDIREG_NGREG (KDIREG_SS + 1) #define KDIREG_PC KDIREG_RIP #define KDIREG_SP KDIREG_RSP diff --git a/usr/src/uts/intel/ia32/ml/exception.s b/usr/src/uts/intel/ia32/ml/exception.s index 8b538910e2..82d449f31c 100644 --- a/usr/src/uts/intel/ia32/ml/exception.s +++ b/usr/src/uts/intel/ia32/ml/exception.s @@ -1,7 +1,7 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2014 by Delphix. All rights reserved. - * Copyright (c) 2017 Joyent, Inc. + * Copyright (c) 2018 Joyent, Inc. */ /* @@ -81,7 +81,7 @@ ndptrap_frstor(void) #define NPTRAP_NOERR(trapno) \ pushq $0; \ - pushq $trapno + pushq $trapno #define TRAP_NOERR(trapno) \ XPV_TRAP_POP; \ @@ -93,13 +93,13 @@ ndptrap_frstor(void) */ #define TRAP_ERR(trapno) \ XPV_TRAP_POP; \ - pushq $trapno + pushq $trapno #else /* __xpv && __amd64 */ #define TRAP_NOERR(trapno) \ push $0; \ - push $trapno + push $trapno #define NPTRAP_NOERR(trapno) TRAP_NOERR(trapno) @@ -108,10 +108,24 @@ ndptrap_frstor(void) * onto stack. */ #define TRAP_ERR(trapno) \ - push $trapno + push $trapno #endif /* __xpv && __amd64 */ + /* + * These are the stacks used on cpu0 for taking double faults, + * NMIs and MCEs (the latter two only on amd64 where we have IST). 
+ * + * We define them here instead of in a C file so that we can page-align + * them (gcc won't do that in a .c file). + */ + .data + DGDEF3(dblfault_stack0, DEFAULTSTKSZ, MMU_PAGESIZE) + .fill DEFAULTSTKSZ, 1, 0 + DGDEF3(nmi_stack0, DEFAULTSTKSZ, MMU_PAGESIZE) + .fill DEFAULTSTKSZ, 1, 0 + DGDEF3(mce_stack0, DEFAULTSTKSZ, MMU_PAGESIZE) + .fill DEFAULTSTKSZ, 1, 0 /* * #DE @@ -163,6 +177,12 @@ ndptrap_frstor(void) je 1f leaq brand_sys_sysenter(%rip), %r11 cmpq %r11, 24(%rsp) /* Compare to saved r_rip on the stack */ + je 1f + leaq tr_sys_sysenter(%rip), %r11 + cmpq %r11, 24(%rsp) + je 1f + leaq tr_brand_sys_sysenter(%rip), %r11 + cmpq %r11, 24(%rsp) jne 2f 1: SWAPGS 2: popq %r11 @@ -214,6 +234,10 @@ ndptrap_frstor(void) * the cpu structs for all processors till we find a match for the gdt * of the trapping processor. The stack is expected to be pointing at * the standard regs pushed by hardware on a trap (plus error code and trapno). + * + * It's ok for us to clobber gsbase here (and possibly end up with both gsbase + * and kgsbase set to the same value) because we're not going back the normal + * way out of here (via IRET). Where we're going, we don't need no user %gs. */ #define SET_CPU_GSBASE \ subq $REGOFF_TRAPNO, %rsp; /* save regs */ \ @@ -294,7 +318,7 @@ ndptrap_frstor(void) call av_dispatch_nmivect INTR_POP - IRET + jmp tr_iret_auto /*NOTREACHED*/ SET_SIZE(nmiint) @@ -319,8 +343,8 @@ ndptrap_frstor(void) movl %esp, %ebp - pushl %ebp - call av_dispatch_nmivect + pushl %ebp + call av_dispatch_nmivect addl $4, %esp INTR_POP_USER @@ -433,7 +457,7 @@ ud_push: movq 32(%rsp), %rax /* reload calling RSP */ movq %rbp, (%rax) /* store %rbp there */ popq %rax /* pop off temp */ - IRET /* return from interrupt */ + jmp tr_iret_kernel /* return from interrupt */ /*NOTREACHED*/ ud_leave: @@ -454,7 +478,7 @@ ud_leave: movq %rbp, 32(%rsp) /* store new %rsp */ movq %rax, %rbp /* set new %rbp */ popq %rax /* pop off temp */ - IRET /* return from interrupt */ + jmp tr_iret_kernel /* return from interrupt */ /*NOTREACHED*/ ud_nop: @@ -464,7 +488,7 @@ ud_nop: */ INTR_POP incq (%rsp) - IRET + jmp tr_iret_kernel /*NOTREACHED*/ ud_ret: @@ -475,7 +499,7 @@ ud_ret: movq %rax, 8(%rsp) /* store calling RIP */ addq $8, 32(%rsp) /* adjust new %rsp */ popq %rax /* pop off temp */ - IRET /* return from interrupt */ + jmp tr_iret_kernel /* return from interrupt */ /*NOTREACHED*/ ud_trap: @@ -633,7 +657,7 @@ _emul_done: */ TRAP_NOERR(T_NOEXTFLT) /* $7 */ INTR_PUSH - + /* * We want to do this quickly as every lwp using fp will take this * after a context switch -- we do the frequent path in ndptrap_frstor @@ -709,7 +733,7 @@ _patch_xrstorq_rbx: SWAPGS /* if from user, need swapgs */ LOADCPU(%rax) SWAPGS -2: +2: /* * Xrstor needs to use edx as part of its flag. 
* NOTE: have to push rdx after "cmpw ...24(%rsp)", otherwise rsp+$24 @@ -749,7 +773,7 @@ _patch_xrstorq_rbx: popq %rdx popq %rbx popq %rax - IRET + jmp tr_iret_auto /*NOTREACHED*/ .handle_in_trap: @@ -867,7 +891,7 @@ _patch_xrstor_ebx: 1: addq $DESCTBR_SIZE, %rsp popq %rax - + DFTRAP_PUSH /* @@ -1127,7 +1151,7 @@ check_for_user_address: #endif /* !__amd64 */ ENTRY_NP(resvtrap) - TRAP_NOERR(15) /* (reserved) */ + TRAP_NOERR(T_RESVTRAP) /* (reserved) */ jmp cmntrap SET_SIZE(resvtrap) @@ -1207,15 +1231,10 @@ check_for_user_address: SET_SIZE(xmtrap) ENTRY_NP(invaltrap) - TRAP_NOERR(30) /* very invalid */ + TRAP_NOERR(T_INVALTRAP) /* very invalid */ jmp cmntrap SET_SIZE(invaltrap) - ENTRY_NP(invalint) - TRAP_NOERR(31) /* even more so */ - jmp cmnint - SET_SIZE(invalint) - .globl fasttable #if defined(__amd64) @@ -1286,7 +1305,7 @@ check_for_user_address: ENTRY_NP(fast_null) XPV_TRAP_POP orq $PS_C, 24(%rsp) /* set carry bit in user flags */ - IRET + jmp tr_iret_auto /*NOTREACHED*/ SET_SIZE(fast_null) diff --git a/usr/src/uts/intel/ia32/ml/swtch.s b/usr/src/uts/intel/ia32/ml/swtch.s index 0948fa7c93..6fc38cfbe8 100644 --- a/usr/src/uts/intel/ia32/ml/swtch.s +++ b/usr/src/uts/intel/ia32/ml/swtch.s @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright (c) 2018 Joyent, Inc. */ /* @@ -64,7 +64,7 @@ * The MMU context, therefore, only changes when resuming a thread in * a process different from curproc. * - * resume_from_intr() is called when the thread being resumed was not + * resume_from_intr() is called when the thread being resumed was not * passivated by resume (e.g. was interrupted). This means that the * resume lock is already held and that a restore context is not needed. * Also, the MMU context is not changed on the resume in this case. @@ -235,6 +235,8 @@ resume(kthread_t *t) #if defined(__amd64) + .global kpti_enable + ENTRY(resume) movq %gs:CPU_THREAD, %rax leaq resume_return(%rip), %r11 @@ -305,7 +307,7 @@ resume(kthread_t *t) */ movq CPU_IDLE_THREAD(%r15), %rax /* idle thread pointer */ - /* + /* * Set the idle thread as the current thread */ movq T_SP(%rax), %rsp /* It is safe to set rsp */ @@ -318,7 +320,7 @@ resume(kthread_t *t) GET_THREAD_HATP(%rdi, %r12, %r11) call hat_switch - /* + /* * Clear and unlock previous thread's t_lock * to allow it to be dispatched by another processor. */ @@ -368,13 +370,24 @@ resume(kthread_t *t) * thread -- this will set rsp0 to the wrong value, but it's harmless * as it's a kernel thread, and it won't actually attempt to implicitly * use the rsp0 via a privilege change. + * + * Note that when we have KPTI enabled on amd64, we never use this + * value at all (since all the interrupts have an IST set). */ movq CPU_TSS(%r13), %r14 +#if !defined(__xpv) + cmpq $1, kpti_enable + jne 1f + leaq CPU_KPTI_TR_RSP(%r13), %rax + jmp 2f +1: movq T_STACK(%r12), %rax addq $REGSIZE+MINFRAME, %rax /* to the bottom of thread stack */ -#if !defined(__xpv) +2: movq %rax, TSS_RSP0(%r14) #else + movq T_STACK(%r12), %rax + addq $REGSIZE+MINFRAME, %rax /* to the bottom of thread stack */ movl $KDS_SEL, %edi movq %rax, %rsi call HYPERVISOR_stack_switch @@ -407,7 +420,7 @@ resume(kthread_t *t) movq %rcx, %rdi call restorepctx .norestorepctx: - + STORE_INTR_START(%r12) /* @@ -428,7 +441,7 @@ resume(kthread_t *t) * resuming thread's PC after first setting the priority as low as * possible and blocking all interrupt threads that may be active. 
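The assembly change in resume() above (and the matching one in thread_splitstack_cleanup() below) is easier to read as C. A rough rendering, assuming CPU_KPTI_TR_RSP and T_STACK are the assym-generated offsets of mcpu_kpti.kf_tr_rsp and t_stk respectively, as their use suggests:

	/* What the amd64 resume() fragment now stores into the TSS rsp0. */
	if (kpti_enable == 1) {
		/* Take CPL transitions on the per-cpu KPTI trampoline stack. */
		cpu->cpu_tss->tss_rsp0 =
		    (uint64_t)&cpu->cpu_m.mcpu_kpti.kf_tr_rsp;
	} else {
		/* Pre-KPTI behaviour: bottom of the incoming thread's stack. */
		cpu->cpu_tss->tss_rsp0 =
		    (uint64_t)(t->t_stk + REGSIZE + MINFRAME);
	}

As the source comment notes, with KPTI enabled on amd64 this rsp0 value is never actually consumed, since every gate is given an IST.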
*/ - movq %r13, %rax /* save return address */ + movq %r13, %rax /* save return address */ RESTORE_REGS(%r11) pushq %rax /* push return address for spl0() */ call __dtrace_probe___sched_on__cpu @@ -490,12 +503,12 @@ resume_return: addl $4, %esp .nosavepctx: - /* + /* * Temporarily switch to the idle thread's stack */ movl CPU_IDLE_THREAD(%ebx), %eax /* idle thread pointer */ - /* + /* * Set the idle thread as the current thread */ movl T_SP(%eax), %esp /* It is safe to set esp */ @@ -506,8 +519,8 @@ resume_return: pushl %ecx call hat_switch addl $4, %esp - - /* + + /* * Clear and unlock previous thread's t_lock * to allow it to be dispatched by another processor. */ @@ -673,7 +686,7 @@ resume_from_zombie(kthread_t *t) #endif /* __xpv */ - /* + /* * Temporarily switch to the idle thread's stack so that the zombie * thread's stack can be reclaimed by the reaper. */ @@ -686,7 +699,7 @@ resume_from_zombie(kthread_t *t) */ andq $_BITNOT(STACK_ALIGN-1), %rsp - /* + /* * Set the idle thread as the current thread. */ movq %rax, %gs:CPU_THREAD @@ -695,7 +708,7 @@ resume_from_zombie(kthread_t *t) GET_THREAD_HATP(%rdi, %r12, %r11) call hat_switch - /* + /* * Put the zombie on death-row. */ movq %r13, %rdi @@ -743,14 +756,14 @@ resume_from_zombie_return: movl %eax, %cr0 .zfpu_disabled: - /* + /* * Temporarily switch to the idle thread's stack so that the zombie * thread's stack can be reclaimed by the reaper. */ movl %gs:CPU_IDLE_THREAD, %eax /* idle thread pointer */ movl T_SP(%eax), %esp /* get onto idle thread stack */ - /* + /* * Set the idle thread as the current thread. */ movl %eax, %gs:CPU_THREAD @@ -763,7 +776,7 @@ resume_from_zombie_return: call hat_switch addl $4, %esp - /* + /* * Put the zombie on death-row. */ pushl %esi @@ -814,7 +827,7 @@ resume_from_intr(kthread_t *t) movq T_SP(%r12), %rsp /* restore resuming thread's sp */ xorl %ebp, %ebp /* make $<threadlist behave better */ - /* + /* * Unlock outgoing thread's mutex dispatched by another processor. */ xorl %eax, %eax @@ -864,7 +877,7 @@ resume_from_intr_return: movl T_SP(%edi), %esp /* restore resuming thread's sp */ xorl %ebp, %ebp /* make $<threadlist behave better */ - /* + /* * Unlock outgoing thread's mutex dispatched by another processor. */ xorl %eax,%eax @@ -969,9 +982,15 @@ thread_splitstack_cleanup(void) ENTRY(thread_splitstack_cleanup) LOADCPU(%r8) movq CPU_TSS(%r8), %r9 - movq CPU_THREAD(%r8), %r10 + cmpq $1, kpti_enable + jne 1f + leaq CPU_KPTI_TR_RSP(%r8), %rax + jmp 2f +1: + movq CPU_THREAD(%r8), %r10 movq T_STACK(%r10), %rax - addq $REGSIZE+MINFRAME, %rax + addq $REGSIZE+MINFRAME, %rax +2: movq %rax, TSS_RSP0(%r9) ret SET_SIZE(thread_splitstack_cleanup) diff --git a/usr/src/uts/intel/ia32/os/desctbls.c b/usr/src/uts/intel/ia32/os/desctbls.c index 97024b7b59..3c021bd055 100644 --- a/usr/src/uts/intel/ia32/os/desctbls.c +++ b/usr/src/uts/intel/ia32/os/desctbls.c @@ -24,7 +24,7 @@ */ /* - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. All rights reserved. 
*/ /* @@ -83,6 +83,7 @@ #include <sys/kdi.h> #include <sys/mach_mmu.h> #include <sys/systm.h> +#include <sys/note.h> #ifdef __xpv #include <sys/hypervisor.h> @@ -128,8 +129,13 @@ user_desc_t ucs32_on; user_desc_t ucs32_off; #endif /* __amd64 */ -#pragma align 16(dblfault_stack0) -char dblfault_stack0[DEFAULTSTKSZ]; +/* + * If the size of this is changed, you must update hat_pcp_setup() and the + * definitions in exception.s + */ +extern char dblfault_stack0[DEFAULTSTKSZ]; +extern char nmi_stack0[DEFAULTSTKSZ]; +extern char mce_stack0[DEFAULTSTKSZ]; extern void fast_null(void); extern hrtime_t get_hrtime(void); @@ -310,57 +316,73 @@ get_ssd_base(system_desc_t *dp) /* * Install gate segment descriptor for interrupt, trap, call and task gates. + * + * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on + * all interrupts. We have different ISTs for each class of exceptions that are + * most likely to occur while handling an existing exception; while many of + * these are just going to panic, it's nice not to trample on the existing + * exception state for debugging purposes. + * + * Normal interrupts are all redirected unconditionally to the KPTI trampoline + * stack space. This unifies the trampoline handling between user and kernel + * space (and avoids the need to touch %gs). + * + * The KDI IDT *all* uses the DBG IST: consider single stepping tr_pftrap, when + * we do a read from KMDB that cause another #PF. Without its own IST, this + * would stomp on the kernel's mcpu_kpti_flt frame. */ - -#if defined(__amd64) - -/*ARGSUSED*/ -void -set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel, - uint_t type, uint_t dpl, uint_t vector) +uint_t +idt_vector_to_ist(uint_t vector) { - dp->sgd_looffset = (uintptr_t)func; - dp->sgd_hioffset = (uintptr_t)func >> 16; - dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16); - - dp->sgd_selector = (uint16_t)sel; - - /* - * For 64 bit native we use the IST stack mechanism - * for double faults. All other traps use the CPL = 0 - * (tss_rsp0) stack. - */ -#if !defined(__xpv) - if (vector == T_DBLFLT) - dp->sgd_ist = 1; - else +#if defined(__xpv) + _NOTE(ARGUNUSED(vector)); + return (IST_NONE); +#else + switch (vector) { + /* These should always use IST even without KPTI enabled. */ + case T_DBLFLT: + return (IST_DF); + case T_NMIFLT: + return (IST_NMI); + case T_MCE: + return (IST_MCE); + + case T_BPTFLT: + case T_SGLSTP: + if (kpti_enable == 1) { + return (IST_DBG); + } + return (IST_NONE); + case T_STKFLT: + case T_GPFLT: + case T_PGFLT: + if (kpti_enable == 1) { + return (IST_NESTABLE); + } + return (IST_NONE); + default: + if (kpti_enable == 1) { + return (IST_DEFAULT); + } + return (IST_NONE); + } #endif - dp->sgd_ist = 0; - - dp->sgd_type = type; - dp->sgd_dpl = dpl; - dp->sgd_p = 1; } -#elif defined(__i386) - -/*ARGSUSED*/ void set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel, - uint_t type, uint_t dpl, uint_t unused) + uint_t type, uint_t dpl, uint_t ist) { dp->sgd_looffset = (uintptr_t)func; dp->sgd_hioffset = (uintptr_t)func >> 16; - + dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16); dp->sgd_selector = (uint16_t)sel; - dp->sgd_stkcpy = 0; /* always zero bytes */ + dp->sgd_ist = ist; dp->sgd_type = type; dp->sgd_dpl = dpl; dp->sgd_p = 1; } -#endif /* __i386 */ - /* * Updates a single user descriptor in the the GDT of the current cpu. * Caller is responsible for preventing cpu migration. 
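As a concrete reading of idt_vector_to_ist() above on bare metal with kpti_enable == 1: #DF, NMI and MCE keep dedicated stacks, the debug vectors share a DBG stack with KDI, the faults that can nest inside a trampoline (#SS, #GP, #PF) get a "nestable" stack, and everything else lands on the default KPTI stack. A small illustrative fragment (the IST_* names are symbolic; their numeric slot assignments are made by init_tss() further down):

	uint_t ist;

	ist = idt_vector_to_ist(T_DBLFLT);	/* IST_DF: dedicated #DF stack */
	ist = idt_vector_to_ist(T_NMIFLT);	/* IST_NMI */
	ist = idt_vector_to_ist(T_MCE);		/* IST_MCE */
	ist = idt_vector_to_ist(T_PGFLT);	/* IST_NESTABLE (#SS/#GP/#PF) */
	ist = idt_vector_to_ist(T_SGLSTP);	/* IST_DBG (#DB/#BP, shared with KDI) */
	ist = idt_vector_to_ist(T_SYSCALLINT);	/* IST_DEFAULT (all other vectors) */

With kpti_enable == 0, only #DF, NMI and MCE return an IST; every other vector returns IST_NONE and runs on tss_rsp0 as before.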
@@ -917,22 +939,30 @@ init_gdt(void) static void init_idt_common(gate_desc_t *idt) { - set_gatesegd(&idt[T_ZERODIV], &div0trap, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); - set_gatesegd(&idt[T_SGLSTP], &dbgtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); - set_gatesegd(&idt[T_NMIFLT], &nmiint, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); - set_gatesegd(&idt[T_BPTFLT], &brktrap, KCS_SEL, SDT_SYSIGT, TRP_UPL, - 0); - set_gatesegd(&idt[T_OVFLW], &ovflotrap, KCS_SEL, SDT_SYSIGT, TRP_UPL, - 0); - set_gatesegd(&idt[T_BOUNDFLT], &boundstrap, KCS_SEL, SDT_SYSIGT, - TRP_KPL, 0); - set_gatesegd(&idt[T_ILLINST], &invoptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); - set_gatesegd(&idt[T_NOEXTFLT], &ndptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); + set_gatesegd(&idt[T_ZERODIV], + (kpti_enable == 1) ? &tr_div0trap : &div0trap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV)); + set_gatesegd(&idt[T_SGLSTP], + (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP)); + set_gatesegd(&idt[T_NMIFLT], + (kpti_enable == 1) ? &tr_nmiint : &nmiint, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT)); + set_gatesegd(&idt[T_BPTFLT], + (kpti_enable == 1) ? &tr_brktrap : &brktrap, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT)); + set_gatesegd(&idt[T_OVFLW], + (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW)); + set_gatesegd(&idt[T_BOUNDFLT], + (kpti_enable == 1) ? &tr_boundstrap : &boundstrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT)); + set_gatesegd(&idt[T_ILLINST], + (kpti_enable == 1) ? &tr_invoptrap : &invoptrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST)); + set_gatesegd(&idt[T_NOEXTFLT], + (kpti_enable == 1) ? &tr_ndptrap : &ndptrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT)); /* * double fault handler. @@ -942,90 +972,88 @@ init_idt_common(gate_desc_t *idt) * and/or stack is in a broken state. See xen_failsafe_callback. */ #if !defined(__xpv) -#if defined(__amd64) - - set_gatesegd(&idt[T_DBLFLT], &syserrtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, - T_DBLFLT); - -#elif defined(__i386) - - /* - * task gate required. - */ - set_gatesegd(&idt[T_DBLFLT], NULL, DFTSS_SEL, SDT_SYSTASKGT, TRP_KPL, - 0); - -#endif /* __i386 */ + set_gatesegd(&idt[T_DBLFLT], + (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT)); #endif /* !__xpv */ /* * T_EXTOVRFLT coprocessor-segment-overrun not supported. */ + set_gatesegd(&idt[T_TSSFLT], + (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT)); + set_gatesegd(&idt[T_SEGFLT], + (kpti_enable == 1) ? &tr_segnptrap : &segnptrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT)); + set_gatesegd(&idt[T_STKFLT], + (kpti_enable == 1) ? &tr_stktrap : &stktrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT)); + set_gatesegd(&idt[T_GPFLT], + (kpti_enable == 1) ? &tr_gptrap : &gptrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT)); + set_gatesegd(&idt[T_PGFLT], + (kpti_enable == 1) ? &tr_pftrap : &pftrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT)); + set_gatesegd(&idt[T_EXTERRFLT], + (kpti_enable == 1) ? &tr_ndperr : &ndperr, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT)); + set_gatesegd(&idt[T_ALIGNMENT], + (kpti_enable == 1) ? 
&tr_achktrap : &achktrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT)); + set_gatesegd(&idt[T_MCE], + (kpti_enable == 1) ? &tr_mcetrap : &mcetrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE)); + set_gatesegd(&idt[T_SIMDFPE], + (kpti_enable == 1) ? &tr_xmtrap : &xmtrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE)); - set_gatesegd(&idt[T_TSSFLT], &invtsstrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); - set_gatesegd(&idt[T_SEGFLT], &segnptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); - set_gatesegd(&idt[T_STKFLT], &stktrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0); - set_gatesegd(&idt[T_GPFLT], &gptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0); - set_gatesegd(&idt[T_PGFLT], &pftrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0); - set_gatesegd(&idt[T_EXTERRFLT], &ndperr, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); - set_gatesegd(&idt[T_ALIGNMENT], &achktrap, KCS_SEL, SDT_SYSIGT, - TRP_KPL, 0); - set_gatesegd(&idt[T_MCE], &mcetrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0); - set_gatesegd(&idt[T_SIMDFPE], &xmtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0); - - /* + /* * install "int80" handler at, well, 0x80. */ - set_gatesegd(&idt0[T_INT80], &sys_int80, KCS_SEL, SDT_SYSIGT, TRP_UPL, - 0); + set_gatesegd(&idt0[T_INT80], + (kpti_enable == 1) ? &tr_sys_int80 : &sys_int80, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_INT80)); /* * install fast trap handler at 210. */ - set_gatesegd(&idt[T_FASTTRAP], &fasttrap, KCS_SEL, SDT_SYSIGT, TRP_UPL, - 0); + set_gatesegd(&idt[T_FASTTRAP], + (kpti_enable == 1) ? &tr_fasttrap : &fasttrap, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP)); /* * System call handler. */ -#if defined(__amd64) - set_gatesegd(&idt[T_SYSCALLINT], &sys_syscall_int, KCS_SEL, SDT_SYSIGT, - TRP_UPL, 0); - -#elif defined(__i386) - set_gatesegd(&idt[T_SYSCALLINT], &sys_call, KCS_SEL, SDT_SYSIGT, - TRP_UPL, 0); -#endif /* __i386 */ + set_gatesegd(&idt[T_SYSCALLINT], + (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT)); /* * Install the DTrace interrupt handler for the pid provider. */ - set_gatesegd(&idt[T_DTRACE_RET], &dtrace_ret, KCS_SEL, - SDT_SYSIGT, TRP_UPL, 0); + set_gatesegd(&idt[T_DTRACE_RET], + (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET)); /* -- * Prepare interposing descriptors for the branded "int80" -- * and syscall handlers and cache copies of the default -- * descriptors. + * Prepare interposing descriptors for the branded "int80" + * and syscall handlers and cache copies of the default + * descriptors. */ brand_tbl[0].ih_inum = T_INT80; brand_tbl[0].ih_default_desc = idt0[T_INT80]; - set_gatesegd(&(brand_tbl[0].ih_interp_desc), &brand_sys_int80, KCS_SEL, - SDT_SYSIGT, TRP_UPL, 0); + set_gatesegd(&(brand_tbl[0].ih_interp_desc), + (kpti_enable == 1) ? &tr_brand_sys_int80 : &brand_sys_int80, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_INT80)); brand_tbl[1].ih_inum = T_SYSCALLINT; brand_tbl[1].ih_default_desc = idt0[T_SYSCALLINT]; -#if defined(__amd64) - set_gatesegd(&(brand_tbl[1].ih_interp_desc), &brand_sys_syscall_int, - KCS_SEL, SDT_SYSIGT, TRP_UPL, 0); -#elif defined(__i386) - set_gatesegd(&(brand_tbl[1].ih_interp_desc), &brand_sys_call, - KCS_SEL, SDT_SYSIGT, TRP_UPL, 0); -#endif /* __i386 */ + set_gatesegd(&(brand_tbl[1].ih_interp_desc), + (kpti_enable == 1) ? 
&tr_brand_sys_syscall_int : + &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL, + idt_vector_to_ist(T_SYSCALLINT)); brand_tbl[2].ih_inum = 0; } @@ -1053,27 +1081,53 @@ init_idt(gate_desc_t *idt) * since it can only be generated on a 386 processor. 15 is also * unsupported and reserved. */ - for (i = 0; i < NIDT; i++) +#if !defined(__xpv) + for (i = 0; i < NIDT; i++) { + set_gatesegd(&idt[i], + (kpti_enable == 1) ? &tr_resvtrap : &resvtrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, + idt_vector_to_ist(T_RESVTRAP)); + } +#else + for (i = 0; i < NIDT; i++) { set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); + IST_NONE); + } +#endif /* * 20-31 reserved */ - for (i = 20; i < 32; i++) +#if !defined(__xpv) + for (i = 20; i < 32; i++) { + set_gatesegd(&idt[i], + (kpti_enable == 1) ? &tr_invaltrap : &invaltrap, + KCS_SEL, SDT_SYSIGT, TRP_KPL, + idt_vector_to_ist(T_INVALTRAP)); + } +#else + for (i = 20; i < 32; i++) { set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, - 0); + IST_NONE); + } +#endif /* * interrupts 32 - 255 */ for (i = 32; i < 256; i++) { +#if !defined(__xpv) + (void) snprintf(ivctname, sizeof (ivctname), + (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i); +#else (void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i); +#endif ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0); if (ivctptr == NULL) panic("kobj_getsymvalue(%s) failed", ivctname); - set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0); + set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL, + idt_vector_to_ist(i)); } /* @@ -1102,67 +1156,39 @@ init_ldt(void) } #if !defined(__xpv) -#if defined(__amd64) static void init_tss(void) { - /* - * tss_rsp0 is dynamically filled in by resume() on each context switch. - * All exceptions but #DF will run on the thread stack. - * Set up the double fault stack here. - */ - ktss0->tss_ist1 = - (uint64_t)&dblfault_stack0[sizeof (dblfault_stack0)]; + extern struct cpu cpus[]; /* - * Set I/O bit map offset equal to size of TSS segment limit - * for no I/O permission map. This will force all user I/O - * instructions to generate #gp fault. + * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each + * context switch but it'll be overwritten with this same value anyway. */ - ktss0->tss_bitmapbase = sizeof (*ktss0); - - /* - * Point %tr to descriptor for ktss0 in gdt. - */ - wr_tsr(KTSS_SEL); -} + if (kpti_enable == 1) { + ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp; + } -#elif defined(__i386) + /* Set up the IST stacks for double fault, NMI, MCE. */ + ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)]; + ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)]; + ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)]; -static void -init_tss(void) -{ /* - * ktss0->tss_esp dynamically filled in by resume() on each - * context switch. + * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is + * enabled), and also for KDI (always). */ - ktss0->tss_ss0 = KDS_SEL; - ktss0->tss_eip = (uint32_t)_start; - ktss0->tss_ds = ktss0->tss_es = ktss0->tss_ss = KDS_SEL; - ktss0->tss_cs = KCS_SEL; - ktss0->tss_fs = KFS_SEL; - ktss0->tss_gs = KGS_SEL; - ktss0->tss_ldt = ULDT_SEL; + ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp; - /* - * Initialize double fault tss. - */ - dftss0->tss_esp0 = (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)]; - dftss0->tss_ss0 = KDS_SEL; + if (kpti_enable == 1) { + /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. 
*/ + ktss0->tss_ist5 = + (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp; - /* - * tss_cr3 will get initialized in hat_kern_setup() once our page - * tables have been setup. - */ - dftss0->tss_eip = (uint32_t)syserrtrap; - dftss0->tss_esp = (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)]; - dftss0->tss_cs = KCS_SEL; - dftss0->tss_ds = KDS_SEL; - dftss0->tss_es = KDS_SEL; - dftss0->tss_ss = KDS_SEL; - dftss0->tss_fs = KFS_SEL; - dftss0->tss_gs = KGS_SEL; + /* This IST stack is used for all other intrs (for KPTI). */ + ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp; + } /* * Set I/O bit map offset equal to size of TSS segment limit @@ -1177,7 +1203,6 @@ init_tss(void) wr_tsr(KTSS_SEL); } -#endif /* __i386 */ #endif /* !__xpv */ #if defined(__xpv) @@ -1234,6 +1259,9 @@ init_desctbls(void) { user_desc_t *gdt; desctbr_t idtr; +#if defined(__amd64) + extern uint64_t kpti_safe_cr3; +#endif /* * Allocate IDT and TSS structures on unique pages for better @@ -1269,6 +1297,14 @@ init_desctbls(void) CPU->cpu_gdt = gdt; /* + * Initialize this CPU's LDT. + */ + CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA, + LDT_CPU_SIZE, PAGESIZE); + bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE); + CPU->cpu_m.mcpu_ldt_len = 0; + + /* * Setup and install our IDT. */ init_idt(idt0); @@ -1289,6 +1325,9 @@ init_desctbls(void) init_tss(); CPU->cpu_tss = ktss0; init_ldt(); + + /* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */ + kpti_safe_cr3 = (uint64_t)getcr3(); } #endif /* __xpv */ @@ -1349,15 +1388,26 @@ brand_interpositioning_enable(void) #else if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) { - wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall); - wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32); + if (kpti_enable == 1) { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32); + } else { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32); + } } #endif #endif /* __amd64 */ - if (is_x86_feature(x86_featureset, X86FSET_SEP)) - wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter); + if (is_x86_feature(x86_featureset, X86FSET_SEP)) { + if (kpti_enable == 1) { + wrmsr(MSR_INTC_SEP_EIP, + (uintptr_t)tr_brand_sys_sysenter); + } else { + wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter); + } + } } /* @@ -1393,13 +1443,23 @@ brand_interpositioning_disable(void) #else if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) { - wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall); - wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32); + if (kpti_enable == 1) { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32); + } else { + wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall); + wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32); + } } #endif #endif /* __amd64 */ - if (is_x86_feature(x86_featureset, X86FSET_SEP)) - wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter); + if (is_x86_feature(x86_featureset, X86FSET_SEP)) { + if (kpti_enable == 1) { + wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter); + } else { + wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter); + } + } } diff --git a/usr/src/uts/intel/ia32/os/sysi86.c b/usr/src/uts/intel/ia32/os/sysi86.c index 7be9ec20fd..cd1129ea1f 100644 --- a/usr/src/uts/intel/ia32/os/sysi86.c +++ b/usr/src/uts/intel/ia32/os/sysi86.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ @@ -60,6 +61,7 @@ #include <sys/cmn_err.h> #include <sys/segments.h> #include <sys/clock.h> +#include <vm/hat_i86.h> #if defined(__xpv) #include <sys/hypervisor.h> #include <sys/note.h> @@ -346,7 +348,19 @@ ldt_load(void) xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc), curproc->p_ldtlimit + 1); #else - *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc; + size_t len; + system_desc_t desc; + + /* + * Before we can use the LDT on this CPU, we must install the LDT in the + * user mapping table. + */ + len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t); + bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len); + CPU->cpu_m.mcpu_ldt_len = len; + set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL); + *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc; + wr_ldtr(ULDT_SEL); #endif } @@ -363,6 +377,9 @@ ldt_unload(void) #else *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc; wr_ldtr(0); + + bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len); + CPU->cpu_m.mcpu_ldt_len = 0; #endif } @@ -714,7 +731,8 @@ ldt_alloc(proc_t *pp, uint_t seli) ASSERT(pp->p_ldtlimit == 0); /* - * Allocate new LDT just large enough to contain seli. + * Allocate new LDT just large enough to contain seli. The LDT must + * always be allocated in units of pages for KPTI. */ ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); nsels = ldtsz / sizeof (user_desc_t); @@ -832,7 +850,8 @@ ldt_grow(proc_t *pp, uint_t seli) ASSERT(pp->p_ldtlimit != 0); /* - * Allocate larger LDT just large enough to contain seli. + * Allocate larger LDT just large enough to contain seli. The LDT must + * always be allocated in units of pages for KPTI. */ nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE); nsels = nldtsz / sizeof (user_desc_t); diff --git a/usr/src/uts/intel/ia32/sys/trap.h b/usr/src/uts/intel/ia32/sys/trap.h index cc41d102a8..4165f1289e 100644 --- a/usr/src/uts/intel/ia32/sys/trap.h +++ b/usr/src/uts/intel/ia32/sys/trap.h @@ -25,6 +25,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ #ifndef _IA32_SYS_TRAP_H @@ -53,11 +55,13 @@ extern "C" { #define T_STKFLT 0xc /* #ss stack fault */ #define T_GPFLT 0xd /* #gp general protection fault */ #define T_PGFLT 0xe /* #pf page fault */ +#define T_RESVTRAP 0xf /* reserved */ #define T_EXTERRFLT 0x10 /* #mf x87 FPU error fault */ #define T_ALIGNMENT 0x11 /* #ac alignment check error */ #define T_MCE 0x12 /* #mc machine check exception */ #define T_SIMDFPE 0x13 /* #xm SSE/SSE exception */ #define T_DBGENTR 0x14 /* debugger entry */ +#define T_INVALTRAP 0x1e /* invalid */ #define T_ENDPERR 0x21 /* emulated extension error flt */ #define T_ENOEXTFLT 0x20 /* emulated ext not present */ #define T_FASTTRAP 0xd2 /* fast system call */ @@ -102,6 +106,22 @@ extern "C" { #define T_LASTFAST 6 /* Last valid subfunction */ +/* + * Offsets for an interrupt/trap frame. 
+ */ +#define T_FRAME_ERR 0 +#define T_FRAME_RIP 8 +#define T_FRAME_CS 16 +#define T_FRAME_RFLAGS 24 +#define T_FRAME_RSP 32 +#define T_FRAME_SS 40 + +#define T_FRAMERET_RIP 0 +#define T_FRAMERET_CS 8 +#define T_FRAMERET_RFLAGS 16 +#define T_FRAMERET_RSP 24 +#define T_FRAMERET_SS 32 + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/intel/kdi/ia32/kdi_asm.s b/usr/src/uts/intel/kdi/ia32/kdi_asm.s deleted file mode 100644 index 4f7e2ec7ad..0000000000 --- a/usr/src/uts/intel/kdi/ia32/kdi_asm.s +++ /dev/null @@ -1,662 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - * - * Copyright 2018 Joyent, Inc. - */ - -/* - * Debugger entry for both master and slave CPUs - */ - -#if defined(__lint) -#include <sys/types.h> -#endif - -#include <sys/segments.h> -#include <sys/asm_linkage.h> -#include <sys/controlregs.h> -#include <sys/x86_archext.h> -#include <sys/privregs.h> -#include <sys/machprivregs.h> -#include <sys/kdi_regs.h> -#include <sys/uadmin.h> -#include <sys/psw.h> - -#ifdef _ASM - -#include <kdi_assym.h> -#include <assym.h> - -/* clobbers %edx, %ecx, returns addr in %eax, cpu id in %ebx */ -#define GET_CPUSAVE_ADDR \ - movl %gs:CPU_ID, %ebx; \ - movl %ebx, %eax; \ - movl $KRS_SIZE, %ecx; \ - mull %ecx; \ - movl $kdi_cpusave, %edx; \ - /*CSTYLED*/ \ - addl (%edx), %eax - -/* - * Save copies of the IDT and GDT descriptors. Note that we only save the IDT - * and GDT if the IDT isn't ours, as we may be legitimately re-entering the - * debugger through the trap handler. We don't want to clobber the saved IDT - * in the process, as we'd end up resuming the world on our IDT. - */ -#define SAVE_IDTGDT \ - movl %gs:CPU_IDT, %edx; \ - cmpl $kdi_idt, %edx; \ - je 1f; \ - movl %edx, KRS_IDT(%eax); \ - movl %gs:CPU_GDT, %edx; \ - movl %edx, KRS_GDT(%eax); \ -1: - -/* - * Given the address of the current CPU's cpusave area in %edi, the following - * macro restores the debugging state to said CPU. Restored state includes - * the debug registers from the global %dr variables. 
- */ -#define KDI_RESTORE_DEBUGGING_STATE \ - leal kdi_drreg, %ebx; \ - \ - pushl DR_CTL(%ebx); \ - pushl $7; \ - call kdi_dreg_set; \ - addl $8, %esp; \ - \ - pushl $KDIREG_DRSTAT_RESERVED; \ - pushl $6; \ - call kdi_dreg_set; \ - addl $8, %esp; \ - \ - pushl DRADDR_OFF(0)(%ebx); \ - pushl $0; \ - call kdi_dreg_set; \ - addl $8, %esp; \ - \ - pushl DRADDR_OFF(1)(%ebx); \ - pushl $1; \ - call kdi_dreg_set; \ - addl $8, %esp; \ - \ - pushl DRADDR_OFF(2)(%ebx); \ - pushl $2; \ - call kdi_dreg_set; \ - addl $8, %esp; \ - \ - pushl DRADDR_OFF(3)(%ebx); \ - pushl $3; \ - call kdi_dreg_set; \ - addl $8, %esp; - -#define KDI_RESTORE_REGS() \ - /* Discard savfp and savpc */ \ - addl $8, %esp; \ - popl %ss; \ - popl %gs; \ - popl %fs; \ - popl %es; \ - popl %ds; \ - popal; \ - /* Discard trapno and err */ \ - addl $8, %esp - -/* - * Each cpusave buffer has an area set aside for a ring buffer of breadcrumbs. - * The following macros manage the buffer. - */ - -/* Advance the ring buffer */ -#define ADVANCE_CRUMB_POINTER(cpusave, tmp1, tmp2) \ - movl KRS_CURCRUMBIDX(cpusave), tmp1; \ - cmpl $[KDI_NCRUMBS - 1], tmp1; \ - jge 1f; \ - /* Advance the pointer and index */ \ - addl $1, tmp1; \ - movl tmp1, KRS_CURCRUMBIDX(cpusave); \ - movl KRS_CURCRUMB(cpusave), tmp1; \ - addl $KRM_SIZE, tmp1; \ - jmp 2f; \ -1: /* Reset the pointer and index */ \ - movw $0, KRS_CURCRUMBIDX(cpusave); \ - leal KRS_CRUMBS(cpusave), tmp1; \ -2: movl tmp1, KRS_CURCRUMB(cpusave); \ - /* Clear the new crumb */ \ - movl $KDI_NCRUMBS, tmp2; \ -3: movl $0, -4(tmp1, tmp2, 4); \ - decl tmp2; \ - jnz 3b - -/* Set a value in the current breadcrumb buffer */ -#define ADD_CRUMB(cpusave, offset, value, tmp) \ - movl KRS_CURCRUMB(cpusave), tmp; \ - movl value, offset(tmp) - -#endif /* _ASM */ - -/* - * The main entry point for master CPUs. It also serves as the trap handler - * for all traps and interrupts taken during single-step. - */ -#if defined(__lint) -void -kdi_cmnint(void) -{ -} -#else /* __lint */ - - /* XXX implement me */ - ENTRY_NP(kdi_nmiint) - clr %ecx - movl (%ecx), %ecx - SET_SIZE(kdi_nmiint) - - ENTRY_NP(kdi_cmnint) - ALTENTRY(kdi_master_entry) - - /* Save all registers and selectors */ - - pushal - pushl %ds - pushl %es - pushl %fs - pushl %gs - pushl %ss - - subl $8, %esp - movl %ebp, REG_OFF(KDIREG_SAVFP)(%esp) - movl REG_OFF(KDIREG_EIP)(%esp), %eax - movl %eax, REG_OFF(KDIREG_SAVPC)(%esp) - - /* - * If the kernel has started using its own selectors, we should too. - * Update our saved selectors if they haven't been updated already. - */ - movw %cs, %ax - cmpw $KCS_SEL, %ax - jne 1f /* The kernel hasn't switched yet */ - - movw $KDS_SEL, %ax - movw %ax, %ds - movw kdi_cs, %ax - cmpw $KCS_SEL, %ax - je 1f /* We already switched */ - - /* - * The kernel switched, but we haven't. Update our saved selectors - * to match the kernel's copies for use below. - */ - movl $KCS_SEL, kdi_cs - movl $KDS_SEL, kdi_ds - movl $KFS_SEL, kdi_fs - movl $KGS_SEL, kdi_gs - -1: - /* - * Set the selectors to a known state. If we come in from kmdb's IDT, - * we'll be on boot's %cs. This will cause GET_CPUSAVE_ADDR to return - * CPU 0's cpusave, regardless of which CPU we're on, and chaos will - * ensue. So, if we've got $KCSSEL in kdi_cs, switch to it. The other - * selectors are restored normally. 
- */ - movw %cs:kdi_cs, %ax - cmpw $KCS_SEL, %ax - jne 1f - ljmp $KCS_SEL, $1f -1: - movw %cs:kdi_ds, %ds - movw kdi_ds, %es - movw kdi_fs, %fs - movw kdi_gs, %gs - movw kdi_ds, %ss - - /* - * This has to come after we set %gs to the kernel descriptor. Since - * we've hijacked some IDT entries used in user-space such as the - * breakpoint handler, we can enter kdi_cmnint() with GDT_LWPGS used - * in %gs. On the hypervisor, CLI() needs GDT_GS to access the machcpu. - */ - CLI(%eax) - -#if defined(__xpv) - /* - * Clear saved_upcall_mask in unused byte of cs slot on stack. - * It can only confuse things. - */ - movb $0, REG_OFF(KDIREG_CS)+2(%esp) - -#endif - - GET_CPUSAVE_ADDR /* %eax = cpusave, %ebx = CPU ID */ - - ADVANCE_CRUMB_POINTER(%eax, %ecx, %edx) - - ADD_CRUMB(%eax, KRM_CPU_STATE, $KDI_CPU_STATE_MASTER, %edx) - - movl REG_OFF(KDIREG_EIP)(%esp), %ecx - ADD_CRUMB(%eax, KRM_PC, %ecx, %edx) - ADD_CRUMB(%eax, KRM_SP, %esp, %edx) - movl REG_OFF(KDIREG_TRAPNO)(%esp), %ecx - ADD_CRUMB(%eax, KRM_TRAPNO, %ecx, %edx) - - movl %esp, %ebp - pushl %eax - - /* - * Were we in the debugger when we took the trap (i.e. was %esp in one - * of the debugger's memory ranges)? - */ - leal kdi_memranges, %ecx - movl kdi_nmemranges, %edx -1: cmpl MR_BASE(%ecx), %esp - jl 2f /* below this range -- try the next one */ - cmpl MR_LIM(%ecx), %esp - jg 2f /* above this range -- try the next one */ - jmp 3f /* matched within this range */ - -2: decl %edx - jz kdi_save_common_state /* %esp not within debugger memory */ - addl $MR_SIZE, %ecx - jmp 1b - -3: /* - * %esp was within one of the debugger's memory ranges. This should - * only happen when we take a trap while running in the debugger. - * kmdb_dpi_handle_fault will determine whether or not it was an - * expected trap, and will take the appropriate action. - */ - - pushl %ebx /* cpuid */ - - movl REG_OFF(KDIREG_ESP)(%ebp), %ecx - addl $REG_OFF(KDIREG_EFLAGS - KDIREG_EAX), %ecx - pushl %ecx - - pushl REG_OFF(KDIREG_EIP)(%ebp) - pushl REG_OFF(KDIREG_TRAPNO)(%ebp) - - call kdi_dvec_handle_fault - addl $16, %esp - - /* - * If we're here, we ran into a debugger problem, and the user - * elected to solve it by having the debugger debug itself. The - * state we're about to save is that of the debugger when it took - * the fault. - */ - - jmp kdi_save_common_state - - SET_SIZE(kdi_master_entry) - SET_SIZE(kdi_cmnint) - -#endif /* __lint */ - -/* - * The cross-call handler for slave CPUs. - * - * The debugger is single-threaded, so only one CPU, called the master, may be - * running it at any given time. The other CPUs, known as slaves, spin in a - * busy loop until there's something for them to do. This is the entry point - * for the slaves - they'll be sent here in response to a cross-call sent by the - * master. - */ - -#if defined(__lint) -void -kdi_slave_entry(void) -{ -} -#else /* __lint */ - ENTRY_NP(kdi_slave_entry) - - /* - * Cross calls are implemented as function calls, so our stack - * currently looks like one you'd get from a zero-argument function - * call. There's an %eip at %esp, and that's about it. We want to - * make it look like the master CPU's stack. By doing this, we can - * use the same resume code for both master and slave. We need to - * make our stack look like a `struct regs' before we jump into the - * common save routine. 
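The loop over kdi_memranges above decides whether the interrupted %esp lies inside one of the debugger's own memory ranges, i.e. whether the trap was taken while already running in kmdb. A C paraphrase of that scan, with a made-up range table, looks like this:

#include <stdio.h>
#include <stdint.h>

struct memrange {        /* stand-in for the MR_BASE/MR_LIM layout */
    uintptr_t mr_base;
    uintptr_t mr_lim;
};

static struct memrange kdi_memranges[] = {
    { 0xfe800000, 0xfeffffff },    /* fabricated debugger range */
};
static int kdi_nmemranges = 1;

static int
sp_in_debugger(uintptr_t sp)
{
    for (int i = 0; i < kdi_nmemranges; i++) {
        if (sp >= kdi_memranges[i].mr_base &&
            sp <= kdi_memranges[i].mr_lim)
            return (1);    /* fault taken while in the debugger */
    }
    return (0);            /* normal world entry: save common state */
}

int
main(void)
{
    printf("%d\n", sp_in_debugger(0xfe812340));
    printf("%d\n", sp_in_debugger(0xd0001000));
    return (0);
}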
- */ - - pushl %cs - pushfl - pushl $-1 /* A phony trap error code */ - pushl $-1 /* A phony trap number */ - pushal - pushl %ds - pushl %es - pushl %fs - pushl %gs - pushl %ss - - subl $8, %esp - movl %ebp, REG_OFF(KDIREG_SAVFP)(%esp) - movl REG_OFF(KDIREG_EIP)(%esp), %eax - movl %eax, REG_OFF(KDIREG_SAVPC)(%esp) - - /* - * Swap our saved EFLAGS and %eip. Each is where the other - * should be. - */ - movl REG_OFF(KDIREG_EFLAGS)(%esp), %eax - xchgl REG_OFF(KDIREG_EIP)(%esp), %eax - movl %eax, REG_OFF(KDIREG_EFLAGS)(%esp) - - /* - * Our stack now matches struct regs, and is irettable. We don't need - * to do anything special for the hypervisor w.r.t. PS_IE since we - * iret twice anyway; the second iret back to the hypervisor - * will re-enable interrupts. - */ - CLI(%eax) - - /* Load sanitized segment selectors */ - movw kdi_ds, %ds - movw kdi_ds, %es - movw kdi_fs, %fs - movw kdi_gs, %gs - movw kdi_ds, %ss - - GET_CPUSAVE_ADDR /* %eax = cpusave, %ebx = CPU ID */ - - ADVANCE_CRUMB_POINTER(%eax, %ecx, %edx) - - ADD_CRUMB(%eax, KRM_CPU_STATE, $KDI_CPU_STATE_SLAVE, %edx) - - movl REG_OFF(KDIREG_EIP)(%esp), %ecx - ADD_CRUMB(%eax, KRM_PC, %ecx, %edx) - - pushl %eax - jmp kdi_save_common_state - - SET_SIZE(kdi_slave_entry) - -#endif /* __lint */ - -/* - * The state of the world: - * - * The stack has a complete set of saved registers and segment - * selectors, arranged in `struct regs' order (or vice-versa), up to - * and including EFLAGS. It also has a pointer to our cpusave area. - * - * We need to save a pointer to these saved registers. We also want - * to adjust the saved %esp - it should point just beyond the saved - * registers to the last frame of the thread we interrupted. Finally, - * we want to clear out bits 16-31 of the saved selectors, as the - * selector pushls don't automatically clear them. - */ -#if !defined(__lint) - - ENTRY_NP(kdi_save_common_state) - - popl %eax /* the cpusave area */ - - movl %esp, KRS_GREGS(%eax) /* save ptr to current saved regs */ - - addl $REG_OFF(KDIREG_EFLAGS - KDIREG_EAX), KDIREG_OFF(KDIREG_ESP)(%esp) - - andl $0xffff, KDIREG_OFF(KDIREG_SS)(%esp) - andl $0xffff, KDIREG_OFF(KDIREG_GS)(%esp) - andl $0xffff, KDIREG_OFF(KDIREG_FS)(%esp) - andl $0xffff, KDIREG_OFF(KDIREG_ES)(%esp) - andl $0xffff, KDIREG_OFF(KDIREG_DS)(%esp) - - pushl %eax - call kdi_trap_pass - cmpl $1, %eax - je kdi_pass_to_kernel - popl %eax - - SAVE_IDTGDT - -#if !defined(__xpv) - /* Save off %cr0, and clear write protect */ - movl %cr0, %ecx - movl %ecx, KRS_CR0(%eax) - andl $_BITNOT(CR0_WP), %ecx - movl %ecx, %cr0 -#endif - pushl %edi - movl %eax, %edi - - /* Save the debug registers and disable any active watchpoints */ - pushl $7 - call kdi_dreg_get - addl $4, %esp - - movl %eax, KRS_DRCTL(%edi) - andl $_BITNOT(KDIREG_DRCTL_WPALLEN_MASK), %eax - - pushl %eax - pushl $7 - call kdi_dreg_set - addl $8, %esp - - pushl $6 - call kdi_dreg_get - addl $4, %esp - movl %eax, KRS_DRSTAT(%edi) - - pushl $0 - call kdi_dreg_get - addl $4, %esp - movl %eax, KRS_DROFF(0)(%edi) - - pushl $1 - call kdi_dreg_get - addl $4, %esp - movl %eax, KRS_DROFF(1)(%edi) - - pushl $2 - call kdi_dreg_get - addl $4, %esp - movl %eax, KRS_DROFF(2)(%edi) - - pushl $3 - call kdi_dreg_get - addl $4, %esp - movl %eax, KRS_DROFF(3)(%edi) - - movl %edi, %eax - popl %edi - - clr %ebp /* stack traces should end here */ - - pushl %eax - call kdi_debugger_entry - popl %eax - - jmp kdi_resume - - SET_SIZE(kdi_save_common_state) - -#endif /* !__lint */ - -/* - * Resume the world. 
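A hardware trap pushes EFLAGS, %cs and %eip in that order, so %eip ends up nearest the stack pointer; the cross-call path starts with only a return %eip on the stack and pushes %cs and EFLAGS afterwards, leaving %eip and EFLAGS in each other's slots. The exchange performed above is modelled by this small C sketch; the frame layout names and the values (including the %cs selector) are illustrative only.

#include <stdio.h>
#include <stdint.h>

struct fake_frame {      /* lowest stack address first */
    uint32_t eip;        /* slot hardware would use for %eip */
    uint32_t cs;
    uint32_t eflags;     /* slot hardware would use for EFLAGS */
};

static void
fix_slave_frame(struct fake_frame *f)
{
    /* the slave built the frame with %eip and EFLAGS transposed */
    uint32_t tmp = f->eip;
    f->eip = f->eflags;
    f->eflags = tmp;
}

int
main(void)
{
    /* as initially built: EFLAGS landed in the eip slot and vice versa */
    struct fake_frame f = {
        .eip = 0x00000202,       /* an EFLAGS-looking value */
        .cs = 0x30,              /* placeholder code selector */
        .eflags = 0xfb123456     /* a code-address-looking value */
    };
    fix_slave_frame(&f);
    printf("eip=%#x eflags=%#x\n", f.eip, f.eflags);
    return (0);
}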
The code that calls kdi_resume has already - * decided whether or not to restore the IDT. - */ -#if defined(__lint) -void -kdi_resume(void) -{ -} -#else /* __lint */ - - /* cpusave in %eax */ - ENTRY_NP(kdi_resume) - - /* - * Send this CPU back into the world - */ - -#if !defined(__xpv) - movl KRS_CR0(%eax), %edx - movl %edx, %cr0 -#endif - - pushl %edi - movl %eax, %edi - - KDI_RESTORE_DEBUGGING_STATE - - popl %edi - -#if defined(__xpv) - /* - * kmdb might have set PS_T in the saved eflags, so we can't use - * intr_restore, since that restores all of eflags; instead, just - * pick up PS_IE from the saved eflags. - */ - movl REG_OFF(KDIREG_EFLAGS)(%esp), %eax - testl $PS_IE, %eax - jz 2f - STI -2: -#endif - - addl $8, %esp /* Discard savfp and savpc */ - - popl %ss - popl %gs - popl %fs - popl %es - popl %ds - popal - - addl $8, %esp /* Discard TRAPNO and ERROR */ - - IRET - - SET_SIZE(kdi_resume) -#endif /* __lint */ - -#if !defined(__lint) - - ENTRY_NP(kdi_pass_to_kernel) - - /* pop cpusave, leaving %esp pointing to saved regs */ - popl %eax - - movl $KDI_CPU_STATE_NONE, KRS_CPU_STATE(%eax) - - /* - * Find the trap and vector off the right kernel handler. The trap - * handler will expect the stack to be in trap order, with %eip being - * the last entry, so we'll need to restore all our regs. - * - * We're hard-coding the three cases where KMDB has installed permanent - * handlers, since after we restore, we don't have registers to work - * with; we can't use a global since other CPUs can easily pass through - * here at the same time. - * - * Note that we handle T_DBGENTR since userspace might have tried it. - */ - movl REG_OFF(KDIREG_TRAPNO)(%esp), %eax - cmpl $T_SGLSTP, %eax - je kpass_dbgtrap - cmpl $T_BPTFLT, %eax - je kpass_brktrap - cmpl $T_DBGENTR, %eax - je kpass_invaltrap - /* - * Hmm, unknown handler. Somebody forgot to update this when they - * added a new trap interposition... try to drop back into kmdb. - */ - int $T_DBGENTR - -kpass_dbgtrap: - KDI_RESTORE_REGS() - ljmp $KCS_SEL, $1f -1: jmp %cs:dbgtrap - /*NOTREACHED*/ - -kpass_brktrap: - KDI_RESTORE_REGS() - ljmp $KCS_SEL, $2f -2: jmp %cs:brktrap - /*NOTREACHED*/ - -kpass_invaltrap: - KDI_RESTORE_REGS() - ljmp $KCS_SEL, $3f -3: jmp %cs:invaltrap - /*NOTREACHED*/ - - SET_SIZE(kdi_pass_to_kernel) - - /* - * A minimal version of mdboot(), to be used by the master CPU only. - */ - ENTRY_NP(kdi_reboot) - - pushl $AD_BOOT - pushl $A_SHUTDOWN - call *psm_shutdownf - addl $8, %esp - -#if defined(__xpv) - pushl $SHUTDOWN_reboot - call HYPERVISOR_shutdown -#else - call reset -#endif - /*NOTREACHED*/ - - SET_SIZE(kdi_reboot) - -#endif /* !__lint */ - -#if defined(__lint) -/*ARGSUSED*/ -void -kdi_cpu_debug_init(kdi_cpusave_t *save) -{ -} -#else /* __lint */ - - ENTRY_NP(kdi_cpu_debug_init) - pushl %ebp - movl %esp, %ebp - - pushl %edi - pushl %ebx - - movl 8(%ebp), %edi - - KDI_RESTORE_DEBUGGING_STATE - - popl %ebx - popl %edi - leave - ret - - SET_SIZE(kdi_cpu_debug_init) -#endif /* !__lint */ - diff --git a/usr/src/uts/intel/kdi/amd64/kdi_asm.s b/usr/src/uts/intel/kdi/kdi_asm.s index ea6f404af4..9e5bbc110f 100644 --- a/usr/src/uts/intel/kdi/amd64/kdi_asm.s +++ b/usr/src/uts/intel/kdi/kdi_asm.s @@ -27,12 +27,13 @@ */ /* - * Debugger entry for both master and slave CPUs + * Debugger entry and exit for both master and slave CPUs. kdi_idthdl.s contains + * the IDT stubs that drop into here (mainly via kdi_cmnint). 
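kdi_pass_to_kernel above forwards only the traps kmdb interposes on permanently (#DB, #BP and the programmed debugger-entry vector) to the kernel's own handlers; anything else drops back into the debugger via int $T_DBGENTR. In C, the dispatch amounts to the following; the handler functions are placeholders standing in for dbgtrap, brktrap and invaltrap.

#include <stdio.h>

#define T_SGLSTP   1     /* single-step / #DB */
#define T_BPTFLT   3     /* breakpoint / #BP */
#define T_DBGENTR  20    /* programmed debugger entry (0x14) */

static void dbgtrap(void)      { printf("kernel #DB handler\n"); }
static void brktrap(void)      { printf("kernel #BP handler\n"); }
static void invaltrap(void)    { printf("kernel debugger-entry handler\n"); }
static void reenter_kmdb(void) { printf("unknown trap: back into kmdb\n"); }

static void
pass_to_kernel(int trapno)
{
    switch (trapno) {
    case T_SGLSTP:  dbgtrap();   break;
    case T_BPTFLT:  brktrap();   break;
    case T_DBGENTR: invaltrap(); break;
    default:        reenter_kmdb(); break;    /* "int $T_DBGENTR" */
    }
}

int
main(void)
{
    pass_to_kernel(T_BPTFLT);
    pass_to_kernel(14);
    return (0);
}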
*/ #if defined(__lint) #include <sys/types.h> -#endif +#else #include <sys/segments.h> #include <sys/asm_linkage.h> @@ -46,9 +47,6 @@ #ifdef __xpv #include <sys/hypervisor.h> #endif - -#ifdef _ASM - #include <kdi_assym.h> #include <assym.h> @@ -80,6 +78,9 @@ #ifdef __xpv +/* + * Already on kernel gsbase via the hypervisor. + */ #define SAVE_GSBASE(reg) /* nothing */ #define RESTORE_GSBASE(reg) /* nothing */ @@ -90,8 +91,16 @@ rdmsr; \ shlq $32, %rdx; \ orq %rax, %rdx; \ - movq %rdx, REG_OFF(KDIREG_GSBASE)(base) + movq %rdx, REG_OFF(KDIREG_GSBASE)(base); \ + movl $MSR_AMD_KGSBASE, %ecx; \ + rdmsr; \ + shlq $32, %rdx; \ + orq %rax, %rdx; \ + movq %rdx, REG_OFF(KDIREG_KGSBASE)(base) +/* + * We shouldn't have stomped on KGSBASE, so don't try to restore it. + */ #define RESTORE_GSBASE(base) \ movq REG_OFF(KDIREG_GSBASE)(base), %rdx; \ movq %rdx, %rax; \ @@ -102,9 +111,7 @@ #endif /* __xpv */ /* - * %ss, %rsp, %rflags, %cs, %rip, %err, %trapno are already on the stack. Note - * that on the hypervisor, we skip the save/restore of GSBASE: it's slow, and - * unnecessary. + * %ss, %rsp, %rflags, %cs, %rip, %err, %trapno are already on the stack. */ #define KDI_SAVE_REGS(base) \ movq %rdi, REG_OFF(KDIREG_RDI)(base); \ @@ -125,6 +132,8 @@ movq %rbp, REG_OFF(KDIREG_SAVFP)(base); \ movq REG_OFF(KDIREG_RIP)(base), %rax; \ movq %rax, REG_OFF(KDIREG_SAVPC)(base); \ + movq %cr2, %rax; \ + movq %rax, REG_OFF(KDIREG_CR2)(base); \ clrq %rax; \ movw %ds, %ax; \ movq %rax, REG_OFF(KDIREG_DS)(base); \ @@ -143,6 +152,8 @@ movw %ax, %es; \ movq REG_OFF(KDIREG_DS)(%rdi), %rax; \ movw %ax, %ds; \ + movq REG_OFF(KDIREG_CR2)(base), %rax; \ + movq %rax, %cr2; \ movq REG_OFF(KDIREG_R15)(%rdi), %r15; \ movq REG_OFF(KDIREG_R14)(%rdi), %r14; \ movq REG_OFF(KDIREG_R13)(%rdi), %r13; \ @@ -222,15 +233,6 @@ movq KRS_CURCRUMB(cpusave), tmp; \ movq value, offset(tmp) -#endif /* _ASM */ - -#if defined(__lint) -void -kdi_cmnint(void) -{ -} -#else /* __lint */ - /* XXX implement me */ ENTRY_NP(kdi_nmiint) clrq %rcx @@ -280,6 +282,20 @@ kdi_cmnint(void) shrq $32, %rdx movl $MSR_AMD_GSBASE, %ecx wrmsr + + /* + * Switch to the kernel's %cr3. From the early interrupt handler + * until now we've been running on the "paranoid" %cr3 (that of kas + * from early in boot). + * + * Hopefully it's not corrupt! + */ + mov %gs:CPU_KPTI_KCR3, %rdx + cmp $0, %rdx + je .zero_kcr3 + mov %rdx, %cr3 +.zero_kcr3: + #endif /* __xpv */ GET_CPUSAVE_ADDR /* %rax = cpusave, %rbx = CPU ID */ @@ -303,13 +319,15 @@ kdi_cmnint(void) */ leaq kdi_memranges, %rcx movl kdi_nmemranges, %edx -1: cmpq MR_BASE(%rcx), %rsp +1: + cmpq MR_BASE(%rcx), %rsp jl 2f /* below this range -- try the next one */ cmpq MR_LIM(%rcx), %rsp jg 2f /* above this range -- try the next one */ jmp 3f /* matched within this range */ -2: decl %edx +2: + decl %edx jz kdi_save_common_state /* %rsp not within debugger memory */ addq $MR_SIZE, %rcx jmp 1b @@ -339,8 +357,6 @@ kdi_cmnint(void) SET_SIZE(kdi_master_entry) SET_SIZE(kdi_cmnint) -#endif /* __lint */ - /* * The cross-call handler for slave CPUs. * @@ -351,12 +367,6 @@ kdi_cmnint(void) * master. 
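The amended SAVE_GSBASE above now captures both MSR_AMD_GSBASE and MSR_AMD_KGSBASE, reassembling each 64-bit value from the %edx:%eax halves that rdmsr returns. This stand-alone C illustration shows the reassembly; read_msr_halves() is a made-up substitute for the rdmsr instruction and the values it reports are fabricated.

#include <stdio.h>
#include <stdint.h>

#define MSR_AMD_GSBASE  0xc0000101
#define MSR_AMD_KGSBASE 0xc0000102

/* pretend rdmsr: return the low and high 32-bit halves of an MSR */
static void
read_msr_halves(uint32_t msr, uint32_t *eax, uint32_t *edx)
{
    uint64_t fake = (msr == MSR_AMD_GSBASE) ?
        0xfffffffffbc30000ULL : 0xfffffd7ffe9fb000ULL;
    *eax = (uint32_t)fake;
    *edx = (uint32_t)(fake >> 32);
}

static uint64_t
rdmsr64(uint32_t msr)
{
    uint32_t eax, edx;
    read_msr_halves(msr, &eax, &edx);
    /* equivalent of "shlq $32, %rdx; orq %rax, %rdx" */
    return (((uint64_t)edx << 32) | eax);
}

int
main(void)
{
    printf("GSBASE  = %#llx\n", (unsigned long long)rdmsr64(MSR_AMD_GSBASE));
    printf("KGSBASE = %#llx\n", (unsigned long long)rdmsr64(MSR_AMD_KGSBASE));
    return (0);
}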
*/ -#if defined(__lint) -void -kdi_slave_entry(void) -{ -} -#else /* __lint */ ENTRY_NP(kdi_slave_entry) /* @@ -390,7 +400,7 @@ kdi_slave_entry(void) addq $8, %rax movq %rax, REG_OFF(KDIREG_RSP)(%rsp) - /* + /* * We've saved all of the general-purpose registers, and have a stack * that is irettable (after we strip down to the error code) */ @@ -409,8 +419,6 @@ kdi_slave_entry(void) SET_SIZE(kdi_slave_entry) -#endif /* __lint */ - /* * The state of the world: * @@ -424,8 +432,6 @@ kdi_slave_entry(void) * machine for debugger entry, and enter the debugger. */ -#if !defined(__lint) - ENTRY_NP(kdi_save_common_state) popq %rdi /* the cpusave area */ @@ -495,19 +501,10 @@ kdi_slave_entry(void) SET_SIZE(kdi_save_common_state) -#endif /* !__lint */ - /* * Resume the world. The code that calls kdi_resume has already * decided whether or not to restore the IDT. */ -#if defined(__lint) -void -kdi_resume(void) -{ -} -#else /* __lint */ - /* cpusave in %rdi */ ENTRY_NP(kdi_resume) @@ -524,14 +521,18 @@ kdi_resume(void) movq KRS_GREGS(%rdi), %rsp KDI_RESTORE_REGS(%rsp) addq $REG_OFF(KDIREG_RIP), %rsp /* Discard state, trapno, err */ + /* + * The common trampoline code will restore %cr3 to the right value + * for either kernel or userland. + */ +#if !defined(__xpv) + jmp tr_iret_auto +#else IRET +#endif /*NOTREACHED*/ SET_SIZE(kdi_resume) -#endif /* __lint */ - -#if !defined(__lint) - ENTRY_NP(kdi_pass_to_kernel) popq %rdi /* cpusave */ @@ -564,7 +565,7 @@ kdi_resume(void) * added a new trap interposition... try to drop back into kmdb. */ int $T_DBGENTR - + #define CALL_TRAP_HANDLER(name) \ KDI_RESTORE_REGS(%rsp); \ /* Discard state, trapno, err */ \ @@ -602,16 +603,6 @@ kdi_resume(void) SET_SIZE(kdi_reboot) -#endif /* !__lint */ - -#if defined(__lint) -/*ARGSUSED*/ -void -kdi_cpu_debug_init(kdi_cpusave_t *save) -{ -} -#else /* __lint */ - ENTRY_NP(kdi_cpu_debug_init) pushq %rbp movq %rsp, %rbp @@ -622,7 +613,32 @@ kdi_cpu_debug_init(kdi_cpusave_t *save) leave ret - SET_SIZE(kdi_cpu_debug_init) -#endif /* !__lint */ +#define GETDREG(name, r) \ + ENTRY_NP(name); \ + movq r, %rax; \ + ret; \ + SET_SIZE(name) + +#define SETDREG(name, r) \ + ENTRY_NP(name); \ + movq %rdi, r; \ + ret; \ + SET_SIZE(name) + + GETDREG(kdi_getdr0, %dr0) + GETDREG(kdi_getdr1, %dr1) + GETDREG(kdi_getdr2, %dr2) + GETDREG(kdi_getdr3, %dr3) + GETDREG(kdi_getdr6, %dr6) + GETDREG(kdi_getdr7, %dr7) + + SETDREG(kdi_setdr0, %dr0) + SETDREG(kdi_setdr1, %dr1) + SETDREG(kdi_setdr2, %dr2) + SETDREG(kdi_setdr3, %dr3) + SETDREG(kdi_setdr6, %dr6) + SETDREG(kdi_setdr7, %dr7) + +#endif /* !__lint */ diff --git a/usr/src/uts/intel/kdi/kdi_idt.c b/usr/src/uts/intel/kdi/kdi_idt.c index 64e2b225d5..d801588954 100644 --- a/usr/src/uts/intel/kdi/kdi_idt.c +++ b/usr/src/uts/intel/kdi/kdi_idt.c @@ -78,6 +78,7 @@ #include <sys/kdi_impl.h> #include <sys/x_call.h> #include <ia32/sys/psw.h> +#include <vm/hat_i86.h> #define KDI_GATE_NVECS 3 @@ -116,7 +117,7 @@ typedef void idt_hdlr_f(void); extern idt_hdlr_f kdi_trap0, kdi_trap1, kdi_int2, kdi_trap3, kdi_trap4; extern idt_hdlr_f kdi_trap5, kdi_trap6, kdi_trap7, kdi_trap9; extern idt_hdlr_f kdi_traperr10, kdi_traperr11, kdi_traperr12; -extern idt_hdlr_f kdi_traperr13, kdi_traperr14, kdi_trap16, kdi_trap17; +extern idt_hdlr_f kdi_traperr13, kdi_traperr14, kdi_trap16, kdi_traperr17; extern idt_hdlr_f kdi_trap18, kdi_trap19, kdi_trap20, kdi_ivct32; extern idt_hdlr_f kdi_invaltrap; extern size_t kdi_ivct_size; @@ -137,7 +138,7 @@ static const kdi_gate_spec_t kdi_gate_specs[KDI_GATE_NVECS] = { static gate_desc_t 
kdi_kgates[KDI_GATE_NVECS]; -gate_desc_t kdi_idt[NIDT]; +extern gate_desc_t kdi_idt[NIDT]; struct idt_description { uint_t id_low; @@ -164,7 +165,7 @@ struct idt_description { { T_PGFLT, 0, kdi_traperr14, NULL }, { 15, 0, kdi_invaltrap, NULL }, { T_EXTERRFLT, 0, kdi_trap16, NULL }, - { T_ALIGNMENT, 0, kdi_trap17, NULL }, + { T_ALIGNMENT, 0, kdi_traperr17, NULL }, { T_MCE, 0, kdi_trap18, NULL }, { T_SIMDFPE, 0, kdi_trap19, NULL }, { T_DBGENTR, 0, kdi_trap20, NULL }, @@ -183,11 +184,16 @@ kdi_idt_init(selector_t sel) uint_t high = id->id_high != 0 ? id->id_high : id->id_low; size_t incr = id->id_incrp != NULL ? *id->id_incrp : 0; +#if !defined(__xpv) + if (kpti_enable && sel == KCS_SEL && id->id_low == T_DBLFLT) + id->id_basehdlr = tr_syserrtrap; +#endif + for (i = id->id_low; i <= high; i++) { caddr_t hdlr = (caddr_t)id->id_basehdlr + incr * (i - id->id_low); set_gatesegd(&kdi_idt[i], (void (*)())hdlr, sel, - SDT_SYSIGT, TRP_KPL, i); + SDT_SYSIGT, TRP_KPL, IST_DBG); } } } @@ -204,7 +210,7 @@ kdi_idt_gates_install(selector_t sel, int saveold) const kdi_gate_spec_t *gs = &kdi_gate_specs[i]; uintptr_t func = GATESEG_GETOFFSET(&kdi_idt[gs->kgs_vec]); set_gatesegd(&gates[i], (void (*)())func, sel, SDT_SYSIGT, - gs->kgs_dpl, gs->kgs_vec); + gs->kgs_dpl, IST_DBG); } for (i = 0; i < KDI_GATE_NVECS; i++) { @@ -390,9 +396,17 @@ kdi_trap_pass(kdi_cpusave_t *cpusave) * See the comments in the kernel's T_SGLSTP handler for why we need to * do this. */ +#if !defined(__xpv) if (tt == T_SGLSTP && - (pc == (greg_t)sys_sysenter || pc == (greg_t)brand_sys_sysenter)) + (pc == (greg_t)sys_sysenter || pc == (greg_t)brand_sys_sysenter || + pc == (greg_t)tr_sys_sysenter || + pc == (greg_t)tr_brand_sys_sysenter)) { +#else + if (tt == T_SGLSTP && + (pc == (greg_t)sys_sysenter || pc == (greg_t)brand_sys_sysenter)) { +#endif return (1); + } return (0); } diff --git a/usr/src/uts/intel/kdi/kdi_idthdl.s b/usr/src/uts/intel/kdi/kdi_idthdl.s index 359df2a8f8..510bb20fcb 100644 --- a/usr/src/uts/intel/kdi/kdi_idthdl.s +++ b/usr/src/uts/intel/kdi/kdi_idthdl.s @@ -26,22 +26,30 @@ */ /* - * Companion to kdi_idt.c - the implementation of the trap and interrupt + * Companion to kdi_asm.s - the implementation of the trap and interrupt * handlers. For the most part, these handlers do the same thing - they * push a trap number onto the stack, followed by a jump to kdi_cmnint. * Each trap and interrupt has its own handler because each one pushes a * different number. */ +#if defined(__lint) +#include <sys/types.h> +#else + #include <sys/asm_linkage.h> +#include <sys/asm_misc.h> +#include <sys/machprivregs.h> +#include <sys/privregs.h> #include <sys/kdi_regs.h> +#include <sys/trap.h> +#include <sys/param.h> -/* Nothing in this file is of interest to lint. */ -#if !defined(__lint) +#include <kdi_assym.h> +#include <assym.h> /* - * The default ASM_ENTRY_ALIGN (16) wastes far too much space. Pay no - * attention to the fleet of nop's we're adding to each handler. + * The default ASM_ENTRY_ALIGN (16) wastes far too much space. */ #undef ASM_ENTRY_ALIGN #define ASM_ENTRY_ALIGN 8 @@ -50,65 +58,174 @@ * Generic trap and interrupt handlers. */ -#if defined(__xpv) && defined(__amd64) +#if defined(__xpv) -/* - * The hypervisor places r11 and rcx on the stack. 
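kdi_idt_init() above walks a table of vector ranges and programs each gate with a handler computed as the base stub plus a fixed stride per vector. A simplified, self-contained C rendering of that loop follows; set_gate() and the sizes are stand-ins for set_gatesegd() and the real stub sizes.

#include <stdio.h>
#include <stddef.h>

struct idt_description {
    unsigned id_low;
    unsigned id_high;        /* 0 means "just id_low" */
    char    *id_basehdlr;    /* address of the first stub */
    size_t  *id_incrp;       /* per-stub stride, for ranges */
};

static char stubs[224 * 8];          /* pretend block of interrupt stubs */
static size_t kdi_ivct_size = 8;     /* assumed per-stub size */
static void *kdi_idt[256];

static void
set_gate(unsigned vec, void *hdlr)   /* stand-in for set_gatesegd() */
{
    kdi_idt[vec] = hdlr;
}

static void
kdi_idt_init_model(void)
{
    struct idt_description descs[] = {
        { 3, 0, stubs, NULL },                  /* single vector */
        { 32, 255, stubs, &kdi_ivct_size },     /* vector range */
    };

    for (size_t d = 0; d < sizeof (descs) / sizeof (descs[0]); d++) {
        unsigned high = descs[d].id_high != 0 ?
            descs[d].id_high : descs[d].id_low;
        size_t incr = descs[d].id_incrp != NULL ? *descs[d].id_incrp : 0;

        for (unsigned i = descs[d].id_low; i <= high; i++)
            set_gate(i, descs[d].id_basehdlr +
                incr * (i - descs[d].id_low));
    }
}

int
main(void)
{
    kdi_idt_init_model();
    printf("vector 40 handler offset: %td\n", (char *)kdi_idt[40] - stubs);
    return (0);
}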
- */ - -#define TRAP_NOERR(trapno) \ - popq %rcx; \ - popq %r11; \ - pushq $trapno - -#define TRAP_ERR(trapno) \ - popq %rcx; \ - popq %r11; \ - pushq $0; \ - pushq $trapno +#define INTERRUPT_TRAMPOLINE #else -#define TRAP_NOERR(trapno) \ - push $trapno +/* + * If we're !xpv, then we will need to support KPTI (kernel page table + * isolation), where we have separate page tables for user and kernel modes. + * There's more detail about this in kpti_trampolines.s and hat_i86.c + */ -#define TRAP_ERR(trapno) \ - push $0; \ - push $trapno +#define INTERRUPT_TRAMPOLINE \ + pushq %r13; \ + pushq %r14; \ + subq $KPTI_R14, %rsp; \ + /* Check for clobbering */ \ + cmp $0, KPTI_FLAG(%rsp); \ + je 1f; \ + /* Don't worry, this totally works */ \ + int $8; \ +1: \ + movq $1, KPTI_FLAG(%rsp); \ + /* Save current %cr3. */ \ + mov %cr3, %r14; \ + mov %r14, KPTI_TR_CR3(%rsp); \ + /* Switch to paranoid %cr3. */ \ + mov kpti_safe_cr3, %r14; \ + mov %r14, %cr3; \ + \ + cmpw $KCS_SEL, KPTI_CS(%rsp); \ + je 3f; \ +2: \ + /* Get our cpu_t in %r13 */ \ + mov %rsp, %r13; \ + and $(~(MMU_PAGESIZE - 1)), %r13; \ + subq $CPU_KPTI_START, %r13; \ + /* Use top of the kthread stk */ \ + mov CPU_THREAD(%r13), %r14; \ + mov T_STACK(%r14), %r14; \ + addq $REGSIZE+MINFRAME, %r14; \ + jmp 5f; \ +3: \ + /* Check the %rsp in the frame. */ \ + /* Is it above kernel base? */ \ + mov kpti_kbase, %r14; \ + cmp %r14, KPTI_RSP(%rsp); \ + jb 2b; \ + /* Is it within the kpti_frame page? */ \ + mov %rsp, %r13; \ + and $(~(MMU_PAGESIZE - 1)), %r13; \ + mov KPTI_RSP(%rsp), %r14; \ + and $(~(MMU_PAGESIZE - 1)), %r14; \ + cmp %r13, %r14; \ + je 2b; \ + /* Use the %rsp from the trap frame. */ \ + /* We already did %cr3. */ \ + mov KPTI_RSP(%rsp), %r14; \ + and $(~0xf), %r14; \ +5: \ + mov %rsp, %r13; \ + /* %r14 contains our destination stk */ \ + mov %r14, %rsp; \ + pushq KPTI_SS(%r13); \ + pushq KPTI_RSP(%r13); \ + pushq KPTI_RFLAGS(%r13); \ + pushq KPTI_CS(%r13); \ + pushq KPTI_RIP(%r13); \ + pushq KPTI_ERR(%r13); \ + mov KPTI_R14(%r13), %r14; \ + movq $0, KPTI_FLAG(%r13); \ + mov KPTI_R13(%r13), %r13 -#endif /* __xpv && __amd64 */ +#endif /* !__xpv */ #define MKIVCT(n) \ ENTRY_NP(kdi_ivct/**/n/**/); \ - TRAP_ERR(n); \ + XPV_TRAP_POP; \ + push $0; /* err */ \ + INTERRUPT_TRAMPOLINE; \ + push $n; \ jmp kdi_cmnint; \ SET_SIZE(kdi_ivct/**/n/**/) #define MKTRAPHDLR(n) \ ENTRY_NP(kdi_trap/**/n); \ - TRAP_ERR(n); \ + XPV_TRAP_POP; \ + push $0; /* err */ \ + INTERRUPT_TRAMPOLINE; \ + push $n; \ jmp kdi_cmnint; \ SET_SIZE(kdi_trap/**/n/**/) #define MKTRAPERRHDLR(n) \ ENTRY_NP(kdi_traperr/**/n); \ - TRAP_NOERR(n); \ + XPV_TRAP_POP; \ + INTERRUPT_TRAMPOLINE; \ + push $n; \ jmp kdi_cmnint; \ SET_SIZE(kdi_traperr/**/n) +#if !defined(__xpv) +#define MKNMIHDLR \ + ENTRY_NP(kdi_int2); \ + push $0; \ + push $2; \ + pushq %r13; \ + mov kpti_safe_cr3, %r13; \ + mov %r13, %cr3; \ + popq %r13; \ + jmp kdi_nmiint; \ + SET_SIZE(kdi_int2) + +#define MKMCEHDLR \ + ENTRY_NP(kdi_trap18); \ + push $0; \ + push $18; \ + pushq %r13; \ + mov kpti_safe_cr3, %r13; \ + mov %r13, %cr3; \ + popq %r13; \ + jmp kdi_cmnint; \ + SET_SIZE(kdi_trap18) +#else #define MKNMIHDLR \ ENTRY_NP(kdi_int2); \ - TRAP_NOERR(2); \ + push $0; \ + push $2; \ jmp kdi_nmiint; \ SET_SIZE(kdi_int2) +#define MKMCEHDLR \ + ENTRY_NP(kdi_trap18); \ + push $0; \ + push $18; \ + jmp kdi_cmnint; \ + SET_SIZE(kdi_trap18) +#endif + +/* + * The only way we should reach here is by an explicit "int 0x.." which is + * defined not to push an error code. 
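The heart of the INTERRUPT_TRAMPOLINE macro above is choosing a real stack on which to build the full frame: traps from userland (or with an untrustworthy %rsp) move to the top of the current thread's kernel stack, while traps from kernel mode reuse the interrupted stack unless %rsp points back into the per-CPU kpti frame page. That decision, stripped of the register shuffling, can be paraphrased in C as follows; all constants and addresses are illustrative.

#include <stdio.h>
#include <stdint.h>

#define MMU_PAGESIZE 4096UL

static uintptr_t
choose_stack(int from_kernel_cs, uintptr_t trap_rsp, uintptr_t frame_va,
    uintptr_t thread_stack_top, uintptr_t kernel_base)
{
    if (!from_kernel_cs)
        return (thread_stack_top);      /* trapped in userland */

    if (trap_rsp < kernel_base)
        return (thread_stack_top);      /* %rsp is not a kernel address */

    if ((trap_rsp & ~(MMU_PAGESIZE - 1)) ==
        (frame_va & ~(MMU_PAGESIZE - 1)))
        return (thread_stack_top);      /* %rsp is inside the kpti frame page */

    return (trap_rsp & ~0xfUL);         /* reuse the interrupted stack, aligned */
}

int
main(void)
{
    uintptr_t kbase = 0xfffffe0000000000UL;    /* made-up kernel base */
    uintptr_t tstk  = 0xfffffe00422ff000UL;    /* made-up thread stack top */
    uintptr_t frame = 0xfffffe0041ab6000UL;    /* made-up kpti frame page */

    printf("%#lx\n", (unsigned long)choose_stack(1,
        0xfffffe00421f0e40UL, frame, tstk, kbase));
    printf("%#lx\n", (unsigned long)choose_stack(0,
        0x00007fffabcd1230UL, frame, tstk, kbase));
    return (0);
}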
+ */ #define MKINVALHDLR \ ENTRY_NP(kdi_invaltrap); \ - TRAP_NOERR(255); \ + XPV_TRAP_POP; \ + push $0; /* err */ \ + INTERRUPT_TRAMPOLINE; \ + push $255; \ jmp kdi_cmnint; \ SET_SIZE(kdi_invaltrap) + .data + DGDEF3(kdi_idt, 16 * NIDT, MMU_PAGESIZE) + .fill MMU_PAGESIZE, 1, 0 + +#if !defined(__xpv) +.section ".text" +.align MMU_PAGESIZE +.global kdi_isr_start +kdi_isr_start: + nop + +.global kpti_safe_cr3 +.global kpti_kbase +#endif + /* * The handlers themselves */ @@ -125,8 +242,7 @@ MKTRAPHDLR(9) MKTRAPHDLR(15) MKTRAPHDLR(16) - MKTRAPHDLR(17) - MKTRAPHDLR(18) + MKMCEHDLR/*18*/ MKTRAPHDLR(19) MKTRAPHDLR(20) @@ -136,11 +252,12 @@ MKTRAPERRHDLR(12) MKTRAPERRHDLR(13) MKTRAPERRHDLR(14) + MKTRAPERRHDLR(17) .globl kdi_ivct_size kdi_ivct_size: .NWORD [kdi_ivct33-kdi_ivct32] - + /* 10 billion and one interrupt handlers */ kdi_ivct_base: MKIVCT(32); MKIVCT(33); MKIVCT(34); MKIVCT(35); @@ -200,4 +317,12 @@ kdi_ivct_base: MKIVCT(248); MKIVCT(249); MKIVCT(250); MKIVCT(251); MKIVCT(252); MKIVCT(253); MKIVCT(254); MKIVCT(255); +#if !defined(__xpv) +.section ".text" +.align MMU_PAGESIZE +.global kdi_isr_end +kdi_isr_end: + nop #endif + +#endif /* !__lint */ diff --git a/usr/src/uts/intel/kdi/kdi_offsets.in b/usr/src/uts/intel/kdi/kdi_offsets.in index 212fdc9f4c..c9228de978 100644 --- a/usr/src/uts/intel/kdi/kdi_offsets.in +++ b/usr/src/uts/intel/kdi/kdi_offsets.in @@ -25,8 +25,6 @@ \ \ CPU-save structure offsets for use in assembly code. \ -\ Keep in sync with kdi_state.h -\ #include <sys/cpuvar.h> #include <sys/kdi_impl.h> @@ -60,16 +58,9 @@ kdi_cpusave_t KRS_SIZE krs_curcrumb krs_crumbs -cpu - cpu_id - greg_t KREG_SIZE -#if defined(__amd64) \#define REG_SHIFT 3 -#else -\#define REG_SHIFT 2 -#endif \#define DRADDR_IDX(num) _CONST(_MUL(num, DR_ADDR_INCR)) \#define DRADDR_OFF(num) _CONST(DRADDR_IDX(num) + DR_ADDR) diff --git a/usr/src/uts/intel/sys/archsystm.h b/usr/src/uts/intel/sys/archsystm.h index 9ca38f823c..4d14e58880 100644 --- a/usr/src/uts/intel/sys/archsystm.h +++ b/usr/src/uts/intel/sys/archsystm.h @@ -21,7 +21,7 @@ /* * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_ARCHSYSTM_H @@ -80,22 +80,26 @@ extern void int20(void); extern void int_cmci(void); #if defined(__amd64) -extern void sys_syscall(); -extern void sys_syscall32(); +extern void sys_syscall(), tr_sys_syscall(); +extern void sys_syscall32(), tr_sys_syscall32(); extern void sys_lcall32(); extern void sys_syscall_int(); -extern void brand_sys_syscall(); -extern void brand_sys_syscall32(); -extern void brand_sys_syscall_int(); +extern void tr_sys_syscall_int(); +extern void brand_sys_syscall(), tr_brand_sys_syscall(); +extern void brand_sys_syscall32(), tr_brand_sys_syscall32(); +extern void brand_sys_syscall_int(), tr_brand_sys_syscall_int(); extern int update_sregs(); extern void reset_sregs(); #elif defined(__i386) extern void sys_call(); +extern void tr_sys_call(); extern void brand_sys_call(); #endif extern void sys_sysenter(); +extern void tr_sys_sysenter(); extern void _sys_sysenter_post_swapgs(); extern void brand_sys_sysenter(); +extern void tr_brand_sys_sysenter(); extern void _brand_sys_sysenter_post_swapgs(); extern void dosyscall(void); diff --git a/usr/src/uts/intel/sys/segments.h b/usr/src/uts/intel/sys/segments.h index 5368f80735..84eb363f00 100644 --- a/usr/src/uts/intel/sys/segments.h +++ b/usr/src/uts/intel/sys/segments.h @@ -2,7 +2,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. 
All rights reserved. */ /* - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_SEGMENTS_H @@ -98,29 +98,30 @@ extern "C" { */ #if defined(__xpv) -#if defined(__amd64) - #define SEL_XPL 0 /* hypervisor privilege level */ #define SEL_KPL 3 /* both kernel and user in ring 3 */ #define TRP_KPL 1 /* system gate priv (user blocked) */ - -#elif defined(__i386) - -#define SEL_XPL 0 /* hypervisor privilege level */ -#define SEL_KPL 1 /* kernel privilege level */ -#define TRP_KPL SEL_KPL /* system gate priv (user blocked) */ - -#endif /* __i386 */ - #define TRP_XPL 0 /* system gate priv (hypervisor) */ +#define IST_DBG 0 + #else /* __xpv */ #define SEL_KPL 0 /* kernel privilege level on metal */ #define TRP_KPL SEL_KPL /* system gate priv (user blocked) */ + +#define IST_DF 1 +#define IST_NMI 2 +#define IST_MCE 3 +#define IST_DBG 4 +#define IST_NESTABLE 5 +#define IST_DEFAULT 6 + #endif /* __xpv */ +#define IST_NONE 0 + #define SEL_UPL 3 /* user priority level */ #define TRP_UPL 3 /* system gate priv (user allowed) */ #define SEL_TI_LDT 4 /* local descriptor table */ @@ -401,6 +402,8 @@ extern void set_usegd(user_desc_t *, void *, size_t, uint_t, uint_t, #endif /* __i386 */ +extern uint_t idt_vector_to_ist(uint_t); + extern void set_gatesegd(gate_desc_t *, void (*)(void), selector_t, uint_t, uint_t, uint_t); @@ -646,6 +649,10 @@ void init_boot_gdt(user_desc_t *); #define MINNLDT 512 /* Current min solaris ldt size (1 4K page) */ #define MAXNLDT 8192 /* max solaris ldt size (16 4K pages) */ +#ifdef _KERNEL +#define LDT_CPU_SIZE (16 * 4096) /* Size of kernel per-CPU allocation */ +#endif + #ifndef _ASM extern gate_desc_t *idt0; @@ -688,10 +695,29 @@ extern void sys_int80(); extern void brand_sys_int80(); extern void dtrace_ret(); +/* KPTI trampolines */ +extern void tr_invaltrap(); +extern void tr_div0trap(), tr_dbgtrap(), tr_nmiint(), tr_brktrap(); +extern void tr_ovflotrap(), tr_boundstrap(), tr_invoptrap(), tr_ndptrap(); +#if !defined(__xpv) +extern void tr_syserrtrap(); +#endif +extern void tr_invaltrap(), tr_invtsstrap(), tr_segnptrap(), tr_stktrap(); +extern void tr_gptrap(), tr_pftrap(), tr_ndperr(); +extern void tr_overrun(), tr_resvtrap(); +extern void tr_achktrap(), tr_mcetrap(); +extern void tr_xmtrap(); +extern void tr_fasttrap(); +extern void tr_sys_int80(); +extern void tr_brand_sys_int80(); +extern void tr_dtrace_ret(); + #if !defined(__amd64) extern void pentium_pftrap(); #endif +extern uint64_t kpti_enable; + #endif /* _ASM */ #ifdef __cplusplus |
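The IST_* values introduced in segments.h above index the Interrupt Stack Table in the 64-bit TSS: a gate that names a non-zero IST forces the CPU onto that pre-allocated stack no matter where the trap occurred, which is how the debug, NMI, MCE and double-fault vectors get known-good stacks under KPTI. The sketch below packs the IST field into an architectural long-mode interrupt gate; it illustrates the descriptor format only, not the kernel's set_gatesegd(), and the selector value is a placeholder.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define IST_NONE 0
#define IST_NMI  2
#define IST_DBG  4

struct idt_gate64 {          /* architectural 16-byte layout, no padding */
    uint16_t offset_low;
    uint16_t selector;
    uint8_t  ist;            /* bits 0-2: IST index; upper bits zero */
    uint8_t  type_attr;      /* 0x8e: present, DPL 0, interrupt gate */
    uint16_t offset_mid;
    uint32_t offset_high;
    uint32_t reserved;
};

static void
make_gate(struct idt_gate64 *g, uint64_t handler, uint16_t sel, uint8_t ist)
{
    memset(g, 0, sizeof (*g));
    g->offset_low = handler & 0xffff;
    g->offset_mid = (handler >> 16) & 0xffff;
    g->offset_high = handler >> 32;
    g->selector = sel;
    g->ist = ist & 0x7;
    g->type_attr = 0x8e;
}

int
main(void)
{
    struct idt_gate64 g;
    make_gate(&g, 0xfffffffffb8a1230ULL, 0x30 /* placeholder selector */,
        IST_DBG);
    printf("gate ist=%d type=%#x\n", g.ist, (unsigned)g.type_attr);
    return (0);
}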