author     Alex Wilson <alex.wilson@joyent.com>   2018-03-12 11:20:26 -0700
committer  Alex Wilson <alex.wilson@joyent.com>   2018-03-12 18:31:40 +0000
commit     d85fbfe15cf9925f83722b6d62da49d549af615c
tree       edd88a89a2065378767b7a3a64cd0ccc827330e1
parent     8005f4ee748b1fe324b3f234a2defe0dd557611b
download   illumos-joyent-d85fbfe15cf9925f83722b6d62da49d549af615c.tar.gz

OS-6547 Implement KPTI
Reviewed by: John Levon <john.levon@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>
-rw-r--r--  usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c | 237
-rw-r--r--  usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.h | 5
-rw-r--r--  usr/src/cmd/mdb/i86pc/modules/unix/unix.c | 26
-rw-r--r--  usr/src/cmd/mdb/i86pc/modules/unix/unix_sup.s | 4
-rw-r--r--  usr/src/cmd/mdb/intel/kmdb/kaif.c | 37
-rw-r--r--  usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c | 19
-rw-r--r--  usr/src/cmd/mdb/intel/mdb/mdb_kreg.h | 7
-rw-r--r--  usr/src/uts/common/sys/sysmacros.h | 9
-rw-r--r--  usr/src/uts/i86pc/Makefile.files | 7
-rw-r--r--  usr/src/uts/i86pc/ml/genassym.c | 23
-rw-r--r--  usr/src/uts/i86pc/ml/kdi_subr.s | 160
-rw-r--r--  usr/src/uts/i86pc/ml/kpti_trampolines.s | 713
-rw-r--r--  usr/src/uts/i86pc/ml/locore.s | 10
-rw-r--r--  usr/src/uts/i86pc/ml/offsets.in | 40
-rw-r--r--  usr/src/uts/i86pc/ml/syscall_asm_amd64.s | 105
-rw-r--r--  usr/src/uts/i86pc/os/intr.c | 32
-rw-r--r--  usr/src/uts/i86pc/os/mlsetup.c | 16
-rw-r--r--  usr/src/uts/i86pc/os/mp_pc.c | 56
-rw-r--r--  usr/src/uts/i86pc/os/mp_startup.c | 85
-rw-r--r--  usr/src/uts/i86pc/os/startup.c | 56
-rw-r--r--  usr/src/uts/i86pc/os/trap.c | 92
-rw-r--r--  usr/src/uts/i86pc/sys/machcpuvar.h | 87
-rw-r--r--  usr/src/uts/i86pc/sys/machparam.h | 8
-rw-r--r--  usr/src/uts/i86pc/sys/machprivregs.h | 10
-rw-r--r--  usr/src/uts/i86pc/sys/rm_platter.h | 9
-rw-r--r--  usr/src/uts/i86pc/vm/hat_i86.c | 834
-rw-r--r--  usr/src/uts/i86pc/vm/hat_i86.h | 73
-rw-r--r--  usr/src/uts/i86pc/vm/hat_pte.h | 16
-rw-r--r--  usr/src/uts/i86pc/vm/htable.c | 81
-rw-r--r--  usr/src/uts/i86pc/vm/htable.h | 22
-rw-r--r--  usr/src/uts/i86xpv/Makefile.files | 8
-rw-r--r--  usr/src/uts/intel/Makefile.rules | 6
-rw-r--r--  usr/src/uts/intel/amd64/sys/kdi_regs.h | 27
-rw-r--r--  usr/src/uts/intel/ia32/ml/exception.s | 67
-rw-r--r--  usr/src/uts/intel/ia32/ml/swtch.s | 61
-rw-r--r--  usr/src/uts/intel/ia32/os/desctbls.c | 402
-rw-r--r--  usr/src/uts/intel/ia32/os/sysi86.c | 25
-rw-r--r--  usr/src/uts/intel/ia32/sys/trap.h | 20
-rw-r--r--  usr/src/uts/intel/kdi/ia32/kdi_asm.s | 662
-rw-r--r--  usr/src/uts/intel/kdi/kdi_asm.s (renamed from usr/src/uts/intel/kdi/amd64/kdi_asm.s) | 134
-rw-r--r--  usr/src/uts/intel/kdi/kdi_idt.c | 26
-rw-r--r--  usr/src/uts/intel/kdi/kdi_idthdl.s | 193
-rw-r--r--  usr/src/uts/intel/kdi/kdi_offsets.in | 9
-rw-r--r--  usr/src/uts/intel/sys/archsystm.h | 16
-rw-r--r--  usr/src/uts/intel/sys/segments.h | 50
45 files changed, 2919 insertions(+), 1666 deletions(-)
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c
index ef89d23312..c6ac1d2967 100644
--- a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c
+++ b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.c
@@ -21,6 +21,8 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -46,6 +48,9 @@
#include <vm/page.h>
#include <vm/hat_i86.h>
+#define VA_SIGN_BIT (1UL << 47)
+#define VA_SIGN_EXTEND(va) (((va) ^ VA_SIGN_BIT) - VA_SIGN_BIT)
+
struct pfn2pp {
pfn_t pfn;
page_t *pp;
@@ -398,13 +403,6 @@ pte2mfn(x86pte_t pte, uint_t level)
return (mfn);
}
-/*
- * Print a PTE in more human friendly way. The PTE is assumed to be in
- * a level 0 page table, unless -l specifies another level.
- *
- * The PTE value can be specified as the -p option, since on a 32 bit kernel
- * with PAE running it's larger than a uintptr_t.
- */
static int
do_pte_dcmd(int level, uint64_t pte)
{
@@ -414,13 +412,14 @@ do_pte_dcmd(int level, uint64_t pte)
int pat_index = 0;
pfn_t mfn;
- mdb_printf("pte=%llr: ", pte);
- if (PTE_GET(pte, mmu.pt_nx))
- mdb_printf("noexec ");
+ mdb_printf("pte=0x%llr: ", pte);
mfn = pte2mfn(pte, level);
mdb_printf("%s=0x%lr ", is_xpv ? "mfn" : "pfn", mfn);
+ if (PTE_GET(pte, mmu.pt_nx))
+ mdb_printf("noexec ");
+
if (PTE_GET(pte, PT_NOCONSIST))
mdb_printf("noconsist ");
@@ -476,52 +475,34 @@ do_pte_dcmd(int level, uint64_t pte)
/*
* Print a PTE in more human friendly way. The PTE is assumed to be in
* a level 0 page table, unless -l specifies another level.
- *
- * The PTE value can be specified as the -p option, since on a 32 bit kernel
- * with PAE running it's larger than a uintptr_t.
*/
/*ARGSUSED*/
int
pte_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
- int level = 0;
- uint64_t pte = 0;
- char *level_str = NULL;
- char *pte_str = NULL;
+ uint64_t level = 0;
init_mmu();
if (mmu.num_level == 0)
return (DCMD_ERR);
+ if ((flags & DCMD_ADDRSPEC) == 0)
+ return (DCMD_USAGE);
+
if (mdb_getopts(argc, argv,
- 'p', MDB_OPT_STR, &pte_str,
- 'l', MDB_OPT_STR, &level_str) != argc)
+ 'l', MDB_OPT_UINT64, &level) != argc)
return (DCMD_USAGE);
- /*
- * parse the PTE to decode, if it's 0, we don't do anything
- */
- if (pte_str != NULL) {
- pte = mdb_strtoull(pte_str);
- } else {
- if ((flags & DCMD_ADDRSPEC) == 0)
- return (DCMD_USAGE);
- pte = addr;
+ if (level > mmu.max_level) {
+ mdb_warn("invalid level %lu\n", level);
+ return (DCMD_ERR);
}
- if (pte == 0)
- return (DCMD_OK);
- /*
- * parse the level if supplied
- */
- if (level_str != NULL) {
- level = mdb_strtoull(level_str);
- if (level < 0 || level > mmu.max_level)
- return (DCMD_ERR);
- }
+ if (addr == 0)
+ return (DCMD_OK);
- return (do_pte_dcmd(level, pte));
+ return (do_pte_dcmd((int)level, addr));
}
static size_t
@@ -537,25 +518,20 @@ static x86pte_t
get_pte(hat_t *hat, htable_t *htable, uintptr_t addr)
{
x86pte_t buf;
- x86pte32_t *pte32 = (x86pte32_t *)&buf;
- size_t len;
- if (htable->ht_flags & HTABLE_VLP) {
- uintptr_t ptr = (uintptr_t)hat->hat_vlp_ptes;
+ if (htable->ht_flags & HTABLE_COPIED) {
+ uintptr_t ptr = (uintptr_t)hat->hat_copied_ptes;
ptr += va2entry(htable, addr) << mmu.pte_size_shift;
- len = mdb_vread(&buf, mmu.pte_size, ptr);
- } else {
- paddr_t paddr = mmu_ptob((paddr_t)htable->ht_pfn);
- paddr += va2entry(htable, addr) << mmu.pte_size_shift;
- len = mdb_pread(&buf, mmu.pte_size, paddr);
+ return (*(x86pte_t *)ptr);
}
- if (len != mmu.pte_size)
- return (0);
+ paddr_t paddr = mmu_ptob((paddr_t)htable->ht_pfn);
+ paddr += va2entry(htable, addr) << mmu.pte_size_shift;
- if (mmu.pte_size == sizeof (x86pte_t))
+ if ((mdb_pread(&buf, mmu.pte_size, paddr)) == mmu.pte_size)
return (buf);
- return (*pte32);
+
+ return (0);
}
static int
@@ -621,8 +597,8 @@ do_va2pa(uintptr_t addr, struct as *asp, int print_level, physaddr_t *pap,
pte = get_pte(&hat, &htable, addr);
if (print_level) {
- mdb_printf("\tlevel=%d htable=%p "
- "pte=%llr\n", level, ht, pte);
+ mdb_printf("\tlevel=%d htable=0x%p "
+ "pte=0x%llr\n", level, ht, pte);
}
if (!PTE_ISVALID(pte)) {
@@ -725,8 +701,6 @@ do_report_maps(pfn_t pfn)
int level;
int entry;
x86pte_t pte;
- x86pte_t buf;
- x86pte32_t *pte32 = (x86pte32_t *)&buf;
physaddr_t paddr;
size_t len;
@@ -796,14 +770,10 @@ do_report_maps(pfn_t pfn)
base >= kernelbase)
continue;
- len = mdb_pread(&buf, mmu.pte_size,
+ len = mdb_pread(&pte, mmu.pte_size,
paddr + entry * mmu.pte_size);
if (len != mmu.pte_size)
return (DCMD_ERR);
- if (mmu.pte_size == sizeof (x86pte_t))
- pte = buf;
- else
- pte = *pte32;
if ((pte & PT_VALID) == 0)
continue;
@@ -854,7 +824,7 @@ report_maps_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
}
static int
-do_ptable_dcmd(pfn_t pfn)
+do_ptable_dcmd(pfn_t pfn, uint64_t level)
{
struct hat *hatp;
struct hat hat;
@@ -862,12 +832,10 @@ do_ptable_dcmd(pfn_t pfn)
htable_t htable;
uintptr_t base;
int h;
- int level;
int entry;
uintptr_t pagesize;
x86pte_t pte;
x86pte_t buf;
- x86pte32_t *pte32 = (x86pte32_t *)&buf;
physaddr_t paddr;
size_t len;
@@ -912,14 +880,21 @@ do_ptable_dcmd(pfn_t pfn)
found_it:
if (htable.ht_pfn == pfn) {
mdb_printf("htable=%p\n", ht);
- level = htable.ht_level;
+ if (level == (uint64_t)-1) {
+ level = htable.ht_level;
+ } else if (htable.ht_level != level) {
+ mdb_warn("htable has level %d but forcing level %lu\n",
+ htable.ht_level, level);
+ }
base = htable.ht_vaddr;
pagesize = mmu.level_size[level];
} else {
- mdb_printf("Unknown pagetable - assuming level/addr 0");
- level = 0; /* assume level == 0 for PFN */
+ if (level == (uint64_t)-1)
+ level = 0;
+ mdb_warn("couldn't find matching htable, using level=%lu, "
+ "base address=0x0\n", level);
base = 0;
- pagesize = MMU_PAGESIZE;
+ pagesize = mmu.level_size[level];
}
paddr = mmu_ptob((physaddr_t)pfn);
@@ -928,15 +903,13 @@ found_it:
paddr + entry * mmu.pte_size);
if (len != mmu.pte_size)
return (DCMD_ERR);
- if (mmu.pte_size == sizeof (x86pte_t))
pte = buf;
- else
- pte = *pte32;
if (pte == 0)
continue;
- mdb_printf("[%3d] va=%p ", entry, base + entry * pagesize);
+ mdb_printf("[%3d] va=0x%p ", entry,
+ VA_SIGN_EXTEND(base + entry * pagesize));
do_pte_dcmd(level, pte);
}
@@ -953,6 +926,7 @@ ptable_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
pfn_t pfn;
uint_t mflag = 0;
+ uint64_t level = (uint64_t)-1;
init_mmu();
@@ -963,14 +937,20 @@ ptable_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_USAGE);
if (mdb_getopts(argc, argv,
- 'm', MDB_OPT_SETBITS, TRUE, &mflag, NULL) != argc)
+ 'm', MDB_OPT_SETBITS, TRUE, &mflag,
+ 'l', MDB_OPT_UINT64, &level, NULL) != argc)
return (DCMD_USAGE);
+ if (level != (uint64_t)-1 && level > mmu.max_level) {
+ mdb_warn("invalid level %lu\n", level);
+ return (DCMD_ERR);
+ }
+
pfn = (pfn_t)addr;
if (mflag)
pfn = mdb_mfn_to_pfn(pfn);
- return (do_ptable_dcmd(pfn));
+ return (do_ptable_dcmd(pfn, level));
}
static int
@@ -1031,3 +1011,112 @@ htables_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (do_htables_dcmd(hat));
}
+
+static uintptr_t
+entry2va(size_t *entries)
+{
+ uintptr_t va = 0;
+
+ for (level_t l = mmu.max_level; l >= 0; l--)
+ va += entries[l] << mmu.level_shift[l];
+
+ return (VA_SIGN_EXTEND(va));
+}
+
+static void
+ptmap_report(size_t *entries, uintptr_t start,
+ boolean_t user, boolean_t writable, boolean_t wflag)
+{
+ uint64_t curva = entry2va(entries);
+
+ mdb_printf("mapped %s,%s range of %lu bytes: %a-%a\n",
+ user ? "user" : "kernel", writable ? "writable" : "read-only",
+ curva - start, start, curva - 1);
+ if (wflag && start >= kernelbase)
+ (void) mdb_call_dcmd("whatis", start, DCMD_ADDRSPEC, 0, NULL);
+}
+
+int
+ptmap_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ physaddr_t paddrs[MAX_NUM_LEVEL] = { 0, };
+ size_t entry[MAX_NUM_LEVEL] = { 0, };
+ uintptr_t start = (uintptr_t)-1;
+ boolean_t writable = B_FALSE;
+ boolean_t user = B_FALSE;
+ boolean_t wflag = B_FALSE;
+ level_t curlevel;
+
+ if ((flags & DCMD_ADDRSPEC) == 0)
+ return (DCMD_USAGE);
+
+ if (mdb_getopts(argc, argv,
+ 'w', MDB_OPT_SETBITS, TRUE, &wflag, NULL) != argc)
+ return (DCMD_USAGE);
+
+ init_mmu();
+
+ if (mmu.num_level == 0)
+ return (DCMD_ERR);
+
+ curlevel = mmu.max_level;
+
+ paddrs[curlevel] = addr & MMU_PAGEMASK;
+
+ for (;;) {
+ physaddr_t pte_addr;
+ x86pte_t pte;
+
+ pte_addr = paddrs[curlevel] +
+ (entry[curlevel] << mmu.pte_size_shift);
+
+ if (mdb_pread(&pte, sizeof (pte), pte_addr) != sizeof (pte)) {
+ mdb_warn("couldn't read pte at %p", pte_addr);
+ return (DCMD_ERR);
+ }
+
+ if (PTE_GET(pte, PT_VALID) == 0) {
+ if (start != (uintptr_t)-1) {
+ ptmap_report(entry, start,
+ user, writable, wflag);
+ start = (uintptr_t)-1;
+ }
+ } else if (curlevel == 0 || PTE_GET(pte, PT_PAGESIZE)) {
+ if (start == (uintptr_t)-1) {
+ start = entry2va(entry);
+ user = PTE_GET(pte, PT_USER);
+ writable = PTE_GET(pte, PT_WRITABLE);
+ } else if (user != PTE_GET(pte, PT_USER) ||
+ writable != PTE_GET(pte, PT_WRITABLE)) {
+ ptmap_report(entry, start,
+ user, writable, wflag);
+ start = entry2va(entry);
+ user = PTE_GET(pte, PT_USER);
+ writable = PTE_GET(pte, PT_WRITABLE);
+ }
+ } else {
+ /* Descend a level. */
+ physaddr_t pa = mmu_ptob(pte2mfn(pte, curlevel));
+ paddrs[--curlevel] = pa;
+ entry[curlevel] = 0;
+ continue;
+ }
+
+ while (++entry[curlevel] == mmu.ptes_per_table) {
+ /* Ascend back up. */
+ entry[curlevel] = 0;
+ if (curlevel == mmu.max_level) {
+ if (start != (uintptr_t)-1) {
+ ptmap_report(entry, start,
+ user, writable, wflag);
+ }
+ goto out;
+ }
+
+ curlevel++;
+ }
+ }
+
+out:
+ return (DCMD_OK);
+}
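
[Editor's illustration, not part of the commit] The new ::ptmap walker above rebuilds virtual addresses from per-level page-table indices (entry2va()) and canonicalizes them with the VA_SIGN_EXTEND macro added at the top of this file. A minimal standalone sketch of that arithmetic, assuming an LP64 environment and the usual 4-level x86-64 shifts (the dcmd itself reads these from the target's mmu state):

/*
 * Illustrative sketch only: sign-extend a 48-bit VA and rebuild a VA
 * from per-level table indices, as ::ptmap's entry2va() does.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define	VA_SIGN_BIT		(1UL << 47)
#define	VA_SIGN_EXTEND(va)	(((va) ^ VA_SIGN_BIT) - VA_SIGN_BIT)

/* PT, PD, PDPT, PML4 shifts; the real code reads mmu.level_shift[]. */
static const int level_shift[4] = { 12, 21, 30, 39 };

static uintptr_t
entry2va(const size_t *entries)
{
	uintptr_t va = 0;

	for (int l = 3; l >= 0; l--)
		va += (uintptr_t)entries[l] << level_shift[l];

	return (VA_SIGN_EXTEND(va));
}

int
main(void)
{
	/* PML4 slot 511 with all lower indices 0: bit 47 ends up set... */
	size_t entries[4] = { 0, 0, 0, 511 };

	/* ...so the result is sign-extended: prints 0xffffff8000000000 */
	printf("%#lx\n", (unsigned long)entry2va(entries));
	return (0);
}

With the walker in place, a page-table root can be examined as "<cr3 physical address>::ptmap [-w]" (see ptmap_help() added to unix.c below); without the sign extension, mappings in the kernel half of the address space would print as non-canonical addresses.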
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.h b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.h
index 3e5476e31e..8d794095c9 100644
--- a/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.h
+++ b/usr/src/cmd/mdb/i86pc/modules/unix/i86mmu.h
@@ -21,6 +21,8 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _I86MMU_H
@@ -42,6 +44,9 @@ extern int htables_dcmd(uintptr_t addr, uint_t flags, int argc,
extern int ptable_dcmd(uintptr_t addr, uint_t flags, int argc,
const mdb_arg_t *argv);
+extern int ptmap_dcmd(uintptr_t addr, uint_t flags, int argc,
+ const mdb_arg_t *argv);
+
extern int va2pfn_dcmd(uintptr_t addr, uint_t flags, int argc,
const mdb_arg_t *argv);
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c
index 5d8a0f222f..95e588eed6 100644
--- a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c
+++ b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c
@@ -432,6 +432,9 @@ ttrace_dumpregs(trap_trace_rec_t *rec)
mdb_printf(THREEREGS, DUMP(gs), "trp", regs->r_trapno, DUMP(err));
mdb_printf(THREEREGS, DUMP(rip), DUMP(cs), DUMP(rfl));
mdb_printf(THREEREGS, DUMP(rsp), DUMP(ss), "cr2", rec->ttr_cr2);
+ mdb_printf(" %3s: %16lx %3s: %16lx\n",
+ "fsb", regs->__r_fsbase,
+ "gsb", regs->__r_gsbase);
mdb_printf("\n");
}
@@ -753,7 +756,18 @@ ptable_help(void)
"Given a PFN holding a page table, print its contents, and\n"
"the address of the corresponding htable structure.\n"
"\n"
- "-m Interpret the PFN as an MFN (machine frame number)\n");
+ "-m Interpret the PFN as an MFN (machine frame number)\n"
+ "-l force page table level (3 is top)\n");
+}
+
+static void
+ptmap_help(void)
+{
+ mdb_printf(
+ "Report all mappings represented by the page table hierarchy\n"
+ "rooted at the given cr3 value / physical address.\n"
+ "\n"
+ "-w run ::whatis on mapping start addresses\n");
}
static const char *const scalehrtime_desc =
@@ -1000,10 +1014,10 @@ crregs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
mdb_printf("%%cr2 = 0x%08x <%a>\n", cr2, cr2);
if ((cr4 & CR4_PCIDE)) {
- mdb_printf("%%cr3 = 0x%08x <pfn:%lu pcid:%u>\n",
+ mdb_printf("%%cr3 = 0x%08x <pfn:0x%lx pcid:%u>\n",
cr3 >> MMU_PAGESHIFT, cr3 & MMU_PAGEOFFSET);
} else {
- mdb_printf("%%cr3 = 0x%08x <pfn:%lu flags:%b>\n", cr3,
+ mdb_printf("%%cr3 = 0x%08x <pfn:0x%lx flags:%b>\n", cr3,
cr3 >> MMU_PAGESHIFT, cr3, cr3_flag_bits);
}
@@ -1024,9 +1038,11 @@ static const mdb_dcmd_t dcmds[] = {
report_maps_dcmd, report_maps_help },
{ "htables", "", "Given hat_t *, lists all its htable_t * values",
htables_dcmd, htables_help },
- { "ptable", ":[-m]", "Given PFN, dump contents of a page table",
+ { "ptable", ":[-lm]", "Given PFN, dump contents of a page table",
ptable_dcmd, ptable_help },
- { "pte", ":[-p XXXXX] [-l N]", "print human readable page table entry",
+ { "ptmap", ":", "Given a cr3 value, dump all mappings",
+ ptmap_dcmd, ptmap_help },
+ { "pte", ":[-l N]", "print human readable page table entry",
pte_dcmd },
{ "pfntomfn", ":", "convert physical page to hypervisor machine page",
pfntomfn_dcmd },
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/unix_sup.s b/usr/src/cmd/mdb/i86pc/modules/unix/unix_sup.s
index 407123c7e0..38ddf5cf44 100644
--- a/usr/src/cmd/mdb/i86pc/modules/unix/unix_sup.s
+++ b/usr/src/cmd/mdb/i86pc/modules/unix/unix_sup.s
@@ -32,6 +32,10 @@ kmdb_unix_getcr0(void)
{ return (0); }
ulong_t
+kmdb_unix_getcr3(void)
+{ return (0); }
+
+ulong_t
kmdb_unix_getcr4(void)
{ return (0); }
diff --git a/usr/src/cmd/mdb/intel/kmdb/kaif.c b/usr/src/cmd/mdb/intel/kmdb/kaif.c
index c1be6aae0f..dda6a94ea6 100644
--- a/usr/src/cmd/mdb/intel/kmdb/kaif.c
+++ b/usr/src/cmd/mdb/intel/kmdb/kaif.c
@@ -50,6 +50,7 @@
#include <sys/bitmap.h>
#include <sys/termios.h>
#include <sys/kdi_impl.h>
+#include <sys/sysmacros.h>
/*
* This is the area containing the saved state when we enter
@@ -256,11 +257,42 @@ kaif_set_register(const char *regname, kreg_t val)
return (0);
}
+/*
+ * Refuse to single-step or break within any stub that loads a user %cr3 value.
+ * As the KDI traps are not careful to restore such a %cr3, this can all go
+ * wrong, both spectacularly and subtly.
+ */
+static boolean_t
+kaif_toxic_text(uintptr_t addr)
+{
+ static GElf_Sym toxic_syms[1] = { 0, };
+ size_t i;
+
+ if (toxic_syms[0].st_name == NULL) {
+ if (mdb_tgt_lookup_by_name(mdb.m_target, MDB_TGT_OBJ_EXEC,
+ "tr_iret_user", &toxic_syms[0], NULL) != 0)
+ warn("couldn't find tr_iret_user\n");
+ }
+
+ for (i = 0; i < ARRAY_SIZE(toxic_syms); i++) {
+ if (addr >= toxic_syms[i].st_value &&
+ addr - toxic_syms[i].st_value < toxic_syms[i].st_size)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
static int
kaif_brkpt_arm(uintptr_t addr, mdb_instr_t *instrp)
{
mdb_instr_t bkpt = KAIF_BREAKPOINT_INSTR;
+ if (kaif_toxic_text(addr)) {
+ warn("%a cannot be a breakpoint target\n", addr);
+ return (set_errno(EMDB_TGTNOTSUP));
+ }
+
if (mdb_tgt_vread(mdb.m_target, instrp, sizeof (mdb_instr_t), addr) !=
sizeof (mdb_instr_t))
return (-1); /* errno is set for us */
@@ -445,6 +477,11 @@ kaif_step(void)
(void) kmdb_dpi_get_register("pc", &pc);
+ if (kaif_toxic_text(pc)) {
+ warn("%a cannot be stepped\n", pc);
+ return (set_errno(EMDB_TGTNOTSUP));
+ }
+
if ((npc = mdb_dis_nextins(mdb.m_disasm, mdb.m_target,
MDB_TGT_AS_VIRT, pc)) == pc) {
warn("failed to decode instruction at %a for step\n", pc);
diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c b/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c
index 2465146a38..22c3d1dc6a 100644
--- a/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c
+++ b/usr/src/cmd/mdb/intel/mdb/mdb_amd64util.c
@@ -24,7 +24,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
@@ -133,6 +133,9 @@ const mdb_tgt_regdesc_t mdb_amd64_kregs[] = {
{ "sp", KREG_RSP, MDB_TGT_R_EXPORT | MDB_TGT_R_16 },
{ "spl", KREG_RSP, MDB_TGT_R_EXPORT | MDB_TGT_R_8L },
{ "ss", KREG_SS, MDB_TGT_R_EXPORT },
+ { "gsbase", KREG_GSBASE, MDB_TGT_R_EXPORT },
+ { "kgsbase", KREG_KGSBASE, MDB_TGT_R_EXPORT },
+ { "cr2", KREG_CR2, MDB_TGT_R_EXPORT },
{ NULL, 0, 0 }
};
@@ -186,13 +189,13 @@ mdb_amd64_printregs(const mdb_tgt_gregset_t *gregs)
(rflags & KREG_EFLAGS_PF_MASK) ? "PF" : "pf",
(rflags & KREG_EFLAGS_CF_MASK) ? "CF" : "cf");
- mdb_printf("%24s%%cs = 0x%04x\t%%ds = 0x%04x\t%%es = 0x%04x\n",
- " ", kregs[KREG_CS], kregs[KREG_DS], kregs[KREG_ES]);
-
- mdb_printf("%%trapno = 0x%x\t\t%%fs = 0x%04x\t%%gs = 0x%04x\n",
- kregs[KREG_TRAPNO], (kregs[KREG_FS] & 0xffff),
- (kregs[KREG_GS] & 0xffff));
- mdb_printf(" %%err = 0x%x\n", kregs[KREG_ERR]);
+ mdb_printf("%%cs = 0x%04x\t%%ds = 0x%04x\t"
+ "%%es = 0x%04x\t%%fs = 0x%04x\n", kregs[KREG_CS], kregs[KREG_DS],
+ kregs[KREG_ES], kregs[KREG_FS] & 0xffff);
+ mdb_printf("%%gs = 0x%04x\t%%gsbase = 0x%lx\t%%kgsbase = 0x%lx\n",
+ kregs[KREG_GS] & 0xffff, kregs[KREG_GSBASE], kregs[KREG_KGSBASE]);
+ mdb_printf("%%trapno = 0x%x\t%%err = 0x%x\t%%cr2 = 0x%lx\n",
+ kregs[KREG_TRAPNO], kregs[KREG_ERR], kregs[KREG_CR2]);
}
int
diff --git a/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h b/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h
index 4ba5fb567c..8bee68b379 100644
--- a/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h
+++ b/usr/src/cmd/mdb/intel/mdb/mdb_kreg.h
@@ -21,13 +21,13 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _MDB_KREG_H
#define _MDB_KREG_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/kdi_regs.h>
#ifndef _ASM
#include <sys/types.h>
@@ -75,8 +75,11 @@ typedef uint32_t kreg_t;
#define KREG_ES KDIREG_ES
#define KREG_FS KDIREG_FS
#define KREG_GS KDIREG_GS
+#define KREG_GSBASE KDIREG_GSBASE
+#define KREG_KGSBASE KDIREG_KGSBASE
#define KREG_TRAPNO KDIREG_TRAPNO
#define KREG_ERR KDIREG_ERR
+#define KREG_CR2 KDIREG_CR2
#define KREG_RIP KDIREG_RIP
#define KREG_CS KDIREG_CS
#define KREG_RFLAGS KDIREG_RFLAGS
diff --git a/usr/src/uts/common/sys/sysmacros.h b/usr/src/uts/common/sys/sysmacros.h
index 394a716a02..6f5882b54b 100644
--- a/usr/src/uts/common/sys/sysmacros.h
+++ b/usr/src/uts/common/sys/sysmacros.h
@@ -27,6 +27,8 @@
* Use is subject to license terms.
*
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ *
+ * Copyright 2018 Joyent Inc.
*/
#ifndef _SYS_SYSMACROS_H
@@ -367,12 +369,9 @@ extern unsigned char bcd_to_byte[256];
#error One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined
#endif /* _BIT_FIELDS_LTOH */
-/* avoid any possibility of clashing with <stddef.h> version */
-#if (defined(_KERNEL) || defined(_FAKE_KERNEL)) && !defined(_KMEMUSER)
-
+#if !defined(ARRAY_SIZE)
#define ARRAY_SIZE(x) (sizeof (x) / sizeof (x[0]))
-
-#endif /* _KERNEL, !_KMEMUSER */
+#endif
#ifdef __cplusplus
}
diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files
index 34db892539..7fc3cfec14 100644
--- a/usr/src/uts/i86pc/Makefile.files
+++ b/usr/src/uts/i86pc/Makefile.files
@@ -75,7 +75,6 @@ CORE_OBJS += \
instr_size.o \
intr.o \
kboot_mmu.o \
- kdi_subr.o \
kdi_idt.o \
kdi_idthdl.o \
kdi_asm.o \
@@ -160,7 +159,8 @@ SPECIAL_OBJS_64 += \
locore.o \
fast_trap_asm.o \
interrupt.o \
- syscall_asm_amd64.o
+ syscall_asm_amd64.o \
+ kpti_trampolines.o
SPECIAL_OBJS += $(SPECIAL_OBJS_$(CLASS))
@@ -307,10 +307,9 @@ ASSYM_DEPS += \
swtch.o \
syscall_asm.o \
syscall_asm_amd64.o \
+ kpti_trampolines.o \
cpr_wakecode.o
CPR_IMPL_OBJS = cpr_impl.o cpr_wakecode.o
$(KDI_ASSYM_DEPS:%=$(OBJS_DIR)/%): $(DSF_DIR)/$(OBJS_DIR)/kdi_assym.h
-
-ASSYM_DEPS += kdi_asm.o
diff --git a/usr/src/uts/i86pc/ml/genassym.c b/usr/src/uts/i86pc/ml/genassym.c
index 088dd661a3..6d840368d7 100644
--- a/usr/src/uts/i86pc/ml/genassym.c
+++ b/usr/src/uts/i86pc/ml/genassym.c
@@ -20,6 +20,8 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _GENASSYM
@@ -68,8 +70,6 @@ extern void exit(int);
int
main(int argc, char *argv[])
{
- printf("#define\tT_AST 0x%x\n", T_AST);
-
printf("#define\tLOCK_LEVEL 0x%x\n", LOCK_LEVEL);
printf("#define\tCLOCK_LEVEL 0x%x\n", CLOCK_LEVEL);
printf("#define\tDISP_LEVEL 0x%x\n", DISP_LEVEL);
@@ -109,20 +109,6 @@ main(int argc, char *argv[])
printf("#define\tSSE_MXCSR_EFLAGS 0x%x\n", SSE_MXCSR_EFLAGS);
- printf("#define\tFP_487 0x%x\n", FP_487);
- printf("#define\tFP_486 0x%x\n", FP_486);
- printf("#define\tFPU_CW_INIT 0x%x\n", FPU_CW_INIT);
- printf("#define\tFPU_EN 0x%x\n", FPU_EN);
- printf("#define\tFPU_VALID 0x%x\n", FPU_VALID);
-
- printf("#define\tFP_NO 0x%x\n", FP_NO);
- printf("#define\tFP_SW 0x%x\n", FP_SW);
- printf("#define\tFP_HW 0x%x\n", FP_HW);
- printf("#define\tFP_287 0x%x\n", FP_287);
- printf("#define\tFP_387 0x%x\n", FP_387);
- printf("#define\t__FP_SSE 0x%x\n", __FP_SSE);
-
- printf("#define\tFP_FNSAVE 0x%x\n", FP_FNSAVE);
printf("#define\tFP_FXSAVE 0x%x\n", FP_FXSAVE);
printf("#define\tFP_XSAVE 0x%x\n", FP_XSAVE);
@@ -154,11 +140,6 @@ main(int argc, char *argv[])
printf("#define\tNSEC_PER_COUNTER_TICK 0x%llx\n", NANOSEC / PIT_HZ);
- printf("#define\tPITCTR0_PORT 0x%x\n", PITCTR0_PORT);
- printf("#define\tPITCTL_PORT 0x%x\n", PITCTL_PORT);
- printf("#define\tPIT_COUNTDOWN 0x%x\n",
- PIT_C0 | PIT_LOADMODE | PIT_NDIVMODE);
-
printf("#define\tNBPW 0x%x\n", NBPW);
printf("#define\tDDI_ACCATTR_IO_SPACE 0x%x\n", DDI_ACCATTR_IO_SPACE);
diff --git a/usr/src/uts/i86pc/ml/kdi_subr.s b/usr/src/uts/i86pc/ml/kdi_subr.s
deleted file mode 100644
index 8ed90ed410..0000000000
--- a/usr/src/uts/i86pc/ml/kdi_subr.s
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/asm_linkage.h>
-#include <sys/asm_misc.h>
-#include <sys/regset.h>
-#include <sys/privregs.h>
-#include <sys/psw.h>
-
-#if defined(__lint)
-#include <sys/types.h>
-#include <sys/segments.h>
-#endif
-
-#if defined(__lint)
-
-ulong_t
-kdi_getdr0(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr1(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr2(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr3(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr6(void)
-{
- return (0);
-}
-
-ulong_t
-kdi_getdr7(void)
-{
- return (0);
-}
-
-/*ARGSUSED*/
-void
-kdi_setdr0(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr1(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr2(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr3(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr4(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr6(ulong_t value)
-{}
-
-/*ARGSUSED*/
-void
-kdi_setdr7(ulong_t value)
-{}
-
-#else
-
-#if defined(__amd64)
-
-#define GETDREG(name, r) \
- ENTRY_NP(name); \
- movq r, %rax; \
- ret; \
- SET_SIZE(name)
-
-#define SETDREG(name, r) \
- ENTRY_NP(name); \
- movq %rdi, r; \
- ret; \
- SET_SIZE(name)
-
-#elif defined(__i386)
-
-#define GETDREG(name, r) \
- ENTRY_NP(name); \
- movl r, %eax; \
- ret; \
- SET_SIZE(name)
-
-#define SETDREG(name, r) \
- ENTRY_NP(name); \
- movl 4(%esp), %eax; \
- movl %eax, r; \
- ret; \
- SET_SIZE(name)
-
-#endif
-
- GETDREG(kdi_getdr0, %dr0)
- GETDREG(kdi_getdr1, %dr1)
- GETDREG(kdi_getdr2, %dr2)
- GETDREG(kdi_getdr3, %dr3)
- GETDREG(kdi_getdr6, %dr6)
- GETDREG(kdi_getdr7, %dr7)
-
- SETDREG(kdi_setdr0, %dr0)
- SETDREG(kdi_setdr1, %dr1)
- SETDREG(kdi_setdr2, %dr2)
- SETDREG(kdi_setdr3, %dr3)
- SETDREG(kdi_setdr6, %dr6)
- SETDREG(kdi_setdr7, %dr7)
-
-#endif /* __lint */
diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s
new file mode 100644
index 0000000000..c05718c3ad
--- /dev/null
+++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s
@@ -0,0 +1,713 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * This file contains the trampolines that are used by KPTI in order to be
+ * able to take interrupts/trap/etc while on the "user" page table.
+ *
+ * We don't map the full kernel text into the user page table: instead we
+ * map this one small section of trampolines (which compiles to ~13 pages).
+ * These trampolines are set in the IDT always (so they will run no matter
+ * whether we're on the kernel or user page table), and their primary job is to
+ * pivot us to the kernel %cr3 and %rsp without ruining everything.
+ *
+ * All of these interrupts use the amd64 IST feature when we have KPTI enabled,
+ * meaning that they will execute with their %rsp set to a known location, even
+ * if we take them in the kernel.
+ *
+ * Over in desctbls.c (for cpu0) and mp_pc.c (other cpus) we set up the IST
+ * stack to point at &cpu->cpu_m.mcpu_kpti.kf_tr_rsp. You can see the mcpu_kpti
+ * (a struct kpti_frame) defined in machcpuvar.h. This struct is set up to be
+ * page-aligned, and we map the page it's on into both page tables. Using a
+ * struct attached to the cpu_t also means that we can use %rsp-relative
+ * addressing to find anything on the cpu_t, so we don't have to touch %gs or
+ * GSBASE at all on incoming interrupt trampolines (which can get pretty hairy).
+ *
+ * This little struct is where the CPU will push the actual interrupt frame.
+ * Then, in the trampoline, we change %cr3, then figure out our destination
+ * stack pointer and "pivot" to it (set %rsp and re-push the CPU's interrupt
+ * frame). Then we jump to the regular ISR in the kernel text and carry on as
+ * normal.
+ *
+ * We leave the original frame and any spilled regs behind in the kpti_frame
+ * lazily until we want to return to userland. Then, we clear any spilled
+ * regs from it, and overwrite the rest with our iret frame. When switching
+ * this cpu to a different process (in hat_switch), we bzero the whole region to
+ * make sure nothing can leak between processes.
+ *
+ * When we're returning back to the original place we took the interrupt later
+ * (especially if it was in userland), we have to jmp back to the "return
+ * trampolines" here, since when we set %cr3 back to the user value, we need to
+ * be executing from code here in these shared pages and not the main kernel
+ * text again. Even though it should be fine to iret directly from kernel text
+ * when returning to kernel code, we make things jmp to a trampoline here just
+ * for consistency.
+ *
+ * Note that with IST, it's very important that we always must have pivoted
+ * away from the IST stack before we could possibly take any other interrupt
+ * on the same IST (unless it's an end-of-the-world fault and we don't care
+ * about coming back from it ever).
+ *
+ * This is particularly relevant to the dbgtrap/brktrap trampolines, as they
+ * regularly have to happen from within trampoline code (e.g. in the sysenter
+ * single-step case) and then return to the world normally. As a result, these
+ * two are IST'd to their own kpti_frame right above the normal one (in the same
+ * page), so they don't clobber their parent interrupt.
+ *
+ * To aid with debugging, we also IST the page fault (#PF/pftrap), general
+ * protection fault (#GP/gptrap) and stack fault (#SS/stktrap) interrupts to
+ * their own separate kpti_frame. This ensures that if we take one of these
+ * due to a bug in trampoline code, we preserve the original trampoline
+ * state that caused the trap.
+ *
+ * NMI, MCE and dblfault interrupts also are taken on their own dedicated IST
+ * stacks, since they can interrupt another ISR at any time. These stacks are
+ * full-sized, however, and not a little kpti_frame struct. We only set %cr3 in
+ * their trampolines (and do it unconditionally), and don't bother pivoting
+ * away. We're either going into the panic() path, or we're going to return
+ * straight away without rescheduling, so it's fine to not be on our real
+ * kthread stack (and some of the state we want to go find it with might be
+ * corrupt!)
+ *
+ * Finally, for these "special" interrupts (NMI/MCE/double fault) we use a
+ * special %cr3 value we stash here in the text (kpti_safe_cr3). We set this to
+ * point at the PML4 for kas early in boot and never touch it again. Hopefully
+ * it survives whatever corruption brings down the rest of the kernel!
+ *
+ * Syscalls are different to interrupts (at least in the SYSENTER/SYSCALL64
+ * cases) in that they do not push an interrupt frame (and also have some other
+ * effects). In the syscall trampolines, we assume that we can only be taking
+ * the call from userland and use SWAPGS and an unconditional overwrite of %cr3.
+ * We do not do any stack pivoting for syscalls (and we leave SYSENTER's
+ * existing %rsp pivot untouched) -- instead we spill registers into
+ * %gs:CPU_KPTI_* as we need to.
+ */
+
+/*
+ * The macros here mostly line up with what's in kdi_idthdl.s, too, so if you
+ * fix bugs here check to see if they should be fixed there as well.
+ */
+
+#include <sys/asm_linkage.h>
+#include <sys/asm_misc.h>
+#include <sys/regset.h>
+#include <sys/privregs.h>
+#include <sys/psw.h>
+#include <sys/machbrand.h>
+#include <sys/param.h>
+
+#if defined(__lint)
+
+#include <sys/types.h>
+#include <sys/thread.h>
+#include <sys/systm.h>
+
+#else /* __lint */
+
+#include <sys/segments.h>
+#include <sys/pcb.h>
+#include <sys/trap.h>
+#include <sys/ftrace.h>
+#include <sys/traptrace.h>
+#include <sys/clock.h>
+#include <sys/model.h>
+#include <sys/panic.h>
+
+#if defined(__xpv)
+#include <sys/hypervisor.h>
+#endif
+
+#include "assym.h"
+
+ .data
+ DGDEF3(kpti_enable, 8, 8)
+ .fill 1, 8, 1
+
+.section ".text";
+.align MMU_PAGESIZE
+
+.global kpti_tramp_start
+kpti_tramp_start:
+ nop
+
+/* This will be set by mlsetup, and then double-checked later */
+.global kpti_safe_cr3
+kpti_safe_cr3:
+ .quad 0
+ SET_SIZE(kpti_safe_cr3)
+
+/* startup_kmem() will overwrite this */
+.global kpti_kbase
+kpti_kbase:
+ .quad KERNELBASE
+ SET_SIZE(kpti_kbase)
+
+#define SET_KERNEL_CR3(spillreg) \
+ mov %cr3, spillreg; \
+ mov spillreg, %gs:CPU_KPTI_TR_CR3; \
+ mov %gs:CPU_KPTI_KCR3, spillreg; \
+ cmp $0, spillreg; \
+ je 2f; \
+ mov spillreg, %cr3; \
+2:
+
+#if DEBUG
+#define SET_USER_CR3(spillreg) \
+ mov %cr3, spillreg; \
+ mov spillreg, %gs:CPU_KPTI_TR_CR3; \
+ mov %gs:CPU_KPTI_UCR3, spillreg; \
+ mov spillreg, %cr3
+#else
+#define SET_USER_CR3(spillreg) \
+ mov %gs:CPU_KPTI_UCR3, spillreg; \
+ mov spillreg, %cr3
+#endif
+
+#define PIVOT_KPTI_STK(spillreg) \
+ mov %rsp, spillreg; \
+ mov %gs:CPU_KPTI_RET_RSP, %rsp; \
+ pushq T_FRAMERET_SS(spillreg); \
+ pushq T_FRAMERET_RSP(spillreg); \
+ pushq T_FRAMERET_RFLAGS(spillreg); \
+ pushq T_FRAMERET_CS(spillreg); \
+ pushq T_FRAMERET_RIP(spillreg)
+
+
+#define INTERRUPT_TRAMPOLINE_P(errpush) \
+ pushq %r13; \
+ pushq %r14; \
+ subq $KPTI_R14, %rsp; \
+ /* Save current %cr3. */ \
+ mov %cr3, %r14; \
+ mov %r14, KPTI_TR_CR3(%rsp); \
+ \
+ cmpw $KCS_SEL, KPTI_CS(%rsp); \
+ je 3f; \
+1: \
+ /* Change to the "kernel" %cr3 */ \
+ mov KPTI_KCR3(%rsp), %r14; \
+ cmp $0, %r14; \
+ je 2f; \
+ mov %r14, %cr3; \
+2: \
+ /* Get our cpu_t in %r13 */ \
+ mov %rsp, %r13; \
+ and $(~(MMU_PAGESIZE - 1)), %r13; \
+ subq $CPU_KPTI_START, %r13; \
+ /* Use top of the kthread stk */ \
+ mov CPU_THREAD(%r13), %r14; \
+ mov T_STACK(%r14), %r14; \
+ addq $REGSIZE+MINFRAME, %r14; \
+ jmp 4f; \
+3: \
+ /* Check the %rsp in the frame. */ \
+ /* Is it above kernel base? */ \
+ mov kpti_kbase, %r14; \
+ cmp %r14, KPTI_RSP(%rsp); \
+ jb 1b; \
+ /* Use the %rsp from the trap frame */ \
+ mov KPTI_RSP(%rsp), %r14; \
+ and $(~0xf), %r14; \
+4: \
+ mov %rsp, %r13; \
+ /* %r14 contains our destination stk */ \
+ mov %r14, %rsp; \
+ pushq KPTI_SS(%r13); \
+ pushq KPTI_RSP(%r13); \
+ pushq KPTI_RFLAGS(%r13); \
+ pushq KPTI_CS(%r13); \
+ pushq KPTI_RIP(%r13); \
+ errpush; \
+ mov KPTI_R14(%r13), %r14; \
+ mov KPTI_R13(%r13), %r13
+
+#define INTERRUPT_TRAMPOLINE_NOERR \
+ INTERRUPT_TRAMPOLINE_P(/**/)
+
+#define INTERRUPT_TRAMPOLINE \
+ INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
+
+/*
+ * This is used for all interrupts that can plausibly be taken inside another
+ * interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
+ *
+ * We check for whether we took the interrupt while in another trampoline, in
+ * which case we need to use the kthread stack.
+ */
+#define DBG_INTERRUPT_TRAMPOLINE_P(errpush) \
+ pushq %r13; \
+ pushq %r14; \
+ subq $KPTI_R14, %rsp; \
+ /* Check for clobbering */ \
+ cmp $0, KPTI_FLAG(%rsp); \
+ je 1f; \
+ /* Don't worry, this totally works */ \
+ int $8; \
+1: \
+ movq $1, KPTI_FLAG(%rsp); \
+ /* Save current %cr3. */ \
+ mov %cr3, %r14; \
+ mov %r14, KPTI_TR_CR3(%rsp); \
+ \
+ cmpw $KCS_SEL, KPTI_CS(%rsp); \
+ je 4f; \
+2: \
+ /* Change to the "kernel" %cr3 */ \
+ mov KPTI_KCR3(%rsp), %r14; \
+ cmp $0, %r14; \
+ je 3f; \
+ mov %r14, %cr3; \
+3: \
+ /* Get our cpu_t in %r13 */ \
+ mov %rsp, %r13; \
+ and $(~(MMU_PAGESIZE - 1)), %r13; \
+ subq $CPU_KPTI_START, %r13; \
+ /* Use top of the kthread stk */ \
+ mov CPU_THREAD(%r13), %r14; \
+ mov T_STACK(%r14), %r14; \
+ addq $REGSIZE+MINFRAME, %r14; \
+ jmp 6f; \
+4: \
+ /* Check the %rsp in the frame. */ \
+ /* Is it above kernel base? */ \
+ /* If not, treat as user. */ \
+ mov kpti_kbase, %r14; \
+ cmp %r14, KPTI_RSP(%rsp); \
+ jb 2b; \
+ /* Is it within the kpti_frame page? */ \
+ /* If it is, treat as user interrupt */ \
+ mov %rsp, %r13; \
+ and $(~(MMU_PAGESIZE - 1)), %r13; \
+ mov KPTI_RSP(%rsp), %r14; \
+ and $(~(MMU_PAGESIZE - 1)), %r14; \
+ cmp %r13, %r14; \
+ je 2b; \
+ /* Were we in trampoline code? */ \
+ leaq kpti_tramp_start, %r14; \
+ cmp %r14, KPTI_RIP(%rsp); \
+ jb 5f; \
+ leaq kpti_tramp_end, %r14; \
+ cmp %r14, KPTI_RIP(%rsp); \
+ ja 5f; \
+ /* If we were, change %cr3: we might */ \
+ /* have interrupted before it did. */ \
+ mov KPTI_KCR3(%rsp), %r14; \
+ mov %r14, %cr3; \
+5: \
+ /* Use the %rsp from the trap frame */ \
+ mov KPTI_RSP(%rsp), %r14; \
+ and $(~0xf), %r14; \
+6: \
+ mov %rsp, %r13; \
+ /* %r14 contains our destination stk */ \
+ mov %r14, %rsp; \
+ pushq KPTI_SS(%r13); \
+ pushq KPTI_RSP(%r13); \
+ pushq KPTI_RFLAGS(%r13); \
+ pushq KPTI_CS(%r13); \
+ pushq KPTI_RIP(%r13); \
+ errpush; \
+ mov KPTI_R14(%r13), %r14; \
+ movq $0, KPTI_FLAG(%r13); \
+ mov KPTI_R13(%r13), %r13
+
+#define DBG_INTERRUPT_TRAMPOLINE_NOERR \
+ DBG_INTERRUPT_TRAMPOLINE_P(/**/)
+
+#define DBG_INTERRUPT_TRAMPOLINE \
+ DBG_INTERRUPT_TRAMPOLINE_P(pushq KPTI_ERR(%r13))
+
+ /*
+ * These labels (_start and _end) are used by trap.c to determine if
+ * we took an interrupt like an NMI during the return process.
+ */
+.global tr_sysc_ret_start
+tr_sysc_ret_start:
+
+ /*
+ * Syscall return trampolines.
+ *
+ * These are expected to be called on the kernel %gs. tr_sysret[ql] are
+ * called after %rsp is changed back to the user value, so we have no
+ * stack to work with. tr_sysexit has a kernel stack (but has to
+ * preserve rflags, soooo).
+ */
+ ENTRY_NP(tr_sysretq)
+ cmpq $1, kpti_enable
+ jne 1f
+
+ mov %r13, %gs:CPU_KPTI_R13
+ SET_USER_CR3(%r13)
+ mov %gs:CPU_KPTI_R13, %r13
+ /* Zero these to make sure they didn't leak from a kernel trap */
+ movq $0, %gs:CPU_KPTI_R13
+ movq $0, %gs:CPU_KPTI_R14
+1:
+ swapgs
+ sysretq
+ SET_SIZE(tr_sysretq)
+
+ ENTRY_NP(tr_sysretl)
+ cmpq $1, kpti_enable
+ jne 1f
+
+ mov %r13, %gs:CPU_KPTI_R13
+ SET_USER_CR3(%r13)
+ mov %gs:CPU_KPTI_R13, %r13
+ /* Zero these to make sure they didn't leak from a kernel trap */
+ movq $0, %gs:CPU_KPTI_R13
+ movq $0, %gs:CPU_KPTI_R14
+1:
+ SWAPGS
+ SYSRETL
+ SET_SIZE(tr_sysretl)
+
+ ENTRY_NP(tr_sysexit)
+ /*
+ * Note: we want to preserve RFLAGS across this branch, since sysexit
+ * (unlike sysret above) does not restore RFLAGS for us.
+ *
+ * We still have the real kernel stack (sysexit does restore that), so
+ * we can use pushfq/popfq.
+ */
+ pushfq
+
+ cmpq $1, kpti_enable
+ jne 1f
+
+ /* Have to pop it back off now before we change %cr3! */
+ popfq
+ mov %r13, %gs:CPU_KPTI_R13
+ SET_USER_CR3(%r13)
+ mov %gs:CPU_KPTI_R13, %r13
+ /* Zero these to make sure they didn't leak from a kernel trap */
+ movq $0, %gs:CPU_KPTI_R13
+ movq $0, %gs:CPU_KPTI_R14
+ jmp 2f
+1:
+ popfq
+2:
+ swapgs
+ sti
+ sysexit
+ SET_SIZE(tr_sysexit)
+
+.global tr_sysc_ret_end
+tr_sysc_ret_end:
+
+ /*
+ * Syscall entry trampolines.
+ */
+
+#if DEBUG
+#define MK_SYSCALL_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ swapgs; \
+ mov %r13, %gs:CPU_KPTI_R13; \
+ mov %cr3, %r13; \
+ mov %r13, %gs:CPU_KPTI_TR_CR3; \
+ mov %gs:CPU_KPTI_KCR3, %r13; \
+ mov %r13, %cr3; \
+ mov %gs:CPU_KPTI_R13, %r13; \
+ swapgs; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+#else
+#define MK_SYSCALL_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ swapgs; \
+ mov %r13, %gs:CPU_KPTI_R13; \
+ mov %gs:CPU_KPTI_KCR3, %r13; \
+ mov %r13, %cr3; \
+ mov %gs:CPU_KPTI_R13, %r13; \
+ swapgs; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+#endif
+
+ MK_SYSCALL_TRAMPOLINE(sys_syscall)
+ MK_SYSCALL_TRAMPOLINE(sys_syscall32)
+ MK_SYSCALL_TRAMPOLINE(brand_sys_syscall)
+ MK_SYSCALL_TRAMPOLINE(brand_sys_syscall32)
+
+ /*
+ * SYSENTER is special. The CPU is really not very helpful when it
+ * comes to preserving and restoring state with it, and as a result
+ * we have to do all of it by hand. So, since we want to preserve
+ * RFLAGS, we have to be very careful in these trampolines to not
+ * clobber any bits in it. That means no cmpqs or branches!
+ */
+ ENTRY_NP(tr_sys_sysenter)
+ swapgs
+ mov %r13, %gs:CPU_KPTI_R13
+#if DEBUG
+ mov %cr3, %r13
+ mov %r13, %gs:CPU_KPTI_TR_CR3
+#endif
+ mov %gs:CPU_KPTI_KCR3, %r13
+ mov %r13, %cr3
+ mov %gs:CPU_KPTI_R13, %r13
+ jmp _sys_sysenter_post_swapgs
+ SET_SIZE(tr_sys_sysenter)
+
+ ENTRY_NP(tr_brand_sys_sysenter)
+ swapgs
+ mov %r13, %gs:CPU_KPTI_R13
+#if DEBUG
+ mov %cr3, %r13
+ mov %r13, %gs:CPU_KPTI_TR_CR3
+#endif
+ mov %gs:CPU_KPTI_KCR3, %r13
+ mov %r13, %cr3
+ mov %gs:CPU_KPTI_R13, %r13
+ jmp _brand_sys_sysenter_post_swapgs
+ SET_SIZE(tr_brand_sys_sysenter)
+
+#define MK_SYSCALL_INT_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ swapgs; \
+ mov %r13, %gs:CPU_KPTI_R13; \
+ SET_KERNEL_CR3(%r13); \
+ mov %gs:CPU_THREAD, %r13; \
+ mov T_STACK(%r13), %r13; \
+ addq $REGSIZE+MINFRAME, %r13; \
+ mov %r13, %rsp; \
+ pushq %gs:CPU_KPTI_SS; \
+ pushq %gs:CPU_KPTI_RSP; \
+ pushq %gs:CPU_KPTI_RFLAGS; \
+ pushq %gs:CPU_KPTI_CS; \
+ pushq %gs:CPU_KPTI_RIP; \
+ mov %gs:CPU_KPTI_R13, %r13; \
+ SWAPGS; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+ MK_SYSCALL_INT_TRAMPOLINE(brand_sys_syscall_int)
+ MK_SYSCALL_INT_TRAMPOLINE(sys_syscall_int)
+
+ /*
+ * Interrupt/trap return trampolines
+ */
+
+.global tr_intr_ret_start
+tr_intr_ret_start:
+
+ ENTRY_NP(tr_iret_auto)
+ cmpq $1, kpti_enable
+ jne tr_iret_kernel
+ cmpw $KCS_SEL, T_FRAMERET_CS(%rsp)
+ je tr_iret_kernel
+ jmp tr_iret_user
+ SET_SIZE(tr_iret_auto)
+
+ ENTRY_NP(tr_iret_kernel)
+ /*
+ * Yes, this does nothing extra. But this way we know if we see iret
+ * elsewhere, then we've failed to properly consider trampolines there.
+ */
+ iretq
+ SET_SIZE(tr_iret_kernel)
+
+ ENTRY_NP(tr_iret_user)
+ cmpq $1, kpti_enable
+ jne 1f
+
+ swapgs
+ mov %r13, %gs:CPU_KPTI_R13
+ PIVOT_KPTI_STK(%r13)
+ SET_USER_CR3(%r13)
+ mov %gs:CPU_KPTI_R13, %r13
+ /* Zero these to make sure they didn't leak from a kernel trap */
+ movq $0, %gs:CPU_KPTI_R13
+ movq $0, %gs:CPU_KPTI_R14
+ swapgs
+1:
+ iretq
+ SET_SIZE(tr_iret_user)
+
+.global tr_intr_ret_end
+tr_intr_ret_end:
+
+ /*
+ * Interrupt/trap entry trampolines
+ */
+
+ /* CPU pushed an error code, and ISR wants one */
+#define MK_INTR_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ INTERRUPT_TRAMPOLINE; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+ /* CPU didn't push an error code, and ISR doesn't want one */
+#define MK_INTR_TRAMPOLINE_NOERR(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ push $0; \
+ INTERRUPT_TRAMPOLINE_NOERR; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+ /* CPU pushed an error code, and ISR wants one */
+#define MK_DBG_INTR_TRAMPOLINE(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ DBG_INTERRUPT_TRAMPOLINE; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+ /* CPU didn't push an error code, and ISR doesn't want one */
+#define MK_DBG_INTR_TRAMPOLINE_NOERR(isr) \
+ ENTRY_NP(tr_/**/isr); \
+ push $0; \
+ DBG_INTERRUPT_TRAMPOLINE_NOERR; \
+ jmp isr; \
+ SET_SIZE(tr_/**/isr)
+
+
+ MK_INTR_TRAMPOLINE_NOERR(div0trap)
+ MK_DBG_INTR_TRAMPOLINE_NOERR(dbgtrap)
+ MK_DBG_INTR_TRAMPOLINE_NOERR(brktrap)
+ MK_INTR_TRAMPOLINE_NOERR(ovflotrap)
+ MK_INTR_TRAMPOLINE_NOERR(boundstrap)
+ MK_INTR_TRAMPOLINE_NOERR(invoptrap)
+ MK_INTR_TRAMPOLINE_NOERR(ndptrap)
+ MK_INTR_TRAMPOLINE(invtsstrap)
+ MK_INTR_TRAMPOLINE(segnptrap)
+ MK_DBG_INTR_TRAMPOLINE(stktrap)
+ MK_DBG_INTR_TRAMPOLINE(gptrap)
+ MK_DBG_INTR_TRAMPOLINE(pftrap)
+ MK_INTR_TRAMPOLINE_NOERR(resvtrap)
+ MK_INTR_TRAMPOLINE_NOERR(ndperr)
+ MK_INTR_TRAMPOLINE(achktrap)
+ MK_INTR_TRAMPOLINE_NOERR(xmtrap)
+ MK_INTR_TRAMPOLINE_NOERR(invaltrap)
+ MK_INTR_TRAMPOLINE_NOERR(fasttrap)
+ MK_INTR_TRAMPOLINE_NOERR(dtrace_ret)
+ MK_INTR_TRAMPOLINE_NOERR(brand_sys_int80)
+ MK_INTR_TRAMPOLINE_NOERR(sys_int80)
+
+ /*
+ * These are special because they can interrupt other traps, and
+ * each other. We don't need to pivot their stacks, because they have
+ * dedicated IST stack space, but we need to change %cr3.
+ */
+ ENTRY_NP(tr_nmiint)
+ pushq %r13
+ mov kpti_safe_cr3, %r13
+ mov %r13, %cr3
+ popq %r13
+ jmp nmiint
+ SET_SIZE(tr_nmiint)
+
+#if !defined(__xpv)
+ ENTRY_NP(tr_syserrtrap)
+ /*
+ * If we got here we should always have a zero error code pushed.
+ * The INT $0x8 instr doesn't seem to push one, though, which we use
+ * as an emergency panic in the other trampolines. So adjust things
+ * here.
+ */
+ cmpq $0, (%rsp)
+ je 1f
+ pushq $0
+1:
+ pushq %r13
+ mov kpti_safe_cr3, %r13
+ mov %r13, %cr3
+ popq %r13
+ jmp syserrtrap
+ SET_SIZE(tr_syserrtrap)
+#endif
+
+ ENTRY_NP(tr_mcetrap)
+ pushq %r13
+ mov kpti_safe_cr3, %r13
+ mov %r13, %cr3
+ popq %r13
+ jmp mcetrap
+ SET_SIZE(tr_mcetrap)
+
+ /*
+ * Interrupts start at 32
+ */
+#define MKIVCT(n) \
+ ENTRY_NP(tr_ivct/**/n) \
+ push $0; \
+ INTERRUPT_TRAMPOLINE; \
+ push $n - 0x20; \
+ jmp cmnint; \
+ SET_SIZE(tr_ivct/**/n)
+
+ MKIVCT(32); MKIVCT(33); MKIVCT(34); MKIVCT(35);
+ MKIVCT(36); MKIVCT(37); MKIVCT(38); MKIVCT(39);
+ MKIVCT(40); MKIVCT(41); MKIVCT(42); MKIVCT(43);
+ MKIVCT(44); MKIVCT(45); MKIVCT(46); MKIVCT(47);
+ MKIVCT(48); MKIVCT(49); MKIVCT(50); MKIVCT(51);
+ MKIVCT(52); MKIVCT(53); MKIVCT(54); MKIVCT(55);
+ MKIVCT(56); MKIVCT(57); MKIVCT(58); MKIVCT(59);
+ MKIVCT(60); MKIVCT(61); MKIVCT(62); MKIVCT(63);
+ MKIVCT(64); MKIVCT(65); MKIVCT(66); MKIVCT(67);
+ MKIVCT(68); MKIVCT(69); MKIVCT(70); MKIVCT(71);
+ MKIVCT(72); MKIVCT(73); MKIVCT(74); MKIVCT(75);
+ MKIVCT(76); MKIVCT(77); MKIVCT(78); MKIVCT(79);
+ MKIVCT(80); MKIVCT(81); MKIVCT(82); MKIVCT(83);
+ MKIVCT(84); MKIVCT(85); MKIVCT(86); MKIVCT(87);
+ MKIVCT(88); MKIVCT(89); MKIVCT(90); MKIVCT(91);
+ MKIVCT(92); MKIVCT(93); MKIVCT(94); MKIVCT(95);
+ MKIVCT(96); MKIVCT(97); MKIVCT(98); MKIVCT(99);
+ MKIVCT(100); MKIVCT(101); MKIVCT(102); MKIVCT(103);
+ MKIVCT(104); MKIVCT(105); MKIVCT(106); MKIVCT(107);
+ MKIVCT(108); MKIVCT(109); MKIVCT(110); MKIVCT(111);
+ MKIVCT(112); MKIVCT(113); MKIVCT(114); MKIVCT(115);
+ MKIVCT(116); MKIVCT(117); MKIVCT(118); MKIVCT(119);
+ MKIVCT(120); MKIVCT(121); MKIVCT(122); MKIVCT(123);
+ MKIVCT(124); MKIVCT(125); MKIVCT(126); MKIVCT(127);
+ MKIVCT(128); MKIVCT(129); MKIVCT(130); MKIVCT(131);
+ MKIVCT(132); MKIVCT(133); MKIVCT(134); MKIVCT(135);
+ MKIVCT(136); MKIVCT(137); MKIVCT(138); MKIVCT(139);
+ MKIVCT(140); MKIVCT(141); MKIVCT(142); MKIVCT(143);
+ MKIVCT(144); MKIVCT(145); MKIVCT(146); MKIVCT(147);
+ MKIVCT(148); MKIVCT(149); MKIVCT(150); MKIVCT(151);
+ MKIVCT(152); MKIVCT(153); MKIVCT(154); MKIVCT(155);
+ MKIVCT(156); MKIVCT(157); MKIVCT(158); MKIVCT(159);
+ MKIVCT(160); MKIVCT(161); MKIVCT(162); MKIVCT(163);
+ MKIVCT(164); MKIVCT(165); MKIVCT(166); MKIVCT(167);
+ MKIVCT(168); MKIVCT(169); MKIVCT(170); MKIVCT(171);
+ MKIVCT(172); MKIVCT(173); MKIVCT(174); MKIVCT(175);
+ MKIVCT(176); MKIVCT(177); MKIVCT(178); MKIVCT(179);
+ MKIVCT(180); MKIVCT(181); MKIVCT(182); MKIVCT(183);
+ MKIVCT(184); MKIVCT(185); MKIVCT(186); MKIVCT(187);
+ MKIVCT(188); MKIVCT(189); MKIVCT(190); MKIVCT(191);
+ MKIVCT(192); MKIVCT(193); MKIVCT(194); MKIVCT(195);
+ MKIVCT(196); MKIVCT(197); MKIVCT(198); MKIVCT(199);
+ MKIVCT(200); MKIVCT(201); MKIVCT(202); MKIVCT(203);
+ MKIVCT(204); MKIVCT(205); MKIVCT(206); MKIVCT(207);
+ MKIVCT(208); MKIVCT(209); MKIVCT(210); MKIVCT(211);
+ MKIVCT(212); MKIVCT(213); MKIVCT(214); MKIVCT(215);
+ MKIVCT(216); MKIVCT(217); MKIVCT(218); MKIVCT(219);
+ MKIVCT(220); MKIVCT(221); MKIVCT(222); MKIVCT(223);
+ MKIVCT(224); MKIVCT(225); MKIVCT(226); MKIVCT(227);
+ MKIVCT(228); MKIVCT(229); MKIVCT(230); MKIVCT(231);
+ MKIVCT(232); MKIVCT(233); MKIVCT(234); MKIVCT(235);
+ MKIVCT(236); MKIVCT(237); MKIVCT(238); MKIVCT(239);
+ MKIVCT(240); MKIVCT(241); MKIVCT(242); MKIVCT(243);
+ MKIVCT(244); MKIVCT(245); MKIVCT(246); MKIVCT(247);
+ MKIVCT(248); MKIVCT(249); MKIVCT(250); MKIVCT(251);
+ MKIVCT(252); MKIVCT(253); MKIVCT(254); MKIVCT(255);
+
+.align MMU_PAGESIZE
+.global kpti_tramp_end
+kpti_tramp_end:
+ nop
+
+#endif /* __lint */
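
[Editor's illustration, not part of the commit] The heart of INTERRUPT_TRAMPOLINE above is deciding where to land: an interrupt taken from kernel code already running on a kernel stack stays on that stack and leaves %cr3 alone, while anything else loads the kernel %cr3 (if one has been set up yet) and pivots to the top of the current kthread's stack. A tiny user-space model of just that branch structure, with invented selector and address values:

/*
 * Editor's model of the trampoline's stack/%cr3 decision; only the
 * branch structure mirrors the assembly, the constants are made up.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define	KCS_SEL		0x30			/* illustrative kernel %cs */
#define	KPTI_KBASE	0xfffffe0000000000UL	/* illustrative kernelbase */

struct model_frame {
	uint64_t cs;	/* kf_cs: %cs pushed by the CPU */
	uint64_t rsp;	/* kf_rsp: %rsp pushed by the CPU */
};

/* Returns true when the trampoline must switch %cr3 and pivot stacks. */
static bool
needs_pivot(const struct model_frame *f)
{
	/* Interrupted kernel code already on a kernel stack: stay put. */
	if (f->cs == KCS_SEL && f->rsp >= KPTI_KBASE)
		return (false);
	/* Came from userland (or with a suspicious %rsp): pivot. */
	return (true);
}

int
main(void)
{
	struct model_frame user = { .cs = 0x2b, .rsp = 0x00007fffff000000 };
	struct model_frame kern = { .cs = KCS_SEL, .rsp = 0xfffffe00dead0000 };

	printf("user frame pivots:   %d\n", needs_pivot(&user));	/* 1 */
	printf("kernel frame pivots: %d\n", needs_pivot(&kern));	/* 0 */
	return (0);
}

DBG_INTERRUPT_TRAMPOLINE layers two further checks on top of this (a re-entered kpti_frame and an interrupted trampoline), which is why #DB, #BP, #GP, #PF and #SS use the longer macro.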
diff --git a/usr/src/uts/i86pc/ml/locore.s b/usr/src/uts/i86pc/ml/locore.s
index 042818844d..4626dd1492 100644
--- a/usr/src/uts/i86pc/ml/locore.s
+++ b/usr/src/uts/i86pc/ml/locore.s
@@ -23,7 +23,7 @@
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
- * Copyright (c) 2016, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018 Joyent, Inc.
*/
/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
@@ -1186,7 +1186,7 @@ cmntrap()
addq %rax, %r12
movq %r12, REGOFF_RIP(%rbp)
INTR_POP
- IRET
+ jmp tr_iret_auto
/*NOTREACHED*/
3:
leaq dtrace_badflags(%rip), %rdi
@@ -1599,7 +1599,7 @@ _no_pending_updates:
*/
ALTENTRY(sys_rtt_syscall32)
USER32_POP
- IRET
+ jmp tr_iret_user
/*NOTREACHED*/
ALTENTRY(sys_rtt_syscall)
@@ -1608,7 +1608,7 @@ _no_pending_updates:
*/
USER_POP
ALTENTRY(nopop_sys_rtt_syscall)
- IRET
+ jmp tr_iret_user
/*NOTREACHED*/
SET_SIZE(nopop_sys_rtt_syscall)
@@ -1623,7 +1623,7 @@ _no_pending_updates:
* Restore regs before doing iretq to kernel mode
*/
INTR_POP
- IRET
+ jmp tr_iret_kernel
.globl _sys_rtt_end
_sys_rtt_end:
/*NOTREACHED*/
diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in
index 406d389000..a1f1e935aa 100644
--- a/usr/src/uts/i86pc/ml/offsets.in
+++ b/usr/src/uts/i86pc/ml/offsets.in
@@ -1,7 +1,7 @@
\
\ Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
\ Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
-\ Copyright 2016 Joyent, Inc.
+\ Copyright 2018 Joyent, Inc.
\
\ CDDL HEADER START
\
@@ -233,6 +233,44 @@ cpu
cpu_m.mcpu_vcpu_info CPU_VCPU_INFO
#endif
+cpu
+ cpu_m.mcpu_kpti.kf_kernel_cr3 CPU_KPTI_KCR3
+ cpu_m.mcpu_kpti.kf_user_cr3 CPU_KPTI_UCR3
+ cpu_m.mcpu_kpti.kf_tr_rsp CPU_KPTI_TR_RSP
+ cpu_m.mcpu_kpti.kf_tr_cr3 CPU_KPTI_TR_CR3
+ cpu_m.mcpu_kpti.kf_r13 CPU_KPTI_R13
+ cpu_m.mcpu_kpti.kf_r14 CPU_KPTI_R14
+ cpu_m.mcpu_kpti.kf_tr_ret_rsp CPU_KPTI_RET_RSP
+
+ cpu_m.mcpu_kpti.kf_ss CPU_KPTI_SS
+ cpu_m.mcpu_kpti.kf_rsp CPU_KPTI_RSP
+ cpu_m.mcpu_kpti.kf_rflags CPU_KPTI_RFLAGS
+ cpu_m.mcpu_kpti.kf_cs CPU_KPTI_CS
+ cpu_m.mcpu_kpti.kf_rip CPU_KPTI_RIP
+ cpu_m.mcpu_kpti.kf_err CPU_KPTI_ERR
+
+ cpu_m.mcpu_pad2 CPU_KPTI_START
+ cpu_m.mcpu_pad3 CPU_KPTI_END
+
+kpti_frame
+ kf_r14 KPTI_R14
+ kf_r13 KPTI_R13
+ kf_err KPTI_ERR
+ kf_rip KPTI_RIP
+ kf_cs KPTI_CS
+ kf_rflags KPTI_RFLAGS
+ kf_rsp KPTI_RSP
+ kf_ss KPTI_SS
+
+ kf_tr_rsp KPTI_TOP
+
+ kf_kernel_cr3 KPTI_KCR3
+ kf_user_cr3 KPTI_UCR3
+ kf_tr_ret_rsp KPTI_RET_RSP
+ kf_tr_cr3 KPTI_TR_CR3
+
+ kf_tr_flag KPTI_FLAG
+
standard_pic
c_curmask
c_iplmask
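
[Editor's illustration, not part of the commit] The CPU_KPTI_* and KPTI_* offsets added above are generated from struct kpti_frame, which this commit introduces in usr/src/uts/i86pc/sys/machcpuvar.h (that file's diff is not shown in this section). A rough reconstruction from the field names listed here — the field order follows the listing, and the types, padding, and any guard fields are assumptions rather than the authoritative layout:

/*
 * Editor's sketch of the shape implied by the kpti_frame offsets above.
 * See machcpuvar.h in this commit for the real definition.
 */
#include <stdint.h>

struct kpti_frame_sketch {
	uint64_t kf_r14;	/* scratch spill slots used by the trampolines */
	uint64_t kf_r13;
	uint64_t kf_err;	/* interrupt frame pushed by the CPU/trampoline */
	uint64_t kf_rip;
	uint64_t kf_cs;
	uint64_t kf_rflags;
	uint64_t kf_rsp;
	uint64_t kf_ss;
	uint64_t kf_tr_rsp;	/* KPTI_TOP: the IST entry points here; the CPU
				   pushes its frame downward from this slot */
	uint64_t kf_kernel_cr3;	/* kernel-hat %cr3 loaded on the way in */
	uint64_t kf_user_cr3;	/* user-hat %cr3 restored on the way out */
	uint64_t kf_tr_ret_rsp;	/* scratch stack used to rebuild the iret frame */
	uint64_t kf_tr_cr3;	/* %cr3 observed on trampoline entry */
	uint64_t kf_tr_flag;	/* re-entry flag checked by the #DB/#BP variant */
};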
diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
index bc9351cada..98f8c8f8da 100644
--- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
+++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -491,6 +491,20 @@ noprod_sys_syscall:
movq %rbx, REGOFF_GS(%rsp)
/*
+ * If we're trying to use TRAPTRACE though, I take that back: we're
+ * probably debugging some problem in the SWAPGS logic and want to know
+ * what the incoming gsbase was.
+ *
+ * Since we already did SWAPGS, record the KGSBASE.
+ */
+#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
+ movl $MSR_AMD_KGSBASE, %ecx
+ rdmsr
+ movl %eax, REGOFF_GSBASE(%rsp)
+ movl %edx, REGOFF_GSBASE+4(%rsp)
+#endif
+
+ /*
* Machine state saved in the regs structure on the stack
* First six args in %rdi, %rsi, %rdx, %rcx, %r8, %r9
* %eax is the syscall number
@@ -705,8 +719,7 @@ _syscall_after_brand:
SYSRETQ
#else
ALTENTRY(nopop_sys_syscall_swapgs_sysretq)
- SWAPGS /* user gsbase */
- SYSRETQ
+ jmp tr_sysretq
#endif
/*NOTREACHED*/
SET_SIZE(nopop_sys_syscall_swapgs_sysretq)
@@ -807,6 +820,20 @@ _syscall32_save:
movq %rbx, REGOFF_GS(%rsp)
/*
+ * If we're trying to use TRAPTRACE though, I take that back: we're
+ * probably debugging some problem in the SWAPGS logic and want to know
+ * what the incoming gsbase was.
+ *
+ * Since we already did SWAPGS, record the KGSBASE.
+ */
+#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
+ movl $MSR_AMD_KGSBASE, %ecx
+ rdmsr
+ movl %eax, REGOFF_GSBASE(%rsp)
+ movl %edx, REGOFF_GSBASE+4(%rsp)
+#endif
+
+ /*
* Application state saved in the regs structure on the stack
* %eax is the syscall number
* %rsp is the thread's stack, %r15 is curthread
@@ -964,8 +991,7 @@ _syscall32_after_brand:
ASSERT_UPCALL_MASK_IS_SET
ALTENTRY(nopop_sys_syscall32_swapgs_sysretl)
- SWAPGS /* user gsbase */
- SYSRETL
+ jmp tr_sysretl
SET_SIZE(nopop_sys_syscall32_swapgs_sysretl)
/*NOTREACHED*/
@@ -1010,23 +1036,22 @@ _full_syscall_postsys32:
* this call, as %edx is used by the sysexit instruction.
*
* One final complication in this routine is its interaction with
- * single-stepping in a debugger. For most of the system call mechanisms,
- * the CPU automatically clears the single-step flag before we enter the
- * kernel. The sysenter mechanism does not clear the flag, so a user
- * single-stepping through a libc routine may suddenly find themself
- * single-stepping through the kernel. To detect this, kmdb compares the
- * trap %pc to the [brand_]sys_enter addresses on each single-step trap.
- * If it finds that we have single-stepped to a sysenter entry point, it
- * explicitly clears the flag and executes the sys_sysenter routine.
+ * single-stepping in a debugger. For most of the system call mechanisms, the
+ * CPU automatically clears the single-step flag before we enter the kernel.
+ * The sysenter mechanism does not clear the flag, so a user single-stepping
+ * through a libc routine may suddenly find themself single-stepping through the
+ * kernel. To detect this, kmdb and trap() both compare the trap %pc to the
+ * [brand_]sys_enter addresses on each single-step trap. If it finds that we
+ * have single-stepped to a sysenter entry point, it explicitly clears the flag
+ * and executes the sys_sysenter routine.
*
- * One final complication in this final complication is the fact that we
- * have two different entry points for sysenter: brand_sys_sysenter and
- * sys_sysenter. If we enter at brand_sys_sysenter and start single-stepping
- * through the kernel with kmdb, we will eventually hit the instruction at
- * sys_sysenter. kmdb cannot distinguish between that valid single-step
- * and the undesirable one mentioned above. To avoid this situation, we
- * simply add a jump over the instruction at sys_sysenter to make it
- * impossible to single-step to it.
+ * One final complication in this final complication is the fact that we have
+ * two different entry points for sysenter: brand_sys_sysenter and sys_sysenter.
+ * If we enter at brand_sys_sysenter and start single-stepping through the
+ * kernel with kmdb, we will eventually hit the instruction at sys_sysenter.
+ * kmdb cannot distinguish between that valid single-step and the undesirable
+ * one mentioned above. To avoid this situation, we simply add a jump over the
+ * instruction at sys_sysenter to make it impossible to single-step to it.
*/
#if defined(__lint)
@@ -1039,6 +1064,7 @@ sys_sysenter()
ENTRY_NP(brand_sys_sysenter)
SWAPGS /* kernel gsbase */
ALTENTRY(_brand_sys_sysenter_post_swapgs)
+
BRAND_CALLBACK(BRAND_CB_SYSENTER, BRAND_URET_FROM_REG(%rdx))
/*
* Jump over sys_sysenter to allow single-stepping as described
@@ -1048,13 +1074,17 @@ sys_sysenter()
ALTENTRY(sys_sysenter)
SWAPGS /* kernel gsbase */
-
ALTENTRY(_sys_sysenter_post_swapgs)
+
movq %gs:CPU_THREAD, %r15
movl $U32CS_SEL, REGOFF_CS(%rsp)
movl %ecx, REGOFF_RSP(%rsp) /* wrapper: %esp -> %ecx */
movl %edx, REGOFF_RIP(%rsp) /* wrapper: %eip -> %edx */
+ /*
+ * NOTE: none of the instructions that run before we get here should
+ * clobber bits in (R)FLAGS! This includes the kpti trampoline.
+ */
pushfq
popq %r10
movl $UDS_SEL, REGOFF_SS(%rsp)
@@ -1096,6 +1126,20 @@ sys_sysenter()
movq %rbx, REGOFF_GS(%rsp)
/*
+ * If we're trying to use TRAPTRACE though, I take that back: we're
+ * probably debugging some problem in the SWAPGS logic and want to know
+ * what the incoming gsbase was.
+ *
+ * Since we already did SWAPGS, record the KGSBASE.
+ */
+#if defined(DEBUG) && defined(TRAPTRACE) && !defined(__xpv)
+ movl $MSR_AMD_KGSBASE, %ecx
+ rdmsr
+ movl %eax, REGOFF_GSBASE(%rsp)
+ movl %edx, REGOFF_GSBASE+4(%rsp)
+#endif
+
+ /*
* Application state saved in the regs structure on the stack
* %eax is the syscall number
* %rsp is the thread's stack, %r15 is curthread
@@ -1198,6 +1242,8 @@ sys_sysenter()
* If we were, and we ended up on another cpu, or another
* lwp got int ahead of us, it could change the segment
* registers without us noticing before we return to userland.
+ *
+ * This cli is undone in the tr_sysexit trampoline code.
*/
cli
CHECK_POSTSYS_NE(%r15, %r14, %ebx)
@@ -1231,16 +1277,14 @@ sys_sysenter()
popfq
movl REGOFF_RSP(%rsp), %ecx /* sysexit: %ecx -> %esp */
ALTENTRY(sys_sysenter_swapgs_sysexit)
- swapgs
- sti
- sysexit
+ jmp tr_sysexit
SET_SIZE(sys_sysenter_swapgs_sysexit)
SET_SIZE(sys_sysenter)
SET_SIZE(_sys_sysenter_post_swapgs)
SET_SIZE(brand_sys_sysenter)
#endif /* __lint */
-
+
#if defined(__lint)
/*
* System call via an int80. This entry point is only used by the Linux
@@ -1352,10 +1396,13 @@ nopop_syscall_int:
* or we could end up breaking branded zone support. See the usage of
* this label in lx_brand_int80_callback and sn1_brand_int91_callback
* for examples.
+ *
+ * We want to swapgs to maintain the invariant that all entries into
+ * tr_iret_user are done on the user gsbase.
*/
- ALTENTRY(sys_sysint_swapgs_iret)
- SWAPGS /* user gsbase */
- IRET
+ ALTENTRY(sys_sysint_swapgs_iret)
+ SWAPGS
+ jmp tr_iret_user
/*NOTREACHED*/
SET_SIZE(sys_sysint_swapgs_iret)
SET_SIZE(sys_syscall_int)
diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c
index 2569812c47..36ec2e4945 100644
--- a/usr/src/uts/i86pc/os/intr.c
+++ b/usr/src/uts/i86pc/os/intr.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserverd.
+ * Copyright (c) 2018 Joyent, Inc. All rights reserved.
*/
/*
@@ -471,6 +471,21 @@
#include <sys/hypervisor.h>
#endif
+#if defined(__amd64) && !defined(__xpv)
+/* If this fails, then the padding numbers in machcpuvar.h are wrong. */
+CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad))
+ < MMU_PAGESIZE);
+CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti))
+ >= MMU_PAGESIZE);
+CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti_dbg))
+ < 2 * MMU_PAGESIZE);
+CTASSERT((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_pad2))
+ < 2 * MMU_PAGESIZE);
+CTASSERT(((sizeof (struct kpti_frame)) & 0xF) == 0);
+CTASSERT(((offsetof(cpu_t, cpu_m) + offsetof(struct machcpu, mcpu_kpti_dbg))
+ & 0xF) == 0);
+CTASSERT((offsetof(struct kpti_frame, kf_tr_rsp) & 0xF) == 0);
+#endif
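
The CTASSERTs above are compile-time checks: they verify, at zero runtime
cost, that the hand-maintained padding in machcpuvar.h really does keep the
KPTI frames on their own page and aligned as the trampolines expect. As a
rough illustration of how such a macro can be built (a minimal sketch with
hypothetical MY_CTASSERT names, not the actual <sys/debug.h> definition):

	#include <stddef.h>

	/* A negative array size makes the typedef, and thus the build, fail. */
	#define	MY_CTASSERT(x)		_MY_CTASSERT(x, __LINE__)
	#define	_MY_CTASSERT(x, y)	__MY_CTASSERT(x, y)
	#define	__MY_CTASSERT(x, y)	\
		typedef char __my_ctassert_ ## y[(x) ? 1 : -1]

	struct toy_frame {
		unsigned long	tf_redzone;
		unsigned long	tf_rsp;
	};

	/* Both checks are evaluated by the compiler; nothing runs at boot. */
	MY_CTASSERT((sizeof (struct toy_frame) & 0x7) == 0);
	MY_CTASSERT(offsetof(struct toy_frame, tf_rsp) == 8);

	int
	main(void)
	{
		return (0);
	}

If either expression were false, the array size would be -1 and compilation
would stop, which is the same effect the checks above rely on.
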
#if defined(__xpv) && defined(DEBUG)
@@ -1473,6 +1488,21 @@ loop:
return (1);
}
+#if !defined(__xpv)
+ /*
+ * Assert that we're not trying to return into the syscall return
+ * trampolines. Things will go baaaaad if we try to do that.
+ *
+ * Note that none of these run with interrupts on, so this should
+ * never happen (even in the sysexit case the STI doesn't take effect
+ * until after sysexit finishes).
+ */
+ extern void tr_sysc_ret_start();
+ extern void tr_sysc_ret_end();
+ ASSERT(!(rp->r_pc >= (uintptr_t)tr_sysc_ret_start &&
+ rp->r_pc <= (uintptr_t)tr_sysc_ret_end));
+#endif
+
/*
* Here if we are returning to supervisor mode.
* Check for a kernel preemption request.
diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c
index 13ccfde671..19f0f5f676 100644
--- a/usr/src/uts/i86pc/os/mlsetup.c
+++ b/usr/src/uts/i86pc/os/mlsetup.c
@@ -23,7 +23,7 @@
*
* Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
* Copyright (c) 2010, Intel Corporation.
@@ -148,6 +148,20 @@ mlsetup(struct regs *rp)
else
cpuid_feature_edx_exclude = (uint32_t)prop_value;
+#if defined(__amd64) && !defined(__xpv)
+ /*
+ * Check to see if KPTI has been explicitly enabled or disabled.
+ * We have to check this before init_desctbls().
+ */
+ if (bootprop_getval("kpti", &prop_value) != 0) {
+ kpti_enable = 1;
+ } else {
+ kpti_enable = (uint64_t)(prop_value == 1);
+ prom_printf("unix: forcing kpti to %s due to boot argument\n",
+ (kpti_enable == 1) ? "ON" : "OFF");
+ }
+#endif
+
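
For reference, this means KPTI defaults to on and can only be toggled from the
boot arguments; the property is consumed here, long before /etc/system is
read, so it cannot be set there. Assuming the usual -B boot-property syntax,
forcing it off would look something like:

	-B kpti=0

while passing kpti=1 (or omitting the property entirely) leaves KPTI enabled.
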
/*
* Initialize idt0, gdt0, ldt0_default, ktss0 and dftss.
*/
diff --git a/usr/src/uts/i86pc/os/mp_pc.c b/usr/src/uts/i86pc/os/mp_pc.c
index 105b1e93dc..4e12703395 100644
--- a/usr/src/uts/i86pc/os/mp_pc.c
+++ b/usr/src/uts/i86pc/os/mp_pc.c
@@ -26,7 +26,7 @@
* All rights reserved.
*/
/*
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -174,13 +174,15 @@ mach_cpucontext_alloc_tables(struct cpu *cp)
{
tss_t *ntss;
struct cpu_tables *ct;
+ size_t ctsize;
/*
* Allocate space for stack, tss, gdt and idt. We round the size
* allotted for cpu_tables up, so that the TSS is on a unique page.
* This is more efficient when running in virtual machines.
*/
- ct = kmem_zalloc(P2ROUNDUP(sizeof (*ct), PAGESIZE), KM_SLEEP);
+ ctsize = P2ROUNDUP(sizeof (*ct), PAGESIZE);
+ ct = kmem_zalloc(ctsize, KM_SLEEP);
if ((uintptr_t)ct & PAGEOFFSET)
panic("mach_cpucontext_alloc_tables: cpu%d misaligned tables",
cp->cpu_id);
@@ -188,16 +190,62 @@ mach_cpucontext_alloc_tables(struct cpu *cp)
ntss = cp->cpu_tss = &ct->ct_tss;
#if defined(__amd64)
+ uintptr_t va;
+ size_t len;
/*
* #DF (double fault).
*/
- ntss->tss_ist1 = (uint64_t)&ct->ct_stack[sizeof (ct->ct_stack)];
+ ntss->tss_ist1 = (uintptr_t)&ct->ct_stack1[sizeof (ct->ct_stack1)];
+
+ /*
+	 * #NMI (non-maskable interrupt)
+ */
+ ntss->tss_ist2 = (uintptr_t)&ct->ct_stack2[sizeof (ct->ct_stack2)];
+
+ /*
+ * #MC (machine check exception / hardware error)
+ */
+ ntss->tss_ist3 = (uintptr_t)&ct->ct_stack3[sizeof (ct->ct_stack3)];
+
+ /*
+ * #DB, #BP debug interrupts and KDI/kmdb
+ */
+ ntss->tss_ist4 = (uintptr_t)&cp->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
+
+ if (kpti_enable == 1) {
+ /*
+ * #GP, #PF, #SS fault interrupts
+ */
+ ntss->tss_ist5 = (uintptr_t)&cp->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
+
+ /*
+ * Used by all other interrupts
+ */
+ ntss->tss_ist6 = (uint64_t)&cp->cpu_m.mcpu_kpti.kf_tr_rsp;
+
+ /*
+ * On AMD64 we need to make sure that all of the pages of the
+		 * struct cpu_tables are punched through into the CPU's user
+		 * page table for KPTI.
+ *
+ * The final page will always be the TSS, so treat that
+ * separately.
+ */
+ for (va = (uintptr_t)ct, len = ctsize - MMU_PAGESIZE;
+ len >= MMU_PAGESIZE;
+ len -= MMU_PAGESIZE, va += MMU_PAGESIZE) {
+ /* The doublefault stack must be RW */
+ hati_cpu_punchin(cp, va, PROT_READ | PROT_WRITE);
+ }
+ ASSERT3U((uintptr_t)ntss, ==, va);
+ hati_cpu_punchin(cp, (uintptr_t)ntss, PROT_READ);
+ }
#elif defined(__i386)
ntss->tss_esp0 = ntss->tss_esp1 = ntss->tss_esp2 = ntss->tss_esp =
- (uint32_t)&ct->ct_stack[sizeof (ct->ct_stack)];
+ (uint32_t)&ct->ct_stack1[sizeof (ct->ct_stack1)];
ntss->tss_ss0 = ntss->tss_ss1 = ntss->tss_ss2 = ntss->tss_ss = KDS_SEL;
diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c
index 0fadfb7993..a807be6a40 100644
--- a/usr/src/uts/i86pc/os/mp_startup.c
+++ b/usr/src/uts/i86pc/os/mp_startup.c
@@ -27,7 +27,7 @@
* All rights reserved.
*/
/*
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
*/
@@ -80,10 +80,10 @@
#include <sys/cpu_module.h>
#include <sys/ontrap.h>
-struct cpu cpus[1]; /* CPU data */
-struct cpu *cpu[NCPU] = {&cpus[0]}; /* pointers to all CPUs */
-struct cpu *cpu_free_list; /* list for released CPUs */
-cpu_core_t cpu_core[NCPU]; /* cpu_core structures */
+struct cpu cpus[1] __aligned(MMU_PAGESIZE);
+struct cpu *cpu[NCPU] = {&cpus[0]};
+struct cpu *cpu_free_list;
+cpu_core_t cpu_core[NCPU];
#define cpu_next_free cpu_prev
@@ -166,25 +166,23 @@ init_cpu_info(struct cpu *cp)
void
init_cpu_syscall(struct cpu *cp)
{
- uint64_t flags;
-
kpreempt_disable();
-#if defined(__amd64)
if (is_x86_feature(x86_featureset, X86FSET_MSR) &&
is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
uint64_t flags;
-#if !defined(__lint)
+#if !defined(__xpv)
/*
* The syscall instruction imposes a certain ordering on
* segment selectors, so we double-check that ordering
* here.
*/
- ASSERT(KDS_SEL == KCS_SEL + 8);
- ASSERT(UDS_SEL == U32CS_SEL + 8);
- ASSERT(UCS_SEL == U32CS_SEL + 16);
+ CTASSERT(KDS_SEL == KCS_SEL + 8);
+ CTASSERT(UDS_SEL == U32CS_SEL + 8);
+ CTASSERT(UCS_SEL == U32CS_SEL + 16);
#endif
+
/*
* Turn syscall/sysret extensions on.
*/
@@ -195,8 +193,17 @@ init_cpu_syscall(struct cpu *cp)
*/
wrmsr(MSR_AMD_STAR,
((uint64_t)(U32CS_SEL << 16 | KCS_SEL)) << 32);
- wrmsr(MSR_AMD_LSTAR, (uint64_t)(uintptr_t)sys_syscall);
- wrmsr(MSR_AMD_CSTAR, (uint64_t)(uintptr_t)sys_syscall32);
+ if (kpti_enable == 1) {
+ wrmsr(MSR_AMD_LSTAR,
+ (uint64_t)(uintptr_t)tr_sys_syscall);
+ wrmsr(MSR_AMD_CSTAR,
+ (uint64_t)(uintptr_t)tr_sys_syscall32);
+ } else {
+ wrmsr(MSR_AMD_LSTAR,
+ (uint64_t)(uintptr_t)sys_syscall);
+ wrmsr(MSR_AMD_CSTAR,
+ (uint64_t)(uintptr_t)sys_syscall32);
+ }
/*
* This list of flags is masked off the incoming
@@ -207,19 +214,15 @@ init_cpu_syscall(struct cpu *cp)
flags |= PS_ACHK;
wrmsr(MSR_AMD_SFMASK, flags);
}
-#endif
/*
- * On 32-bit kernels, we use sysenter/sysexit because it's too
- * hard to use syscall/sysret, and it is more portable anyway.
- *
* On 64-bit kernels on Nocona machines, the 32-bit syscall
* variant isn't available to 32-bit applications, but sysenter is.
*/
if (is_x86_feature(x86_featureset, X86FSET_MSR) &&
is_x86_feature(x86_featureset, X86FSET_SEP)) {
-#if !defined(__lint)
+#if !defined(__xpv)
/*
* The sysenter instruction imposes a certain ordering on
* segment selectors, so we double-check that ordering
@@ -227,13 +230,10 @@ init_cpu_syscall(struct cpu *cp)
* Intel Architecture Software Developer's Manual Volume 2:
* Instruction Set Reference"
*/
- ASSERT(KDS_SEL == KCS_SEL + 8);
+ CTASSERT(KDS_SEL == KCS_SEL + 8);
- ASSERT32(UCS_SEL == ((KCS_SEL + 16) | 3));
- ASSERT32(UDS_SEL == UCS_SEL + 8);
-
- ASSERT64(U32CS_SEL == ((KCS_SEL + 16) | 3));
- ASSERT64(UDS_SEL == U32CS_SEL + 8);
+ CTASSERT(U32CS_SEL == ((KCS_SEL + 16) | 3));
+ CTASSERT(UDS_SEL == U32CS_SEL + 8);
#endif
cpu_sep_enable();
@@ -243,7 +243,14 @@ init_cpu_syscall(struct cpu *cp)
* via a context handler.
*/
wrmsr(MSR_INTC_SEP_ESP, 0);
- wrmsr(MSR_INTC_SEP_EIP, (uint64_t)(uintptr_t)sys_sysenter);
+
+ if (kpti_enable == 1) {
+ wrmsr(MSR_INTC_SEP_EIP,
+ (uint64_t)(uintptr_t)tr_sys_sysenter);
+ } else {
+ wrmsr(MSR_INTC_SEP_EIP,
+ (uint64_t)(uintptr_t)sys_sysenter);
+ }
}
kpreempt_enable();
@@ -420,20 +427,20 @@ mp_cpu_configure_common(int cpun, boolean_t boot)
#endif
/*
- * If we have more than one node, each cpu gets a copy of IDT
- * local to its node. If this is a Pentium box, we use cpu 0's
- * IDT. cpu 0's IDT has been made read-only to workaround the
- * cmpxchgl register bug
+ * Allocate pages for the CPU LDT.
+ */
+ cp->cpu_m.mcpu_ldt = kmem_zalloc(LDT_CPU_SIZE, KM_SLEEP);
+ cp->cpu_m.mcpu_ldt_len = 0;
+
+ /*
+	 * Allocate a per-CPU IDT and initialize it from the IDT of the
+	 * currently running CPU.
*/
- if (system_hardware.hd_nodes && x86_type != X86_TYPE_P5) {
#if !defined(__lint)
- ASSERT((sizeof (*CPU->cpu_idt) * NIDT) <= PAGESIZE);
+ ASSERT((sizeof (*CPU->cpu_idt) * NIDT) <= PAGESIZE);
#endif
- cp->cpu_idt = kmem_zalloc(PAGESIZE, KM_SLEEP);
- bcopy(CPU->cpu_idt, cp->cpu_idt, PAGESIZE);
- } else {
- cp->cpu_idt = CPU->cpu_idt;
- }
+ cp->cpu_idt = kmem_alloc(PAGESIZE, KM_SLEEP);
+ bcopy(CPU->cpu_idt, cp->cpu_idt, PAGESIZE);
/*
* alloc space for cpuid info
@@ -571,6 +578,10 @@ mp_cpu_unconfigure_common(struct cpu *cp, int error)
kmem_free(cp->cpu_idt, PAGESIZE);
cp->cpu_idt = NULL;
+ kmem_free(cp->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
+ cp->cpu_m.mcpu_ldt = NULL;
+ cp->cpu_m.mcpu_ldt_len = 0;
+
kmem_free(cp->cpu_gdt, PAGESIZE);
cp->cpu_gdt = NULL;
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index bfe8c2486b..5e23d2f486 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -23,7 +23,7 @@
* Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2012 DEY Storage Systems, Inc. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
- * Copyright (c) 2017 Joyent, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
* Copyright (c) 2015 by Delphix. All rights reserved.
*/
/*
@@ -446,8 +446,10 @@ static pgcnt_t kphysm_init(page_t *, pgcnt_t);
* 0xFFFFFFFF.FBC00000 |-----------------------|
* | Kernel Text |
* 0xFFFFFFFF.FB800000 |-----------------------|- KERNEL_TEXT
- * |--- GDT ---|- GDT page (GDT_VA)
* |--- debug info ---|- debug info (DEBUG_INFO_VA)
+ * |--- GDT ---|- GDT page (GDT_VA)
+ * |--- IDT ---|- IDT page (IDT_VA)
+ * |--- LDT ---|- LDT pages (LDT_VA)
* | |
* | Core heap | (used for loadable modules)
* 0xFFFFFFFF.C0000000 |-----------------------|- core_base / ekernelheap
@@ -959,6 +961,17 @@ kpm_init()
panic("segkpm_create segkpm");
rw_exit(&kas.a_lock);
+
+ kpm_enable = 1;
+
+ /*
+ * As the KPM was disabled while setting up the system, go back and fix
+ * CPU zero's access to its user page table. This is a bit gross, but
+ * we have a chicken and egg problem otherwise.
+ */
+ ASSERT(CPU->cpu_hat_info->hci_user_l3ptes == NULL);
+ CPU->cpu_hat_info->hci_user_l3ptes =
+ (x86pte_t *)hat_kpm_mapin_pfn(CPU->cpu_hat_info->hci_user_l3pfn);
}
/*
@@ -1422,6 +1435,9 @@ static void
startup_kmem(void)
{
extern void page_set_colorequiv_arr(void);
+#if !defined(__xpv)
+ extern uint64_t kpti_kbase;
+#endif
PRM_POINT("startup_kmem() starting...");
@@ -1484,6 +1500,9 @@ startup_kmem(void)
*(uintptr_t *)&_userlimit = kernelbase;
#if defined(__amd64)
*(uintptr_t *)&_userlimit -= KERNELBASE - USERLIMIT;
+#if !defined(__xpv)
+ kpti_kbase = kernelbase;
+#endif
#else
*(uintptr_t *)&_userlimit32 = _userlimit;
#endif
@@ -1491,6 +1510,9 @@ startup_kmem(void)
PRM_DEBUG(_userlimit);
PRM_DEBUG(_userlimit32);
+ /* We have to re-do this now that we've modified _userlimit. */
+ mmu_calc_user_slots();
+
layout_kernel_va();
#if defined(__i386)
@@ -2129,32 +2151,6 @@ startup_vm(void)
if (boothowto & RB_DEBUG)
kdi_dvec_memavail();
- /*
- * The following code installs a special page fault handler (#pf)
- * to work around a pentium bug.
- */
-#if !defined(__amd64) && !defined(__xpv)
- if (x86_type == X86_TYPE_P5) {
- desctbr_t idtr;
- gate_desc_t *newidt;
-
- if ((newidt = kmem_zalloc(MMU_PAGESIZE, KM_NOSLEEP)) == NULL)
- panic("failed to install pentium_pftrap");
-
- bcopy(idt0, newidt, NIDT * sizeof (*idt0));
- set_gatesegd(&newidt[T_PGFLT], &pentium_pftrap,
- KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
-
- (void) as_setprot(&kas, (caddr_t)newidt, MMU_PAGESIZE,
- PROT_READ | PROT_EXEC);
-
- CPU->cpu_idt = newidt;
- idtr.dtr_base = (uintptr_t)CPU->cpu_idt;
- idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
- wr_idtr(&idtr);
- }
-#endif /* !__amd64 */
-
#if !defined(__xpv)
/*
* Map page pfn=0 for drivers, such as kd, that need to pick up
@@ -2217,10 +2213,8 @@ startup_vm(void)
* kpm segment
*/
segmap_kpm = 0;
- if (kpm_desired) {
+ if (kpm_desired)
kpm_init();
- kpm_enable = 1;
- }
/*
* Now create segmap segment.
diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c
index 192f3dbd32..4b867bac0c 100644
--- a/usr/src/uts/i86pc/os/trap.c
+++ b/usr/src/uts/i86pc/os/trap.c
@@ -32,7 +32,7 @@
/* */
/*
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -481,7 +481,6 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
int watchcode;
int watchpage;
caddr_t vaddr;
- int singlestep_twiddle;
size_t sz;
int ta;
#ifdef __amd64
@@ -1103,58 +1102,35 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
case T_SGLSTP: /* single step/hw breakpoint exception */
- /* Now evaluate how we got here */
+#if !defined(__xpv)
+ /*
+ * We'd never normally get here, as kmdb handles its own single
+ * step traps. There is one nasty exception though, as
+ * described in more detail in sys_sysenter(). Note that
+ * checking for all four locations covers both the KPTI and the
+ * non-KPTI cases correctly: the former will never be found at
+ * (brand_)sys_sysenter, and vice versa.
+ */
if (lwp != NULL && (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP)) {
- /*
- * i386 single-steps even through lcalls which
- * change the privilege level. So we take a trap at
- * the first instruction in privileged mode.
- *
- * Set a flag to indicate that upon completion of
- * the system call, deal with the single-step trap.
- *
- * The same thing happens for sysenter, too.
- */
- singlestep_twiddle = 0;
- if (rp->r_pc == (uintptr_t)sys_sysenter ||
- rp->r_pc == (uintptr_t)brand_sys_sysenter) {
- singlestep_twiddle = 1;
-#if defined(__amd64)
- /*
- * Since we are already on the kernel's
- * %gs, on 64-bit systems the sysenter case
- * needs to adjust the pc to avoid
- * executing the swapgs instruction at the
- * top of the handler.
- */
- if (rp->r_pc == (uintptr_t)sys_sysenter)
- rp->r_pc = (uintptr_t)
- _sys_sysenter_post_swapgs;
- else
- rp->r_pc = (uintptr_t)
- _brand_sys_sysenter_post_swapgs;
-#endif
- }
-#if defined(__i386)
- else if (rp->r_pc == (uintptr_t)sys_call ||
- rp->r_pc == (uintptr_t)brand_sys_call) {
- singlestep_twiddle = 1;
- }
-#endif
- else {
- /* not on sysenter/syscall; uregs available */
- if (tudebug && tudebugbpt)
- showregs(type, rp, (caddr_t)0);
- }
- if (singlestep_twiddle) {
+ if (rp->r_pc == (greg_t)brand_sys_sysenter ||
+ rp->r_pc == (greg_t)sys_sysenter ||
+ rp->r_pc == (greg_t)tr_brand_sys_sysenter ||
+ rp->r_pc == (greg_t)tr_sys_sysenter) {
+
+ rp->r_pc += 0x3; /* sizeof (swapgs) */
+
rp->r_ps &= ~PS_T; /* turn off trace */
lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
ct->t_post_sys = 1;
aston(curthread);
goto cleanup;
+ } else {
+ if (tudebug && tudebugbpt)
+ showregs(type, rp, (caddr_t)0);
}
}
- /* XXX - needs review on debugger interface? */
+#endif /* !__xpv */
+
if (boothowto & RB_DEBUG)
debug_enter((char *)NULL);
else
@@ -1743,16 +1719,16 @@ showregs(uint_t type, struct regs *rp, caddr_t addr)
* this clause can be deleted when lint bug 4870403 is fixed
* (lint thinks that bit 32 is illegal in a %b format string)
*/
- printf("cr0: %x cr4: %b\n",
+ printf("cr0: %x cr4: %b\n",
(uint_t)getcr0(), (uint_t)getcr4(), FMT_CR4);
#else
- printf("cr0: %b cr4: %b\n",
+ printf("cr0: %b cr4: %b\n",
(uint_t)getcr0(), FMT_CR0, (uint_t)getcr4(), FMT_CR4);
#endif /* __lint */
- printf("cr2: %lx", getcr2());
+ printf("cr2: %lx ", getcr2());
#if !defined(__xpv)
- printf("cr3: %lx", getcr3());
+ printf("cr3: %lx ", getcr3());
#if defined(__amd64)
printf("cr8: %lx\n", getcr8());
#endif
@@ -1858,7 +1834,8 @@ instr_is_segregs_pop(caddr_t pc)
#endif /* __i386 */
/*
- * Test to see if the instruction is part of _sys_rtt.
+ * Test to see if the instruction is part of _sys_rtt (or the KPTI trampolines
+ * which are used by _sys_rtt).
*
* Again on the hypervisor if we try to IRET to user land with a bad code
* or stack selector we will get vectored through xen_failsafe_callback.
@@ -1870,6 +1847,19 @@ instr_is_sys_rtt(caddr_t pc)
{
extern void _sys_rtt(), _sys_rtt_end();
+#if defined(__amd64) && !defined(__xpv)
+ extern void tr_sysc_ret_start(), tr_sysc_ret_end();
+ extern void tr_intr_ret_start(), tr_intr_ret_end();
+
+ if ((uintptr_t)pc >= (uintptr_t)tr_sysc_ret_start &&
+ (uintptr_t)pc <= (uintptr_t)tr_sysc_ret_end)
+ return (1);
+
+ if ((uintptr_t)pc >= (uintptr_t)tr_intr_ret_start &&
+ (uintptr_t)pc <= (uintptr_t)tr_intr_ret_end)
+ return (1);
+#endif
+
if ((uintptr_t)pc < (uintptr_t)_sys_rtt ||
(uintptr_t)pc > (uintptr_t)_sys_rtt_end)
return (0);
diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h
index e2adaaaed9..cf1a252c28 100644
--- a/usr/src/uts/i86pc/sys/machcpuvar.h
+++ b/usr/src/uts/i86pc/sys/machcpuvar.h
@@ -23,7 +23,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_MACHCPUVAR_H
@@ -40,6 +40,9 @@ extern "C" {
#include <sys/rm_platter.h>
#include <sys/avintr.h>
#include <sys/pte.h>
+#include <sys/stddef.h>
+#include <sys/debug.h>
+#include <sys/cpuvar.h>
#ifndef _ASM
/*
@@ -78,6 +81,76 @@ struct xen_evt_data {
ulong_t evt_affinity[sizeof (ulong_t) * 8]; /* service on cpu */
};
+struct kpti_frame {
+ uint64_t kf_lower_redzone;
+
+ /* Stashed value of %cr3 when we entered the trampoline. */
+ greg_t kf_tr_cr3;
+
+ /*
+ * We use %r13-r14 as scratch registers in the trampoline code,
+ * so stash those here "below" the rest of the stack so they can be
+ * pushed/popped if needed.
+ */
+ greg_t kf_r14;
+ greg_t kf_r13;
+
+ /*
+ * Part of this struct is used as the HW stack frame when taking an
+ * interrupt on the user page table. The CPU is going to push a bunch
+ * of regs onto the stack pointer set in the TSS/IDT (which we set to
+ * &kf_rsp here).
+ *
+ * This is only a temporary holding area for them (we'll move them over
+ * to the real interrupt stack once we've set %cr3).
+ *
+ * Note that these must be cleared during a process switch on this cpu.
+ */
+ greg_t kf_err; /* Bottom of initial hw stack frame */
+ greg_t kf_rip;
+ greg_t kf_cs;
+ greg_t kf_rflags;
+ greg_t kf_rsp;
+ greg_t kf_ss;
+
+ greg_t kf_tr_rsp; /* Top of HW stack frame */
+ /* We also write this with the %rsp value on tramp entry */
+
+ /* Written to 0x1 when this kpti_frame is in use. */
+ uint64_t kf_tr_flag;
+
+ uint64_t kf_middle_redzone;
+
+ /*
+ * The things we need to write to %cr3 to change between page tables.
+ * These live "above" the HW stack.
+ */
+ greg_t kf_kernel_cr3;
+ greg_t kf_user_cr3;
+ greg_t kf_tr_ret_rsp;
+
+ uint64_t kf_unused; /* For 16-byte align */
+
+ uint64_t kf_upper_redzone;
+};
+
+/*
+ * This first value, MACHCPU_SIZE, is the size of all the members in the cpu_t
+ * AND struct machcpu, before we get to the mcpu_pad and the kpti area.
+ * The KPTI area is used to hold per-CPU data that is visible in both sets of
+ * page-tables, and hence must be page-aligned and page-sized. See
+ * hat_pcp_setup().
+ *
+ * There is a CTASSERT in os/intr.c that checks these numbers.
+ */
+#if defined(__amd64)
+#define MACHCPU_SIZE (572 + 1584)
+#else
+#define MACHCPU_SIZE (452 + 1328)
+#endif
+#define MACHCPU_PAD (MMU_PAGESIZE - MACHCPU_SIZE)
+#define MACHCPU_PAD2 (MMU_PAGESIZE - 16 - 3 * sizeof (struct kpti_frame))
+
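
As a worked check of these numbers (assuming MMU_PAGESIZE is 4096, greg_t and
uint64_t are both 8 bytes, and struct kpti_frame has no internal padding): the
frame above has 18 eight-byte members, so sizeof (struct kpti_frame) is 144.
Three frames plus the 16-byte mcpu_pad3 occupy 3 * 144 + 16 = 448 bytes, which
leaves MACHCPU_PAD2 = 4096 - 448 = 3648 bytes of mcpu_pad2 to fill out the
page. Likewise, on amd64 MACHCPU_SIZE is 572 + 1584 = 2156, so MACHCPU_PAD is
4096 - 2156 = 1940 bytes. These are the relationships the CTASSERTs in
os/intr.c verify at build time.
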
struct machcpu {
/*
* x_call fields - used for interprocessor cross calls
@@ -103,6 +176,8 @@ struct machcpu {
gate_desc_t *mcpu_idt; /* current IDT */
tss_t *mcpu_tss; /* TSS */
+ void *mcpu_ldt;
+ size_t mcpu_ldt_len;
kmutex_t mcpu_ppaddr_mutex;
caddr_t mcpu_caddr1; /* per cpu CADDR1 */
@@ -147,6 +222,15 @@ struct machcpu {
* The low order bits will be incremented on every interrupt.
*/
volatile uint32_t mcpu_istamp;
+
+ char mcpu_pad[MACHCPU_PAD];
+
+ /* This is the start of the page */
+ char mcpu_pad2[MACHCPU_PAD2];
+ struct kpti_frame mcpu_kpti;
+ struct kpti_frame mcpu_kpti_flt;
+ struct kpti_frame mcpu_kpti_dbg;
+ char mcpu_pad3[16];
};
#define NINTR_THREADS (LOCK_LEVEL-1) /* number of interrupt threads */
@@ -167,7 +251,6 @@ struct machcpu {
#define cpu_gdt cpu_m.mcpu_gdt
#define cpu_idt cpu_m.mcpu_idt
#define cpu_tss cpu_m.mcpu_tss
-#define cpu_ldt cpu_m.mcpu_ldt
#define cpu_caddr1 cpu_m.mcpu_caddr1
#define cpu_caddr2 cpu_m.mcpu_caddr2
#define cpu_softinfo cpu_m.mcpu_softinfo
diff --git a/usr/src/uts/i86pc/sys/machparam.h b/usr/src/uts/i86pc/sys/machparam.h
index 21d725b862..51d7559483 100644
--- a/usr/src/uts/i86pc/sys/machparam.h
+++ b/usr/src/uts/i86pc/sys/machparam.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015 by Delphix. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1988 AT&T */
@@ -293,7 +293,8 @@ extern "C" {
#endif /* __i386 */
/*
- * Reserve pages just below KERNEL_TEXT for the GDT, IDT, TSS and debug info.
+ * Reserve pages just below KERNEL_TEXT for the GDT, IDT, LDT, TSS and debug
+ * info.
*
* For now, DEBUG_INFO_VA must be first in this list for "xm" initiated dumps
* of solaris domUs to be usable with mdb. Relying on a fixed VA is not viable
@@ -303,7 +304,8 @@ extern "C" {
#define DEBUG_INFO_VA (KERNEL_TEXT - MMU_PAGESIZE)
#define GDT_VA (DEBUG_INFO_VA - MMU_PAGESIZE)
#define IDT_VA (GDT_VA - MMU_PAGESIZE)
-#define KTSS_VA (IDT_VA - MMU_PAGESIZE)
+#define LDT_VA (IDT_VA - (16 * MMU_PAGESIZE))
+#define KTSS_VA (LDT_VA - MMU_PAGESIZE)
#define DFTSS_VA (KTSS_VA - MMU_PAGESIZE)
#define MISC_VA_BASE (DFTSS_VA)
#define MISC_VA_SIZE (KERNEL_TEXT - MISC_VA_BASE)
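
Plugging in the amd64 values from the address map in startup.c (KERNEL_TEXT at
0xFFFFFFFF.FB800000, MMU_PAGESIZE of 0x1000), these definitions lay out the
reserved strip as:

	DEBUG_INFO_VA	0xFFFFFFFF.FB7FF000
	GDT_VA		0xFFFFFFFF.FB7FE000
	IDT_VA		0xFFFFFFFF.FB7FD000
	LDT_VA		0xFFFFFFFF.FB7ED000	(16 pages below IDT_VA)
	KTSS_VA		0xFFFFFFFF.FB7EC000
	DFTSS_VA	0xFFFFFFFF.FB7EB000

with MISC_VA_BASE/MISC_VA_SIZE covering everything from DFTSS_VA up to
KERNEL_TEXT.
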
diff --git a/usr/src/uts/i86pc/sys/machprivregs.h b/usr/src/uts/i86pc/sys/machprivregs.h
index 3ef6a768a0..53b14a8de8 100644
--- a/usr/src/uts/i86pc/sys/machprivregs.h
+++ b/usr/src/uts/i86pc/sys/machprivregs.h
@@ -22,13 +22,13 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_MACHPRIVREGS_H
#define _SYS_MACHPRIVREGS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Platform dependent instruction sequences for manipulating
* privileged state
@@ -77,8 +77,6 @@ extern "C" {
#define SYSRETQ sysretq
#define SYSRETL sysretl
#define SWAPGS swapgs
-#define XPV_TRAP_POP /* empty */
-#define XPV_TRAP_PUSH /* empty */
#elif defined(__i386)
@@ -86,6 +84,8 @@ extern "C" {
#endif /* __i386 */
+#define XPV_TRAP_POP /* empty */
+#define XPV_TRAP_PUSH /* empty */
#define CLEAN_CS /* empty */
@@ -129,7 +129,7 @@ extern "C" {
movq REGOFF_RDI(%rsp), %rdi; \
addq $REGOFF_RIP, %rsp
-#define FAST_INTR_RETURN iretq
+#define FAST_INTR_RETURN jmp tr_iret_user
#elif defined(__i386)
diff --git a/usr/src/uts/i86pc/sys/rm_platter.h b/usr/src/uts/i86pc/sys/rm_platter.h
index ea63abf77d..15ab068854 100644
--- a/usr/src/uts/i86pc/sys/rm_platter.h
+++ b/usr/src/uts/i86pc/sys/rm_platter.h
@@ -26,7 +26,7 @@
* All rights reserved.
*/
/*
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_RM_PLATTER_H
@@ -113,7 +113,12 @@ typedef struct rm_platter {
* Since DEFAULTSTKSIZE is a multiple of PAGESIZE tss will be aligned.
*/
struct cpu_tables {
- char ct_stack[DEFAULTSTKSZ];
+ /* IST stacks */
+ char ct_stack1[DEFAULTSTKSZ]; /* dblfault */
+#if defined(__amd64) && !defined(__xpv)
+ char ct_stack2[DEFAULTSTKSZ]; /* nmi */
+ char ct_stack3[DEFAULTSTKSZ]; /* mce */
+#endif
tss_t ct_tss;
};
diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c
index e16933dbde..8690c46adf 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.c
+++ b/usr/src/uts/i86pc/vm/hat_i86.c
@@ -27,7 +27,7 @@
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2017 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc. All rights reserved.
* Copyright (c) 2014, 2015 by Delphix. All rights reserved.
*/
@@ -43,6 +43,191 @@
* Routines used only inside of i86pc/vm start with hati_ for HAT Internal.
*/
+/*
+ * amd64 HAT Design
+ *
+ * ----------
+ * Background
+ * ----------
+ *
+ * On x86, the address space is shared between a user process and the kernel.
+ * This is different from SPARC. Conventionally, the kernel lives at the top of
+ * the address space and the user process gets to enjoy the rest of it. If you
+ * look at the image of the address map in uts/i86pc/os/startup.c, you'll get a
+ * rough sense of how the address space is laid out and used.
+ *
+ * Every unique address space is represented by an instance of a HAT structure
+ * called a 'hat_t'. In addition to a hat_t structure for each process, there is
+ * also one that is used for the kernel (kas.a_hat), and each CPU ultimately
+ * also has a HAT.
+ *
+ * Each HAT contains a pointer to its root page table. This root page table is
+ * what we call an L3 page table in illumos and Intel calls the PML4. It is the
+ * physical address of the L3 table that we place in the %cr3 register which the
+ * processor uses.
+ *
+ * Each of the many layers of the page table is represented by a structure
+ * called an htable_t. The htable_t manages a set of 512 8-byte entries. The
+ * number of entries in a given page table is constant across all different
+ * level page tables. Note, this is only true on amd64. This has not always been
+ * the case on x86.
+ *
+ * Each entry in a page table, generally referred to as a PTE, may refer to
+ * another page table or a memory location, depending on the level of the page
+ * table and the use of large pages. Importantly, the top-level L3 page table
+ * (PML4) only supports linking to further page tables. This is also true on
+ * systems which support a 5th level page table (which we do not currently
+ * support).
+ *
+ * Historically, on x86, when a process was running on CPU, the root of the page
+ * table was inserted into %cr3 on each CPU on which it was currently running.
+ * When processes would switch (by calling hat_switch()), then the value in %cr3
+ * on that CPU would change to that of the new HAT. While this behavior is still
+ * maintained in the xpv kernel, this is not what is done today.
+ *
+ * -------------------
+ * Per-CPU Page Tables
+ * -------------------
+ *
+ * Throughout the system the 64-bit kernel has a notion of what it calls a
+ * per-CPU page table or PCP. The notion of a per-CPU page table was originally
+ * introduced as part of the original work to support x86 PAE. On the 64-bit
+ * kernel, it was originally used for 32-bit processes running on the 64-bit
+ * kernel. The rationale behind this was that each 32-bit process could have all
+ * of its memory represented in a single L2 page table as each L2 page table
+ * entry represents 1 GiB of memory.
+ *
+ * Following on from this, the idea was that given that all of the L3 page table
+ * entries for 32-bit processes are basically going to be identical with the
+ * exception of the first entry in the page table, why not share those page
+ * table entries. This gave rise to the idea of a per-CPU page table.
+ *
+ * The way this works is that we have a member in the machcpu_t called the
+ * mcpu_hat_info. That structure contains two different 4k pages: one that
+ * represents the L3 page table and one that represents an L2 page table. When
+ * the CPU starts up, the L3 page table entries are copied in from the kernel's
+ * page table. The L3 kernel entries do not change throughout the lifetime of
+ * the kernel. The kernel portion of these L3 pages is the same on every CPU,
+ * meaning that they point to the same L2 page tables and thus see a
+ * consistent view of the world.
+ *
+ * When a 32-bit process is loaded into this world, we copy the 32-bit process's
+ * four top-level page table entries into the CPU's L2 page table and then set
+ * the CPU's first L3 page table entry to point to the CPU's L2 page.
+ * Specifically, in hat_pcp_update(), we're copying from the process's
+ * HAT_COPIED_32 HAT into the page tables specific to this CPU.
+ *
+ * As part of the implementation of kernel page table isolation, this was also
+ * extended to 64-bit processes. When a 64-bit process runs, we'll copy their L3
+ * PTEs across into the current CPU's L3 page table. (As we can't do the
+ * first-L3-entry trick for 64-bit processes, ->hci_pcp_l2ptes is unused in this
+ * case.)
+ *
+ * The use of per-CPU page tables has a lot of implementation ramifications. A
+ * HAT that runs a user process will be flagged with the HAT_COPIED flag to
+ * indicate that it is using the per-CPU page table functionality. In tandem
+ * with the HAT, the top-level htable_t will be flagged with the HTABLE_COPIED
+ * flag. If the HAT represents a 32-bit process, then we will also set the
+ * HAT_COPIED_32 flag on that hat_t.
+ *
+ * These two flags work together. The top-level htable_t when using per-CPU page
+ * tables is 'virtual'. We never allocate a ptable for this htable_t (i.e.
+ * ht->ht_pfn is PFN_INVALID). Instead, when we need to modify a PTE in an
+ * HTABLE_COPIED ptable, x86pte_access_pagetable() will redirect any accesses to
+ * ht_hat->hat_copied_ptes.
+ *
+ * Of course, such a modification won't actually modify the HAT_PCP page tables
+ * that were copied from the HAT_COPIED htable. When we change the top level
+ * page table entries (L2 PTEs for a 32-bit process and L3 PTEs for a 64-bit
+ * process), we need to make sure to trigger hat_pcp_update() on all CPUs that
+ * are currently tied to this HAT (including the current CPU).
+ *
+ * To do this, PCP piggy-backs on TLB invalidation, specifically via the
+ * hat_tlb_inval() path from link_ptp() and unlink_ptp().
+ *
+ * (Importantly, in all such cases, when this is in operation, the top-level
+ * entry should not be able to refer to an actual page table entry that can be
+ * changed and consolidated into a large page. If large page consolidation is
+ * required here, then there will be much that needs to be reconsidered.)
+ *
+ * -----------------------------------------------
+ * Kernel Page Table Isolation and the Per-CPU HAT
+ * -----------------------------------------------
+ *
+ * All Intel CPUs that support speculative execution and paging are subject to a
+ * series of bugs that have been termed 'Meltdown'. These exploits allow a user
+ * process to read kernel memory through cache side channels and speculative
+ * execution. To mitigate this on vulnerable CPUs, we need to use a technique
+ * called kernel page table isolation. What this requires is that we have two
+ * different page table roots. When executing in kernel mode, we will use a %cr3
+ * value that has both the user and kernel pages. However when executing in user
+ * mode, we will need to have a %cr3 that has all of the user pages; however,
+ * only a subset of the kernel pages required to operate.
+ *
+ * These kernel pages that we need mapped are:
+ *
+ * o Kernel Text that allows us to switch between the cr3 values.
+ * o The current global descriptor table (GDT)
+ * o The current interrupt descriptor table (IDT)
+ * o The current task switching state (TSS)
+ * o The current local descriptor table (LDT)
+ * o Stacks and scratch space used by the interrupt handlers
+ *
+ * For more information on the stack switching techniques, construction of the
+ * trampolines, and more, please see i86pc/ml/kpti_trampolines.s. The most
+ * important part of these mappings are the following two constraints:
+ *
+ * o The mappings are all per-CPU (except for read-only text)
+ * o The mappings are static. They are all established before the CPU is
+ * started (with the exception of the boot CPU).
+ *
+ * To facilitate the kernel page table isolation we employ our per-CPU
+ * page tables discussed in the previous section and add the notion of a per-CPU
+ * HAT. Fundamentally we have a second page table root. There is both a kernel
+ * page table (hci_pcp_l3ptes), and a user L3 page table (hci_user_l3ptes).
+ * Both will have the user page table entries copied into them, the same way
+ * that we discussed in the section 'Per-CPU Page Tables'.
+ *
+ * The complex part of this is how we construct the set of kernel mappings
+ * that should be present when running with the user page table. To answer that,
+ * we add the notion of a per-CPU HAT. This HAT functions like a normal HAT,
+ * except that it's not really associated with an address space the same way
+ * that other HATs are.
+ *
+ * This HAT lives off of the 'struct hat_cpu_info' which is a member of the
+ * machcpu in the member hci_user_hat. We use this per-CPU HAT to create the set
+ * of kernel mappings that should be present on this CPU. The kernel mappings
+ * are added to the per-CPU HAT through the function hati_cpu_punchin(). Once a
+ * mapping has been punched in, it may not be punched out. The reason that we
+ * opt to leverage a HAT structure is that it knows how to allocate and manage
+ * all of the lower level page tables as required.
+ *
+ * Because all of the mappings are present at the beginning of time for this CPU
+ * and none of the mappings are in the kernel pageable segment, we don't have to
+ * worry about faulting on these HAT structures and thus the notion of the
+ * current HAT that we're using is always the appropriate HAT for the process
+ * (usually a user HAT or the kernel's HAT).
+ *
+ * A further constraint we place on the system with these per-CPU HATs is that
+ * they are not subject to htable_steal(). Because each CPU will have a rather
+ * fixed number of page tables, the same way that we don't steal from the
+ * kernel's HAT, it was determined that we should not steal from this HAT due to
+ * the complications involved and somewhat criminal nature of htable_steal().
+ *
+ * The per-CPU HAT is initialized in hat_pcp_setup() which is called as part of
+ * onlining the CPU, but before the CPU is actually started. The per-CPU HAT is
+ * removed in hat_pcp_teardown() which is called when a CPU is being offlined to
+ * be removed from the system (which is different from what psradm usually
+ * does).
+ *
+ * Finally, once the CPU has been onlined, the set of mappings in the per-CPU
+ * HAT must not change. The HAT related functions that we call are not meant to
+ * be called when we're switching between processes. For example, it is quite
+ * possible that if they were, they would try to grab an htable mutex which
+ * another thread might have. One needs to treat hat_switch() as though it
+ * were above LOCK_LEVEL and therefore _must not_ block under any circumstance.
+ */
+
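
To make the copied-PTE flow described above concrete, here is a deliberately
simplified, user-space sketch of the idea that each CPU owns two root tables
whose user slots are overwritten from the incoming process while the kernel
slots stay fixed. The toy_* names and fixed sizes are hypothetical, and all of
the locking, TLB shootdown, and %cr3 handling is omitted:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define	TOY_NPTES	512	/* entries per root page table */
	#define	TOY_USLOTS	256	/* stand-in for mmu.top_level_uslots */

	typedef struct toy_hat {
		uint64_t th_copied_ptes[TOY_USLOTS];	/* process's user L3 PTEs */
	} toy_hat_t;

	typedef struct toy_cpu {
		uint64_t tc_kern_l3[TOY_NPTES];	/* kernel-mode root */
		uint64_t tc_user_l3[TOY_NPTES];	/* user-mode (KPTI) root */
	} toy_cpu_t;

	/* Analogous in spirit to hat_pcp_update(): refresh only the user slots. */
	static void
	toy_pcp_update(toy_cpu_t *cpu, const toy_hat_t *hat)
	{
		(void) memcpy(cpu->tc_kern_l3, hat->th_copied_ptes,
		    sizeof (hat->th_copied_ptes));
		(void) memcpy(cpu->tc_user_l3, hat->th_copied_ptes,
		    sizeof (hat->th_copied_ptes));
		/* The kernel half, slots TOY_USLOTS..TOY_NPTES-1, is never touched. */
	}

	int
	main(void)
	{
		static toy_cpu_t cpu;
		toy_hat_t proc = { .th_copied_ptes = { 0 } };

		proc.th_copied_ptes[0] = 0x200000 | 0x7;	/* a fake user PTE */
		cpu.tc_kern_l3[TOY_NPTES - 1] = 0xdeadbeef;	/* a fake kernel slot */

		toy_pcp_update(&cpu, &proc);
		(void) printf("user slot 0 %#llx, kernel slot 511 %#llx\n",
		    (unsigned long long)cpu.tc_user_l3[0],
		    (unsigned long long)cpu.tc_kern_l3[TOY_NPTES - 1]);
		return (0);
	}

In the real code the destinations are the per-CPU hci_pcp_l3ptes and
hci_user_l3ptes pages, and the copy happens when a process is switched onto
the CPU.
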
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/mman.h>
@@ -96,15 +281,18 @@ struct hat_mmu_info mmu;
*
* For 32 bit PAE support on i86pc, the kernel hat will use the 1st 4 entries
* on this 4K page for its top level page table. The remaining groups of
- * 4 entries are used for per processor copies of user VLP pagetables for
+ * 4 entries are used for per processor copies of user PCP pagetables for
* running threads. See hat_switch() and reload_pae32() for details.
*
- * vlp_page[0..3] - level==2 PTEs for kernel HAT
- * vlp_page[4..7] - level==2 PTEs for user thread on cpu 0
- * vlp_page[8..11] - level==2 PTE for user thread on cpu 1
+ * pcp_page[0..3] - level==2 PTEs for kernel HAT
+ * pcp_page[4..7] - level==2 PTEs for user thread on cpu 0
+ * pcp_page[8..11] - level==2 PTE for user thread on cpu 1
* etc...
+ *
+ * On the 64-bit kernel, this is the normal root of the page table and there is
+ * nothing special about it when used for other CPUs.
*/
-static x86pte_t *vlp_page;
+static x86pte_t *pcp_page;
/*
* forward declaration of internal utility routines
@@ -171,7 +359,7 @@ kmutex_t hat_list_lock;
kcondvar_t hat_list_cv;
kmem_cache_t *hat_cache;
kmem_cache_t *hat_hash_cache;
-kmem_cache_t *vlp_hash_cache;
+kmem_cache_t *hat32_hash_cache;
/*
* Simple statistics
@@ -237,6 +425,32 @@ hati_constructor(void *buf, void *handle, int kmflags)
}
/*
+ * Put it at the start of the global list of all hats (used by stealing)
+ *
+ * kas.a_hat is not in the list but is instead used to find the
+ * first and last items in the list.
+ *
+ * - kas.a_hat->hat_next points to the start of the user hats.
+ * The list ends where hat->hat_next == NULL
+ *
+ * - kas.a_hat->hat_prev points to the last of the user hats.
+ * The list begins where hat->hat_prev == NULL
+ */
+static void
+hat_list_append(hat_t *hat)
+{
+ mutex_enter(&hat_list_lock);
+ hat->hat_prev = NULL;
+ hat->hat_next = kas.a_hat->hat_next;
+ if (hat->hat_next)
+ hat->hat_next->hat_prev = hat;
+ else
+ kas.a_hat->hat_prev = hat;
+ kas.a_hat->hat_next = hat;
+ mutex_exit(&hat_list_lock);
+}
+
+/*
* Allocate a hat structure for as. We also create the top level
* htable and initialize it to contain the kernel hat entries.
*/
@@ -245,7 +459,7 @@ hat_alloc(struct as *as)
{
hat_t *hat;
htable_t *ht; /* top level htable */
- uint_t use_vlp;
+ uint_t use_copied;
uint_t r;
hat_kernel_range_t *rp;
uintptr_t va;
@@ -253,6 +467,7 @@ hat_alloc(struct as *as)
uint_t start;
uint_t cnt;
htable_t *src;
+ boolean_t use_hat32_cache;
/*
* Once we start creating user process HATs we can enable
@@ -269,30 +484,71 @@ hat_alloc(struct as *as)
#if defined(__xpv)
/*
- * No VLP stuff on the hypervisor due to the 64-bit split top level
+ * No PCP stuff on the hypervisor due to the 64-bit split top level
* page tables. On 32-bit it's not needed as the hypervisor takes
* care of copying the top level PTEs to a below 4Gig page.
*/
- use_vlp = 0;
+ use_copied = 0;
+ use_hat32_cache = B_FALSE;
+ hat->hat_max_level = mmu.max_level;
+ hat->hat_num_copied = 0;
+ hat->hat_flags = 0;
#else /* __xpv */
- /* 32 bit processes uses a VLP style hat when running with PAE */
#if defined(__amd64)
- use_vlp = (ttoproc(curthread)->p_model == DATAMODEL_ILP32);
+
+ /*
+ * All processes use HAT_COPIED on the 64-bit kernel if KPTI is
+ * turned on.
+ */
+ if (ttoproc(curthread)->p_model == DATAMODEL_ILP32) {
+ use_copied = 1;
+ hat->hat_max_level = mmu.max_level32;
+ hat->hat_num_copied = mmu.num_copied_ents32;
+ use_hat32_cache = B_TRUE;
+ hat->hat_flags |= HAT_COPIED_32;
+ HATSTAT_INC(hs_hat_copied32);
+ } else if (kpti_enable == 1) {
+ use_copied = 1;
+ hat->hat_max_level = mmu.max_level;
+ hat->hat_num_copied = mmu.num_copied_ents;
+ use_hat32_cache = B_FALSE;
+ HATSTAT_INC(hs_hat_copied64);
+ } else {
+ use_copied = 0;
+ use_hat32_cache = B_FALSE;
+ hat->hat_max_level = mmu.max_level;
+ hat->hat_num_copied = 0;
+ hat->hat_flags = 0;
+ HATSTAT_INC(hs_hat_normal64);
+ }
#elif defined(__i386)
- use_vlp = mmu.pae_hat;
+ use_copied = mmu.pae_hat;
+ if (use_copied) {
+ use_hat32_cache = B_TRUE;
+ hat->hat_num_copied = mmu.num_copied_ents;
+ HATSTAT_INC(hs_hat_copied32);
+ } else {
+ use_hat32_cache = B_FALSE;
+ hat->hat_num_copied = 0;
+ }
#endif
#endif /* __xpv */
- if (use_vlp) {
- hat->hat_flags = HAT_VLP;
- bzero(hat->hat_vlp_ptes, VLP_SIZE);
+ if (use_copied) {
+ hat->hat_flags |= HAT_COPIED;
+ bzero(hat->hat_copied_ptes, sizeof (hat->hat_copied_ptes));
}
/*
- * Allocate the htable hash
+ * Allocate the htable hash. For 32-bit PCP processes we use the
+ * hat32_hash_cache. However, for 64-bit PCP processes we do not as the
+ * number of entries that they have to handle is closer to
+ * hat_hash_cache in count (though there will be more wastage when we
+ * have more DRAM in the system and thus push down the user address
+ * range).
*/
- if ((hat->hat_flags & HAT_VLP)) {
- hat->hat_num_hash = mmu.vlp_hash_cnt;
- hat->hat_ht_hash = kmem_cache_alloc(vlp_hash_cache, KM_SLEEP);
+ if (use_hat32_cache) {
+ hat->hat_num_hash = mmu.hat32_hash_cnt;
+ hat->hat_ht_hash = kmem_cache_alloc(hat32_hash_cache, KM_SLEEP);
} else {
hat->hat_num_hash = mmu.hash_cnt;
hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP);
@@ -310,7 +566,7 @@ hat_alloc(struct as *as)
hat->hat_htable = ht;
#if defined(__amd64)
- if (hat->hat_flags & HAT_VLP)
+ if (hat->hat_flags & HAT_COPIED)
goto init_done;
#endif
@@ -335,9 +591,9 @@ hat_alloc(struct as *as)
start;
#if defined(__i386) && !defined(__xpv)
- if (ht->ht_flags & HTABLE_VLP) {
- bcopy(&vlp_page[start],
- &hat->hat_vlp_ptes[start],
+ if (ht->ht_flags & HTABLE_COPIED) {
+ bcopy(&pcp_page[start],
+ &hat->hat_copied_ptes[start],
cnt * sizeof (x86pte_t));
continue;
}
@@ -362,30 +618,54 @@ init_done:
#endif
XPV_ALLOW_MIGRATE();
+ hat_list_append(hat);
+
+ return (hat);
+}
+
+#if !defined(__xpv)
+/*
+ * Cons up a HAT for a CPU. This represents the user mappings. This will have
+ * various kernel pages punched into it manually. Importantly, this hat is
+ * ineligible for stealing. We really don't want to deal with this ever
+ * faulting and figuring out that this is happening, much like we don't with
+ * kas.
+ */
+static hat_t *
+hat_cpu_alloc(cpu_t *cpu)
+{
+ hat_t *hat;
+ htable_t *ht;
+
+ hat = kmem_cache_alloc(hat_cache, KM_SLEEP);
+ hat->hat_as = NULL;
+ mutex_init(&hat->hat_mutex, NULL, MUTEX_DEFAULT, NULL);
+ hat->hat_max_level = mmu.max_level;
+ hat->hat_num_copied = 0;
+ hat->hat_flags = HAT_PCP;
+
+ hat->hat_num_hash = mmu.hash_cnt;
+ hat->hat_ht_hash = kmem_cache_alloc(hat_hash_cache, KM_SLEEP);
+ bzero(hat->hat_ht_hash, hat->hat_num_hash * sizeof (htable_t *));
+
+ hat->hat_next = hat->hat_prev = NULL;
+
/*
- * Put it at the start of the global list of all hats (used by stealing)
- *
- * kas.a_hat is not in the list but is instead used to find the
- * first and last items in the list.
- *
- * - kas.a_hat->hat_next points to the start of the user hats.
- * The list ends where hat->hat_next == NULL
- *
- * - kas.a_hat->hat_prev points to the last of the user hats.
- * The list begins where hat->hat_prev == NULL
+ * Because this HAT will only ever be used by the current CPU, we'll go
+ * ahead and set the CPUSET up to only point to the CPU in question.
*/
- mutex_enter(&hat_list_lock);
- hat->hat_prev = NULL;
- hat->hat_next = kas.a_hat->hat_next;
- if (hat->hat_next)
- hat->hat_next->hat_prev = hat;
- else
- kas.a_hat->hat_prev = hat;
- kas.a_hat->hat_next = hat;
- mutex_exit(&hat_list_lock);
+ CPUSET_ADD(hat->hat_cpus, cpu->cpu_id);
+
+ hat->hat_htable = NULL;
+ hat->hat_ht_cached = NULL;
+ ht = htable_create(hat, (uintptr_t)0, TOP_LEVEL(hat), NULL);
+ hat->hat_htable = ht;
+
+ hat_list_append(hat);
return (hat);
}
+#endif /* !__xpv */
/*
* process has finished executing but as has not been cleaned up yet.
@@ -442,6 +722,7 @@ hat_free_end(hat_t *hat)
/*
* On the hypervisor, unpin top level page table(s)
*/
+ VERIFY3U(hat->hat_flags & HAT_PCP, ==, 0);
xen_unpin(hat->hat_htable->ht_pfn);
#if defined(__amd64)
xen_unpin(hat->hat_user_ptable);
@@ -456,14 +737,25 @@ hat_free_end(hat_t *hat)
/*
* Decide which kmem cache the hash table came from, then free it.
*/
- if (hat->hat_flags & HAT_VLP)
- cache = vlp_hash_cache;
- else
+ if (hat->hat_flags & HAT_COPIED) {
+#if defined(__amd64)
+ if (hat->hat_flags & HAT_COPIED_32) {
+ cache = hat32_hash_cache;
+ } else {
+ cache = hat_hash_cache;
+ }
+#else
+ cache = hat32_hash_cache;
+#endif
+ } else {
cache = hat_hash_cache;
+ }
kmem_cache_free(cache, hat->hat_ht_hash);
hat->hat_ht_hash = NULL;
hat->hat_flags = 0;
+ hat->hat_max_level = 0;
+ hat->hat_num_copied = 0;
kmem_cache_free(hat_cache, hat);
}
@@ -518,6 +810,53 @@ set_max_page_level()
}
/*
+ * Determine the number of slots that are in use in the top-most level page
+ * table for user memory. This is based on _userlimit. In effect this is similar
+ * to htable_va2entry, but without the convenience of having an htable.
+ */
+void
+mmu_calc_user_slots(void)
+{
+ uint_t ent, nptes;
+ uintptr_t shift;
+
+ nptes = mmu.top_level_count;
+ shift = _userlimit >> mmu.level_shift[mmu.max_level];
+ ent = shift & (nptes - 1);
+
+ /*
+ * Ent tells us the slot that the page for _userlimit would fit in. We
+ * need to add one to this to cover the total number of entries.
+ */
+ mmu.top_level_uslots = ent + 1;
+
+#if defined(__amd64)
+ /*
+	 * When running 32-bit compatibility processes on a 64-bit kernel, we
+ * will only need to use one slot.
+ */
+ mmu.top_level_uslots32 = 1;
+
+ /*
+ * Record the number of PCP page table entries that we'll need to copy
+ * around. For 64-bit processes this is the number of user slots. For
+	 * 32-bit processes, this is four 1 GiB pages.
+ */
+ mmu.num_copied_ents = mmu.top_level_uslots;
+ mmu.num_copied_ents32 = 4;
+#elif defined(__xpv)
+ /*
+	 * On the hypervisor, only 32-bit PAE hats have top-level entries to
+	 * copy: the four PTEs of the PAE root table.
+ */
+ if (mmu.pae_hat) {
+ mmu.num_copied_ents = 4;
+ } else {
+ mmu.num_copied_ents = 0;
+ }
+#endif
+}
+
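
As a worked example of this calculation, take a hypothetical _userlimit just
below the canonical address hole, say 0x00007FFFFFFFF000, with the amd64 level
shifts set up in mmu_init() (12, 21, 30, and 39 for the top level) and a
512-entry top-level table: shifting right by 39 gives 0xFF, masking with 511
leaves entry 255, and adding one yields 256 user slots, i.e. the lower half of
the top-level page table.
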
+/*
* Initialize hat data structures based on processor MMU information.
*/
void
@@ -536,6 +875,17 @@ mmu_init(void)
(getcr4() & CR4_PGE) != 0)
mmu.pt_global = PT_GLOBAL;
+#if defined(__amd64) && !defined(__xpv)
+ /*
+ * The 64-bit x86 kernel has split user/kernel page tables. As such we
+ * cannot have the global bit set. The simplest way for us to deal with
+ * this is to just say that pt_global is zero, so the global bit isn't
+ * present.
+ */
+ if (kpti_enable == 1)
+ mmu.pt_global = 0;
+#endif
+
/*
* Detect NX and PAE usage.
*/
@@ -594,6 +944,11 @@ mmu_init(void)
mmu.ptes_per_table = 512;
mmu.top_level_count = 512;
+ /*
+	 * 32-bit processes never need page table levels above the level 2
+	 * (1 GiB) entries.
+ */
+ mmu.max_level32 = 2;
+
mmu.level_shift[0] = 12;
mmu.level_shift[1] = 21;
mmu.level_shift[2] = 30;
@@ -630,6 +985,7 @@ mmu_init(void)
}
set_max_page_level();
+ mmu_calc_user_slots();
mmu_page_sizes = mmu.max_page_level + 1;
mmu_exported_page_sizes = mmu.umax_page_level + 1;
@@ -665,7 +1021,7 @@ mmu_init(void)
mmu.hash_cnt = MMU_PAGESIZE / sizeof (htable_t *);
while (mmu.hash_cnt > 16 && mmu.hash_cnt >= max_htables)
mmu.hash_cnt >>= 1;
- mmu.vlp_hash_cnt = mmu.hash_cnt;
+ mmu.hat32_hash_cnt = mmu.hash_cnt;
#if defined(__amd64)
/*
@@ -714,14 +1070,15 @@ hat_init()
NULL, 0, 0);
/*
- * VLP hats can use a smaller hash table size on large memroy machines
+ * 32-bit PCP hats can use a smaller hash table size on large memory
+ * machines
*/
- if (mmu.hash_cnt == mmu.vlp_hash_cnt) {
- vlp_hash_cache = hat_hash_cache;
+ if (mmu.hash_cnt == mmu.hat32_hash_cnt) {
+ hat32_hash_cache = hat_hash_cache;
} else {
- vlp_hash_cache = kmem_cache_create("HatVlpHash",
- mmu.vlp_hash_cnt * sizeof (htable_t *), 0, NULL, NULL, NULL,
- NULL, 0, 0);
+ hat32_hash_cache = kmem_cache_create("Hat32Hash",
+ mmu.hat32_hash_cnt * sizeof (htable_t *), 0, NULL, NULL,
+ NULL, NULL, 0, 0);
}
/*
@@ -738,6 +1095,13 @@ hat_init()
CPUSET_ADD(khat_cpuset, CPU->cpu_id);
/*
+	 * The kernel HAT doesn't use PCP regardless of architecture.
+ */
+ ASSERT3U(mmu.max_level, >, 0);
+ kas.a_hat->hat_max_level = mmu.max_level;
+ kas.a_hat->hat_num_copied = 0;
+
+ /*
* The kernel hat's next pointer serves as the head of the hat list .
* The kernel hat's prev pointer tracks the last hat on the list for
* htable_steal() to use.
@@ -769,57 +1133,165 @@ hat_init()
KM_SLEEP);
}
+
+extern void kpti_tramp_start();
+extern void kpti_tramp_end();
+
+extern void kdi_isr_start();
+extern void kdi_isr_end();
+
+extern gate_desc_t kdi_idt[NIDT];
+
/*
- * Prepare CPU specific pagetables for VLP processes on 64 bit kernels.
+ * Prepare per-CPU pagetables for all processes on the 64 bit kernel.
*
* Each CPU has a set of 2 pagetables that are reused for any 32 bit
- * process it runs. They are the top level pagetable, hci_vlp_l3ptes, and
- * the next to top level table for the bottom 512 Gig, hci_vlp_l2ptes.
+ * process it runs. They are the top level pagetable, hci_pcp_l3ptes, and
+ * the next to top level table for the bottom 512 Gig, hci_pcp_l2ptes.
*/
/*ARGSUSED*/
static void
-hat_vlp_setup(struct cpu *cpu)
+hat_pcp_setup(struct cpu *cpu)
{
-#if defined(__amd64) && !defined(__xpv)
+#if !defined(__xpv)
struct hat_cpu_info *hci = cpu->cpu_hat_info;
- pfn_t pfn;
+ uintptr_t va;
+ size_t len;
/*
* allocate the level==2 page table for the bottom most
* 512Gig of address space (this is where 32 bit apps live)
*/
ASSERT(hci != NULL);
- hci->hci_vlp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
+ hci->hci_pcp_l2ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
/*
* Allocate a top level pagetable and copy the kernel's
- * entries into it. Then link in hci_vlp_l2ptes in the 1st entry.
- */
- hci->hci_vlp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
- hci->hci_vlp_pfn =
- hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l3ptes);
- ASSERT(hci->hci_vlp_pfn != PFN_INVALID);
- bcopy(vlp_page, hci->hci_vlp_l3ptes, MMU_PAGESIZE);
-
- pfn = hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_vlp_l2ptes);
- ASSERT(pfn != PFN_INVALID);
- hci->hci_vlp_l3ptes[0] = MAKEPTP(pfn, 2);
-#endif /* __amd64 && !__xpv */
+ * entries into it. Then link in hci_pcp_l2ptes in the 1st entry.
+ */
+ hci->hci_pcp_l3ptes = kmem_zalloc(MMU_PAGESIZE, KM_SLEEP);
+ hci->hci_pcp_l3pfn =
+ hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l3ptes);
+ ASSERT3U(hci->hci_pcp_l3pfn, !=, PFN_INVALID);
+ bcopy(pcp_page, hci->hci_pcp_l3ptes, MMU_PAGESIZE);
+
+ hci->hci_pcp_l2pfn =
+ hat_getpfnum(kas.a_hat, (caddr_t)hci->hci_pcp_l2ptes);
+ ASSERT3U(hci->hci_pcp_l2pfn, !=, PFN_INVALID);
+
+ /*
+ * Now go through and allocate the user version of these structures.
+ * Unlike with the kernel version, we allocate a hat to represent the
+ * top-level page table as that will make it much simpler when we need
+ * to patch through user entries.
+ */
+ hci->hci_user_hat = hat_cpu_alloc(cpu);
+ hci->hci_user_l3pfn = hci->hci_user_hat->hat_htable->ht_pfn;
+ ASSERT3U(hci->hci_user_l3pfn, !=, PFN_INVALID);
+ hci->hci_user_l3ptes =
+ (x86pte_t *)hat_kpm_mapin_pfn(hci->hci_user_l3pfn);
+
+ /* Skip the rest of this if KPTI is switched off at boot. */
+ if (kpti_enable != 1)
+ return;
+
+ /*
+ * OK, now that we have this we need to go through and punch the normal
+ * holes in the CPU's hat for this. At this point we'll punch in the
+ * following:
+ *
+ * o GDT
+ * o IDT
+ * o LDT
+ * o Trampoline Code
+ * o machcpu KPTI page
+ * o kmdb ISR code page (just trampolines)
+ *
+ * If this is cpu0, then we also can initialize the following because
+ * they'll have already been allocated.
+ *
+ * o TSS for CPU 0
+ * o Double Fault for CPU 0
+ *
+ * The following items have yet to be allocated and have not been
+ * punched in yet. They will be punched in later:
+ *
+ * o TSS (mach_cpucontext_alloc_tables())
+ * o Double Fault Stack (mach_cpucontext_alloc_tables())
+ */
+ hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_gdt, PROT_READ);
+ hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_idt, PROT_READ);
+
+ /*
+ * As the KDI IDT is only active during kmdb sessions (including single
+ * stepping), typically we don't actually need this punched in (we
+ * consider the routines that switch to the user cr3 to be toxic). But
+ * if we ever accidentally end up on the user cr3 while on this IDT,
+ * we'd prefer not to triple fault.
+ */
+ hati_cpu_punchin(cpu, (uintptr_t)&kdi_idt, PROT_READ);
+
+ CTASSERT(((uintptr_t)&kpti_tramp_start % MMU_PAGESIZE) == 0);
+ CTASSERT(((uintptr_t)&kpti_tramp_end % MMU_PAGESIZE) == 0);
+ for (va = (uintptr_t)&kpti_tramp_start;
+ va < (uintptr_t)&kpti_tramp_end; va += MMU_PAGESIZE) {
+ hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC);
+ }
+
+ VERIFY3U(((uintptr_t)cpu->cpu_m.mcpu_ldt) % MMU_PAGESIZE, ==, 0);
+ for (va = (uintptr_t)cpu->cpu_m.mcpu_ldt, len = LDT_CPU_SIZE;
+ len >= MMU_PAGESIZE; va += MMU_PAGESIZE, len -= MMU_PAGESIZE) {
+ hati_cpu_punchin(cpu, va, PROT_READ);
+ }
+
+ /* mcpu_pad2 is the start of the page containing the kpti_frames. */
+ hati_cpu_punchin(cpu, (uintptr_t)&cpu->cpu_m.mcpu_pad2[0],
+ PROT_READ | PROT_WRITE);
+
+ if (cpu == &cpus[0]) {
+ /*
+ * CPU0 uses a global for its double fault stack to deal with
+ * the chicken and egg problem. We need to punch it into its
+ * user HAT.
+ */
+ extern char dblfault_stack0[];
+
+ hati_cpu_punchin(cpu, (uintptr_t)cpu->cpu_m.mcpu_tss,
+ PROT_READ);
+
+ for (va = (uintptr_t)dblfault_stack0,
+ len = DEFAULTSTKSZ; len >= MMU_PAGESIZE;
+ va += MMU_PAGESIZE, len -= MMU_PAGESIZE) {
+ hati_cpu_punchin(cpu, va, PROT_READ | PROT_WRITE);
+ }
+ }
+
+ CTASSERT(((uintptr_t)&kdi_isr_start % MMU_PAGESIZE) == 0);
+ CTASSERT(((uintptr_t)&kdi_isr_end % MMU_PAGESIZE) == 0);
+ for (va = (uintptr_t)&kdi_isr_start;
+ va < (uintptr_t)&kdi_isr_end; va += MMU_PAGESIZE) {
+ hati_cpu_punchin(cpu, va, PROT_READ | PROT_EXEC);
+ }
+#endif /* !__xpv */
}
/*ARGSUSED*/
static void
-hat_vlp_teardown(cpu_t *cpu)
+hat_pcp_teardown(cpu_t *cpu)
{
-#if defined(__amd64) && !defined(__xpv)
+#if !defined(__xpv)
struct hat_cpu_info *hci;
if ((hci = cpu->cpu_hat_info) == NULL)
return;
- if (hci->hci_vlp_l2ptes)
- kmem_free(hci->hci_vlp_l2ptes, MMU_PAGESIZE);
- if (hci->hci_vlp_l3ptes)
- kmem_free(hci->hci_vlp_l3ptes, MMU_PAGESIZE);
+ if (hci->hci_pcp_l2ptes != NULL)
+ kmem_free(hci->hci_pcp_l2ptes, MMU_PAGESIZE);
+ if (hci->hci_pcp_l3ptes != NULL)
+ kmem_free(hci->hci_pcp_l3ptes, MMU_PAGESIZE);
+ if (hci->hci_user_hat != NULL) {
+ hat_free_start(hci->hci_user_hat);
+ hat_free_end(hci->hci_user_hat);
+ }
#endif
}
@@ -830,6 +1302,8 @@ hat_vlp_teardown(cpu_t *cpu)
++r; \
}
+extern uint64_t kpti_safe_cr3;
+
/*
* Finish filling in the kernel hat.
* Pre fill in all top level kernel page table entries for the kernel's
@@ -915,13 +1389,16 @@ hat_init_finish(void)
/*
* 32 bit PAE metal kernels use only 4 of the 512 entries in the
* page holding the top level pagetable. We use the remainder for
- * the "per CPU" page tables for VLP processes.
+ * the "per CPU" page tables for PCP processes.
* Map the top level kernel pagetable into the kernel to make
* it easy to use bcopy access these tables.
+ *
+	 * The 64-bit kernel always has PAE and uses this page as well to
+	 * maintain its per-CPU pagetables. See the big theory statement.
*/
if (mmu.pae_hat) {
- vlp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
- hat_devload(kas.a_hat, (caddr_t)vlp_page, MMU_PAGESIZE,
+ pcp_page = vmem_alloc(heap_arena, MMU_PAGESIZE, VM_SLEEP);
+ hat_devload(kas.a_hat, (caddr_t)pcp_page, MMU_PAGESIZE,
kas.a_hat->hat_htable->ht_pfn,
#if !defined(__xpv)
PROT_WRITE |
@@ -929,7 +1406,7 @@ hat_init_finish(void)
PROT_READ | HAT_NOSYNC | HAT_UNORDERED_OK,
HAT_LOAD | HAT_LOAD_NOCONSIST);
}
- hat_vlp_setup(CPU);
+ hat_pcp_setup(CPU);
/*
* Create kmap (cached mappings of kernel PTEs)
@@ -942,6 +1419,11 @@ hat_init_finish(void)
size = segmapsize;
#endif
hat_kmap_init((uintptr_t)segmap_start, size);
+
+#if defined(__amd64) && !defined(__xpv)
+ ASSERT3U(kas.a_hat->hat_htable->ht_pfn, !=, PFN_INVALID);
+ ASSERT3U(kpti_safe_cr3, ==, MAKECR3(kas.a_hat->hat_htable->ht_pfn));
+#endif
}
/*
@@ -959,12 +1441,12 @@ reload_pae32(hat_t *hat, cpu_t *cpu)
/*
* Load the 4 entries of the level 2 page table into this
- * cpu's range of the vlp_page and point cr3 at them.
+ * cpu's range of the pcp_page and point cr3 at them.
*/
ASSERT(mmu.pae_hat);
- src = hat->hat_vlp_ptes;
- dest = vlp_page + (cpu->cpu_id + 1) * VLP_NUM_PTES;
- for (i = 0; i < VLP_NUM_PTES; ++i) {
+ src = hat->hat_copied_ptes;
+ dest = pcp_page + (cpu->cpu_id + 1) * MAX_COPIED_PTES;
+ for (i = 0; i < MAX_COPIED_PTES; ++i) {
for (;;) {
pte = dest[i];
if (pte == src[i])
@@ -977,6 +1459,89 @@ reload_pae32(hat_t *hat, cpu_t *cpu)
#endif
/*
+ * Update the PCP data on the CPU cpu to the one on the hat. If this is a 32-bit
+ * process, then we must update the L2 pages and then the L3. If this is a
+ * 64-bit process then we must update the L3 entries.
+ */
+static void
+hat_pcp_update(cpu_t *cpu, const hat_t *hat)
+{
+ ASSERT3U(hat->hat_flags & HAT_COPIED, !=, 0);
+
+ if ((hat->hat_flags & HAT_COPIED_32) != 0) {
+ const x86pte_t *l2src;
+ x86pte_t *l2dst, *l3ptes, *l3uptes;
+ /*
+ * This is a 32-bit process. To set this up, we need to do the
+ * following:
+ *
+ * - Copy the 4 L2 PTEs into the dedicated L2 table
+ * - Zero the user L3 PTEs in the user and kernel page table
+ * - Set the first L3 PTE to point to the CPU L2 table
+ */
+ l2src = hat->hat_copied_ptes;
+ l2dst = cpu->cpu_hat_info->hci_pcp_l2ptes;
+ l3ptes = cpu->cpu_hat_info->hci_pcp_l3ptes;
+ l3uptes = cpu->cpu_hat_info->hci_user_l3ptes;
+
+ l2dst[0] = l2src[0];
+ l2dst[1] = l2src[1];
+ l2dst[2] = l2src[2];
+ l2dst[3] = l2src[3];
+
+ /*
+ * Make sure to use the mmu to get the number of slots. The
+ * number of PLP entries that this has will always be less as
+ * it's a 32-bit process.
+ */
+ bzero(l3ptes, sizeof (x86pte_t) * mmu.top_level_uslots);
+ l3ptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2);
+ bzero(l3uptes, sizeof (x86pte_t) * mmu.top_level_uslots);
+ l3uptes[0] = MAKEPTP(cpu->cpu_hat_info->hci_pcp_l2pfn, 2);
+ } else {
+ /*
+ * This is a 64-bit process. To set this up, we need to do the
+ * following:
+ *
+ * - Zero the 4 L2 PTEs in the CPU structure for safety
+ * - Copy over the new user L3 PTEs into the kernel page table
+ * - Copy over the new user L3 PTEs into the user page table
+ */
+ ASSERT3S(kpti_enable, ==, 1);
+ bzero(cpu->cpu_hat_info->hci_pcp_l2ptes, sizeof (x86pte_t) * 4);
+ bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_pcp_l3ptes,
+ sizeof (x86pte_t) * mmu.top_level_uslots);
+ bcopy(hat->hat_copied_ptes, cpu->cpu_hat_info->hci_user_l3ptes,
+ sizeof (x86pte_t) * mmu.top_level_uslots);
+ }
+}
+
+static void
+reset_kpti(struct kpti_frame *fr, uint64_t kcr3)
+{
+ ASSERT3U(fr->kf_tr_flag, ==, 0);
+#if DEBUG
+ if (fr->kf_kernel_cr3 != 0) {
+ ASSERT3U(fr->kf_lower_redzone, ==, 0xdeadbeefdeadbeef);
+ ASSERT3U(fr->kf_middle_redzone, ==, 0xdeadbeefdeadbeef);
+ ASSERT3U(fr->kf_upper_redzone, ==, 0xdeadbeefdeadbeef);
+ }
+#endif
+
+ bzero(fr, offsetof(struct kpti_frame, kf_kernel_cr3));
+ bzero(&fr->kf_unused, sizeof (struct kpti_frame) -
+ offsetof(struct kpti_frame, kf_unused));
+
+ fr->kf_kernel_cr3 = kcr3;
+ fr->kf_user_cr3 = 0;
+ fr->kf_tr_ret_rsp = (uintptr_t)&fr->kf_tr_rsp;
+
+ fr->kf_lower_redzone = 0xdeadbeefdeadbeef;
+ fr->kf_middle_redzone = 0xdeadbeefdeadbeef;
+ fr->kf_upper_redzone = 0xdeadbeefdeadbeef;
+}
+
+/*
* Switch to a new active hat, maintaining bit masks to track active CPUs.
*
* On the 32-bit PAE hypervisor, %cr3 is a 64-bit value, on metal it
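The hat_pcp_update() function added in the hunk above is the core of the per-CPU pagetable (PCP) copying. As a reading aid only, here is a rough standalone C rendering of its two cases; the struct, MAKEPTP(), and NSLOTS below are simplified stand-ins, not the kernel's definitions:

#include <string.h>
#include <stdint.h>

typedef uint64_t x86pte_t;
#define	NSLOTS		256	/* stand-in for mmu.top_level_uslots */
#define	MAKEPTP(pfn, l)	(((uint64_t)(pfn) << 12) | 0x23)	/* simplified */

struct mock_hci {
	x86pte_t l2[4];		/* hci_pcp_l2ptes (first 4 entries) */
	x86pte_t l3[512];	/* hci_pcp_l3ptes */
	x86pte_t ul3[512];	/* hci_user_l3ptes */
	uint64_t l2pfn;		/* hci_pcp_l2pfn */
};

/*
 * HAT_COPIED_32: copy the process's 4 L2 entries into the per-CPU L2 table,
 * then make slot 0 of both L3 tables point at that L2 table.
 */
static void
update_ilp32(struct mock_hci *h, const x86pte_t *src)
{
	memcpy(h->l2, src, 4 * sizeof (x86pte_t));
	memset(h->l3, 0, NSLOTS * sizeof (x86pte_t));
	memset(h->ul3, 0, NSLOTS * sizeof (x86pte_t));
	h->l3[0] = h->ul3[0] = MAKEPTP(h->l2pfn, 2);
}

/*
 * 64-bit HAT_COPIED: zero the scratch L2 entries and copy the process's user
 * L3 entries into both the kernel-visible and user-visible top-level tables.
 */
static void
update_lp64(struct mock_hci *h, const x86pte_t *src)
{
	memset(h->l2, 0, 4 * sizeof (x86pte_t));
	memcpy(h->l3, src, NSLOTS * sizeof (x86pte_t));
	memcpy(h->ul3, src, NSLOTS * sizeof (x86pte_t));
}

int
main(void)
{
	static struct mock_hci hci;
	static x86pte_t ptes[NSLOTS];

	hci.l2pfn = 0x1234;
	update_ilp32(&hci, ptes);
	update_lp64(&hci, ptes);
	return (0);
}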
@@ -1010,17 +1575,9 @@ hat_switch(hat_t *hat)
/*
* now go ahead and load cr3
*/
- if (hat->hat_flags & HAT_VLP) {
-#if defined(__amd64)
- x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes;
-
- VLP_COPY(hat->hat_vlp_ptes, vlpptep);
- newcr3 = MAKECR3(cpu->cpu_hat_info->hci_vlp_pfn);
-#elif defined(__i386)
- reload_pae32(hat, cpu);
- newcr3 = MAKECR3(kas.a_hat->hat_htable->ht_pfn) +
- (cpu->cpu_id + 1) * VLP_SIZE;
-#endif
+ if (hat->hat_flags & HAT_COPIED) {
+ hat_pcp_update(cpu, hat);
+ newcr3 = MAKECR3(cpu->cpu_hat_info->hci_pcp_l3pfn);
} else {
newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn);
}
@@ -1032,7 +1589,7 @@ hat_switch(hat_t *hat)
t[0].cmd = MMUEXT_NEW_BASEPTR;
t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
-#if defined(__amd64)
+
/*
* There's an interesting problem here, as to what to
* actually specify when switching to the kernel hat.
@@ -1044,7 +1601,7 @@ hat_switch(hat_t *hat)
else
t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable);
++opcnt;
-#endif /* __amd64 */
+
if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0)
panic("HYPERVISOR_mmu_update() failed");
ASSERT(retcnt == opcnt);
@@ -1052,6 +1609,16 @@ hat_switch(hat_t *hat)
}
#else
setcr3(newcr3);
+ reset_kpti(&cpu->cpu_m.mcpu_kpti, newcr3);
+ reset_kpti(&cpu->cpu_m.mcpu_kpti_flt, newcr3);
+ reset_kpti(&cpu->cpu_m.mcpu_kpti_dbg, newcr3);
+
+ if (kpti_enable == 1) {
+ newcr3 = MAKECR3(cpu->cpu_hat_info->hci_user_l3pfn);
+ cpu->cpu_m.mcpu_kpti.kf_user_cr3 = newcr3;
+ cpu->cpu_m.mcpu_kpti_dbg.kf_user_cr3 = newcr3;
+ cpu->cpu_m.mcpu_kpti_flt.kf_user_cr3 = newcr3;
+ }
#endif
ASSERT(cpu == CPU);
}
@@ -1364,10 +1931,9 @@ hati_pte_map(
ASSERT(flags & HAT_LOAD_NOCONSIST);
}
#if defined(__amd64)
- if (ht->ht_flags & HTABLE_VLP) {
+ if (ht->ht_flags & HTABLE_COPIED) {
cpu_t *cpu = CPU;
- x86pte_t *vlpptep = cpu->cpu_hat_info->hci_vlp_l2ptes;
- VLP_COPY(hat->hat_vlp_ptes, vlpptep);
+ hat_pcp_update(cpu, hat);
}
#endif
HTABLE_INC(ht->ht_valid_cnt);
@@ -1439,7 +2005,8 @@ hati_load_common(
++curthread->t_hatdepth;
ASSERT(curthread->t_hatdepth < 16);
- ASSERT(hat == kas.a_hat || AS_LOCK_HELD(hat->hat_as));
+ ASSERT(hat == kas.a_hat || (hat->hat_flags & HAT_PCP) != 0 ||
+ AS_LOCK_HELD(hat->hat_as));
if (flags & HAT_LOAD_SHARE)
hat->hat_flags |= HAT_SHARED;
@@ -1459,15 +2026,23 @@ hati_load_common(
ht = htable_create(hat, va, level, NULL);
ASSERT(ht != NULL);
}
+ /*
+ * htable_va2entry checks this condition as well, but it won't include
+ * much useful info in the panic. So we do it in advance here to include
+ * all the context.
+ */
+ if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht)) {
+ panic("hati_load_common: bad htable: va=%p, last page=%p, "
+ "ht->ht_vaddr=%p, ht->ht_level=%d", (void *)va,
+ (void *)HTABLE_LAST_PAGE(ht), (void *)ht->ht_vaddr,
+ (int)ht->ht_level);
+ }
entry = htable_va2entry(va, ht);
/*
* a bunch of paranoid error checking
*/
ASSERT(ht->ht_busy > 0);
- if (ht->ht_vaddr > va || va > HTABLE_LAST_PAGE(ht))
- panic("hati_load_common: bad htable %p, va %p",
- (void *)ht, (void *)va);
ASSERT(ht->ht_level == level);
/*
@@ -1958,14 +2533,12 @@ hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
/*
* Otherwise we reload cr3 to effect a complete TLB flush.
*
- * A reload of cr3 on a VLP process also means we must also recopy in
- * the pte values from the struct hat
+ * A reload of cr3 when using PCP also means we must also recopy in the
+ * pte values from the struct hat
*/
- if (hat->hat_flags & HAT_VLP) {
+ if (hat->hat_flags & HAT_COPIED) {
#if defined(__amd64)
- x86pte_t *vlpptep = CPU->cpu_hat_info->hci_vlp_l2ptes;
-
- VLP_COPY(hat->hat_vlp_ptes, vlpptep);
+ hat_pcp_update(CPU, hat);
#elif defined(__i386)
reload_pae32(hat, CPU);
#endif
@@ -4075,7 +4648,7 @@ hat_cpu_online(struct cpu *cpup)
{
if (cpup != CPU) {
x86pte_cpu_init(cpup);
- hat_vlp_setup(cpup);
+ hat_pcp_setup(cpup);
}
CPUSET_ATOMIC_ADD(khat_cpuset, cpup->cpu_id);
}
@@ -4090,7 +4663,7 @@ hat_cpu_offline(struct cpu *cpup)
ASSERT(cpup != CPU);
CPUSET_ATOMIC_DEL(khat_cpuset, cpup->cpu_id);
- hat_vlp_teardown(cpup);
+ hat_pcp_teardown(cpup);
x86pte_cpu_fini(cpup);
}
@@ -4406,7 +4979,7 @@ hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp)
#ifndef __xpv
void
hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs,
- offset_t kpm_pages_off)
+ offset_t kpm_pages_off)
{
_NOTE(ARGUNUSED(nkpmpgs, kpm_pages_off));
pfn_t base, end;
@@ -4465,7 +5038,7 @@ hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp)
void
hat_kpm_split_mseg_update(struct memseg *msp, struct memseg **mspp,
- struct memseg *lo, struct memseg *mid, struct memseg *hi)
+ struct memseg *lo, struct memseg *mid, struct memseg *hi)
{
_NOTE(ARGUNUSED(msp, mspp, lo, mid, hi));
ASSERT(0);
@@ -4537,3 +5110,32 @@ hat_release_mapping(hat_t *hat, caddr_t addr)
XPV_ALLOW_MIGRATE();
}
#endif /* __xpv */
+
+/*
+ * Helper function to punch in a mapping that we need with the specified
+ * attributes.
+ */
+void
+hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs)
+{
+ int ret;
+ pfn_t pfn;
+ hat_t *cpu_hat = cpu->cpu_hat_info->hci_user_hat;
+
+ ASSERT3S(kpti_enable, ==, 1);
+ ASSERT3P(cpu_hat, !=, NULL);
+ ASSERT3U(cpu_hat->hat_flags & HAT_PCP, ==, HAT_PCP);
+ ASSERT3U(va & (MMU_PAGESIZE - 1), ==, 0);
+
+ pfn = hat_getpfnum(kas.a_hat, (caddr_t)va);
+ VERIFY3U(pfn, !=, PFN_INVALID);
+
+ /*
+ * We purposefully don't try to find the page_t. This means that this
+ * will be marked PT_NOCONSIST; however, given that this is pretty much
+ * a static mapping that we're using we should be relatively OK.
+ */
+ attrs |= HAT_STORECACHING_OK;
+ ret = hati_load_common(cpu_hat, va, NULL, attrs, 0, 0, pfn);
+ VERIFY3S(ret, ==, 0);
+}
diff --git a/usr/src/uts/i86pc/vm/hat_i86.h b/usr/src/uts/i86pc/vm/hat_i86.h
index fdbb9346bc..2bcac4ec61 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.h
+++ b/usr/src/uts/i86pc/vm/hat_i86.h
@@ -24,6 +24,7 @@
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _VM_HAT_I86_H
@@ -62,19 +63,32 @@ extern "C" {
*/
/*
- * VLP processes have a 32 bit address range, so their top level is 2 and
- * with only 4 PTEs in that table.
+ * Maximum number of per-CPU pagetable entries that we'll need to cache in the
+ * HAT. See the big theory statement in uts/i86pc/vm/hat_i86.c for more
+ * information.
*/
-#define VLP_LEVEL (2)
-#define VLP_NUM_PTES (4)
-#define VLP_SIZE (VLP_NUM_PTES * sizeof (x86pte_t))
-#define TOP_LEVEL(h) (((h)->hat_flags & HAT_VLP) ? VLP_LEVEL : mmu.max_level)
-#define VLP_COPY(fromptep, toptep) { \
- toptep[0] = fromptep[0]; \
- toptep[1] = fromptep[1]; \
- toptep[2] = fromptep[2]; \
- toptep[3] = fromptep[3]; \
-}
+#if defined(__xpv)
+/*
+ * The Xen hypervisor does not use per-CPU pagetables (PCP). Define a single
+ * struct member for it at least to make life easier and not make the member
+ * conditional.
+ */
+#define MAX_COPIED_PTES 1
+#else
+#if defined(__amd64)
+/*
+ * The 64-bit kernel may have up to 512 PTEs present in it for a given process.
+ */
+#define MAX_COPIED_PTES 512
+#elif defined(__i386)
+/*
+ * The 32-bit kernel always uses 4 PTEs for this.
+ */
+#define MAX_COPIED_PTES 4
+#endif /* __amd64 */
+#endif /* __xpv */
+
+#define TOP_LEVEL(h) (((h)->hat_max_level))
/*
* The hat struct exists for each address space.
@@ -87,13 +101,15 @@ struct hat {
pgcnt_t hat_ism_pgcnt;
cpuset_t hat_cpus;
uint16_t hat_flags;
+ uint8_t hat_max_level; /* top level of this HAT */
+ uint_t hat_num_copied; /* Actual num of hat_copied_ptes[] */
htable_t *hat_htable; /* top level htable */
struct hat *hat_next;
struct hat *hat_prev;
uint_t hat_num_hash; /* number of htable hash buckets */
htable_t **hat_ht_hash; /* htable hash buckets */
htable_t *hat_ht_cached; /* cached free htables */
- x86pte_t hat_vlp_ptes[VLP_NUM_PTES];
+ x86pte_t hat_copied_ptes[MAX_COPIED_PTES];
#if defined(__amd64) && defined(__xpv)
pfn_t hat_user_ptable; /* alt top ptable for user mode */
#endif
@@ -106,14 +122,16 @@ typedef struct hat hat_t;
atomic_dec_ulong(&(hat)->hat_pages_mapped[level]);
/*
- * Flags for the hat_flags field
+ * Flags for the hat_flags field. For more information, please see the big
+ * theory statement on the HAT design in uts/i86pc/vm/hat_i86.c.
*
* HAT_FREEING - set when HAT is being destroyed - mostly used to detect that
* demap()s can be avoided.
*
- * HAT_VLP - indicates a 32 bit process has a virtual address range less than
- * the hardware's physical address range. (VLP->Virtual Less-than Physical)
- * Note - never used on the hypervisor.
+ * HAT_COPIED - Indicates this HAT is a source for per-cpu page tables: see the
+ * big comment in hat_i86.c for a description.
+ *
+ * HAT_COPIED_32 - HAT_COPIED, but for an ILP32 process.
*
* HAT_VICTIM - This is set while a hat is being examined for page table
* stealing and prevents it from being freed.
@@ -121,12 +139,17 @@ typedef struct hat hat_t;
* HAT_SHARED - The hat has exported it's page tables via hat_share()
*
* HAT_PINNED - On the hypervisor, indicates the top page table has been pinned.
+ *
+ * HAT_PCP - Used for the per-cpu user page table (i.e. associated with a CPU,
+ * not a process).
*/
#define HAT_FREEING (0x0001)
-#define HAT_VLP (0x0002)
-#define HAT_VICTIM (0x0004)
-#define HAT_SHARED (0x0008)
-#define HAT_PINNED (0x0010)
+#define HAT_VICTIM (0x0002)
+#define HAT_SHARED (0x0004)
+#define HAT_PINNED (0x0008)
+#define HAT_COPIED (0x0010)
+#define HAT_COPIED_32 (0x0020)
+#define HAT_PCP (0x0040)
/*
* Additional platform attribute for hat_devload() to force no caching.
@@ -155,6 +178,9 @@ struct hatstats {
ulong_t hs_hm_steals;
ulong_t hs_hm_steal_exam;
ulong_t hs_tlb_inval_delayed;
+ ulong_t hs_hat_copied64;
+ ulong_t hs_hat_copied32;
+ ulong_t hs_hat_normal64;
};
extern struct hatstats hatstat;
#ifdef DEBUG
@@ -240,6 +266,11 @@ extern void hat_kmap_init(uintptr_t base, size_t len);
extern hment_t *hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry);
+#if defined(__amd64)
+extern void hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs);
+extern void mmu_calc_user_slots(void);
+#endif
+
#if !defined(__xpv)
/*
* routines to deal with delayed TLB invalidations for idle CPUs
diff --git a/usr/src/uts/i86pc/vm/hat_pte.h b/usr/src/uts/i86pc/vm/hat_pte.h
index 7b078b0435..121d96cf84 100644
--- a/usr/src/uts/i86pc/vm/hat_pte.h
+++ b/usr/src/uts/i86pc/vm/hat_pte.h
@@ -21,7 +21,7 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2017 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _VM_HAT_PTE_H
@@ -175,10 +175,18 @@ struct hat_mmu_info {
uint_t max_page_level; /* maximum level at which we can map a page */
uint_t umax_page_level; /* max user page map level */
uint_t ptes_per_table; /* # of entries in lower level page tables */
- uint_t top_level_count; /* # of entries in top most level page table */
+ uint_t top_level_count; /* # of entries in top-level page table */
+ uint_t top_level_uslots; /* # of user slots in top-level page table */
+ uint_t num_copied_ents; /* # of PCP-copied PTEs to create */
+#if defined(__amd64)
+ /* 32-bit versions of values */
+ uint_t top_level_uslots32;
+ uint_t max_level32;
+ uint_t num_copied_ents32;
+#endif
- uint_t hash_cnt; /* cnt of entries in htable_hash_cache */
- uint_t vlp_hash_cnt; /* cnt of entries in vlp htable_hash_cache */
+ uint_t hash_cnt; /* cnt of entries in htable_hash_cache */
+ uint_t hat32_hash_cnt; /* cnt of entries in 32-bit htable_hash_cache */
uint_t pae_hat; /* either 0 or 1 */
diff --git a/usr/src/uts/i86pc/vm/htable.c b/usr/src/uts/i86pc/vm/htable.c
index 44e66ddfc1..b294597eba 100644
--- a/usr/src/uts/i86pc/vm/htable.c
+++ b/usr/src/uts/i86pc/vm/htable.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014 by Delphix. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -621,11 +621,15 @@ htable_steal(uint_t cnt, boolean_t reap)
* stale PTEs either here or under hat_unload() when we
* steal and unload the same page table in competing
* threads.
+ *
+ * We skip HATs that belong to CPUs, to make our lives
+ * simpler.
*/
- while (hat != NULL &&
- (hat->hat_flags &
- (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0)
+ while (hat != NULL && (hat->hat_flags &
+ (HAT_VICTIM | HAT_SHARED | HAT_FREEING |
+ HAT_PCP)) != 0) {
hat = hat->hat_next;
+ }
if (hat == NULL)
break;
@@ -668,8 +672,8 @@ htable_steal(uint_t cnt, boolean_t reap)
continue;
ASSERT(ht->ht_hat == hat);
#if defined(__xpv) && defined(__amd64)
- if (!(ht->ht_flags & HTABLE_VLP) &&
- ht->ht_level == mmu.max_level) {
+ ASSERT(!(ht->ht_flags & HTABLE_COPIED));
+ if (ht->ht_level == mmu.max_level) {
ptable_free(hat->hat_user_ptable);
hat->hat_user_ptable = PFN_INVALID;
}
@@ -779,7 +783,7 @@ htable_alloc(
htable_t *shared)
{
htable_t *ht = NULL;
- uint_t is_vlp;
+ uint_t is_copied;
uint_t is_bare = 0;
uint_t need_to_zero = 1;
int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP);
@@ -787,8 +791,9 @@ htable_alloc(
if (level < 0 || level > TOP_LEVEL(hat))
panic("htable_alloc(): level %d out of range\n", level);
- is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL;
- if (is_vlp || shared != NULL)
+ is_copied = (hat->hat_flags & HAT_COPIED) &&
+ level == hat->hat_max_level;
+ if (is_copied || shared != NULL)
is_bare = 1;
/*
@@ -930,10 +935,10 @@ htable_alloc(
}
/*
- * setup flags, etc. for VLP htables
+ * setup flags, etc. for copied page tables.
*/
- if (is_vlp) {
- ht->ht_flags |= HTABLE_VLP;
+ if (is_copied) {
+ ht->ht_flags |= HTABLE_COPIED;
ASSERT(ht->ht_pfn == PFN_INVALID);
need_to_zero = 0;
}
@@ -984,7 +989,7 @@ htable_free(htable_t *ht)
!(ht->ht_flags & HTABLE_SHARED_PFN) &&
(use_boot_reserve ||
(!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) {
- ASSERT((ht->ht_flags & HTABLE_VLP) == 0);
+ ASSERT((ht->ht_flags & HTABLE_COPIED) == 0);
ASSERT(ht->ht_pfn != PFN_INVALID);
hat_enter(hat);
ht->ht_next = hat->hat_ht_cached;
@@ -999,7 +1004,7 @@ htable_free(htable_t *ht)
*/
if (ht->ht_flags & HTABLE_SHARED_PFN) {
ASSERT(ht->ht_pfn != PFN_INVALID);
- } else if (!(ht->ht_flags & HTABLE_VLP)) {
+ } else if (!(ht->ht_flags & HTABLE_COPIED)) {
ptable_free(ht->ht_pfn);
#if defined(__amd64) && defined(__xpv)
if (ht->ht_level == mmu.max_level && hat != NULL) {
@@ -1111,15 +1116,15 @@ unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
found, expect);
/*
- * When a top level VLP page table entry changes, we must issue
- * a reload of cr3 on all processors.
+ * When a top level PTE changes for a copied htable, we must trigger a
+ * hat_pcp_update() on all HAT CPUs.
*
- * If we don't need do do that, then we still have to INVLPG against
- * an address covered by the inner page table, as the latest processors
+ * If we don't need to do that, then we still have to INVLPG against an
+ * address covered by the inner page table, as the latest processors
* have TLB-like caches for non-leaf page table entries.
*/
if (!(hat->hat_flags & HAT_FREEING)) {
- hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
+ hat_tlb_inval(hat, (higher->ht_flags & HTABLE_COPIED) ?
DEMAP_ALL_ADDR : old->ht_vaddr);
}
@@ -1148,15 +1153,17 @@ link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
panic("HAT: ptp not 0, found=" FMT_PTE, found);
/*
- * When any top level VLP page table entry changes, we must issue
- * a reload of cr3 on all processors using it.
+ * When a top level PTE changes for a copied htable, we must trigger a
+ * hat_pcp_update() on all HAT CPUs.
+ *
* We also need to do this for the kernel hat on PAE 32 bit kernel.
*/
if (
#ifdef __i386
- (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
+ (higher->ht_hat == kas.a_hat &&
+ higher->ht_level == higher->ht_hat->hat_max_level) ||
#endif
- (higher->ht_flags & HTABLE_VLP))
+ (higher->ht_flags & HTABLE_COPIED))
hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
}
@@ -1295,7 +1302,8 @@ htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level)
* 32 bit address spaces on 64 bit kernels need to check
* for overflow of the 32 bit address space
*/
- if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32))
+ if ((hat->hat_flags & HAT_COPIED_32) &&
+ vaddr >= ((uint64_t)1 << 32))
return (NULL);
#endif
base = 0;
@@ -1943,10 +1951,12 @@ static x86pte_t *
x86pte_access_pagetable(htable_t *ht, uint_t index)
{
/*
- * VLP pagetables are contained in the hat_t
+ * HTABLE_COPIED pagetables are contained in the hat_t
*/
- if (ht->ht_flags & HTABLE_VLP)
- return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
+ if (ht->ht_flags & HTABLE_COPIED) {
+ ASSERT3U(index, <, ht->ht_hat->hat_num_copied);
+ return (PT_INDEX_PTR(ht->ht_hat->hat_copied_ptes, index));
+ }
return (x86pte_mapin(ht->ht_pfn, index, ht));
}
@@ -2026,10 +2036,7 @@ x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
static void
x86pte_release_pagetable(htable_t *ht)
{
- /*
- * nothing to do for VLP htables
- */
- if (ht->ht_flags & HTABLE_VLP)
+ if (ht->ht_flags & HTABLE_COPIED)
return;
x86pte_mapout();
@@ -2189,7 +2196,7 @@ x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
maddr_t ma;
if (!IN_XPV_PANIC()) {
- ASSERT(!(ht->ht_flags & HTABLE_VLP)); /* no VLP yet */
+ ASSERT(!(ht->ht_flags & HTABLE_COPIED));
ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
t[0].val = new;
@@ -2346,7 +2353,7 @@ x86pte_update(
/*
* Copy page tables - this is just a little more complicated than the
* previous routines. Note that it's also not atomic! It also is never
- * used for VLP pagetables.
+ * used for HTABLE_COPIED pagetables.
*/
void
x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
@@ -2358,8 +2365,8 @@ x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
x86pte_t pte;
ASSERT(khat_running);
- ASSERT(!(dest->ht_flags & HTABLE_VLP));
- ASSERT(!(src->ht_flags & HTABLE_VLP));
+ ASSERT(!(dest->ht_flags & HTABLE_COPIED));
+ ASSERT(!(src->ht_flags & HTABLE_COPIED));
ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
@@ -2450,7 +2457,7 @@ x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
* Map in the page table to be zeroed.
*/
ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
- ASSERT(!(dest->ht_flags & HTABLE_VLP));
+ ASSERT(!(dest->ht_flags & HTABLE_COPIED));
/*
* On the hypervisor we don't use x86pte_access_pagetable() since
@@ -2504,7 +2511,7 @@ hat_dump(void)
for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
for (h = 0; h < hat->hat_num_hash; ++h) {
for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
- if ((ht->ht_flags & HTABLE_VLP) == 0)
+ if ((ht->ht_flags & HTABLE_COPIED) == 0)
dump_page(ht->ht_pfn);
}
}
diff --git a/usr/src/uts/i86pc/vm/htable.h b/usr/src/uts/i86pc/vm/htable.h
index 6377beef94..d9b91189c9 100644
--- a/usr/src/uts/i86pc/vm/htable.h
+++ b/usr/src/uts/i86pc/vm/htable.h
@@ -24,6 +24,7 @@
*/
/*
* Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _VM_HTABLE_H
@@ -85,12 +86,13 @@ typedef struct htable htable_t;
/*
* Flags values for htable ht_flags field:
*
- * HTABLE_VLP - this is the top level htable of a VLP HAT.
+ * HTABLE_COPIED - This is the top level htable of a HAT being used with per-CPU
+ * pagetables.
*
* HTABLE_SHARED_PFN - this htable had its PFN assigned from sharing another
* htable. Used by hat_share() for ISM.
*/
-#define HTABLE_VLP (0x01)
+#define HTABLE_COPIED (0x01)
#define HTABLE_SHARED_PFN (0x02)
/*
@@ -106,14 +108,19 @@ typedef struct htable htable_t;
((uintptr_t)(hat) >> 4)) & ((hat)->hat_num_hash - 1))
/*
- * Each CPU gets a unique hat_cpu_info structure in cpu_hat_info.
+ * Each CPU gets a unique hat_cpu_info structure in cpu_hat_info. For more
+ * information on its use and members, see uts/i86pc/vm/hat_i86.c.
*/
struct hat_cpu_info {
kmutex_t hci_mutex; /* mutex to ensure sequential usage */
#if defined(__amd64)
- pfn_t hci_vlp_pfn; /* pfn of hci_vlp_l3ptes */
- x86pte_t *hci_vlp_l3ptes; /* VLP Level==3 pagetable (top) */
- x86pte_t *hci_vlp_l2ptes; /* VLP Level==2 pagetable */
+ pfn_t hci_pcp_l3pfn; /* pfn of hci_pcp_l3ptes */
+ pfn_t hci_pcp_l2pfn; /* pfn of hci_pcp_l2ptes */
+ x86pte_t *hci_pcp_l3ptes; /* PCP Level==3 pagetable (top) */
+ x86pte_t *hci_pcp_l2ptes; /* PCP Level==2 pagetable */
+ struct hat *hci_user_hat; /* CPU specific HAT */
+ pfn_t hci_user_l3pfn; /* pfn of hci_user_l3ptes */
+ x86pte_t *hci_user_l3ptes; /* PCP User L3 pagetable */
#endif /* __amd64 */
};
@@ -127,7 +134,8 @@ struct hat_cpu_info {
* XX64 - The check for the VA hole needs to be better generalized.
*/
#if defined(__amd64)
-#define HTABLE_NUM_PTES(ht) (((ht)->ht_flags & HTABLE_VLP) ? 4 : 512)
+#define HTABLE_NUM_PTES(ht) (((ht)->ht_flags & HTABLE_COPIED) ? \
+ (((ht)->ht_level == mmu.max_level) ? 512 : 4) : 512)
#define HTABLE_LAST_PAGE(ht) \
((ht)->ht_level == mmu.max_level ? ((uintptr_t)0UL - MMU_PAGESIZE) :\
diff --git a/usr/src/uts/i86xpv/Makefile.files b/usr/src/uts/i86xpv/Makefile.files
index 8fdda3652d..a576b2f0a8 100644
--- a/usr/src/uts/i86xpv/Makefile.files
+++ b/usr/src/uts/i86xpv/Makefile.files
@@ -22,6 +22,8 @@
#
# Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
#
+# Copyright 2018 Joyent, Inc.
+#
# This Makefile defines file modules in the directory uts/i86xpv
# and its children. These are the source files which are i86xpv
@@ -65,7 +67,6 @@ CORE_OBJS += \
instr_size.o \
intr.o \
kboot_mmu.o \
- kdi_subr.o \
kdi_idt.o \
kdi_idthdl.o \
kdi_asm.o \
@@ -160,7 +161,8 @@ SPECIAL_OBJS_64 += \
locore.o \
fast_trap_asm.o \
interrupt.o \
- syscall_asm_amd64.o
+ syscall_asm_amd64.o \
+ kpti_trampolines.o
SPECIAL_OBJS += $(SPECIAL_OBJS_$(CLASS))
@@ -252,5 +254,3 @@ ASSYM_DEPS += \
syscall_asm_amd64.o
$(KDI_ASSYM_DEPS:%=$(OBJS_DIR)/%): $(DSF_DIR)/$(OBJS_DIR)/kdi_assym.h
-
-ASSYM_DEPS += kdi_asm.o
diff --git a/usr/src/uts/intel/Makefile.rules b/usr/src/uts/intel/Makefile.rules
index 0054b66bef..bb63d03166 100644
--- a/usr/src/uts/intel/Makefile.rules
+++ b/usr/src/uts/intel/Makefile.rules
@@ -272,9 +272,6 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/intel/kdi/%.c
$(OBJS_DIR)/%.o: $(UTSBASE)/intel/kdi/%.s
$(COMPILE.s) -o $@ $<
-$(OBJS_DIR)/%.o: $(UTSBASE)/intel/kdi/$(SUBARCH_DIR)/%.s
- $(COMPILE.s) -o $@ $<
-
$(OBJS_DIR)/%.o: $(UTSBASE)/intel/zfs/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -489,9 +486,6 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/intel/kdi/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/intel/kdi/%.s
@($(LHEAD) $(LINT.s) $< $(LTAIL))
-$(LINTS_DIR)/%.ln: $(UTSBASE)/intel/kdi/$(SUBARCH_DIR)/%.s
- @($(LHEAD) $(LINT.s) $< $(LTAIL))
-
$(LINTS_DIR)/%.ln: $(UTSBASE)/intel/nskern/%.s
@($(LHEAD) $(LINT.s) $< $(LTAIL))
diff --git a/usr/src/uts/intel/amd64/sys/kdi_regs.h b/usr/src/uts/intel/amd64/sys/kdi_regs.h
index 945e0f8c95..d7c4e87807 100644
--- a/usr/src/uts/intel/amd64/sys/kdi_regs.h
+++ b/usr/src/uts/intel/amd64/sys/kdi_regs.h
@@ -33,8 +33,6 @@
extern "C" {
#endif
-#define KDIREG_NGREG 31
-
/*
* A modified version of struct regs layout.
*/
@@ -59,17 +57,20 @@ extern "C" {
#define KDIREG_FSBASE 17
#define KDIREG_GSBASE 18
#define KDIREG_KGSBASE 19
-#define KDIREG_DS 20
-#define KDIREG_ES 21
-#define KDIREG_FS 22
-#define KDIREG_GS 23
-#define KDIREG_TRAPNO 24
-#define KDIREG_ERR 25
-#define KDIREG_RIP 26
-#define KDIREG_CS 27
-#define KDIREG_RFLAGS 28
-#define KDIREG_RSP 29
-#define KDIREG_SS 30
+#define KDIREG_CR2 20
+#define KDIREG_DS 21
+#define KDIREG_ES 22
+#define KDIREG_FS 23
+#define KDIREG_GS 24
+#define KDIREG_TRAPNO 25
+#define KDIREG_ERR 26
+#define KDIREG_RIP 27
+#define KDIREG_CS 28
+#define KDIREG_RFLAGS 29
+#define KDIREG_RSP 30
+#define KDIREG_SS 31
+
+#define KDIREG_NGREG (KDIREG_SS + 1)
#define KDIREG_PC KDIREG_RIP
#define KDIREG_SP KDIREG_RSP
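For context on the renumbering above: inserting KDIREG_CR2 at slot 20 pushes DS through SS up by one, and deriving KDIREG_NGREG from KDIREG_SS means the count is now 32 rather than the previously hard-coded 31. A minimal standalone check of that arithmetic (not part of the patch):

#include <assert.h>

#define	KDIREG_SS	31
#define	KDIREG_NGREG	(KDIREG_SS + 1)

static_assert(KDIREG_NGREG == 32, "one extra KDI register slot, for %cr2");

int
main(void)
{
	return (0);
}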
diff --git a/usr/src/uts/intel/ia32/ml/exception.s b/usr/src/uts/intel/ia32/ml/exception.s
index 8b538910e2..82d449f31c 100644
--- a/usr/src/uts/intel/ia32/ml/exception.s
+++ b/usr/src/uts/intel/ia32/ml/exception.s
@@ -1,7 +1,7 @@
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
- * Copyright (c) 2017 Joyent, Inc.
+ * Copyright (c) 2018 Joyent, Inc.
*/
/*
@@ -81,7 +81,7 @@ ndptrap_frstor(void)
#define NPTRAP_NOERR(trapno) \
pushq $0; \
- pushq $trapno
+ pushq $trapno
#define TRAP_NOERR(trapno) \
XPV_TRAP_POP; \
@@ -93,13 +93,13 @@ ndptrap_frstor(void)
*/
#define TRAP_ERR(trapno) \
XPV_TRAP_POP; \
- pushq $trapno
+ pushq $trapno
#else /* __xpv && __amd64 */
#define TRAP_NOERR(trapno) \
push $0; \
- push $trapno
+ push $trapno
#define NPTRAP_NOERR(trapno) TRAP_NOERR(trapno)
@@ -108,10 +108,24 @@ ndptrap_frstor(void)
* onto stack.
*/
#define TRAP_ERR(trapno) \
- push $trapno
+ push $trapno
#endif /* __xpv && __amd64 */
+ /*
+ * These are the stacks used on cpu0 for taking double faults,
+ * NMIs and MCEs (the latter two only on amd64 where we have IST).
+ *
+ * We define them here instead of in a C file so that we can page-align
+ * them (gcc won't do that in a .c file).
+ */
+ .data
+ DGDEF3(dblfault_stack0, DEFAULTSTKSZ, MMU_PAGESIZE)
+ .fill DEFAULTSTKSZ, 1, 0
+ DGDEF3(nmi_stack0, DEFAULTSTKSZ, MMU_PAGESIZE)
+ .fill DEFAULTSTKSZ, 1, 0
+ DGDEF3(mce_stack0, DEFAULTSTKSZ, MMU_PAGESIZE)
+ .fill DEFAULTSTKSZ, 1, 0
/*
* #DE
@@ -163,6 +177,12 @@ ndptrap_frstor(void)
je 1f
leaq brand_sys_sysenter(%rip), %r11
cmpq %r11, 24(%rsp) /* Compare to saved r_rip on the stack */
+ je 1f
+ leaq tr_sys_sysenter(%rip), %r11
+ cmpq %r11, 24(%rsp)
+ je 1f
+ leaq tr_brand_sys_sysenter(%rip), %r11
+ cmpq %r11, 24(%rsp)
jne 2f
1: SWAPGS
2: popq %r11
@@ -214,6 +234,10 @@ ndptrap_frstor(void)
* the cpu structs for all processors till we find a match for the gdt
* of the trapping processor. The stack is expected to be pointing at
* the standard regs pushed by hardware on a trap (plus error code and trapno).
+ *
+ * It's ok for us to clobber gsbase here (and possibly end up with both gsbase
+ * and kgsbase set to the same value) because we're not going back the normal
+ * way out of here (via IRET). Where we're going, we don't need no user %gs.
*/
#define SET_CPU_GSBASE \
subq $REGOFF_TRAPNO, %rsp; /* save regs */ \
@@ -294,7 +318,7 @@ ndptrap_frstor(void)
call av_dispatch_nmivect
INTR_POP
- IRET
+ jmp tr_iret_auto
/*NOTREACHED*/
SET_SIZE(nmiint)
@@ -319,8 +343,8 @@ ndptrap_frstor(void)
movl %esp, %ebp
- pushl %ebp
- call av_dispatch_nmivect
+ pushl %ebp
+ call av_dispatch_nmivect
addl $4, %esp
INTR_POP_USER
@@ -433,7 +457,7 @@ ud_push:
movq 32(%rsp), %rax /* reload calling RSP */
movq %rbp, (%rax) /* store %rbp there */
popq %rax /* pop off temp */
- IRET /* return from interrupt */
+ jmp tr_iret_kernel /* return from interrupt */
/*NOTREACHED*/
ud_leave:
@@ -454,7 +478,7 @@ ud_leave:
movq %rbp, 32(%rsp) /* store new %rsp */
movq %rax, %rbp /* set new %rbp */
popq %rax /* pop off temp */
- IRET /* return from interrupt */
+ jmp tr_iret_kernel /* return from interrupt */
/*NOTREACHED*/
ud_nop:
@@ -464,7 +488,7 @@ ud_nop:
*/
INTR_POP
incq (%rsp)
- IRET
+ jmp tr_iret_kernel
/*NOTREACHED*/
ud_ret:
@@ -475,7 +499,7 @@ ud_ret:
movq %rax, 8(%rsp) /* store calling RIP */
addq $8, 32(%rsp) /* adjust new %rsp */
popq %rax /* pop off temp */
- IRET /* return from interrupt */
+ jmp tr_iret_kernel /* return from interrupt */
/*NOTREACHED*/
ud_trap:
@@ -633,7 +657,7 @@ _emul_done:
*/
TRAP_NOERR(T_NOEXTFLT) /* $7 */
INTR_PUSH
-
+
/*
* We want to do this quickly as every lwp using fp will take this
* after a context switch -- we do the frequent path in ndptrap_frstor
@@ -709,7 +733,7 @@ _patch_xrstorq_rbx:
SWAPGS /* if from user, need swapgs */
LOADCPU(%rax)
SWAPGS
-2:
+2:
/*
* Xrstor needs to use edx as part of its flag.
* NOTE: have to push rdx after "cmpw ...24(%rsp)", otherwise rsp+$24
@@ -749,7 +773,7 @@ _patch_xrstorq_rbx:
popq %rdx
popq %rbx
popq %rax
- IRET
+ jmp tr_iret_auto
/*NOTREACHED*/
.handle_in_trap:
@@ -867,7 +891,7 @@ _patch_xrstor_ebx:
1: addq $DESCTBR_SIZE, %rsp
popq %rax
-
+
DFTRAP_PUSH
/*
@@ -1127,7 +1151,7 @@ check_for_user_address:
#endif /* !__amd64 */
ENTRY_NP(resvtrap)
- TRAP_NOERR(15) /* (reserved) */
+ TRAP_NOERR(T_RESVTRAP) /* (reserved) */
jmp cmntrap
SET_SIZE(resvtrap)
@@ -1207,15 +1231,10 @@ check_for_user_address:
SET_SIZE(xmtrap)
ENTRY_NP(invaltrap)
- TRAP_NOERR(30) /* very invalid */
+ TRAP_NOERR(T_INVALTRAP) /* very invalid */
jmp cmntrap
SET_SIZE(invaltrap)
- ENTRY_NP(invalint)
- TRAP_NOERR(31) /* even more so */
- jmp cmnint
- SET_SIZE(invalint)
-
.globl fasttable
#if defined(__amd64)
@@ -1286,7 +1305,7 @@ check_for_user_address:
ENTRY_NP(fast_null)
XPV_TRAP_POP
orq $PS_C, 24(%rsp) /* set carry bit in user flags */
- IRET
+ jmp tr_iret_auto
/*NOTREACHED*/
SET_SIZE(fast_null)
diff --git a/usr/src/uts/intel/ia32/ml/swtch.s b/usr/src/uts/intel/ia32/ml/swtch.s
index 0948fa7c93..6fc38cfbe8 100644
--- a/usr/src/uts/intel/ia32/ml/swtch.s
+++ b/usr/src/uts/intel/ia32/ml/swtch.s
@@ -24,7 +24,7 @@
*/
/*
- * Copyright (c) 2015, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018 Joyent, Inc.
*/
/*
@@ -64,7 +64,7 @@
* The MMU context, therefore, only changes when resuming a thread in
* a process different from curproc.
*
- * resume_from_intr() is called when the thread being resumed was not
+ * resume_from_intr() is called when the thread being resumed was not
* passivated by resume (e.g. was interrupted). This means that the
* resume lock is already held and that a restore context is not needed.
* Also, the MMU context is not changed on the resume in this case.
@@ -235,6 +235,8 @@ resume(kthread_t *t)
#if defined(__amd64)
+ .global kpti_enable
+
ENTRY(resume)
movq %gs:CPU_THREAD, %rax
leaq resume_return(%rip), %r11
@@ -305,7 +307,7 @@ resume(kthread_t *t)
*/
movq CPU_IDLE_THREAD(%r15), %rax /* idle thread pointer */
- /*
+ /*
* Set the idle thread as the current thread
*/
movq T_SP(%rax), %rsp /* It is safe to set rsp */
@@ -318,7 +320,7 @@ resume(kthread_t *t)
GET_THREAD_HATP(%rdi, %r12, %r11)
call hat_switch
- /*
+ /*
* Clear and unlock previous thread's t_lock
* to allow it to be dispatched by another processor.
*/
@@ -368,13 +370,24 @@ resume(kthread_t *t)
* thread -- this will set rsp0 to the wrong value, but it's harmless
* as it's a kernel thread, and it won't actually attempt to implicitly
* use the rsp0 via a privilege change.
+ *
+ * Note that when we have KPTI enabled on amd64, we never use this
+ * value at all (since all the interrupts have an IST set).
*/
movq CPU_TSS(%r13), %r14
+#if !defined(__xpv)
+ cmpq $1, kpti_enable
+ jne 1f
+ leaq CPU_KPTI_TR_RSP(%r13), %rax
+ jmp 2f
+1:
movq T_STACK(%r12), %rax
addq $REGSIZE+MINFRAME, %rax /* to the bottom of thread stack */
-#if !defined(__xpv)
+2:
movq %rax, TSS_RSP0(%r14)
#else
+ movq T_STACK(%r12), %rax
+ addq $REGSIZE+MINFRAME, %rax /* to the bottom of thread stack */
movl $KDS_SEL, %edi
movq %rax, %rsi
call HYPERVISOR_stack_switch
@@ -407,7 +420,7 @@ resume(kthread_t *t)
movq %rcx, %rdi
call restorepctx
.norestorepctx:
-
+
STORE_INTR_START(%r12)
/*
@@ -428,7 +441,7 @@ resume(kthread_t *t)
* resuming thread's PC after first setting the priority as low as
* possible and blocking all interrupt threads that may be active.
*/
- movq %r13, %rax /* save return address */
+ movq %r13, %rax /* save return address */
RESTORE_REGS(%r11)
pushq %rax /* push return address for spl0() */
call __dtrace_probe___sched_on__cpu
@@ -490,12 +503,12 @@ resume_return:
addl $4, %esp
.nosavepctx:
- /*
+ /*
* Temporarily switch to the idle thread's stack
*/
movl CPU_IDLE_THREAD(%ebx), %eax /* idle thread pointer */
- /*
+ /*
* Set the idle thread as the current thread
*/
movl T_SP(%eax), %esp /* It is safe to set esp */
@@ -506,8 +519,8 @@ resume_return:
pushl %ecx
call hat_switch
addl $4, %esp
-
- /*
+
+ /*
* Clear and unlock previous thread's t_lock
* to allow it to be dispatched by another processor.
*/
@@ -673,7 +686,7 @@ resume_from_zombie(kthread_t *t)
#endif /* __xpv */
- /*
+ /*
* Temporarily switch to the idle thread's stack so that the zombie
* thread's stack can be reclaimed by the reaper.
*/
@@ -686,7 +699,7 @@ resume_from_zombie(kthread_t *t)
*/
andq $_BITNOT(STACK_ALIGN-1), %rsp
- /*
+ /*
* Set the idle thread as the current thread.
*/
movq %rax, %gs:CPU_THREAD
@@ -695,7 +708,7 @@ resume_from_zombie(kthread_t *t)
GET_THREAD_HATP(%rdi, %r12, %r11)
call hat_switch
- /*
+ /*
* Put the zombie on death-row.
*/
movq %r13, %rdi
@@ -743,14 +756,14 @@ resume_from_zombie_return:
movl %eax, %cr0
.zfpu_disabled:
- /*
+ /*
* Temporarily switch to the idle thread's stack so that the zombie
* thread's stack can be reclaimed by the reaper.
*/
movl %gs:CPU_IDLE_THREAD, %eax /* idle thread pointer */
movl T_SP(%eax), %esp /* get onto idle thread stack */
- /*
+ /*
* Set the idle thread as the current thread.
*/
movl %eax, %gs:CPU_THREAD
@@ -763,7 +776,7 @@ resume_from_zombie_return:
call hat_switch
addl $4, %esp
- /*
+ /*
* Put the zombie on death-row.
*/
pushl %esi
@@ -814,7 +827,7 @@ resume_from_intr(kthread_t *t)
movq T_SP(%r12), %rsp /* restore resuming thread's sp */
xorl %ebp, %ebp /* make $<threadlist behave better */
- /*
+ /*
* Unlock outgoing thread's mutex dispatched by another processor.
*/
xorl %eax, %eax
@@ -864,7 +877,7 @@ resume_from_intr_return:
movl T_SP(%edi), %esp /* restore resuming thread's sp */
xorl %ebp, %ebp /* make $<threadlist behave better */
- /*
+ /*
* Unlock outgoing thread's mutex dispatched by another processor.
*/
xorl %eax,%eax
@@ -969,9 +982,15 @@ thread_splitstack_cleanup(void)
ENTRY(thread_splitstack_cleanup)
LOADCPU(%r8)
movq CPU_TSS(%r8), %r9
- movq CPU_THREAD(%r8), %r10
+ cmpq $1, kpti_enable
+ jne 1f
+ leaq CPU_KPTI_TR_RSP(%r8), %rax
+ jmp 2f
+1:
+ movq CPU_THREAD(%r8), %r10
movq T_STACK(%r10), %rax
- addq $REGSIZE+MINFRAME, %rax
+ addq $REGSIZE+MINFRAME, %rax
+2:
movq %rax, TSS_RSP0(%r9)
ret
SET_SIZE(thread_splitstack_cleanup)
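Both resume() and thread_splitstack_cleanup() above now select the TSS rsp0 the same way: with kpti_enable set, it points at the per-CPU trampoline frame (mcpu_kpti.kf_tr_rsp, which the earlier resume() comment notes is effectively unused once every interrupt has an IST); otherwise it points at the bottom of the thread's kernel stack. A rough standalone C rendering of that selection, with mocked types and placeholder constants (not the kernel's values):

#include <stdio.h>
#include <stdint.h>

struct kpti_frame { uint64_t kf_tr_rsp; };
struct mock_cpu { struct kpti_frame mcpu_kpti; uint64_t tss_rsp0; };

static int kpti_enable = 1;
#define	REGSIZE		0xa8	/* placeholder value */
#define	MINFRAME	0x20	/* placeholder value */

int
main(void)
{
	struct mock_cpu cpu = { { 0 }, 0 };
	uint64_t t_stack = 0x100000;	/* pretend kernel thread stack base */

	if (kpti_enable == 1) {
		cpu.tss_rsp0 = (uint64_t)(uintptr_t)&cpu.mcpu_kpti.kf_tr_rsp;
	} else {
		cpu.tss_rsp0 = t_stack + REGSIZE + MINFRAME;
	}
	(void) printf("rsp0 = 0x%llx\n", (unsigned long long)cpu.tss_rsp0);
	return (0);
}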
diff --git a/usr/src/uts/intel/ia32/os/desctbls.c b/usr/src/uts/intel/ia32/os/desctbls.c
index 97024b7b59..3c021bd055 100644
--- a/usr/src/uts/intel/ia32/os/desctbls.c
+++ b/usr/src/uts/intel/ia32/os/desctbls.c
@@ -24,7 +24,7 @@
*/
/*
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc. All rights reserved.
*/
/*
@@ -83,6 +83,7 @@
#include <sys/kdi.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
+#include <sys/note.h>
#ifdef __xpv
#include <sys/hypervisor.h>
@@ -128,8 +129,13 @@ user_desc_t ucs32_on;
user_desc_t ucs32_off;
#endif /* __amd64 */
-#pragma align 16(dblfault_stack0)
-char dblfault_stack0[DEFAULTSTKSZ];
+/*
+ * If the size of this is changed, you must update hat_pcp_setup() and the
+ * definitions in exception.s
+ */
+extern char dblfault_stack0[DEFAULTSTKSZ];
+extern char nmi_stack0[DEFAULTSTKSZ];
+extern char mce_stack0[DEFAULTSTKSZ];
extern void fast_null(void);
extern hrtime_t get_hrtime(void);
@@ -310,57 +316,73 @@ get_ssd_base(system_desc_t *dp)
/*
* Install gate segment descriptor for interrupt, trap, call and task gates.
+ *
+ * For 64 bit native if we have KPTI enabled, we use the IST stack mechanism on
+ * all interrupts. We have different ISTs for each class of exceptions that are
+ * most likely to occur while handling an existing exception; while many of
+ * these are just going to panic, it's nice not to trample on the existing
+ * exception state for debugging purposes.
+ *
+ * Normal interrupts are all redirected unconditionally to the KPTI trampoline
+ * stack space. This unifies the trampoline handling between user and kernel
+ * space (and avoids the need to touch %gs).
+ *
+ * The KDI IDT *all* uses the DBG IST: consider single-stepping tr_pftrap, when
+ * a read from KMDB causes another #PF. Without its own IST, this
+ * would stomp on the kernel's mcpu_kpti_flt frame.
*/
-
-#if defined(__amd64)
-
-/*ARGSUSED*/
-void
-set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
- uint_t type, uint_t dpl, uint_t vector)
+uint_t
+idt_vector_to_ist(uint_t vector)
{
- dp->sgd_looffset = (uintptr_t)func;
- dp->sgd_hioffset = (uintptr_t)func >> 16;
- dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
-
- dp->sgd_selector = (uint16_t)sel;
-
- /*
- * For 64 bit native we use the IST stack mechanism
- * for double faults. All other traps use the CPL = 0
- * (tss_rsp0) stack.
- */
-#if !defined(__xpv)
- if (vector == T_DBLFLT)
- dp->sgd_ist = 1;
- else
+#if defined(__xpv)
+ _NOTE(ARGUNUSED(vector));
+ return (IST_NONE);
+#else
+ switch (vector) {
+ /* These should always use IST even without KPTI enabled. */
+ case T_DBLFLT:
+ return (IST_DF);
+ case T_NMIFLT:
+ return (IST_NMI);
+ case T_MCE:
+ return (IST_MCE);
+
+ case T_BPTFLT:
+ case T_SGLSTP:
+ if (kpti_enable == 1) {
+ return (IST_DBG);
+ }
+ return (IST_NONE);
+ case T_STKFLT:
+ case T_GPFLT:
+ case T_PGFLT:
+ if (kpti_enable == 1) {
+ return (IST_NESTABLE);
+ }
+ return (IST_NONE);
+ default:
+ if (kpti_enable == 1) {
+ return (IST_DEFAULT);
+ }
+ return (IST_NONE);
+ }
#endif
- dp->sgd_ist = 0;
-
- dp->sgd_type = type;
- dp->sgd_dpl = dpl;
- dp->sgd_p = 1;
}
-#elif defined(__i386)
-
-/*ARGSUSED*/
void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
- uint_t type, uint_t dpl, uint_t unused)
+ uint_t type, uint_t dpl, uint_t ist)
{
dp->sgd_looffset = (uintptr_t)func;
dp->sgd_hioffset = (uintptr_t)func >> 16;
-
+ dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);
dp->sgd_selector = (uint16_t)sel;
- dp->sgd_stkcpy = 0; /* always zero bytes */
+ dp->sgd_ist = ist;
dp->sgd_type = type;
dp->sgd_dpl = dpl;
dp->sgd_p = 1;
}
-#endif /* __i386 */
-
/*
* Updates a single user descriptor in the the GDT of the current cpu.
* Caller is responsible for preventing cpu migration.
@@ -917,22 +939,30 @@ init_gdt(void)
static void
init_idt_common(gate_desc_t *idt)
{
- set_gatesegd(&idt[T_ZERODIV], &div0trap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
- set_gatesegd(&idt[T_SGLSTP], &dbgtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
- set_gatesegd(&idt[T_NMIFLT], &nmiint, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
- set_gatesegd(&idt[T_BPTFLT], &brktrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
- 0);
- set_gatesegd(&idt[T_OVFLW], &ovflotrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
- 0);
- set_gatesegd(&idt[T_BOUNDFLT], &boundstrap, KCS_SEL, SDT_SYSIGT,
- TRP_KPL, 0);
- set_gatesegd(&idt[T_ILLINST], &invoptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
- set_gatesegd(&idt[T_NOEXTFLT], &ndptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
+ set_gatesegd(&idt[T_ZERODIV],
+ (kpti_enable == 1) ? &tr_div0trap : &div0trap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ZERODIV));
+ set_gatesegd(&idt[T_SGLSTP],
+ (kpti_enable == 1) ? &tr_dbgtrap : &dbgtrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SGLSTP));
+ set_gatesegd(&idt[T_NMIFLT],
+ (kpti_enable == 1) ? &tr_nmiint : &nmiint,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NMIFLT));
+ set_gatesegd(&idt[T_BPTFLT],
+ (kpti_enable == 1) ? &tr_brktrap : &brktrap,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_BPTFLT));
+ set_gatesegd(&idt[T_OVFLW],
+ (kpti_enable == 1) ? &tr_ovflotrap : &ovflotrap,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_OVFLW));
+ set_gatesegd(&idt[T_BOUNDFLT],
+ (kpti_enable == 1) ? &tr_boundstrap : &boundstrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_BOUNDFLT));
+ set_gatesegd(&idt[T_ILLINST],
+ (kpti_enable == 1) ? &tr_invoptrap : &invoptrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ILLINST));
+ set_gatesegd(&idt[T_NOEXTFLT],
+ (kpti_enable == 1) ? &tr_ndptrap : &ndptrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_NOEXTFLT));
/*
* double fault handler.
@@ -942,90 +972,88 @@ init_idt_common(gate_desc_t *idt)
* and/or stack is in a broken state. See xen_failsafe_callback.
*/
#if !defined(__xpv)
-#if defined(__amd64)
-
- set_gatesegd(&idt[T_DBLFLT], &syserrtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- T_DBLFLT);
-
-#elif defined(__i386)
-
- /*
- * task gate required.
- */
- set_gatesegd(&idt[T_DBLFLT], NULL, DFTSS_SEL, SDT_SYSTASKGT, TRP_KPL,
- 0);
-
-#endif /* __i386 */
+ set_gatesegd(&idt[T_DBLFLT],
+ (kpti_enable == 1) ? &tr_syserrtrap : &syserrtrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_DBLFLT));
#endif /* !__xpv */
/*
* T_EXTOVRFLT coprocessor-segment-overrun not supported.
*/
+ set_gatesegd(&idt[T_TSSFLT],
+ (kpti_enable == 1) ? &tr_invtsstrap : &invtsstrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_TSSFLT));
+ set_gatesegd(&idt[T_SEGFLT],
+ (kpti_enable == 1) ? &tr_segnptrap : &segnptrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SEGFLT));
+ set_gatesegd(&idt[T_STKFLT],
+ (kpti_enable == 1) ? &tr_stktrap : &stktrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_STKFLT));
+ set_gatesegd(&idt[T_GPFLT],
+ (kpti_enable == 1) ? &tr_gptrap : &gptrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_GPFLT));
+ set_gatesegd(&idt[T_PGFLT],
+ (kpti_enable == 1) ? &tr_pftrap : &pftrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_PGFLT));
+ set_gatesegd(&idt[T_EXTERRFLT],
+ (kpti_enable == 1) ? &tr_ndperr : &ndperr,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_EXTERRFLT));
+ set_gatesegd(&idt[T_ALIGNMENT],
+ (kpti_enable == 1) ? &tr_achktrap : &achktrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_ALIGNMENT));
+ set_gatesegd(&idt[T_MCE],
+ (kpti_enable == 1) ? &tr_mcetrap : &mcetrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_MCE));
+ set_gatesegd(&idt[T_SIMDFPE],
+ (kpti_enable == 1) ? &tr_xmtrap : &xmtrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE));
- set_gatesegd(&idt[T_TSSFLT], &invtsstrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
- set_gatesegd(&idt[T_SEGFLT], &segnptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
- set_gatesegd(&idt[T_STKFLT], &stktrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
- set_gatesegd(&idt[T_GPFLT], &gptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
- set_gatesegd(&idt[T_PGFLT], &pftrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
- set_gatesegd(&idt[T_EXTERRFLT], &ndperr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
- set_gatesegd(&idt[T_ALIGNMENT], &achktrap, KCS_SEL, SDT_SYSIGT,
- TRP_KPL, 0);
- set_gatesegd(&idt[T_MCE], &mcetrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
- set_gatesegd(&idt[T_SIMDFPE], &xmtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
-
- /*
+ /*
* install "int80" handler at, well, 0x80.
*/
- set_gatesegd(&idt0[T_INT80], &sys_int80, KCS_SEL, SDT_SYSIGT, TRP_UPL,
- 0);
+ set_gatesegd(&idt0[T_INT80],
+ (kpti_enable == 1) ? &tr_sys_int80 : &sys_int80,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_INT80));
/*
* install fast trap handler at 210.
*/
- set_gatesegd(&idt[T_FASTTRAP], &fasttrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
- 0);
+ set_gatesegd(&idt[T_FASTTRAP],
+ (kpti_enable == 1) ? &tr_fasttrap : &fasttrap,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_FASTTRAP));
/*
* System call handler.
*/
-#if defined(__amd64)
- set_gatesegd(&idt[T_SYSCALLINT], &sys_syscall_int, KCS_SEL, SDT_SYSIGT,
- TRP_UPL, 0);
-
-#elif defined(__i386)
- set_gatesegd(&idt[T_SYSCALLINT], &sys_call, KCS_SEL, SDT_SYSIGT,
- TRP_UPL, 0);
-#endif /* __i386 */
+ set_gatesegd(&idt[T_SYSCALLINT],
+ (kpti_enable == 1) ? &tr_sys_syscall_int : &sys_syscall_int,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT));
/*
* Install the DTrace interrupt handler for the pid provider.
*/
- set_gatesegd(&idt[T_DTRACE_RET], &dtrace_ret, KCS_SEL,
- SDT_SYSIGT, TRP_UPL, 0);
+ set_gatesegd(&idt[T_DTRACE_RET],
+ (kpti_enable == 1) ? &tr_dtrace_ret : &dtrace_ret,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET));
/*
- * Prepare interposing descriptors for the branded "int80"
- * and syscall handlers and cache copies of the default
- * descriptors.
+ * Prepare interposing descriptors for the branded "int80"
+ * and syscall handlers and cache copies of the default
+ * descriptors.
*/
brand_tbl[0].ih_inum = T_INT80;
brand_tbl[0].ih_default_desc = idt0[T_INT80];
- set_gatesegd(&(brand_tbl[0].ih_interp_desc), &brand_sys_int80, KCS_SEL,
- SDT_SYSIGT, TRP_UPL, 0);
+ set_gatesegd(&(brand_tbl[0].ih_interp_desc),
+ (kpti_enable == 1) ? &tr_brand_sys_int80 : &brand_sys_int80,
+ KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_INT80));
brand_tbl[1].ih_inum = T_SYSCALLINT;
brand_tbl[1].ih_default_desc = idt0[T_SYSCALLINT];
-#if defined(__amd64)
- set_gatesegd(&(brand_tbl[1].ih_interp_desc), &brand_sys_syscall_int,
- KCS_SEL, SDT_SYSIGT, TRP_UPL, 0);
-#elif defined(__i386)
- set_gatesegd(&(brand_tbl[1].ih_interp_desc), &brand_sys_call,
- KCS_SEL, SDT_SYSIGT, TRP_UPL, 0);
-#endif /* __i386 */
+ set_gatesegd(&(brand_tbl[1].ih_interp_desc),
+ (kpti_enable == 1) ? &tr_brand_sys_syscall_int :
+ &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL,
+ idt_vector_to_ist(T_SYSCALLINT));
brand_tbl[2].ih_inum = 0;
}
@@ -1053,27 +1081,53 @@ init_idt(gate_desc_t *idt)
* since it can only be generated on a 386 processor. 15 is also
* unsupported and reserved.
*/
- for (i = 0; i < NIDT; i++)
+#if !defined(__xpv)
+ for (i = 0; i < NIDT; i++) {
+ set_gatesegd(&idt[i],
+ (kpti_enable == 1) ? &tr_resvtrap : &resvtrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL,
+ idt_vector_to_ist(T_RESVTRAP));
+ }
+#else
+ for (i = 0; i < NIDT; i++) {
set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
+ IST_NONE);
+ }
+#endif
/*
* 20-31 reserved
*/
- for (i = 20; i < 32; i++)
+#if !defined(__xpv)
+ for (i = 20; i < 32; i++) {
+ set_gatesegd(&idt[i],
+ (kpti_enable == 1) ? &tr_invaltrap : &invaltrap,
+ KCS_SEL, SDT_SYSIGT, TRP_KPL,
+ idt_vector_to_ist(T_INVALTRAP));
+ }
+#else
+ for (i = 20; i < 32; i++) {
set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
- 0);
+ IST_NONE);
+ }
+#endif
/*
* interrupts 32 - 255
*/
for (i = 32; i < 256; i++) {
+#if !defined(__xpv)
+ (void) snprintf(ivctname, sizeof (ivctname),
+ (kpti_enable == 1) ? "tr_ivct%d" : "ivct%d", i);
+#else
(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
+#endif
ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
if (ivctptr == NULL)
panic("kobj_getsymvalue(%s) failed", ivctname);
- set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
+ set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
+ idt_vector_to_ist(i));
}
/*
@@ -1102,67 +1156,39 @@ init_ldt(void)
}
#if !defined(__xpv)
-#if defined(__amd64)
static void
init_tss(void)
{
- /*
- * tss_rsp0 is dynamically filled in by resume() on each context switch.
- * All exceptions but #DF will run on the thread stack.
- * Set up the double fault stack here.
- */
- ktss0->tss_ist1 =
- (uint64_t)&dblfault_stack0[sizeof (dblfault_stack0)];
+ extern struct cpu cpus[];
/*
- * Set I/O bit map offset equal to size of TSS segment limit
- * for no I/O permission map. This will force all user I/O
- * instructions to generate #gp fault.
+ * tss_rsp0 is dynamically filled in by resume() (in swtch.s) on each
+ * context switch but it'll be overwritten with this same value anyway.
*/
- ktss0->tss_bitmapbase = sizeof (*ktss0);
-
- /*
- * Point %tr to descriptor for ktss0 in gdt.
- */
- wr_tsr(KTSS_SEL);
-}
+ if (kpti_enable == 1) {
+ ktss0->tss_rsp0 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
+ }
-#elif defined(__i386)
+ /* Set up the IST stacks for double fault, NMI, MCE. */
+ ktss0->tss_ist1 = (uintptr_t)&dblfault_stack0[sizeof (dblfault_stack0)];
+ ktss0->tss_ist2 = (uintptr_t)&nmi_stack0[sizeof (nmi_stack0)];
+ ktss0->tss_ist3 = (uintptr_t)&mce_stack0[sizeof (mce_stack0)];
-static void
-init_tss(void)
-{
/*
- * ktss0->tss_esp dynamically filled in by resume() on each
- * context switch.
+ * This IST stack is used for #DB,#BP (debug) interrupts (when KPTI is
+ * enabled), and also for KDI (always).
*/
- ktss0->tss_ss0 = KDS_SEL;
- ktss0->tss_eip = (uint32_t)_start;
- ktss0->tss_ds = ktss0->tss_es = ktss0->tss_ss = KDS_SEL;
- ktss0->tss_cs = KCS_SEL;
- ktss0->tss_fs = KFS_SEL;
- ktss0->tss_gs = KGS_SEL;
- ktss0->tss_ldt = ULDT_SEL;
+ ktss0->tss_ist4 = (uint64_t)&cpus->cpu_m.mcpu_kpti_dbg.kf_tr_rsp;
- /*
- * Initialize double fault tss.
- */
- dftss0->tss_esp0 = (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)];
- dftss0->tss_ss0 = KDS_SEL;
+ if (kpti_enable == 1) {
+ /* This IST stack is used for #GP,#PF,#SS (fault) interrupts. */
+ ktss0->tss_ist5 =
+ (uint64_t)&cpus->cpu_m.mcpu_kpti_flt.kf_tr_rsp;
- /*
- * tss_cr3 will get initialized in hat_kern_setup() once our page
- * tables have been setup.
- */
- dftss0->tss_eip = (uint32_t)syserrtrap;
- dftss0->tss_esp = (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)];
- dftss0->tss_cs = KCS_SEL;
- dftss0->tss_ds = KDS_SEL;
- dftss0->tss_es = KDS_SEL;
- dftss0->tss_ss = KDS_SEL;
- dftss0->tss_fs = KFS_SEL;
- dftss0->tss_gs = KGS_SEL;
+ /* This IST stack is used for all other intrs (for KPTI). */
+ ktss0->tss_ist6 = (uint64_t)&cpus->cpu_m.mcpu_kpti.kf_tr_rsp;
+ }
/*
* Set I/O bit map offset equal to size of TSS segment limit
@@ -1177,7 +1203,6 @@ init_tss(void)
wr_tsr(KTSS_SEL);
}
-#endif /* __i386 */
#endif /* !__xpv */
#if defined(__xpv)
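Pulling together the init_tss() and idt_vector_to_ist() hunks above, the IST assignments work out roughly as sketched below. The numeric IST_* values are an assumption here (they come from sys/segments.h, which is not part of this excerpt); the pairing with the tss_ist slots is inferred from the comments above:

#include <stdio.h>

/* Assumed numbering; 0 means "no IST, use tss_rsp0". */
enum ist { IST_NONE, IST_DF, IST_NMI, IST_MCE, IST_DBG, IST_NESTABLE, IST_DEFAULT };

static const char *ist_stack[] = {
	[IST_NONE]     = "tss_rsp0 (no IST)",
	[IST_DF]       = "dblfault_stack0 (tss_ist1): #DF",
	[IST_NMI]      = "nmi_stack0 (tss_ist2): NMI",
	[IST_MCE]      = "mce_stack0 (tss_ist3): #MC",
	[IST_DBG]      = "mcpu_kpti_dbg (tss_ist4): #DB/#BP with KPTI, all of KDI",
	[IST_NESTABLE] = "mcpu_kpti_flt (tss_ist5): #SS/#GP/#PF with KPTI",
	[IST_DEFAULT]  = "mcpu_kpti (tss_ist6): every other vector with KPTI",
};

int
main(void)
{
	for (int i = IST_NONE; i <= IST_DEFAULT; i++)
		(void) printf("%d: %s\n", i, ist_stack[i]);
	return (0);
}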
@@ -1234,6 +1259,9 @@ init_desctbls(void)
{
user_desc_t *gdt;
desctbr_t idtr;
+#if defined(__amd64)
+ extern uint64_t kpti_safe_cr3;
+#endif
/*
* Allocate IDT and TSS structures on unique pages for better
@@ -1269,6 +1297,14 @@ init_desctbls(void)
CPU->cpu_gdt = gdt;
/*
+ * Initialize this CPU's LDT.
+ */
+ CPU->cpu_m.mcpu_ldt = BOP_ALLOC(bootops, (caddr_t)LDT_VA,
+ LDT_CPU_SIZE, PAGESIZE);
+ bzero(CPU->cpu_m.mcpu_ldt, LDT_CPU_SIZE);
+ CPU->cpu_m.mcpu_ldt_len = 0;
+
+ /*
* Setup and install our IDT.
*/
init_idt(idt0);
@@ -1289,6 +1325,9 @@ init_desctbls(void)
init_tss();
CPU->cpu_tss = ktss0;
init_ldt();
+
+ /* Stash this so that the NMI,MCE,#DF and KDI handlers can use it. */
+ kpti_safe_cr3 = (uint64_t)getcr3();
}
#endif /* __xpv */
@@ -1349,15 +1388,26 @@ brand_interpositioning_enable(void)
#else
if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
- wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
- wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
+ if (kpti_enable == 1) {
+ wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_brand_sys_syscall);
+ wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_brand_sys_syscall32);
+ } else {
+ wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
+ wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
+ }
}
#endif
#endif /* __amd64 */
- if (is_x86_feature(x86_featureset, X86FSET_SEP))
- wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
+ if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
+ if (kpti_enable == 1) {
+ wrmsr(MSR_INTC_SEP_EIP,
+ (uintptr_t)tr_brand_sys_sysenter);
+ } else {
+ wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
+ }
+ }
}
/*
@@ -1393,13 +1443,23 @@ brand_interpositioning_disable(void)
#else
if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
- wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
- wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
+ if (kpti_enable == 1) {
+ wrmsr(MSR_AMD_LSTAR, (uintptr_t)tr_sys_syscall);
+ wrmsr(MSR_AMD_CSTAR, (uintptr_t)tr_sys_syscall32);
+ } else {
+ wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
+ wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
+ }
}
#endif
#endif /* __amd64 */
- if (is_x86_feature(x86_featureset, X86FSET_SEP))
- wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
+ if (is_x86_feature(x86_featureset, X86FSET_SEP)) {
+ if (kpti_enable == 1) {
+ wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)tr_sys_sysenter);
+ } else {
+ wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
+ }
+ }
}
diff --git a/usr/src/uts/intel/ia32/os/sysi86.c b/usr/src/uts/intel/ia32/os/sysi86.c
index 7be9ec20fd..cd1129ea1f 100644
--- a/usr/src/uts/intel/ia32/os/sysi86.c
+++ b/usr/src/uts/intel/ia32/os/sysi86.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
@@ -60,6 +61,7 @@
#include <sys/cmn_err.h>
#include <sys/segments.h>
#include <sys/clock.h>
+#include <vm/hat_i86.h>
#if defined(__xpv)
#include <sys/hypervisor.h>
#include <sys/note.h>
@@ -346,7 +348,19 @@ ldt_load(void)
xen_set_ldt(get_ssd_base(&curproc->p_ldt_desc),
curproc->p_ldtlimit + 1);
#else
- *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = curproc->p_ldt_desc;
+ size_t len;
+ system_desc_t desc;
+
+ /*
+ * Before we can use the LDT on this CPU, we must install the LDT in the
+ * user mapping table.
+ */
+ len = (curproc->p_ldtlimit + 1) * sizeof (user_desc_t);
+ bcopy(curproc->p_ldt, CPU->cpu_m.mcpu_ldt, len);
+ CPU->cpu_m.mcpu_ldt_len = len;
+ set_syssegd(&desc, CPU->cpu_m.mcpu_ldt, len - 1, SDT_SYSLDT, SEL_KPL);
+ *((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = desc;
+
wr_ldtr(ULDT_SEL);
#endif
}
@@ -363,6 +377,9 @@ ldt_unload(void)
#else
*((system_desc_t *)&CPU->cpu_gdt[GDT_LDT]) = null_sdesc;
wr_ldtr(0);
+
+ bzero(CPU->cpu_m.mcpu_ldt, CPU->cpu_m.mcpu_ldt_len);
+ CPU->cpu_m.mcpu_ldt_len = 0;
#endif
}
@@ -714,7 +731,8 @@ ldt_alloc(proc_t *pp, uint_t seli)
ASSERT(pp->p_ldtlimit == 0);
/*
- * Allocate new LDT just large enough to contain seli.
+ * Allocate new LDT just large enough to contain seli. The LDT must
+ * always be allocated in units of pages for KPTI.
*/
ldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
nsels = ldtsz / sizeof (user_desc_t);
@@ -832,7 +850,8 @@ ldt_grow(proc_t *pp, uint_t seli)
ASSERT(pp->p_ldtlimit != 0);
/*
- * Allocate larger LDT just large enough to contain seli.
+ * Allocate larger LDT just large enough to contain seli. The LDT must
+ * always be allocated in units of pages for KPTI.
*/
nldtsz = P2ROUNDUP((seli + 1) * sizeof (user_desc_t), PAGESIZE);
nsels = nldtsz / sizeof (user_desc_t);
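
The ldt_load()/ldt_unload() changes reduce to a simple lifecycle: copy the process LDT into the CPU's page-aligned mcpu_ldt buffer (which can stay mapped in the user page tables under KPTI), point the GDT's LDT descriptor at that copy, and scrub the copy again on unload. A rough standalone sketch of that lifecycle, using invented stand-in types for cpu_t and user_desc_t:

#include <string.h>
#include <stddef.h>

#define LDT_CPU_SIZE	(16 * 4096)	/* per-CPU LDT buffer, as in segments.h */

/* Illustrative stand-ins; the real types live in machcpuvar.h/segments.h. */
typedef struct { unsigned char bytes[8]; } user_desc_t;
struct fake_cpu {
	void	*mcpu_ldt;	/* page-aligned per-CPU copy */
	size_t	mcpu_ldt_len;	/* bytes currently valid in the copy */
};

/* ldt_load(): install the process LDT into this CPU's private copy. */
static void
sketch_ldt_load(struct fake_cpu *cp, const user_desc_t *p_ldt, unsigned limit)
{
	size_t len = (limit + 1) * sizeof (user_desc_t);

	memcpy(cp->mcpu_ldt, p_ldt, len);
	cp->mcpu_ldt_len = len;
	/* ...then build a system descriptor over cp->mcpu_ldt and wr_ldtr(). */
}

/* ldt_unload(): scrub the per-CPU copy so stale selectors can't be used. */
static void
sketch_ldt_unload(struct fake_cpu *cp)
{
	memset(cp->mcpu_ldt, 0, cp->mcpu_ldt_len);
	cp->mcpu_ldt_len = 0;
}
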
diff --git a/usr/src/uts/intel/ia32/sys/trap.h b/usr/src/uts/intel/ia32/sys/trap.h
index cc41d102a8..4165f1289e 100644
--- a/usr/src/uts/intel/ia32/sys/trap.h
+++ b/usr/src/uts/intel/ia32/sys/trap.h
@@ -25,6 +25,8 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _IA32_SYS_TRAP_H
@@ -53,11 +55,13 @@ extern "C" {
#define T_STKFLT 0xc /* #ss stack fault */
#define T_GPFLT 0xd /* #gp general protection fault */
#define T_PGFLT 0xe /* #pf page fault */
+#define T_RESVTRAP 0xf /* reserved */
#define T_EXTERRFLT 0x10 /* #mf x87 FPU error fault */
#define T_ALIGNMENT 0x11 /* #ac alignment check error */
#define T_MCE 0x12 /* #mc machine check exception */
#define T_SIMDFPE 0x13 /* #xm SSE/SSE2 exception */
#define T_DBGENTR 0x14 /* debugger entry */
+#define T_INVALTRAP 0x1e /* invalid */
#define T_ENDPERR 0x21 /* emulated extension error flt */
#define T_ENOEXTFLT 0x20 /* emulated ext not present */
#define T_FASTTRAP 0xd2 /* fast system call */
@@ -102,6 +106,22 @@ extern "C" {
#define T_LASTFAST 6 /* Last valid subfunction */
+/*
+ * Offsets for an interrupt/trap frame.
+ */
+#define T_FRAME_ERR 0
+#define T_FRAME_RIP 8
+#define T_FRAME_CS 16
+#define T_FRAME_RFLAGS 24
+#define T_FRAME_RSP 32
+#define T_FRAME_SS 40
+
+#define T_FRAMERET_RIP 0
+#define T_FRAMERET_CS 8
+#define T_FRAMERET_RFLAGS 16
+#define T_FRAMERET_RSP 24
+#define T_FRAMERET_SS 32
+
#ifdef __cplusplus
}
#endif
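
The new T_FRAME_* constants are the byte offsets within an amd64 hardware interrupt frame that includes an error code, and T_FRAMERET_* describe the same frame without one (as consumed by iretq). One way to sanity-check the values is a struct with the matching layout; this is purely illustrative and not part of the header:

#include <stdint.h>
#include <stddef.h>
#include <assert.h>

/* amd64 interrupt frame as built with an error code on top. */
struct intr_frame {
	uint64_t	err;	/* T_FRAME_ERR    = 0  */
	uint64_t	rip;	/* T_FRAME_RIP    = 8  */
	uint64_t	cs;	/* T_FRAME_CS     = 16 */
	uint64_t	rflags;	/* T_FRAME_RFLAGS = 24 */
	uint64_t	rsp;	/* T_FRAME_RSP    = 32 */
	uint64_t	ss;	/* T_FRAME_SS     = 40 */
};

static_assert(offsetof(struct intr_frame, rflags) == 24,
    "T_FRAME_RFLAGS must match the hardware frame layout");

int
main(void)
{
	return (0);
}
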
diff --git a/usr/src/uts/intel/kdi/ia32/kdi_asm.s b/usr/src/uts/intel/kdi/ia32/kdi_asm.s
deleted file mode 100644
index 4f7e2ec7ad..0000000000
--- a/usr/src/uts/intel/kdi/ia32/kdi_asm.s
+++ /dev/null
@@ -1,662 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright 2018 Joyent, Inc.
- */
-
-/*
- * Debugger entry for both master and slave CPUs
- */
-
-#if defined(__lint)
-#include <sys/types.h>
-#endif
-
-#include <sys/segments.h>
-#include <sys/asm_linkage.h>
-#include <sys/controlregs.h>
-#include <sys/x86_archext.h>
-#include <sys/privregs.h>
-#include <sys/machprivregs.h>
-#include <sys/kdi_regs.h>
-#include <sys/uadmin.h>
-#include <sys/psw.h>
-
-#ifdef _ASM
-
-#include <kdi_assym.h>
-#include <assym.h>
-
-/* clobbers %edx, %ecx, returns addr in %eax, cpu id in %ebx */
-#define GET_CPUSAVE_ADDR \
- movl %gs:CPU_ID, %ebx; \
- movl %ebx, %eax; \
- movl $KRS_SIZE, %ecx; \
- mull %ecx; \
- movl $kdi_cpusave, %edx; \
- /*CSTYLED*/ \
- addl (%edx), %eax
-
-/*
- * Save copies of the IDT and GDT descriptors. Note that we only save the IDT
- * and GDT if the IDT isn't ours, as we may be legitimately re-entering the
- * debugger through the trap handler. We don't want to clobber the saved IDT
- * in the process, as we'd end up resuming the world on our IDT.
- */
-#define SAVE_IDTGDT \
- movl %gs:CPU_IDT, %edx; \
- cmpl $kdi_idt, %edx; \
- je 1f; \
- movl %edx, KRS_IDT(%eax); \
- movl %gs:CPU_GDT, %edx; \
- movl %edx, KRS_GDT(%eax); \
-1:
-
-/*
- * Given the address of the current CPU's cpusave area in %edi, the following
- * macro restores the debugging state to said CPU. Restored state includes
- * the debug registers from the global %dr variables.
- */
-#define KDI_RESTORE_DEBUGGING_STATE \
- leal kdi_drreg, %ebx; \
- \
- pushl DR_CTL(%ebx); \
- pushl $7; \
- call kdi_dreg_set; \
- addl $8, %esp; \
- \
- pushl $KDIREG_DRSTAT_RESERVED; \
- pushl $6; \
- call kdi_dreg_set; \
- addl $8, %esp; \
- \
- pushl DRADDR_OFF(0)(%ebx); \
- pushl $0; \
- call kdi_dreg_set; \
- addl $8, %esp; \
- \
- pushl DRADDR_OFF(1)(%ebx); \
- pushl $1; \
- call kdi_dreg_set; \
- addl $8, %esp; \
- \
- pushl DRADDR_OFF(2)(%ebx); \
- pushl $2; \
- call kdi_dreg_set; \
- addl $8, %esp; \
- \
- pushl DRADDR_OFF(3)(%ebx); \
- pushl $3; \
- call kdi_dreg_set; \
- addl $8, %esp;
-
-#define KDI_RESTORE_REGS() \
- /* Discard savfp and savpc */ \
- addl $8, %esp; \
- popl %ss; \
- popl %gs; \
- popl %fs; \
- popl %es; \
- popl %ds; \
- popal; \
- /* Discard trapno and err */ \
- addl $8, %esp
-
-/*
- * Each cpusave buffer has an area set aside for a ring buffer of breadcrumbs.
- * The following macros manage the buffer.
- */
-
-/* Advance the ring buffer */
-#define ADVANCE_CRUMB_POINTER(cpusave, tmp1, tmp2) \
- movl KRS_CURCRUMBIDX(cpusave), tmp1; \
- cmpl $[KDI_NCRUMBS - 1], tmp1; \
- jge 1f; \
- /* Advance the pointer and index */ \
- addl $1, tmp1; \
- movl tmp1, KRS_CURCRUMBIDX(cpusave); \
- movl KRS_CURCRUMB(cpusave), tmp1; \
- addl $KRM_SIZE, tmp1; \
- jmp 2f; \
-1: /* Reset the pointer and index */ \
- movw $0, KRS_CURCRUMBIDX(cpusave); \
- leal KRS_CRUMBS(cpusave), tmp1; \
-2: movl tmp1, KRS_CURCRUMB(cpusave); \
- /* Clear the new crumb */ \
- movl $KDI_NCRUMBS, tmp2; \
-3: movl $0, -4(tmp1, tmp2, 4); \
- decl tmp2; \
- jnz 3b
-
-/* Set a value in the current breadcrumb buffer */
-#define ADD_CRUMB(cpusave, offset, value, tmp) \
- movl KRS_CURCRUMB(cpusave), tmp; \
- movl value, offset(tmp)
-
-#endif /* _ASM */
-
-/*
- * The main entry point for master CPUs. It also serves as the trap handler
- * for all traps and interrupts taken during single-step.
- */
-#if defined(__lint)
-void
-kdi_cmnint(void)
-{
-}
-#else /* __lint */
-
- /* XXX implement me */
- ENTRY_NP(kdi_nmiint)
- clr %ecx
- movl (%ecx), %ecx
- SET_SIZE(kdi_nmiint)
-
- ENTRY_NP(kdi_cmnint)
- ALTENTRY(kdi_master_entry)
-
- /* Save all registers and selectors */
-
- pushal
- pushl %ds
- pushl %es
- pushl %fs
- pushl %gs
- pushl %ss
-
- subl $8, %esp
- movl %ebp, REG_OFF(KDIREG_SAVFP)(%esp)
- movl REG_OFF(KDIREG_EIP)(%esp), %eax
- movl %eax, REG_OFF(KDIREG_SAVPC)(%esp)
-
- /*
- * If the kernel has started using its own selectors, we should too.
- * Update our saved selectors if they haven't been updated already.
- */
- movw %cs, %ax
- cmpw $KCS_SEL, %ax
- jne 1f /* The kernel hasn't switched yet */
-
- movw $KDS_SEL, %ax
- movw %ax, %ds
- movw kdi_cs, %ax
- cmpw $KCS_SEL, %ax
- je 1f /* We already switched */
-
- /*
- * The kernel switched, but we haven't. Update our saved selectors
- * to match the kernel's copies for use below.
- */
- movl $KCS_SEL, kdi_cs
- movl $KDS_SEL, kdi_ds
- movl $KFS_SEL, kdi_fs
- movl $KGS_SEL, kdi_gs
-
-1:
- /*
- * Set the selectors to a known state. If we come in from kmdb's IDT,
- * we'll be on boot's %cs. This will cause GET_CPUSAVE_ADDR to return
- * CPU 0's cpusave, regardless of which CPU we're on, and chaos will
- * ensue. So, if we've got $KCSSEL in kdi_cs, switch to it. The other
- * selectors are restored normally.
- */
- movw %cs:kdi_cs, %ax
- cmpw $KCS_SEL, %ax
- jne 1f
- ljmp $KCS_SEL, $1f
-1:
- movw %cs:kdi_ds, %ds
- movw kdi_ds, %es
- movw kdi_fs, %fs
- movw kdi_gs, %gs
- movw kdi_ds, %ss
-
- /*
- * This has to come after we set %gs to the kernel descriptor. Since
- * we've hijacked some IDT entries used in user-space such as the
- * breakpoint handler, we can enter kdi_cmnint() with GDT_LWPGS used
- * in %gs. On the hypervisor, CLI() needs GDT_GS to access the machcpu.
- */
- CLI(%eax)
-
-#if defined(__xpv)
- /*
- * Clear saved_upcall_mask in unused byte of cs slot on stack.
- * It can only confuse things.
- */
- movb $0, REG_OFF(KDIREG_CS)+2(%esp)
-
-#endif
-
- GET_CPUSAVE_ADDR /* %eax = cpusave, %ebx = CPU ID */
-
- ADVANCE_CRUMB_POINTER(%eax, %ecx, %edx)
-
- ADD_CRUMB(%eax, KRM_CPU_STATE, $KDI_CPU_STATE_MASTER, %edx)
-
- movl REG_OFF(KDIREG_EIP)(%esp), %ecx
- ADD_CRUMB(%eax, KRM_PC, %ecx, %edx)
- ADD_CRUMB(%eax, KRM_SP, %esp, %edx)
- movl REG_OFF(KDIREG_TRAPNO)(%esp), %ecx
- ADD_CRUMB(%eax, KRM_TRAPNO, %ecx, %edx)
-
- movl %esp, %ebp
- pushl %eax
-
- /*
- * Were we in the debugger when we took the trap (i.e. was %esp in one
- * of the debugger's memory ranges)?
- */
- leal kdi_memranges, %ecx
- movl kdi_nmemranges, %edx
-1: cmpl MR_BASE(%ecx), %esp
- jl 2f /* below this range -- try the next one */
- cmpl MR_LIM(%ecx), %esp
- jg 2f /* above this range -- try the next one */
- jmp 3f /* matched within this range */
-
-2: decl %edx
- jz kdi_save_common_state /* %esp not within debugger memory */
- addl $MR_SIZE, %ecx
- jmp 1b
-
-3: /*
- * %esp was within one of the debugger's memory ranges. This should
- * only happen when we take a trap while running in the debugger.
- * kmdb_dpi_handle_fault will determine whether or not it was an
- * expected trap, and will take the appropriate action.
- */
-
- pushl %ebx /* cpuid */
-
- movl REG_OFF(KDIREG_ESP)(%ebp), %ecx
- addl $REG_OFF(KDIREG_EFLAGS - KDIREG_EAX), %ecx
- pushl %ecx
-
- pushl REG_OFF(KDIREG_EIP)(%ebp)
- pushl REG_OFF(KDIREG_TRAPNO)(%ebp)
-
- call kdi_dvec_handle_fault
- addl $16, %esp
-
- /*
- * If we're here, we ran into a debugger problem, and the user
- * elected to solve it by having the debugger debug itself. The
- * state we're about to save is that of the debugger when it took
- * the fault.
- */
-
- jmp kdi_save_common_state
-
- SET_SIZE(kdi_master_entry)
- SET_SIZE(kdi_cmnint)
-
-#endif /* __lint */
-
-/*
- * The cross-call handler for slave CPUs.
- *
- * The debugger is single-threaded, so only one CPU, called the master, may be
- * running it at any given time. The other CPUs, known as slaves, spin in a
- * busy loop until there's something for them to do. This is the entry point
- * for the slaves - they'll be sent here in response to a cross-call sent by the
- * master.
- */
-
-#if defined(__lint)
-void
-kdi_slave_entry(void)
-{
-}
-#else /* __lint */
- ENTRY_NP(kdi_slave_entry)
-
- /*
- * Cross calls are implemented as function calls, so our stack
- * currently looks like one you'd get from a zero-argument function
- * call. There's an %eip at %esp, and that's about it. We want to
- * make it look like the master CPU's stack. By doing this, we can
- * use the same resume code for both master and slave. We need to
- * make our stack look like a `struct regs' before we jump into the
- * common save routine.
- */
-
- pushl %cs
- pushfl
- pushl $-1 /* A phony trap error code */
- pushl $-1 /* A phony trap number */
- pushal
- pushl %ds
- pushl %es
- pushl %fs
- pushl %gs
- pushl %ss
-
- subl $8, %esp
- movl %ebp, REG_OFF(KDIREG_SAVFP)(%esp)
- movl REG_OFF(KDIREG_EIP)(%esp), %eax
- movl %eax, REG_OFF(KDIREG_SAVPC)(%esp)
-
- /*
- * Swap our saved EFLAGS and %eip. Each is where the other
- * should be.
- */
- movl REG_OFF(KDIREG_EFLAGS)(%esp), %eax
- xchgl REG_OFF(KDIREG_EIP)(%esp), %eax
- movl %eax, REG_OFF(KDIREG_EFLAGS)(%esp)
-
- /*
- * Our stack now matches struct regs, and is irettable. We don't need
- * to do anything special for the hypervisor w.r.t. PS_IE since we
- * iret twice anyway; the second iret back to the hypervisor
- * will re-enable interrupts.
- */
- CLI(%eax)
-
- /* Load sanitized segment selectors */
- movw kdi_ds, %ds
- movw kdi_ds, %es
- movw kdi_fs, %fs
- movw kdi_gs, %gs
- movw kdi_ds, %ss
-
- GET_CPUSAVE_ADDR /* %eax = cpusave, %ebx = CPU ID */
-
- ADVANCE_CRUMB_POINTER(%eax, %ecx, %edx)
-
- ADD_CRUMB(%eax, KRM_CPU_STATE, $KDI_CPU_STATE_SLAVE, %edx)
-
- movl REG_OFF(KDIREG_EIP)(%esp), %ecx
- ADD_CRUMB(%eax, KRM_PC, %ecx, %edx)
-
- pushl %eax
- jmp kdi_save_common_state
-
- SET_SIZE(kdi_slave_entry)
-
-#endif /* __lint */
-
-/*
- * The state of the world:
- *
- * The stack has a complete set of saved registers and segment
- * selectors, arranged in `struct regs' order (or vice-versa), up to
- * and including EFLAGS. It also has a pointer to our cpusave area.
- *
- * We need to save a pointer to these saved registers. We also want
- * to adjust the saved %esp - it should point just beyond the saved
- * registers to the last frame of the thread we interrupted. Finally,
- * we want to clear out bits 16-31 of the saved selectors, as the
- * selector pushls don't automatically clear them.
- */
-#if !defined(__lint)
-
- ENTRY_NP(kdi_save_common_state)
-
- popl %eax /* the cpusave area */
-
- movl %esp, KRS_GREGS(%eax) /* save ptr to current saved regs */
-
- addl $REG_OFF(KDIREG_EFLAGS - KDIREG_EAX), KDIREG_OFF(KDIREG_ESP)(%esp)
-
- andl $0xffff, KDIREG_OFF(KDIREG_SS)(%esp)
- andl $0xffff, KDIREG_OFF(KDIREG_GS)(%esp)
- andl $0xffff, KDIREG_OFF(KDIREG_FS)(%esp)
- andl $0xffff, KDIREG_OFF(KDIREG_ES)(%esp)
- andl $0xffff, KDIREG_OFF(KDIREG_DS)(%esp)
-
- pushl %eax
- call kdi_trap_pass
- cmpl $1, %eax
- je kdi_pass_to_kernel
- popl %eax
-
- SAVE_IDTGDT
-
-#if !defined(__xpv)
- /* Save off %cr0, and clear write protect */
- movl %cr0, %ecx
- movl %ecx, KRS_CR0(%eax)
- andl $_BITNOT(CR0_WP), %ecx
- movl %ecx, %cr0
-#endif
- pushl %edi
- movl %eax, %edi
-
- /* Save the debug registers and disable any active watchpoints */
- pushl $7
- call kdi_dreg_get
- addl $4, %esp
-
- movl %eax, KRS_DRCTL(%edi)
- andl $_BITNOT(KDIREG_DRCTL_WPALLEN_MASK), %eax
-
- pushl %eax
- pushl $7
- call kdi_dreg_set
- addl $8, %esp
-
- pushl $6
- call kdi_dreg_get
- addl $4, %esp
- movl %eax, KRS_DRSTAT(%edi)
-
- pushl $0
- call kdi_dreg_get
- addl $4, %esp
- movl %eax, KRS_DROFF(0)(%edi)
-
- pushl $1
- call kdi_dreg_get
- addl $4, %esp
- movl %eax, KRS_DROFF(1)(%edi)
-
- pushl $2
- call kdi_dreg_get
- addl $4, %esp
- movl %eax, KRS_DROFF(2)(%edi)
-
- pushl $3
- call kdi_dreg_get
- addl $4, %esp
- movl %eax, KRS_DROFF(3)(%edi)
-
- movl %edi, %eax
- popl %edi
-
- clr %ebp /* stack traces should end here */
-
- pushl %eax
- call kdi_debugger_entry
- popl %eax
-
- jmp kdi_resume
-
- SET_SIZE(kdi_save_common_state)
-
-#endif /* !__lint */
-
-/*
- * Resume the world. The code that calls kdi_resume has already
- * decided whether or not to restore the IDT.
- */
-#if defined(__lint)
-void
-kdi_resume(void)
-{
-}
-#else /* __lint */
-
- /* cpusave in %eax */
- ENTRY_NP(kdi_resume)
-
- /*
- * Send this CPU back into the world
- */
-
-#if !defined(__xpv)
- movl KRS_CR0(%eax), %edx
- movl %edx, %cr0
-#endif
-
- pushl %edi
- movl %eax, %edi
-
- KDI_RESTORE_DEBUGGING_STATE
-
- popl %edi
-
-#if defined(__xpv)
- /*
- * kmdb might have set PS_T in the saved eflags, so we can't use
- * intr_restore, since that restores all of eflags; instead, just
- * pick up PS_IE from the saved eflags.
- */
- movl REG_OFF(KDIREG_EFLAGS)(%esp), %eax
- testl $PS_IE, %eax
- jz 2f
- STI
-2:
-#endif
-
- addl $8, %esp /* Discard savfp and savpc */
-
- popl %ss
- popl %gs
- popl %fs
- popl %es
- popl %ds
- popal
-
- addl $8, %esp /* Discard TRAPNO and ERROR */
-
- IRET
-
- SET_SIZE(kdi_resume)
-#endif /* __lint */
-
-#if !defined(__lint)
-
- ENTRY_NP(kdi_pass_to_kernel)
-
- /* pop cpusave, leaving %esp pointing to saved regs */
- popl %eax
-
- movl $KDI_CPU_STATE_NONE, KRS_CPU_STATE(%eax)
-
- /*
- * Find the trap and vector off the right kernel handler. The trap
- * handler will expect the stack to be in trap order, with %eip being
- * the last entry, so we'll need to restore all our regs.
- *
- * We're hard-coding the three cases where KMDB has installed permanent
- * handlers, since after we restore, we don't have registers to work
- * with; we can't use a global since other CPUs can easily pass through
- * here at the same time.
- *
- * Note that we handle T_DBGENTR since userspace might have tried it.
- */
- movl REG_OFF(KDIREG_TRAPNO)(%esp), %eax
- cmpl $T_SGLSTP, %eax
- je kpass_dbgtrap
- cmpl $T_BPTFLT, %eax
- je kpass_brktrap
- cmpl $T_DBGENTR, %eax
- je kpass_invaltrap
- /*
- * Hmm, unknown handler. Somebody forgot to update this when they
- * added a new trap interposition... try to drop back into kmdb.
- */
- int $T_DBGENTR
-
-kpass_dbgtrap:
- KDI_RESTORE_REGS()
- ljmp $KCS_SEL, $1f
-1: jmp %cs:dbgtrap
- /*NOTREACHED*/
-
-kpass_brktrap:
- KDI_RESTORE_REGS()
- ljmp $KCS_SEL, $2f
-2: jmp %cs:brktrap
- /*NOTREACHED*/
-
-kpass_invaltrap:
- KDI_RESTORE_REGS()
- ljmp $KCS_SEL, $3f
-3: jmp %cs:invaltrap
- /*NOTREACHED*/
-
- SET_SIZE(kdi_pass_to_kernel)
-
- /*
- * A minimal version of mdboot(), to be used by the master CPU only.
- */
- ENTRY_NP(kdi_reboot)
-
- pushl $AD_BOOT
- pushl $A_SHUTDOWN
- call *psm_shutdownf
- addl $8, %esp
-
-#if defined(__xpv)
- pushl $SHUTDOWN_reboot
- call HYPERVISOR_shutdown
-#else
- call reset
-#endif
- /*NOTREACHED*/
-
- SET_SIZE(kdi_reboot)
-
-#endif /* !__lint */
-
-#if defined(__lint)
-/*ARGSUSED*/
-void
-kdi_cpu_debug_init(kdi_cpusave_t *save)
-{
-}
-#else /* __lint */
-
- ENTRY_NP(kdi_cpu_debug_init)
- pushl %ebp
- movl %esp, %ebp
-
- pushl %edi
- pushl %ebx
-
- movl 8(%ebp), %edi
-
- KDI_RESTORE_DEBUGGING_STATE
-
- popl %ebx
- popl %edi
- leave
- ret
-
- SET_SIZE(kdi_cpu_debug_init)
-#endif /* !__lint */
-
diff --git a/usr/src/uts/intel/kdi/amd64/kdi_asm.s b/usr/src/uts/intel/kdi/kdi_asm.s
index ea6f404af4..9e5bbc110f 100644
--- a/usr/src/uts/intel/kdi/amd64/kdi_asm.s
+++ b/usr/src/uts/intel/kdi/kdi_asm.s
@@ -27,12 +27,13 @@
*/
/*
- * Debugger entry for both master and slave CPUs
+ * Debugger entry and exit for both master and slave CPUs. kdi_idthdl.s contains
+ * the IDT stubs that drop into here (mainly via kdi_cmnint).
*/
#if defined(__lint)
#include <sys/types.h>
-#endif
+#else
#include <sys/segments.h>
#include <sys/asm_linkage.h>
@@ -46,9 +47,6 @@
#ifdef __xpv
#include <sys/hypervisor.h>
#endif
-
-#ifdef _ASM
-
#include <kdi_assym.h>
#include <assym.h>
@@ -80,6 +78,9 @@
#ifdef __xpv
+/*
+ * Already on kernel gsbase via the hypervisor.
+ */
#define SAVE_GSBASE(reg) /* nothing */
#define RESTORE_GSBASE(reg) /* nothing */
@@ -90,8 +91,16 @@
rdmsr; \
shlq $32, %rdx; \
orq %rax, %rdx; \
- movq %rdx, REG_OFF(KDIREG_GSBASE)(base)
+ movq %rdx, REG_OFF(KDIREG_GSBASE)(base); \
+ movl $MSR_AMD_KGSBASE, %ecx; \
+ rdmsr; \
+ shlq $32, %rdx; \
+ orq %rax, %rdx; \
+ movq %rdx, REG_OFF(KDIREG_KGSBASE)(base)
+/*
+ * We shouldn't have stomped on KGSBASE, so don't try to restore it.
+ */
#define RESTORE_GSBASE(base) \
movq REG_OFF(KDIREG_GSBASE)(base), %rdx; \
movq %rdx, %rax; \
@@ -102,9 +111,7 @@
#endif /* __xpv */
/*
- * %ss, %rsp, %rflags, %cs, %rip, %err, %trapno are already on the stack. Note
- * that on the hypervisor, we skip the save/restore of GSBASE: it's slow, and
- * unnecessary.
+ * %ss, %rsp, %rflags, %cs, %rip, %err, %trapno are already on the stack.
*/
#define KDI_SAVE_REGS(base) \
movq %rdi, REG_OFF(KDIREG_RDI)(base); \
@@ -125,6 +132,8 @@
movq %rbp, REG_OFF(KDIREG_SAVFP)(base); \
movq REG_OFF(KDIREG_RIP)(base), %rax; \
movq %rax, REG_OFF(KDIREG_SAVPC)(base); \
+ movq %cr2, %rax; \
+ movq %rax, REG_OFF(KDIREG_CR2)(base); \
clrq %rax; \
movw %ds, %ax; \
movq %rax, REG_OFF(KDIREG_DS)(base); \
@@ -143,6 +152,8 @@
movw %ax, %es; \
movq REG_OFF(KDIREG_DS)(%rdi), %rax; \
movw %ax, %ds; \
+ movq REG_OFF(KDIREG_CR2)(base), %rax; \
+ movq %rax, %cr2; \
movq REG_OFF(KDIREG_R15)(%rdi), %r15; \
movq REG_OFF(KDIREG_R14)(%rdi), %r14; \
movq REG_OFF(KDIREG_R13)(%rdi), %r13; \
@@ -222,15 +233,6 @@
movq KRS_CURCRUMB(cpusave), tmp; \
movq value, offset(tmp)
-#endif /* _ASM */
-
-#if defined(__lint)
-void
-kdi_cmnint(void)
-{
-}
-#else /* __lint */
-
/* XXX implement me */
ENTRY_NP(kdi_nmiint)
clrq %rcx
@@ -280,6 +282,20 @@ kdi_cmnint(void)
shrq $32, %rdx
movl $MSR_AMD_GSBASE, %ecx
wrmsr
+
+ /*
+ * Switch to the kernel's %cr3. From the early interrupt handler
+ * until now we've been running on the "paranoid" %cr3 (that of kas
+ * from early in boot).
+ *
+ * Hopefully it's not corrupt!
+ */
+ mov %gs:CPU_KPTI_KCR3, %rdx
+ cmp $0, %rdx
+ je .zero_kcr3
+ mov %rdx, %cr3
+.zero_kcr3:
+
#endif /* __xpv */
GET_CPUSAVE_ADDR /* %rax = cpusave, %rbx = CPU ID */
@@ -303,13 +319,15 @@ kdi_cmnint(void)
*/
leaq kdi_memranges, %rcx
movl kdi_nmemranges, %edx
-1: cmpq MR_BASE(%rcx), %rsp
+1:
+ cmpq MR_BASE(%rcx), %rsp
jl 2f /* below this range -- try the next one */
cmpq MR_LIM(%rcx), %rsp
jg 2f /* above this range -- try the next one */
jmp 3f /* matched within this range */
-2: decl %edx
+2:
+ decl %edx
jz kdi_save_common_state /* %rsp not within debugger memory */
addq $MR_SIZE, %rcx
jmp 1b
@@ -339,8 +357,6 @@ kdi_cmnint(void)
SET_SIZE(kdi_master_entry)
SET_SIZE(kdi_cmnint)
-#endif /* __lint */
-
/*
* The cross-call handler for slave CPUs.
*
@@ -351,12 +367,6 @@ kdi_cmnint(void)
* master.
*/
-#if defined(__lint)
-void
-kdi_slave_entry(void)
-{
-}
-#else /* __lint */
ENTRY_NP(kdi_slave_entry)
/*
@@ -390,7 +400,7 @@ kdi_slave_entry(void)
addq $8, %rax
movq %rax, REG_OFF(KDIREG_RSP)(%rsp)
- /*
+ /*
* We've saved all of the general-purpose registers, and have a stack
* that is irettable (after we strip down to the error code)
*/
@@ -409,8 +419,6 @@ kdi_slave_entry(void)
SET_SIZE(kdi_slave_entry)
-#endif /* __lint */
-
/*
* The state of the world:
*
@@ -424,8 +432,6 @@ kdi_slave_entry(void)
* machine for debugger entry, and enter the debugger.
*/
-#if !defined(__lint)
-
ENTRY_NP(kdi_save_common_state)
popq %rdi /* the cpusave area */
@@ -495,19 +501,10 @@ kdi_slave_entry(void)
SET_SIZE(kdi_save_common_state)
-#endif /* !__lint */
-
/*
* Resume the world. The code that calls kdi_resume has already
* decided whether or not to restore the IDT.
*/
-#if defined(__lint)
-void
-kdi_resume(void)
-{
-}
-#else /* __lint */
-
/* cpusave in %rdi */
ENTRY_NP(kdi_resume)
@@ -524,14 +521,18 @@ kdi_resume(void)
movq KRS_GREGS(%rdi), %rsp
KDI_RESTORE_REGS(%rsp)
addq $REG_OFF(KDIREG_RIP), %rsp /* Discard state, trapno, err */
+ /*
+ * The common trampoline code will restore %cr3 to the right value
+ * for either kernel or userland.
+ */
+#if !defined(__xpv)
+ jmp tr_iret_auto
+#else
IRET
+#endif
/*NOTREACHED*/
SET_SIZE(kdi_resume)
-#endif /* __lint */
-
-#if !defined(__lint)
-
ENTRY_NP(kdi_pass_to_kernel)
popq %rdi /* cpusave */
@@ -564,7 +565,7 @@ kdi_resume(void)
* added a new trap interposition... try to drop back into kmdb.
*/
int $T_DBGENTR
-
+
#define CALL_TRAP_HANDLER(name) \
KDI_RESTORE_REGS(%rsp); \
/* Discard state, trapno, err */ \
@@ -602,16 +603,6 @@ kdi_resume(void)
SET_SIZE(kdi_reboot)
-#endif /* !__lint */
-
-#if defined(__lint)
-/*ARGSUSED*/
-void
-kdi_cpu_debug_init(kdi_cpusave_t *save)
-{
-}
-#else /* __lint */
-
ENTRY_NP(kdi_cpu_debug_init)
pushq %rbp
movq %rsp, %rbp
@@ -622,7 +613,32 @@ kdi_cpu_debug_init(kdi_cpusave_t *save)
leave
ret
-
SET_SIZE(kdi_cpu_debug_init)
-#endif /* !__lint */
+#define GETDREG(name, r) \
+ ENTRY_NP(name); \
+ movq r, %rax; \
+ ret; \
+ SET_SIZE(name)
+
+#define SETDREG(name, r) \
+ ENTRY_NP(name); \
+ movq %rdi, r; \
+ ret; \
+ SET_SIZE(name)
+
+ GETDREG(kdi_getdr0, %dr0)
+ GETDREG(kdi_getdr1, %dr1)
+ GETDREG(kdi_getdr2, %dr2)
+ GETDREG(kdi_getdr3, %dr3)
+ GETDREG(kdi_getdr6, %dr6)
+ GETDREG(kdi_getdr7, %dr7)
+
+ SETDREG(kdi_setdr0, %dr0)
+ SETDREG(kdi_setdr1, %dr1)
+ SETDREG(kdi_setdr2, %dr2)
+ SETDREG(kdi_setdr3, %dr3)
+ SETDREG(kdi_setdr6, %dr6)
+ SETDREG(kdi_setdr7, %dr7)
+
+#endif /* !__lint */
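
The GETDREG/SETDREG macros at the end of kdi_asm.s expand into one tiny accessor per debug register (kdi_getdr0 through kdi_setdr7), each a single movq to or from %rax or %rdi. Below is a hosted sketch of how such accessors get used, with stubbed stand-ins so it runs outside the kernel; the watchpoint-disable example is illustrative, not lifted from kmdb.

#include <stdint.h>
#include <stdio.h>

/*
 * Stubbed stand-ins for the accessors GETDREG/SETDREG generate
 * (kdi_getdr7/kdi_setdr7); in the kernel these are single movq instructions.
 */
static uintptr_t fake_dr7 = 0x455;

static uintptr_t kdi_getdr7(void) { return (fake_dr7); }
static void kdi_setdr7(uintptr_t v) { fake_dr7 = v; }

/* Example: disable all hardware watchpoints (clear DR7 L0-L3/G0-G3 bits). */
int
main(void)
{
	uintptr_t dr7 = kdi_getdr7();

	kdi_setdr7(dr7 & ~(uintptr_t)0xff);
	printf("dr7: %#lx -> %#lx\n", (unsigned long)dr7,
	    (unsigned long)fake_dr7);
	return (0);
}
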
diff --git a/usr/src/uts/intel/kdi/kdi_idt.c b/usr/src/uts/intel/kdi/kdi_idt.c
index 64e2b225d5..d801588954 100644
--- a/usr/src/uts/intel/kdi/kdi_idt.c
+++ b/usr/src/uts/intel/kdi/kdi_idt.c
@@ -78,6 +78,7 @@
#include <sys/kdi_impl.h>
#include <sys/x_call.h>
#include <ia32/sys/psw.h>
+#include <vm/hat_i86.h>
#define KDI_GATE_NVECS 3
@@ -116,7 +117,7 @@ typedef void idt_hdlr_f(void);
extern idt_hdlr_f kdi_trap0, kdi_trap1, kdi_int2, kdi_trap3, kdi_trap4;
extern idt_hdlr_f kdi_trap5, kdi_trap6, kdi_trap7, kdi_trap9;
extern idt_hdlr_f kdi_traperr10, kdi_traperr11, kdi_traperr12;
-extern idt_hdlr_f kdi_traperr13, kdi_traperr14, kdi_trap16, kdi_trap17;
+extern idt_hdlr_f kdi_traperr13, kdi_traperr14, kdi_trap16, kdi_traperr17;
extern idt_hdlr_f kdi_trap18, kdi_trap19, kdi_trap20, kdi_ivct32;
extern idt_hdlr_f kdi_invaltrap;
extern size_t kdi_ivct_size;
@@ -137,7 +138,7 @@ static const kdi_gate_spec_t kdi_gate_specs[KDI_GATE_NVECS] = {
static gate_desc_t kdi_kgates[KDI_GATE_NVECS];
-gate_desc_t kdi_idt[NIDT];
+extern gate_desc_t kdi_idt[NIDT];
struct idt_description {
uint_t id_low;
@@ -164,7 +165,7 @@ struct idt_description {
{ T_PGFLT, 0, kdi_traperr14, NULL },
{ 15, 0, kdi_invaltrap, NULL },
{ T_EXTERRFLT, 0, kdi_trap16, NULL },
- { T_ALIGNMENT, 0, kdi_trap17, NULL },
+ { T_ALIGNMENT, 0, kdi_traperr17, NULL },
{ T_MCE, 0, kdi_trap18, NULL },
{ T_SIMDFPE, 0, kdi_trap19, NULL },
{ T_DBGENTR, 0, kdi_trap20, NULL },
@@ -183,11 +184,16 @@ kdi_idt_init(selector_t sel)
uint_t high = id->id_high != 0 ? id->id_high : id->id_low;
size_t incr = id->id_incrp != NULL ? *id->id_incrp : 0;
+#if !defined(__xpv)
+ if (kpti_enable && sel == KCS_SEL && id->id_low == T_DBLFLT)
+ id->id_basehdlr = tr_syserrtrap;
+#endif
+
for (i = id->id_low; i <= high; i++) {
caddr_t hdlr = (caddr_t)id->id_basehdlr +
incr * (i - id->id_low);
set_gatesegd(&kdi_idt[i], (void (*)())hdlr, sel,
- SDT_SYSIGT, TRP_KPL, i);
+ SDT_SYSIGT, TRP_KPL, IST_DBG);
}
}
}
@@ -204,7 +210,7 @@ kdi_idt_gates_install(selector_t sel, int saveold)
const kdi_gate_spec_t *gs = &kdi_gate_specs[i];
uintptr_t func = GATESEG_GETOFFSET(&kdi_idt[gs->kgs_vec]);
set_gatesegd(&gates[i], (void (*)())func, sel, SDT_SYSIGT,
- gs->kgs_dpl, gs->kgs_vec);
+ gs->kgs_dpl, IST_DBG);
}
for (i = 0; i < KDI_GATE_NVECS; i++) {
@@ -390,9 +396,17 @@ kdi_trap_pass(kdi_cpusave_t *cpusave)
* See the comments in the kernel's T_SGLSTP handler for why we need to
* do this.
*/
+#if !defined(__xpv)
if (tt == T_SGLSTP &&
- (pc == (greg_t)sys_sysenter || pc == (greg_t)brand_sys_sysenter))
+ (pc == (greg_t)sys_sysenter || pc == (greg_t)brand_sys_sysenter ||
+ pc == (greg_t)tr_sys_sysenter ||
+ pc == (greg_t)tr_brand_sys_sysenter)) {
+#else
+ if (tt == T_SGLSTP &&
+ (pc == (greg_t)sys_sysenter || pc == (greg_t)brand_sys_sysenter)) {
+#endif
return (1);
+ }
return (0);
}
diff --git a/usr/src/uts/intel/kdi/kdi_idthdl.s b/usr/src/uts/intel/kdi/kdi_idthdl.s
index 359df2a8f8..510bb20fcb 100644
--- a/usr/src/uts/intel/kdi/kdi_idthdl.s
+++ b/usr/src/uts/intel/kdi/kdi_idthdl.s
@@ -26,22 +26,30 @@
*/
/*
- * Companion to kdi_idt.c - the implementation of the trap and interrupt
+ * Companion to kdi_asm.s - the implementation of the trap and interrupt
* handlers. For the most part, these handlers do the same thing - they
* push a trap number onto the stack, followed by a jump to kdi_cmnint.
* Each trap and interrupt has its own handler because each one pushes a
* different number.
*/
+#if defined(__lint)
+#include <sys/types.h>
+#else
+
#include <sys/asm_linkage.h>
+#include <sys/asm_misc.h>
+#include <sys/machprivregs.h>
+#include <sys/privregs.h>
#include <sys/kdi_regs.h>
+#include <sys/trap.h>
+#include <sys/param.h>
-/* Nothing in this file is of interest to lint. */
-#if !defined(__lint)
+#include <kdi_assym.h>
+#include <assym.h>
/*
- * The default ASM_ENTRY_ALIGN (16) wastes far too much space. Pay no
- * attention to the fleet of nop's we're adding to each handler.
+ * The default ASM_ENTRY_ALIGN (16) wastes far too much space.
*/
#undef ASM_ENTRY_ALIGN
#define ASM_ENTRY_ALIGN 8
@@ -50,65 +58,174 @@
* Generic trap and interrupt handlers.
*/
-#if defined(__xpv) && defined(__amd64)
+#if defined(__xpv)
-/*
- * The hypervisor places r11 and rcx on the stack.
- */
-
-#define TRAP_NOERR(trapno) \
- popq %rcx; \
- popq %r11; \
- pushq $trapno
-
-#define TRAP_ERR(trapno) \
- popq %rcx; \
- popq %r11; \
- pushq $0; \
- pushq $trapno
+#define INTERRUPT_TRAMPOLINE
#else
-#define TRAP_NOERR(trapno) \
- push $trapno
+/*
+ * If we're !xpv, then we will need to support KPTI (kernel page table
+ * isolation), where we have separate page tables for user and kernel modes.
+ * There's more detail about this in kpti_trampolines.s and hat_i86.c
+ */
-#define TRAP_ERR(trapno) \
- push $0; \
- push $trapno
+#define INTERRUPT_TRAMPOLINE \
+ pushq %r13; \
+ pushq %r14; \
+ subq $KPTI_R14, %rsp; \
+ /* Check for clobbering */ \
+ cmp $0, KPTI_FLAG(%rsp); \
+ je 1f; \
+ /* Don't worry, this totally works */ \
+ int $8; \
+1: \
+ movq $1, KPTI_FLAG(%rsp); \
+ /* Save current %cr3. */ \
+ mov %cr3, %r14; \
+ mov %r14, KPTI_TR_CR3(%rsp); \
+ /* Switch to paranoid %cr3. */ \
+ mov kpti_safe_cr3, %r14; \
+ mov %r14, %cr3; \
+ \
+ cmpw $KCS_SEL, KPTI_CS(%rsp); \
+ je 3f; \
+2: \
+ /* Get our cpu_t in %r13 */ \
+ mov %rsp, %r13; \
+ and $(~(MMU_PAGESIZE - 1)), %r13; \
+ subq $CPU_KPTI_START, %r13; \
+ /* Use top of the kthread stk */ \
+ mov CPU_THREAD(%r13), %r14; \
+ mov T_STACK(%r14), %r14; \
+ addq $REGSIZE+MINFRAME, %r14; \
+ jmp 5f; \
+3: \
+ /* Check the %rsp in the frame. */ \
+ /* Is it above kernel base? */ \
+ mov kpti_kbase, %r14; \
+ cmp %r14, KPTI_RSP(%rsp); \
+ jb 2b; \
+ /* Is it within the kpti_frame page? */ \
+ mov %rsp, %r13; \
+ and $(~(MMU_PAGESIZE - 1)), %r13; \
+ mov KPTI_RSP(%rsp), %r14; \
+ and $(~(MMU_PAGESIZE - 1)), %r14; \
+ cmp %r13, %r14; \
+ je 2b; \
+ /* Use the %rsp from the trap frame. */ \
+ /* We already did %cr3. */ \
+ mov KPTI_RSP(%rsp), %r14; \
+ and $(~0xf), %r14; \
+5: \
+ mov %rsp, %r13; \
+ /* %r14 contains our destination stk */ \
+ mov %r14, %rsp; \
+ pushq KPTI_SS(%r13); \
+ pushq KPTI_RSP(%r13); \
+ pushq KPTI_RFLAGS(%r13); \
+ pushq KPTI_CS(%r13); \
+ pushq KPTI_RIP(%r13); \
+ pushq KPTI_ERR(%r13); \
+ mov KPTI_R14(%r13), %r14; \
+ movq $0, KPTI_FLAG(%r13); \
+ mov KPTI_R13(%r13), %r13
-#endif /* __xpv && __amd64 */
+#endif /* !__xpv */
#define MKIVCT(n) \
ENTRY_NP(kdi_ivct/**/n/**/); \
- TRAP_ERR(n); \
+ XPV_TRAP_POP; \
+ push $0; /* err */ \
+ INTERRUPT_TRAMPOLINE; \
+ push $n; \
jmp kdi_cmnint; \
SET_SIZE(kdi_ivct/**/n/**/)
#define MKTRAPHDLR(n) \
ENTRY_NP(kdi_trap/**/n); \
- TRAP_ERR(n); \
+ XPV_TRAP_POP; \
+ push $0; /* err */ \
+ INTERRUPT_TRAMPOLINE; \
+ push $n; \
jmp kdi_cmnint; \
SET_SIZE(kdi_trap/**/n/**/)
#define MKTRAPERRHDLR(n) \
ENTRY_NP(kdi_traperr/**/n); \
- TRAP_NOERR(n); \
+ XPV_TRAP_POP; \
+ INTERRUPT_TRAMPOLINE; \
+ push $n; \
jmp kdi_cmnint; \
SET_SIZE(kdi_traperr/**/n)
+#if !defined(__xpv)
+#define MKNMIHDLR \
+ ENTRY_NP(kdi_int2); \
+ push $0; \
+ push $2; \
+ pushq %r13; \
+ mov kpti_safe_cr3, %r13; \
+ mov %r13, %cr3; \
+ popq %r13; \
+ jmp kdi_nmiint; \
+ SET_SIZE(kdi_int2)
+
+#define MKMCEHDLR \
+ ENTRY_NP(kdi_trap18); \
+ push $0; \
+ push $18; \
+ pushq %r13; \
+ mov kpti_safe_cr3, %r13; \
+ mov %r13, %cr3; \
+ popq %r13; \
+ jmp kdi_cmnint; \
+ SET_SIZE(kdi_trap18)
+#else
#define MKNMIHDLR \
ENTRY_NP(kdi_int2); \
- TRAP_NOERR(2); \
+ push $0; \
+ push $2; \
jmp kdi_nmiint; \
SET_SIZE(kdi_int2)
+#define MKMCEHDLR \
+ ENTRY_NP(kdi_trap18); \
+ push $0; \
+ push $18; \
+ jmp kdi_cmnint; \
+ SET_SIZE(kdi_trap18)
+#endif
+
+/*
+ * The only way we should reach here is by an explicit "int 0x.." which is
+ * defined not to push an error code.
+ */
#define MKINVALHDLR \
ENTRY_NP(kdi_invaltrap); \
- TRAP_NOERR(255); \
+ XPV_TRAP_POP; \
+ push $0; /* err */ \
+ INTERRUPT_TRAMPOLINE; \
+ push $255; \
jmp kdi_cmnint; \
SET_SIZE(kdi_invaltrap)
+ .data
+ DGDEF3(kdi_idt, 16 * NIDT, MMU_PAGESIZE)
+ .fill MMU_PAGESIZE, 1, 0
+
+#if !defined(__xpv)
+.section ".text"
+.align MMU_PAGESIZE
+.global kdi_isr_start
+kdi_isr_start:
+ nop
+
+.global kpti_safe_cr3
+.global kpti_kbase
+#endif
+
/*
* The handlers themselves
*/
@@ -125,8 +242,7 @@
MKTRAPHDLR(9)
MKTRAPHDLR(15)
MKTRAPHDLR(16)
- MKTRAPHDLR(17)
- MKTRAPHDLR(18)
+ MKMCEHDLR/*18*/
MKTRAPHDLR(19)
MKTRAPHDLR(20)
@@ -136,11 +252,12 @@
MKTRAPERRHDLR(12)
MKTRAPERRHDLR(13)
MKTRAPERRHDLR(14)
+ MKTRAPERRHDLR(17)
.globl kdi_ivct_size
kdi_ivct_size:
.NWORD [kdi_ivct33-kdi_ivct32]
-
+
/* 10 billion and one interrupt handlers */
kdi_ivct_base:
MKIVCT(32); MKIVCT(33); MKIVCT(34); MKIVCT(35);
@@ -200,4 +317,12 @@ kdi_ivct_base:
MKIVCT(248); MKIVCT(249); MKIVCT(250); MKIVCT(251);
MKIVCT(252); MKIVCT(253); MKIVCT(254); MKIVCT(255);
+#if !defined(__xpv)
+.section ".text"
+.align MMU_PAGESIZE
+.global kdi_isr_end
+kdi_isr_end:
+ nop
#endif
+
+#endif /* !__lint */
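
The heart of the new INTERRUPT_TRAMPOLINE macro is one decision: after switching to the paranoid %cr3, pick a safe stack on which to rebuild the trap frame. Traps from userland, or kernel traps whose saved %rsp is suspect (below kpti_kbase, or pointing back into the kpti_frame page itself), get the top of the current kthread's stack; otherwise the interrupted kernel %rsp is reused, 16-byte aligned. A compact C rendering of just that decision, with invented names for the inputs:

#include <stdint.h>

#define MMU_PAGESIZE	4096UL

/* Illustrative inputs; in the macro these come from the kpti_frame fields. */
struct trampoline_ctx {
	uint64_t frame_cs;		/* saved %cs from the trap frame */
	uint64_t frame_rsp;		/* saved %rsp from the trap frame */
	uint64_t kpti_frame_page;	/* this CPU's kpti_frame page address */
	uint64_t kthread_stack_top;	/* top of curthread's kernel stack */
	uint64_t kpti_kbase;		/* lowest kernel virtual address */
	uint64_t kcs_sel;		/* kernel code segment selector */
};

/* Decide which stack the trampoline copies the hardware trap frame onto. */
static uint64_t
sketch_pick_stack(const struct trampoline_ctx *c)
{
	if (c->frame_cs != c->kcs_sel)
		return (c->kthread_stack_top);	/* trap came from userland */
	if (c->frame_rsp < c->kpti_kbase)
		return (c->kthread_stack_top);	/* kernel %rsp looks bogus */
	if ((c->frame_rsp & ~(MMU_PAGESIZE - 1)) ==
	    (c->kpti_frame_page & ~(MMU_PAGESIZE - 1)))
		return (c->kthread_stack_top);	/* %rsp is inside kpti_frame */
	return (c->frame_rsp & ~0xfUL);		/* reuse it, 16-byte aligned */
}
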
diff --git a/usr/src/uts/intel/kdi/kdi_offsets.in b/usr/src/uts/intel/kdi/kdi_offsets.in
index 212fdc9f4c..c9228de978 100644
--- a/usr/src/uts/intel/kdi/kdi_offsets.in
+++ b/usr/src/uts/intel/kdi/kdi_offsets.in
@@ -25,8 +25,6 @@
\
\ CPU-save structure offsets for use in assembly code.
\
-\ Keep in sync with kdi_state.h
-\
#include <sys/cpuvar.h>
#include <sys/kdi_impl.h>
@@ -60,16 +58,9 @@ kdi_cpusave_t KRS_SIZE
krs_curcrumb
krs_crumbs
-cpu
- cpu_id
-
greg_t KREG_SIZE
-#if defined(__amd64)
\#define REG_SHIFT 3
-#else
-\#define REG_SHIFT 2
-#endif
\#define DRADDR_IDX(num) _CONST(_MUL(num, DR_ADDR_INCR))
\#define DRADDR_OFF(num) _CONST(DRADDR_IDX(num) + DR_ADDR)
diff --git a/usr/src/uts/intel/sys/archsystm.h b/usr/src/uts/intel/sys/archsystm.h
index 9ca38f823c..4d14e58880 100644
--- a/usr/src/uts/intel/sys/archsystm.h
+++ b/usr/src/uts/intel/sys/archsystm.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_ARCHSYSTM_H
@@ -80,22 +80,26 @@ extern void int20(void);
extern void int_cmci(void);
#if defined(__amd64)
-extern void sys_syscall();
-extern void sys_syscall32();
+extern void sys_syscall(), tr_sys_syscall();
+extern void sys_syscall32(), tr_sys_syscall32();
extern void sys_lcall32();
extern void sys_syscall_int();
-extern void brand_sys_syscall();
-extern void brand_sys_syscall32();
-extern void brand_sys_syscall_int();
+extern void tr_sys_syscall_int();
+extern void brand_sys_syscall(), tr_brand_sys_syscall();
+extern void brand_sys_syscall32(), tr_brand_sys_syscall32();
+extern void brand_sys_syscall_int(), tr_brand_sys_syscall_int();
extern int update_sregs();
extern void reset_sregs();
#elif defined(__i386)
extern void sys_call();
+extern void tr_sys_call();
extern void brand_sys_call();
#endif
extern void sys_sysenter();
+extern void tr_sys_sysenter();
extern void _sys_sysenter_post_swapgs();
extern void brand_sys_sysenter();
+extern void tr_brand_sys_sysenter();
extern void _brand_sys_sysenter_post_swapgs();
extern void dosyscall(void);
diff --git a/usr/src/uts/intel/sys/segments.h b/usr/src/uts/intel/sys/segments.h
index 5368f80735..84eb363f00 100644
--- a/usr/src/uts/intel/sys/segments.h
+++ b/usr/src/uts/intel/sys/segments.h
@@ -2,7 +2,7 @@
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_SEGMENTS_H
@@ -98,29 +98,30 @@ extern "C" {
*/
#if defined(__xpv)
-#if defined(__amd64)
-
#define SEL_XPL 0 /* hypervisor privilege level */
#define SEL_KPL 3 /* both kernel and user in ring 3 */
#define TRP_KPL 1 /* system gate priv (user blocked) */
-
-#elif defined(__i386)
-
-#define SEL_XPL 0 /* hypervisor privilege level */
-#define SEL_KPL 1 /* kernel privilege level */
-#define TRP_KPL SEL_KPL /* system gate priv (user blocked) */
-
-#endif /* __i386 */
-
#define TRP_XPL 0 /* system gate priv (hypervisor) */
+#define IST_DBG 0
+
#else /* __xpv */
#define SEL_KPL 0 /* kernel privilege level on metal */
#define TRP_KPL SEL_KPL /* system gate priv (user blocked) */
+
+#define IST_DF 1
+#define IST_NMI 2
+#define IST_MCE 3
+#define IST_DBG 4
+#define IST_NESTABLE 5
+#define IST_DEFAULT 6
+
#endif /* __xpv */
+#define IST_NONE 0
+
#define SEL_UPL 3 /* user priority level */
#define TRP_UPL 3 /* system gate priv (user allowed) */
#define SEL_TI_LDT 4 /* local descriptor table */
@@ -401,6 +402,8 @@ extern void set_usegd(user_desc_t *, void *, size_t, uint_t, uint_t,
#endif /* __i386 */
+extern uint_t idt_vector_to_ist(uint_t);
+
extern void set_gatesegd(gate_desc_t *, void (*)(void), selector_t,
uint_t, uint_t, uint_t);
@@ -646,6 +649,10 @@ void init_boot_gdt(user_desc_t *);
#define MINNLDT 512 /* Current min solaris ldt size (1 4K page) */
#define MAXNLDT 8192 /* max solaris ldt size (16 4K pages) */
+#ifdef _KERNEL
+#define LDT_CPU_SIZE (16 * 4096) /* Size of kernel per-CPU allocation */
+#endif
+
#ifndef _ASM
extern gate_desc_t *idt0;
@@ -688,10 +695,29 @@ extern void sys_int80();
extern void brand_sys_int80();
extern void dtrace_ret();
+/* KPTI trampolines */
+extern void tr_invaltrap();
+extern void tr_div0trap(), tr_dbgtrap(), tr_nmiint(), tr_brktrap();
+extern void tr_ovflotrap(), tr_boundstrap(), tr_invoptrap(), tr_ndptrap();
+#if !defined(__xpv)
+extern void tr_syserrtrap();
+#endif
+extern void tr_invaltrap(), tr_invtsstrap(), tr_segnptrap(), tr_stktrap();
+extern void tr_gptrap(), tr_pftrap(), tr_ndperr();
+extern void tr_overrun(), tr_resvtrap();
+extern void tr_achktrap(), tr_mcetrap();
+extern void tr_xmtrap();
+extern void tr_fasttrap();
+extern void tr_sys_int80();
+extern void tr_brand_sys_int80();
+extern void tr_dtrace_ret();
+
#if !defined(__amd64)
extern void pentium_pftrap();
#endif
+extern uint64_t kpti_enable;
+
#endif /* _ASM */
#ifdef __cplusplus