author     John Levon <john.levon@joyent.com>   2018-02-22 18:05:13 -0800
committer  John Levon <john.levon@joyent.com>   2018-03-13 20:33:26 +0000
commit     60f89b42cd13d6888f948d7ffe4edcfa535e02a6 (patch)
tree       c60e2fa99bc8572c457a0908105f8570c56b834c /usr/src
parent     0e957fcabecc0abb13226b12f474359f4ea711ea (diff)
download   illumos-joyent-60f89b42cd13d6888f948d7ffe4edcfa535e02a6.tar.gz
OS-6546 Use PCID if KPTI is enabled
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Alex Wilson <alex.wilson@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Alex Wilson <alex.wilson@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r--   usr/src/cmd/mdb/i86pc/modules/unix/unix.c    10
-rw-r--r--   usr/src/cmd/mdb/intel/kmdb/kaif.c             5
-rw-r--r--   usr/src/uts/i86pc/io/gfx_private/gfxp_vm.c    6
-rw-r--r--   usr/src/uts/i86pc/ml/fb_swtch_src.s           5
-rw-r--r--   usr/src/uts/i86pc/ml/kpti_trampolines.s      32
-rw-r--r--   usr/src/uts/i86pc/ml/mpcore.s                 6
-rw-r--r--   usr/src/uts/i86pc/os/cpuid.c                 49
-rw-r--r--   usr/src/uts/i86pc/os/fakebop.c                8
-rw-r--r--   usr/src/uts/i86pc/os/mach_kdi.c              10
-rw-r--r--   usr/src/uts/i86pc/os/mlsetup.c               15
-rw-r--r--   usr/src/uts/i86pc/os/mp_pc.c                 19
-rw-r--r--   usr/src/uts/i86pc/os/mp_startup.c             2
-rw-r--r--   usr/src/uts/i86pc/os/startup.c                8
-rw-r--r--   usr/src/uts/i86pc/os/trap.c                   2
-rw-r--r--   usr/src/uts/i86pc/sys/mach_mmu.h             21
-rw-r--r--   usr/src/uts/i86pc/sys/machcpuvar.h            4
-rw-r--r--   usr/src/uts/i86pc/sys/pc_mmu.h               15
-rw-r--r--   usr/src/uts/i86pc/sys/rm_platter.h            2
-rw-r--r--   usr/src/uts/i86pc/vm/hat_i86.c              368
-rw-r--r--   usr/src/uts/i86pc/vm/hat_i86.h               60
-rw-r--r--   usr/src/uts/i86pc/vm/hat_kdi.c               21
-rw-r--r--   usr/src/uts/i86pc/vm/hat_pte.h                7
-rw-r--r--   usr/src/uts/i86pc/vm/htable.c                15
-rw-r--r--   usr/src/uts/i86pc/vm/htable.h                 1
-rw-r--r--   usr/src/uts/i86pc/vm/i86_mmu.c              289
-rw-r--r--   usr/src/uts/i86pc/vm/kboot_mmu.c             14
-rw-r--r--   usr/src/uts/i86pc/vm/vm_machdep.c            87
-rw-r--r--   usr/src/uts/i86xpv/os/xpv_panic.c             4
-rw-r--r--   usr/src/uts/intel/asm/htable.h               15
-rw-r--r--   usr/src/uts/intel/asm/mmu.h                  52
-rw-r--r--   usr/src/uts/intel/ia32/ml/i86_subr.s         19
-rw-r--r--   usr/src/uts/intel/ia32/os/desctbls.c          3
-rw-r--r--   usr/src/uts/intel/sys/controlregs.h          14
-rw-r--r--   usr/src/uts/intel/sys/x86_archext.h          12
34 files changed, 761 insertions, 439 deletions
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c
index 95e588eed6..224cfb4a18 100644
--- a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c
+++ b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c
@@ -1010,18 +1010,18 @@ crregs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
cr2 = kmdb_unix_getcr2();
cr3 = kmdb_unix_getcr3();
cr4 = kmdb_unix_getcr4();
- mdb_printf("%%cr0 = 0x%08x <%b>\n", cr0, cr0, cr0_flag_bits);
- mdb_printf("%%cr2 = 0x%08x <%a>\n", cr2, cr2);
+ mdb_printf("%%cr0 = 0x%lx <%b>\n", cr0, cr0, cr0_flag_bits);
+ mdb_printf("%%cr2 = 0x%lx <%a>\n", cr2, cr2);
if ((cr4 & CR4_PCIDE)) {
- mdb_printf("%%cr3 = 0x%08x <pfn:0x%lx pcid:%u>\n",
+ mdb_printf("%%cr3 = 0x%lx <pfn:0x%lx pcid:%lu>\n", cr3,
cr3 >> MMU_PAGESHIFT, cr3 & MMU_PAGEOFFSET);
} else {
- mdb_printf("%%cr3 = 0x%08x <pfn:0x%lx flags:%b>\n", cr3,
+ mdb_printf("%%cr3 = 0x%lx <pfn:0x%lx flags:%b>\n", cr3,
cr3 >> MMU_PAGESHIFT, cr3, cr3_flag_bits);
}
- mdb_printf("%%cr4 = 0x%08x <%b>\n", cr4, cr4, cr4_flag_bits);
+ mdb_printf("%%cr4 = 0x%lx <%b>\n", cr4, cr4, cr4_flag_bits);
return (DCMD_OK);
}
diff --git a/usr/src/cmd/mdb/intel/kmdb/kaif.c b/usr/src/cmd/mdb/intel/kmdb/kaif.c
index dda6a94ea6..55754f4130 100644
--- a/usr/src/cmd/mdb/intel/kmdb/kaif.c
+++ b/usr/src/cmd/mdb/intel/kmdb/kaif.c
@@ -265,13 +265,16 @@ kaif_set_register(const char *regname, kreg_t val)
static boolean_t
kaif_toxic_text(uintptr_t addr)
{
- static GElf_Sym toxic_syms[1] = { 0, };
+ static GElf_Sym toxic_syms[2] = { 0, };
size_t i;
if (toxic_syms[0].st_name == NULL) {
if (mdb_tgt_lookup_by_name(mdb.m_target, MDB_TGT_OBJ_EXEC,
"tr_iret_user", &toxic_syms[0], NULL) != 0)
warn("couldn't find tr_iret_user\n");
+ if (mdb_tgt_lookup_by_name(mdb.m_target, MDB_TGT_OBJ_EXEC,
+ "tr_mmu_flush_user_range", &toxic_syms[1], NULL) != 0)
+ warn("couldn't find tr_mmu_flush_user_range\n");
}
for (i = 0; i < ARRAY_SIZE(toxic_syms); i++) {
diff --git a/usr/src/uts/i86pc/io/gfx_private/gfxp_vm.c b/usr/src/uts/i86pc/io/gfx_private/gfxp_vm.c
index 4934de54e6..f46515838f 100644
--- a/usr/src/uts/i86pc/io/gfx_private/gfxp_vm.c
+++ b/usr/src/uts/i86pc/io/gfx_private/gfxp_vm.c
@@ -22,6 +22,8 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/debug.h>
@@ -99,7 +101,7 @@ gfxp_map_kernel_space(uint64_t start, size_t size, uint32_t mode)
* The hypervisor doesn't allow r/w mappings to some pages, such as
* page tables, gdt, etc. Detect %cr3 to notify users of this interface.
*/
- if (start == mmu_ptob(mmu_btop(getcr3())))
+ if (start == mmu_ptob(mmu_btop(getcr3_pa())))
return (0);
#endif
@@ -318,7 +320,7 @@ gfxp_load_kernel_space(uint64_t start, size_t size,
* The hypervisor doesn't allow r/w mappings to some pages, such as
* page tables, gdt, etc. Detect %cr3 to notify users of this interface.
*/
- if (start == mmu_ptob(mmu_btop(getcr3())))
+ if (start == mmu_ptob(mmu_btop(getcr3_pa())))
return;
#endif
diff --git a/usr/src/uts/i86pc/ml/fb_swtch_src.s b/usr/src/uts/i86pc/ml/fb_swtch_src.s
index e67837ee2b..4d1789fc9b 100644
--- a/usr/src/uts/i86pc/ml/fb_swtch_src.s
+++ b/usr/src/uts/i86pc/ml/fb_swtch_src.s
@@ -22,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2018 Joyent, Inc.
*/
@@ -52,6 +53,9 @@ int fb_swtch_silence_lint = 0;
#define DISABLE_PAGING \
+ movl %cr4, %eax ;\
+ btrl $17, %eax /* clear PCIDE bit */ ;\
+ movl %eax, %cr4 ;\
movl %cr0, %eax ;\
btrl $31, %eax /* clear PG bit */ ;\
movl %eax, %cr0
@@ -222,6 +226,7 @@ _start:
* Disable long mode by:
* - shutting down paging (bit 31 of cr0). This will flush the
* TLBs.
+ * - turning off PCID in cr4
* - disabling LME (long mode enable) in EFER (extended feature reg)
*/
#endif
diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s
index c05718c3ad..2db2d5acfa 100644
--- a/usr/src/uts/i86pc/ml/kpti_trampolines.s
+++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s
@@ -92,6 +92,9 @@
* We do not do any stack pivoting for syscalls (and we leave SYSENTER's
* existing %rsp pivot untouched) -- instead we spill registers into
* %gs:CPU_KPTI_* as we need to.
+ *
+ * Note that the normal %cr3 values do not cause invalidations with PCIDE - see
+ * hat_switch().
*/
/*
@@ -705,6 +708,35 @@ tr_intr_ret_end:
MKIVCT(248); MKIVCT(249); MKIVCT(250); MKIVCT(251);
MKIVCT(252); MKIVCT(253); MKIVCT(254); MKIVCT(255);
+ /*
+ * We're PCIDE, but we don't have INVPCID. The only way to invalidate a
+ * PCID other than the current one, then, is to load its cr3 then
+ * invlpg. But loading kf_user_cr3 means we can no longer access our
+ * caller's text mapping (or indeed, its stack). So this little helper
+ * has to live within our trampoline text region.
+ *
+ * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3)
+ */
+ ENTRY_NP(tr_mmu_flush_user_range)
+ push %rbx
+ /* When we read cr3, it never has the NOINVL bit set. */
+ mov %cr3, %rax
+ movq $CR3_NOINVL_BIT, %rbx
+ orq %rbx, %rax
+
+ mov %rcx, %cr3
+ add %rdi, %rsi
+.align ASM_ENTRY_ALIGN
+1:
+ invlpg (%rdi)
+ add %rdx, %rdi
+ cmp %rsi, %rdi
+ jb 1b
+ mov %rax, %cr3
+ pop %rbx
+ retq
+ SET_SIZE(tr_mmu_flush_user_range)
+
.align MMU_PAGESIZE
.global kpti_tramp_end
kpti_tramp_end:
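
As a rough sketch of how the emulated-INVPCID path in i86_mmu.c (further down
in this diff) drives this helper: interrupts stay blocked while the user cr3 is
live, and kf_user_cr3 must already be set up.

    ulong_t flag = intr_clear();
    ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
    tr_mmu_flush_user_range(addr, len, pgsz,
        CPU->cpu_m.mcpu_kpti.kf_user_cr3);
    intr_restore(flag);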
diff --git a/usr/src/uts/i86pc/ml/mpcore.s b/usr/src/uts/i86pc/ml/mpcore.s
index eaf70b72df..2151a14b04 100644
--- a/usr/src/uts/i86pc/ml/mpcore.s
+++ b/usr/src/uts/i86pc/ml/mpcore.s
@@ -24,6 +24,8 @@
/*
* Copyright (c) 2010, Intel Corporation.
* All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/asm_linkage.h>
@@ -326,7 +328,7 @@ kernel_cs_code:
* Complete the rest of the setup and call mp_startup().
*/
movq %gs:CPU_THREAD, %rax /* get thread ptr */
- call *T_PC(%rax) /* call mp_startup */
+ call *T_PC(%rax) /* call mp_startup_boot */
/* not reached */
int $20 /* whoops, returned somehow! */
@@ -502,7 +504,7 @@ kernel_cs_code:
/*
* Before going any farther, enable usage of page table NX bit if
- * that's how our page tables are set up.
+ * that's how our page tables are set up. (PCIDE is enabled later on).
*/
bt $X86FSET_NX, x86_featureset
jnc 1f
diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c
index f44c75acd2..289ce29183 100644
--- a/usr/src/uts/i86pc/os/cpuid.c
+++ b/usr/src/uts/i86pc/os/cpuid.c
@@ -32,7 +32,7 @@
* Portions Copyright 2009 Advanced Micro Devices, Inc.
*/
/*
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
* Various routines to handle identification
@@ -58,6 +58,7 @@
#include <sys/memnode.h>
#include <sys/pci_cfgspace.h>
#include <sys/comm_page.h>
+#include <sys/mach_mmu.h>
#include <sys/tsc.h>
#ifdef __xpv
@@ -83,7 +84,7 @@
* x86_vendor accordingly.
* o Processing the feature flags returned by the cpuid instruction while
* applying any workarounds or tricks for the specific processor.
- * o Mapping the feature flags into Solaris feature bits (X86_*).
+ * o Mapping the feature flags into illumos feature bits (X86_*).
* o Processing extended feature flags if supported by the processor,
* again while applying specific processor knowledge.
* o Determining the CMT characteristics of the system.
@@ -122,6 +123,14 @@ uint_t x86_vendor = X86_VENDOR_IntelClone;
uint_t x86_type = X86_TYPE_OTHER;
uint_t x86_clflush_size = 0;
+#if defined(__xpv)
+int x86_use_pcid = 0;
+int x86_use_invpcid = 0;
+#else
+int x86_use_pcid = -1;
+int x86_use_invpcid = -1;
+#endif
+
uint_t pentiumpro_bug4046376;
uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)];
@@ -196,6 +205,8 @@ static char *x86_feature_names[NUM_X86_FEATURES] = {
"umip",
"pku",
"ospke",
+ "pcid",
+ "invpcid",
};
boolean_t
@@ -1302,6 +1313,10 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP)
add_x86_feature(featureset, X86FSET_SMEP);
+ if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID) {
+ add_x86_feature(featureset, X86FSET_INVPCID);
+ }
+
/*
* We check disable_smap here in addition to in startup_smap()
* to ensure CPUs that aren't the boot CPU don't accidentally
@@ -1504,6 +1519,13 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset)
}
}
}
+
+ if (cpi->cpi_vendor == X86_VENDOR_Intel) {
+ if (cp->cp_ecx & CPUID_INTC_ECX_PCID) {
+ add_x86_feature(featureset, X86FSET_PCID);
+ }
+ }
+
if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) {
add_x86_feature(featureset, X86FSET_X2APIC);
}
@@ -5003,6 +5025,29 @@ post_startup_cpu_fixups(void)
#endif /* !__xpv */
}
+void
+enable_pcid(void)
+{
+ if (x86_use_pcid == -1)
+ x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID);
+
+ if (x86_use_invpcid == -1) {
+ x86_use_invpcid = is_x86_feature(x86_featureset,
+ X86FSET_INVPCID);
+ }
+
+ if (!x86_use_pcid)
+ return;
+
+ /*
+ * Intel say that on setting PCIDE, it immediately starts using the PCID
+ * bits; better make sure there's nothing there.
+ */
+ ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE);
+
+ setcr4(getcr4() | CR4_PCIDE);
+}
+
/*
* Setup necessary registers to enable XSAVE feature on this processor.
* This function needs to be called early enough, so that no xsave/xrstor
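
For orientation, enable_pcid() is invoked once per CPU by other hunks in this
diff, roughly as follows (a summary of the startup.c and mp_startup.c changes
below):

    /* boot CPU, startup_end(): only after the last real-mode BIOS call */
    immu_startup();
    enable_pcid();

    /* secondary CPUs, during mp_startup_common() */
    cp->cpu_flags &= ~(CPU_POWEROFF | CPU_QUIESCED);
    enable_pcid();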
diff --git a/usr/src/uts/i86pc/os/fakebop.c b/usr/src/uts/i86pc/os/fakebop.c
index 9379072264..a62e45d89d 100644
--- a/usr/src/uts/i86pc/os/fakebop.c
+++ b/usr/src/uts/i86pc/os/fakebop.c
@@ -26,7 +26,7 @@
* Copyright (c) 2010, Intel Corporation.
* All rights reserved.
*
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc. All rights reserved.
*/
/*
@@ -847,6 +847,12 @@ do_bsys_doint(bootops_t *bop, int intnum, struct bop_regs *rp)
bios_regs_t br;
/*
+ * We're about to disable paging; we shouldn't be PCID enabled.
+ */
+ if (getcr4() & CR4_PCIDE)
+ prom_panic("do_bsys_doint() with PCID enabled\n");
+
+ /*
* The first time we do this, we have to copy the pre-packaged
* low memory bios call code image into place.
*/
diff --git a/usr/src/uts/i86pc/os/mach_kdi.c b/usr/src/uts/i86pc/os/mach_kdi.c
index ce8255cdd8..60ca8c9fca 100644
--- a/usr/src/uts/i86pc/os/mach_kdi.c
+++ b/usr/src/uts/i86pc/os/mach_kdi.c
@@ -21,10 +21,10 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Kernel/Debugger Interface (KDI) routines. Called during debugger under
* various system states (boot, while running, while the debugger has control).
@@ -113,12 +113,6 @@ kdi_dreg_set(int reg, ulong_t value)
}
}
-void
-kdi_flush_caches(void)
-{
- reload_cr3();
-}
-
extern void kdi_slave_entry(void);
void
diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c
index 19f0f5f676..09bf07848c 100644
--- a/usr/src/uts/i86pc/os/mlsetup.c
+++ b/usr/src/uts/i86pc/os/mlsetup.c
@@ -148,17 +148,24 @@ mlsetup(struct regs *rp)
else
cpuid_feature_edx_exclude = (uint32_t)prop_value;
-#if defined(__amd64) && !defined(__xpv)
+#if !defined(__xpv)
/*
* Check to see if KPTI has been explicitly enabled or disabled.
* We have to check this before init_desctbls().
*/
- if (bootprop_getval("kpti", &prop_value) != 0) {
- kpti_enable = 1;
- } else {
+ if (bootprop_getval("kpti", &prop_value) == 0) {
kpti_enable = (uint64_t)(prop_value == 1);
prom_printf("unix: forcing kpti to %s due to boot argument\n",
(kpti_enable == 1) ? "ON" : "OFF");
+ } else {
+ kpti_enable = 1;
+ }
+
+ if (bootprop_getval("pcid", &prop_value) == 0 && prop_value == 0) {
+ prom_printf("unix: forcing pcid to OFF due to boot argument\n");
+ x86_use_pcid = 0;
+ } else if (kpti_enable != 1) {
+ x86_use_pcid = 0;
}
#endif
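
These properties come from the boot loader; assuming the usual illumos -B
boot-argument mechanism (a usage sketch, not shown in this diff), PCID alone
could be disabled with:

    -B pcid=0

and setting -B kpti=0 disables KPTI, which per the code above forces
x86_use_pcid to 0 as well.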
diff --git a/usr/src/uts/i86pc/os/mp_pc.c b/usr/src/uts/i86pc/os/mp_pc.c
index 4e12703395..98fa4cc131 100644
--- a/usr/src/uts/i86pc/os/mp_pc.c
+++ b/usr/src/uts/i86pc/os/mp_pc.c
@@ -133,10 +133,11 @@ rmp_gdt_init(rm_platter_t *rm)
#if defined(__amd64)
/* Use the kas address space for the CPU startup thread. */
- if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL)
+ if (mmu_ptob(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL) {
panic("Cannot initialize CPUs; kernel's 64-bit page tables\n"
"located above 4G in physical memory (@ 0x%lx)",
- MAKECR3(kas.a_hat->hat_htable->ht_pfn));
+ mmu_ptob(kas.a_hat->hat_htable->ht_pfn));
+ }
/*
* Setup pseudo-descriptors for temporary GDT and IDT for use ONLY
@@ -356,21 +357,17 @@ mach_cpucontext_xalloc(struct cpu *cp, int optype)
/*
* CPU needs to access kernel address space after powering on.
- * When hot-adding CPU at runtime, directly use top level page table
- * of kas other than the return value of getcr3(). getcr3() returns
- * current process's top level page table, which may be different from
- * the one of kas.
*/
- rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn);
+ rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_NONE);
rm->rm_cpu = cp->cpu_id;
/*
- * For hot-adding CPU at runtime, Machine Check and Performance Counter
- * should be disabled. They will be enabled on demand after CPU powers
- * on successfully
+ * We need to mask off any bits set on our boot CPU that can't apply
+ * while the subject CPU is initializing. If appropriate, they are
+ * enabled later on.
*/
rm->rm_cr4 = getcr4();
- rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE);
+ rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE | CR4_PCIDE);
rmp_gdt_init(rm);
diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c
index a807be6a40..c755f56927 100644
--- a/usr/src/uts/i86pc/os/mp_startup.c
+++ b/usr/src/uts/i86pc/os/mp_startup.c
@@ -1796,6 +1796,8 @@ mp_startup_common(boolean_t boot)
*/
cp->cpu_flags &= ~(CPU_POWEROFF | CPU_QUIESCED);
+ enable_pcid();
+
/*
* Setup this processor for XSAVE.
*/
diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c
index 5e23d2f486..a3026f0eb4 100644
--- a/usr/src/uts/i86pc/os/startup.c
+++ b/usr/src/uts/i86pc/os/startup.c
@@ -2337,12 +2337,18 @@ startup_end(void)
xs_domu_init();
#endif
-#if defined(__amd64) && !defined(__xpv)
+#if !defined(__xpv)
/*
* Intel IOMMU has been setup/initialized in ddi_impl.c
* Start it up now.
*/
immu_startup();
+
+ /*
+ * Now that we're no longer going to drop into real mode for a BIOS call
+ * via bootops, we can enable PCID (which requires CR0.PG).
+ */
+ enable_pcid();
#endif
PRM_POINT("Enabling interrupts");
diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c
index 4b867bac0c..839bbeba25 100644
--- a/usr/src/uts/i86pc/os/trap.c
+++ b/usr/src/uts/i86pc/os/trap.c
@@ -1847,7 +1847,7 @@ instr_is_sys_rtt(caddr_t pc)
{
extern void _sys_rtt(), _sys_rtt_end();
-#if defined(__amd64) && !defined(__xpv)
+#if !defined(__xpv)
extern void tr_sysc_ret_start(), tr_sysc_ret_end();
extern void tr_intr_ret_start(), tr_intr_ret_end();
diff --git a/usr/src/uts/i86pc/sys/mach_mmu.h b/usr/src/uts/i86pc/sys/mach_mmu.h
index 1eb47ada6a..22c7aac422 100644
--- a/usr/src/uts/i86pc/sys/mach_mmu.h
+++ b/usr/src/uts/i86pc/sys/mach_mmu.h
@@ -21,13 +21,13 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_MACH_MMU_H
#define _SYS_MACH_MMU_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -131,6 +131,19 @@ extern "C" {
#define PT_NOCONSIST (0x400) /* PTE was created with HAT_LOAD_NOCONSIST */
#define PT_FOREIGN (0x600) /* MFN mapped on the hypervisor has no PFN */
+#ifndef _BOOT
+
+extern ulong_t getcr3(void);
+extern void setcr3(ulong_t);
+
+#define getcr3_pa() (getcr3() & MMU_PAGEMASK)
+#define getpcid() ((getcr4() & CR4_PCIDE) ? \
+ (getcr3() & MMU_PAGEOFFSET) : PCID_NONE)
+
+extern void mmu_invlpg(caddr_t);
+
+#endif
+
#ifdef __xpv
#include <sys/xen_mmu.h>
#else
@@ -151,10 +164,6 @@ paddr_t make_ptable(x86pte_t *, uint_t);
x86pte_t *find_pte(uint64_t, paddr_t *, uint_t, uint_t);
x86pte_t *map_pte(paddr_t, uint_t);
-#ifndef _BOOT
-ulong_t getcr3();
-#endif
-
extern uint_t *shift_amt;
extern uint_t ptes_per_table;
extern paddr_t top_page_table;
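
A minimal sketch of how the new helpers are used elsewhere in this change
(PCID_KERNEL is zero, per the hat_i86.c comments below):

    paddr_t top = getcr3_pa();          /* top-level page table PA, PCID bits masked off */
    ASSERT(getpcid() == PCID_KERNEL);   /* kernel code should always be on the kernel PCID */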
diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h
index cf1a252c28..d3827522e8 100644
--- a/usr/src/uts/i86pc/sys/machcpuvar.h
+++ b/usr/src/uts/i86pc/sys/machcpuvar.h
@@ -143,11 +143,7 @@ struct kpti_frame {
*
* There is a CTASSERT in os/intr.c that checks these numbers.
*/
-#if defined(__amd64)
#define MACHCPU_SIZE (572 + 1584)
-#else
-#define MACHCPU_SIZE (452 + 1328)
-#endif
#define MACHCPU_PAD (MMU_PAGESIZE - MACHCPU_SIZE)
#define MACHCPU_PAD2 (MMU_PAGESIZE - 16 - 3 * sizeof (struct kpti_frame))
diff --git a/usr/src/uts/i86pc/sys/pc_mmu.h b/usr/src/uts/i86pc/sys/pc_mmu.h
index 89661449a4..324fb78e2d 100644
--- a/usr/src/uts/i86pc/sys/pc_mmu.h
+++ b/usr/src/uts/i86pc/sys/pc_mmu.h
@@ -21,13 +21,13 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _SYS_PC_MMU_H
#define _SYS_PC_MMU_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -50,8 +50,15 @@ void reload_cr3(void);
#ifndef _BOOT
-void mmu_tlbflush_entry(caddr_t);
-void setcr3(ulong_t);
+extern uint64_t kpti_safe_cr3;
+
+#define INVPCID_ADDR (0)
+#define INVPCID_ID (1)
+#define INVPCID_ALL_GLOBAL (2)
+#define INVPCID_ALL_NONGLOBAL (3)
+
+extern void invpcid_insn(uint64_t, uint64_t, uintptr_t);
+extern void tr_mmu_flush_user_range(uint64_t, size_t, size_t, uint64_t);
#if defined(__GNUC__)
#include <asm/mmu.h>
diff --git a/usr/src/uts/i86pc/sys/rm_platter.h b/usr/src/uts/i86pc/sys/rm_platter.h
index 15ab068854..55a58095af 100644
--- a/usr/src/uts/i86pc/sys/rm_platter.h
+++ b/usr/src/uts/i86pc/sys/rm_platter.h
@@ -115,7 +115,7 @@ typedef struct rm_platter {
struct cpu_tables {
/* IST stacks */
char ct_stack1[DEFAULTSTKSZ]; /* dblfault */
-#if defined(__amd64) && !defined(__xpv)
+#if !defined(__xpv)
char ct_stack2[DEFAULTSTKSZ]; /* nmi */
char ct_stack3[DEFAULTSTKSZ]; /* mce */
#endif
diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c
index 8690c46adf..2bac383b9c 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.c
+++ b/usr/src/uts/i86pc/vm/hat_i86.c
@@ -301,10 +301,10 @@ static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected,
x86pte_t new);
/*
- * The kernel address space exists in all HATs. To implement this the
- * kernel reserves a fixed number of entries in the topmost level(s) of page
- * tables. The values are setup during startup and then copied to every user
- * hat created by hat_alloc(). This means that kernelbase must be:
+ * The kernel address space exists in all non-HAT_COPIED HATs. To implement this
+ * the kernel reserves a fixed number of entries in the topmost level(s) of page
+ * tables. The values are setup during startup and then copied to every user hat
+ * created by hat_alloc(). This means that kernelbase must be:
*
* 4Meg aligned for 32 bit kernels
* 512Gig aligned for x86_64 64 bit kernel
@@ -377,12 +377,6 @@ struct hatstats hatstat;
*/
int pt_kern;
-/*
- * useful stuff for atomic access/clearing/setting REF/MOD/RO bits in page_t's.
- */
-extern void atomic_orb(uchar_t *addr, uchar_t val);
-extern void atomic_andb(uchar_t *addr, uchar_t val);
-
#ifndef __xpv
extern pfn_t memseg_get_start(struct memseg *);
#endif
@@ -494,7 +488,6 @@ hat_alloc(struct as *as)
hat->hat_num_copied = 0;
hat->hat_flags = 0;
#else /* __xpv */
-#if defined(__amd64)
/*
* All processes use HAT_COPIED on the 64-bit kernel if KPTI is
@@ -521,17 +514,6 @@ hat_alloc(struct as *as)
hat->hat_flags = 0;
HATSTAT_INC(hs_hat_normal64);
}
-#elif defined(__i386)
- use_copied = mmu.pae_hat;
- if (use_copied) {
- use_hat32_cache = B_TRUE;
- hat->hat_num_copied = mmu.num_copied_ents;
- HATSTAT_INC(hs_hat_copied32);
- } else {
- use_hat32_cache = B_FALSE;
- hat->hat_num_copied = 0;
- }
-#endif
#endif /* __xpv */
if (use_copied) {
hat->hat_flags |= HAT_COPIED;
@@ -830,7 +812,6 @@ mmu_calc_user_slots(void)
*/
mmu.top_level_uslots = ent + 1;
-#if defined(__amd64)
/*
* When running 32-bit compatibility processes on a 64-bit kernel, we
* will only need to use one slot.
@@ -844,16 +825,6 @@ mmu_calc_user_slots(void)
*/
mmu.num_copied_ents = mmu.top_level_uslots;
mmu.num_copied_ents32 = 4;
-#elif defined(__xpv)
- /*
- *
- */
- if (mmu.pae_hat) {
- mmu.num_copied_ents = 4;
- } else {
- mmu.num_copied_ents = 0;
- }
-#endif
}
/*
@@ -875,7 +846,7 @@ mmu_init(void)
(getcr4() & CR4_PGE) != 0)
mmu.pt_global = PT_GLOBAL;
-#if defined(__amd64) && !defined(__xpv)
+#if !defined(__xpv)
/*
* The 64-bit x86 kernel has split user/kernel page tables. As such we
* cannot have the global bit set. The simplest way for us to deal with
@@ -1302,8 +1273,6 @@ hat_pcp_teardown(cpu_t *cpu)
++r; \
}
-extern uint64_t kpti_safe_cr3;
-
/*
* Finish filling in the kernel hat.
* Pre fill in all top level kernel page table entries for the kernel's
@@ -1420,9 +1389,10 @@ hat_init_finish(void)
#endif
hat_kmap_init((uintptr_t)segmap_start, size);
-#if defined(__amd64) && !defined(__xpv)
+#if !defined(__xpv)
ASSERT3U(kas.a_hat->hat_htable->ht_pfn, !=, PFN_INVALID);
- ASSERT3U(kpti_safe_cr3, ==, MAKECR3(kas.a_hat->hat_htable->ht_pfn));
+ ASSERT3U(kpti_safe_cr3, ==,
+ MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_KERNEL));
#endif
}
@@ -1517,7 +1487,7 @@ hat_pcp_update(cpu_t *cpu, const hat_t *hat)
}
static void
-reset_kpti(struct kpti_frame *fr, uint64_t kcr3)
+reset_kpti(struct kpti_frame *fr, uint64_t kcr3, uint64_t ucr3)
{
ASSERT3U(fr->kf_tr_flag, ==, 0);
#if DEBUG
@@ -1533,7 +1503,7 @@ reset_kpti(struct kpti_frame *fr, uint64_t kcr3)
offsetof(struct kpti_frame, kf_unused));
fr->kf_kernel_cr3 = kcr3;
- fr->kf_user_cr3 = 0;
+ fr->kf_user_cr3 = ucr3;
fr->kf_tr_ret_rsp = (uintptr_t)&fr->kf_tr_rsp;
fr->kf_lower_redzone = 0xdeadbeefdeadbeef;
@@ -1541,18 +1511,83 @@ reset_kpti(struct kpti_frame *fr, uint64_t kcr3)
fr->kf_upper_redzone = 0xdeadbeefdeadbeef;
}
+#ifdef __xpv
+static void
+hat_switch_xen(hat_t *hat)
+{
+ struct mmuext_op t[2];
+ uint_t retcnt;
+ uint_t opcnt = 1;
+ uint64_t newcr3;
+
+ ASSERT(!(hat->hat_flags & HAT_COPIED));
+ ASSERT(!(getcr4() & CR4_PCIDE));
+
+ newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn, PCID_NONE);
+
+ t[0].cmd = MMUEXT_NEW_BASEPTR;
+ t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
+
+ /*
+ * There's an interesting problem here, as to what to actually specify
+ * when switching to the kernel hat. For now we'll reuse the kernel hat
+ * again.
+ */
+ t[1].cmd = MMUEXT_NEW_USER_BASEPTR;
+ if (hat == kas.a_hat)
+ t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
+ else
+ t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable);
+ ++opcnt;
+
+ if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0)
+ panic("HYPERVISOR_mmu_update() failed");
+ ASSERT(retcnt == opcnt);
+}
+#endif /* __xpv */
+
/*
* Switch to a new active hat, maintaining bit masks to track active CPUs.
*
- * On the 32-bit PAE hypervisor, %cr3 is a 64-bit value, on metal it
- * remains a 32-bit value.
+ * With KPTI, all our HATs except kas should be using PCP. Thus, to switch
+ * HATs, we need to copy over the new user PTEs, then set our trampoline context
+ * as appropriate.
+ *
+ * If lacking PCID, we then load our new cr3, which will flush the TLB: we may
+ * have established userspace TLB entries via kernel accesses, and these are no
+ * longer valid. We have to do this eagerly, as we just deleted this CPU from
+ * ->hat_cpus, so would no longer see any TLB shootdowns.
+ *
+ * With PCID enabled, things get a little more complicated. We would like to
+ * keep TLB context around when entering and exiting the kernel, and to do this,
+ * we partition the TLB into two different spaces:
+ *
+ * PCID_KERNEL is defined as zero, and used both by kas and all other address
+ * spaces while in the kernel (post-trampoline).
+ *
+ * PCID_USER is used while in userspace. Therefore, userspace cannot use any
+ * lingering PCID_KERNEL entries to kernel addresses it should not be able to
+ * read.
+ *
+ * The trampoline cr3s are set not to invalidate on a mov to %cr3. This means if
+ * we take a journey through the kernel without switching HATs, we have some
+ * hope of keeping our TLB state around.
+ *
+ * On a hat switch, rather than deal with any necessary flushes on the way out
+ * of the trampolines, we do them upfront here. If we're switching from kas, we
+ * shouldn't need any invalidation.
+ *
+ * Otherwise, we can have stale userspace entries for both PCID_USER (what
+ * happened before we move onto the kcr3) and PCID_KERNEL (any subsequent
+ * userspace accesses such as ddi_copyin()). Since setcr3() won't do these
+ * flushes on its own in PCIDE, we'll do a non-flushing load and then
+ * invalidate everything.
*/
void
hat_switch(hat_t *hat)
{
- uint64_t newcr3;
- cpu_t *cpu = CPU;
- hat_t *old = cpu->cpu_current_hat;
+ cpu_t *cpu = CPU;
+ hat_t *old = cpu->cpu_current_hat;
/*
* set up this information first, so we don't miss any cross calls
@@ -1572,54 +1607,63 @@ hat_switch(hat_t *hat)
}
cpu->cpu_current_hat = hat;
- /*
- * now go ahead and load cr3
- */
+#if defined(__xpv)
+ hat_switch_xen(hat);
+#else
+ struct hat_cpu_info *info = cpu->cpu_m.mcpu_hat_info;
+ uint64_t pcide = getcr4() & CR4_PCIDE;
+ uint64_t kcr3, ucr3;
+ pfn_t tl_kpfn;
+ ulong_t flag;
+
+ EQUIV(kpti_enable, !mmu.pt_global);
+
if (hat->hat_flags & HAT_COPIED) {
hat_pcp_update(cpu, hat);
- newcr3 = MAKECR3(cpu->cpu_hat_info->hci_pcp_l3pfn);
+ tl_kpfn = info->hci_pcp_l3pfn;
} else {
- newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn);
+ IMPLY(kpti_enable, hat == kas.a_hat);
+ tl_kpfn = hat->hat_htable->ht_pfn;
}
-#ifdef __xpv
- {
- struct mmuext_op t[2];
- uint_t retcnt;
- uint_t opcnt = 1;
- t[0].cmd = MMUEXT_NEW_BASEPTR;
- t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
+ if (pcide) {
+ ASSERT(kpti_enable);
- /*
- * There's an interesting problem here, as to what to
- * actually specify when switching to the kernel hat.
- * For now we'll reuse the kernel hat again.
- */
- t[1].cmd = MMUEXT_NEW_USER_BASEPTR;
- if (hat == kas.a_hat)
- t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3));
- else
- t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable);
- ++opcnt;
+ kcr3 = MAKECR3(tl_kpfn, PCID_KERNEL) | CR3_NOINVL_BIT;
+ ucr3 = MAKECR3(info->hci_user_l3pfn, PCID_USER) |
+ CR3_NOINVL_BIT;
- if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0)
- panic("HYPERVISOR_mmu_update() failed");
- ASSERT(retcnt == opcnt);
+ setcr3(kcr3);
+ if (old != kas.a_hat)
+ mmu_flush_tlb(FLUSH_TLB_ALL, NULL);
+ } else {
+ kcr3 = MAKECR3(tl_kpfn, PCID_NONE);
+ ucr3 = kpti_enable ?
+ MAKECR3(info->hci_user_l3pfn, PCID_NONE) :
+ 0;
+ setcr3(kcr3);
}
-#else
- setcr3(newcr3);
- reset_kpti(&cpu->cpu_m.mcpu_kpti, newcr3);
- reset_kpti(&cpu->cpu_m.mcpu_kpti_flt, newcr3);
- reset_kpti(&cpu->cpu_m.mcpu_kpti_dbg, newcr3);
-
- if (kpti_enable == 1) {
- newcr3 = MAKECR3(cpu->cpu_hat_info->hci_user_l3pfn);
- cpu->cpu_m.mcpu_kpti.kf_user_cr3 = newcr3;
- cpu->cpu_m.mcpu_kpti_dbg.kf_user_cr3 = newcr3;
- cpu->cpu_m.mcpu_kpti_flt.kf_user_cr3 = newcr3;
- }
-#endif
+
+ /*
+ * We will already be taking shootdowns for our new HAT, and as KPTI
+ * invpcid emulation needs to use kf_user_cr3, make sure we don't get
+ * any cross calls while we're inconsistent. Note that it's harmless to
+ * have a *stale* kf_user_cr3 (we just did a FLUSH_TLB_ALL), but a
+ * *zero* kf_user_cr3 is not going to go very well.
+ */
+ if (pcide)
+ flag = intr_clear();
+
+ reset_kpti(&cpu->cpu_m.mcpu_kpti, kcr3, ucr3);
+ reset_kpti(&cpu->cpu_m.mcpu_kpti_flt, kcr3, ucr3);
+ reset_kpti(&cpu->cpu_m.mcpu_kpti_dbg, kcr3, ucr3);
+
+ if (pcide)
+ intr_restore(flag);
+
+#endif /* !__xpv */
+
ASSERT(cpu == CPU);
}
@@ -2490,29 +2534,17 @@ hat_unlock_region(struct hat *hat, caddr_t addr, size_t len,
panic("No shared region support on x86");
}
-/*
- * A range of virtual pages for purposes of demapping.
- */
-typedef struct range_info {
- uintptr_t rng_va; /* address of page */
- ulong_t rng_cnt; /* number of pages in range */
- level_t rng_level; /* page table level */
-} range_info_t;
-
#if !defined(__xpv)
/*
* Cross call service routine to demap a range of virtual
* pages on the current CPU or flush all mappings in TLB.
*/
-/*ARGSUSED*/
static int
hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
{
+ _NOTE(ARGUNUSED(a3));
hat_t *hat = (hat_t *)a1;
- range_info_t *range = (range_info_t *)a2;
- size_t len = (size_t)a3;
- caddr_t addr = (caddr_t)range->rng_va;
- size_t pgsz = LEVEL_SIZE(range->rng_level);
+ tlb_range_t *range = (tlb_range_t *)a2;
/*
* If the target hat isn't the kernel and this CPU isn't operating
@@ -2521,20 +2553,16 @@ hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
if (hat != kas.a_hat && hat != CPU->cpu_current_hat)
return (0);
- /*
- * For a normal address, we flush a range of contiguous mappings
- */
- if ((uintptr_t)addr != DEMAP_ALL_ADDR) {
- for (size_t i = 0; i < len; i += pgsz)
- mmu_tlbflush_entry(addr + i);
+ if (range->tr_va != DEMAP_ALL_ADDR) {
+ mmu_flush_tlb(FLUSH_TLB_RANGE, range);
return (0);
}
/*
- * Otherwise we reload cr3 to effect a complete TLB flush.
+ * We are flushing all of userspace.
*
- * A reload of cr3 when using PCP also means we must also recopy in the
- * pte values from the struct hat
+ * When using PCP, we first need to update this CPU's idea of the PCP
+ * PTEs.
*/
if (hat->hat_flags & HAT_COPIED) {
#if defined(__amd64)
@@ -2543,34 +2571,13 @@ hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
reload_pae32(hat, CPU);
#endif
}
- reload_cr3();
- return (0);
-}
-/*
- * Flush all TLB entries, including global (ie. kernel) ones.
- */
-static void
-flush_all_tlb_entries(void)
-{
- ulong_t cr4 = getcr4();
-
- if (cr4 & CR4_PGE) {
- setcr4(cr4 & ~(ulong_t)CR4_PGE);
- setcr4(cr4);
-
- /*
- * 32 bit PAE also needs to always reload_cr3()
- */
- if (mmu.max_level == 2)
- reload_cr3();
- } else {
- reload_cr3();
- }
+ mmu_flush_tlb(FLUSH_TLB_NONGLOBAL, NULL);
+ return (0);
}
-#define TLB_CPU_HALTED (01ul)
-#define TLB_INVAL_ALL (02ul)
+#define TLBIDLE_CPU_HALTED (0x1UL)
+#define TLBIDLE_INVAL_ALL (0x2UL)
#define CAS_TLB_INFO(cpu, old, new) \
atomic_cas_ulong((ulong_t *)&(cpu)->cpu_m.mcpu_tlb_info, (old), (new))
@@ -2580,7 +2587,8 @@ flush_all_tlb_entries(void)
void
tlb_going_idle(void)
{
- atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info, TLB_CPU_HALTED);
+ atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info,
+ TLBIDLE_CPU_HALTED);
}
/*
@@ -2597,19 +2605,19 @@ tlb_service(void)
* We only have to do something if coming out of being idle.
*/
tlb_info = CPU->cpu_m.mcpu_tlb_info;
- if (tlb_info & TLB_CPU_HALTED) {
+ if (tlb_info & TLBIDLE_CPU_HALTED) {
ASSERT(CPU->cpu_current_hat == kas.a_hat);
/*
* Atomic clear and fetch of old state.
*/
while ((found = CAS_TLB_INFO(CPU, tlb_info, 0)) != tlb_info) {
- ASSERT(found & TLB_CPU_HALTED);
+ ASSERT(found & TLBIDLE_CPU_HALTED);
tlb_info = found;
SMT_PAUSE();
}
- if (tlb_info & TLB_INVAL_ALL)
- flush_all_tlb_entries();
+ if (tlb_info & TLBIDLE_INVAL_ALL)
+ mmu_flush_tlb(FLUSH_TLB_ALL, NULL);
}
}
#endif /* !__xpv */
@@ -2619,13 +2627,12 @@ tlb_service(void)
* all CPUs using a given hat.
*/
void
-hat_tlb_inval_range(hat_t *hat, range_info_t *range)
+hat_tlb_inval_range(hat_t *hat, tlb_range_t *in_range)
{
extern int flushes_require_xcalls; /* from mp_startup.c */
cpuset_t justme;
cpuset_t cpus_to_shootdown;
- uintptr_t va = range->rng_va;
- size_t len = range->rng_cnt << LEVEL_SHIFT(range->rng_level);
+ tlb_range_t range = *in_range;
#ifndef __xpv
cpuset_t check_cpus;
cpu_t *cpup;
@@ -2646,7 +2653,7 @@ hat_tlb_inval_range(hat_t *hat, range_info_t *range)
*/
if (hat->hat_flags & HAT_SHARED) {
hat = kas.a_hat;
- va = DEMAP_ALL_ADDR;
+ range.tr_va = DEMAP_ALL_ADDR;
}
/*
@@ -2654,15 +2661,16 @@ hat_tlb_inval_range(hat_t *hat, range_info_t *range)
*/
if (panicstr || !flushes_require_xcalls) {
#ifdef __xpv
- if (va == DEMAP_ALL_ADDR) {
+ if (range.tr_va == DEMAP_ALL_ADDR) {
xen_flush_tlb();
} else {
- for (size_t i = 0; i < len; i += MMU_PAGESIZE)
- xen_flush_va((caddr_t)(va + i));
+ for (size_t i = 0; i < TLB_RANGE_LEN(&range);
+ i += MMU_PAGESIZE) {
+ xen_flush_va((caddr_t)(range.tr_va + i));
+ }
}
#else
- (void) hati_demap_func((xc_arg_t)hat,
- (xc_arg_t)range, (xc_arg_t)len);
+ (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0);
#endif
return;
}
@@ -2696,13 +2704,13 @@ hat_tlb_inval_range(hat_t *hat, range_info_t *range)
continue;
tlb_info = cpup->cpu_m.mcpu_tlb_info;
- while (tlb_info == TLB_CPU_HALTED) {
- (void) CAS_TLB_INFO(cpup, TLB_CPU_HALTED,
- TLB_CPU_HALTED | TLB_INVAL_ALL);
+ while (tlb_info == TLBIDLE_CPU_HALTED) {
+ (void) CAS_TLB_INFO(cpup, TLBIDLE_CPU_HALTED,
+ TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL);
SMT_PAUSE();
tlb_info = cpup->cpu_m.mcpu_tlb_info;
}
- if (tlb_info == (TLB_CPU_HALTED | TLB_INVAL_ALL)) {
+ if (tlb_info == (TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL)) {
HATSTAT_INC(hs_tlb_inval_delayed);
CPUSET_DEL(cpus_to_shootdown, c);
}
@@ -2713,31 +2721,33 @@ hat_tlb_inval_range(hat_t *hat, range_info_t *range)
CPUSET_ISEQUAL(cpus_to_shootdown, justme)) {
#ifdef __xpv
- if (va == DEMAP_ALL_ADDR) {
+ if (range.tr_va == DEMAP_ALL_ADDR) {
xen_flush_tlb();
} else {
- for (size_t i = 0; i < len; i += MMU_PAGESIZE)
- xen_flush_va((caddr_t)(va + i));
+ for (size_t i = 0; i < TLB_RANGE_LEN(&range);
+ i += MMU_PAGESIZE) {
+ xen_flush_va((caddr_t)(range.tr_va + i));
+ }
}
#else
- (void) hati_demap_func((xc_arg_t)hat,
- (xc_arg_t)range, (xc_arg_t)len);
+ (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0);
#endif
} else {
CPUSET_ADD(cpus_to_shootdown, CPU->cpu_id);
#ifdef __xpv
- if (va == DEMAP_ALL_ADDR) {
+ if (range.tr_va == DEMAP_ALL_ADDR) {
xen_gflush_tlb(cpus_to_shootdown);
} else {
- for (size_t i = 0; i < len; i += MMU_PAGESIZE) {
- xen_gflush_va((caddr_t)(va + i),
+ for (size_t i = 0; i < TLB_RANGE_LEN(&range);
+ i += MMU_PAGESIZE) {
+ xen_gflush_va((caddr_t)(range.tr_va + i),
cpus_to_shootdown);
}
}
#else
- xc_call((xc_arg_t)hat, (xc_arg_t)range, (xc_arg_t)len,
+ xc_call((xc_arg_t)hat, (xc_arg_t)&range, 0,
CPUSET2BV(cpus_to_shootdown), hati_demap_func);
#endif
@@ -2751,10 +2761,10 @@ hat_tlb_inval(hat_t *hat, uintptr_t va)
/*
* Create range for a single page.
*/
- range_info_t range;
- range.rng_va = va;
- range.rng_cnt = 1; /* one page */
- range.rng_level = MIN_PAGE_LEVEL; /* pages are MMU_PAGESIZE */
+ tlb_range_t range;
+ range.tr_va = va;
+ range.tr_cnt = 1; /* one page */
+ range.tr_level = MIN_PAGE_LEVEL; /* pages are MMU_PAGESIZE */
hat_tlb_inval_range(hat, &range);
}
@@ -2927,17 +2937,17 @@ hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags)
* for the specified ranges of contiguous pages.
*/
static void
-handle_ranges(hat_t *hat, hat_callback_t *cb, uint_t cnt, range_info_t *range)
+handle_ranges(hat_t *hat, hat_callback_t *cb, uint_t cnt, tlb_range_t *range)
{
while (cnt > 0) {
--cnt;
hat_tlb_inval_range(hat, &range[cnt]);
if (cb != NULL) {
- cb->hcb_start_addr = (caddr_t)range[cnt].rng_va;
+ cb->hcb_start_addr = (caddr_t)range[cnt].tr_va;
cb->hcb_end_addr = cb->hcb_start_addr;
- cb->hcb_end_addr += range[cnt].rng_cnt <<
- LEVEL_SHIFT(range[cnt].rng_level);
+ cb->hcb_end_addr += range[cnt].tr_cnt <<
+ LEVEL_SHIFT(range[cnt].tr_level);
cb->hcb_function(cb);
}
}
@@ -2967,7 +2977,7 @@ hat_unload_callback(
htable_t *ht = NULL;
uint_t entry;
uintptr_t contig_va = (uintptr_t)-1L;
- range_info_t r[MAX_UNLOAD_CNT];
+ tlb_range_t r[MAX_UNLOAD_CNT];
uint_t r_cnt = 0;
x86pte_t old_pte;
@@ -3007,14 +3017,14 @@ hat_unload_callback(
* We'll do the call backs for contiguous ranges
*/
if (vaddr != contig_va ||
- (r_cnt > 0 && r[r_cnt - 1].rng_level != ht->ht_level)) {
+ (r_cnt > 0 && r[r_cnt - 1].tr_level != ht->ht_level)) {
if (r_cnt == MAX_UNLOAD_CNT) {
handle_ranges(hat, cb, r_cnt, r);
r_cnt = 0;
}
- r[r_cnt].rng_va = vaddr;
- r[r_cnt].rng_cnt = 0;
- r[r_cnt].rng_level = ht->ht_level;
+ r[r_cnt].tr_va = vaddr;
+ r[r_cnt].tr_cnt = 0;
+ r[r_cnt].tr_level = ht->ht_level;
++r_cnt;
}
@@ -3032,7 +3042,7 @@ hat_unload_callback(
ASSERT(ht->ht_level <= mmu.max_page_level);
vaddr += LEVEL_SIZE(ht->ht_level);
contig_va = vaddr;
- ++r[r_cnt - 1].rng_cnt;
+ ++r[r_cnt - 1].tr_cnt;
}
if (ht)
htable_release(ht);
@@ -3061,14 +3071,14 @@ hat_flush_range(hat_t *hat, caddr_t va, size_t size)
#ifdef __xpv
xen_flush_tlb();
#else
- flush_all_tlb_entries();
+ mmu_flush_tlb(FLUSH_TLB_ALL, NULL);
#endif
break;
}
#ifdef __xpv
xen_flush_va(va);
#else
- mmu_tlbflush_entry(va);
+ mmu_flush_tlb_kpage((uintptr_t)va);
#endif
va += sz;
}
@@ -3734,7 +3744,7 @@ hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc)
/*
* flush the TLBs - since we're probably dealing with MANY mappings
- * we do just one CR3 reload.
+ * we just do a full invalidation.
*/
if (!(hat->hat_flags & HAT_FREEING) && need_demaps)
hat_tlb_inval(hat, DEMAP_ALL_ADDR);
@@ -4553,7 +4563,7 @@ hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa)
*pteptr = 0;
else
*(x86pte32_t *)pteptr = 0;
- mmu_tlbflush_entry(addr);
+ mmu_flush_tlb_kpage((uintptr_t)addr);
x86pte_mapout();
}
#endif
@@ -4614,7 +4624,7 @@ hat_mempte_remap(
*(x86pte_t *)pteptr = pte;
else
*(x86pte32_t *)pteptr = (x86pte32_t)pte;
- mmu_tlbflush_entry(addr);
+ mmu_flush_tlb_kpage((uintptr_t)addr);
x86pte_mapout();
}
#endif
@@ -5125,7 +5135,7 @@ hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs)
ASSERT3S(kpti_enable, ==, 1);
ASSERT3P(cpu_hat, !=, NULL);
ASSERT3U(cpu_hat->hat_flags & HAT_PCP, ==, HAT_PCP);
- ASSERT3U(va & (MMU_PAGESIZE - 1), ==, 0);
+ ASSERT3U(va & MMU_PAGEOFFSET, ==, 0);
pfn = hat_getpfnum(kas.a_hat, (caddr_t)va);
VERIFY3U(pfn, !=, PFN_INVALID);
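
Recapping the hat_switch() comment above: under KPTI with PCID, each CPU runs
on one of two non-invalidating cr3 values, built exactly as in the code in this
hunk:

    /* in the kernel (post-trampoline), for every HAT: */
    kcr3 = MAKECR3(tl_kpfn, PCID_KERNEL) | CR3_NOINVL_BIT;
    /* in userspace, entered via the trampolines: */
    ucr3 = MAKECR3(info->hci_user_l3pfn, PCID_USER) | CR3_NOINVL_BIT;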
diff --git a/usr/src/uts/i86pc/vm/hat_i86.h b/usr/src/uts/i86pc/vm/hat_i86.h
index 2bcac4ec61..16ad6aca33 100644
--- a/usr/src/uts/i86pc/vm/hat_i86.h
+++ b/usr/src/uts/i86pc/vm/hat_i86.h
@@ -75,17 +75,10 @@ extern "C" {
*/
#define MAX_COPIED_PTES 1
#else
-#if defined(__amd64)
/*
* The 64-bit kernel may have up to 512 PTEs present in it for a given process.
*/
#define MAX_COPIED_PTES 512
-#elif defined(__i386)
-/*
- * The 32-bit kernel always uses 4 PTEs for this.
- */
-#define MAX_COPIED_PTES 4
-#endif /* __amd64 */
#endif /* __xpv */
#define TOP_LEVEL(h) (((h)->hat_max_level))
@@ -254,7 +247,6 @@ extern void halt(char *fmt);
extern void hat_kern_alloc(caddr_t segmap_base, size_t segmap_size,
caddr_t ekernelheap);
extern void hat_kern_setup(void);
-extern void hat_tlb_inval(struct hat *hat, uintptr_t va);
extern void hat_pte_unmap(htable_t *ht, uint_t entry, uint_t flags,
x86pte_t old_pte, void *pte_ptr, boolean_t tlb);
extern void hat_init_finish(void);
@@ -266,34 +258,35 @@ extern void hat_kmap_init(uintptr_t base, size_t len);
extern hment_t *hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry);
-#if defined(__amd64)
-extern void hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs);
extern void mmu_calc_user_slots(void);
-#endif
+extern void hat_tlb_inval(struct hat *hat, uintptr_t va);
+extern void hat_switch(struct hat *hat);
-#if !defined(__xpv)
-/*
- * routines to deal with delayed TLB invalidations for idle CPUs
- */
-extern void tlb_going_idle(void);
-extern void tlb_service(void);
-#endif
+#define TLB_RANGE_LEN(r) ((r)->tr_cnt << LEVEL_SHIFT((r)->tr_level))
/*
- * Hat switch function invoked to load a new context into %cr3
+ * A range of virtual pages for purposes of demapping.
*/
-extern void hat_switch(struct hat *hat);
+typedef struct tlb_range {
+ uintptr_t tr_va; /* address of page */
+ ulong_t tr_cnt; /* number of pages in range */
+ int8_t tr_level; /* page table level */
+} tlb_range_t;
+
+#if defined(__xpv)
+
+#define XPV_DISALLOW_MIGRATE() xen_block_migrate()
+#define XPV_ALLOW_MIGRATE() xen_allow_migrate()
+
+#define mmu_flush_tlb_page(va) mmu_invlpg((caddr_t)va)
+#define mmu_flush_tlb_kpage(va) mmu_invlpg((caddr_t)va)
-#ifdef __xpv
/*
* Interfaces to use around code that maps/unmaps grant table references.
*/
extern void hat_prepare_mapping(hat_t *, caddr_t, uint64_t *);
extern void hat_release_mapping(hat_t *, caddr_t);
-#define XPV_DISALLOW_MIGRATE() xen_block_migrate()
-#define XPV_ALLOW_MIGRATE() xen_allow_migrate()
-
#else
#define XPV_DISALLOW_MIGRATE() /* nothing */
@@ -301,8 +294,25 @@ extern void hat_release_mapping(hat_t *, caddr_t);
#define pfn_is_foreign(pfn) __lintzero
-#endif
+typedef enum flush_tlb_type {
+ FLUSH_TLB_ALL = 1,
+ FLUSH_TLB_NONGLOBAL = 2,
+ FLUSH_TLB_RANGE = 3,
+} flush_tlb_type_t;
+
+extern void mmu_flush_tlb(flush_tlb_type_t, tlb_range_t *);
+extern void mmu_flush_tlb_kpage(uintptr_t);
+extern void mmu_flush_tlb_page(uintptr_t);
+
+extern void hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs);
+
+/*
+ * routines to deal with delayed TLB invalidations for idle CPUs
+ */
+extern void tlb_going_idle(void);
+extern void tlb_service(void);
+#endif /* !__xpv */
#endif /* _KERNEL */
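
A minimal usage sketch of the new range-flush interface, mirroring
hat_tlb_inval() in hat_i86.c (MIN_PAGE_LEVEL is assumed to be the 4K page
level):

    tlb_range_t range;

    range.tr_va = va;                   /* first page of the range */
    range.tr_cnt = 1;                   /* one page */
    range.tr_level = MIN_PAGE_LEVEL;    /* pages are MMU_PAGESIZE */
    mmu_flush_tlb(FLUSH_TLB_RANGE, &range);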
diff --git a/usr/src/uts/i86pc/vm/hat_kdi.c b/usr/src/uts/i86pc/vm/hat_kdi.c
index 986bcb579e..ae0571e645 100644
--- a/usr/src/uts/i86pc/vm/hat_kdi.c
+++ b/usr/src/uts/i86pc/vm/hat_kdi.c
@@ -22,6 +22,8 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -177,7 +179,7 @@ kdi_vtop(uintptr_t va, uint64_t *pap)
#if defined(__xpv)
*pap = pfn_to_pa(CPU->cpu_current_hat->hat_htable->ht_pfn);
#else
- *pap = getcr3() & MMU_PAGEMASK;
+ *pap = getcr3_pa();
#endif
for (level = mmu.max_level; ; --level) {
index = (va >> LEVEL_SHIFT(level)) & (mmu.ptes_per_table - 1);
@@ -249,7 +251,7 @@ kdi_prw(caddr_t buf, size_t nbytes, uint64_t pa, size_t *ncopiedp, int doread)
*hat_kdi_pte = pte;
else
*(x86pte32_t *)hat_kdi_pte = pte;
- mmu_tlbflush_entry((caddr_t)hat_kdi_page);
+ mmu_flush_tlb_kpage(hat_kdi_page);
#endif
bcopy(from, to, sz);
@@ -268,7 +270,7 @@ kdi_prw(caddr_t buf, size_t nbytes, uint64_t pa, size_t *ncopiedp, int doread)
*hat_kdi_pte = 0;
else
*(x86pte32_t *)hat_kdi_pte = 0;
- mmu_tlbflush_entry((caddr_t)hat_kdi_page);
+ mmu_flush_tlb_kpage(hat_kdi_page);
#endif
buf += sz;
@@ -296,6 +298,19 @@ kdi_pwrite(caddr_t buf, size_t nbytes, uint64_t addr, size_t *ncopiedp)
return (kdi_prw(buf, nbytes, addr, ncopiedp, 0));
}
+#if !defined(__xpv)
+/*
+ * This gets used for flushing the TLB on all the slaves just prior to doing a
+ * kdi_prw(). It's unclear why this was originally done, since kdi_prw() itself
+ * will flush any lingering hat_kdi_page mappings, but let's presume it was a
+ * good idea.
+ */
+void
+kdi_flush_caches(void)
+{
+ mmu_flush_tlb(FLUSH_TLB_ALL, NULL);
+}
+#endif
/*
* Return the number of bytes, relative to the beginning of a given range, that
diff --git a/usr/src/uts/i86pc/vm/hat_pte.h b/usr/src/uts/i86pc/vm/hat_pte.h
index 121d96cf84..b65a69cb51 100644
--- a/usr/src/uts/i86pc/vm/hat_pte.h
+++ b/usr/src/uts/i86pc/vm/hat_pte.h
@@ -156,9 +156,10 @@ typedef int8_t level_t;
#define PFN_ABOVE64G(pfn) ((pfn) >= PFN_64G)
/*
- * The CR3 register holds the physical address of the top level page table.
+ * The CR3 register holds the physical address of the top level page table,
+ * along with the current PCID if any.
*/
-#define MAKECR3(pfn) mmu_ptob(pfn)
+#define MAKECR3(pfn, pcid) (mmu_ptob(pfn) | pcid)
/*
* HAT/MMU parameters that depend on kernel mode and/or processor type
@@ -178,12 +179,10 @@ struct hat_mmu_info {
uint_t top_level_count; /* # of entries in top-level page table */
uint_t top_level_uslots; /* # of user slots in top-level page table */
uint_t num_copied_ents; /* # of PCP-copied PTEs to create */
-#if defined(__amd64)
/* 32-bit versions of values */
uint_t top_level_uslots32;
uint_t max_level32;
uint_t num_copied_ents32;
-#endif
uint_t hash_cnt; /* cnt of entries in htable_hash_cache */
uint_t hat32_hash_cnt; /* cnt of entries in 32-bit htable_hash_cache */
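
A worked example of the two-argument MAKECR3() with illustrative values:
PCID_KERNEL is zero (per the hat_i86.c comments), and PCID_USER is assumed here
to be 1, as its definition is not visible in this diff:

    /* top-level page table at pfn 0x1234; the PCID sits in cr3's low 12 bits */
    MAKECR3(0x1234, PCID_KERNEL)    /* == 0x1234000 */
    MAKECR3(0x1234, PCID_USER)      /* == 0x1234001, assuming PCID_USER == 1 */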
diff --git a/usr/src/uts/i86pc/vm/htable.c b/usr/src/uts/i86pc/vm/htable.c
index b294597eba..a2d59d98ab 100644
--- a/usr/src/uts/i86pc/vm/htable.c
+++ b/usr/src/uts/i86pc/vm/htable.c
@@ -137,7 +137,7 @@ xen_flush_va(caddr_t va)
uint_t count;
if (IN_XPV_PANIC()) {
- mmu_tlbflush_entry((caddr_t)va);
+ mmu_flush_tlb_page((uintptr_t)va);
} else {
t.cmd = MMUEXT_INVLPG_LOCAL;
t.arg1.linear_addr = (uintptr_t)va;
@@ -154,7 +154,7 @@ xen_gflush_va(caddr_t va, cpuset_t cpus)
uint_t count;
if (IN_XPV_PANIC()) {
- mmu_tlbflush_entry((caddr_t)va);
+ mmu_flush_tlb_page((uintptr_t)va);
return;
}
@@ -1989,7 +1989,10 @@ x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
* Disable preemption and grab the CPU's hci_mutex
*/
kpreempt_disable();
+
ASSERT(CPU->cpu_hat_info != NULL);
+ ASSERT(!(getcr4() & CR4_PCIDE));
+
mutex_enter(&CPU->cpu_hat_info->hci_mutex);
x = PWIN_TABLE(CPU->cpu_id);
pteptr = (x86pte_t *)PWIN_PTE_VA(x);
@@ -2024,7 +2027,7 @@ x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
else
*(x86pte32_t *)pteptr = newpte;
XPV_DISALLOW_PAGETABLE_UPDATES();
- mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
+ mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
}
}
return (PT_INDEX_PTR(PWIN_VA(x), index));
@@ -2137,7 +2140,7 @@ x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr)
xen_flush_va((caddr_t)addr);
else
#endif
- mmu_tlbflush_entry((caddr_t)addr);
+ mmu_flush_tlb_page(addr);
goto done;
}
@@ -2380,6 +2383,8 @@ x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
} else {
uint_t x = PWIN_SRC(CPU->cpu_id);
+ ASSERT(!(getcr4() & CR4_PCIDE));
+
/*
* Finish defining the src pagetable mapping
*/
@@ -2390,7 +2395,7 @@ x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
*pteptr = pte;
else
*(x86pte32_t *)pteptr = pte;
- mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
+ mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x));
}
/*
diff --git a/usr/src/uts/i86pc/vm/htable.h b/usr/src/uts/i86pc/vm/htable.h
index d9b91189c9..8f4aac7e39 100644
--- a/usr/src/uts/i86pc/vm/htable.h
+++ b/usr/src/uts/i86pc/vm/htable.h
@@ -42,7 +42,6 @@ extern void atomic_andb(uint8_t *addr, uint8_t value);
extern void atomic_orb(uint8_t *addr, uint8_t value);
extern void atomic_inc16(uint16_t *addr);
extern void atomic_dec16(uint16_t *addr);
-extern void mmu_tlbflush_entry(caddr_t addr);
/*
* Each hardware page table has an htable_t describing it.
diff --git a/usr/src/uts/i86pc/vm/i86_mmu.c b/usr/src/uts/i86pc/vm/i86_mmu.c
index a8f9c46805..e413617db8 100644
--- a/usr/src/uts/i86pc/vm/i86_mmu.c
+++ b/usr/src/uts/i86pc/vm/i86_mmu.c
@@ -21,6 +21,8 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/t_lock.h>
@@ -61,92 +63,9 @@
#include <sys/hypervisor.h>
#endif
-caddr_t
-i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
-{
- caddr_t addr;
- caddr_t addr1;
- page_t *pp;
-
- addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
-
- for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
- pp = page_numtopp_nolock(pf);
- if (pp == NULL) {
- hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
- prot | HAT_NOSYNC, HAT_LOAD_LOCK);
- } else {
- hat_memload(kas.a_hat, addr, pp,
- prot | HAT_NOSYNC, HAT_LOAD_LOCK);
- }
- }
-
- return (addr1);
-}
-
-/*
- * This routine is like page_numtopp, but accepts only free pages, which
- * it allocates (unfrees) and returns with the exclusive lock held.
- * It is used by machdep.c/dma_init() to find contiguous free pages.
- *
- * XXX this and some others should probably be in vm_machdep.c
- */
-page_t *
-page_numtopp_alloc(pfn_t pfnum)
-{
- page_t *pp;
-
-retry:
- pp = page_numtopp_nolock(pfnum);
- if (pp == NULL) {
- return (NULL);
- }
-
- if (!page_trylock(pp, SE_EXCL)) {
- return (NULL);
- }
-
- if (page_pptonum(pp) != pfnum) {
- page_unlock(pp);
- goto retry;
- }
-
- if (!PP_ISFREE(pp)) {
- page_unlock(pp);
- return (NULL);
- }
- if (pp->p_szc) {
- page_demote_free_pages(pp);
- page_unlock(pp);
- goto retry;
- }
-
- /* If associated with a vnode, destroy mappings */
-
- if (pp->p_vnode) {
-
- page_destroy_free(pp);
-
- if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
- return (NULL);
- }
-
- if (page_pptonum(pp) != pfnum) {
- page_unlock(pp);
- goto retry;
- }
- }
-
- if (!PP_ISFREE(pp)) {
- page_unlock(pp);
- return (NULL);
- }
-
- if (!page_reclaim(pp, (kmutex_t *)NULL))
- return (NULL);
-
- return (pp);
-}
+#define ON_USER_HAT(cpu) \
+ ((cpu)->cpu_m.mcpu_current_hat != NULL && \
+ (cpu)->cpu_m.mcpu_current_hat != kas.a_hat)
/*
* Flag is not set early in boot. Once it is set we are no longer
@@ -436,20 +355,6 @@ hat_kern_alloc(
table_cnt += mmu.top_level_count - ((kernelbase >>
LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1));
-#if defined(__i386)
- /*
- * The 32 bit PAE hat allocates tables one level below the top when
- * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate
- * a bunch more to the reserve. Any unused will be returned later.
- * Note we've already counted these mappings, just not the extra
- * pagetables.
- */
- if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0)
- table_cnt += mmu.ptes_per_table -
- ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >>
- LEVEL_SHIFT(mmu.max_level - 1));
-#endif
-
/*
* Add 1/4 more into table_cnt for extra slop. The unused
* slop is freed back when we htable_adjust_reserve() later.
@@ -493,15 +398,11 @@ hat_kern_setup(void)
#ifdef __xpv
mmu_btop(xen_info->pt_base - ONE_GIG));
#else
- mmu_btop(getcr3()));
+ mmu_btop(getcr3_pa()));
#endif
/* END CSTYLED */
-#if defined(__i386) && !defined(__xpv)
- CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3();
-#endif /* __i386 */
-
-#if defined(__xpv) && defined(__amd64)
+#if defined(__xpv)
/*
* Try to make the kpm mappings r/w. Failures here are OK, as
* it's probably just a pagetable
@@ -517,3 +418,179 @@ hat_kern_setup(void)
CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id);
CPU->cpu_current_hat = kas.a_hat;
}
+
+#ifndef __xpv
+
+/*
+ * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but
+ * INVPCID_ADDR isn't.
+ */
+static void
+invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
+{
+ ulong_t flag;
+ uint64_t cr4;
+
+ if (x86_use_invpcid == 1) {
+ ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID));
+ invpcid_insn(type, pcid, addr);
+ return;
+ }
+
+ switch (type) {
+ case INVPCID_ALL_GLOBAL:
+ flag = intr_clear();
+ cr4 = getcr4();
+ setcr4(cr4 & ~(ulong_t)CR4_PGE);
+ setcr4(cr4 | CR4_PGE);
+ intr_restore(flag);
+ break;
+
+ case INVPCID_ALL_NONGLOBAL:
+ if (!(getcr4() & CR4_PCIDE)) {
+ reload_cr3();
+ } else {
+ flag = intr_clear();
+ cr4 = getcr4();
+ setcr4(cr4 & ~(ulong_t)CR4_PGE);
+ setcr4(cr4 | CR4_PGE);
+ intr_restore(flag);
+ }
+ break;
+
+ case INVPCID_ADDR:
+ if (pcid == PCID_USER) {
+ flag = intr_clear();
+ ASSERT(addr < kernelbase);
+ ASSERT(ON_USER_HAT(CPU));
+ ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
+ tr_mmu_flush_user_range(addr, MMU_PAGESIZE,
+ MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3);
+ intr_restore(flag);
+ } else {
+ mmu_invlpg((caddr_t)addr);
+ }
+ break;
+
+ default:
+ panic("unsupported invpcid(%lu)", type);
+ break;
+ }
+}
+
+/*
+ * Flush one kernel mapping.
+ *
+ * We want to assert on kernel space here mainly for reasoning about the PCIDE
+ * case: namely, this flush should never need to flush a non-current PCID
+ * mapping. This presumes we never have reason to flush the kernel regions
+ * available to PCID_USER (the trampolines and so on). It also relies on
+ * PCID_KERNEL == PCID_NONE.
+ */
+void
+mmu_flush_tlb_kpage(uintptr_t va)
+{
+ ASSERT(va >= kernelbase);
+ ASSERT(getpcid() == PCID_KERNEL);
+ mmu_invlpg((caddr_t)va);
+}
+
+/*
+ * Flush one mapping: local CPU version of hat_tlb_inval().
+ *
+ * If this is a userspace address in the PCIDE case, we need two invalidations,
+ * one for any potentially stale PCID_USER mapping, as well as any established
+ * while in the kernel.
+ */
+void
+mmu_flush_tlb_page(uintptr_t va)
+{
+ ASSERT(getpcid() == PCID_KERNEL);
+
+ if (va >= kernelbase) {
+ mmu_flush_tlb_kpage(va);
+ return;
+ }
+
+ if (!(getcr4() & CR4_PCIDE)) {
+ mmu_invlpg((caddr_t)va);
+ return;
+ }
+
+ /*
+ * Yes, kas will need to flush below kernelspace, at least during boot.
+ * But there's no PCID_USER context.
+ */
+ if (ON_USER_HAT(CPU))
+ invpcid(INVPCID_ADDR, PCID_USER, va);
+ invpcid(INVPCID_ADDR, PCID_KERNEL, va);
+}
+
+static void
+mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz)
+{
+ EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase);
+ ASSERT(len > 0);
+ ASSERT(pgsz != 0);
+
+ if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) {
+ for (uintptr_t va = addr; va < (addr + len); va += pgsz)
+ mmu_flush_tlb_page(va);
+ return;
+ }
+
+ /*
+ * As an emulated invpcid() in the PCIDE case requires jumping
+ * cr3s, we batch the invalidations. We should only need to flush the
+ * user range if we're on a user-space HAT.
+ */
+ if (addr < kernelbase && ON_USER_HAT(CPU)) {
+ ulong_t flag = intr_clear();
+ ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0);
+ tr_mmu_flush_user_range(addr, len, pgsz,
+ CPU->cpu_m.mcpu_kpti.kf_user_cr3);
+ intr_restore(flag);
+ }
+
+ for (uintptr_t va = addr; va < (addr + len); va += pgsz)
+ mmu_invlpg((caddr_t)va);
+}
+
+/*
+ * MMU TLB (and PT cache) flushing on this CPU.
+ *
+ * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL.
+ * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL
+ * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER
+ * mappings as appropriate. If using invpcid, PT_GLOBAL mappings are not
+ * invalidated.
+ */
+void
+mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range)
+{
+ ASSERT(getpcid() == PCID_KERNEL);
+
+ switch (type) {
+ case FLUSH_TLB_ALL:
+ ASSERT(range == NULL);
+ invpcid(INVPCID_ALL_GLOBAL, 0, 0);
+ break;
+
+ case FLUSH_TLB_NONGLOBAL:
+ ASSERT(range == NULL);
+ invpcid(INVPCID_ALL_NONGLOBAL, 0, 0);
+ break;
+
+ case FLUSH_TLB_RANGE: {
+ mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range),
+ LEVEL_SIZE(range->tr_level));
+ break;
+ }
+
+ default:
+ panic("invalid call mmu_flush_tlb(%d)", type);
+ break;
+ }
+}
+
+#endif /* ! __xpv */
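
The invpcid_insn() used above is declared in pc_mmu.h earlier in this diff; its
definition belongs to the intel/asm/mmu.h change, which is truncated below. A
plausible inline-assembly sketch (an assumption, not necessarily the committed
code) looks like:

    static inline void
    invpcid_insn(uint64_t type, uint64_t pcid, uintptr_t addr)
    {
            uint64_t pcid_desc[2] = { pcid, addr }; /* 16-byte INVPCID descriptor */

            __asm__ __volatile__(
                "invpcid %0, %1"
                : /* no outputs */
                : "m" (*pcid_desc), "r" (type)
                : "memory");
    }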
diff --git a/usr/src/uts/i86pc/vm/kboot_mmu.c b/usr/src/uts/i86pc/vm/kboot_mmu.c
index 9366ff9bee..76193b3d86 100644
--- a/usr/src/uts/i86pc/vm/kboot_mmu.c
+++ b/usr/src/uts/i86pc/vm/kboot_mmu.c
@@ -22,6 +22,8 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -145,7 +147,7 @@ kbm_remap_window(paddr_t physaddr, int writeable)
*((x86pte_t *)pte_to_window) = physaddr | pt_bits;
else
*((x86pte32_t *)pte_to_window) = physaddr | pt_bits;
- mmu_tlbflush_entry(window);
+ mmu_invlpg(window);
#endif
DBG(window);
return (window);
@@ -195,7 +197,7 @@ kbm_map(uintptr_t va, paddr_t pa, uint_t level, uint_t is_kernel)
*ptep = pteval;
else
*((x86pte32_t *)ptep) = pteval;
- mmu_tlbflush_entry((caddr_t)va);
+ mmu_invlpg((caddr_t)va);
#endif
}
@@ -349,7 +351,7 @@ kbm_unmap(uintptr_t va)
*ptep = 0;
else
*((x86pte32_t *)ptep) = 0;
- mmu_tlbflush_entry((caddr_t)va);
+ mmu_invlpg((caddr_t)va);
#endif
}
}
@@ -388,7 +390,7 @@ kbm_remap(uintptr_t va, pfn_t pfn)
*((x86pte_t *)ptep) = pte_val;
else
*((x86pte32_t *)ptep) = pte_val;
- mmu_tlbflush_entry((caddr_t)va);
+ mmu_invlpg((caddr_t)va);
#endif
if (!(old_pte & PT_VALID) || ma_to_pa(old_pte) == -1)
@@ -421,7 +423,7 @@ kbm_read_only(uintptr_t va, paddr_t pa)
*ptep = pte_val;
else
*((x86pte32_t *)ptep) = pte_val;
- mmu_tlbflush_entry((caddr_t)va);
+ mmu_invlpg((caddr_t)va);
#endif
}
@@ -459,7 +461,7 @@ kbm_pop(void)
*((x86pte_t *)pte_to_window) = save_pte;
else
*((x86pte32_t *)pte_to_window) = save_pte;
- mmu_tlbflush_entry(window);
+ mmu_invlpg(window);
#endif
}
diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c
index ac01006aa4..6a94745ade 100644
--- a/usr/src/uts/i86pc/vm/vm_machdep.c
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c
@@ -24,7 +24,7 @@
/*
* Copyright (c) 2010, Intel Corporation.
* All rights reserved.
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -364,6 +364,91 @@ static kmutex_t contig_lock;
#define PFN_16M (mmu_btop((uint64_t)0x1000000))
+caddr_t
+i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot)
+{
+ caddr_t addr;
+ caddr_t addr1;
+ page_t *pp;
+
+ addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP);
+
+ for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) {
+ pp = page_numtopp_nolock(pf);
+ if (pp == NULL) {
+ hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf,
+ prot | HAT_NOSYNC, HAT_LOAD_LOCK);
+ } else {
+ hat_memload(kas.a_hat, addr, pp,
+ prot | HAT_NOSYNC, HAT_LOAD_LOCK);
+ }
+ }
+
+ return (addr1);
+}
+
+/*
+ * This routine is like page_numtopp, but accepts only free pages, which
+ * it allocates (unfrees) and returns with the exclusive lock held.
+ * It is used by machdep.c/dma_init() to find contiguous free pages.
+ */
+page_t *
+page_numtopp_alloc(pfn_t pfnum)
+{
+ page_t *pp;
+
+retry:
+ pp = page_numtopp_nolock(pfnum);
+ if (pp == NULL) {
+ return (NULL);
+ }
+
+ if (!page_trylock(pp, SE_EXCL)) {
+ return (NULL);
+ }
+
+ if (page_pptonum(pp) != pfnum) {
+ page_unlock(pp);
+ goto retry;
+ }
+
+ if (!PP_ISFREE(pp)) {
+ page_unlock(pp);
+ return (NULL);
+ }
+ if (pp->p_szc) {
+ page_demote_free_pages(pp);
+ page_unlock(pp);
+ goto retry;
+ }
+
+ /* If associated with a vnode, destroy mappings */
+
+ if (pp->p_vnode) {
+
+ page_destroy_free(pp);
+
+ if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) {
+ return (NULL);
+ }
+
+ if (page_pptonum(pp) != pfnum) {
+ page_unlock(pp);
+ goto retry;
+ }
+ }
+
+ if (!PP_ISFREE(pp)) {
+ page_unlock(pp);
+ return (NULL);
+ }
+
+ if (!page_reclaim(pp, (kmutex_t *)NULL))
+ return (NULL);
+
+ return (pp);
+}
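
A hedged sketch of the sort of caller the comment above describes; the helper name and the bail-out policy are illustrative, not taken from dma_init():

    /*
     * Sketch: claim npages physically contiguous free pages starting at base,
     * returning them (still SE_EXCL locked) in pps, or freeing whatever was
     * grabbed and failing on the first page that isn't free.
     */
    static int
    example_claim_contig(pfn_t base, pgcnt_t npages, page_t **pps)
    {
            pgcnt_t i;

            for (i = 0; i < npages; i++) {
                    pps[i] = page_numtopp_alloc(base + i);
                    if (pps[i] == NULL) {
                            while (i-- > 0)
                                    page_free(pps[i], 1);
                            return (-1);
                    }
            }
            return (0);
    }
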
+
/*
* Return the optimum page size for a given mapping
*/
diff --git a/usr/src/uts/i86xpv/os/xpv_panic.c b/usr/src/uts/i86xpv/os/xpv_panic.c
index 2b67bff1dd..594fcd4c31 100644
--- a/usr/src/uts/i86xpv/os/xpv_panic.c
+++ b/usr/src/uts/i86xpv/os/xpv_panic.c
@@ -23,6 +23,8 @@
* Copyright 2016 PALO, Richard.
*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#include <sys/types.h>
@@ -174,7 +176,7 @@ xpv_panic_map(int level, pfn_t pfn)
*(x86pte32_t *)pteptr = pte;
XPV_DISALLOW_PAGETABLE_UPDATES();
- mmu_tlbflush_entry(PWIN_VA(level));
+ mmu_flush_tlb_page((uintptr_t)PWIN_VA(level));
}
/*
diff --git a/usr/src/uts/intel/asm/htable.h b/usr/src/uts/intel/asm/htable.h
index dd1d72a3c1..2601111c6d 100644
--- a/usr/src/uts/intel/asm/htable.h
+++ b/usr/src/uts/intel/asm/htable.h
@@ -22,6 +22,8 @@
/*
* Copyright 2004 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _ASM_HTABLE_H
@@ -36,8 +38,6 @@ extern "C" {
#if !defined(__lint) && defined(__GNUC__)
-#if defined(__i386) || defined(__amd64)
-
/*
* This set of atomic operations are designed primarily
* for some ia32 hat layer operations.
@@ -83,17 +83,6 @@ atomic_dec16(uint16_t *addr)
: "cc");
}
-extern __GNU_INLINE void
-mmu_tlbflush_entry(caddr_t addr)
-{
- __asm__ __volatile__(
- "invlpg %0"
- : "=m" (*addr)
- : "m" (*addr));
-}
-
-#endif /* __i386 || __amd64 */
-
#endif /* !__lint && __GNUC__ */
#ifdef __cplusplus
diff --git a/usr/src/uts/intel/asm/mmu.h b/usr/src/uts/intel/asm/mmu.h
index 1be654759d..bd3e69a9a8 100644
--- a/usr/src/uts/intel/asm/mmu.h
+++ b/usr/src/uts/intel/asm/mmu.h
@@ -21,6 +21,8 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ *
+ * Copyright 2018 Joyent, Inc.
*/
#ifndef _ASM_MMU_H
@@ -33,9 +35,9 @@
extern "C" {
#endif
-#if defined(__GNUC__) && !defined(__xpv)
+#if defined(__GNUC__)
-#if defined(__amd64)
+#if !defined(__xpv)
extern __GNU_INLINE ulong_t
getcr3(void)
@@ -57,30 +59,22 @@ setcr3(ulong_t value)
: "r" (value));
}
-extern __GNU_INLINE void
-reload_cr3(void)
-{
- setcr3(getcr3());
-}
-
-#elif defined(__i386)
-
extern __GNU_INLINE ulong_t
-getcr3(void)
+getcr4(void)
{
- uint32_t value;
+ uint64_t value;
__asm__ __volatile__(
- "movl %%cr3, %0"
+ "movq %%cr4, %0"
: "=r" (value));
return (value);
}
extern __GNU_INLINE void
-setcr3(ulong_t value)
+setcr4(ulong_t value)
{
__asm__ __volatile__(
- "movl %0, %%cr3"
+ "movq %0, %%cr4"
: /* no output */
: "r" (value));
}
@@ -91,9 +85,33 @@ reload_cr3(void)
setcr3(getcr3());
}
-#endif
+/*
+ * We declare a memory clobber: we're not actually writing to memory, but we
+ * don't want memory accesses re-ordered around the TLB flush.
+ */
+extern __GNU_INLINE void
+invpcid_insn(uint64_t type, uint64_t pcid, uintptr_t addr)
+{
+ uint64_t pcid_desc[2] = { pcid, addr };
+ __asm__ __volatile__(
+ "invpcid %0, %1"
+ : /* no output */
+ : "m" (*pcid_desc), "r" (type)
+ : "memory");
+}
+
+#endif /* !__xpv */
+
+extern __GNU_INLINE void
+mmu_invlpg(caddr_t addr)
+{
+ __asm__ __volatile__(
+ "invlpg %0"
+ : "=m" (*addr)
+ : "m" (*addr));
+}
-#endif /* __GNUC__ && !__xpv */
+#endif /* __GNUC__ */
#ifdef __cplusplus
}
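
For reference, the 16-byte pcid_desc built above is the memory operand INVPCID expects: the PCID occupies bits 11:0 of the first quadword, and the second quadword holds the linear address (used only by the single-address type). A hedged sketch of how a caller might gate on instruction availability; the example_ wrapper is illustrative, and the real fallback handling in this change lives in the HAT code rather than this header:

    /* Sketch only: use the instruction when available, otherwise fall back. */
    static inline void
    example_invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
    {
            if (x86_use_invpcid == 1) {
                    invpcid_insn(type, pcid, addr);
                    return;
            }
            /* Fallback (%cr3 reload, or toggling CR4.PGE for the global case) omitted. */
    }
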
diff --git a/usr/src/uts/intel/ia32/ml/i86_subr.s b/usr/src/uts/intel/ia32/ml/i86_subr.s
index d4ba6589bc..30f1f673d4 100644
--- a/usr/src/uts/intel/ia32/ml/i86_subr.s
+++ b/usr/src/uts/intel/ia32/ml/i86_subr.s
@@ -23,7 +23,7 @@
* Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 by Delphix. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
*/
/*
@@ -436,27 +436,16 @@ getfp(void)
/* ARGSUSED */
void
-mmu_tlbflush_entry(caddr_t m)
+mmu_invlpg(caddr_t m)
{}
#else /* __lint */
-#if defined(__amd64)
-
- ENTRY(mmu_tlbflush_entry)
+ ENTRY(mmu_invlpg)
invlpg (%rdi)
ret
- SET_SIZE(mmu_tlbflush_entry)
-
-#elif defined(__i386)
-
- ENTRY(mmu_tlbflush_entry)
- movl 4(%esp), %eax
- invlpg (%eax)
- ret
- SET_SIZE(mmu_tlbflush_entry)
+ SET_SIZE(mmu_invlpg)
-#endif /* __i386 */
#endif /* __lint */
diff --git a/usr/src/uts/intel/ia32/os/desctbls.c b/usr/src/uts/intel/ia32/os/desctbls.c
index 3c021bd055..5ef56b034c 100644
--- a/usr/src/uts/intel/ia32/os/desctbls.c
+++ b/usr/src/uts/intel/ia32/os/desctbls.c
@@ -1259,9 +1259,6 @@ init_desctbls(void)
{
user_desc_t *gdt;
desctbr_t idtr;
-#if defined(__amd64)
- extern uint64_t kpti_safe_cr3;
-#endif
/*
* Allocate IDT and TSS structures on unique pages for better
diff --git a/usr/src/uts/intel/sys/controlregs.h b/usr/src/uts/intel/sys/controlregs.h
index 6ae1afb3eb..fe0cf687b4 100644
--- a/usr/src/uts/intel/sys/controlregs.h
+++ b/usr/src/uts/intel/sys/controlregs.h
@@ -20,9 +20,7 @@
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent, Inc.
- *
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2018, Joyent, Inc.
*/
#ifndef _SYS_CONTROLREGS_H
@@ -90,8 +88,14 @@ extern "C" {
#define CR3_PCD 0x00000010 /* cache disable */
#define CR3_PWT 0x00000008 /* write through */
-
-#define FMT_CR3 "\20\5pcd\4pwt"
+#if defined(_ASM)
+#define CR3_NOINVL_BIT 0x8000000000000000
+#else
+#define CR3_NOINVL_BIT 0x8000000000000000ULL /* no invalidation */
+#endif
+#define PCID_NONE 0x000 /* generic PCID */
+#define PCID_KERNEL 0x000 /* kernel's PCID */
+#define PCID_USER 0x001 /* user-space PCID */
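
With CR4.PCIDE set, the low 12 bits of %cr3 carry the PCID rather than the PCD/PWT flags, and bit 63 (CR3_NOINVL_BIT) asks the processor not to flush that PCID's cached translations on the load. A hedged sketch of composing such a value (the helper name is illustrative):

    /* Sketch: build a PCID-tagged %cr3 value from a page-aligned top-level pagetable. */
    static inline uint64_t
    example_make_cr3(uint64_t pt_pa, uint64_t pcid, int noinval)
    {
            uint64_t cr3 = (pt_pa & MMU_PAGEMASK) | pcid;    /* PCID in bits 11:0 */

            if (noinval)
                    cr3 |= CR3_NOINVL_BIT;    /* don't flush this PCID's translations */
            return (cr3);
    }
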
/* CR4 Register */
diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h
index 475bb85738..bfd6f14289 100644
--- a/usr/src/uts/intel/sys/x86_archext.h
+++ b/usr/src/uts/intel/sys/x86_archext.h
@@ -27,7 +27,7 @@
* All rights reserved.
*/
/*
- * Copyright 2017 Joyent, Inc.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2012 Jens Elkner <jel+illumos@cs.uni-magdeburg.de>
* Copyright 2012 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
* Copyright 2014 Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
@@ -210,6 +210,7 @@ extern "C" {
#define CPUID_INTC_EBX_7_0_AVX2 0x00000020 /* AVX2 supported */
#define CPUID_INTC_EBX_7_0_SMEP 0x00000080 /* SMEP in CR4 */
#define CPUID_INTC_EBX_7_0_BMI2 0x00000100 /* BMI2 instrs */
+#define CPUID_INTC_EBX_7_0_INVPCID 0x00000400 /* invpcid instr */
#define CPUID_INTC_EBX_7_0_MPX 0x00004000 /* Mem. Prot. Ext. */
#define CPUID_INTC_EBX_7_0_AVX512F 0x00010000 /* AVX512 foundation */
#define CPUID_INTC_EBX_7_0_AVX512DQ 0x00020000 /* AVX512DQ */
@@ -433,6 +434,8 @@ extern "C" {
#define X86FSET_UMIP 66
#define X86FSET_PKU 67
#define X86FSET_OSPKE 68
+#define X86FSET_PCID 69
+#define X86FSET_INVPCID 70
/*
* Intel Deep C-State invariant TSC in leaf 0x80000007.
@@ -691,7 +694,7 @@ extern "C" {
#if defined(_KERNEL) || defined(_KMEMUSER)
-#define NUM_X86_FEATURES 69
+#define NUM_X86_FEATURES 71
extern uchar_t x86_featureset[];
extern void free_x86_featureset(void *featureset);
@@ -725,6 +728,9 @@ struct cpuid_regs {
uint32_t cp_edx;
};
+extern int x86_use_pcid;
+extern int x86_use_invpcid;
+
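
A hedged sketch of a consumer of these declarations; is_x86_feature() is the existing featureset test, and the gating policy shown is only an assumption about how the tunables combine with the CPUID bits:

    /* Sketch only: decide whether PCID-tagged flushing should be used. */
    static boolean_t
    example_pcid_usable(void)
    {
            return (is_x86_feature(x86_featureset, X86FSET_PCID) &&
                x86_use_pcid == 1 && (getcr4() & CR4_PCIDE) != 0);
    }
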
/*
* Utility functions to get/set extended control registers (XCR)
* Initial use is to get/set the contents of the XFEATURE_ENABLED_MASK.
@@ -870,6 +876,8 @@ extern void determine_platform(void);
extern int get_hwenv(void);
extern int is_controldom(void);
+extern void enable_pcid(void);
+
extern void xsave_setup_msr(struct cpu *);
/*