author      John Levon <john.levon@joyent.com>    2018-02-22 18:05:13 -0800
committer   John Levon <john.levon@joyent.com>    2018-03-13 20:33:26 +0000
commit      60f89b42cd13d6888f948d7ffe4edcfa535e02a6 (patch)
tree        c60e2fa99bc8572c457a0908105f8570c56b834c /usr/src
parent      0e957fcabecc0abb13226b12f474359f4ea711ea (diff)
download    illumos-joyent-60f89b42cd13d6888f948d7ffe4edcfa535e02a6.tar.gz
OS-6546 Use PCID if KPTI is enabled
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Alex Wilson <alex.wilson@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Alex Wilson <alex.wilson@joyent.com>
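
The core of the change: with KPTI, every trip through the trampolines used to reload %cr3 and therefore flush the TLB. When PCID is available, kernel and user contexts are instead tagged with separate PCIDs (PCID_KERNEL for kas and all post-trampoline kernel execution, PCID_USER for the per-CPU user page tables), and the trampoline %cr3 values are loaded with the no-invalidate bit set, so entering and leaving the kernel preserves TLB state. Below is a rough, user-space sketch of that %cr3 layout, not the kernel code; the PCID_USER and MMU_PAGESHIFT values are assumptions for illustration, while bit 63 as the "don't invalidate this PCID" flag is architectural.

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Sketch of the %cr3 layout this patch relies on (cf. the MAKECR3()
     * change in hat_pte.h and CR3_NOINVL_BIT in hat_switch()). The values
     * below are assumed for the example only.
     */
    #define MMU_PAGESHIFT   12              /* assumed 4K base pages */
    #define PCID_KERNEL     0x0ULL          /* kas and in-kernel execution */
    #define PCID_USER       0x1ULL          /* assumed; user trampoline context */
    #define CR3_NOINVL_BIT  (1ULL << 63)    /* don't flush this PCID on mov %cr3 */

    static uint64_t
    makecr3(uint64_t pfn, uint64_t pcid)
    {
            /* top-level page table PA in the upper bits, PCID in the low 12 */
            return ((pfn << MMU_PAGESHIFT) | pcid);
    }

    int
    main(void)
    {
            uint64_t kernel_l3pfn = 0x1234;         /* made-up PFNs for the demo */
            uint64_t user_l3pfn = 0x5678;

            uint64_t kcr3 = makecr3(kernel_l3pfn, PCID_KERNEL) | CR3_NOINVL_BIT;
            uint64_t ucr3 = makecr3(user_l3pfn, PCID_USER) | CR3_NOINVL_BIT;

            printf("kernel cr3 = 0x%llx\n", (unsigned long long)kcr3);
            printf("user   cr3 = 0x%llx\n", (unsigned long long)ucr3);
            return (0);
    }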
Diffstat (limited to 'usr/src')
34 files changed, 761 insertions, 439 deletions
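
A large share of the diff below replaces the old per-page mmu_tlbflush_entry() and flush_all_tlb_entries() calls with the new mmu_flush_tlb*() family, driven by a tlb_range_t (starting address, page count, page-table level) rather than a (va, len) pair. A minimal sketch of how such a range is walked, mirroring the stepping in the new mmu_flush_tlb_range() but with the kernel types reduced to stand-ins and the per-page flush left as a callback:

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Reduced stand-in for the tlb_range_t the patch adds to hat_i86.h. */
    typedef struct tlb_range {
            uintptr_t       tr_va;          /* address of first page */
            unsigned long   tr_cnt;         /* number of pages in range */
            int8_t          tr_level;       /* page table level */
    } tlb_range_t;

    /* Assumed x86 geometry: 4K, 2M, 1G pages at levels 0, 1, 2. */
    #define LEVEL_SHIFT(l)          (12 + (l) * 9)
    #define TLB_RANGE_LEN(r)        ((r)->tr_cnt << LEVEL_SHIFT((r)->tr_level))

    /*
     * Walk the range at its native page size and invoke a caller-supplied
     * per-page flush, the way mmu_flush_tlb_range() steps by pgsz.
     */
    static void
    flush_range_sketch(const tlb_range_t *r, void (*flush_page)(uintptr_t))
    {
            size_t pgsz = (size_t)1 << LEVEL_SHIFT(r->tr_level);
            uintptr_t va = r->tr_va;
            uintptr_t end = r->tr_va + TLB_RANGE_LEN(r);

            while (va < end) {
                    flush_page(va);
                    va += pgsz;
            }
    }

    static void
    print_page(uintptr_t va)
    {
            printf("  invlpg 0x%lx\n", (unsigned long)va);
    }

    int
    main(void)
    {
            tlb_range_t r = { 0x10000UL, 4, 0 };    /* four 4K pages */

            flush_range_sketch(&r, print_page);
            return (0);
    }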
diff --git a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c index 95e588eed6..224cfb4a18 100644 --- a/usr/src/cmd/mdb/i86pc/modules/unix/unix.c +++ b/usr/src/cmd/mdb/i86pc/modules/unix/unix.c @@ -1010,18 +1010,18 @@ crregs_dcmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) cr2 = kmdb_unix_getcr2(); cr3 = kmdb_unix_getcr3(); cr4 = kmdb_unix_getcr4(); - mdb_printf("%%cr0 = 0x%08x <%b>\n", cr0, cr0, cr0_flag_bits); - mdb_printf("%%cr2 = 0x%08x <%a>\n", cr2, cr2); + mdb_printf("%%cr0 = 0x%lx <%b>\n", cr0, cr0, cr0_flag_bits); + mdb_printf("%%cr2 = 0x%lx <%a>\n", cr2, cr2); if ((cr4 & CR4_PCIDE)) { - mdb_printf("%%cr3 = 0x%08x <pfn:0x%lx pcid:%u>\n", + mdb_printf("%%cr3 = 0x%lx <pfn:0x%lx pcid:%lu>\n", cr3, cr3 >> MMU_PAGESHIFT, cr3 & MMU_PAGEOFFSET); } else { - mdb_printf("%%cr3 = 0x%08x <pfn:0x%lx flags:%b>\n", cr3, + mdb_printf("%%cr3 = 0x%lx <pfn:0x%lx flags:%b>\n", cr3, cr3 >> MMU_PAGESHIFT, cr3, cr3_flag_bits); } - mdb_printf("%%cr4 = 0x%08x <%b>\n", cr4, cr4, cr4_flag_bits); + mdb_printf("%%cr4 = 0x%lx <%b>\n", cr4, cr4, cr4_flag_bits); return (DCMD_OK); } diff --git a/usr/src/cmd/mdb/intel/kmdb/kaif.c b/usr/src/cmd/mdb/intel/kmdb/kaif.c index dda6a94ea6..55754f4130 100644 --- a/usr/src/cmd/mdb/intel/kmdb/kaif.c +++ b/usr/src/cmd/mdb/intel/kmdb/kaif.c @@ -265,13 +265,16 @@ kaif_set_register(const char *regname, kreg_t val) static boolean_t kaif_toxic_text(uintptr_t addr) { - static GElf_Sym toxic_syms[1] = { 0, }; + static GElf_Sym toxic_syms[2] = { 0, }; size_t i; if (toxic_syms[0].st_name == NULL) { if (mdb_tgt_lookup_by_name(mdb.m_target, MDB_TGT_OBJ_EXEC, "tr_iret_user", &toxic_syms[0], NULL) != 0) warn("couldn't find tr_iret_user\n"); + if (mdb_tgt_lookup_by_name(mdb.m_target, MDB_TGT_OBJ_EXEC, + "tr_mmu_flush_user_range", &toxic_syms[1], NULL) != 0) + warn("couldn't find tr_mmu_flush_user_range\n"); } for (i = 0; i < ARRAY_SIZE(toxic_syms); i++) { diff --git a/usr/src/uts/i86pc/io/gfx_private/gfxp_vm.c b/usr/src/uts/i86pc/io/gfx_private/gfxp_vm.c index 4934de54e6..f46515838f 100644 --- a/usr/src/uts/i86pc/io/gfx_private/gfxp_vm.c +++ b/usr/src/uts/i86pc/io/gfx_private/gfxp_vm.c @@ -22,6 +22,8 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ #include <sys/debug.h> @@ -99,7 +101,7 @@ gfxp_map_kernel_space(uint64_t start, size_t size, uint32_t mode) * The hypervisor doesn't allow r/w mappings to some pages, such as * page tables, gdt, etc. Detect %cr3 to notify users of this interface. */ - if (start == mmu_ptob(mmu_btop(getcr3()))) + if (start == mmu_ptob(mmu_btop(getcr3_pa()))) return (0); #endif @@ -318,7 +320,7 @@ gfxp_load_kernel_space(uint64_t start, size_t size, * The hypervisor doesn't allow r/w mappings to some pages, such as * page tables, gdt, etc. Detect %cr3 to notify users of this interface. */ - if (start == mmu_ptob(mmu_btop(getcr3()))) + if (start == mmu_ptob(mmu_btop(getcr3_pa()))) return; #endif diff --git a/usr/src/uts/i86pc/ml/fb_swtch_src.s b/usr/src/uts/i86pc/ml/fb_swtch_src.s index e67837ee2b..4d1789fc9b 100644 --- a/usr/src/uts/i86pc/ml/fb_swtch_src.s +++ b/usr/src/uts/i86pc/ml/fb_swtch_src.s @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. 
*/ @@ -52,6 +53,9 @@ int fb_swtch_silence_lint = 0; #define DISABLE_PAGING \ + movl %cr4, %eax ;\ + btrl $17, %eax /* clear PCIDE bit */ ;\ + movl %eax, %cr4 ;\ movl %cr0, %eax ;\ btrl $31, %eax /* clear PG bit */ ;\ movl %eax, %cr0 @@ -222,6 +226,7 @@ _start: * Disable long mode by: * - shutting down paging (bit 31 of cr0). This will flush the * TLBs. + * - turning off PCID in cr4 * - disabling LME (long mode enable) in EFER (extended feature reg) */ #endif diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s index c05718c3ad..2db2d5acfa 100644 --- a/usr/src/uts/i86pc/ml/kpti_trampolines.s +++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s @@ -92,6 +92,9 @@ * We do not do any stack pivoting for syscalls (and we leave SYSENTER's * existing %rsp pivot untouched) -- instead we spill registers into * %gs:CPU_KPTI_* as we need to. + * + * Note that the normal %cr3 values do not cause invalidations with PCIDE - see + * hat_switch(). */ /* @@ -705,6 +708,35 @@ tr_intr_ret_end: MKIVCT(248); MKIVCT(249); MKIVCT(250); MKIVCT(251); MKIVCT(252); MKIVCT(253); MKIVCT(254); MKIVCT(255); + /* + * We're PCIDE, but we don't have INVPCID. The only way to invalidate a + * PCID other than the current one, then, is to load its cr3 then + * invlpg. But loading kf_user_cr3 means we can longer access our + * caller's text mapping (or indeed, its stack). So this little helper + * has to live within our trampoline text region. + * + * Called as tr_mmu_flush_user_range(addr, len, pgsz, cr3) + */ + ENTRY_NP(tr_mmu_flush_user_range) + push %rbx + /* When we read cr3, it never has the NOINVL bit set. */ + mov %cr3, %rax + movq $CR3_NOINVL_BIT, %rbx + orq %rbx, %rax + + mov %rcx, %cr3 + add %rdi, %rsi +.align ASM_ENTRY_ALIGN +1: + invlpg (%rdi) + add %rdx, %rdi + cmp %rsi, %rdi + jb 1b + mov %rax, %cr3 + pop %rbx + retq + SET_SIZE(tr_mmu_flush_user_range) + .align MMU_PAGESIZE .global kpti_tramp_end kpti_tramp_end: diff --git a/usr/src/uts/i86pc/ml/mpcore.s b/usr/src/uts/i86pc/ml/mpcore.s index eaf70b72df..2151a14b04 100644 --- a/usr/src/uts/i86pc/ml/mpcore.s +++ b/usr/src/uts/i86pc/ml/mpcore.s @@ -24,6 +24,8 @@ /* * Copyright (c) 2010, Intel Corporation. * All rights reserved. + * + * Copyright 2018 Joyent, Inc. */ #include <sys/asm_linkage.h> @@ -326,7 +328,7 @@ kernel_cs_code: * Complete the rest of the setup and call mp_startup(). */ movq %gs:CPU_THREAD, %rax /* get thread ptr */ - call *T_PC(%rax) /* call mp_startup */ + call *T_PC(%rax) /* call mp_startup_boot */ /* not reached */ int $20 /* whoops, returned somehow! */ @@ -502,7 +504,7 @@ kernel_cs_code: /* * Before going any farther, enable usage of page table NX bit if - * that's how our page tables are set up. + * that's how our page tables are set up. (PCIDE is enabled later on). */ bt $X86FSET_NX, x86_featureset jnc 1f diff --git a/usr/src/uts/i86pc/os/cpuid.c b/usr/src/uts/i86pc/os/cpuid.c index f44c75acd2..289ce29183 100644 --- a/usr/src/uts/i86pc/os/cpuid.c +++ b/usr/src/uts/i86pc/os/cpuid.c @@ -32,7 +32,7 @@ * Portions Copyright 2009 Advanced Micro Devices, Inc. */ /* - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* * Various routines to handle identification @@ -58,6 +58,7 @@ #include <sys/memnode.h> #include <sys/pci_cfgspace.h> #include <sys/comm_page.h> +#include <sys/mach_mmu.h> #include <sys/tsc.h> #ifdef __xpv @@ -83,7 +84,7 @@ * x86_vendor accordingly. * o Processing the feature flags returned by the cpuid instruction while * applying any workarounds or tricks for the specific processor. 
- * o Mapping the feature flags into Solaris feature bits (X86_*). + * o Mapping the feature flags into illumos feature bits (X86_*). * o Processing extended feature flags if supported by the processor, * again while applying specific processor knowledge. * o Determining the CMT characteristics of the system. @@ -122,6 +123,14 @@ uint_t x86_vendor = X86_VENDOR_IntelClone; uint_t x86_type = X86_TYPE_OTHER; uint_t x86_clflush_size = 0; +#if defined(__xpv) +int x86_use_pcid = 0; +int x86_use_invpcid = 0; +#else +int x86_use_pcid = -1; +int x86_use_invpcid = -1; +#endif + uint_t pentiumpro_bug4046376; uchar_t x86_featureset[BT_SIZEOFMAP(NUM_X86_FEATURES)]; @@ -196,6 +205,8 @@ static char *x86_feature_names[NUM_X86_FEATURES] = { "umip", "pku", "ospke", + "pcid", + "invpcid", }; boolean_t @@ -1302,6 +1313,10 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_SMEP) add_x86_feature(featureset, X86FSET_SMEP); + if (ecp->cp_ebx & CPUID_INTC_EBX_7_0_INVPCID) { + add_x86_feature(featureset, X86FSET_INVPCID); + } + /* * We check disable_smap here in addition to in startup_smap() * to ensure CPUs that aren't the boot CPU don't accidentally @@ -1504,6 +1519,13 @@ cpuid_pass1(cpu_t *cpu, uchar_t *featureset) } } } + + if (cpi->cpi_vendor == X86_VENDOR_Intel) { + if (cp->cp_ecx & CPUID_INTC_ECX_PCID) { + add_x86_feature(featureset, X86FSET_PCID); + } + } + if (cp->cp_ecx & CPUID_INTC_ECX_X2APIC) { add_x86_feature(featureset, X86FSET_X2APIC); } @@ -5003,6 +5025,29 @@ post_startup_cpu_fixups(void) #endif /* !__xpv */ } +void +enable_pcid(void) +{ + if (x86_use_pcid == -1) + x86_use_pcid = is_x86_feature(x86_featureset, X86FSET_PCID); + + if (x86_use_invpcid == -1) { + x86_use_invpcid = is_x86_feature(x86_featureset, + X86FSET_INVPCID); + } + + if (!x86_use_pcid) + return; + + /* + * Intel say that on setting PCIDE, it immediately starts using the PCID + * bits; better make sure there's nothing there. + */ + ASSERT((getcr3() & MMU_PAGEOFFSET) == PCID_NONE); + + setcr4(getcr4() | CR4_PCIDE); +} + /* * Setup necessary registers to enable XSAVE feature on this processor. * This function needs to be called early enough, so that no xsave/xrstor diff --git a/usr/src/uts/i86pc/os/fakebop.c b/usr/src/uts/i86pc/os/fakebop.c index 9379072264..a62e45d89d 100644 --- a/usr/src/uts/i86pc/os/fakebop.c +++ b/usr/src/uts/i86pc/os/fakebop.c @@ -26,7 +26,7 @@ * Copyright (c) 2010, Intel Corporation. * All rights reserved. * - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. All rights reserved. */ /* @@ -847,6 +847,12 @@ do_bsys_doint(bootops_t *bop, int intnum, struct bop_regs *rp) bios_regs_t br; /* + * We're about to disable paging; we shouldn't be PCID enabled. + */ + if (getcr4() & CR4_PCIDE) + prom_panic("do_bsys_doint() with PCID enabled\n"); + + /* * The first time we do this, we have to copy the pre-packaged * low memory bios call code image into place. */ diff --git a/usr/src/uts/i86pc/os/mach_kdi.c b/usr/src/uts/i86pc/os/mach_kdi.c index ce8255cdd8..60ca8c9fca 100644 --- a/usr/src/uts/i86pc/os/mach_kdi.c +++ b/usr/src/uts/i86pc/os/mach_kdi.c @@ -21,10 +21,10 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Kernel/Debugger Interface (KDI) routines. Called during debugger under * various system states (boot, while running, while the debugger has control). 
@@ -113,12 +113,6 @@ kdi_dreg_set(int reg, ulong_t value) } } -void -kdi_flush_caches(void) -{ - reload_cr3(); -} - extern void kdi_slave_entry(void); void diff --git a/usr/src/uts/i86pc/os/mlsetup.c b/usr/src/uts/i86pc/os/mlsetup.c index 19f0f5f676..09bf07848c 100644 --- a/usr/src/uts/i86pc/os/mlsetup.c +++ b/usr/src/uts/i86pc/os/mlsetup.c @@ -148,17 +148,24 @@ mlsetup(struct regs *rp) else cpuid_feature_edx_exclude = (uint32_t)prop_value; -#if defined(__amd64) && !defined(__xpv) +#if !defined(__xpv) /* * Check to see if KPTI has been explicitly enabled or disabled. * We have to check this before init_desctbls(). */ - if (bootprop_getval("kpti", &prop_value) != 0) { - kpti_enable = 1; - } else { + if (bootprop_getval("kpti", &prop_value) == 0) { kpti_enable = (uint64_t)(prop_value == 1); prom_printf("unix: forcing kpti to %s due to boot argument\n", (kpti_enable == 1) ? "ON" : "OFF"); + } else { + kpti_enable = 1; + } + + if (bootprop_getval("pcid", &prop_value) == 0 && prop_value == 0) { + prom_printf("unix: forcing pcid to OFF due to boot argument\n"); + x86_use_pcid = 0; + } else if (kpti_enable != 1) { + x86_use_pcid = 0; } #endif diff --git a/usr/src/uts/i86pc/os/mp_pc.c b/usr/src/uts/i86pc/os/mp_pc.c index 4e12703395..98fa4cc131 100644 --- a/usr/src/uts/i86pc/os/mp_pc.c +++ b/usr/src/uts/i86pc/os/mp_pc.c @@ -133,10 +133,11 @@ rmp_gdt_init(rm_platter_t *rm) #if defined(__amd64) /* Use the kas address space for the CPU startup thread. */ - if (MAKECR3(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL) + if (mmu_ptob(kas.a_hat->hat_htable->ht_pfn) > 0xffffffffUL) { panic("Cannot initialize CPUs; kernel's 64-bit page tables\n" "located above 4G in physical memory (@ 0x%lx)", - MAKECR3(kas.a_hat->hat_htable->ht_pfn)); + mmu_ptob(kas.a_hat->hat_htable->ht_pfn)); + } /* * Setup pseudo-descriptors for temporary GDT and IDT for use ONLY @@ -356,21 +357,17 @@ mach_cpucontext_xalloc(struct cpu *cp, int optype) /* * CPU needs to access kernel address space after powering on. - * When hot-adding CPU at runtime, directly use top level page table - * of kas other than the return value of getcr3(). getcr3() returns - * current process's top level page table, which may be different from - * the one of kas. */ - rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn); + rm->rm_pdbr = MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_NONE); rm->rm_cpu = cp->cpu_id; /* - * For hot-adding CPU at runtime, Machine Check and Performance Counter - * should be disabled. They will be enabled on demand after CPU powers - * on successfully + * We need to mask off any bits set on our boot CPU that can't apply + * while the subject CPU is initializing. If appropriate, they are + * enabled later on. */ rm->rm_cr4 = getcr4(); - rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE); + rm->rm_cr4 &= ~(CR4_MCE | CR4_PCE | CR4_PCIDE); rmp_gdt_init(rm); diff --git a/usr/src/uts/i86pc/os/mp_startup.c b/usr/src/uts/i86pc/os/mp_startup.c index a807be6a40..c755f56927 100644 --- a/usr/src/uts/i86pc/os/mp_startup.c +++ b/usr/src/uts/i86pc/os/mp_startup.c @@ -1796,6 +1796,8 @@ mp_startup_common(boolean_t boot) */ cp->cpu_flags &= ~(CPU_POWEROFF | CPU_QUIESCED); + enable_pcid(); + /* * Setup this processor for XSAVE. 
*/ diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index 5e23d2f486..a3026f0eb4 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -2337,12 +2337,18 @@ startup_end(void) xs_domu_init(); #endif -#if defined(__amd64) && !defined(__xpv) +#if !defined(__xpv) /* * Intel IOMMU has been setup/initialized in ddi_impl.c * Start it up now. */ immu_startup(); + + /* + * Now that we're no longer going to drop into real mode for a BIOS call + * via bootops, we can enable PCID (which requires CR0.PG). + */ + enable_pcid(); #endif PRM_POINT("Enabling interrupts"); diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c index 4b867bac0c..839bbeba25 100644 --- a/usr/src/uts/i86pc/os/trap.c +++ b/usr/src/uts/i86pc/os/trap.c @@ -1847,7 +1847,7 @@ instr_is_sys_rtt(caddr_t pc) { extern void _sys_rtt(), _sys_rtt_end(); -#if defined(__amd64) && !defined(__xpv) +#if !defined(__xpv) extern void tr_sysc_ret_start(), tr_sysc_ret_end(); extern void tr_intr_ret_start(), tr_intr_ret_end(); diff --git a/usr/src/uts/i86pc/sys/mach_mmu.h b/usr/src/uts/i86pc/sys/mach_mmu.h index 1eb47ada6a..22c7aac422 100644 --- a/usr/src/uts/i86pc/sys/mach_mmu.h +++ b/usr/src/uts/i86pc/sys/mach_mmu.h @@ -21,13 +21,13 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_MACH_MMU_H #define _SYS_MACH_MMU_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -131,6 +131,19 @@ extern "C" { #define PT_NOCONSIST (0x400) /* PTE was created with HAT_LOAD_NOCONSIST */ #define PT_FOREIGN (0x600) /* MFN mapped on the hypervisor has no PFN */ +#ifndef _BOOT + +extern ulong_t getcr3(void); +extern void setcr3(ulong_t); + +#define getcr3_pa() (getcr3() & MMU_PAGEMASK) +#define getpcid() ((getcr4() & CR4_PCIDE) ? \ + (getcr3() & MMU_PAGEOFFSET) : PCID_NONE) + +extern void mmu_invlpg(caddr_t); + +#endif + #ifdef __xpv #include <sys/xen_mmu.h> #else @@ -151,10 +164,6 @@ paddr_t make_ptable(x86pte_t *, uint_t); x86pte_t *find_pte(uint64_t, paddr_t *, uint_t, uint_t); x86pte_t *map_pte(paddr_t, uint_t); -#ifndef _BOOT -ulong_t getcr3(); -#endif - extern uint_t *shift_amt; extern uint_t ptes_per_table; extern paddr_t top_page_table; diff --git a/usr/src/uts/i86pc/sys/machcpuvar.h b/usr/src/uts/i86pc/sys/machcpuvar.h index cf1a252c28..d3827522e8 100644 --- a/usr/src/uts/i86pc/sys/machcpuvar.h +++ b/usr/src/uts/i86pc/sys/machcpuvar.h @@ -143,11 +143,7 @@ struct kpti_frame { * * There is a CTASSERT in os/intr.c that checks these numbers. */ -#if defined(__amd64) #define MACHCPU_SIZE (572 + 1584) -#else -#define MACHCPU_SIZE (452 + 1328) -#endif #define MACHCPU_PAD (MMU_PAGESIZE - MACHCPU_SIZE) #define MACHCPU_PAD2 (MMU_PAGESIZE - 16 - 3 * sizeof (struct kpti_frame)) diff --git a/usr/src/uts/i86pc/sys/pc_mmu.h b/usr/src/uts/i86pc/sys/pc_mmu.h index 89661449a4..324fb78e2d 100644 --- a/usr/src/uts/i86pc/sys/pc_mmu.h +++ b/usr/src/uts/i86pc/sys/pc_mmu.h @@ -21,13 +21,13 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _SYS_PC_MMU_H #define _SYS_PC_MMU_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -50,8 +50,15 @@ void reload_cr3(void); #ifndef _BOOT -void mmu_tlbflush_entry(caddr_t); -void setcr3(ulong_t); +extern uint64_t kpti_safe_cr3; + +#define INVPCID_ADDR (0) +#define INVPCID_ID (1) +#define INVPCID_ALL_GLOBAL (2) +#define INVPCID_ALL_NONGLOBAL (3) + +extern void invpcid_insn(uint64_t, uint64_t, uintptr_t); +extern void tr_mmu_flush_user_range(uint64_t, size_t, size_t, uint64_t); #if defined(__GNUC__) #include <asm/mmu.h> diff --git a/usr/src/uts/i86pc/sys/rm_platter.h b/usr/src/uts/i86pc/sys/rm_platter.h index 15ab068854..55a58095af 100644 --- a/usr/src/uts/i86pc/sys/rm_platter.h +++ b/usr/src/uts/i86pc/sys/rm_platter.h @@ -115,7 +115,7 @@ typedef struct rm_platter { struct cpu_tables { /* IST stacks */ char ct_stack1[DEFAULTSTKSZ]; /* dblfault */ -#if defined(__amd64) && !defined(__xpv) +#if !defined(__xpv) char ct_stack2[DEFAULTSTKSZ]; /* nmi */ char ct_stack3[DEFAULTSTKSZ]; /* mce */ #endif diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c index 8690c46adf..2bac383b9c 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.c +++ b/usr/src/uts/i86pc/vm/hat_i86.c @@ -301,10 +301,10 @@ static x86pte_t hati_update_pte(htable_t *ht, uint_t entry, x86pte_t expected, x86pte_t new); /* - * The kernel address space exists in all HATs. To implement this the - * kernel reserves a fixed number of entries in the topmost level(s) of page - * tables. The values are setup during startup and then copied to every user - * hat created by hat_alloc(). This means that kernelbase must be: + * The kernel address space exists in all non-HAT_COPIED HATs. To implement this + * the kernel reserves a fixed number of entries in the topmost level(s) of page + * tables. The values are setup during startup and then copied to every user hat + * created by hat_alloc(). This means that kernelbase must be: * * 4Meg aligned for 32 bit kernels * 512Gig aligned for x86_64 64 bit kernel @@ -377,12 +377,6 @@ struct hatstats hatstat; */ int pt_kern; -/* - * useful stuff for atomic access/clearing/setting REF/MOD/RO bits in page_t's. - */ -extern void atomic_orb(uchar_t *addr, uchar_t val); -extern void atomic_andb(uchar_t *addr, uchar_t val); - #ifndef __xpv extern pfn_t memseg_get_start(struct memseg *); #endif @@ -494,7 +488,6 @@ hat_alloc(struct as *as) hat->hat_num_copied = 0; hat->hat_flags = 0; #else /* __xpv */ -#if defined(__amd64) /* * All processes use HAT_COPIED on the 64-bit kernel if KPTI is @@ -521,17 +514,6 @@ hat_alloc(struct as *as) hat->hat_flags = 0; HATSTAT_INC(hs_hat_normal64); } -#elif defined(__i386) - use_copied = mmu.pae_hat; - if (use_copied) { - use_hat32_cache = B_TRUE; - hat->hat_num_copied = mmu.num_copied_ents; - HATSTAT_INC(hs_hat_copied32); - } else { - use_hat32_cache = B_FALSE; - hat->hat_num_copied = 0; - } -#endif #endif /* __xpv */ if (use_copied) { hat->hat_flags |= HAT_COPIED; @@ -830,7 +812,6 @@ mmu_calc_user_slots(void) */ mmu.top_level_uslots = ent + 1; -#if defined(__amd64) /* * When running 32-bit compatability processes on a 64-bit kernel, we * will only need to use one slot. 
@@ -844,16 +825,6 @@ mmu_calc_user_slots(void) */ mmu.num_copied_ents = mmu.top_level_uslots; mmu.num_copied_ents32 = 4; -#elif defined(__xpv) - /* - * - */ - if (mmu.pae_hat) { - mmu.num_copied_ents = 4; - } else { - mmu.num_copied_ents = 0; - } -#endif } /* @@ -875,7 +846,7 @@ mmu_init(void) (getcr4() & CR4_PGE) != 0) mmu.pt_global = PT_GLOBAL; -#if defined(__amd64) && !defined(__xpv) +#if !defined(__xpv) /* * The 64-bit x86 kernel has split user/kernel page tables. As such we * cannot have the global bit set. The simplest way for us to deal with @@ -1302,8 +1273,6 @@ hat_pcp_teardown(cpu_t *cpu) ++r; \ } -extern uint64_t kpti_safe_cr3; - /* * Finish filling in the kernel hat. * Pre fill in all top level kernel page table entries for the kernel's @@ -1420,9 +1389,10 @@ hat_init_finish(void) #endif hat_kmap_init((uintptr_t)segmap_start, size); -#if defined(__amd64) && !defined(__xpv) +#if !defined(__xpv) ASSERT3U(kas.a_hat->hat_htable->ht_pfn, !=, PFN_INVALID); - ASSERT3U(kpti_safe_cr3, ==, MAKECR3(kas.a_hat->hat_htable->ht_pfn)); + ASSERT3U(kpti_safe_cr3, ==, + MAKECR3(kas.a_hat->hat_htable->ht_pfn, PCID_KERNEL)); #endif } @@ -1517,7 +1487,7 @@ hat_pcp_update(cpu_t *cpu, const hat_t *hat) } static void -reset_kpti(struct kpti_frame *fr, uint64_t kcr3) +reset_kpti(struct kpti_frame *fr, uint64_t kcr3, uint64_t ucr3) { ASSERT3U(fr->kf_tr_flag, ==, 0); #if DEBUG @@ -1533,7 +1503,7 @@ reset_kpti(struct kpti_frame *fr, uint64_t kcr3) offsetof(struct kpti_frame, kf_unused)); fr->kf_kernel_cr3 = kcr3; - fr->kf_user_cr3 = 0; + fr->kf_user_cr3 = ucr3; fr->kf_tr_ret_rsp = (uintptr_t)&fr->kf_tr_rsp; fr->kf_lower_redzone = 0xdeadbeefdeadbeef; @@ -1541,18 +1511,83 @@ reset_kpti(struct kpti_frame *fr, uint64_t kcr3) fr->kf_upper_redzone = 0xdeadbeefdeadbeef; } +#ifdef __xpv +static void +hat_switch_xen(hat_t *hat) +{ + struct mmuext_op t[2]; + uint_t retcnt; + uint_t opcnt = 1; + uint64_t newcr3; + + ASSERT(!(hat->hat_flags & HAT_COPIED)); + ASSERT(!(getcr4() & CR4_PCIDE)); + + newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn, PCID_NONE); + + t[0].cmd = MMUEXT_NEW_BASEPTR; + t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3)); + + /* + * There's an interesting problem here, as to what to actually specify + * when switching to the kernel hat. For now we'll reuse the kernel hat + * again. + */ + t[1].cmd = MMUEXT_NEW_USER_BASEPTR; + if (hat == kas.a_hat) + t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3)); + else + t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable); + ++opcnt; + + if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0) + panic("HYPERVISOR_mmu_update() failed"); + ASSERT(retcnt == opcnt); +} +#endif /* __xpv */ + /* * Switch to a new active hat, maintaining bit masks to track active CPUs. * - * On the 32-bit PAE hypervisor, %cr3 is a 64-bit value, on metal it - * remains a 32-bit value. + * With KPTI, all our HATs except kas should be using PCP. Thus, to switch + * HATs, we need to copy over the new user PTEs, then set our trampoline context + * as appropriate. + * + * If lacking PCID, we then load our new cr3, which will flush the TLB: we may + * have established userspace TLB entries via kernel accesses, and these are no + * longer valid. We have to do this eagerly, as we just deleted this CPU from + * ->hat_cpus, so would no longer see any TLB shootdowns. + * + * With PCID enabled, things get a little more complicated. 
We would like to + * keep TLB context around when entering and exiting the kernel, and to do this, + * we partition the TLB into two different spaces: + * + * PCID_KERNEL is defined as zero, and used both by kas and all other address + * spaces while in the kernel (post-trampoline). + * + * PCID_USER is used while in userspace. Therefore, userspace cannot use any + * lingering PCID_KERNEL entries to kernel addresses it should not be able to + * read. + * + * The trampoline cr3s are set not to invalidate on a mov to %cr3. This means if + * we take a journey through the kernel without switching HATs, we have some + * hope of keeping our TLB state around. + * + * On a hat switch, rather than deal with any necessary flushes on the way out + * of the trampolines, we do them upfront here. If we're switching from kas, we + * shouldn't need any invalidation. + * + * Otherwise, we can have stale userspace entries for both PCID_USER (what + * happened before we move onto the kcr3) and PCID_KERNEL (any subsequent + * userspace accesses such as ddi_copyin()). Since setcr3() won't do these + * flushes on its own in PCIDE, we'll do a non-flushing load and then + * invalidate everything. */ void hat_switch(hat_t *hat) { - uint64_t newcr3; - cpu_t *cpu = CPU; - hat_t *old = cpu->cpu_current_hat; + cpu_t *cpu = CPU; + hat_t *old = cpu->cpu_current_hat; /* * set up this information first, so we don't miss any cross calls @@ -1572,54 +1607,63 @@ hat_switch(hat_t *hat) } cpu->cpu_current_hat = hat; - /* - * now go ahead and load cr3 - */ +#if defined(__xpv) + hat_switch_xen(hat); +#else + struct hat_cpu_info *info = cpu->cpu_m.mcpu_hat_info; + uint64_t pcide = getcr4() & CR4_PCIDE; + uint64_t kcr3, ucr3; + pfn_t tl_kpfn; + ulong_t flag; + + EQUIV(kpti_enable, !mmu.pt_global); + if (hat->hat_flags & HAT_COPIED) { hat_pcp_update(cpu, hat); - newcr3 = MAKECR3(cpu->cpu_hat_info->hci_pcp_l3pfn); + tl_kpfn = info->hci_pcp_l3pfn; } else { - newcr3 = MAKECR3((uint64_t)hat->hat_htable->ht_pfn); + IMPLY(kpti_enable, hat == kas.a_hat); + tl_kpfn = hat->hat_htable->ht_pfn; } -#ifdef __xpv - { - struct mmuext_op t[2]; - uint_t retcnt; - uint_t opcnt = 1; - t[0].cmd = MMUEXT_NEW_BASEPTR; - t[0].arg1.mfn = mmu_btop(pa_to_ma(newcr3)); + if (pcide) { + ASSERT(kpti_enable); - /* - * There's an interesting problem here, as to what to - * actually specify when switching to the kernel hat. - * For now we'll reuse the kernel hat again. - */ - t[1].cmd = MMUEXT_NEW_USER_BASEPTR; - if (hat == kas.a_hat) - t[1].arg1.mfn = mmu_btop(pa_to_ma(newcr3)); - else - t[1].arg1.mfn = pfn_to_mfn(hat->hat_user_ptable); - ++opcnt; + kcr3 = MAKECR3(tl_kpfn, PCID_KERNEL) | CR3_NOINVL_BIT; + ucr3 = MAKECR3(info->hci_user_l3pfn, PCID_USER) | + CR3_NOINVL_BIT; - if (HYPERVISOR_mmuext_op(t, opcnt, &retcnt, DOMID_SELF) < 0) - panic("HYPERVISOR_mmu_update() failed"); - ASSERT(retcnt == opcnt); + setcr3(kcr3); + if (old != kas.a_hat) + mmu_flush_tlb(FLUSH_TLB_ALL, NULL); + } else { + kcr3 = MAKECR3(tl_kpfn, PCID_NONE); + ucr3 = kpti_enable ? 
+ MAKECR3(info->hci_user_l3pfn, PCID_NONE) : + 0; + setcr3(kcr3); } -#else - setcr3(newcr3); - reset_kpti(&cpu->cpu_m.mcpu_kpti, newcr3); - reset_kpti(&cpu->cpu_m.mcpu_kpti_flt, newcr3); - reset_kpti(&cpu->cpu_m.mcpu_kpti_dbg, newcr3); - - if (kpti_enable == 1) { - newcr3 = MAKECR3(cpu->cpu_hat_info->hci_user_l3pfn); - cpu->cpu_m.mcpu_kpti.kf_user_cr3 = newcr3; - cpu->cpu_m.mcpu_kpti_dbg.kf_user_cr3 = newcr3; - cpu->cpu_m.mcpu_kpti_flt.kf_user_cr3 = newcr3; - } -#endif + + /* + * We will already be taking shootdowns for our new HAT, and as KPTI + * invpcid emulation needs to use kf_user_cr3, make sure we don't get + * any cross calls while we're inconsistent. Note that it's harmless to + * have a *stale* kf_user_cr3 (we just did a FLUSH_TLB_ALL), but a + * *zero* kf_user_cr3 is not going to go very well. + */ + if (pcide) + flag = intr_clear(); + + reset_kpti(&cpu->cpu_m.mcpu_kpti, kcr3, ucr3); + reset_kpti(&cpu->cpu_m.mcpu_kpti_flt, kcr3, ucr3); + reset_kpti(&cpu->cpu_m.mcpu_kpti_dbg, kcr3, ucr3); + + if (pcide) + intr_restore(flag); + +#endif /* !__xpv */ + ASSERT(cpu == CPU); } @@ -2490,29 +2534,17 @@ hat_unlock_region(struct hat *hat, caddr_t addr, size_t len, panic("No shared region support on x86"); } -/* - * A range of virtual pages for purposes of demapping. - */ -typedef struct range_info { - uintptr_t rng_va; /* address of page */ - ulong_t rng_cnt; /* number of pages in range */ - level_t rng_level; /* page table level */ -} range_info_t; - #if !defined(__xpv) /* * Cross call service routine to demap a range of virtual * pages on the current CPU or flush all mappings in TLB. */ -/*ARGSUSED*/ static int hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) { + _NOTE(ARGUNUSED(a3)); hat_t *hat = (hat_t *)a1; - range_info_t *range = (range_info_t *)a2; - size_t len = (size_t)a3; - caddr_t addr = (caddr_t)range->rng_va; - size_t pgsz = LEVEL_SIZE(range->rng_level); + tlb_range_t *range = (tlb_range_t *)a2; /* * If the target hat isn't the kernel and this CPU isn't operating @@ -2521,20 +2553,16 @@ hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) if (hat != kas.a_hat && hat != CPU->cpu_current_hat) return (0); - /* - * For a normal address, we flush a range of contiguous mappings - */ - if ((uintptr_t)addr != DEMAP_ALL_ADDR) { - for (size_t i = 0; i < len; i += pgsz) - mmu_tlbflush_entry(addr + i); + if (range->tr_va != DEMAP_ALL_ADDR) { + mmu_flush_tlb(FLUSH_TLB_RANGE, range); return (0); } /* - * Otherwise we reload cr3 to effect a complete TLB flush. + * We are flushing all of userspace. * - * A reload of cr3 when using PCP also means we must also recopy in the - * pte values from the struct hat + * When using PCP, we first need to update this CPU's idea of the PCP + * PTEs. */ if (hat->hat_flags & HAT_COPIED) { #if defined(__amd64) @@ -2543,34 +2571,13 @@ hati_demap_func(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) reload_pae32(hat, CPU); #endif } - reload_cr3(); - return (0); -} -/* - * Flush all TLB entries, including global (ie. kernel) ones. 
- */ -static void -flush_all_tlb_entries(void) -{ - ulong_t cr4 = getcr4(); - - if (cr4 & CR4_PGE) { - setcr4(cr4 & ~(ulong_t)CR4_PGE); - setcr4(cr4); - - /* - * 32 bit PAE also needs to always reload_cr3() - */ - if (mmu.max_level == 2) - reload_cr3(); - } else { - reload_cr3(); - } + mmu_flush_tlb(FLUSH_TLB_NONGLOBAL, NULL); + return (0); } -#define TLB_CPU_HALTED (01ul) -#define TLB_INVAL_ALL (02ul) +#define TLBIDLE_CPU_HALTED (0x1UL) +#define TLBIDLE_INVAL_ALL (0x2UL) #define CAS_TLB_INFO(cpu, old, new) \ atomic_cas_ulong((ulong_t *)&(cpu)->cpu_m.mcpu_tlb_info, (old), (new)) @@ -2580,7 +2587,8 @@ flush_all_tlb_entries(void) void tlb_going_idle(void) { - atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info, TLB_CPU_HALTED); + atomic_or_ulong((ulong_t *)&CPU->cpu_m.mcpu_tlb_info, + TLBIDLE_CPU_HALTED); } /* @@ -2597,19 +2605,19 @@ tlb_service(void) * We only have to do something if coming out of being idle. */ tlb_info = CPU->cpu_m.mcpu_tlb_info; - if (tlb_info & TLB_CPU_HALTED) { + if (tlb_info & TLBIDLE_CPU_HALTED) { ASSERT(CPU->cpu_current_hat == kas.a_hat); /* * Atomic clear and fetch of old state. */ while ((found = CAS_TLB_INFO(CPU, tlb_info, 0)) != tlb_info) { - ASSERT(found & TLB_CPU_HALTED); + ASSERT(found & TLBIDLE_CPU_HALTED); tlb_info = found; SMT_PAUSE(); } - if (tlb_info & TLB_INVAL_ALL) - flush_all_tlb_entries(); + if (tlb_info & TLBIDLE_INVAL_ALL) + mmu_flush_tlb(FLUSH_TLB_ALL, NULL); } } #endif /* !__xpv */ @@ -2619,13 +2627,12 @@ tlb_service(void) * all CPUs using a given hat. */ void -hat_tlb_inval_range(hat_t *hat, range_info_t *range) +hat_tlb_inval_range(hat_t *hat, tlb_range_t *in_range) { extern int flushes_require_xcalls; /* from mp_startup.c */ cpuset_t justme; cpuset_t cpus_to_shootdown; - uintptr_t va = range->rng_va; - size_t len = range->rng_cnt << LEVEL_SHIFT(range->rng_level); + tlb_range_t range = *in_range; #ifndef __xpv cpuset_t check_cpus; cpu_t *cpup; @@ -2646,7 +2653,7 @@ hat_tlb_inval_range(hat_t *hat, range_info_t *range) */ if (hat->hat_flags & HAT_SHARED) { hat = kas.a_hat; - va = DEMAP_ALL_ADDR; + range.tr_va = DEMAP_ALL_ADDR; } /* @@ -2654,15 +2661,16 @@ hat_tlb_inval_range(hat_t *hat, range_info_t *range) */ if (panicstr || !flushes_require_xcalls) { #ifdef __xpv - if (va == DEMAP_ALL_ADDR) { + if (range.tr_va == DEMAP_ALL_ADDR) { xen_flush_tlb(); } else { - for (size_t i = 0; i < len; i += MMU_PAGESIZE) - xen_flush_va((caddr_t)(va + i)); + for (size_t i = 0; i < TLB_RANGE_LEN(&range); + i += MMU_PAGESIZE) { + xen_flush_va((caddr_t)(range.tr_va + i)); + } } #else - (void) hati_demap_func((xc_arg_t)hat, - (xc_arg_t)range, (xc_arg_t)len); + (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0); #endif return; } @@ -2696,13 +2704,13 @@ hat_tlb_inval_range(hat_t *hat, range_info_t *range) continue; tlb_info = cpup->cpu_m.mcpu_tlb_info; - while (tlb_info == TLB_CPU_HALTED) { - (void) CAS_TLB_INFO(cpup, TLB_CPU_HALTED, - TLB_CPU_HALTED | TLB_INVAL_ALL); + while (tlb_info == TLBIDLE_CPU_HALTED) { + (void) CAS_TLB_INFO(cpup, TLBIDLE_CPU_HALTED, + TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL); SMT_PAUSE(); tlb_info = cpup->cpu_m.mcpu_tlb_info; } - if (tlb_info == (TLB_CPU_HALTED | TLB_INVAL_ALL)) { + if (tlb_info == (TLBIDLE_CPU_HALTED | TLBIDLE_INVAL_ALL)) { HATSTAT_INC(hs_tlb_inval_delayed); CPUSET_DEL(cpus_to_shootdown, c); } @@ -2713,31 +2721,33 @@ hat_tlb_inval_range(hat_t *hat, range_info_t *range) CPUSET_ISEQUAL(cpus_to_shootdown, justme)) { #ifdef __xpv - if (va == DEMAP_ALL_ADDR) { + if (range.tr_va == DEMAP_ALL_ADDR) { xen_flush_tlb(); } else 
{ - for (size_t i = 0; i < len; i += MMU_PAGESIZE) - xen_flush_va((caddr_t)(va + i)); + for (size_t i = 0; i < TLB_RANGE_LEN(&range); + i += MMU_PAGESIZE) { + xen_flush_va((caddr_t)(range.tr_va + i)); + } } #else - (void) hati_demap_func((xc_arg_t)hat, - (xc_arg_t)range, (xc_arg_t)len); + (void) hati_demap_func((xc_arg_t)hat, (xc_arg_t)&range, 0); #endif } else { CPUSET_ADD(cpus_to_shootdown, CPU->cpu_id); #ifdef __xpv - if (va == DEMAP_ALL_ADDR) { + if (range.tr_va == DEMAP_ALL_ADDR) { xen_gflush_tlb(cpus_to_shootdown); } else { - for (size_t i = 0; i < len; i += MMU_PAGESIZE) { - xen_gflush_va((caddr_t)(va + i), + for (size_t i = 0; i < TLB_RANGE_LEN(&range); + i += MMU_PAGESIZE) { + xen_gflush_va((caddr_t)(range.tr_va + i), cpus_to_shootdown); } } #else - xc_call((xc_arg_t)hat, (xc_arg_t)range, (xc_arg_t)len, + xc_call((xc_arg_t)hat, (xc_arg_t)&range, 0, CPUSET2BV(cpus_to_shootdown), hati_demap_func); #endif @@ -2751,10 +2761,10 @@ hat_tlb_inval(hat_t *hat, uintptr_t va) /* * Create range for a single page. */ - range_info_t range; - range.rng_va = va; - range.rng_cnt = 1; /* one page */ - range.rng_level = MIN_PAGE_LEVEL; /* pages are MMU_PAGESIZE */ + tlb_range_t range; + range.tr_va = va; + range.tr_cnt = 1; /* one page */ + range.tr_level = MIN_PAGE_LEVEL; /* pages are MMU_PAGESIZE */ hat_tlb_inval_range(hat, &range); } @@ -2927,17 +2937,17 @@ hat_unload(hat_t *hat, caddr_t addr, size_t len, uint_t flags) * for the specified ranges of contiguous pages. */ static void -handle_ranges(hat_t *hat, hat_callback_t *cb, uint_t cnt, range_info_t *range) +handle_ranges(hat_t *hat, hat_callback_t *cb, uint_t cnt, tlb_range_t *range) { while (cnt > 0) { --cnt; hat_tlb_inval_range(hat, &range[cnt]); if (cb != NULL) { - cb->hcb_start_addr = (caddr_t)range[cnt].rng_va; + cb->hcb_start_addr = (caddr_t)range[cnt].tr_va; cb->hcb_end_addr = cb->hcb_start_addr; - cb->hcb_end_addr += range[cnt].rng_cnt << - LEVEL_SHIFT(range[cnt].rng_level); + cb->hcb_end_addr += range[cnt].tr_cnt << + LEVEL_SHIFT(range[cnt].tr_level); cb->hcb_function(cb); } } @@ -2967,7 +2977,7 @@ hat_unload_callback( htable_t *ht = NULL; uint_t entry; uintptr_t contig_va = (uintptr_t)-1L; - range_info_t r[MAX_UNLOAD_CNT]; + tlb_range_t r[MAX_UNLOAD_CNT]; uint_t r_cnt = 0; x86pte_t old_pte; @@ -3007,14 +3017,14 @@ hat_unload_callback( * We'll do the call backs for contiguous ranges */ if (vaddr != contig_va || - (r_cnt > 0 && r[r_cnt - 1].rng_level != ht->ht_level)) { + (r_cnt > 0 && r[r_cnt - 1].tr_level != ht->ht_level)) { if (r_cnt == MAX_UNLOAD_CNT) { handle_ranges(hat, cb, r_cnt, r); r_cnt = 0; } - r[r_cnt].rng_va = vaddr; - r[r_cnt].rng_cnt = 0; - r[r_cnt].rng_level = ht->ht_level; + r[r_cnt].tr_va = vaddr; + r[r_cnt].tr_cnt = 0; + r[r_cnt].tr_level = ht->ht_level; ++r_cnt; } @@ -3032,7 +3042,7 @@ hat_unload_callback( ASSERT(ht->ht_level <= mmu.max_page_level); vaddr += LEVEL_SIZE(ht->ht_level); contig_va = vaddr; - ++r[r_cnt - 1].rng_cnt; + ++r[r_cnt - 1].tr_cnt; } if (ht) htable_release(ht); @@ -3061,14 +3071,14 @@ hat_flush_range(hat_t *hat, caddr_t va, size_t size) #ifdef __xpv xen_flush_tlb(); #else - flush_all_tlb_entries(); + mmu_flush_tlb(FLUSH_TLB_ALL, NULL); #endif break; } #ifdef __xpv xen_flush_va(va); #else - mmu_tlbflush_entry(va); + mmu_flush_tlb_kpage((uintptr_t)va); #endif va += sz; } @@ -3734,7 +3744,7 @@ hat_unshare(hat_t *hat, caddr_t addr, size_t len, uint_t ismszc) /* * flush the TLBs - since we're probably dealing with MANY mappings - * we do just one CR3 reload. + * we just do a full invalidation. 
*/ if (!(hat->hat_flags & HAT_FREEING) && need_demaps) hat_tlb_inval(hat, DEMAP_ALL_ADDR); @@ -4553,7 +4563,7 @@ hat_mempte_release(caddr_t addr, hat_mempte_t pte_pa) *pteptr = 0; else *(x86pte32_t *)pteptr = 0; - mmu_tlbflush_entry(addr); + mmu_flush_tlb_kpage((uintptr_t)addr); x86pte_mapout(); } #endif @@ -4614,7 +4624,7 @@ hat_mempte_remap( *(x86pte_t *)pteptr = pte; else *(x86pte32_t *)pteptr = (x86pte32_t)pte; - mmu_tlbflush_entry(addr); + mmu_flush_tlb_kpage((uintptr_t)addr); x86pte_mapout(); } #endif @@ -5125,7 +5135,7 @@ hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs) ASSERT3S(kpti_enable, ==, 1); ASSERT3P(cpu_hat, !=, NULL); ASSERT3U(cpu_hat->hat_flags & HAT_PCP, ==, HAT_PCP); - ASSERT3U(va & (MMU_PAGESIZE - 1), ==, 0); + ASSERT3U(va & MMU_PAGEOFFSET, ==, 0); pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); VERIFY3U(pfn, !=, PFN_INVALID); diff --git a/usr/src/uts/i86pc/vm/hat_i86.h b/usr/src/uts/i86pc/vm/hat_i86.h index 2bcac4ec61..16ad6aca33 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.h +++ b/usr/src/uts/i86pc/vm/hat_i86.h @@ -75,17 +75,10 @@ extern "C" { */ #define MAX_COPIED_PTES 1 #else -#if defined(__amd64) /* * The 64-bit kernel may have up to 512 PTEs present in it for a given process. */ #define MAX_COPIED_PTES 512 -#elif defined(__i386) -/* - * The 32-bit kernel always uses 4 PTEs for this. - */ -#define MAX_COPIED_PTES 4 -#endif /* __amd64 */ #endif /* __xpv */ #define TOP_LEVEL(h) (((h)->hat_max_level)) @@ -254,7 +247,6 @@ extern void halt(char *fmt); extern void hat_kern_alloc(caddr_t segmap_base, size_t segmap_size, caddr_t ekernelheap); extern void hat_kern_setup(void); -extern void hat_tlb_inval(struct hat *hat, uintptr_t va); extern void hat_pte_unmap(htable_t *ht, uint_t entry, uint_t flags, x86pte_t old_pte, void *pte_ptr, boolean_t tlb); extern void hat_init_finish(void); @@ -266,34 +258,35 @@ extern void hat_kmap_init(uintptr_t base, size_t len); extern hment_t *hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry); -#if defined(__amd64) -extern void hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs); extern void mmu_calc_user_slots(void); -#endif +extern void hat_tlb_inval(struct hat *hat, uintptr_t va); +extern void hat_switch(struct hat *hat); -#if !defined(__xpv) -/* - * routines to deal with delayed TLB invalidations for idle CPUs - */ -extern void tlb_going_idle(void); -extern void tlb_service(void); -#endif +#define TLB_RANGE_LEN(r) ((r)->tr_cnt << LEVEL_SHIFT((r)->tr_level)) /* - * Hat switch function invoked to load a new context into %cr3 + * A range of virtual pages for purposes of demapping. */ -extern void hat_switch(struct hat *hat); +typedef struct tlb_range { + uintptr_t tr_va; /* address of page */ + ulong_t tr_cnt; /* number of pages in range */ + int8_t tr_level; /* page table level */ +} tlb_range_t; + +#if defined(__xpv) + +#define XPV_DISALLOW_MIGRATE() xen_block_migrate() +#define XPV_ALLOW_MIGRATE() xen_allow_migrate() + +#define mmu_flush_tlb_page(va) mmu_invlpg((caddr_t)va) +#define mmu_flush_tlb_kpage(va) mmu_invlpg((caddr_t)va) -#ifdef __xpv /* * Interfaces to use around code that maps/unmaps grant table references. 
*/ extern void hat_prepare_mapping(hat_t *, caddr_t, uint64_t *); extern void hat_release_mapping(hat_t *, caddr_t); -#define XPV_DISALLOW_MIGRATE() xen_block_migrate() -#define XPV_ALLOW_MIGRATE() xen_allow_migrate() - #else #define XPV_DISALLOW_MIGRATE() /* nothing */ @@ -301,8 +294,25 @@ extern void hat_release_mapping(hat_t *, caddr_t); #define pfn_is_foreign(pfn) __lintzero -#endif +typedef enum flush_tlb_type { + FLUSH_TLB_ALL = 1, + FLUSH_TLB_NONGLOBAL = 2, + FLUSH_TLB_RANGE = 3, +} flush_tlb_type_t; + +extern void mmu_flush_tlb(flush_tlb_type_t, tlb_range_t *); +extern void mmu_flush_tlb_kpage(uintptr_t); +extern void mmu_flush_tlb_page(uintptr_t); + +extern void hati_cpu_punchin(cpu_t *cpu, uintptr_t va, uint_t attrs); + +/* + * routines to deal with delayed TLB invalidations for idle CPUs + */ +extern void tlb_going_idle(void); +extern void tlb_service(void); +#endif /* !__xpv */ #endif /* _KERNEL */ diff --git a/usr/src/uts/i86pc/vm/hat_kdi.c b/usr/src/uts/i86pc/vm/hat_kdi.c index 986bcb579e..ae0571e645 100644 --- a/usr/src/uts/i86pc/vm/hat_kdi.c +++ b/usr/src/uts/i86pc/vm/hat_kdi.c @@ -22,6 +22,8 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ /* @@ -177,7 +179,7 @@ kdi_vtop(uintptr_t va, uint64_t *pap) #if defined(__xpv) *pap = pfn_to_pa(CPU->cpu_current_hat->hat_htable->ht_pfn); #else - *pap = getcr3() & MMU_PAGEMASK; + *pap = getcr3_pa(); #endif for (level = mmu.max_level; ; --level) { index = (va >> LEVEL_SHIFT(level)) & (mmu.ptes_per_table - 1); @@ -249,7 +251,7 @@ kdi_prw(caddr_t buf, size_t nbytes, uint64_t pa, size_t *ncopiedp, int doread) *hat_kdi_pte = pte; else *(x86pte32_t *)hat_kdi_pte = pte; - mmu_tlbflush_entry((caddr_t)hat_kdi_page); + mmu_flush_tlb_kpage(hat_kdi_page); #endif bcopy(from, to, sz); @@ -268,7 +270,7 @@ kdi_prw(caddr_t buf, size_t nbytes, uint64_t pa, size_t *ncopiedp, int doread) *hat_kdi_pte = 0; else *(x86pte32_t *)hat_kdi_pte = 0; - mmu_tlbflush_entry((caddr_t)hat_kdi_page); + mmu_flush_tlb_kpage(hat_kdi_page); #endif buf += sz; @@ -296,6 +298,19 @@ kdi_pwrite(caddr_t buf, size_t nbytes, uint64_t addr, size_t *ncopiedp) return (kdi_prw(buf, nbytes, addr, ncopiedp, 0)); } +#if !defined(__xpv) +/* + * This gets used for flushing the TLB on all the slaves just prior to doing a + * kdi_prw(). It's unclear why this was originally done, since kdi_prw() itself + * will flush any lingering hat_kdi_page mappings, but let's presume it was a + * good idea. + */ +void +kdi_flush_caches(void) +{ + mmu_flush_tlb(FLUSH_TLB_ALL, NULL); +} +#endif /* * Return the number of bytes, relative to the beginning of a given range, that diff --git a/usr/src/uts/i86pc/vm/hat_pte.h b/usr/src/uts/i86pc/vm/hat_pte.h index 121d96cf84..b65a69cb51 100644 --- a/usr/src/uts/i86pc/vm/hat_pte.h +++ b/usr/src/uts/i86pc/vm/hat_pte.h @@ -156,9 +156,10 @@ typedef int8_t level_t; #define PFN_ABOVE64G(pfn) ((pfn) >= PFN_64G) /* - * The CR3 register holds the physical address of the top level page table. + * The CR3 register holds the physical address of the top level page table, + * along with the current PCID if any. 
*/ -#define MAKECR3(pfn) mmu_ptob(pfn) +#define MAKECR3(pfn, pcid) (mmu_ptob(pfn) | pcid) /* * HAT/MMU parameters that depend on kernel mode and/or processor type @@ -178,12 +179,10 @@ struct hat_mmu_info { uint_t top_level_count; /* # of entries in top-level page table */ uint_t top_level_uslots; /* # of user slots in top-level page table */ uint_t num_copied_ents; /* # of PCP-copied PTEs to create */ -#if defined(__amd64) /* 32-bit versions of values */ uint_t top_level_uslots32; uint_t max_level32; uint_t num_copied_ents32; -#endif uint_t hash_cnt; /* cnt of entries in htable_hash_cache */ uint_t hat32_hash_cnt; /* cnt of entries in 32-bit htable_hash_cache */ diff --git a/usr/src/uts/i86pc/vm/htable.c b/usr/src/uts/i86pc/vm/htable.c index b294597eba..a2d59d98ab 100644 --- a/usr/src/uts/i86pc/vm/htable.c +++ b/usr/src/uts/i86pc/vm/htable.c @@ -137,7 +137,7 @@ xen_flush_va(caddr_t va) uint_t count; if (IN_XPV_PANIC()) { - mmu_tlbflush_entry((caddr_t)va); + mmu_flush_tlb_page((uintptr_t)va); } else { t.cmd = MMUEXT_INVLPG_LOCAL; t.arg1.linear_addr = (uintptr_t)va; @@ -154,7 +154,7 @@ xen_gflush_va(caddr_t va, cpuset_t cpus) uint_t count; if (IN_XPV_PANIC()) { - mmu_tlbflush_entry((caddr_t)va); + mmu_flush_tlb_page((uintptr_t)va); return; } @@ -1989,7 +1989,10 @@ x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht) * Disable preemption and grab the CPU's hci_mutex */ kpreempt_disable(); + ASSERT(CPU->cpu_hat_info != NULL); + ASSERT(!(getcr4() & CR4_PCIDE)); + mutex_enter(&CPU->cpu_hat_info->hci_mutex); x = PWIN_TABLE(CPU->cpu_id); pteptr = (x86pte_t *)PWIN_PTE_VA(x); @@ -2024,7 +2027,7 @@ x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht) else *(x86pte32_t *)pteptr = newpte; XPV_DISALLOW_PAGETABLE_UPDATES(); - mmu_tlbflush_entry((caddr_t)(PWIN_VA(x))); + mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x)); } } return (PT_INDEX_PTR(PWIN_VA(x), index)); @@ -2137,7 +2140,7 @@ x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr) xen_flush_va((caddr_t)addr); else #endif - mmu_tlbflush_entry((caddr_t)addr); + mmu_flush_tlb_page(addr); goto done; } @@ -2380,6 +2383,8 @@ x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count) } else { uint_t x = PWIN_SRC(CPU->cpu_id); + ASSERT(!(getcr4() & CR4_PCIDE)); + /* * Finish defining the src pagetable mapping */ @@ -2390,7 +2395,7 @@ x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count) *pteptr = pte; else *(x86pte32_t *)pteptr = pte; - mmu_tlbflush_entry((caddr_t)(PWIN_VA(x))); + mmu_flush_tlb_kpage((uintptr_t)PWIN_VA(x)); } /* diff --git a/usr/src/uts/i86pc/vm/htable.h b/usr/src/uts/i86pc/vm/htable.h index d9b91189c9..8f4aac7e39 100644 --- a/usr/src/uts/i86pc/vm/htable.h +++ b/usr/src/uts/i86pc/vm/htable.h @@ -42,7 +42,6 @@ extern void atomic_andb(uint8_t *addr, uint8_t value); extern void atomic_orb(uint8_t *addr, uint8_t value); extern void atomic_inc16(uint16_t *addr); extern void atomic_dec16(uint16_t *addr); -extern void mmu_tlbflush_entry(caddr_t addr); /* * Each hardware page table has an htable_t describing it. diff --git a/usr/src/uts/i86pc/vm/i86_mmu.c b/usr/src/uts/i86pc/vm/i86_mmu.c index a8f9c46805..e413617db8 100644 --- a/usr/src/uts/i86pc/vm/i86_mmu.c +++ b/usr/src/uts/i86pc/vm/i86_mmu.c @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/t_lock.h> @@ -61,92 +63,9 @@ #include <sys/hypervisor.h> #endif -caddr_t -i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot) -{ - caddr_t addr; - caddr_t addr1; - page_t *pp; - - addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP); - - for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) { - pp = page_numtopp_nolock(pf); - if (pp == NULL) { - hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf, - prot | HAT_NOSYNC, HAT_LOAD_LOCK); - } else { - hat_memload(kas.a_hat, addr, pp, - prot | HAT_NOSYNC, HAT_LOAD_LOCK); - } - } - - return (addr1); -} - -/* - * This routine is like page_numtopp, but accepts only free pages, which - * it allocates (unfrees) and returns with the exclusive lock held. - * It is used by machdep.c/dma_init() to find contiguous free pages. - * - * XXX this and some others should probably be in vm_machdep.c - */ -page_t * -page_numtopp_alloc(pfn_t pfnum) -{ - page_t *pp; - -retry: - pp = page_numtopp_nolock(pfnum); - if (pp == NULL) { - return (NULL); - } - - if (!page_trylock(pp, SE_EXCL)) { - return (NULL); - } - - if (page_pptonum(pp) != pfnum) { - page_unlock(pp); - goto retry; - } - - if (!PP_ISFREE(pp)) { - page_unlock(pp); - return (NULL); - } - if (pp->p_szc) { - page_demote_free_pages(pp); - page_unlock(pp); - goto retry; - } - - /* If associated with a vnode, destroy mappings */ - - if (pp->p_vnode) { - - page_destroy_free(pp); - - if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) { - return (NULL); - } - - if (page_pptonum(pp) != pfnum) { - page_unlock(pp); - goto retry; - } - } - - if (!PP_ISFREE(pp)) { - page_unlock(pp); - return (NULL); - } - - if (!page_reclaim(pp, (kmutex_t *)NULL)) - return (NULL); - - return (pp); -} +#define ON_USER_HAT(cpu) \ + ((cpu)->cpu_m.mcpu_current_hat != NULL && \ + (cpu)->cpu_m.mcpu_current_hat != kas.a_hat) /* * Flag is not set early in boot. Once it is set we are no longer @@ -436,20 +355,6 @@ hat_kern_alloc( table_cnt += mmu.top_level_count - ((kernelbase >> LEVEL_SHIFT(mmu.max_level)) & (mmu.top_level_count - 1)); -#if defined(__i386) - /* - * The 32 bit PAE hat allocates tables one level below the top when - * kernelbase isn't 1 Gig aligned. We'll just be sloppy and allocate - * a bunch more to the reserve. Any unused will be returned later. - * Note we've already counted these mappings, just not the extra - * pagetables. - */ - if (mmu.pae_hat != 0 && (kernelbase & LEVEL_OFFSET(mmu.max_level)) != 0) - table_cnt += mmu.ptes_per_table - - ((kernelbase & LEVEL_OFFSET(mmu.max_level)) >> - LEVEL_SHIFT(mmu.max_level - 1)); -#endif - /* * Add 1/4 more into table_cnt for extra slop. The unused * slop is freed back when we htable_adjust_reserve() later. @@ -493,15 +398,11 @@ hat_kern_setup(void) #ifdef __xpv mmu_btop(xen_info->pt_base - ONE_GIG)); #else - mmu_btop(getcr3())); + mmu_btop(getcr3_pa())); #endif /* END CSTYLED */ -#if defined(__i386) && !defined(__xpv) - CPU->cpu_tss->tss_cr3 = dftss0->tss_cr3 = getcr3(); -#endif /* __i386 */ - -#if defined(__xpv) && defined(__amd64) +#if defined(__xpv) /* * Try to make the kpm mappings r/w. Failures here are OK, as * it's probably just a pagetable @@ -517,3 +418,179 @@ hat_kern_setup(void) CPUSET_ATOMIC_ADD(kas.a_hat->hat_cpus, CPU->cpu_id); CPU->cpu_current_hat = kas.a_hat; } + +#ifndef __xpv + +/* + * Note that the INVPCID_ALL* variants can be used even in the !PCIDE case, but + * INVPCID_ADDR isn't. 
+ */ +static void +invpcid(uint64_t type, uint64_t pcid, uintptr_t addr) +{ + ulong_t flag; + uint64_t cr4; + + if (x86_use_invpcid == 1) { + ASSERT(is_x86_feature(x86_featureset, X86FSET_INVPCID)); + invpcid_insn(type, pcid, addr); + return; + } + + switch (type) { + case INVPCID_ALL_GLOBAL: + flag = intr_clear(); + cr4 = getcr4(); + setcr4(cr4 & ~(ulong_t)CR4_PGE); + setcr4(cr4 | CR4_PGE); + intr_restore(flag); + break; + + case INVPCID_ALL_NONGLOBAL: + if (!(getcr4() & CR4_PCIDE)) { + reload_cr3(); + } else { + flag = intr_clear(); + cr4 = getcr4(); + setcr4(cr4 & ~(ulong_t)CR4_PGE); + setcr4(cr4 | CR4_PGE); + intr_restore(flag); + } + break; + + case INVPCID_ADDR: + if (pcid == PCID_USER) { + flag = intr_clear(); + ASSERT(addr < kernelbase); + ASSERT(ON_USER_HAT(CPU)); + ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0); + tr_mmu_flush_user_range(addr, MMU_PAGESIZE, + MMU_PAGESIZE, CPU->cpu_m.mcpu_kpti.kf_user_cr3); + intr_restore(flag); + } else { + mmu_invlpg((caddr_t)addr); + } + break; + + default: + panic("unsupported invpcid(%lu)", type); + break; + } +} + +/* + * Flush one kernel mapping. + * + * We want to assert on kernel space here mainly for reasoning about the PCIDE + * case: namely, this flush should never need to flush a non-current PCID + * mapping. This presumes we never have reason to flush the kernel regions + * available to PCID_USER (the trampolines and so on). It also relies on + * PCID_KERNEL == PCID_NONE. + */ +void +mmu_flush_tlb_kpage(uintptr_t va) +{ + ASSERT(va >= kernelbase); + ASSERT(getpcid() == PCID_KERNEL); + mmu_invlpg((caddr_t)va); +} + +/* + * Flush one mapping: local CPU version of hat_tlb_inval(). + * + * If this is a userspace address in the PCIDE case, we need two invalidations, + * one for any potentially stale PCID_USER mapping, as well as any established + * while in the kernel. + */ +void +mmu_flush_tlb_page(uintptr_t va) +{ + ASSERT(getpcid() == PCID_KERNEL); + + if (va >= kernelbase) { + mmu_flush_tlb_kpage(va); + return; + } + + if (!(getcr4() & CR4_PCIDE)) { + mmu_invlpg((caddr_t)va); + return; + } + + /* + * Yes, kas will need to flush below kernelspace, at least during boot. + * But there's no PCID_USER context. + */ + if (ON_USER_HAT(CPU)) + invpcid(INVPCID_ADDR, PCID_USER, va); + invpcid(INVPCID_ADDR, PCID_KERNEL, va); +} + +static void +mmu_flush_tlb_range(uintptr_t addr, size_t len, size_t pgsz) +{ + EQUIV(addr < kernelbase, (addr + len - 1) < kernelbase); + ASSERT(len > 0); + ASSERT(pgsz != 0); + + if (!(getcr4() & CR4_PCIDE) || x86_use_invpcid == 1) { + for (uintptr_t va = addr; va < (addr + len); va += pgsz) + mmu_flush_tlb_page(va); + return; + } + + /* + * As an emulated invpcid() in the PCIDE case requires jumping + * cr3s, we batch the invalidations. We should only need to flush the + * user range if we're on a user-space HAT. + */ + if (addr < kernelbase && ON_USER_HAT(CPU)) { + ulong_t flag = intr_clear(); + ASSERT(CPU->cpu_m.mcpu_kpti.kf_user_cr3 != 0); + tr_mmu_flush_user_range(addr, len, pgsz, + CPU->cpu_m.mcpu_kpti.kf_user_cr3); + intr_restore(flag); + } + + for (uintptr_t va = addr; va < (addr + len); va += pgsz) + mmu_invlpg((caddr_t)va); +} + +/* + * MMU TLB (and PT cache) flushing on this CPU. + * + * FLUSH_TLB_ALL: invalidate everything, all PCIDs, all PT_GLOBAL. + * FLUSH_TLB_NONGLOBAL: invalidate all PCIDs, excluding PT_GLOBAL + * FLUSH_TLB_RANGE: invalidate the given range, including PCID_USER + * mappings as appropriate. If using invpcid, PT_GLOBAL mappings are not + * invalidated. 
+ */ +void +mmu_flush_tlb(flush_tlb_type_t type, tlb_range_t *range) +{ + ASSERT(getpcid() == PCID_KERNEL); + + switch (type) { + case FLUSH_TLB_ALL: + ASSERT(range == NULL); + invpcid(INVPCID_ALL_GLOBAL, 0, 0); + break; + + case FLUSH_TLB_NONGLOBAL: + ASSERT(range == NULL); + invpcid(INVPCID_ALL_NONGLOBAL, 0, 0); + break; + + case FLUSH_TLB_RANGE: { + mmu_flush_tlb_range(range->tr_va, TLB_RANGE_LEN(range), + LEVEL_SIZE(range->tr_level)); + break; + } + + default: + panic("invalid call mmu_flush_tlb(%d)", type); + break; + } +} + +#endif /* ! __xpv */ diff --git a/usr/src/uts/i86pc/vm/kboot_mmu.c b/usr/src/uts/i86pc/vm/kboot_mmu.c index 9366ff9bee..76193b3d86 100644 --- a/usr/src/uts/i86pc/vm/kboot_mmu.c +++ b/usr/src/uts/i86pc/vm/kboot_mmu.c @@ -22,6 +22,8 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -145,7 +147,7 @@ kbm_remap_window(paddr_t physaddr, int writeable) *((x86pte_t *)pte_to_window) = physaddr | pt_bits; else *((x86pte32_t *)pte_to_window) = physaddr | pt_bits; - mmu_tlbflush_entry(window); + mmu_invlpg(window); #endif DBG(window); return (window); @@ -195,7 +197,7 @@ kbm_map(uintptr_t va, paddr_t pa, uint_t level, uint_t is_kernel) *ptep = pteval; else *((x86pte32_t *)ptep) = pteval; - mmu_tlbflush_entry((caddr_t)va); + mmu_invlpg((caddr_t)va); #endif } @@ -349,7 +351,7 @@ kbm_unmap(uintptr_t va) *ptep = 0; else *((x86pte32_t *)ptep) = 0; - mmu_tlbflush_entry((caddr_t)va); + mmu_invlpg((caddr_t)va); #endif } } @@ -388,7 +390,7 @@ kbm_remap(uintptr_t va, pfn_t pfn) *((x86pte_t *)ptep) = pte_val; else *((x86pte32_t *)ptep) = pte_val; - mmu_tlbflush_entry((caddr_t)va); + mmu_invlpg((caddr_t)va); #endif if (!(old_pte & PT_VALID) || ma_to_pa(old_pte) == -1) @@ -421,7 +423,7 @@ kbm_read_only(uintptr_t va, paddr_t pa) *ptep = pte_val; else *((x86pte32_t *)ptep) = pte_val; - mmu_tlbflush_entry((caddr_t)va); + mmu_invlpg((caddr_t)va); #endif } @@ -459,7 +461,7 @@ kbm_pop(void) *((x86pte_t *)pte_to_window) = save_pte; else *((x86pte32_t *)pte_to_window) = save_pte; - mmu_tlbflush_entry(window); + mmu_invlpg(window); #endif } diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c index ac01006aa4..6a94745ade 100644 --- a/usr/src/uts/i86pc/vm/vm_machdep.c +++ b/usr/src/uts/i86pc/vm/vm_machdep.c @@ -24,7 +24,7 @@ /* * Copyright (c) 2010, Intel Corporation. * All rights reserved. - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -364,6 +364,91 @@ static kmutex_t contig_lock; #define PFN_16M (mmu_btop((uint64_t)0x1000000)) +caddr_t +i86devmap(pfn_t pf, pgcnt_t pgcnt, uint_t prot) +{ + caddr_t addr; + caddr_t addr1; + page_t *pp; + + addr1 = addr = vmem_alloc(heap_arena, mmu_ptob(pgcnt), VM_SLEEP); + + for (; pgcnt != 0; addr += MMU_PAGESIZE, ++pf, --pgcnt) { + pp = page_numtopp_nolock(pf); + if (pp == NULL) { + hat_devload(kas.a_hat, addr, MMU_PAGESIZE, pf, + prot | HAT_NOSYNC, HAT_LOAD_LOCK); + } else { + hat_memload(kas.a_hat, addr, pp, + prot | HAT_NOSYNC, HAT_LOAD_LOCK); + } + } + + return (addr1); +} + +/* + * This routine is like page_numtopp, but accepts only free pages, which + * it allocates (unfrees) and returns with the exclusive lock held. + * It is used by machdep.c/dma_init() to find contiguous free pages. 
+ */ +page_t * +page_numtopp_alloc(pfn_t pfnum) +{ + page_t *pp; + +retry: + pp = page_numtopp_nolock(pfnum); + if (pp == NULL) { + return (NULL); + } + + if (!page_trylock(pp, SE_EXCL)) { + return (NULL); + } + + if (page_pptonum(pp) != pfnum) { + page_unlock(pp); + goto retry; + } + + if (!PP_ISFREE(pp)) { + page_unlock(pp); + return (NULL); + } + if (pp->p_szc) { + page_demote_free_pages(pp); + page_unlock(pp); + goto retry; + } + + /* If associated with a vnode, destroy mappings */ + + if (pp->p_vnode) { + + page_destroy_free(pp); + + if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM)) { + return (NULL); + } + + if (page_pptonum(pp) != pfnum) { + page_unlock(pp); + goto retry; + } + } + + if (!PP_ISFREE(pp)) { + page_unlock(pp); + return (NULL); + } + + if (!page_reclaim(pp, (kmutex_t *)NULL)) + return (NULL); + + return (pp); +} + /* * Return the optimum page size for a given mapping */ diff --git a/usr/src/uts/i86xpv/os/xpv_panic.c b/usr/src/uts/i86xpv/os/xpv_panic.c index 2b67bff1dd..594fcd4c31 100644 --- a/usr/src/uts/i86xpv/os/xpv_panic.c +++ b/usr/src/uts/i86xpv/os/xpv_panic.c @@ -23,6 +23,8 @@ * Copyright 2016 PALO, Richard. * * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright 2018 Joyent, Inc. */ #include <sys/types.h> @@ -174,7 +176,7 @@ xpv_panic_map(int level, pfn_t pfn) *(x86pte32_t *)pteptr = pte; XPV_DISALLOW_PAGETABLE_UPDATES(); - mmu_tlbflush_entry(PWIN_VA(level)); + mmu_flush_tlb_page((uintptr_t)PWIN_VA(level)); } /* diff --git a/usr/src/uts/intel/asm/htable.h b/usr/src/uts/intel/asm/htable.h index dd1d72a3c1..2601111c6d 100644 --- a/usr/src/uts/intel/asm/htable.h +++ b/usr/src/uts/intel/asm/htable.h @@ -22,6 +22,8 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. */ #ifndef _ASM_HTABLE_H @@ -36,8 +38,6 @@ extern "C" { #if !defined(__lint) && defined(__GNUC__) -#if defined(__i386) || defined(__amd64) - /* * This set of atomic operations are designed primarily * for some ia32 hat layer operations. @@ -83,17 +83,6 @@ atomic_dec16(uint16_t *addr) : "cc"); } -extern __GNU_INLINE void -mmu_tlbflush_entry(caddr_t addr) -{ - __asm__ __volatile__( - "invlpg %0" - : "=m" (*addr) - : "m" (*addr)); -} - -#endif /* __i386 || __amd64 */ - #endif /* !__lint && __GNUC__ */ #ifdef __cplusplus diff --git a/usr/src/uts/intel/asm/mmu.h b/usr/src/uts/intel/asm/mmu.h index 1be654759d..bd3e69a9a8 100644 --- a/usr/src/uts/intel/asm/mmu.h +++ b/usr/src/uts/intel/asm/mmu.h @@ -21,6 +21,8 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _ASM_MMU_H @@ -33,9 +35,9 @@ extern "C" { #endif -#if defined(__GNUC__) && !defined(__xpv) +#if defined(__GNUC__) -#if defined(__amd64) +#if !defined(__xpv) extern __GNU_INLINE ulong_t getcr3(void) @@ -57,30 +59,22 @@ setcr3(ulong_t value) : "r" (value)); } -extern __GNU_INLINE void -reload_cr3(void) -{ - setcr3(getcr3()); -} - -#elif defined(__i386) - extern __GNU_INLINE ulong_t -getcr3(void) +getcr4(void) { - uint32_t value; + uint64_t value; __asm__ __volatile__( - "movl %%cr3, %0" + "movq %%cr4, %0" : "=r" (value)); return (value); } extern __GNU_INLINE void -setcr3(ulong_t value) +setcr4(ulong_t value) { __asm__ __volatile__( - "movl %0, %%cr3" + "movq %0, %%cr4" : /* no output */ : "r" (value)); } @@ -91,9 +85,33 @@ reload_cr3(void) setcr3(getcr3()); } -#endif +/* + * We clobber memory: we're not writing anything, but we don't want to + * potentially get re-ordered beyond the TLB flush. + */ +extern __GNU_INLINE void +invpcid_insn(uint64_t type, uint64_t pcid, uintptr_t addr) +{ + uint64_t pcid_desc[2] = { pcid, addr }; + __asm__ __volatile__( + "invpcid %0, %1" + : /* no output */ + : "m" (*pcid_desc), "r" (type) + : "memory"); +} + +#endif /* !__xpv */ + +extern __GNU_INLINE void +mmu_invlpg(caddr_t addr) +{ + __asm__ __volatile__( + "invlpg %0" + : "=m" (*addr) + : "m" (*addr)); +} -#endif /* __GNUC__ && !__xpv */ +#endif /* __GNUC__ */ #ifdef __cplusplus } diff --git a/usr/src/uts/intel/ia32/ml/i86_subr.s b/usr/src/uts/intel/ia32/ml/i86_subr.s index d4ba6589bc..30f1f673d4 100644 --- a/usr/src/uts/intel/ia32/ml/i86_subr.s +++ b/usr/src/uts/intel/ia32/ml/i86_subr.s @@ -23,7 +23,7 @@ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 by Delphix. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -436,27 +436,16 @@ getfp(void) /* ARGSUSED */ void -mmu_tlbflush_entry(caddr_t m) +mmu_invlpg(caddr_t m) {} #else /* __lint */ -#if defined(__amd64) - - ENTRY(mmu_tlbflush_entry) + ENTRY(mmu_invlpg) invlpg (%rdi) ret - SET_SIZE(mmu_tlbflush_entry) - -#elif defined(__i386) - - ENTRY(mmu_tlbflush_entry) - movl 4(%esp), %eax - invlpg (%eax) - ret - SET_SIZE(mmu_tlbflush_entry) + SET_SIZE(mmu_invlpg) -#endif /* __i386 */ #endif /* __lint */ diff --git a/usr/src/uts/intel/ia32/os/desctbls.c b/usr/src/uts/intel/ia32/os/desctbls.c index 3c021bd055..5ef56b034c 100644 --- a/usr/src/uts/intel/ia32/os/desctbls.c +++ b/usr/src/uts/intel/ia32/os/desctbls.c @@ -1259,9 +1259,6 @@ init_desctbls(void) { user_desc_t *gdt; desctbr_t idtr; -#if defined(__amd64) - extern uint64_t kpti_safe_cr3; -#endif /* * Allocate IDT and TSS structures on unique pages for better diff --git a/usr/src/uts/intel/sys/controlregs.h b/usr/src/uts/intel/sys/controlregs.h index 6ae1afb3eb..fe0cf687b4 100644 --- a/usr/src/uts/intel/sys/controlregs.h +++ b/usr/src/uts/intel/sys/controlregs.h @@ -20,9 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent, Inc. - * - * Copyright 2018 Joyent, Inc. + * Copyright 2018, Joyent, Inc. 
*/ #ifndef _SYS_CONTROLREGS_H @@ -90,8 +88,14 @@ extern "C" { #define CR3_PCD 0x00000010 /* cache disable */ #define CR3_PWT 0x00000008 /* write through */ - -#define FMT_CR3 "\20\5pcd\4pwt" +#if defined(_ASM) +#define CR3_NOINVL_BIT 0x8000000000000000 +#else +#define CR3_NOINVL_BIT 0x8000000000000000ULL /* no invalidation */ +#endif +#define PCID_NONE 0x000 /* generic PCID */ +#define PCID_KERNEL 0x000 /* kernel's PCID */ +#define PCID_USER 0x001 /* user-space PCID */ /* CR4 Register */ diff --git a/usr/src/uts/intel/sys/x86_archext.h b/usr/src/uts/intel/sys/x86_archext.h index 475bb85738..bfd6f14289 100644 --- a/usr/src/uts/intel/sys/x86_archext.h +++ b/usr/src/uts/intel/sys/x86_archext.h @@ -27,7 +27,7 @@ * All rights reserved. */ /* - * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright 2012 Jens Elkner <jel+illumos@cs.uni-magdeburg.de> * Copyright 2012 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> * Copyright 2014 Josef 'Jeff' Sipek <jeffpc@josefsipek.net> @@ -210,6 +210,7 @@ extern "C" { #define CPUID_INTC_EBX_7_0_AVX2 0x00000020 /* AVX2 supported */ #define CPUID_INTC_EBX_7_0_SMEP 0x00000080 /* SMEP in CR4 */ #define CPUID_INTC_EBX_7_0_BMI2 0x00000100 /* BMI2 instrs */ +#define CPUID_INTC_EBX_7_0_INVPCID 0x00000400 /* invpcid instr */ #define CPUID_INTC_EBX_7_0_MPX 0x00004000 /* Mem. Prot. Ext. */ #define CPUID_INTC_EBX_7_0_AVX512F 0x00010000 /* AVX512 foundation */ #define CPUID_INTC_EBX_7_0_AVX512DQ 0x00020000 /* AVX512DQ */ @@ -433,6 +434,8 @@ extern "C" { #define X86FSET_UMIP 66 #define X86FSET_PKU 67 #define X86FSET_OSPKE 68 +#define X86FSET_PCID 69 +#define X86FSET_INVPCID 70 /* * Intel Deep C-State invariant TSC in leaf 0x80000007. @@ -691,7 +694,7 @@ extern "C" { #if defined(_KERNEL) || defined(_KMEMUSER) -#define NUM_X86_FEATURES 69 +#define NUM_X86_FEATURES 71 extern uchar_t x86_featureset[]; extern void free_x86_featureset(void *featureset); @@ -725,6 +728,9 @@ struct cpuid_regs { uint32_t cp_edx; }; +extern int x86_use_pcid; +extern int x86_use_invpcid; + /* * Utility functions to get/set extended control registers (XCR) * Initial use is to get/set the contents of the XFEATURE_ENABLED_MASK. @@ -870,6 +876,8 @@ extern void determine_platform(void); extern int get_hwenv(void); extern int is_controldom(void); +extern void enable_pcid(void); + extern void xsave_setup_msr(struct cpu *); /* |
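
Once CR4.PCIDE is set, the hardware reinterprets %cr3: the low 12 bits select a PCID, the page-table pfn sits above them, and bit 63 (CR3_NOINVL_BIT in this change) asks the processor not to flush the incoming PCID's cached translations on the load. A minimal user-space sketch of that decomposition follows; the SKETCH_* masks restate the architectural layout rather than the kernel's MMU_* macros, which this diff does not include.

	/*
	 * Hedged sketch: how a %cr3 image breaks down when CR4.PCIDE is set.
	 * The constants mirror the architectural layout (Intel SDM vol. 3,
	 * "Process-Context Identifiers"); they are spelled out here instead
	 * of using the kernel headers, which are not part of this diff.
	 */
	#include <stdio.h>
	#include <stdint.h>

	#define	SKETCH_PAGESHIFT	12
	#define	SKETCH_PCID_MASK	0xfffULL	/* cr3[11:0]: PCID */
	#define	SKETCH_NOINVL_BIT	(1ULL << 63)	/* keep new PCID's TLB */

	int
	main(void)
	{
		/* Made-up cr3 image: page table at pfn 0x1234, PCID 1, no-invalidate. */
		uint64_t cr3 = ((uint64_t)0x1234 << SKETCH_PAGESHIFT) | 0x001 |
		    SKETCH_NOINVL_BIT;

		printf("pfn  = 0x%llx\n", (unsigned long long)
		    ((cr3 & ~SKETCH_NOINVL_BIT) >> SKETCH_PAGESHIFT));
		printf("pcid = 0x%llx\n",
		    (unsigned long long)(cr3 & SKETCH_PCID_MASK));
		printf("keep TLB for new PCID: %s\n",
		    (cr3 & SKETCH_NOINVL_BIT) ? "yes" : "no");
		return (0);
	}

Because TLB entries are tagged with the PCID they were created under, a user virtual address touched while on the kernel's page tables can be cached under both PCID_KERNEL and PCID_USER; that is why mmu_flush_tlb_page() above issues a second, PCID_USER invalidation whenever a user HAT is active.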
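
mmu_flush_tlb() above is the local-CPU entry point: FLUSH_TLB_ALL and FLUSH_TLB_NONGLOBAL take no range, while FLUSH_TLB_RANGE walks a tlb_range_t. Only tr_va, tr_level and TLB_RANGE_LEN() appear in this diff, so the structure layout, the tr_cnt field, LEVEL_SHIFT()/LEVEL_SIZE() and the enum ordering below are assumed shapes, sketched only to show the caller side.

	/*
	 * Hedged sketch of a FLUSH_TLB_RANGE caller. The declarations are
	 * stand-ins; only the names used in the hunks above come from the
	 * diff. Compiles standalone but only links against the kernel.
	 */
	#include <stdint.h>

	typedef struct tlb_range {
		uintptr_t	tr_va;		/* starting VA of the range */
		unsigned long	tr_cnt;		/* page count (assumed field) */
		int		tr_level;	/* page size level: 0 = 4K, 1 = 2M */
	} tlb_range_t;

	#define	LEVEL_SHIFT(l)	((l) == 0 ? 12 : ((l) == 1 ? 21 : 30))
	#define	LEVEL_SIZE(l)	(1UL << LEVEL_SHIFT(l))
	#define	TLB_RANGE_LEN(r)	((r)->tr_cnt << LEVEL_SHIFT((r)->tr_level))

	typedef enum {
		FLUSH_TLB_ALL,
		FLUSH_TLB_NONGLOBAL,
		FLUSH_TLB_RANGE
	} flush_tlb_type_t;

	extern void mmu_flush_tlb(flush_tlb_type_t, tlb_range_t *);

	/* Invalidate sixteen 4K mappings starting at va on the local CPU. */
	static void
	sketch_flush_sixteen_pages(uintptr_t va)
	{
		tlb_range_t range = { .tr_va = va, .tr_cnt = 16, .tr_level = 0 };

		mmu_flush_tlb(FLUSH_TLB_RANGE, &range);
	}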
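
page_numtopp_alloc() returns a page that was free a moment ago, now reclaimed and exclusively locked, or NULL if the pfn cannot be had. Its comment points at the contiguous-page scan in dma_init(); the sketch below shows only that usage pattern. The wrapper name, the undo helper and the error handling are hypothetical, not code from machdep.c.

	/*
	 * Hedged sketch of the scan page_numtopp_alloc() exists for: collect
	 * a physically contiguous run of formerly-free pages, each returned
	 * exclusively locked. sketch_return_page() is a hypothetical stand-in
	 * for however the real caller gives an already-reclaimed page back;
	 * that path is not shown in the diff above.
	 */
	#include <sys/types.h>
	#include <vm/page.h>

	extern page_t *page_numtopp_alloc(pfn_t);
	extern void sketch_return_page(page_t *);	/* hypothetical undo helper */

	static int
	sketch_grab_contig_run(pfn_t base_pfn, pgcnt_t npages, page_t **pps)
	{
		pgcnt_t i;

		for (i = 0; i < npages; i++) {
			pps[i] = page_numtopp_alloc(base_pfn + i);
			if (pps[i] == NULL) {
				/* A page in the run was busy; give back what we took. */
				while (i-- > 0)
					sketch_return_page(pps[i]);
				return (-1);
			}
		}
		return (0);	/* caller owns npages locked, contiguous pages */
	}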
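
invpcid_insn() above hands the CPU a 16-byte memory descriptor plus a register holding the invalidation type. The type encodings and descriptor layout below are the Intel SDM's; the kernel's INVPCID_* constants live in a header this diff does not touch, so the SKETCH_* names are assumptions standing in for them.

	/*
	 * Hedged sketch of the INVPCID operand format (Intel SDM vol. 3):
	 * desc[0] bits 0-11 hold the PCID (remaining bits must be zero) and
	 * desc[1] holds the linear address; the type goes in a register.
	 * INVPCID is a CPL-0 instruction, so this compiles but only illustrates.
	 */
	#include <stdint.h>

	#define	SKETCH_INVPCID_ADDR		0	/* one VA in one PCID */
	#define	SKETCH_INVPCID_ID		1	/* non-global entries of one PCID */
	#define	SKETCH_INVPCID_ALL_GLOBAL	2	/* everything, incl. globals */
	#define	SKETCH_INVPCID_ALL_NONGLOBAL	3	/* everything, excl. globals */

	static inline void
	sketch_invpcid(uint64_t type, uint64_t pcid, uintptr_t addr)
	{
		uint64_t desc[2] = { pcid & 0xfff, addr };

		__asm__ __volatile__(
		    "invpcid %0, %1"
		    : /* no output */
		    : "m" (*desc), "r" (type)
		    : "memory");
	}

Two architectural details explain the surrounding logic: INVPCID is usable even while CR4.PCIDE is clear, which is why x86_use_invpcid can be honored independently of PCIDs; and CR4.PCIDE may only be set in IA-32e mode while CR3[11:0] is zero, that is, while the current PCID is PCID_KERNEL/PCID_NONE.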
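
The new X86FSET_PCID and X86FSET_INVPCID feature bits sit on top of two architectural CPUID bits whose positions are fixed by the SDM: PCID in CPUID.01H:ECX bit 17, INVPCID in CPUID.07H(ECX=0):EBX bit 10, matching the 0x00000400 definition above. The probe below is a user-space illustration, not the kernel's cpuid pass, and __get_cpuid()/__get_cpuid_count() are the GCC/clang <cpuid.h> wrappers rather than anything from this codebase.

	/*
	 * Hedged sketch: where the PCID and INVPCID capability bits live.
	 * Build with gcc or clang; <cpuid.h> provides the wrappers used here.
	 */
	#include <stdio.h>
	#include <cpuid.h>

	int
	main(void)
	{
		unsigned int eax, ebx, ecx, edx;
		int has_pcid = 0, has_invpcid = 0;

		if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
			has_pcid = (ecx >> 17) & 1;		/* CPUID.1:ECX.PCID */
		if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
			has_invpcid = (ebx >> 10) & 1;		/* CPUID.7.0:EBX.INVPCID */

		printf("pcid: %d invpcid: %d\n", has_pcid, has_invpcid);
		return (0);
	}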