summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Levon <john.levon@joyent.com>2018-07-30 10:05:48 +0100
committerRichard Lowe <richlowe@richlowe.net>2018-08-07 19:46:08 +0000
commiteea802b0a2c12269d15276d4657e5cd64dd541a4 (patch)
tree428299fc23a7fc774b25363db24f80dfc085937c
parent72dc11568f48cd37cf8182a82d9cb55b22d7c805 (diff)
downloadillumos-joyent-eea802b0a2c12269d15276d4657e5cd64dd541a4.tar.gz
9685 KPTI %cr3 handling needs fixes
Reviewed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Patrick Mooney <patrick.mooney@joyent.com> Approved by: Richard Lowe <richlowe@richlowe.net>
-rw-r--r--usr/src/uts/i86pc/ml/kpti_trampolines.s7
-rw-r--r--usr/src/uts/i86pc/ml/locore.s23
-rw-r--r--usr/src/uts/intel/kdi/kdi_asm.s64
-rw-r--r--usr/src/uts/intel/kdi/kdi_idt.c24
4 files changed, 78 insertions, 40 deletions
diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s
index 036c254d08..83cbd69048 100644
--- a/usr/src/uts/i86pc/ml/kpti_trampolines.s
+++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s
@@ -251,6 +251,11 @@ kpti_kbase:
* This is used for all interrupts that can plausibly be taken inside another
* interrupt and are using a kpti_frame stack (so #BP, #DB, #GP, #PF, #SS).
*
+ * We also use this for #NP, even though it uses the standard IST: the
+ * additional %rsp checks below will catch when we get an exception doing an
+ * iret to userspace with a bad %cs/%ss. This appears as a kernel trap, and
+ * only later gets redirected via kern_gpfault().
+ *
* We check for whether we took the interrupt while in another trampoline, in
* which case we need to use the kthread stack.
*/
@@ -649,7 +654,7 @@ tr_intr_ret_end:
MK_INTR_TRAMPOLINE_NOERR(invoptrap)
MK_INTR_TRAMPOLINE_NOERR(ndptrap)
MK_INTR_TRAMPOLINE(invtsstrap)
- MK_INTR_TRAMPOLINE(segnptrap)
+ MK_DBG_INTR_TRAMPOLINE(segnptrap)
MK_DBG_INTR_TRAMPOLINE(stktrap)
MK_DBG_INTR_TRAMPOLINE(gptrap)
MK_DBG_INTR_TRAMPOLINE(pftrap)
diff --git a/usr/src/uts/i86pc/ml/locore.s b/usr/src/uts/i86pc/ml/locore.s
index acd96e271a..10db95ab51 100644
--- a/usr/src/uts/i86pc/ml/locore.s
+++ b/usr/src/uts/i86pc/ml/locore.s
@@ -159,7 +159,7 @@ _locore_start(struct boot_syscalls *sysp, ulong_t rsi, struct bootops *bop)
* %rdi = boot services (should die someday)
* %rdx = bootops
* end
- */
+ */
leaq edata(%rip), %rbp /* reference edata for ksyms */
movq $0, (%rbp) /* limit stack back trace */
@@ -178,7 +178,7 @@ _locore_start(struct boot_syscalls *sysp, ulong_t rsi, struct bootops *bop)
#endif
/*
* Save call back for special x86 boot services vector
- */
+ */
movq %rdi, sysp(%rip)
movq %rdx, bootops(%rip) /* save bootops */
@@ -208,7 +208,7 @@ _locore_start(struct boot_syscalls *sysp, ulong_t rsi, struct bootops *bop)
#endif /* __xpv */
/*
- * (We just assert this works by virtue of being here)
+ * (We just assert this works by virtue of being here)
*/
bts $X86FSET_CPUID, x86_featureset(%rip)
@@ -268,7 +268,7 @@ _locore_start(struct boot_syscalls *sysp, struct bootops *bop)
/*
* %ecx = boot services (should die someday)
* %ebx = bootops
- */
+ */
mov $edata, %ebp / edata needs to be defined for ksyms
movl $0, (%ebp) / limit stack back trace
@@ -283,14 +283,14 @@ _locore_start(struct boot_syscalls *sysp, struct bootops *bop)
*/
mov %ecx, sysp / save call back for boot services
- mov %ebx, bootops / save bootops
+ mov %ebx, bootops / save bootops
movl $bootops, bootopsp
/*
* Save all registers and flags
*/
- pushal
+ pushal
pushfl
#if !defined(__xpv)
@@ -443,7 +443,7 @@ port_22_free:
* cycle. If the CCR index was not valid for this Cyrix model, we may
* have performed an external I/O cycle as well. In these cases and
* if the motherboard/chipset vendor ignores I/O address line A1,
- * then the PIC will have IRQ3 set at the lowest priority as a side
+ * then the PIC will have IRQ3 set at the lowest priority as a side
 * effect of the above outb. We are reasonably confident that there
* is not an unknown device on I/O port 0x22, so there should have been
* no unpredictable side-effect of the above outb.
@@ -892,7 +892,7 @@ likelyM3:
* now we will call anything with a DIR0 of 0x80 or higher an MIII.
* The MIII is supposed to support large pages, but we will believe
* it when we see it. For now we just enable and test for MII features.
- */
+ */
movl $X86_TYPE_VIA_CYRIX_III, x86_type
jmp likeMII
@@ -930,7 +930,7 @@ coma_bug:
* fixed this bug sometime late in 1997 and no other exploits other than
* xchgl have been discovered is good indication that this workaround is
* reasonable.
- */
+ */
.set CYRIX_DBR0, 0x30 / Debug Register 0
.set CYRIX_DBR1, 0x31 / Debug Register 1
@@ -939,7 +939,7 @@ coma_bug:
.set CYRIX_DOR, 0x3c / Debug Opcode Register
/*
- * What is known about DBR1, DBR2, DBR3, and DOR is that for normal
+ * What is known about DBR1, DBR2, DBR3, and DOR is that for normal
* cpu execution DBR1, DBR2, and DBR3 are set to 0. To obtain opcode
* serialization, DBR1, DBR2, and DBR3 are loaded with 0xb8, 0x7f,
* and 0xff. Then, DOR is loaded with the one byte opcode.
@@ -999,7 +999,7 @@ coma_bug:
/*
* write DBR1
*/
- movb $CYRIX_DBR1, %al
+ movb $CYRIX_DBR1, %al
outb $CYRIX_CRI
movb $0xf8, %al
outb $CYRIX_CRD
@@ -1201,6 +1201,7 @@ cmntrap()
leaq dtrace_badtrap(%rip), %rdi
xorl %eax, %eax
call panic
+ SET_SIZE(cmntrap_pushed)
SET_SIZE(cmntrap)
SET_SIZE(_cmntrap)
diff --git a/usr/src/uts/intel/kdi/kdi_asm.s b/usr/src/uts/intel/kdi/kdi_asm.s
index 5180dbb1b2..edfbea10c7 100644
--- a/usr/src/uts/intel/kdi/kdi_asm.s
+++ b/usr/src/uts/intel/kdi/kdi_asm.s
@@ -452,8 +452,8 @@
pushq %rdi
call kdi_trap_pass
- cmpq $1, %rax
- je kdi_pass_to_kernel
+ testq %rax, %rax
+ jnz kdi_pass_to_kernel
popq %rax /* cpusave in %rax */
SAVE_IDTGDT
@@ -569,17 +569,9 @@
/*NOTREACHED*/
SET_SIZE(kdi_resume)
- ENTRY_NP(kdi_pass_to_kernel)
-
- popq %rdi /* cpusave */
-
- movq $KDI_CPU_STATE_NONE, KRS_CPU_STATE(%rdi)
/*
- * Find the trap and vector off the right kernel handler. The trap
- * handler will expect the stack to be in trap order, with %rip being
- * the last entry, so we'll need to restore all our regs. On i86xpv
- * we'll need to compensate for XPV_TRAP_POP.
+ * We took a trap that should be handled by the kernel, not KMDB.
*
* We're hard-coding the three cases where KMDB has installed permanent
* handlers, since after we KDI_RESTORE_REGS(), we don't have registers
@@ -587,15 +579,53 @@
* through here at the same time.
*
* Note that we handle T_DBGENTR since userspace might have tried it.
+ *
+ * The trap handler will expect the stack to be in trap order, with %rip
+ * being the last entry, so we'll need to restore all our regs. On
+ * i86xpv we'll need to compensate for XPV_TRAP_POP.
+ *
+ * %rax on entry is either 1 or 2, which is from kdi_trap_pass().
+ * kdi_cmnint stashed the original %cr3 into KDIREG_CR3, then (probably)
+ * switched us to the CPU's kf_kernel_cr3. But we're about to call, for
+ * example:
+ *
+ * dbgtrap->trap()->tr_iret_kernel
+ *
+ * which, unlike tr_iret_kdi, doesn't restore the original %cr3, so
+ * we'll do so here if needed.
+ *
+ * This isn't just a matter of tidiness: for example, consider:
+ *
+ * hat_switch(oldhat=kas.a_hat, newhat=prochat)
+ * setcr3()
+ * reset_kpti()
+ * *brktrap* due to fbt on reset_kpti:entry
+ *
+ * Here, we have the new hat's %cr3, but we haven't yet updated
+ * kf_kernel_cr3 (so it's currently kas's). So if we don't restore here,
+ * we'll stay on kas's cr3 value on returning from the trap: not good if
+ * we fault on a userspace address.
*/
+ ENTRY_NP(kdi_pass_to_kernel)
+
+ popq %rdi /* cpusave */
+ movq $KDI_CPU_STATE_NONE, KRS_CPU_STATE(%rdi)
movq KRS_GREGS(%rdi), %rsp
+
+ cmpq $2, %rax
+ jne no_restore_cr3
+ movq REG_OFF(KDIREG_CR3)(%rsp), %r11
+ movq %r11, %cr3
+
+no_restore_cr3:
movq REG_OFF(KDIREG_TRAPNO)(%rsp), %rdi
+
cmpq $T_SGLSTP, %rdi
- je 1f
+ je kdi_pass_dbgtrap
cmpq $T_BPTFLT, %rdi
- je 2f
+ je kdi_pass_brktrap
cmpq $T_DBGENTR, %rdi
- je 3f
+ je kdi_pass_invaltrap
/*
* Hmm, unknown handler. Somebody forgot to update this when they
* added a new trap interposition... try to drop back into kmdb.
@@ -609,13 +639,13 @@
XPV_TRAP_PUSH; \
jmp %cs:name
-1:
+kdi_pass_dbgtrap:
CALL_TRAP_HANDLER(dbgtrap)
/*NOTREACHED*/
-2:
+kdi_pass_brktrap:
CALL_TRAP_HANDLER(brktrap)
/*NOTREACHED*/
-3:
+kdi_pass_invaltrap:
CALL_TRAP_HANDLER(invaltrap)
/*NOTREACHED*/
diff --git a/usr/src/uts/intel/kdi/kdi_idt.c b/usr/src/uts/intel/kdi/kdi_idt.c
index d801588954..6ea4681bce 100644
--- a/usr/src/uts/intel/kdi/kdi_idt.c
+++ b/usr/src/uts/intel/kdi/kdi_idt.c
@@ -164,8 +164,8 @@ struct idt_description {
{ T_GPFLT, 0, kdi_traperr13, NULL },
{ T_PGFLT, 0, kdi_traperr14, NULL },
{ 15, 0, kdi_invaltrap, NULL },
- { T_EXTERRFLT, 0, kdi_trap16, NULL },
- { T_ALIGNMENT, 0, kdi_traperr17, NULL },
+ { T_EXTERRFLT, 0, kdi_trap16, NULL },
+ { T_ALIGNMENT, 0, kdi_traperr17, NULL },
{ T_MCE, 0, kdi_trap18, NULL },
{ T_SIMDFPE, 0, kdi_trap19, NULL },
{ T_DBGENTR, 0, kdi_trap20, NULL },
@@ -366,14 +366,16 @@ kdi_deactivate(void)
}
/*
- * We receive all breakpoints and single step traps. Some of them,
- * including those from userland and those induced by DTrace providers,
- * are intended for the kernel, and must be processed there. We adopt
- * this ours-until-proven-otherwise position due to the painful
- * consequences of sending the kernel an unexpected breakpoint or
- * single step. Unless someone can prove to us that the kernel is
- * prepared to handle the trap, we'll assume there's a problem and will
- * give the user a chance to debug it.
+ * We receive all breakpoints and single step traps. Some of them, including
+ * those from userland and those induced by DTrace providers, are intended for
+ * the kernel, and must be processed there. We adopt this
+ * ours-until-proven-otherwise position due to the painful consequences of
+ * sending the kernel an unexpected breakpoint or single step. Unless someone
+ * can prove to us that the kernel is prepared to handle the trap, we'll assume
+ * there's a problem and will give the user a chance to debug it.
+ *
+ * If we return 2, then the calling code should restore the trap-time %cr3: that
+ * is, it really is a kernel-originated trap.
*/
int
kdi_trap_pass(kdi_cpusave_t *cpusave)
@@ -390,7 +392,7 @@ kdi_trap_pass(kdi_cpusave_t *cpusave)
if (tt == T_BPTFLT && kdi_dtrace_get_state() ==
KDI_DTSTATE_DTRACE_ACTIVE)
- return (1);
+ return (2);
/*
* See the comments in the kernel's T_SGLSTP handler for why we need to